From b6bf5efd882c7f848508435256560a87d30be5ca Mon Sep 17 00:00:00 2001 From: ponzischeme89 Date: Sun, 18 Jan 2026 23:36:29 +1300 Subject: [PATCH] Wikipedia support --- README.md | 2 +- frontend/src/components/AppSidebar.svelte | 2 +- .../src/components/scan/ResultsList.svelte | 17 +- frontend/src/components/scan/ScanPanel.svelte | 10 + .../settings/FolderRulesSettings.svelte | 1 + .../settings/IntegrationsSettings.svelte | 137 ++++++++- server/app.py | 49 ++- server/core/database.py | 4 +- server/core/subtitle_processor.py | 30 +- server/core/wikipedia_client.py | 284 ++++++++++++++++++ 10 files changed, 520 insertions(+), 16 deletions(-) create mode 100644 server/core/wikipedia_client.py diff --git a/README.md b/README.md index e9572e4..fd66a43 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Why? Because if the cast list and IMDb/RT rating show up in the first minute, my ## Features - Insert plot summaries into existing .srt files without shifting timings -- Fetch metadata (plot, runtime, director, cast, IMDb/RT ratings) using OMDb, TMDb, and TVMaze — add these integrations under Settings before scanning +- Fetch metadata (plot, runtime, director, cast, IMDb/RT ratings) using OMDb, TMDb, TVMaze and Wikipedia — add these integrations under Settings before scanning - Preserve original dialogue and timing with safe insertion logic - Folder Rules to have seperate logic for different folders (for example TV shows could have runtime but not actors, etc) - Clean, fast web UI for scanning and batch processing built with Svelte + Python/Flask diff --git a/frontend/src/components/AppSidebar.svelte b/frontend/src/components/AppSidebar.svelte index c048eef..0073dc9 100644 --- a/frontend/src/components/AppSidebar.svelte +++ b/frontend/src/components/AppSidebar.svelte @@ -154,7 +154,7 @@ > {#if !collapsed} v1.0.9 Release Candiatev1.0.10 Release Candiate {:else} v diff --git a/frontend/src/components/scan/ResultsList.svelte b/frontend/src/components/scan/ResultsList.svelte index 59ff50d..85a0485 100644 --- a/frontend/src/components/scan/ResultsList.svelte +++ b/frontend/src/components/scan/ResultsList.svelte @@ -24,7 +24,12 @@ export let disabled = false; export let metadataProvider = "omdb"; export let metadataLanguage = ""; - export let activeIntegrations = { omdb: true, tmdb: true, tvmaze: true }; + export let activeIntegrations = { + omdb: true, + tmdb: true, + tvmaze: true, + wikipedia: true, + }; export let loading = false; // Pagination @@ -797,13 +802,15 @@ if (!metadataProvider) return "Select source"; if (metadataProvider === "both") return "OMDb + TMDb"; if (metadataProvider === "tvmaze") return "TVmaze"; + if (metadataProvider === "wikipedia") return "Wikipedia"; return metadataProvider.toUpperCase(); } $: omdbActive = activeIntegrations?.omdb ?? false; $: tmdbActive = activeIntegrations?.tmdb ?? false; $: tvmazeActive = activeIntegrations?.tvmaze ?? false; - $: hasActiveIntegrations = omdbActive || tmdbActive || tvmazeActive; + $: wikipediaActive = activeIntegrations?.wikipedia ?? false; + $: hasActiveIntegrations = omdbActive || tmdbActive || tvmazeActive || wikipediaActive; $: if (!hasActiveIntegrations && showMetadataDropup) { showMetadataDropup = false; } @@ -826,6 +833,12 @@ description: "TV metadata without an API key", enabled: tvmazeActive, }, + { + value: "wikipedia", + label: "Wikipedia", + description: "Strict encyclopedia summaries", + enabled: wikipediaActive, + }, { value: "both", label: "Both", diff --git a/frontend/src/components/scan/ScanPanel.svelte b/frontend/src/components/scan/ScanPanel.svelte index b0a61bd..a918457 100644 --- a/frontend/src/components/scan/ScanPanel.svelte +++ b/frontend/src/components/scan/ScanPanel.svelte @@ -69,6 +69,7 @@ let omdbEnabled = false; let tmdbEnabled = false; let tvmazeEnabled = false; + let wikipediaEnabled = false; let settingsLoaded = false; let defaultDirectory = ""; let showSaveDirectoryPrompt = false; @@ -105,6 +106,12 @@ description: "TV metadata without an API key", disabled: !tvmazeEnabled, }, + { + value: "wikipedia", + label: "Wikipedia", + description: "Strict encyclopedia summaries", + disabled: !wikipediaEnabled, + }, { value: "both", label: "Both", @@ -150,6 +157,7 @@ omdbEnabled = settings.omdb_enabled ?? false; tmdbEnabled = settings.tmdb_enabled ?? false; tvmazeEnabled = settings.tvmaze_enabled ?? false; + wikipediaEnabled = settings.wikipedia_enabled ?? false; settingsLoaded = true; // Load previous scan results from store if available @@ -573,6 +581,7 @@ function formatMetadataLabel(source) { if (source === "both") return "OMDb + TMDb"; if (source === "tvmaze") return "TVmaze"; + if (source === "wikipedia") return "Wikipedia"; return source.toUpperCase(); } @@ -858,6 +867,7 @@ omdb: omdbEnabled, tmdb: tmdbEnabled, tvmaze: tvmazeEnabled, + wikipedia: wikipediaEnabled, }} loading={processing} on:processSingle={handleProcessSingle} diff --git a/frontend/src/components/settings/FolderRulesSettings.svelte b/frontend/src/components/settings/FolderRulesSettings.svelte index e2ea734..e2f465c 100644 --- a/frontend/src/components/settings/FolderRulesSettings.svelte +++ b/frontend/src/components/settings/FolderRulesSettings.svelte @@ -25,6 +25,7 @@ { value: 'omdb', label: 'OMDb' }, { value: 'tmdb', label: 'TMDb' }, { value: 'tvmaze', label: 'TVmaze' }, + { value: 'wikipedia', label: 'Wikipedia' }, { value: 'both', label: 'OMDb + TMDb' } ] diff --git a/frontend/src/components/settings/IntegrationsSettings.svelte b/frontend/src/components/settings/IntegrationsSettings.svelte index 5d1507a..3b39c7a 100644 --- a/frontend/src/components/settings/IntegrationsSettings.svelte +++ b/frontend/src/components/settings/IntegrationsSettings.svelte @@ -12,11 +12,13 @@ let omdbEnabled = settings.omdb_enabled ?? false; let tmdbEnabled = settings.tmdb_enabled ?? false; let tvmazeEnabled = settings.tvmaze_enabled ?? false; + let wikipediaEnabled = settings.wikipedia_enabled ?? false; let usage = null; let loadingUsage = false; let showOmdbHelp = false; let showTmdbHelp = false; let showTvmazeHelp = false; + let showWikipediaHelp = false; /* =============================== Lifecycle @@ -48,6 +50,15 @@ } } + function toggleWikipediaHelp() { + showWikipediaHelp = !showWikipediaHelp; + if (showWikipediaHelp) { + showOmdbHelp = false; + showTmdbHelp = false; + showTvmazeHelp = false; + } + } + function clickOutside(node, handler) { if (typeof handler !== "function") return { destroy() {} }; const onPointerDown = (event) => { @@ -82,6 +93,7 @@ omdb_enabled: omdbEnabled, tmdb_enabled: tmdbEnabled, tvmaze_enabled: tvmazeEnabled, + wikipedia_enabled: wikipediaEnabled, }); } @@ -133,6 +145,7 @@ $: omdbUsage = usage?.omdb; $: tmdbUsage = usage?.tmdb; $: tvmazeUsage = usage?.tvmaze; + $: wikipediaUsage = usage?.wikipedia; $: omdbPercent = omdbUsage ? usagePercent(omdbUsage.total_calls_24h, omdbUsage.limit) @@ -144,12 +157,19 @@ $: tvmazePercent = tvmazeUsage ? usagePercent(tvmazeUsage.total_calls_24h, tvmazeUsage.limit) : 0; + $: wikipediaPercent = wikipediaUsage + ? usagePercent(wikipediaUsage.total_calls_24h, wikipediaUsage.limit) + : 0; $: omdbState = usageState(omdbPercent); $: tmdbState = usageState(tmdbPercent); $: tvmazeState = usageState(tvmazePercent); + $: wikipediaState = usageState(wikipediaPercent); $: enabledCount = - (omdbEnabled ? 1 : 0) + (tmdbEnabled ? 1 : 0) + (tvmazeEnabled ? 1 : 0); + (omdbEnabled ? 1 : 0) + + (tmdbEnabled ? 1 : 0) + + (tvmazeEnabled ? 1 : 0) + + (wikipediaEnabled ? 1 : 0);
@@ -189,6 +209,13 @@ > Add TVmaze + {/if} @@ -228,6 +255,15 @@ Add TVmaze {/if} + {#if !wikipediaEnabled} + + {/if} {/if} @@ -444,6 +480,105 @@ {/if} + + {#if wikipediaEnabled} +
+
+
+

Wikipedia

+

+ Community encyclopedia summaries (strict title matching) +

+
+
+
(showWikipediaHelp = false)}> + + {#if showWikipediaHelp} +
+

Wikipedia strict mode

+

+ We only accept exact title matches and validate year + media type. +

+
    +
  1. Enable Wikipedia
  2. +
  3. Select Wikipedia as metadata source
  4. +
  5. Use exact titles for best matches
  6. +
+
+ {/if} +
+ + Movies & Series + +
+
+ +
+
+

Enable Wikipedia

+

Use Wikipedia for strict metadata lookups.

+
+ +
+ + {#if wikipediaUsage && wikipediaEnabled} +
+
+ + {usageLabel(wikipediaState)} Usage (24h) + + + Resets in {formatResetTime(wikipediaUsage.reset_time)} + +
+ +
+
+ + {wikipediaUsage.total_calls_24h} / {wikipediaUsage.limit} calls + + + {wikipediaPercent}% + +
+ +
+
+ +
+ + {wikipediaPercent}% + +
+
+
+
+ {/if} +
+ {/if} + diff --git a/server/app.py b/server/app.py index 858b118..b27d0ac 100644 --- a/server/app.py +++ b/server/app.py @@ -15,6 +15,7 @@ from core.config_manager import ConfigManager from core.omdb_client import OMDbClient from core.tmdb_client import TMDbClient from core.tvmaze_client import TVMazeClient +from core.wikipedia_client import WikipediaClient from core.subtitle_processor import SubtitleProcessor, SubtitleFormatOptions, SUBLOGUE_TOKEN_PATTERN, SUBLOGUE_SENTINEL from core.keyword_stripper import get_stripper from core.file_scanner import FileScanner @@ -36,6 +37,7 @@ db = DatabaseManager() omdb_client = None tmdb_client = None tvmaze_client = None +wikipedia_client = None processor = None # In-memory scan state (still used for current session) @@ -129,7 +131,7 @@ def start_scheduled_scan_worker(): def initialize_clients(): """Initialize OMDb, TMDb, TVmaze clients and processor with current API keys""" - global omdb_client, tmdb_client, tvmaze_client, processor + global omdb_client, tmdb_client, tvmaze_client, wikipedia_client, processor # Load OMDb API key omdb_key = _get_str_setting("omdb_api_key", "") @@ -144,6 +146,7 @@ def initialize_clients(): omdb_enabled = _get_bool_setting("omdb_enabled", False) tmdb_enabled = _get_bool_setting("tmdb_enabled", False) tvmaze_enabled = _get_bool_setting("tvmaze_enabled", False) + wikipedia_enabled = _get_bool_setting("wikipedia_enabled", False) preferred_source = _get_str_setting("preferred_source", "omdb") # Initialize clients with db_manager for usage tracking @@ -166,12 +169,20 @@ def initialize_clients(): tvmaze_client = None logger.info("TVmaze integration disabled") + if wikipedia_enabled: + wikipedia_client = WikipediaClient(db_manager=db) + logger.info("Wikipedia client initialized with usage tracking") + else: + wikipedia_client = None + logger.info("Wikipedia integration disabled") + # Initialize processor with available clients - if omdb_client or tmdb_client or tvmaze_client: + if omdb_client or tmdb_client or tvmaze_client or wikipedia_client: processor = SubtitleProcessor( omdb_client, tmdb_client, tvmaze_client, + wikipedia_client, preferred_source=preferred_source, ) logger.info("Processor initialized") @@ -406,6 +417,8 @@ def get_settings(): settings["tmdb_enabled"] = False if "tvmaze_enabled" not in settings: settings["tvmaze_enabled"] = False + if "wikipedia_enabled" not in settings: + settings["wikipedia_enabled"] = False # Subtitle formatting settings if "subtitle_title_bold" not in settings: @@ -460,6 +473,8 @@ def update_settings(): db.set_setting("tmdb_enabled", bool(data["tmdb_enabled"])) if "tvmaze_enabled" in data: db.set_setting("tvmaze_enabled", bool(data["tvmaze_enabled"])) + if "wikipedia_enabled" in data: + db.set_setting("wikipedia_enabled", bool(data["wikipedia_enabled"])) # Subtitle formatting settings if "subtitle_title_bold" in data: @@ -989,7 +1004,7 @@ def clear_caches(): @app.route('/api/search', methods=['POST']) def search_title(): - """Search for title matches from OMDb/TMDb + """Search for title matches from OMDb/TMDb/Wikipedia Query params: query: The search query (required) @@ -1007,7 +1022,7 @@ def search_title(): "error": "No search query provided" }), 400 - if not omdb_client and not tmdb_client: + if not omdb_client and not tmdb_client and not wikipedia_client: return jsonify({ "success": False, "error": "API not configured" @@ -1017,6 +1032,26 @@ def search_title(): language = data.get("language") results = [] + if preferred_source == "wikipedia" and wikipedia_client: + try: + import asyncio + + title, year = SubtitleProcessor.extract_title_and_year(query, strip_keywords=True) + results = asyncio.run( + wikipedia_client.search_titles( + title, + year=year, + is_series=False, + max_results=5 if mode == "full" else 1, + ) + ) + return jsonify({ + "success": True, + "results": results + }) + except Exception as e: + logger.error(f"Error searching Wikipedia: {e}") + if preferred_source == "tmdb" and tmdb_client: try: import aiohttp @@ -1567,17 +1602,19 @@ def health_check(): omdb_enabled = bool(db.get_setting("omdb_enabled", False)) tmdb_enabled = bool(db.get_setting("tmdb_enabled", False)) tvmaze_enabled = bool(db.get_setting("tvmaze_enabled", False)) + wikipedia_enabled = bool(db.get_setting("wikipedia_enabled", False)) omdb_configured = bool(db.get_setting("omdb_api_key") or db.get_setting("api_key")) tmdb_configured = bool(db.get_setting("tmdb_api_key")) return jsonify({ "status": "ok", - "api_key_configured": (omdb_enabled and omdb_configured) or (tmdb_enabled and tmdb_configured) or tvmaze_enabled, + "api_key_configured": (omdb_enabled and omdb_configured) or (tmdb_enabled and tmdb_configured) or tvmaze_enabled or wikipedia_enabled, "omdb_configured": omdb_configured, "tmdb_configured": tmdb_configured, "omdb_enabled": omdb_enabled, "tmdb_enabled": tmdb_enabled, - "tvmaze_enabled": tvmaze_enabled + "tvmaze_enabled": tvmaze_enabled, + "wikipedia_enabled": wikipedia_enabled }) diff --git a/server/core/database.py b/server/core/database.py index 3f667db..2842fdb 100644 --- a/server/core/database.py +++ b/server/core/database.py @@ -133,7 +133,7 @@ class ApiUsage(Base): __tablename__ = 'api_usage' id = Column(Integer, primary_key=True) - provider = Column(String(50), nullable=False, index=True) # omdb, tmdb, tvmaze + provider = Column(String(50), nullable=False, index=True) # omdb, tmdb, tvmaze, wikipedia endpoint = Column(String(200)) # Specific endpoint called timestamp = Column(DateTime, default=datetime.utcnow, nullable=False, index=True) success = Column(Boolean, default=True) @@ -835,7 +835,7 @@ class DatabaseManager: def get_all_usage_stats(self): """Get usage statistics for all providers""" - providers = ['omdb', 'tmdb', 'tvmaze'] + providers = ['omdb', 'tmdb', 'tvmaze', 'wikipedia'] return {provider: self.get_usage_stats(provider) for provider in providers} # ============ SUGGESTED MATCHES OPERATIONS ============ diff --git a/server/core/subtitle_processor.py b/server/core/subtitle_processor.py index bf82016..85055c0 100644 --- a/server/core/subtitle_processor.py +++ b/server/core/subtitle_processor.py @@ -1251,10 +1251,11 @@ class SubtitleProcessor: MAX_SRT_BYTES = 5 * 1024 * 1024 PLOT_SCAN_LINES = 40 - def __init__(self, omdb_client=None, tmdb_client=None, tvmaze_client=None, preferred_source="omdb"): + def __init__(self, omdb_client=None, tmdb_client=None, tvmaze_client=None, wikipedia_client=None, preferred_source="omdb"): self.omdb_client = omdb_client self.tmdb_client = tmdb_client self.tvmaze_client = tvmaze_client + self.wikipedia_client = wikipedia_client self.preferred_source = preferred_source async def process_file( @@ -1557,7 +1558,7 @@ class SubtitleProcessor: Fetch metadata from configured sources with fallback. Priority: - 1. Preferred source (omdb, tmdb, tvmaze) + 1. Preferred source (omdb, tmdb, tvmaze, wikipedia) 2. Fallback to other source if preferred fails Year validation ensures we don't match wrong movies (e.g., "Eternity 2025" @@ -1571,6 +1572,17 @@ class SubtitleProcessor: tmdb_type = "tv" if is_series else "movie" # Try preferred source first + if source_preference == "wikipedia" and self.wikipedia_client: + result = await self.wikipedia_client.fetch_summary( + movie_name, + year=year, + is_series=is_series, + season=season, + episode=episode, + ) + if result: + logger.info("Found metadata via Wikipedia: %s (%s)", result.get("title"), result.get("year")) + return result if source_preference == "tvmaze" and self.tvmaze_client and is_series: result = await self.tvmaze_client.fetch_summary( movie_name, @@ -1631,6 +1643,18 @@ class SubtitleProcessor: logger.info("Found metadata via TMDb (fallback): %s (%s)", result.get("title"), result.get("year")) return result + if not result and self.wikipedia_client and source_preference != "wikipedia": + result = await self.wikipedia_client.fetch_summary( + movie_name, + year=year, + is_series=is_series, + season=season, + episode=episode, + ) + if result: + logger.info("Found metadata via Wikipedia (fallback): %s (%s)", result.get("title"), result.get("year")) + return result + if not result and self.tvmaze_client and source_preference != "tvmaze" and is_series: result = await self.tvmaze_client.fetch_summary( movie_name, @@ -1766,7 +1790,7 @@ class SubtitleProcessor: torrent/release tags like quality indicators (1080p, BluRay), codecs (x264, HEVC), release groups (YTS, RARBG), and subtitle ads (OpenSubtitles). - This ONLY affects what title is searched for on OMDb/TMDb/TVmaze. + This ONLY affects what title is searched for on OMDb/TMDb/TVmaze/Wikipedia. It does NOT modify the subtitle file content or timing in any way. Examples: diff --git a/server/core/wikipedia_client.py b/server/core/wikipedia_client.py new file mode 100644 index 0000000..9728530 --- /dev/null +++ b/server/core/wikipedia_client.py @@ -0,0 +1,284 @@ +""" +Wikipedia API client - strict metadata fetching + +Uses the MediaWiki search API and REST summary endpoint. +Strict matching avoids false positives by requiring: +- exact base title match (after normalization) +- year match when provided +- media type hints (film vs TV series) +""" + +from __future__ import annotations + +import asyncio +import re +import time +from typing import Dict, List, Optional + +import aiohttp + +from logging_utils import get_logger + +logger = get_logger(__name__) + + +class RateLimiter: + """Simple async rate limiter.""" + + def __init__(self, rate_per_second: float): + self._interval = 1.0 / rate_per_second + self._lock = asyncio.Lock() + self._last_call = 0.0 + + async def wait(self): + async with self._lock: + now = time.monotonic() + delta = now - self._last_call + if delta < self._interval: + await asyncio.sleep(self._interval - delta) + self._last_call = time.monotonic() + + +class WikipediaClient: + """Async, strict Wikipedia client.""" + + API_URL = "https://en.wikipedia.org/w/api.php" + SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{title}" + + def __init__( + self, + *, + max_concurrent: int = 4, + rate_limit_per_sec: float = 2.0, + db_manager=None, + timeout: int = 15, + ): + self.db_manager = db_manager + self.semaphore = asyncio.Semaphore(max_concurrent) + self.rate_limiter = RateLimiter(rate_limit_per_sec) + self._timeout = aiohttp.ClientTimeout(total=timeout) + self._session: Optional[aiohttp.ClientSession] = None + + async def _get_session(self) -> aiohttp.ClientSession: + if self._session is None or self._session.closed: + self._session = aiohttp.ClientSession(timeout=self._timeout) + return self._session + + async def close(self): + if self._session and not self._session.closed: + await self._session.close() + + async def fetch_summary( + self, + title: str, + *, + year: Optional[str] = None, + is_series: bool = False, + season: Optional[int] = None, + episode: Optional[int] = None, + ) -> Optional[dict]: + """ + Fetch a strict Wikipedia summary match for a title. + + Wikipedia does not provide episode-level summaries in a structured way. + If season/episode is provided, return None to avoid incorrect matches. + """ + if season is not None or episode is not None: + return None + + matches = await self.search_titles( + title, + year=year, + is_series=is_series, + max_results=1, + ) + return matches[0] if matches else None + + async def search_titles( + self, + title: str, + *, + year: Optional[str] = None, + is_series: bool = False, + max_results: int = 5, + ) -> List[dict]: + """Search Wikipedia with strict filtering and return summary results.""" + if not title: + return [] + + async with self.semaphore: + await self.rate_limiter.wait() + + query = f'intitle:"{title}"' + if year: + query = f'{query} {year}' + if is_series: + query = f"{query} television series" + else: + query = f"{query} film" + + params = { + "action": "query", + "list": "search", + "srsearch": query, + "srlimit": max_results * 2, + "format": "json", + } + + session = await self._get_session() + start = time.monotonic() + + try: + async with session.get(self.API_URL, params=params) as resp: + elapsed_ms = int((time.monotonic() - start) * 1000) + if resp.status != 200: + self._track(False, "/w/api.php", elapsed_ms) + return [] + data = await resp.json() + self._track(True, "/w/api.php", elapsed_ms) + except (asyncio.TimeoutError, aiohttp.ClientError) as e: + logger.error("Wikipedia search error for '%s': %s", title, e) + return [] + + search_results = data.get("query", {}).get("search", []) + if not search_results: + return [] + + results: List[dict] = [] + for item in search_results: + page_title = item.get("title") + if not page_title: + continue + + summary = await self._fetch_summary_for_title(page_title) + if not summary: + continue + + description = summary.get("description") or "" + extract = summary.get("extract") or "" + + if not self._is_strict_match( + title, + page_title, + year=year, + is_series=is_series, + description=description, + extract=extract, + ): + continue + + results.append(self._build_result(summary, is_series)) + if len(results) >= max_results: + break + + return results + + async def _fetch_summary_for_title(self, title: str) -> Optional[dict]: + session = await self._get_session() + url = self.SUMMARY_URL.format(title=aiohttp.helpers.quote(title, safe="")) + start = time.monotonic() + + try: + async with session.get(url) as resp: + elapsed_ms = int((time.monotonic() - start) * 1000) + if resp.status != 200: + self._track(False, "/page/summary", elapsed_ms) + return None + data = await resp.json() + self._track(True, "/page/summary", elapsed_ms) + except (asyncio.TimeoutError, aiohttp.ClientError) as e: + logger.error("Wikipedia summary error for '%s': %s", title, e) + return None + + # Skip disambiguation and empty extracts + if data.get("type") == "disambiguation": + return None + if not data.get("extract"): + return None + return data + + def _build_result(self, summary: dict, is_series: bool) -> dict: + title = summary.get("title") + description = summary.get("description") or "" + extract = summary.get("extract") or "No plot available" + year = self._extract_year(summary.get("title", ""), description, extract) + poster = None + thumbnail = summary.get("thumbnail") or {} + if isinstance(thumbnail, dict): + poster = thumbnail.get("source") + + return { + "plot": extract, + "title": title, + "year": year or "N/A", + "media_type": "series" if is_series else "movie", + "imdb_rating": "N/A", + "runtime": "N/A", + "poster": poster, + "imdb_id": None, + "director": "N/A", + "actors": "N/A", + "released": "N/A", + "genre": description or "N/A", + } + + def _is_strict_match( + self, + query_title: str, + page_title: str, + *, + year: Optional[str], + is_series: bool, + description: str, + extract: str, + ) -> bool: + query_base = self._normalize_title(query_title) + page_base = self._normalize_title(self._strip_parenthetical(page_title)) + if query_base != page_base: + return False + + text = f"{description} {extract}".lower() + if year: + if year not in page_title and not re.search(rf"\\b{re.escape(year)}\\b", text): + return False + + if is_series: + if "television series" not in text and "tv series" not in text and "miniseries" not in text: + return False + else: + if "film" not in text and "movie" not in text: + return False + + if "disambiguation" in text: + return False + + return True + + @staticmethod + def _strip_parenthetical(title: str) -> str: + return re.sub(r"\s*\(.*?\)\s*$", "", title or "") + + @staticmethod + def _normalize_title(title: str) -> str: + normalized = re.sub(r"[^a-z0-9]+", " ", title.lower()) + return " ".join(normalized.split()) + + @staticmethod + def _extract_year(title: str, description: str, extract: str) -> Optional[str]: + title_match = re.search(r"\b(19|20)\d{2}\b", title or "") + if title_match: + return title_match.group(0) + text = f"{description} {extract}" + match = re.search(r"\b(19|20)\d{2}\b", text) + return match.group(0) if match else None + + def _track(self, success: bool, endpoint: str, response_time_ms: int): + if not self.db_manager: + return + self.db_manager.track_api_call( + provider="wikipedia", + endpoint=endpoint, + success=success, + response_time_ms=response_time_ms, + call_count=1, + )