Wikipedia support

ponzischeme89
2026-01-18 23:36:29 +13:00
parent 131b6f8d02
commit b6bf5efd88
10 changed files with 520 additions and 16 deletions
+2 -2
@@ -133,7 +133,7 @@ class ApiUsage(Base):
    __tablename__ = 'api_usage'

    id = Column(Integer, primary_key=True)
-    provider = Column(String(50), nullable=False, index=True)  # omdb, tmdb, tvmaze
+    provider = Column(String(50), nullable=False, index=True)  # omdb, tmdb, tvmaze, wikipedia
    endpoint = Column(String(200))  # Specific endpoint called
    timestamp = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
    success = Column(Boolean, default=True)
@@ -835,7 +835,7 @@ class DatabaseManager:
    def get_all_usage_stats(self):
        """Get usage statistics for all providers"""
-        providers = ['omdb', 'tmdb', 'tvmaze']
+        providers = ['omdb', 'tmdb', 'tvmaze', 'wikipedia']
        return {provider: self.get_usage_stats(provider) for provider in providers}

    # ============ SUGGESTED MATCHES OPERATIONS ============
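For reference, a minimal sketch of consuming the updated stats map, assuming only the names visible in this diff (DatabaseManager, get_all_usage_stats, get_usage_stats); the constructor arguments and the shape of each stats value are not shown here and are treated as opaque:

    db = DatabaseManager()  # construction details assumed, not shown in this diff
    stats = db.get_all_usage_stats()
    for provider in ("omdb", "tmdb", "tvmaze", "wikipedia"):
        # Each value comes from get_usage_stats(provider); its shape is opaque here.
        print(provider, stats.get(provider))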
+27 -3
@@ -1251,10 +1251,11 @@ class SubtitleProcessor:
    MAX_SRT_BYTES = 5 * 1024 * 1024
    PLOT_SCAN_LINES = 40

-    def __init__(self, omdb_client=None, tmdb_client=None, tvmaze_client=None, preferred_source="omdb"):
+    def __init__(self, omdb_client=None, tmdb_client=None, tvmaze_client=None, wikipedia_client=None, preferred_source="omdb"):
        self.omdb_client = omdb_client
        self.tmdb_client = tmdb_client
        self.tvmaze_client = tvmaze_client
+        self.wikipedia_client = wikipedia_client
        self.preferred_source = preferred_source

    async def process_file(
@@ -1557,7 +1558,7 @@ class SubtitleProcessor:
        Fetch metadata from configured sources with fallback.

        Priority:
-        1. Preferred source (omdb, tmdb, tvmaze)
+        1. Preferred source (omdb, tmdb, tvmaze, wikipedia)
        2. Fallback to other source if preferred fails

        Year validation ensures we don't match wrong movies (e.g., "Eternity 2025"
@@ -1571,6 +1572,17 @@ class SubtitleProcessor:
        tmdb_type = "tv" if is_series else "movie"

        # Try preferred source first
+        if source_preference == "wikipedia" and self.wikipedia_client:
+            result = await self.wikipedia_client.fetch_summary(
+                movie_name,
+                year=year,
+                is_series=is_series,
+                season=season,
+                episode=episode,
+            )
+            if result:
+                logger.info("Found metadata via Wikipedia: %s (%s)", result.get("title"), result.get("year"))
+                return result
        if source_preference == "tvmaze" and self.tvmaze_client and is_series:
            result = await self.tvmaze_client.fetch_summary(
                movie_name,
@@ -1631,6 +1643,18 @@ class SubtitleProcessor:
logger.info("Found metadata via TMDb (fallback): %s (%s)", result.get("title"), result.get("year"))
return result
if not result and self.wikipedia_client and source_preference != "wikipedia":
result = await self.wikipedia_client.fetch_summary(
movie_name,
year=year,
is_series=is_series,
season=season,
episode=episode,
)
if result:
logger.info("Found metadata via Wikipedia (fallback): %s (%s)", result.get("title"), result.get("year"))
return result
if not result and self.tvmaze_client and source_preference != "tvmaze" and is_series:
result = await self.tvmaze_client.fetch_summary(
movie_name,
@@ -1766,7 +1790,7 @@ class SubtitleProcessor:
        torrent/release tags like quality indicators (1080p, BluRay), codecs (x264, HEVC),
        release groups (YTS, RARBG), and subtitle ads (OpenSubtitles).

-        This ONLY affects what title is searched for on OMDb/TMDb/TVmaze.
+        This ONLY affects what title is searched for on OMDb/TMDb/TVmaze/Wikipedia.
        It does NOT modify the subtitle file content or timing in any way.

        Examples:
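For reference, a wiring sketch for the new parameter, assuming the constructor signatures shown in this diff; the omdb/tmdb/tvmaze client objects and the db manager are placeholders for whatever the application already builds:

    wikipedia = WikipediaClient(db_manager=db)  # db is a hypothetical DatabaseManager
    processor = SubtitleProcessor(
        omdb_client=omdb,
        tmdb_client=tmdb,
        tvmaze_client=tvmaze,
        wikipedia_client=wikipedia,
        preferred_source="wikipedia",  # Wikipedia first, the others as fallback
    )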
+284
@@ -0,0 +1,284 @@
"""
Wikipedia API client - strict metadata fetching
Uses the MediaWiki search API and REST summary endpoint.
Strict matching avoids false positives by requiring:
- exact base title match (after normalization)
- year match when provided
- media type hints (film vs TV series)
"""
from __future__ import annotations
import asyncio
import re
import time
from typing import Dict, List, Optional
import aiohttp
from logging_utils import get_logger
logger = get_logger(__name__)
class RateLimiter:
    """Simple async rate limiter enforcing a minimum interval between calls."""

    def __init__(self, rate_per_second: float):
        self._interval = 1.0 / rate_per_second
        self._lock = asyncio.Lock()
        self._last_call = 0.0

    async def wait(self):
        async with self._lock:
            now = time.monotonic()
            delta = now - self._last_call
            if delta < self._interval:
                await asyncio.sleep(self._interval - delta)
            self._last_call = time.monotonic()
class WikipediaClient:
    """Async, strict Wikipedia client."""

    API_URL = "https://en.wikipedia.org/w/api.php"
    SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{title}"

    def __init__(
        self,
        *,
        max_concurrent: int = 4,
        rate_limit_per_sec: float = 2.0,
        db_manager=None,
        timeout: int = 15,
    ):
        self.db_manager = db_manager
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(rate_limit_per_sec)
        self._timeout = aiohttp.ClientTimeout(total=timeout)
        self._session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> aiohttp.ClientSession:
        if self._session is None or self._session.closed:
            # Wikimedia's API etiquette asks clients to send a descriptive
            # User-Agent; the value here is a placeholder for the app's own.
            self._session = aiohttp.ClientSession(
                timeout=self._timeout,
                headers={"User-Agent": "subtitle-processor/1.0"},
            )
        return self._session

    async def close(self):
        if self._session and not self._session.closed:
            await self._session.close()
    async def fetch_summary(
        self,
        title: str,
        *,
        year: Optional[str] = None,
        is_series: bool = False,
        season: Optional[int] = None,
        episode: Optional[int] = None,
    ) -> Optional[dict]:
        """
        Fetch a strict Wikipedia summary match for a title.

        Wikipedia does not provide episode-level summaries in a structured way.
        If season/episode is provided, return None to avoid incorrect matches.
        """
        if season is not None or episode is not None:
            return None

        matches = await self.search_titles(
            title,
            year=year,
            is_series=is_series,
            max_results=1,
        )
        return matches[0] if matches else None
    async def search_titles(
        self,
        title: str,
        *,
        year: Optional[str] = None,
        is_series: bool = False,
        max_results: int = 5,
    ) -> List[dict]:
        """Search Wikipedia with strict filtering and return summary results."""
        if not title:
            return []

        async with self.semaphore:
            await self.rate_limiter.wait()

            # Bias the search toward exact-title pages of the right media type
            query = f'intitle:"{title}"'
            if year:
                query = f"{query} {year}"
            if is_series:
                query = f"{query} television series"
            else:
                query = f"{query} film"

            params = {
                "action": "query",
                "list": "search",
                "srsearch": query,
                "srlimit": max_results * 2,
                "format": "json",
            }

            session = await self._get_session()
            start = time.monotonic()
            try:
                async with session.get(self.API_URL, params=params) as resp:
                    elapsed_ms = int((time.monotonic() - start) * 1000)
                    if resp.status != 200:
                        self._track(False, "/w/api.php", elapsed_ms)
                        return []
                    data = await resp.json()
                    self._track(True, "/w/api.php", elapsed_ms)
            except (asyncio.TimeoutError, aiohttp.ClientError) as e:
                logger.error("Wikipedia search error for '%s': %s", title, e)
                return []

            search_results = data.get("query", {}).get("search", [])
            if not search_results:
                return []

            results: List[dict] = []
            for item in search_results:
                page_title = item.get("title")
                if not page_title:
                    continue
                summary = await self._fetch_summary_for_title(page_title)
                if not summary:
                    continue
                description = summary.get("description") or ""
                extract = summary.get("extract") or ""
                if not self._is_strict_match(
                    title,
                    page_title,
                    year=year,
                    is_series=is_series,
                    description=description,
                    extract=extract,
                ):
                    continue
                results.append(self._build_result(summary, is_series))
                if len(results) >= max_results:
                    break

            return results
    async def _fetch_summary_for_title(self, title: str) -> Optional[dict]:
        session = await self._get_session()
        # The REST API uses the canonical underscore form of page titles;
        # quote() comes from urllib.parse (aiohttp exposes no such helper).
        url = self.SUMMARY_URL.format(title=quote(title.replace(" ", "_"), safe=""))
        start = time.monotonic()
        try:
            async with session.get(url) as resp:
                elapsed_ms = int((time.monotonic() - start) * 1000)
                if resp.status != 200:
                    self._track(False, "/page/summary", elapsed_ms)
                    return None
                data = await resp.json()
                self._track(True, "/page/summary", elapsed_ms)
        except (asyncio.TimeoutError, aiohttp.ClientError) as e:
            logger.error("Wikipedia summary error for '%s': %s", title, e)
            return None

        # Skip disambiguation pages and empty extracts
        if data.get("type") == "disambiguation":
            return None
        if not data.get("extract"):
            return None
        return data
    def _build_result(self, summary: dict, is_series: bool) -> dict:
        title = summary.get("title")
        description = summary.get("description") or ""
        extract = summary.get("extract") or "No plot available"
        year = self._extract_year(summary.get("title", ""), description, extract)

        poster = None
        thumbnail = summary.get("thumbnail") or {}
        if isinstance(thumbnail, dict):
            poster = thumbnail.get("source")

        return {
            "plot": extract,
            "title": title,
            "year": year or "N/A",
            "media_type": "series" if is_series else "movie",
            "imdb_rating": "N/A",
            "runtime": "N/A",
            "poster": poster,
            "imdb_id": None,
            "director": "N/A",
            "actors": "N/A",
            "released": "N/A",
            "genre": description or "N/A",
        }
    def _is_strict_match(
        self,
        query_title: str,
        page_title: str,
        *,
        year: Optional[str],
        is_series: bool,
        description: str,
        extract: str,
    ) -> bool:
        query_base = self._normalize_title(query_title)
        page_base = self._normalize_title(self._strip_parenthetical(page_title))
        if query_base != page_base:
            return False

        text = f"{description} {extract}".lower()

        if year:
            # Require the year as a whole word in the title or summary text
            if year not in page_title and not re.search(rf"\b{re.escape(year)}\b", text):
                return False

        if is_series:
            if "television series" not in text and "tv series" not in text and "miniseries" not in text:
                return False
        else:
            if "film" not in text and "movie" not in text:
                return False

        if "disambiguation" in text:
            return False

        return True
    @staticmethod
    def _strip_parenthetical(title: str) -> str:
        return re.sub(r"\s*\(.*?\)\s*$", "", title or "")

    @staticmethod
    def _normalize_title(title: str) -> str:
        normalized = re.sub(r"[^a-z0-9]+", " ", title.lower())
        return " ".join(normalized.split())

    @staticmethod
    def _extract_year(title: str, description: str, extract: str) -> Optional[str]:
        title_match = re.search(r"\b(19|20)\d{2}\b", title or "")
        if title_match:
            return title_match.group(0)
        text = f"{description} {extract}"
        match = re.search(r"\b(19|20)\d{2}\b", text)
        return match.group(0) if match else None
    def _track(self, success: bool, endpoint: str, response_time_ms: int):
        if not self.db_manager:
            return
        self.db_manager.track_api_call(
            provider="wikipedia",
            endpoint=endpoint,
            success=success,
            response_time_ms=response_time_ms,
            call_count=1,
        )
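For reference, a minimal standalone sketch of the client as added above; the title and year are illustrative, and note that fetch_summary deliberately returns None for episode-level (season/episode) lookups:

    import asyncio

    async def main():
        client = WikipediaClient()  # no db_manager: API usage tracking is skipped
        try:
            result = await client.fetch_summary("Heat", year="1995")
            if result:
                print(result["title"], result["year"])
                print(result["plot"][:200])
        finally:
            await client.close()

    asyncio.run(main())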