Files
sublogue/server/core/wikipedia_client.py
2026-01-18 23:36:29 +13:00

285 lines
9.0 KiB
Python

"""
Wikipedia API client - strict metadata fetching
Uses the MediaWiki search API and REST summary endpoint.
Strict matching avoids false positives by requiring:
- exact base title match (after normalization)
- year match when provided
- media type hints (film vs TV series)
"""
from __future__ import annotations

import asyncio
import re
import time
from typing import Dict, List, Optional
from urllib.parse import quote

import aiohttp

from logging_utils import get_logger
logger = get_logger(__name__)
class RateLimiter:
"""Simple async rate limiter."""
def __init__(self, rate_per_second: float):
self._interval = 1.0 / rate_per_second
self._lock = asyncio.Lock()
self._last_call = 0.0
async def wait(self):
async with self._lock:
now = time.monotonic()
delta = now - self._last_call
if delta < self._interval:
await asyncio.sleep(self._interval - delta)
self._last_call = time.monotonic()
class WikipediaClient:
    """Async, strict Wikipedia client.

    Finds candidate pages via the MediaWiki search API, then fetches each
    candidate's REST summary.  A candidate is returned only when it passes
    the strict matching rules (exact normalized base title, year agreement
    when a year is given, and media-type agreement), keeping false
    positives low at the cost of recall.
    """

    API_URL = "https://en.wikipedia.org/w/api.php"
    SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{title}"

    def __init__(
        self,
        *,
        max_concurrent: int = 4,
        rate_limit_per_sec: float = 2.0,
        db_manager=None,
        timeout: int = 15,
    ):
        """Create a client.

        Args:
            max_concurrent: Maximum simultaneous search operations.
            rate_limit_per_sec: Upper bound on outgoing requests per second.
            db_manager: Optional object exposing ``track_api_call`` for
                usage accounting; tracking is skipped when ``None``.
            timeout: Total per-request timeout in seconds.
        """
        self.db_manager = db_manager
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(rate_limit_per_sec)
        self._timeout = aiohttp.ClientTimeout(total=timeout)
        self._session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> aiohttp.ClientSession:
        """Return the shared HTTP session, (re)creating it if absent or closed."""
        if self._session is None or self._session.closed:
            self._session = aiohttp.ClientSession(timeout=self._timeout)
        return self._session

    async def close(self):
        """Close the underlying HTTP session if one is open."""
        if self._session and not self._session.closed:
            await self._session.close()

    async def fetch_summary(
        self,
        title: str,
        *,
        year: Optional[str] = None,
        is_series: bool = False,
        season: Optional[int] = None,
        episode: Optional[int] = None,
    ) -> Optional[dict]:
        """
        Fetch a strict Wikipedia summary match for a title.

        Wikipedia does not provide episode-level summaries in a structured way.
        If season/episode is provided, return None to avoid incorrect matches.
        """
        if season is not None or episode is not None:
            return None
        matches = await self.search_titles(
            title,
            year=year,
            is_series=is_series,
            max_results=1,
        )
        return matches[0] if matches else None

    async def search_titles(
        self,
        title: str,
        *,
        year: Optional[str] = None,
        is_series: bool = False,
        max_results: int = 5,
    ) -> List[dict]:
        """Search Wikipedia with strict filtering and return summary results.

        Returns up to ``max_results`` result dicts (see ``_build_result``);
        an empty list on empty input, no matches, or any HTTP/network error.
        """
        if not title:
            return []
        async with self.semaphore:
            await self.rate_limiter.wait()
            # Quote the title so multi-word names match as a phrase, then
            # bias the query toward the requested media type.
            query = f'intitle:"{title}"'
            if year:
                query = f'{query} {year}'
            if is_series:
                query = f"{query} television series"
            else:
                query = f"{query} film"
            params = {
                "action": "query",
                "list": "search",
                "srsearch": query,
                # Over-fetch: strict filtering below discards many hits.
                "srlimit": max_results * 2,
                "format": "json",
            }
            session = await self._get_session()
            start = time.monotonic()
            try:
                async with session.get(self.API_URL, params=params) as resp:
                    elapsed_ms = int((time.monotonic() - start) * 1000)
                    if resp.status != 200:
                        self._track(False, "/w/api.php", elapsed_ms)
                        return []
                    data = await resp.json()
                    self._track(True, "/w/api.php", elapsed_ms)
            except (asyncio.TimeoutError, aiohttp.ClientError) as e:
                logger.error("Wikipedia search error for '%s': %s", title, e)
                return []
            search_results = data.get("query", {}).get("search", [])
            if not search_results:
                return []
            results: List[dict] = []
            for item in search_results:
                page_title = item.get("title")
                if not page_title:
                    continue
                summary = await self._fetch_summary_for_title(page_title)
                if not summary:
                    continue
                description = summary.get("description") or ""
                extract = summary.get("extract") or ""
                if not self._is_strict_match(
                    title,
                    page_title,
                    year=year,
                    is_series=is_series,
                    description=description,
                    extract=extract,
                ):
                    continue
                results.append(self._build_result(summary, is_series))
                if len(results) >= max_results:
                    break
            return results

    async def _fetch_summary_for_title(self, title: str) -> Optional[dict]:
        """Fetch the REST summary for one page title; None on any failure."""
        session = await self._get_session()
        # BUG FIX: aiohttp.helpers.quote is not a public aiohttp API (it does
        # not exist in current releases).  Use urllib.parse.quote to
        # percent-encode the title for the URL path segment.
        url = self.SUMMARY_URL.format(title=quote(title, safe=""))
        start = time.monotonic()
        try:
            async with session.get(url) as resp:
                elapsed_ms = int((time.monotonic() - start) * 1000)
                if resp.status != 200:
                    self._track(False, "/page/summary", elapsed_ms)
                    return None
                data = await resp.json()
                self._track(True, "/page/summary", elapsed_ms)
        except (asyncio.TimeoutError, aiohttp.ClientError) as e:
            logger.error("Wikipedia summary error for '%s': %s", title, e)
            return None
        # Skip disambiguation pages and pages without usable plot text.
        if data.get("type") == "disambiguation":
            return None
        if not data.get("extract"):
            return None
        return data

    def _build_result(self, summary: dict, is_series: bool) -> dict:
        """Map a REST summary payload onto the app's metadata dict shape.

        Fields Wikipedia cannot supply (ratings, runtime, cast, ...) are
        filled with "N/A"/None placeholders so the shape matches other
        providers' results.
        """
        title = summary.get("title")
        description = summary.get("description") or ""
        extract = summary.get("extract") or "No plot available"
        year = self._extract_year(summary.get("title", ""), description, extract)
        poster = None
        thumbnail = summary.get("thumbnail") or {}
        if isinstance(thumbnail, dict):
            poster = thumbnail.get("source")
        return {
            "plot": extract,
            "title": title,
            "year": year or "N/A",
            "media_type": "series" if is_series else "movie",
            "imdb_rating": "N/A",
            "runtime": "N/A",
            "poster": poster,
            "imdb_id": None,
            "director": "N/A",
            "actors": "N/A",
            "released": "N/A",
            "genre": description or "N/A",
        }

    def _is_strict_match(
        self,
        query_title: str,
        page_title: str,
        *,
        year: Optional[str],
        is_series: bool,
        description: str,
        extract: str,
    ) -> bool:
        """Return True only when the page is an unambiguous match.

        Requires: equal normalized base titles; the year (when given) to
        appear in the page title or body text; and the body text to mention
        the expected media type.
        """
        query_base = self._normalize_title(query_title)
        page_base = self._normalize_title(self._strip_parenthetical(page_title))
        if query_base != page_base:
            return False
        text = f"{description} {extract}".lower()
        if year:
            # BUG FIX: the raw f-string previously spelled the word-boundary
            # anchor as "\\b" (a literal backslash + 'b'), so the year could
            # never be found in the text and valid matches were rejected.
            # r"\b" is the correct regex word-boundary anchor.
            if year not in page_title and not re.search(rf"\b{re.escape(year)}\b", text):
                return False
        if is_series:
            if (
                "television series" not in text
                and "tv series" not in text
                and "miniseries" not in text
            ):
                return False
        else:
            if "film" not in text and "movie" not in text:
                return False
        if "disambiguation" in text:
            return False
        return True

    @staticmethod
    def _strip_parenthetical(title: str) -> str:
        """Drop a trailing parenthetical qualifier, e.g. "X (film)" -> "X"."""
        return re.sub(r"\s*\(.*?\)\s*$", "", title or "")

    @staticmethod
    def _normalize_title(title: str) -> str:
        """Lowercase and collapse non-alphanumeric runs to single spaces."""
        normalized = re.sub(r"[^a-z0-9]+", " ", title.lower())
        return " ".join(normalized.split())

    @staticmethod
    def _extract_year(title: str, description: str, extract: str) -> Optional[str]:
        """Find a 19xx/20xx year, preferring the title over the body text."""
        title_match = re.search(r"\b(19|20)\d{2}\b", title or "")
        if title_match:
            return title_match.group(0)
        text = f"{description} {extract}"
        match = re.search(r"\b(19|20)\d{2}\b", text)
        return match.group(0) if match else None

    def _track(self, success: bool, endpoint: str, response_time_ms: int):
        """Record one API call via ``db_manager``, if one was provided."""
        if not self.db_manager:
            return
        self.db_manager.track_api_call(
            provider="wikipedia",
            endpoint=endpoint,
            success=success,
            response_time_ms=response_time_ms,
            call_count=1,
        )