Wikipedia support
This commit is contained in:
+43
-6
@@ -15,6 +15,7 @@ from core.config_manager import ConfigManager
|
||||
from core.omdb_client import OMDbClient
|
||||
from core.tmdb_client import TMDbClient
|
||||
from core.tvmaze_client import TVMazeClient
|
||||
from core.wikipedia_client import WikipediaClient
|
||||
from core.subtitle_processor import SubtitleProcessor, SubtitleFormatOptions, SUBLOGUE_TOKEN_PATTERN, SUBLOGUE_SENTINEL
|
||||
from core.keyword_stripper import get_stripper
|
||||
from core.file_scanner import FileScanner
|
||||
@@ -36,6 +37,7 @@ db = DatabaseManager()
|
||||
omdb_client = None
|
||||
tmdb_client = None
|
||||
tvmaze_client = None
|
||||
wikipedia_client = None
|
||||
processor = None
|
||||
|
||||
# In-memory scan state (still used for current session)
|
||||
@@ -129,7 +131,7 @@ def start_scheduled_scan_worker():
|
||||
|
||||
def initialize_clients():
|
||||
"""Initialize OMDb, TMDb, TVmaze clients and processor with current API keys"""
|
||||
global omdb_client, tmdb_client, tvmaze_client, processor
|
||||
global omdb_client, tmdb_client, tvmaze_client, wikipedia_client, processor
|
||||
|
||||
# Load OMDb API key
|
||||
omdb_key = _get_str_setting("omdb_api_key", "")
|
||||
@@ -144,6 +146,7 @@ def initialize_clients():
|
||||
omdb_enabled = _get_bool_setting("omdb_enabled", False)
|
||||
tmdb_enabled = _get_bool_setting("tmdb_enabled", False)
|
||||
tvmaze_enabled = _get_bool_setting("tvmaze_enabled", False)
|
||||
wikipedia_enabled = _get_bool_setting("wikipedia_enabled", False)
|
||||
preferred_source = _get_str_setting("preferred_source", "omdb")
|
||||
|
||||
# Initialize clients with db_manager for usage tracking
|
||||
@@ -166,12 +169,20 @@ def initialize_clients():
|
||||
tvmaze_client = None
|
||||
logger.info("TVmaze integration disabled")
|
||||
|
||||
if wikipedia_enabled:
|
||||
wikipedia_client = WikipediaClient(db_manager=db)
|
||||
logger.info("Wikipedia client initialized with usage tracking")
|
||||
else:
|
||||
wikipedia_client = None
|
||||
logger.info("Wikipedia integration disabled")
|
||||
|
||||
# Initialize processor with available clients
|
||||
if omdb_client or tmdb_client or tvmaze_client:
|
||||
if omdb_client or tmdb_client or tvmaze_client or wikipedia_client:
|
||||
processor = SubtitleProcessor(
|
||||
omdb_client,
|
||||
tmdb_client,
|
||||
tvmaze_client,
|
||||
wikipedia_client,
|
||||
preferred_source=preferred_source,
|
||||
)
|
||||
logger.info("Processor initialized")
|
||||
@@ -406,6 +417,8 @@ def get_settings():
|
||||
settings["tmdb_enabled"] = False
|
||||
if "tvmaze_enabled" not in settings:
|
||||
settings["tvmaze_enabled"] = False
|
||||
if "wikipedia_enabled" not in settings:
|
||||
settings["wikipedia_enabled"] = False
|
||||
|
||||
# Subtitle formatting settings
|
||||
if "subtitle_title_bold" not in settings:
|
||||
@@ -460,6 +473,8 @@ def update_settings():
|
||||
db.set_setting("tmdb_enabled", bool(data["tmdb_enabled"]))
|
||||
if "tvmaze_enabled" in data:
|
||||
db.set_setting("tvmaze_enabled", bool(data["tvmaze_enabled"]))
|
||||
if "wikipedia_enabled" in data:
|
||||
db.set_setting("wikipedia_enabled", bool(data["wikipedia_enabled"]))
|
||||
|
||||
# Subtitle formatting settings
|
||||
if "subtitle_title_bold" in data:
|
||||
@@ -989,7 +1004,7 @@ def clear_caches():
|
||||
|
||||
@app.route('/api/search', methods=['POST'])
|
||||
def search_title():
|
||||
"""Search for title matches from OMDb/TMDb
|
||||
"""Search for title matches from OMDb/TMDb/Wikipedia
|
||||
|
||||
Query params:
|
||||
query: The search query (required)
|
||||
@@ -1007,7 +1022,7 @@ def search_title():
|
||||
"error": "No search query provided"
|
||||
}), 400
|
||||
|
||||
if not omdb_client and not tmdb_client:
|
||||
if not omdb_client and not tmdb_client and not wikipedia_client:
|
||||
return jsonify({
|
||||
"success": False,
|
||||
"error": "API not configured"
|
||||
@@ -1017,6 +1032,26 @@ def search_title():
|
||||
language = data.get("language")
|
||||
|
||||
results = []
|
||||
if preferred_source == "wikipedia" and wikipedia_client:
|
||||
try:
|
||||
import asyncio
|
||||
|
||||
title, year = SubtitleProcessor.extract_title_and_year(query, strip_keywords=True)
|
||||
results = asyncio.run(
|
||||
wikipedia_client.search_titles(
|
||||
title,
|
||||
year=year,
|
||||
is_series=False,
|
||||
max_results=5 if mode == "full" else 1,
|
||||
)
|
||||
)
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"results": results
|
||||
})
|
||||
except Exception as e:
|
||||
logger.error(f"Error searching Wikipedia: {e}")
|
||||
|
||||
if preferred_source == "tmdb" and tmdb_client:
|
||||
try:
|
||||
import aiohttp
|
||||
@@ -1567,17 +1602,19 @@ def health_check():
|
||||
omdb_enabled = bool(db.get_setting("omdb_enabled", False))
|
||||
tmdb_enabled = bool(db.get_setting("tmdb_enabled", False))
|
||||
tvmaze_enabled = bool(db.get_setting("tvmaze_enabled", False))
|
||||
wikipedia_enabled = bool(db.get_setting("wikipedia_enabled", False))
|
||||
omdb_configured = bool(db.get_setting("omdb_api_key") or db.get_setting("api_key"))
|
||||
tmdb_configured = bool(db.get_setting("tmdb_api_key"))
|
||||
|
||||
return jsonify({
|
||||
"status": "ok",
|
||||
"api_key_configured": (omdb_enabled and omdb_configured) or (tmdb_enabled and tmdb_configured) or tvmaze_enabled,
|
||||
"api_key_configured": (omdb_enabled and omdb_configured) or (tmdb_enabled and tmdb_configured) or tvmaze_enabled or wikipedia_enabled,
|
||||
"omdb_configured": omdb_configured,
|
||||
"tmdb_configured": tmdb_configured,
|
||||
"omdb_enabled": omdb_enabled,
|
||||
"tmdb_enabled": tmdb_enabled,
|
||||
"tvmaze_enabled": tvmaze_enabled
|
||||
"tvmaze_enabled": tvmaze_enabled,
|
||||
"wikipedia_enabled": wikipedia_enabled
|
||||
})
|
||||
|
||||
|
||||
|
||||
@@ -133,7 +133,7 @@ class ApiUsage(Base):
|
||||
__tablename__ = 'api_usage'
|
||||
|
||||
id = Column(Integer, primary_key=True)
|
||||
provider = Column(String(50), nullable=False, index=True) # omdb, tmdb, tvmaze
|
||||
provider = Column(String(50), nullable=False, index=True) # omdb, tmdb, tvmaze, wikipedia
|
||||
endpoint = Column(String(200)) # Specific endpoint called
|
||||
timestamp = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
|
||||
success = Column(Boolean, default=True)
|
||||
@@ -835,7 +835,7 @@ class DatabaseManager:
|
||||
|
||||
def get_all_usage_stats(self):
    """Return usage statistics for every known metadata provider.

    Returns:
        dict mapping provider name -> the stats dict produced by
        self.get_usage_stats(provider).
    """
    # Keep this list in sync with the providers written to ApiUsage.provider.
    # (The pre-Wikipedia list was dead code left by the diff and is removed.)
    providers = ['omdb', 'tmdb', 'tvmaze', 'wikipedia']
    return {provider: self.get_usage_stats(provider) for provider in providers}
|
||||
|
||||
# ============ SUGGESTED MATCHES OPERATIONS ============
|
||||
|
||||
@@ -1251,10 +1251,11 @@ class SubtitleProcessor:
|
||||
MAX_SRT_BYTES = 5 * 1024 * 1024
|
||||
PLOT_SCAN_LINES = 40
|
||||
|
||||
def __init__(self, omdb_client=None, tmdb_client=None, tvmaze_client=None, wikipedia_client=None, preferred_source="omdb"):
    """Store the configured metadata clients and the lookup preference.

    Args:
        omdb_client: Optional OMDb client instance.
        tmdb_client: Optional TMDb client instance.
        tvmaze_client: Optional TVmaze client instance.
        wikipedia_client: Optional Wikipedia client instance (new; defaults
            to None, so existing positional callers are unaffected).
        preferred_source: Which provider to query first ("omdb", "tmdb",
            "tvmaze", or "wikipedia").
    """
    # The diff rendering left the pre-Wikipedia `def __init__` line in place
    # above the new one; only the new signature is kept here.
    self.omdb_client = omdb_client
    self.tmdb_client = tmdb_client
    self.tvmaze_client = tvmaze_client
    self.wikipedia_client = wikipedia_client
    self.preferred_source = preferred_source
|
||||
|
||||
async def process_file(
|
||||
@@ -1557,7 +1558,7 @@ class SubtitleProcessor:
|
||||
Fetch metadata from configured sources with fallback.
|
||||
|
||||
Priority:
|
||||
1. Preferred source (omdb, tmdb, tvmaze)
|
||||
1. Preferred source (omdb, tmdb, tvmaze, wikipedia)
|
||||
2. Fallback to other source if preferred fails
|
||||
|
||||
Year validation ensures we don't match wrong movies (e.g., "Eternity 2025"
|
||||
@@ -1571,6 +1572,17 @@ class SubtitleProcessor:
|
||||
tmdb_type = "tv" if is_series else "movie"
|
||||
|
||||
# Try preferred source first
|
||||
if source_preference == "wikipedia" and self.wikipedia_client:
|
||||
result = await self.wikipedia_client.fetch_summary(
|
||||
movie_name,
|
||||
year=year,
|
||||
is_series=is_series,
|
||||
season=season,
|
||||
episode=episode,
|
||||
)
|
||||
if result:
|
||||
logger.info("Found metadata via Wikipedia: %s (%s)", result.get("title"), result.get("year"))
|
||||
return result
|
||||
if source_preference == "tvmaze" and self.tvmaze_client and is_series:
|
||||
result = await self.tvmaze_client.fetch_summary(
|
||||
movie_name,
|
||||
@@ -1631,6 +1643,18 @@ class SubtitleProcessor:
|
||||
logger.info("Found metadata via TMDb (fallback): %s (%s)", result.get("title"), result.get("year"))
|
||||
return result
|
||||
|
||||
if not result and self.wikipedia_client and source_preference != "wikipedia":
|
||||
result = await self.wikipedia_client.fetch_summary(
|
||||
movie_name,
|
||||
year=year,
|
||||
is_series=is_series,
|
||||
season=season,
|
||||
episode=episode,
|
||||
)
|
||||
if result:
|
||||
logger.info("Found metadata via Wikipedia (fallback): %s (%s)", result.get("title"), result.get("year"))
|
||||
return result
|
||||
|
||||
if not result and self.tvmaze_client and source_preference != "tvmaze" and is_series:
|
||||
result = await self.tvmaze_client.fetch_summary(
|
||||
movie_name,
|
||||
@@ -1766,7 +1790,7 @@ class SubtitleProcessor:
|
||||
torrent/release tags like quality indicators (1080p, BluRay), codecs (x264, HEVC),
|
||||
release groups (YTS, RARBG), and subtitle ads (OpenSubtitles).
|
||||
|
||||
This ONLY affects what title is searched for on OMDb/TMDb/TVmaze.
|
||||
This ONLY affects what title is searched for on OMDb/TMDb/TVmaze/Wikipedia.
|
||||
It does NOT modify the subtitle file content or timing in any way.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -0,0 +1,284 @@
|
||||
"""
|
||||
Wikipedia API client - strict metadata fetching
|
||||
|
||||
Uses the MediaWiki search API and REST summary endpoint.
|
||||
Strict matching avoids false positives by requiring:
|
||||
- exact base title match (after normalization)
|
||||
- year match when provided
|
||||
- media type hints (film vs TV series)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import time
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import aiohttp
|
||||
|
||||
from logging_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class RateLimiter:
    """Serializes callers so at most ``rate_per_second`` calls proceed per second."""

    def __init__(self, rate_per_second: float):
        # Minimum spacing between consecutive calls, in seconds.
        self._interval = 1.0 / rate_per_second
        self._lock = asyncio.Lock()
        self._last_call = 0.0

    async def wait(self):
        """Block until at least one interval has elapsed since the last call."""
        async with self._lock:
            remaining = self._interval - (time.monotonic() - self._last_call)
            if remaining > 0:
                await asyncio.sleep(remaining)
            self._last_call = time.monotonic()
|
||||
|
||||
|
||||
class WikipediaClient:
    """Async Wikipedia client with strict title matching.

    Searches pages via the MediaWiki action API, then validates each
    candidate against the REST summary endpoint.  A candidate is returned
    only when the normalized base title matches exactly, the year (when
    supplied) appears in the page title or summary text, and the summary
    text agrees with the requested media type (film vs TV series).  This
    trades recall for a low false-positive rate.
    """

    API_URL = "https://en.wikipedia.org/w/api.php"
    SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{title}"

    def __init__(
        self,
        *,
        max_concurrent: int = 4,
        rate_limit_per_sec: float = 2.0,
        db_manager=None,
        timeout: int = 15,
    ):
        """
        Args:
            max_concurrent: Upper bound on simultaneous in-flight requests.
            rate_limit_per_sec: Maximum request rate against Wikipedia.
            db_manager: Optional manager used to record API usage stats.
            timeout: Total per-request timeout, in seconds.
        """
        self.db_manager = db_manager
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(rate_limit_per_sec)
        self._timeout = aiohttp.ClientTimeout(total=timeout)
        self._session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> aiohttp.ClientSession:
        """Return the shared HTTP session, recreating it if closed."""
        if self._session is None or self._session.closed:
            self._session = aiohttp.ClientSession(timeout=self._timeout)
        return self._session

    async def close(self):
        """Close the underlying HTTP session (idempotent)."""
        if self._session and not self._session.closed:
            await self._session.close()

    async def fetch_summary(
        self,
        title: str,
        *,
        year: Optional[str] = None,
        is_series: bool = False,
        season: Optional[int] = None,
        episode: Optional[int] = None,
    ) -> Optional[dict]:
        """
        Fetch a strict Wikipedia summary match for a title.

        Wikipedia does not provide episode-level summaries in a structured
        way, so if season/episode is provided this returns None rather than
        risk an incorrect series-level match.

        Returns:
            A result dict (see _build_result) or None when no strict match.
        """
        if season is not None or episode is not None:
            return None

        matches = await self.search_titles(
            title,
            year=year,
            is_series=is_series,
            max_results=1,
        )
        return matches[0] if matches else None

    async def search_titles(
        self,
        title: str,
        *,
        year: Optional[str] = None,
        is_series: bool = False,
        max_results: int = 5,
    ) -> List[dict]:
        """Search Wikipedia with strict filtering and return summary results.

        Args:
            title: Base title to search for (already stripped of tags).
            year: Release year used to disambiguate, when known.
            is_series: Whether the caller expects a TV-series page.
            max_results: Maximum number of validated results to return.
        """
        if not title:
            return []

        # Bias the full-text search toward the expected media type and year.
        query = f'intitle:"{title}"'
        if year:
            query = f'{query} {year}'
        if is_series:
            query = f"{query} television series"
        else:
            query = f"{query} film"

        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            # Over-fetch so the strict filter still has candidates to reject.
            "srlimit": max_results * 2,
            "format": "json",
        }

        session = await self._get_session()

        # Hold the semaphore across the request so max_concurrent actually
        # bounds in-flight requests; previously it was released right after
        # the rate-limiter wait and bounded nothing.
        async with self.semaphore:
            await self.rate_limiter.wait()
            start = time.monotonic()
            try:
                async with session.get(self.API_URL, params=params) as resp:
                    elapsed_ms = int((time.monotonic() - start) * 1000)
                    if resp.status != 200:
                        self._track(False, "/w/api.php", elapsed_ms)
                        return []
                    data = await resp.json()
                    self._track(True, "/w/api.php", elapsed_ms)
            except (asyncio.TimeoutError, aiohttp.ClientError) as e:
                logger.error("Wikipedia search error for '%s': %s", title, e)
                return []

        search_results = data.get("query", {}).get("search", [])
        if not search_results:
            return []

        results: List[dict] = []
        for item in search_results:
            page_title = item.get("title")
            if not page_title:
                continue

            summary = await self._fetch_summary_for_title(page_title)
            if not summary:
                continue

            description = summary.get("description") or ""
            extract = summary.get("extract") or ""

            if not self._is_strict_match(
                title,
                page_title,
                year=year,
                is_series=is_series,
                description=description,
                extract=extract,
            ):
                continue

            results.append(self._build_result(summary, is_series))
            if len(results) >= max_results:
                break

        return results

    async def _fetch_summary_for_title(self, title: str) -> Optional[dict]:
        """Fetch the REST summary for an exact page title.

        Returns None on HTTP/network errors, disambiguation pages, or
        pages without an extract.
        """
        # urllib is the supported way to percent-encode a path segment;
        # aiohttp.helpers exposes no public quote() (the original raised
        # AttributeError here at runtime).
        from urllib.parse import quote

        session = await self._get_session()
        url = self.SUMMARY_URL.format(title=quote(title, safe=""))

        # Summary calls hit Wikipedia too, so apply the same throttling.
        async with self.semaphore:
            await self.rate_limiter.wait()
            start = time.monotonic()
            try:
                async with session.get(url) as resp:
                    elapsed_ms = int((time.monotonic() - start) * 1000)
                    if resp.status != 200:
                        self._track(False, "/page/summary", elapsed_ms)
                        return None
                    data = await resp.json()
                    self._track(True, "/page/summary", elapsed_ms)
            except (asyncio.TimeoutError, aiohttp.ClientError) as e:
                logger.error("Wikipedia summary error for '%s': %s", title, e)
                return None

        # Skip disambiguation pages and empty extracts.
        if data.get("type") == "disambiguation":
            return None
        if not data.get("extract"):
            return None
        return data

    def _build_result(self, summary: dict, is_series: bool) -> dict:
        """Convert a REST summary payload into the processor's result shape.

        Fields Wikipedia cannot supply (ratings, runtime, cast, ...) are
        filled with "N/A"/None so downstream code sees a uniform schema.
        """
        title = summary.get("title")
        description = summary.get("description") or ""
        extract = summary.get("extract") or "No plot available"
        year = self._extract_year(summary.get("title", ""), description, extract)
        poster = None
        thumbnail = summary.get("thumbnail") or {}
        if isinstance(thumbnail, dict):
            poster = thumbnail.get("source")

        return {
            "plot": extract,
            "title": title,
            "year": year or "N/A",
            "media_type": "series" if is_series else "movie",
            "imdb_rating": "N/A",
            "runtime": "N/A",
            "poster": poster,
            "imdb_id": None,
            "director": "N/A",
            "actors": "N/A",
            "released": "N/A",
            "genre": description or "N/A",
        }

    def _is_strict_match(
        self,
        query_title: str,
        page_title: str,
        *,
        year: Optional[str],
        is_series: bool,
        description: str,
        extract: str,
    ) -> bool:
        """Return True when the page is an unambiguous match for the query."""
        query_base = self._normalize_title(query_title)
        page_base = self._normalize_title(self._strip_parenthetical(page_title))
        if query_base != page_base:
            return False

        text = f"{description} {extract}".lower()
        if year:
            # Accept the year in the page title ("Heat (1995 film)") or as a
            # standalone word in the summary text.  Note \b (word boundary):
            # the previous pattern's \\b matched a literal backslash-b, so
            # the text check could never succeed.
            if year not in page_title and not re.search(rf"\b{re.escape(year)}\b", text):
                return False

        if is_series:
            if "television series" not in text and "tv series" not in text and "miniseries" not in text:
                return False
        else:
            if "film" not in text and "movie" not in text:
                return False

        if "disambiguation" in text:
            return False

        return True

    @staticmethod
    def _strip_parenthetical(title: str) -> str:
        """Drop a trailing parenthetical, e.g. "Heat (1995 film)" -> "Heat"."""
        return re.sub(r"\s*\(.*?\)\s*$", "", title or "")

    @staticmethod
    def _normalize_title(title: str) -> str:
        """Lowercase and collapse all non-alphanumeric runs to single spaces."""
        normalized = re.sub(r"[^a-z0-9]+", " ", title.lower())
        return " ".join(normalized.split())

    @staticmethod
    def _extract_year(title: str, description: str, extract: str) -> Optional[str]:
        """Best-effort 4-digit year (19xx/20xx), preferring the page title."""
        title_match = re.search(r"\b(19|20)\d{2}\b", title or "")
        if title_match:
            return title_match.group(0)
        text = f"{description} {extract}"
        match = re.search(r"\b(19|20)\d{2}\b", text)
        return match.group(0) if match else None

    def _track(self, success: bool, endpoint: str, response_time_ms: int):
        """Record one API call in the usage table (no-op without db_manager)."""
        if not self.db_manager:
            return
        self.db_manager.track_api_call(
            provider="wikipedia",
            endpoint=endpoint,
            success=success,
            response_time_ms=response_time_ms,
            call_count=1,
        )
|
||||
Reference in New Issue
Block a user