1.0.0.7 - Improved matching, added library page. Removed scheduled scans support

This commit is contained in:
ponzischeme89
2026-01-18 22:29:51 +13:00
parent 170694bc28
commit 9251e6e837
10 changed files with 502 additions and 52 deletions
+139 -4
@@ -4,6 +4,8 @@ import logging
import os
import threading
import time
import re
from difflib import SequenceMatcher
from datetime import datetime, timezone
from pathlib import Path
@@ -14,7 +16,8 @@ from core.config_manager import ConfigManager
from core.omdb_client import OMDbClient
from core.tmdb_client import TMDbClient
from core.tvmaze_client import TVMazeClient
from core.subtitle_processor import SubtitleProcessor, SubtitleFormatOptions
from core.subtitle_processor import SubtitleProcessor, SubtitleFormatOptions, SUBLOGUE_TOKEN_PATTERN, SUBLOGUE_SENTINEL
from core.keyword_stripper import get_stripper
from core.file_scanner import FileScanner
from core.database import DatabaseManager
@@ -75,12 +78,13 @@ def perform_scheduled_scan(directory):
scan_duration_ms = int((time.time() - start_time) * 1000)
files_with_plot = sum(1 for f in files if f.get("has_plot", False))
db.add_scan_history(
scan_id = db.add_scan_history(
directory=directory,
files_found=len(files),
files_with_plot=files_with_plot,
scan_duration_ms=scan_duration_ms
)
db.add_scan_files(scan_id, files)
return {
"files_found": len(files),
@@ -246,6 +250,115 @@ def _merge_format_options(base_options: SubtitleFormatOptions, rule: dict | None
)
def _parse_library_identity(file_info: dict) -> dict:
"""Parse title, year, season, and episode from filename metadata."""
file_name = file_info.get("name", "")
title = file_info.get("title")
year = file_info.get("year")
if not title:
stripped = get_stripper().clean_filename(file_name, preserve_year=True)
title = stripped.get("cleaned_title") or Path(file_name).stem
year = year or stripped.get("year")
season = stripped.get("season")
episode = stripped.get("episode")
else:
season, episode = get_stripper().extract_season_episode(file_name)
clean_title = title or Path(file_name).stem
clean_title = clean_title.replace(SUBLOGUE_SENTINEL, "")
clean_title = re.sub(r"<[^>]+>", "", clean_title)
clean_title = SUBLOGUE_TOKEN_PATTERN.sub("", clean_title)
clean_title = re.sub(r"\b(en|eng|english|ita|it|italian|fr|es|de|multi)\b", "", clean_title, flags=re.I)
clean_title = re.sub(r'\s*-\s*copy\b', '', clean_title, flags=re.I)
clean_title = re.sub(r'\s*copy\b', '', clean_title, flags=re.I)
clean_title = re.sub(r"\((\d{4})\)\s*\(\1\)", r"(\1)", clean_title)
if year:
clean_title = re.sub(rf"\s*\({re.escape(str(year))}\)$", "", clean_title)
clean_title = " ".join(clean_title.split()).strip()
return {
"title": clean_title,
"year": year,
"season": season,
"episode": episode,
}
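# Illustrative only: a release-style name such as "Heat.1995.ENG.srt" would,
# assuming the keyword stripper recognises it, come back roughly as
# {"title": "Heat", "year": "1995", "season": None, "episode": None}.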
def _group_key(title: str, year: str | None) -> str:
base = title.strip().lower()
return f"{base} ({year})" if year else base
def _build_library_items(files: list[dict], latest_results: dict, limit: int) -> list[dict]:
"""Aggregate scan files into library items."""
grouped = {}
for file_info in files:
parsed = _parse_library_identity(file_info)
key = _group_key(parsed["title"], parsed["year"])
item = grouped.get(key)
if not item:
# Try fuzzy match to existing groups
for existing_key, existing in grouped.items():
ratio = SequenceMatcher(None, existing["title"].lower(), parsed["title"].lower()).ratio()
if ratio >= 0.88:
key = existing_key
item = existing
break
if not item:
item = grouped.setdefault(key, {
"title": parsed["title"],
"year": parsed["year"],
"files": [],
"health": {
"missing_plot": 0,
"duplicate_plot": 0,
"insufficient_gap": 0
}
})
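# From here on, per-file issues feed the group's health counters; the 0.88
# SequenceMatcher ratio above is a heuristic cut-off for folding
# near-identical titles into a single group.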
issues = []
if not file_info.get("has_plot"):
issues.append({"type": "missing_plot", "reason": "No plot detected"})
item["health"]["missing_plot"] += 1
if (file_info.get("plot_marker_count") or 0) > 1:
issues.append({"type": "duplicate_plot", "reason": "Multiple plot markers detected"})
item["health"]["duplicate_plot"] += 1
latest_result = latest_results.get(file_info.get("path"))
if latest_result and latest_result.get("status") == "Insufficient Gap":
issues.append({
"type": "insufficient_gap",
"reason": latest_result.get("error_message") or "Insufficient gap before first subtitle"
})
item["health"]["insufficient_gap"] += 1
display_name = parsed["title"]
if parsed["season"] is not None and parsed["episode"] is not None:
display_name = f"{parsed['title']} - S{parsed['season']:02d}E{parsed['episode']:02d}"
elif parsed["year"]:
display_name = f"{parsed['title']} ({parsed['year']})"
item["files"].append({
**file_info,
"display_name": display_name,
"duplicate_plot": (file_info.get("plot_marker_count") or 0) > 1,
"latest_status": latest_result.get("status") if latest_result else None,
"latest_error": latest_result.get("error_message") if latest_result else None,
"issues": issues
})
items = list(grouped.values())
items.sort(
key=lambda entry: (
entry["health"]["missing_plot"]
+ entry["health"]["duplicate_plot"]
+ entry["health"]["insufficient_gap"]
),
reverse=True
)
return items[:limit]
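# Items with the most health issues (missing/duplicate plots, insufficient
# gaps) sort first, then the list is truncated to the caller-supplied limit.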
def get_format_options_from_settings() -> SubtitleFormatOptions:
"""Load subtitle formatting options from database settings."""
return SubtitleFormatOptions(
@@ -418,12 +531,13 @@ def start_scan():
files_with_plot = sum(1 for f in files if f.get("has_plot", False))
# Save scan history to database
db.add_scan_history(
scan_id = db.add_scan_history(
directory=directory,
files_found=len(files),
files_with_plot=files_with_plot,
scan_duration_ms=scan_duration_ms
)
db.add_scan_files(scan_id, files)
# Load existing suggested matches for this directory
suggested_matches = db.get_suggested_matches_for_directory(directory)
@@ -529,12 +643,13 @@ def stream_scan():
files_with_plot = sum(1 for f in all_files if f.get("has_plot", False))
# Save scan history to database
db.add_scan_history(
scan_id = db.add_scan_history(
directory=directory,
files_found=len(all_files),
files_with_plot=files_with_plot,
scan_duration_ms=scan_duration_ms
)
db.add_scan_files(scan_id, all_files)
# Load existing suggested matches
logger.info("Loading suggested matches from database...")
@@ -1520,6 +1635,26 @@ def get_scan_history():
}), 500
@app.route('/api/library', methods=['GET'])
def get_library_report():
"""Get library health report with scan files and issue summaries"""
try:
limit = request.args.get('limit', 200, type=int)
latest_files = db.get_latest_scan_files()
latest_results = db.get_latest_file_results()
return jsonify({
"success": True,
"items": _build_library_items(latest_files, latest_results, limit)
})
except Exception as e:
logger.error(f"Error fetching library report: {e}")
return jsonify({
"success": False,
"error": str(e)
}), 500
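# Response shape sketch (fields abridged): GET /api/library?limit=50 ->
# {"success": true, "items": [{"title": "...", "year": "...", "health": {...}, "files": [...]}]}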
@app.route('/api/statistics', methods=['GET'])
def get_statistics():
"""Get overall statistics"""
+129 -1
@@ -4,7 +4,7 @@ Handles persistent storage for settings, runs, and history
"""
from datetime import datetime
from pathlib import Path
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, Float, Text, ForeignKey
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, Float, Text, ForeignKey, text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship, scoped_session
import json
@@ -85,6 +85,28 @@ class ScanHistory(Base):
return f"<ScanHistory(id={self.id}, directory='{self.directory}', files_found={self.files_found})>"
class ScanFile(Base):
"""Scan files table - stores file details per scan"""
__tablename__ = 'scan_files'
id = Column(Integer, primary_key=True)
scan_id = Column(Integer, ForeignKey('scan_history.id'), nullable=False, index=True)
file_path = Column(String(500), nullable=False, index=True)
file_name = Column(String(255), nullable=False)
title = Column(String(255))
year = Column(String(10))
has_plot = Column(Boolean, default=False)
plot_marker_count = Column(Integer, default=0)
status = Column(String(100))
summary = Column(Text)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
scan = relationship("ScanHistory")
def __repr__(self):
return f"<ScanFile(id={self.id}, file_name='{self.file_name}', has_plot={self.has_plot})>"
class ScheduledScan(Base):
"""Scheduled scans table - stores scheduled scan jobs and results"""
__tablename__ = 'scheduled_scans'
@@ -172,12 +194,32 @@ class DatabaseManager:
# Create tables if they don't exist
Base.metadata.create_all(self.engine)
self._ensure_scan_files_schema()
logger.info(f"Database initialized at {self.db_path}")
def get_session(self):
"""Get a new database session"""
return self.Session()
def _ensure_scan_files_schema(self):
"""Ensure scan_files table has newer columns in existing databases."""
session = self.get_session()
try:
columns = session.execute(text("PRAGMA table_info(scan_files)")).fetchall()
if not columns:
return
existing = {row[1] for row in columns} # column name is index 1
if "title" not in existing:
session.execute(text("ALTER TABLE scan_files ADD COLUMN title VARCHAR(255)"))
if "year" not in existing:
session.execute(text("ALTER TABLE scan_files ADD COLUMN year VARCHAR(10)"))
session.commit()
except Exception as e:
session.rollback()
logger.error(f"Error migrating scan_files schema: {e}")
finally:
session.close()
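# Lightweight, SQLite-specific migration: PRAGMA table_info reports the
# existing columns, and only the additive ALTER TABLE ... ADD COLUMN
# statements are run, so older databases gain title/year without a full
# migration framework.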
def close_session(self):
"""Close the session"""
self.Session.remove()
@@ -387,6 +429,7 @@ class DatabaseManager:
session.add(scan)
session.commit()
logger.info(f"Scan history saved for {directory}")
return scan.id
except Exception as e:
session.rollback()
logger.error(f"Error saving scan history: {e}")
@@ -416,6 +459,91 @@ class DatabaseManager:
finally:
session.close()
def add_scan_files(self, scan_id, files):
"""Persist file details for a scan"""
session = self.get_session()
try:
for file_info in files:
session.add(ScanFile(
scan_id=scan_id,
file_path=file_info.get("path"),
file_name=file_info.get("name"),
title=file_info.get("title"),
year=file_info.get("year"),
has_plot=bool(file_info.get("has_plot")),
plot_marker_count=int(file_info.get("plot_marker_count") or 0),
status=file_info.get("status"),
summary=file_info.get("summary", "")
))
session.commit()
except Exception as e:
session.rollback()
logger.error(f"Error saving scan files: {e}")
raise
finally:
session.close()
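# Rows are keyed by scan_id, so get_scan_files() can rebuild a past scan and
# get_latest_scan_files() can pick the newest row per path across scans.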
def get_scan_files(self, scan_id):
"""Get all files for a scan"""
session = self.get_session()
try:
files = session.query(ScanFile).filter_by(scan_id=scan_id).all()
return [
{
"path": f.file_path,
"name": f.file_name,
"title": f.title,
"year": f.year,
"has_plot": f.has_plot,
"plot_marker_count": f.plot_marker_count,
"status": f.status,
"summary": f.summary
}
for f in files
]
finally:
session.close()
def get_latest_scan_files(self):
"""Get latest scan entry per file path"""
session = self.get_session()
try:
files = session.query(ScanFile).order_by(ScanFile.created_at.desc()).all()
latest = {}
for file_entry in files:
if file_entry.file_path in latest:
continue
latest[file_entry.file_path] = {
"path": file_entry.file_path,
"name": file_entry.file_name,
"title": file_entry.title,
"year": file_entry.year,
"has_plot": file_entry.has_plot,
"plot_marker_count": file_entry.plot_marker_count,
"status": file_entry.status,
"summary": file_entry.summary
}
return list(latest.values())
finally:
session.close()
def get_latest_file_results(self):
"""Get latest processing result per file"""
session = self.get_session()
try:
results = session.query(FileResult).order_by(FileResult.processed_at.desc()).all()
latest = {}
for result in results:
if result.file_path not in latest:
latest[result.file_path] = {
"status": result.status,
"error_message": result.error_message,
"processed_at": result.processed_at.isoformat() if result.processed_at else None
}
return latest
finally:
session.close()
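# Both "latest" helpers above order by timestamp descending and keep the
# first row seen per path, so each file maps to its most recent entry.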
# ============ SCHEDULED SCAN OPERATIONS ============
def create_scheduled_scan(self, directory, scheduled_for):
+19 -18
@@ -24,7 +24,7 @@ logger.addHandler(handler)
import sys
sys.path.insert(0, str(Path(__file__).parent))
from subtitle_processor import parse_srt
from subtitle_processor import parse_srt, SUBLOGUE_SENTINEL, SUBLOGUE_TOKEN_PATTERN
class FileScanner:
@@ -114,7 +114,8 @@ class FileScanner:
# --------------------------------------------
try:
has_plot = cls._check_has_plot(file_path)
plot_marker_count = cls._count_plot_markers(file_path)
has_plot = plot_marker_count > 0
logger.debug(
"Plot check for %s: %s",
file_path.name,
@@ -148,6 +149,8 @@ class FileScanner:
"path": str(file_path),
"name": file_path.name,
"has_plot": has_plot,
"plot_marker_count": plot_marker_count,
"duplicate_plot": plot_marker_count > 1,
"status": "Has Plot" if has_plot else "Not Loaded",
"summary": metadata.get("summary", ""),
"plot": metadata.get("summary", ""),
@@ -216,30 +219,25 @@ class FileScanner:
)
@classmethod
def _check_has_plot(cls, file_path: Path) -> bool:
def _count_plot_markers(cls, file_path: Path) -> int:
"""
Check first N lines for Sublogue signature.
Count Sublogue plot markers to detect duplicates.
"""
logger.debug("Scanning for plot marker in %s", file_path.name)
logger.debug("Scanning for plot markers in %s", file_path.name)
try:
with file_path.open("r", encoding="utf-8", errors="ignore") as f:
for i, line in enumerate(f):
if i >= cls.PLOT_SCAN_LINES:
break
if "generated by sublogue" in line.lower():
logger.debug(
"Plot marker found in %s (line %d)",
file_path.name, i + 1
)
return True
content = file_path.read_text(encoding="utf-8", errors="ignore")
lower_content = content.lower()
generated_count = lower_content.count("generated by sublogue")
if generated_count > 0:
return generated_count
return content.count(SUBLOGUE_SENTINEL)
except Exception as e:
logger.error(
"Error reading file during plot scan: %s (%s)",
file_path, e
)
return False
return 0
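# Counts the human-readable "generated by sublogue" credit first and only
# falls back to the SUBLOGUE_SENTINEL token, so files carrying either marker
# are reported; a count above 1 is what flags duplicate_plot upstream.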
@classmethod
def _extract_metadata(cls, file_path: Path) -> Dict:
@@ -270,6 +268,7 @@ class FileScanner:
plot_text = blocks[1].text
plot_text = plot_text.split("Generated by Sublogue")[0].strip()
plot_text = SUBLOGUE_TOKEN_PATTERN.sub("", plot_text).strip()
metadata["summary"] = plot_text
# --------------------------------------------
@@ -279,7 +278,9 @@ class FileScanner:
header_lines = blocks[0].text.split("\n")
if header_lines:
first_line = header_lines[0]
first_line = header_lines[0].strip()
if first_line == SUBLOGUE_SENTINEL and len(header_lines) > 1:
first_line = header_lines[1].strip()
year_match = re.search(r"\((\d{4})\)", first_line)
if year_match:
metadata["year"] = year_match.group(1)
+8
@@ -1463,6 +1463,14 @@ class SubtitleProcessor:
)
# ─────────────────────────────────────────────────────────────
if insertion_position != "end" and not intro_blocks:
return {
"success": False,
"error": "Insufficient gap before first subtitle",
"status": "Insufficient Gap",
"summary": ""
}
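# The "Insufficient Gap" status returned here is the same value the library
# report checks for, so a file whose first subtitle leaves no room for the
# intro is surfaced as an issue rather than silently skipped.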
# PHASE 5: Combine intro + original subtitles
#
# NOTE: We're ONLY renumbering indices (1, 2, 3...), NOT timestamps!