1.1.0 - automations, clean-only mode, bug fixes

ponzischeme89
2026-01-19 02:10:08 +13:00
parent 93e8b38e24
commit 9345ac4331
25 changed files with 2690 additions and 499 deletions
+289 -20
@@ -1,9 +1,12 @@
import asyncio
import atexit
import json
import os
import threading
import time
import re
import uuid
import signal
from difflib import SequenceMatcher
from datetime import datetime, timezone
from pathlib import Path
@@ -18,9 +21,12 @@ from core.tvmaze_client import TVMazeClient
from core.wikipedia_client import WikipediaClient
from core.subtitle_processor import SubtitleProcessor, SubtitleFormatOptions, SUBLOGUE_TOKEN_PATTERN, SUBLOGUE_SENTINEL
from core.keyword_stripper import get_stripper
from core.file_scanner import FileScanner
from core.database import DatabaseManager
from logging_utils import configure_logging, get_logger
from automations.engine import AutomationEngine
from apscheduler.triggers.cron import CronTrigger
# Configure logging
configure_logging()
@@ -39,6 +45,7 @@ tmdb_client = None
tvmaze_client = None
wikipedia_client = None
processor = None
automation_engine = None
# In-memory scan state (still used for current session)
scan_state = {
@@ -71,7 +78,11 @@ def perform_scheduled_scan(directory):
start_time = time.time()
files = []
for batch in FileScanner.scan_directory(directory, batch_size=10):
for batch in FileScanner.scan_directory(
directory,
batch_size=10,
detect_cleanup_keywords=True,
):
files.extend(batch)
scan_duration_ms = int((time.time() - start_time) * 1000)
@@ -129,6 +140,31 @@ def start_scheduled_scan_worker():
scheduled_scan_thread.start()
def start_automation_engine():
"""Start automation scheduler once."""
global automation_engine
if automation_engine is None:
automation_engine = AutomationEngine(db)
automation_engine.start()
def stop_automation_engine():
"""Stop automation scheduler."""
if automation_engine:
automation_engine.shutdown()
def start_automation_engine_async():
"""Start automation scheduler in a background thread."""
def _run():
try:
start_automation_engine()
except Exception as e:
logger.exception("Automation engine failed to start: %s", e)
threading.Thread(target=_run, daemon=True).start()
def initialize_clients():
"""Initialize OMDb, TMDb, TVmaze clients and processor with current API keys"""
global omdb_client, tmdb_client, tvmaze_client, wikipedia_client, processor
@@ -189,6 +225,8 @@ def initialize_clients():
else:
logger.warning("No metadata providers configured")
_apply_cleanup_keywords_from_settings()
# Migrate existing settings to database on startup
def migrate_settings():
@@ -220,6 +258,27 @@ def _get_str_setting(key: str, default: str) -> str:
return default
return str(value)
def _normalize_keyword_list(value) -> list[str]:
if value is None:
return []
if isinstance(value, list):
return [str(v).strip() for v in value if str(v).strip()]
if isinstance(value, str):
parts = re.split(r"[\n,]+", value)
return [p.strip() for p in parts if p.strip()]
return [str(value).strip()] if str(value).strip() else []
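For reference, a minimal sketch of how this normalizer behaves on the accepted input shapes (the sample values are illustrative only, not from the codebase):
# Illustrative calls; accepts a list, a comma/newline separated string, or None.
_normalize_keyword_list("YTS, OpenSubtitles\nRARBG")   # -> ["YTS", "OpenSubtitles", "RARBG"]
_normalize_keyword_list(["  YTS ", ""])                # -> ["YTS"]
_normalize_keyword_list(None)                          # -> []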
def _apply_cleanup_keywords_from_settings():
keywords = _normalize_keyword_list(db.get_setting("clean_subtitle_force_remove", []))
get_stripper().set_force_remove_keywords(keywords)
def _ensure_automation_engine():
global automation_engine
if automation_engine is None:
automation_engine = AutomationEngine(db)
automation_engine.start()
return automation_engine
def _get_folder_rule_for_path(file_path: str, rules: list[dict]) -> dict | None:
"""Pick the most specific folder rule that matches the file path."""
if not rules:
@@ -386,6 +445,15 @@ migrate_settings()
initialize_clients()
def _handle_shutdown(signum, frame):
stop_automation_engine()
raise SystemExit(0)
atexit.register(stop_automation_engine)
signal.signal(signal.SIGTERM, _handle_shutdown)
# ============ SETTINGS ENDPOINTS ============
@app.route('/api/settings', methods=['GET'])
@@ -411,6 +479,8 @@ def get_settings():
settings["strip_keywords"] = True
if "clean_subtitle_content" not in settings:
settings["clean_subtitle_content"] = True
if "clean_subtitle_force_remove" not in settings:
settings["clean_subtitle_force_remove"] = ["YTS", "OpenSubtitles"]
if "omdb_enabled" not in settings:
settings["omdb_enabled"] = False
if "tmdb_enabled" not in settings:
@@ -467,6 +537,11 @@ def update_settings():
db.set_setting("strip_keywords", bool(data["strip_keywords"]))
if "clean_subtitle_content" in data:
db.set_setting("clean_subtitle_content", bool(data["clean_subtitle_content"]))
if "clean_subtitle_force_remove" in data:
db.set_setting(
"clean_subtitle_force_remove",
_normalize_keyword_list(data["clean_subtitle_force_remove"]),
)
if "omdb_enabled" in data:
db.set_setting("omdb_enabled", bool(data["omdb_enabled"]))
if "tmdb_enabled" in data:
@@ -510,6 +585,160 @@ def update_settings():
}), 500
# ============ AUTOMATION ENDPOINTS ============
@app.route('/api/automation/rules', methods=['GET'])
def get_automation_rules():
try:
rules = db.get_automation_rules()
return jsonify({
"success": True,
"rules": rules
})
except Exception as e:
logger.error(f"Error fetching automation rules: {e}")
return jsonify({
"success": False,
"error": str(e)
}), 500
@app.route('/api/automation/rules', methods=['POST'])
def create_automation_rule():
try:
data = request.json or {}
name = (data.get("name") or "").strip()
schedule = (data.get("schedule") or "").strip()
patterns = data.get("patterns") or []
target_folders = data.get("target_folders") or []
enabled = bool(data.get("enabled", True))
if not name:
return jsonify({"success": False, "error": "Name is required"}), 400
if not schedule:
return jsonify({"success": False, "error": "Schedule is required"}), 400
try:
CronTrigger.from_crontab(schedule)
except ValueError:
return jsonify({"success": False, "error": "Invalid cron schedule"}), 400
rule_id = data.get("id") or str(uuid.uuid4())
saved = db.upsert_automation_rule({
"id": rule_id,
"name": name,
"schedule": schedule,
"enabled": enabled,
"patterns": patterns,
"target_folders": target_folders
})
if not saved:
return jsonify({"success": False, "error": "Failed to save rule"}), 500
engine = _ensure_automation_engine()
engine.reload_rules()
return jsonify({
"success": True,
"rule": db.get_automation_rule(rule_id)
})
except Exception as e:
logger.error(f"Error creating automation rule: {e}")
return jsonify({
"success": False,
"error": str(e)
}), 500
@app.route('/api/automation/rules/<rule_id>', methods=['PUT'])
def update_automation_rule(rule_id):
try:
existing = db.get_automation_rule(rule_id)
if not existing:
return jsonify({"success": False, "error": "Rule not found"}), 404
data = request.json or {}
name = (data.get("name") or existing["name"]).strip()
schedule = (data.get("schedule") or existing["schedule"]).strip()
patterns = data.get("patterns", existing["patterns"])
target_folders = data.get("target_folders", existing["target_folders"])
enabled = bool(data.get("enabled", existing["enabled"]))
if not name:
return jsonify({"success": False, "error": "Name is required"}), 400
if not schedule:
return jsonify({"success": False, "error": "Schedule is required"}), 400
try:
CronTrigger.from_crontab(schedule)
except ValueError:
return jsonify({"success": False, "error": "Invalid cron schedule"}), 400
saved = db.upsert_automation_rule({
"id": rule_id,
"name": name,
"schedule": schedule,
"enabled": enabled,
"patterns": patterns,
"target_folders": target_folders
})
if not saved:
return jsonify({"success": False, "error": "Failed to update rule"}), 500
engine = _ensure_automation_engine()
engine.reload_rules()
return jsonify({
"success": True,
"rule": db.get_automation_rule(rule_id)
})
except Exception as e:
logger.error(f"Error updating automation rule: {e}")
return jsonify({
"success": False,
"error": str(e)
}), 500
@app.route('/api/automation/rules/<rule_id>', methods=['DELETE'])
def delete_automation_rule(rule_id):
try:
deleted = db.delete_automation_rule(rule_id)
if not deleted:
return jsonify({"success": False, "error": "Rule not found"}), 404
engine = _ensure_automation_engine()
engine.reload_rules()
return jsonify({
"success": True
})
except Exception as e:
logger.error(f"Error deleting automation rule: {e}")
return jsonify({
"success": False,
"error": str(e)
}), 500
@app.route('/api/automation/rules/<rule_id>/run', methods=['POST'])
def run_automation_rule(rule_id):
try:
data = request.json or {}
dry_run = bool(data.get("dry_run", False))
engine = _ensure_automation_engine()
result = engine.run_rule_now(rule_id, dry_run=dry_run)
if not result.get("success"):
return jsonify(result), 404
return jsonify(result)
except Exception as e:
logger.error(f"Error running automation rule: {e}")
return jsonify({
"success": False,
"error": str(e)
}), 500
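A hedged client-side sketch of these endpoints. It assumes the server is running on localhost:5000 as configured at the bottom of this file; the requests package and the folder path are illustrative assumptions, not part of this commit.
import requests

BASE = "http://localhost:5000"

# Create a nightly rule; the schedule is a standard 5-field crontab validated by CronTrigger.from_crontab.
rule = requests.post(f"{BASE}/api/automation/rules", json={
    "name": "Nightly watermark cleanup",
    "schedule": "0 3 * * *",
    "patterns": ["yts", "opensubtitles"],
    "target_folders": ["/media/tv"],      # hypothetical folder
    "enabled": True,
}).json()["rule"]

# Preview the effect of the rule without writing any files.
report = requests.post(f"{BASE}/api/automation/rules/{rule['id']}/run", json={"dry_run": True}).json()
print(report["files_scanned"], report["files_modified"], report["removed_lines"])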
# ============ SCAN ENDPOINTS ============
@app.route('/api/scan/start', methods=['POST'])
@@ -535,7 +764,11 @@ def start_scan():
# Perform scan - collect batches into a flat list
files = []
for batch in FileScanner.scan_directory(directory, batch_size=10):
for batch in FileScanner.scan_directory(
directory,
batch_size=10,
detect_cleanup_keywords=True,
):
files.extend(batch)
# Calculate scan duration
@@ -631,7 +864,11 @@ def stream_scan():
batch_count = 0
# Stream batches as they're found
for batch in FileScanner.scan_directory(directory, batch_size=10):
for batch in FileScanner.scan_directory(
directory,
batch_size=10,
detect_cleanup_keywords=True,
):
if client_closed.is_set():
logger.info("Client disconnected, stopping scan loop")
scan_state["scanning"] = False
@@ -1306,6 +1543,7 @@ def process_files():
duration = data.get("duration", db.get_setting("duration", 40))
title_override = data.get("titleOverride", None) # Optional title override
force_reprocess = data.get("forceReprocess", False) # Optional force flag
clean_only = bool(data.get("clean_only", False))
if not file_paths:
return jsonify({
@@ -1313,12 +1551,22 @@ def process_files():
"error": "No files specified"
}), 400
if not processor:
if not processor and not clean_only:
return jsonify({
"success": False,
"error": "Metadata provider not configured"
}), 400
processor_instance = processor
if clean_only and processor_instance is None:
processor_instance = SubtitleProcessor(
omdb_client,
tmdb_client,
tvmaze_client,
wikipedia_client,
preferred_source=_get_str_setting("preferred_source", "omdb"),
)
# Load default format options from settings
format_options = get_format_options_from_settings()
@@ -1351,18 +1599,25 @@ def process_files():
preferred_source = rule.get("preferred_source") if rule else None
language = rule.get("language") if rule else None
result = asyncio.run(processor.process_file(
file_path,
duration,
force_reprocess=force_reprocess,
title_override=title_override,
format_options=effective_format,
strip_keywords=strip_keywords,
clean_subtitle_content=clean_subtitle_content,
insertion_position=insertion_position or default_insertion_position,
preferred_source=preferred_source or default_preferred_source,
language=language,
))
if clean_only:
result = processor_instance.clean_file(
file_path,
clean_subtitle_content=clean_subtitle_content,
)
result["clean_only"] = True
else:
result = asyncio.run(processor_instance.process_file(
file_path,
duration,
force_reprocess=force_reprocess,
title_override=title_override,
format_options=effective_format,
strip_keywords=strip_keywords,
clean_subtitle_content=clean_subtitle_content,
insertion_position=insertion_position or default_insertion_position,
preferred_source=preferred_source or default_preferred_source,
language=language,
))
# Track success/failure
if result["success"]:
@@ -1386,7 +1641,9 @@ def process_files():
"success": result["success"],
"status": result.get("status", "Unknown"),
"summary": result.get("summary", ""),
"error": result.get("error")
"error": result.get("error"),
"clean_only": result.get("clean_only", False),
"clean_keywords": result.get("clean_keywords", []),
})
# Update scan state
@@ -1394,7 +1651,10 @@ def process_files():
if file_info["path"] == file_path:
file_info["status"] = result.get("status", "Unknown")
file_info["summary"] = result.get("summary", "") if isinstance(result.get("summary"), str) else ""
file_info["has_plot"] = result["success"]
if not result.get("clean_only"):
file_info["has_plot"] = result["success"]
else:
file_info["clean_keywords"] = result.get("clean_keywords", [])
break
except Exception as e:
@@ -1417,7 +1677,9 @@ def process_files():
"success": False,
"status": "Error",
"summary": "",
"error": str(e)
"error": str(e),
"clean_only": clean_only,
"clean_keywords": [],
})
# Complete the run in database
@@ -1795,4 +2057,11 @@ if __name__ == '__main__':
logger.info("Starting Sublogue API server on http://localhost:5000")
if os.environ.get("WERKZEUG_RUN_MAIN") == "true" or not app.debug:
start_scheduled_scan_worker()
app.run(debug=True, host='0.0.0.0', port=5000)
start_automation_engine_async()
os.environ.pop("FLASK_RUN_FROM_CLI", None)
try:
logger.info("Launching Flask app on 0.0.0.0:5000")
app.run(debug=True, use_reloader=False, host='0.0.0.0', port=5000)
except Exception as e:
logger.exception("Flask server failed to start: %s", e)
raise
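A hedged sketch of the new clean-only request handled by process_files() above. The route name and the file-list key sit outside the hunks shown here, so both are assumptions.
import requests

# Illustrative only: "/api/process" and the "files" key are assumed names.
requests.post("http://localhost:5000/api/process", json={
    "files": ["/media/movies/Example (2020)/Example.srt"],   # hypothetical path
    "clean_only": True,   # strip ads/watermarks only; skip metadata lookup and plot insertion
})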
+4
@@ -0,0 +1,4 @@
from .models import AutomationRule
from .engine import AutomationEngine
__all__ = ["AutomationRule", "AutomationEngine"]
+74
@@ -0,0 +1,74 @@
from __future__ import annotations
from pathlib import Path
from typing import Iterable, List, Tuple
from logging_utils import get_logger
from core.subtitle_processor import SubtitleBlock, parse_srt, format_srt
logger = get_logger(__name__)
def enumerate_srt_files(folders: Iterable[str]) -> List[Path]:
files: List[Path] = []
for folder in folders:
if not folder:
continue
path = Path(folder)
if not path.exists():
logger.warning("Automation folder does not exist: %s", folder)
continue
if not path.is_dir():
continue
files.extend([p for p in path.rglob("*.srt") if p.is_file()])
return files
def remove_lines_matching_patterns(file_path: str, patterns: List[str], dry_run: bool = False) -> Tuple[bool, int]:
"""Remove subtitle lines containing any of the provided patterns."""
if not patterns:
return False, 0
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
content = path.read_text(encoding="utf-8", errors="ignore")
blocks = parse_srt(content)
lowered_patterns = [p.lower() for p in patterns if p]
removed_lines = 0
updated_blocks: List[SubtitleBlock] = []
for block in blocks:
lines = block.text.splitlines()
kept_lines = []
for line in lines:
line_lower = line.lower()
if any(pattern in line_lower for pattern in lowered_patterns):
removed_lines += 1
continue
kept_lines.append(line)
if kept_lines:
updated_blocks.append(
SubtitleBlock(
index=block.index,
start_time=block.start_time,
end_time=block.end_time,
text="\n".join(kept_lines).strip(),
)
)
if removed_lines == 0:
return False, 0
renumbered = [
SubtitleBlock(i + 1, b.start_time, b.end_time, b.text)
for i, b in enumerate(updated_blocks)
]
if not dry_run:
path.write_text(format_srt(renumbered), encoding="utf-8")
return True, removed_lines
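A minimal dry-run sketch of the two helpers above (the folder and patterns are illustrative):
# Illustrative only: report what would be removed without rewriting any file.
for srt in enumerate_srt_files(["/media/tv"]):           # hypothetical folder
    changed, removed = remove_lines_matching_patterns(str(srt), ["opensubtitles", "yts"], dry_run=True)
    if changed:
        print(f"{srt}: would drop {removed} line(s)")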
+131
@@ -0,0 +1,131 @@
from __future__ import annotations
import threading
from typing import Dict, List, Optional
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from logging_utils import get_logger
from automations.actions import enumerate_srt_files, remove_lines_matching_patterns
from automations.models import AutomationRule
logger = get_logger(__name__)
class AutomationEngine:
"""Scheduler for automation rules."""
def __init__(self, db_manager):
self.db_manager = db_manager
self._scheduler = BackgroundScheduler(daemon=True)
self._lock = threading.RLock()  # reentrant, so reload_rules() can safely run while start() holds the lock
self._started = False
def start(self):
with self._lock:
if self._started:
return
self._scheduler.start()
self._started = True
self.reload_rules()
logger.info("Automation scheduler started")
def shutdown(self):
with self._lock:
if not self._started:
return
self._scheduler.shutdown(wait=False)
self._started = False
logger.info("Automation scheduler stopped")
def reload_rules(self):
"""Reload all automation rules from storage."""
with self._lock:
self._scheduler.remove_all_jobs()
rules = self._load_rules()
for rule in rules:
if not rule.enabled:
continue
self._schedule_rule(rule)
def run_rule_now(self, rule_id: str, dry_run: bool = False) -> dict:
rule = self._get_rule(rule_id)
if not rule:
return {"success": False, "error": "Rule not found"}
return self._execute_rule(rule, dry_run=dry_run)
def _load_rules(self) -> List[AutomationRule]:
rules_raw = self.db_manager.get_automation_rules()
return [AutomationRule.from_dict(r) for r in rules_raw]
def _get_rule(self, rule_id: str) -> Optional[AutomationRule]:
raw = self.db_manager.get_automation_rule(rule_id)
return AutomationRule.from_dict(raw) if raw else None
def _schedule_rule(self, rule: AutomationRule):
try:
trigger = CronTrigger.from_crontab(rule.schedule)
except ValueError as e:
logger.error("Invalid cron schedule for rule %s: %s", rule.id, e)
return
self._scheduler.add_job(
self._run_rule_job,
trigger=trigger,
args=[rule.id],
id=f"automation:{rule.id}",
replace_existing=True,
misfire_grace_time=300,
max_instances=1,
)
def _run_rule_job(self, rule_id: str):
rule = self._get_rule(rule_id)
if not rule or not rule.enabled:
return
self._execute_rule(rule, dry_run=False)
def _execute_rule(self, rule: AutomationRule, dry_run: bool) -> dict:
files = enumerate_srt_files(rule.target_folders)
modified = 0
total_removed = 0
errors: List[str] = []
for file_path in files:
try:
did_modify, removed_lines = remove_lines_matching_patterns(
str(file_path),
rule.patterns,
dry_run=dry_run,
)
if did_modify:
modified += 1
total_removed += removed_lines
self.db_manager.add_automation_log(
rule_id=rule.id,
file_path=str(file_path),
modified=did_modify,
removed_lines=removed_lines,
dry_run=dry_run,
error_message=None,
)
except Exception as e:
errors.append(f"{file_path}: {e}")
self.db_manager.add_automation_log(
rule_id=rule.id,
file_path=str(file_path),
modified=False,
removed_lines=0,
dry_run=dry_run,
error_message=str(e),
)
return {
"success": True,
"rule_id": rule.id,
"files_scanned": len(files),
"files_modified": modified,
"removed_lines": total_removed,
"dry_run": dry_run,
"errors": errors,
}
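A hedged sketch of driving the engine directly, mirroring what the /run endpoint does. The DatabaseManager constructor arguments and the rule id are assumptions.
from core.database import DatabaseManager
from automations.engine import AutomationEngine

db = DatabaseManager()                 # constructor arguments, if any, are not shown in this commit
engine = AutomationEngine(db)
engine.start()                         # schedules every enabled rule via APScheduler
report = engine.run_rule_now("rule-id", dry_run=True)   # hypothetical rule id
print(report)
engine.shutdown()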
+25
@@ -0,0 +1,25 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import List
@dataclass(slots=True)
class AutomationRule:
id: str
name: str
schedule: str
enabled: bool
patterns: List[str]
target_folders: List[str]
@staticmethod
def from_dict(data: dict) -> "AutomationRule":
return AutomationRule(
id=str(data.get("id", "")),
name=str(data.get("name", "")),
schedule=str(data.get("schedule", "")),
enabled=bool(data.get("enabled", True)),
patterns=list(data.get("patterns", []) or []),
target_folders=list(data.get("target_folders", []) or []),
)
+140
@@ -184,6 +184,40 @@ class FolderRule(Base):
return f"<FolderRule(id={self.id}, directory='{self.directory}')>"
class AutomationRule(Base):
"""Automation rules for scheduled tasks"""
__tablename__ = 'automation_rules'
id = Column(String(64), primary_key=True)
name = Column(String(255), nullable=False)
schedule = Column(String(100), nullable=False)
enabled = Column(Boolean, default=True, nullable=False)
patterns = Column(Text, nullable=False) # JSON list
target_folders = Column(Text, nullable=False) # JSON list
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
def __repr__(self):
return f"<AutomationRule(id='{self.id}', name='{self.name}', enabled={self.enabled})>"
class AutomationLog(Base):
"""Automation run log entries"""
__tablename__ = 'automation_logs'
id = Column(Integer, primary_key=True)
rule_id = Column(String(64), nullable=False, index=True)
file_path = Column(String(500), nullable=False)
modified = Column(Boolean, default=False)
removed_lines = Column(Integer, default=0)
dry_run = Column(Boolean, default=False)
error_message = Column(Text)
run_at = Column(DateTime, default=datetime.utcnow, nullable=False)
def __repr__(self):
return f"<AutomationLog(rule_id='{self.rule_id}', file_path='{self.file_path}')>"
class DatabaseManager:
"""Manages database connections and operations"""
@@ -1033,6 +1067,112 @@ class DatabaseManager:
finally:
session.close()
# ============ AUTOMATION RULES OPERATIONS ============
def get_automation_rules(self):
"""Get all automation rules"""
session = self.get_session()
try:
rules = session.query(AutomationRule).order_by(AutomationRule.created_at.asc()).all()
result = []
for rule in rules:
result.append({
"id": rule.id,
"name": rule.name,
"schedule": rule.schedule,
"enabled": rule.enabled,
"patterns": json.loads(rule.patterns) if rule.patterns else [],
"target_folders": json.loads(rule.target_folders) if rule.target_folders else [],
"created_at": rule.created_at.isoformat() if rule.created_at else None,
"updated_at": rule.updated_at.isoformat() if rule.updated_at else None
})
return result
finally:
session.close()
def get_automation_rule(self, rule_id):
"""Get a single automation rule"""
session = self.get_session()
try:
rule = session.query(AutomationRule).filter_by(id=rule_id).first()
if not rule:
return None
return {
"id": rule.id,
"name": rule.name,
"schedule": rule.schedule,
"enabled": rule.enabled,
"patterns": json.loads(rule.patterns) if rule.patterns else [],
"target_folders": json.loads(rule.target_folders) if rule.target_folders else [],
"created_at": rule.created_at.isoformat() if rule.created_at else None,
"updated_at": rule.updated_at.isoformat() if rule.updated_at else None
}
finally:
session.close()
def upsert_automation_rule(self, rule_data):
"""Create or update an automation rule"""
session = self.get_session()
try:
rule_id = rule_data["id"]
rule = session.query(AutomationRule).filter_by(id=rule_id).first()
if not rule:
rule = AutomationRule(id=rule_id)
session.add(rule)
rule.name = rule_data["name"]
rule.schedule = rule_data["schedule"]
rule.enabled = bool(rule_data.get("enabled", True))
rule.patterns = json.dumps(rule_data.get("patterns", []))
rule.target_folders = json.dumps(rule_data.get("target_folders", []))
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error saving automation rule: {e}")
return False
finally:
session.close()
def delete_automation_rule(self, rule_id):
"""Delete an automation rule"""
session = self.get_session()
try:
rule = session.query(AutomationRule).filter_by(id=rule_id).first()
if rule:
session.delete(rule)
session.commit()
return True
return False
except Exception as e:
session.rollback()
logger.error(f"Error deleting automation rule: {e}")
return False
finally:
session.close()
def add_automation_log(self, rule_id, file_path, modified, removed_lines, dry_run=False, error_message=None):
"""Add an automation log entry"""
session = self.get_session()
try:
entry = AutomationLog(
rule_id=rule_id,
file_path=file_path,
modified=bool(modified),
removed_lines=int(removed_lines or 0),
dry_run=bool(dry_run),
error_message=error_message
)
session.add(entry)
session.commit()
except Exception as e:
session.rollback()
logger.error(f"Error saving automation log: {e}")
raise
finally:
session.close()
# ============ MAINTENANCE OPERATIONS ============
def clear_settings(self, keep_api_keys=False):
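A hedged sketch of the new automation CRUD helpers on an existing DatabaseManager instance (db, the folder, and the file path are illustrative):
import uuid

rule_id = str(uuid.uuid4())
db.upsert_automation_rule({
    "id": rule_id,
    "name": "Strip trackers",
    "schedule": "30 4 * * *",
    "enabled": True,
    "patterns": ["rarbg"],
    "target_folders": ["/media/movies"],   # hypothetical folder
})
print(db.get_automation_rule(rule_id))
db.add_automation_log(rule_id, "/media/movies/Example.srt", modified=True, removed_lines=2, dry_run=False)
db.delete_automation_rule(rule_id)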
+21 -6
@@ -18,6 +18,7 @@ logger = get_logger("FileScanner")
import sys
sys.path.insert(0, str(Path(__file__).parent))
from subtitle_processor import parse_srt, SUBLOGUE_SENTINEL, SUBLOGUE_TOKEN_PATTERN
from keyword_stripper import get_stripper
class FileScanner:
@@ -45,6 +46,7 @@ class FileScanner:
directory_path: str | Path,
batch_size: int = DEFAULT_BATCH_SIZE,
follow_symlinks: bool = False,
detect_cleanup_keywords: bool = False,
) -> Generator[List[Dict], None, None]:
"""
Recursively scan a directory tree for .srt files.
@@ -106,8 +108,11 @@ class FileScanner:
# Plot detection
# --------------------------------------------
content = None
try:
plot_marker_count = cls._count_plot_markers(file_path)
if detect_cleanup_keywords:
content = file_path.read_text(encoding="utf-8", errors="ignore")
plot_marker_count = cls._count_plot_markers(file_path, content=content)
has_plot = plot_marker_count > 0
logger.debug(
"Plot check for %s: %s",
@@ -126,7 +131,7 @@ class FileScanner:
if has_plot:
try:
metadata = cls._extract_metadata(file_path)
metadata = cls._extract_metadata(file_path, content=content)
logger.debug(
"Extracted metadata from %s: %s",
file_path.name,
@@ -138,6 +143,13 @@ class FileScanner:
file_path.name, e
)
clean_keywords = []
if detect_cleanup_keywords and content:
try:
clean_keywords = get_stripper().detect_subtitle_watermarks(content)
except Exception as e:
logger.debug("Cleanup keyword detection failed: %s", e)
status = "Has Plot" if has_plot else "Not Loaded"
if plot_marker_count > 1:
status = "Duplicate Plot"
@@ -156,6 +168,7 @@ class FileScanner:
"imdb_rating": metadata.get("imdb_rating"),
"rating": metadata.get("imdb_rating"),
"runtime": metadata.get("runtime"),
"clean_keywords": clean_keywords,
"selected": False,
})
@@ -216,14 +229,15 @@ class FileScanner:
)
@classmethod
def _count_plot_markers(cls, file_path: Path) -> int:
def _count_plot_markers(cls, file_path: Path, content: str | None = None) -> int:
"""
Count Sublogue plot markers to detect duplicates.
"""
logger.debug("Scanning for plot markers in %s", file_path.name)
try:
content = file_path.read_text(encoding="utf-8", errors="ignore")
if content is None:
content = file_path.read_text(encoding="utf-8", errors="ignore")
lower_content = content.lower()
generated_count = lower_content.count("generated by sublogue")
if generated_count > 0:
@@ -237,14 +251,15 @@ class FileScanner:
return 0
@classmethod
def _extract_metadata(cls, file_path: Path) -> Dict:
def _extract_metadata(cls, file_path: Path, content: str | None = None) -> Dict:
"""
Extract title, year, rating, runtime, and plot
from Sublogue-generated subtitles.
"""
logger.debug("Extracting metadata from %s", file_path.name)
content = file_path.read_text(encoding="utf-8", errors="ignore")
if content is None:
content = file_path.read_text(encoding="utf-8", errors="ignore")
blocks = parse_srt(content)
metadata = {
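A hedged sketch of the extended scanner call (the path is illustrative; clean_keywords is only populated when detect_cleanup_keywords=True):
from core.file_scanner import FileScanner

for batch in FileScanner.scan_directory("/media/tv", batch_size=10, detect_cleanup_keywords=True):
    for entry in batch:
        if entry.get("clean_keywords"):
            print(entry["path"], entry["clean_keywords"])   # "path" key assumed from the scan-state usage in app.py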
+64
@@ -199,6 +199,39 @@ class KeywordStripper:
r"sign\s+up\s+(now|today|free)",
]
# Force-remove entire subtitle blocks if these appear anywhere in a line.
# Partial matches are intentional (e.g. "OpenSubtitles.org").
SUBTITLE_FORCE_REMOVE = [
r"yts",
r"opensubtitles?",
]
_custom_force_remove_keywords: List[str] = []
# Labels used for reporting detected watermark keywords in clean-only scans
SUBTITLE_WATERMARK_LABELS = [
(r"yts\.mx|yts\.am|yts\.lt|yts\.ag|\byts\b", "YTS"),
(r"\byify\b", "YIFY"),
(r"\brarbg\b", "RARBG"),
(r"\beztv\b", "EZTV"),
(r"\bettv\b", "ETTV"),
(r"torrentgalaxy|\btgx\b", "TorrentGalaxy"),
(r"1337x", "1337x"),
(r"limetorrents?", "LimeTorrents"),
(r"\bevo\b", "EVO"),
(r"\bpsa\b", "PSA"),
(r"\bfgt\b", "FGT"),
(r"opensubtitles?", "OpenSubtitles"),
(r"sub\.?scene|subscene", "Subscene"),
(r"addic7ed", "Addic7ed"),
(r"podnapisi", "Podnapisi"),
(r"yifysubtitles?", "YIFY Subtitles"),
(r"legendas\.?tv", "LegendasTV"),
(r"shooter\.?cn", "ShooterCN"),
(r"subhd", "SubHD"),
(r"www\.[a-z0-9\-]+\.(com|org|net|io|tv|mx|am|lt|ag)|https?://", "URL"),
]
# Patterns that indicate an ENTIRE subtitle block should be removed
# (not just the matching text, but the whole block)
SUBTITLE_BLOCK_REMOVERS = [
@@ -234,6 +267,11 @@ class KeywordStripper:
def c(p):
return re.compile(p, re.IGNORECASE | re.VERBOSE)
custom_force_remove = [
re.escape(k) for k in cls._custom_force_remove_keywords if k
]
combined_force_remove = cls.SUBTITLE_FORCE_REMOVE + custom_force_remove
cls._compiled = {
"junk": c("|".join([
cls.QUALITY,
@@ -255,6 +293,9 @@ class KeywordStripper:
"subtitle_block_removers": [
re.compile(p, re.IGNORECASE | re.MULTILINE) for p in cls.SUBTITLE_BLOCK_REMOVERS
],
"subtitle_force_remove": [
re.compile(p, re.IGNORECASE) for p in combined_force_remove
],
}
return cls._compiled
@@ -348,6 +389,11 @@ class KeywordStripper:
if not line:
continue
# Hard kill-switch: if a line mentions these sources, drop the whole block.
for pattern in rx["subtitle_force_remove"]:
if pattern.search(line):
return True
# Check if this line matches any block remover pattern
is_ad_line = False
for pattern in rx["subtitle_block_removers"]:
@@ -469,6 +515,24 @@ class KeywordStripper:
return cleaned
def detect_subtitle_watermarks(self, text: str) -> List[str]:
"""Detect known subtitle watermark keywords in raw subtitle text."""
detected = []
for pattern, label in self.SUBTITLE_WATERMARK_LABELS:
if re.search(pattern, text, re.IGNORECASE):
detected.append(label)
for keyword in self._custom_force_remove_keywords:
if keyword and re.search(re.escape(keyword), text, re.IGNORECASE):
detected.append(keyword)
return detected
def set_force_remove_keywords(self, keywords: List[str]) -> None:
"""Set custom force-remove keywords and refresh regex cache."""
type(self)._custom_force_remove_keywords = [
k.strip() for k in (keywords or []) if k and k.strip()
]
type(self)._compiled = None
# -----------------------------
# SINGLETON HELPERS
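A hedged sketch of the new stripper hooks (the sample text is illustrative):
from core.keyword_stripper import get_stripper

stripper = get_stripper()
stripper.set_force_remove_keywords(["YTS", "OpenSubtitles"])   # custom keywords refresh the compiled regex cache
sample = "Downloaded from www.opensubtitles.org"
print(stripper.detect_subtitle_watermarks(sample))             # includes 'OpenSubtitles' and 'URL'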
+112
@@ -1373,6 +1373,8 @@ class SubtitleProcessor:
# ─────────────────────────────────────────────────────────────
original = file_path.read_text(encoding="utf-8", errors="ignore")
subs = parse_srt(original)
stripper = get_stripper()
detected_keywords = stripper.detect_subtitle_watermarks(original)
if not subs:
return self._fail("No valid subtitle blocks found")
@@ -1540,6 +1542,116 @@ class SubtitleProcessor:
logger.error(f"Could not acquire lock for {file_path.name}: {e}")
return self._fail(f"File is being processed by another task: {e}")
def clean_file(
self,
file_path: str | Path,
clean_subtitle_content: bool = True,
) -> dict:
"""Clean ad/watermark content from a subtitle file without inserting plots."""
file_path = Path(file_path)
if not file_path.exists():
return self._fail("File not found")
if file_path.stat().st_size > self.MAX_SRT_BYTES:
return self._fail("Subtitle file too large")
try:
with file_lock(file_path, timeout=30.0):
original = file_path.read_text(encoding="utf-8", errors="ignore")
subs = parse_srt(original)
stripper = get_stripper()
detected_keywords = stripper.detect_subtitle_watermarks(original)
if not subs:
return self._fail("No valid subtitle blocks found")
original_blocks = subs
removed_count = 0
modified_count = 0
if clean_subtitle_content:
cleaned_blocks: List[SubtitleBlock] = []
for block in original_blocks:
text = block.text
if stripper.should_remove_subtitle_block(text):
removed_count += 1
continue
cleaned_text = stripper.clean_subtitle_text(text)
if not cleaned_text.strip():
removed_count += 1
continue
if cleaned_text != text:
modified_count += 1
cleaned_blocks.append(
SubtitleBlock(
block.index,
block.start_time,
block.end_time,
cleaned_text,
)
)
else:
cleaned_blocks = list(original_blocks)
sanitized = sanitize_all_blocks(cleaned_blocks)
if len(sanitized) < len(cleaned_blocks):
removed_count += len(cleaned_blocks) - len(sanitized)
if not sanitized:
return self._fail("No dialogue subtitles found after cleaning")
renumbered = [
SubtitleBlock(i + 1, b.start_time, b.end_time, b.text)
for i, b in enumerate(sanitized)
]
changed = len(renumbered) != len(original_blocks)
if not changed:
for updated, original_block in zip(renumbered, original_blocks):
if (
updated.start_time != original_block.start_time
or updated.end_time != original_block.end_time
or updated.text != original_block.text
):
changed = True
break
if not changed:
return {
"success": True,
"status": "Skipped",
"summary": "No changes needed",
"removed_blocks": 0,
"modified_blocks": 0,
"clean_keywords": detected_keywords,
}
tmp = file_path.with_suffix(".srt.tmp")
tmp.write_text(format_srt(renumbered), encoding="utf-8")
tmp.replace(file_path)
summary = (
f"Removed {removed_count} ad blocks, modified {modified_count} blocks"
if clean_subtitle_content
else "Cleaned subtitle content"
)
return {
"success": True,
"status": "Cleaned",
"summary": summary,
"removed_blocks": removed_count,
"modified_blocks": modified_count,
"clean_keywords": detected_keywords,
}
except FileLockError as e:
logger.error(f"Could not acquire lock for {file_path.name}: {e}")
return self._fail(f"File is being processed by another task: {e}")
# ========================================================
# Metadata fetching
# ========================================================
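A hedged sketch of the clean-only entry point, mirroring how app.py builds a processor without metadata clients (the file path is illustrative):
from core.subtitle_processor import SubtitleProcessor

processor = SubtitleProcessor(None, None, None, None, preferred_source="omdb")   # no metadata clients needed for clean_file
result = processor.clean_file("/media/movies/Example.srt", clean_subtitle_content=True)   # hypothetical path
print(result.get("status"), result.get("removed_blocks", 0), result.get("clean_keywords", []))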
+3
@@ -6,6 +6,9 @@ aiohttp>=3.9.0
# Database
sqlalchemy>=2.0.0
# Scheduler
apscheduler>=3.10.4
# Environment
python-dotenv>=1.0.0