1.1.0 - automations, clean only mode, bug fixes
+289 -20
@@ -1,9 +1,12 @@
import asyncio
import atexit
import json
import os
import threading
import time
import re
import uuid
import signal
from difflib import SequenceMatcher
from datetime import datetime, timezone
from pathlib import Path
@@ -18,9 +21,12 @@ from core.tvmaze_client import TVMazeClient
from core.wikipedia_client import WikipediaClient
from core.subtitle_processor import SubtitleProcessor, SubtitleFormatOptions, SUBLOGUE_TOKEN_PATTERN, SUBLOGUE_SENTINEL
from core.keyword_stripper import get_stripper
from core.keyword_stripper import get_stripper
from core.file_scanner import FileScanner
from core.database import DatabaseManager
from logging_utils import configure_logging, get_logger
from automations.engine import AutomationEngine
from apscheduler.triggers.cron import CronTrigger

# Configure logging
configure_logging()
@@ -39,6 +45,7 @@ tmdb_client = None
tvmaze_client = None
wikipedia_client = None
processor = None
automation_engine = None

# In-memory scan state (still used for current session)
scan_state = {
@@ -71,7 +78,11 @@ def perform_scheduled_scan(directory):
    start_time = time.time()
    files = []

    for batch in FileScanner.scan_directory(directory, batch_size=10):
    for batch in FileScanner.scan_directory(
        directory,
        batch_size=10,
        detect_cleanup_keywords=True,
    ):
        files.extend(batch)

    scan_duration_ms = int((time.time() - start_time) * 1000)
@@ -129,6 +140,31 @@ def start_scheduled_scan_worker():
    scheduled_scan_thread.start()


def start_automation_engine():
    """Start automation scheduler once."""
    global automation_engine
    if automation_engine is None:
        automation_engine = AutomationEngine(db)
        automation_engine.start()


def stop_automation_engine():
    """Stop automation scheduler."""
    if automation_engine:
        automation_engine.shutdown()


def start_automation_engine_async():
    """Start automation scheduler in a background thread."""
    def _run():
        try:
            start_automation_engine()
        except Exception as e:
            logger.exception("Automation engine failed to start: %s", e)

    threading.Thread(target=_run, daemon=True).start()


def initialize_clients():
    """Initialize OMDb, TMDb, TVmaze clients and processor with current API keys"""
    global omdb_client, tmdb_client, tvmaze_client, wikipedia_client, processor
@@ -189,6 +225,8 @@ def initialize_clients():
    else:
        logger.warning("No metadata providers configured")

    _apply_cleanup_keywords_from_settings()


# Migrate existing settings to database on startup
def migrate_settings():
@@ -220,6 +258,27 @@ def _get_str_setting(key: str, default: str) -> str:
        return default
    return str(value)

def _normalize_keyword_list(value) -> list[str]:
    if value is None:
        return []
    if isinstance(value, list):
        return [str(v).strip() for v in value if str(v).strip()]
    if isinstance(value, str):
        parts = re.split(r"[\n,]+", value)
        return [p.strip() for p in parts if p.strip()]
    return [str(value).strip()] if str(value).strip() else []

def _apply_cleanup_keywords_from_settings():
    keywords = _normalize_keyword_list(db.get_setting("clean_subtitle_force_remove", []))
    get_stripper().set_force_remove_keywords(keywords)

def _ensure_automation_engine():
    global automation_engine
    if automation_engine is None:
        automation_engine = AutomationEngine(db)
        automation_engine.start()
    return automation_engine

def _get_folder_rule_for_path(file_path: str, rules: list[dict]) -> dict | None:
    """Pick the most specific folder rule that matches the file path."""
    if not rules:
@@ -386,6 +445,15 @@ migrate_settings()
initialize_clients()


def _handle_shutdown(signum, frame):
    stop_automation_engine()
    raise SystemExit(0)


atexit.register(stop_automation_engine)
signal.signal(signal.SIGTERM, _handle_shutdown)


# ============ SETTINGS ENDPOINTS ============

@app.route('/api/settings', methods=['GET'])
@@ -411,6 +479,8 @@ def get_settings():
        settings["strip_keywords"] = True
    if "clean_subtitle_content" not in settings:
        settings["clean_subtitle_content"] = True
    if "clean_subtitle_force_remove" not in settings:
        settings["clean_subtitle_force_remove"] = ["YTS", "OpenSubtitles"]
    if "omdb_enabled" not in settings:
        settings["omdb_enabled"] = False
    if "tmdb_enabled" not in settings:
@@ -467,6 +537,11 @@ def update_settings():
        db.set_setting("strip_keywords", bool(data["strip_keywords"]))
    if "clean_subtitle_content" in data:
        db.set_setting("clean_subtitle_content", bool(data["clean_subtitle_content"]))
    if "clean_subtitle_force_remove" in data:
        db.set_setting(
            "clean_subtitle_force_remove",
            _normalize_keyword_list(data["clean_subtitle_force_remove"]),
        )
    if "omdb_enabled" in data:
        db.set_setting("omdb_enabled", bool(data["omdb_enabled"]))
    if "tmdb_enabled" in data:
@@ -510,6 +585,160 @@ def update_settings():
        }), 500


# ============ AUTOMATION ENDPOINTS ============

@app.route('/api/automation/rules', methods=['GET'])
def get_automation_rules():
    try:
        rules = db.get_automation_rules()
        return jsonify({
            "success": True,
            "rules": rules
        })
    except Exception as e:
        logger.error(f"Error fetching automation rules: {e}")
        return jsonify({
            "success": False,
            "error": str(e)
        }), 500


@app.route('/api/automation/rules', methods=['POST'])
def create_automation_rule():
    try:
        data = request.json or {}
        name = (data.get("name") or "").strip()
        schedule = (data.get("schedule") or "").strip()
        patterns = data.get("patterns") or []
        target_folders = data.get("target_folders") or []
        enabled = bool(data.get("enabled", True))

        if not name:
            return jsonify({"success": False, "error": "Name is required"}), 400
        if not schedule:
            return jsonify({"success": False, "error": "Schedule is required"}), 400

        try:
            CronTrigger.from_crontab(schedule)
        except ValueError:
            return jsonify({"success": False, "error": "Invalid cron schedule"}), 400

        rule_id = data.get("id") or str(uuid.uuid4())
        saved = db.upsert_automation_rule({
            "id": rule_id,
            "name": name,
            "schedule": schedule,
            "enabled": enabled,
            "patterns": patterns,
            "target_folders": target_folders
        })
        if not saved:
            return jsonify({"success": False, "error": "Failed to save rule"}), 500

        engine = _ensure_automation_engine()
        engine.reload_rules()

        return jsonify({
            "success": True,
            "rule": db.get_automation_rule(rule_id)
        })
    except Exception as e:
        logger.error(f"Error creating automation rule: {e}")
        return jsonify({
            "success": False,
            "error": str(e)
        }), 500


@app.route('/api/automation/rules/<rule_id>', methods=['PUT'])
def update_automation_rule(rule_id):
    try:
        existing = db.get_automation_rule(rule_id)
        if not existing:
            return jsonify({"success": False, "error": "Rule not found"}), 404

        data = request.json or {}
        name = (data.get("name") or existing["name"]).strip()
        schedule = (data.get("schedule") or existing["schedule"]).strip()
        patterns = data.get("patterns", existing["patterns"])
        target_folders = data.get("target_folders", existing["target_folders"])
        enabled = bool(data.get("enabled", existing["enabled"]))

        if not name:
            return jsonify({"success": False, "error": "Name is required"}), 400
        if not schedule:
            return jsonify({"success": False, "error": "Schedule is required"}), 400

        try:
            CronTrigger.from_crontab(schedule)
        except ValueError:
            return jsonify({"success": False, "error": "Invalid cron schedule"}), 400

        saved = db.upsert_automation_rule({
            "id": rule_id,
            "name": name,
            "schedule": schedule,
            "enabled": enabled,
            "patterns": patterns,
            "target_folders": target_folders
        })
        if not saved:
            return jsonify({"success": False, "error": "Failed to update rule"}), 500

        engine = _ensure_automation_engine()
        engine.reload_rules()

        return jsonify({
            "success": True,
            "rule": db.get_automation_rule(rule_id)
        })
    except Exception as e:
        logger.error(f"Error updating automation rule: {e}")
        return jsonify({
            "success": False,
            "error": str(e)
        }), 500


@app.route('/api/automation/rules/<rule_id>', methods=['DELETE'])
def delete_automation_rule(rule_id):
    try:
        deleted = db.delete_automation_rule(rule_id)
        if not deleted:
            return jsonify({"success": False, "error": "Rule not found"}), 404

        engine = _ensure_automation_engine()
        engine.reload_rules()

        return jsonify({
            "success": True
        })
    except Exception as e:
        logger.error(f"Error deleting automation rule: {e}")
        return jsonify({
            "success": False,
            "error": str(e)
        }), 500


@app.route('/api/automation/rules/<rule_id>/run', methods=['POST'])
def run_automation_rule(rule_id):
    try:
        data = request.json or {}
        dry_run = bool(data.get("dry_run", False))
        engine = _ensure_automation_engine()
        result = engine.run_rule_now(rule_id, dry_run=dry_run)
        if not result.get("success"):
            return jsonify(result), 404
        return jsonify(result)
    except Exception as e:
        logger.error(f"Error running automation rule: {e}")
        return jsonify({
            "success": False,
            "error": str(e)
        }), 500


# ============ SCAN ENDPOINTS ============

@app.route('/api/scan/start', methods=['POST'])
@@ -535,7 +764,11 @@ def start_scan():

    # Perform scan - collect batches into a flat list
    files = []
    for batch in FileScanner.scan_directory(directory, batch_size=10):
    for batch in FileScanner.scan_directory(
        directory,
        batch_size=10,
        detect_cleanup_keywords=True,
    ):
        files.extend(batch)

    # Calculate scan duration
@@ -631,7 +864,11 @@ def stream_scan():
        batch_count = 0

        # Stream batches as they're found
        for batch in FileScanner.scan_directory(directory, batch_size=10):
        for batch in FileScanner.scan_directory(
            directory,
            batch_size=10,
            detect_cleanup_keywords=True,
        ):
            if client_closed.is_set():
                logger.info("Client disconnected, stopping scan loop")
                scan_state["scanning"] = False
@@ -1306,6 +1543,7 @@ def process_files():
    duration = data.get("duration", db.get_setting("duration", 40))
    title_override = data.get("titleOverride", None)  # Optional title override
    force_reprocess = data.get("forceReprocess", False)  # Optional force flag
    clean_only = bool(data.get("clean_only", False))

    if not file_paths:
        return jsonify({
@@ -1313,12 +1551,22 @@ def process_files():
            "error": "No files specified"
        }), 400

    if not processor:
    if not processor and not clean_only:
        return jsonify({
            "success": False,
            "error": "Metadata provider not configured"
        }), 400

    processor_instance = processor
    if clean_only and processor_instance is None:
        processor_instance = SubtitleProcessor(
            omdb_client,
            tmdb_client,
            tvmaze_client,
            wikipedia_client,
            preferred_source=_get_str_setting("preferred_source", "omdb"),
        )

    # Load default format options from settings
    format_options = get_format_options_from_settings()

@@ -1351,18 +1599,25 @@ def process_files():
            preferred_source = rule.get("preferred_source") if rule else None
            language = rule.get("language") if rule else None

            result = asyncio.run(processor.process_file(
                file_path,
                duration,
                force_reprocess=force_reprocess,
                title_override=title_override,
                format_options=effective_format,
                strip_keywords=strip_keywords,
                clean_subtitle_content=clean_subtitle_content,
                insertion_position=insertion_position or default_insertion_position,
                preferred_source=preferred_source or default_preferred_source,
                language=language,
            ))
            if clean_only:
                result = processor_instance.clean_file(
                    file_path,
                    clean_subtitle_content=clean_subtitle_content,
                )
                result["clean_only"] = True
            else:
                result = asyncio.run(processor_instance.process_file(
                    file_path,
                    duration,
                    force_reprocess=force_reprocess,
                    title_override=title_override,
                    format_options=effective_format,
                    strip_keywords=strip_keywords,
                    clean_subtitle_content=clean_subtitle_content,
                    insertion_position=insertion_position or default_insertion_position,
                    preferred_source=preferred_source or default_preferred_source,
                    language=language,
                ))

            # Track success/failure
            if result["success"]:
@@ -1386,7 +1641,9 @@ def process_files():
                "success": result["success"],
                "status": result.get("status", "Unknown"),
                "summary": result.get("summary", ""),
                "error": result.get("error")
                "error": result.get("error"),
                "clean_only": result.get("clean_only", False),
                "clean_keywords": result.get("clean_keywords", []),
            })

            # Update scan state
@@ -1394,7 +1651,10 @@ def process_files():
                if file_info["path"] == file_path:
                    file_info["status"] = result.get("status", "Unknown")
                    file_info["summary"] = result.get("summary", "") if isinstance(result.get("summary"), str) else ""
                    file_info["has_plot"] = result["success"]
                    if not result.get("clean_only"):
                        file_info["has_plot"] = result["success"]
                    else:
                        file_info["clean_keywords"] = result.get("clean_keywords", [])
                    break

        except Exception as e:
@@ -1417,7 +1677,9 @@ def process_files():
                "success": False,
                "status": "Error",
                "summary": "",
                "error": str(e)
                "error": str(e),
                "clean_only": clean_only,
                "clean_keywords": [],
            })

    # Complete the run in database
@@ -1795,4 +2057,11 @@ if __name__ == '__main__':
    logger.info("Starting Sublogue API server on http://localhost:5000")
    if os.environ.get("WERKZEUG_RUN_MAIN") == "true" or not app.debug:
        start_scheduled_scan_worker()
    app.run(debug=True, host='0.0.0.0', port=5000)
    start_automation_engine_async()
    os.environ.pop("FLASK_RUN_FROM_CLI", None)
    try:
        logger.info("Launching Flask app on 0.0.0.0:5000")
        app.run(debug=True, use_reloader=False, host='0.0.0.0', port=5000)
    except Exception as e:
        logger.exception("Flask server failed to start: %s", e)
        raise
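A hedged usage sketch for the automation-rules endpoints added above, not part of the commit itself. It assumes the API is running locally on port 5000 (as logged by the server) and that the `requests` library is available; the field names come from the handlers in the diff.

```python
# Illustrative client for the new /api/automation/rules endpoints (sketch only).
import requests

BASE = "http://localhost:5000/api/automation/rules"

# Create a rule; the server validates "schedule" with CronTrigger.from_crontab().
rule = requests.post(BASE, json={
    "name": "Nightly watermark cleanup",
    "schedule": "0 3 * * *",                      # standard 5-field crontab
    "patterns": ["OpenSubtitles", "YTS"],
    "target_folders": ["/media/tv", "/media/movies"],   # hypothetical paths
    "enabled": True,
}).json()["rule"]

# Dry-run the rule immediately to preview how many lines would be removed.
report = requests.post(f"{BASE}/{rule['id']}/run", json={"dry_run": True}).json()
print(report["files_scanned"], report["removed_lines"])
```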
@@ -0,0 +1,4 @@
from .models import AutomationRule
from .engine import AutomationEngine

__all__ = ["AutomationRule", "AutomationEngine"]
@@ -0,0 +1,74 @@
from __future__ import annotations

from pathlib import Path
from typing import Iterable, List, Tuple

from logging_utils import get_logger
from core.subtitle_processor import SubtitleBlock, parse_srt, format_srt

logger = get_logger(__name__)


def enumerate_srt_files(folders: Iterable[str]) -> List[Path]:
    files: List[Path] = []
    for folder in folders:
        if not folder:
            continue
        path = Path(folder)
        if not path.exists():
            logger.warning("Automation folder does not exist: %s", folder)
            continue
        if not path.is_dir():
            continue
        files.extend([p for p in path.rglob("*.srt") if p.is_file()])
    return files


def remove_lines_matching_patterns(file_path: str, patterns: List[str], dry_run: bool = False) -> Tuple[bool, int]:
    """Remove subtitle lines containing any of the provided patterns."""
    if not patterns:
        return False, 0

    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    content = path.read_text(encoding="utf-8", errors="ignore")
    blocks = parse_srt(content)

    lowered_patterns = [p.lower() for p in patterns if p]
    removed_lines = 0
    updated_blocks: List[SubtitleBlock] = []

    for block in blocks:
        lines = block.text.splitlines()
        kept_lines = []
        for line in lines:
            line_lower = line.lower()
            if any(pattern in line_lower for pattern in lowered_patterns):
                removed_lines += 1
                continue
            kept_lines.append(line)

        if kept_lines:
            updated_blocks.append(
                SubtitleBlock(
                    index=block.index,
                    start_time=block.start_time,
                    end_time=block.end_time,
                    text="\n".join(kept_lines).strip(),
                )
            )

    if removed_lines == 0:
        return False, 0

    renumbered = [
        SubtitleBlock(i + 1, b.start_time, b.end_time, b.text)
        for i, b in enumerate(updated_blocks)
    ]

    if not dry_run:
        path.write_text(format_srt(renumbered), encoding="utf-8")

    return True, removed_lines
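A minimal sketch of driving these helpers directly, under the assumption that the folder path is hypothetical; matching is case-insensitive substring matching, so lower-case patterns behave the same as mixed-case ones.

```python
# Sketch only: preview what a cleanup pass would remove without writing files.
from automations.actions import enumerate_srt_files, remove_lines_matching_patterns

for srt in enumerate_srt_files(["/media/tv"]):          # recursive *.srt discovery
    changed, removed = remove_lines_matching_patterns(
        str(srt), ["opensubtitles", "yts"], dry_run=True
    )
    if changed:
        print(f"{srt}: {removed} line(s) would be removed")
```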
@@ -0,0 +1,131 @@
from __future__ import annotations

import threading
from typing import Dict, List, Optional

from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger

from logging_utils import get_logger
from automations.actions import enumerate_srt_files, remove_lines_matching_patterns
from automations.models import AutomationRule

logger = get_logger(__name__)


class AutomationEngine:
    """Scheduler for automation rules."""

    def __init__(self, db_manager):
        self.db_manager = db_manager
        self._scheduler = BackgroundScheduler(daemon=True)
        self._lock = threading.Lock()
        self._started = False

    def start(self):
        with self._lock:
            if self._started:
                return
            self._scheduler.start()
            self._started = True
        self.reload_rules()
        logger.info("Automation scheduler started")

    def shutdown(self):
        with self._lock:
            if not self._started:
                return
            self._scheduler.shutdown(wait=False)
            self._started = False
        logger.info("Automation scheduler stopped")

    def reload_rules(self):
        """Reload all automation rules from storage."""
        with self._lock:
            self._scheduler.remove_all_jobs()
            rules = self._load_rules()
            for rule in rules:
                if not rule.enabled:
                    continue
                self._schedule_rule(rule)

    def run_rule_now(self, rule_id: str, dry_run: bool = False) -> dict:
        rule = self._get_rule(rule_id)
        if not rule:
            return {"success": False, "error": "Rule not found"}
        return self._execute_rule(rule, dry_run=dry_run)

    def _load_rules(self) -> List[AutomationRule]:
        rules_raw = self.db_manager.get_automation_rules()
        return [AutomationRule.from_dict(r) for r in rules_raw]

    def _get_rule(self, rule_id: str) -> Optional[AutomationRule]:
        raw = self.db_manager.get_automation_rule(rule_id)
        return AutomationRule.from_dict(raw) if raw else None

    def _schedule_rule(self, rule: AutomationRule):
        try:
            trigger = CronTrigger.from_crontab(rule.schedule)
        except ValueError as e:
            logger.error("Invalid cron schedule for rule %s: %s", rule.id, e)
            return
        self._scheduler.add_job(
            self._run_rule_job,
            trigger=trigger,
            args=[rule.id],
            id=f"automation:{rule.id}",
            replace_existing=True,
            misfire_grace_time=300,
            max_instances=1,
        )

    def _run_rule_job(self, rule_id: str):
        rule = self._get_rule(rule_id)
        if not rule or not rule.enabled:
            return
        self._execute_rule(rule, dry_run=False)

    def _execute_rule(self, rule: AutomationRule, dry_run: bool) -> dict:
        files = enumerate_srt_files(rule.target_folders)
        modified = 0
        total_removed = 0
        errors: List[str] = []

        for file_path in files:
            try:
                did_modify, removed_lines = remove_lines_matching_patterns(
                    str(file_path),
                    rule.patterns,
                    dry_run=dry_run,
                )
                if did_modify:
                    modified += 1
                    total_removed += removed_lines
                self.db_manager.add_automation_log(
                    rule_id=rule.id,
                    file_path=str(file_path),
                    modified=did_modify,
                    removed_lines=removed_lines,
                    dry_run=dry_run,
                    error_message=None,
                )
            except Exception as e:
                errors.append(f"{file_path}: {e}")
                self.db_manager.add_automation_log(
                    rule_id=rule.id,
                    file_path=str(file_path),
                    modified=False,
                    removed_lines=0,
                    dry_run=dry_run,
                    error_message=str(e),
                )

        return {
            "success": True,
            "rule_id": rule.id,
            "files_scanned": len(files),
            "files_modified": modified,
            "removed_lines": total_removed,
            "dry_run": dry_run,
            "errors": errors,
        }
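A hedged lifecycle sketch for the engine above. How the application constructs its `DatabaseManager` instance is not shown in this diff, so the bare constructor call and the rule id below are assumptions for illustration.

```python
# Sketch only: engine lifecycle, assuming DatabaseManager() matches the app's own setup.
from core.database import DatabaseManager
from automations.engine import AutomationEngine

db = DatabaseManager()               # assumption: default construction
engine = AutomationEngine(db)
engine.start()                       # starts APScheduler and schedules enabled rules

engine.reload_rules()                # call after rules change in the database

# Run one rule immediately instead of waiting for its cron trigger:
print(engine.run_rule_now("some-rule-id", dry_run=True))   # placeholder id

engine.shutdown()
```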
@@ -0,0 +1,25 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import List


@dataclass(slots=True)
class AutomationRule:
    id: str
    name: str
    schedule: str
    enabled: bool
    patterns: List[str]
    target_folders: List[str]

    @staticmethod
    def from_dict(data: dict) -> "AutomationRule":
        return AutomationRule(
            id=str(data.get("id", "")),
            name=str(data.get("name", "")),
            schedule=str(data.get("schedule", "")),
            enabled=bool(data.get("enabled", True)),
            patterns=list(data.get("patterns", []) or []),
            target_folders=list(data.get("target_folders", []) or []),
        )
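A small sketch of the defaults `from_dict` applies when optional keys are missing; the rule values are illustrative only.

```python
# Sketch: missing keys fall back to enabled=True and empty lists.
from automations.models import AutomationRule

rule = AutomationRule.from_dict({
    "id": "abc123",
    "name": "Strip release-group ads",
    "schedule": "30 4 * * 1",
    # "enabled", "patterns" and "target_folders" intentionally omitted
})
assert rule.enabled is True
assert rule.patterns == [] and rule.target_folders == []
```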
@@ -184,6 +184,40 @@ class FolderRule(Base):
        return f"<FolderRule(id={self.id}, directory='{self.directory}')>"


class AutomationRule(Base):
    """Automation rules for scheduled tasks"""
    __tablename__ = 'automation_rules'

    id = Column(String(64), primary_key=True)
    name = Column(String(255), nullable=False)
    schedule = Column(String(100), nullable=False)
    enabled = Column(Boolean, default=True, nullable=False)
    patterns = Column(Text, nullable=False)  # JSON list
    target_folders = Column(Text, nullable=False)  # JSON list
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    def __repr__(self):
        return f"<AutomationRule(id='{self.id}', name='{self.name}', enabled={self.enabled})>"


class AutomationLog(Base):
    """Automation run log entries"""
    __tablename__ = 'automation_logs'

    id = Column(Integer, primary_key=True)
    rule_id = Column(String(64), nullable=False, index=True)
    file_path = Column(String(500), nullable=False)
    modified = Column(Boolean, default=False)
    removed_lines = Column(Integer, default=0)
    dry_run = Column(Boolean, default=False)
    error_message = Column(Text)
    run_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    def __repr__(self):
        return f"<AutomationLog(rule_id='{self.rule_id}', file_path='{self.file_path}')>"


class DatabaseManager:
    """Manages database connections and operations"""

@@ -1033,6 +1067,112 @@ class DatabaseManager:
        finally:
            session.close()

    # ============ AUTOMATION RULES OPERATIONS ============

    def get_automation_rules(self):
        """Get all automation rules"""
        session = self.get_session()
        try:
            rules = session.query(AutomationRule).order_by(AutomationRule.created_at.asc()).all()
            result = []
            for rule in rules:
                result.append({
                    "id": rule.id,
                    "name": rule.name,
                    "schedule": rule.schedule,
                    "enabled": rule.enabled,
                    "patterns": json.loads(rule.patterns) if rule.patterns else [],
                    "target_folders": json.loads(rule.target_folders) if rule.target_folders else [],
                    "created_at": rule.created_at.isoformat() if rule.created_at else None,
                    "updated_at": rule.updated_at.isoformat() if rule.updated_at else None
                })
            return result
        finally:
            session.close()

    def get_automation_rule(self, rule_id):
        """Get a single automation rule"""
        session = self.get_session()
        try:
            rule = session.query(AutomationRule).filter_by(id=rule_id).first()
            if not rule:
                return None
            return {
                "id": rule.id,
                "name": rule.name,
                "schedule": rule.schedule,
                "enabled": rule.enabled,
                "patterns": json.loads(rule.patterns) if rule.patterns else [],
                "target_folders": json.loads(rule.target_folders) if rule.target_folders else [],
                "created_at": rule.created_at.isoformat() if rule.created_at else None,
                "updated_at": rule.updated_at.isoformat() if rule.updated_at else None
            }
        finally:
            session.close()

    def upsert_automation_rule(self, rule_data):
        """Create or update an automation rule"""
        session = self.get_session()
        try:
            rule_id = rule_data["id"]
            rule = session.query(AutomationRule).filter_by(id=rule_id).first()
            if not rule:
                rule = AutomationRule(id=rule_id)
                session.add(rule)

            rule.name = rule_data["name"]
            rule.schedule = rule_data["schedule"]
            rule.enabled = bool(rule_data.get("enabled", True))
            rule.patterns = json.dumps(rule_data.get("patterns", []))
            rule.target_folders = json.dumps(rule_data.get("target_folders", []))

            session.commit()
            return True
        except Exception as e:
            session.rollback()
            logger.error(f"Error saving automation rule: {e}")
            return False
        finally:
            session.close()

    def delete_automation_rule(self, rule_id):
        """Delete an automation rule"""
        session = self.get_session()
        try:
            rule = session.query(AutomationRule).filter_by(id=rule_id).first()
            if rule:
                session.delete(rule)
                session.commit()
                return True
            return False
        except Exception as e:
            session.rollback()
            logger.error(f"Error deleting automation rule: {e}")
            return False
        finally:
            session.close()

    def add_automation_log(self, rule_id, file_path, modified, removed_lines, dry_run=False, error_message=None):
        """Add an automation log entry"""
        session = self.get_session()
        try:
            entry = AutomationLog(
                rule_id=rule_id,
                file_path=file_path,
                modified=bool(modified),
                removed_lines=int(removed_lines or 0),
                dry_run=bool(dry_run),
                error_message=error_message
            )
            session.add(entry)
            session.commit()
        except Exception as e:
            session.rollback()
            logger.error(f"Error saving automation log: {e}")
            raise
        finally:
            session.close()

    # ============ MAINTENANCE OPERATIONS ============

    def clear_settings(self, keep_api_keys=False):
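A hedged sketch of the new DatabaseManager helpers. As with the engine example above, plain `DatabaseManager()` construction and the rule values are assumptions for illustration; the JSON list columns are decoded back into Python lists on read.

```python
# Sketch only: round-trip a rule through the new automation CRUD helpers.
import uuid
from core.database import DatabaseManager

db = DatabaseManager()               # assumption: default construction
rule_id = str(uuid.uuid4())
db.upsert_automation_rule({
    "id": rule_id,
    "name": "Weekly cleanup",
    "schedule": "0 6 * * 0",
    "enabled": True,
    "patterns": ["yts", "opensubtitles"],
    "target_folders": ["/media/movies"],        # hypothetical path
})
print(db.get_automation_rule(rule_id)["patterns"])   # lists come back JSON-decoded
db.delete_automation_rule(rule_id)
```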
@@ -18,6 +18,7 @@ logger = get_logger("FileScanner")
import sys
sys.path.insert(0, str(Path(__file__).parent))
from subtitle_processor import parse_srt, SUBLOGUE_SENTINEL, SUBLOGUE_TOKEN_PATTERN
from keyword_stripper import get_stripper


class FileScanner:
@@ -45,6 +46,7 @@ class FileScanner:
        directory_path: str | Path,
        batch_size: int = DEFAULT_BATCH_SIZE,
        follow_symlinks: bool = False,
        detect_cleanup_keywords: bool = False,
    ) -> Generator[List[Dict], None, None]:
        """
        Recursively scan a directory tree for .srt files.
@@ -106,8 +108,11 @@
            # Plot detection
            # --------------------------------------------

            content = None
            try:
                plot_marker_count = cls._count_plot_markers(file_path)
                if detect_cleanup_keywords:
                    content = file_path.read_text(encoding="utf-8", errors="ignore")
                plot_marker_count = cls._count_plot_markers(file_path, content=content)
                has_plot = plot_marker_count > 0
                logger.debug(
                    "Plot check for %s: %s",
@@ -126,7 +131,7 @@

            if has_plot:
                try:
                    metadata = cls._extract_metadata(file_path)
                    metadata = cls._extract_metadata(file_path, content=content)
                    logger.debug(
                        "Extracted metadata from %s: %s",
                        file_path.name,
@@ -138,6 +143,13 @@
                        file_path.name, e
                    )

            clean_keywords = []
            if detect_cleanup_keywords and content:
                try:
                    clean_keywords = get_stripper().detect_subtitle_watermarks(content)
                except Exception as e:
                    logger.debug("Cleanup keyword detection failed: %s", e)

            status = "Has Plot" if has_plot else "Not Loaded"
            if plot_marker_count > 1:
                status = "Duplicate Plot"
@@ -156,6 +168,7 @@
                "imdb_rating": metadata.get("imdb_rating"),
                "rating": metadata.get("imdb_rating"),
                "runtime": metadata.get("runtime"),
                "clean_keywords": clean_keywords,
                "selected": False,
            })

@@ -216,14 +229,15 @@
        )

    @classmethod
    def _count_plot_markers(cls, file_path: Path) -> int:
    def _count_plot_markers(cls, file_path: Path, content: str | None = None) -> int:
        """
        Count Sublogue plot markers to detect duplicates.
        """
        logger.debug("Scanning for plot markers in %s", file_path.name)

        try:
            content = file_path.read_text(encoding="utf-8", errors="ignore")
            if content is None:
                content = file_path.read_text(encoding="utf-8", errors="ignore")
            lower_content = content.lower()
            generated_count = lower_content.count("generated by sublogue")
            if generated_count > 0:
@@ -237,14 +251,15 @@
        return 0

    @classmethod
    def _extract_metadata(cls, file_path: Path) -> Dict:
    def _extract_metadata(cls, file_path: Path, content: str | None = None) -> Dict:
        """
        Extract title, year, rating, runtime, and plot
        from Sublogue-generated subtitles.
        """
        logger.debug("Extracting metadata from %s", file_path.name)

        content = file_path.read_text(encoding="utf-8", errors="ignore")
        if content is None:
            content = file_path.read_text(encoding="utf-8", errors="ignore")
        blocks = parse_srt(content)

        metadata = {
@@ -199,6 +199,39 @@ class KeywordStripper:
        r"sign\s+up\s+(now|today|free)",
    ]

    # Force-remove entire subtitle blocks if these appear anywhere in a line.
    # Partial matches are intentional (e.g. "OpenSubtitles.org").
    SUBTITLE_FORCE_REMOVE = [
        r"yts",
        r"opensubtitles?",
    ]

    _custom_force_remove_keywords: List[str] = []

    # Labels used for reporting detected watermark keywords in clean-only scans
    SUBTITLE_WATERMARK_LABELS = [
        (r"yts\.mx|yts\.am|yts\.lt|yts\.ag|\byts\b", "YTS"),
        (r"\byify\b", "YIFY"),
        (r"\brarbg\b", "RARBG"),
        (r"\beztv\b", "EZTV"),
        (r"\bettv\b", "ETTV"),
        (r"torrentgalaxy|\btgx\b", "TorrentGalaxy"),
        (r"1337x", "1337x"),
        (r"limetorrents?", "LimeTorrents"),
        (r"\bevo\b", "EVO"),
        (r"\bpsa\b", "PSA"),
        (r"\bfgt\b", "FGT"),
        (r"opensubtitles?", "OpenSubtitles"),
        (r"sub\.?scene|subscene", "Subscene"),
        (r"addic7ed", "Addic7ed"),
        (r"podnapisi", "Podnapisi"),
        (r"yifysubtitles?", "YIFY Subtitles"),
        (r"legendas\.?tv", "LegendasTV"),
        (r"shooter\.?cn", "ShooterCN"),
        (r"subhd", "SubHD"),
        (r"www\.[a-z0-9\-]+\.(com|org|net|io|tv|mx|am|lt|ag)|https?://", "URL"),
    ]

    # Patterns that indicate an ENTIRE subtitle block should be removed
    # (not just the matching text, but the whole block)
    SUBTITLE_BLOCK_REMOVERS = [
@@ -234,6 +267,11 @@ class KeywordStripper:
        def c(p):
            return re.compile(p, re.IGNORECASE | re.VERBOSE)

        custom_force_remove = [
            re.escape(k) for k in cls._custom_force_remove_keywords if k
        ]
        combined_force_remove = cls.SUBTITLE_FORCE_REMOVE + custom_force_remove

        cls._compiled = {
            "junk": c("|".join([
                cls.QUALITY,
@@ -255,6 +293,9 @@ class KeywordStripper:
            "subtitle_block_removers": [
                re.compile(p, re.IGNORECASE | re.MULTILINE) for p in cls.SUBTITLE_BLOCK_REMOVERS
            ],
            "subtitle_force_remove": [
                re.compile(p, re.IGNORECASE) for p in combined_force_remove
            ],
        }

        return cls._compiled
@@ -348,6 +389,11 @@ class KeywordStripper:
            if not line:
                continue

            # Hard kill-switch: if a line mentions these sources, drop the whole block.
            for pattern in rx["subtitle_force_remove"]:
                if pattern.search(line):
                    return True

            # Check if this line matches any block remover pattern
            is_ad_line = False
            for pattern in rx["subtitle_block_removers"]:
@@ -469,6 +515,24 @@ class KeywordStripper:

        return cleaned

    def detect_subtitle_watermarks(self, text: str) -> List[str]:
        """Detect known subtitle watermark keywords in raw subtitle text."""
        detected = []
        for pattern, label in self.SUBTITLE_WATERMARK_LABELS:
            if re.search(pattern, text, re.IGNORECASE):
                detected.append(label)
        for keyword in self._custom_force_remove_keywords:
            if keyword and re.search(re.escape(keyword), text, re.IGNORECASE):
                detected.append(keyword)
        return detected

    def set_force_remove_keywords(self, keywords: List[str]) -> None:
        """Set custom force-remove keywords and refresh regex cache."""
        type(self)._custom_force_remove_keywords = [
            k.strip() for k in (keywords or []) if k and k.strip()
        ]
        type(self)._compiled = None


# -----------------------------
# SINGLETON HELPERS
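A hedged sketch of the new watermark helpers, driven through the module's singleton accessor; the custom keyword "MySubSite" and the sample text are invented for illustration. Note that `set_force_remove_keywords` updates class-level state and drops the compiled-regex cache so the next call rebuilds it.

```python
# Sketch only: detect watermark labels plus a custom force-remove keyword.
from core.keyword_stripper import get_stripper

stripper = get_stripper()
stripper.set_force_remove_keywords(["MySubSite"])    # e.g. value from settings

sample = "Subtitles downloaded from OpenSubtitles.org\nVisit MySubSite for more"
print(stripper.detect_subtitle_watermarks(sample))
# expected: ["OpenSubtitles", "MySubSite"] given the patterns above
```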
@@ -1373,6 +1373,8 @@ class SubtitleProcessor:
        # ─────────────────────────────────────────────────────────────
        original = file_path.read_text(encoding="utf-8", errors="ignore")
        subs = parse_srt(original)
        stripper = get_stripper()
        detected_keywords = stripper.detect_subtitle_watermarks(original)

        if not subs:
            return self._fail("No valid subtitle blocks found")
@@ -1540,6 +1542,116 @@
            logger.error(f"Could not acquire lock for {file_path.name}: {e}")
            return self._fail(f"File is being processed by another task: {e}")

    def clean_file(
        self,
        file_path: str | Path,
        clean_subtitle_content: bool = True,
    ) -> dict:
        """Clean ad/watermark content from a subtitle file without inserting plots."""
        file_path = Path(file_path)

        if not file_path.exists():
            return self._fail("File not found")

        if file_path.stat().st_size > self.MAX_SRT_BYTES:
            return self._fail("Subtitle file too large")

        try:
            with file_lock(file_path, timeout=30.0):
                original = file_path.read_text(encoding="utf-8", errors="ignore")
                subs = parse_srt(original)
                # Needed below for block removal and keyword reporting
                stripper = get_stripper()
                detected_keywords = stripper.detect_subtitle_watermarks(original)

                if not subs:
                    return self._fail("No valid subtitle blocks found")

                original_blocks = subs
                removed_count = 0
                modified_count = 0

                if clean_subtitle_content:
                    cleaned_blocks: List[SubtitleBlock] = []

                    for block in original_blocks:
                        text = block.text
                        if stripper.should_remove_subtitle_block(text):
                            removed_count += 1
                            continue

                        cleaned_text = stripper.clean_subtitle_text(text)
                        if not cleaned_text.strip():
                            removed_count += 1
                            continue

                        if cleaned_text != text:
                            modified_count += 1

                        cleaned_blocks.append(
                            SubtitleBlock(
                                block.index,
                                block.start_time,
                                block.end_time,
                                cleaned_text,
                            )
                        )
                else:
                    cleaned_blocks = list(original_blocks)

                sanitized = sanitize_all_blocks(cleaned_blocks)
                if len(sanitized) < len(cleaned_blocks):
                    removed_count += len(cleaned_blocks) - len(sanitized)

                if not sanitized:
                    return self._fail("No dialogue subtitles found after cleaning")

                renumbered = [
                    SubtitleBlock(i + 1, b.start_time, b.end_time, b.text)
                    for i, b in enumerate(sanitized)
                ]

                changed = len(renumbered) != len(original_blocks)
                if not changed:
                    for updated, original_block in zip(renumbered, original_blocks):
                        if (
                            updated.start_time != original_block.start_time
                            or updated.end_time != original_block.end_time
                            or updated.text != original_block.text
                        ):
                            changed = True
                            break

                if not changed:
                    return {
                        "success": True,
                        "status": "Skipped",
                        "summary": "No changes needed",
                        "removed_blocks": 0,
                        "modified_blocks": 0,
                        "clean_keywords": detected_keywords,
                    }

                tmp = file_path.with_suffix(".srt.tmp")
                tmp.write_text(format_srt(renumbered), encoding="utf-8")
                tmp.replace(file_path)

                summary = (
                    f"Removed {removed_count} ad blocks, modified {modified_count} blocks"
                    if clean_subtitle_content
                    else "Cleaned subtitle content"
                )

                return {
                    "success": True,
                    "status": "Cleaned",
                    "summary": summary,
                    "removed_blocks": removed_count,
                    "modified_blocks": modified_count,
                    "clean_keywords": detected_keywords,
                }

        except FileLockError as e:
            logger.error(f"Could not acquire lock for {file_path.name}: {e}")
            return self._fail(f"File is being processed by another task: {e}")

    # ========================================================
    # Metadata fetching
    # ========================================================
@@ -6,6 +6,9 @@ aiohttp>=3.9.0
# Database
sqlalchemy>=2.0.0

# Scheduler
apscheduler>=3.10.4

# Environment
python-dotenv>=1.0.0