1.1.0 - automations, clean only mode, bug fixes

ponzischeme89
2026-01-19 02:10:08 +13:00
parent 93e8b38e24
commit 9345ac4331
25 changed files with 2690 additions and 499 deletions
+140
@@ -184,6 +184,40 @@ class FolderRule(Base):
return f"<FolderRule(id={self.id}, directory='{self.directory}')>"
class AutomationRule(Base):
"""Automation rules for scheduled tasks"""
__tablename__ = 'automation_rules'
id = Column(String(64), primary_key=True)
name = Column(String(255), nullable=False)
schedule = Column(String(100), nullable=False)
enabled = Column(Boolean, default=True, nullable=False)
patterns = Column(Text, nullable=False) # JSON list
target_folders = Column(Text, nullable=False) # JSON list
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
def __repr__(self):
return f"<AutomationRule(id='{self.id}', name='{self.name}', enabled={self.enabled})>"
class AutomationLog(Base):
"""Automation run log entries"""
__tablename__ = 'automation_logs'
id = Column(Integer, primary_key=True)
rule_id = Column(String(64), nullable=False, index=True)
file_path = Column(String(500), nullable=False)
modified = Column(Boolean, default=False)
removed_lines = Column(Integer, default=0)
dry_run = Column(Boolean, default=False)
error_message = Column(Text)
run_at = Column(DateTime, default=datetime.utcnow, nullable=False)
def __repr__(self):
return f"<AutomationLog(rule_id='{self.rule_id}', file_path='{self.file_path}')>"
class DatabaseManager:
"""Manages database connections and operations"""
@@ -1033,6 +1067,112 @@ class DatabaseManager:
finally:
session.close()
# ============ AUTOMATION RULES OPERATIONS ============
def get_automation_rules(self):
"""Get all automation rules"""
session = self.get_session()
try:
rules = session.query(AutomationRule).order_by(AutomationRule.created_at.asc()).all()
result = []
for rule in rules:
result.append({
"id": rule.id,
"name": rule.name,
"schedule": rule.schedule,
"enabled": rule.enabled,
"patterns": json.loads(rule.patterns) if rule.patterns else [],
"target_folders": json.loads(rule.target_folders) if rule.target_folders else [],
"created_at": rule.created_at.isoformat() if rule.created_at else None,
"updated_at": rule.updated_at.isoformat() if rule.updated_at else None
})
return result
finally:
session.close()
def get_automation_rule(self, rule_id):
"""Get a single automation rule"""
session = self.get_session()
try:
rule = session.query(AutomationRule).filter_by(id=rule_id).first()
if not rule:
return None
return {
"id": rule.id,
"name": rule.name,
"schedule": rule.schedule,
"enabled": rule.enabled,
"patterns": json.loads(rule.patterns) if rule.patterns else [],
"target_folders": json.loads(rule.target_folders) if rule.target_folders else [],
"created_at": rule.created_at.isoformat() if rule.created_at else None,
"updated_at": rule.updated_at.isoformat() if rule.updated_at else None
}
finally:
session.close()
def upsert_automation_rule(self, rule_data):
"""Create or update an automation rule"""
session = self.get_session()
try:
rule_id = rule_data["id"]
rule = session.query(AutomationRule).filter_by(id=rule_id).first()
if not rule:
rule = AutomationRule(id=rule_id)
session.add(rule)
rule.name = rule_data["name"]
rule.schedule = rule_data["schedule"]
rule.enabled = bool(rule_data.get("enabled", True))
rule.patterns = json.dumps(rule_data.get("patterns", []))
rule.target_folders = json.dumps(rule_data.get("target_folders", []))
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error saving automation rule: {e}")
return False
finally:
session.close()
def delete_automation_rule(self, rule_id):
"""Delete an automation rule"""
session = self.get_session()
try:
rule = session.query(AutomationRule).filter_by(id=rule_id).first()
if rule:
session.delete(rule)
session.commit()
return True
return False
except Exception as e:
session.rollback()
logger.error(f"Error deleting automation rule: {e}")
return False
finally:
session.close()
def add_automation_log(self, rule_id, file_path, modified, removed_lines, dry_run=False, error_message=None):
"""Add an automation log entry"""
session = self.get_session()
try:
entry = AutomationLog(
rule_id=rule_id,
file_path=file_path,
modified=bool(modified),
removed_lines=int(removed_lines or 0),
dry_run=bool(dry_run),
error_message=error_message
)
session.add(entry)
session.commit()
except Exception as e:
session.rollback()
logger.error(f"Error saving automation log: {e}")
raise
finally:
session.close()
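Together these methods form the automation-rules CRUD surface. A hedged usage sketch (the DatabaseManager constructor arguments are not shown in this diff and are assumed):

db = DatabaseManager()  # constructor signature is an assumption

db.upsert_automation_rule({
    "id": "nightly-clean",                    # hypothetical id
    "name": "Nightly cleanup",
    "schedule": "0 3 * * *",
    "enabled": True,
    "patterns": ["*.srt"],
    "target_folders": ["/media/tv"],
})
rule = db.get_automation_rule("nightly-clean")   # dict, or None if missing
db.add_automation_log(
    rule_id="nightly-clean",
    file_path="/media/tv/example.srt",
    modified=True,
    removed_lines=3,
)
db.delete_automation_rule("nightly-clean")       # True only if a row was deleted

Note that upsert_automation_rule indexes "id", "name", and "schedule" directly, so those keys are required (KeyError otherwise); the remaining keys have defaults.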
# ============ MAINTENANCE OPERATIONS ============
def clear_settings(self, keep_api_keys=False):
+21 -6
@@ -18,6 +18,7 @@ logger = get_logger("FileScanner")
import sys
sys.path.insert(0, str(Path(__file__).parent))
from subtitle_processor import parse_srt, SUBLOGUE_SENTINEL, SUBLOGUE_TOKEN_PATTERN
from keyword_stripper import get_stripper
class FileScanner:
@@ -45,6 +46,7 @@ class FileScanner:
directory_path: str | Path,
batch_size: int = DEFAULT_BATCH_SIZE,
follow_symlinks: bool = False,
detect_cleanup_keywords: bool = False,
) -> Generator[List[Dict], None, None]:
"""
Recursively scan a directory tree for .srt files.
@@ -106,8 +108,11 @@ class FileScanner:
# Plot detection
# --------------------------------------------
content = None
try:
plot_marker_count = cls._count_plot_markers(file_path)
if detect_cleanup_keywords:
content = file_path.read_text(encoding="utf-8", errors="ignore")
plot_marker_count = cls._count_plot_markers(file_path, content=content)
has_plot = plot_marker_count > 0
logger.debug(
"Plot check for %s: %s",
@@ -126,7 +131,7 @@ class FileScanner:
if has_plot:
try:
metadata = cls._extract_metadata(file_path)
metadata = cls._extract_metadata(file_path, content=content)
logger.debug(
"Extracted metadata from %s: %s",
file_path.name,
@@ -138,6 +143,13 @@ class FileScanner:
file_path.name, e
)
clean_keywords = []
if detect_cleanup_keywords and content:
try:
clean_keywords = get_stripper().detect_subtitle_watermarks(content)
except Exception as e:
logger.debug("Cleanup keyword detection failed: %s", e)
status = "Has Plot" if has_plot else "Not Loaded"
if plot_marker_count > 1:
status = "Duplicate Plot"
@@ -156,6 +168,7 @@ class FileScanner:
"imdb_rating": metadata.get("imdb_rating"),
"rating": metadata.get("imdb_rating"),
"runtime": metadata.get("runtime"),
"clean_keywords": clean_keywords,
"selected": False,
})
@@ -216,14 +229,15 @@ class FileScanner:
)
@classmethod
def _count_plot_markers(cls, file_path: Path) -> int:
def _count_plot_markers(cls, file_path: Path, content: str | None = None) -> int:
"""
Count Sublogue plot markers to detect duplicates.
"""
logger.debug("Scanning for plot markers in %s", file_path.name)
try:
content = file_path.read_text(encoding="utf-8", errors="ignore")
if content is None:
content = file_path.read_text(encoding="utf-8", errors="ignore")
lower_content = content.lower()
generated_count = lower_content.count("generated by sublogue")
if generated_count > 0:
@@ -237,14 +251,15 @@ class FileScanner:
return 0
@classmethod
def _extract_metadata(cls, file_path: Path) -> Dict:
def _extract_metadata(cls, file_path: Path, content: str | None = None) -> Dict:
"""
Extract title, year, rating, runtime, and plot
from Sublogue-generated subtitles.
"""
logger.debug("Extracting metadata from %s", file_path.name)
content = file_path.read_text(encoding="utf-8", errors="ignore")
if content is None:
content = file_path.read_text(encoding="utf-8", errors="ignore")
blocks = parse_srt(content)
metadata = {
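With detect_cleanup_keywords=True the file is read once and the content is threaded through _count_plot_markers, _extract_metadata, and the watermark detector, avoiding repeated reads of the same file. A hedged sketch of driving the generator (the method name scan_directory is an assumption; only its parameters are visible in this hunk):

for batch in FileScanner.scan_directory(       # name assumed, see above
    "/media/tv",
    detect_cleanup_keywords=True,
):
    for entry in batch:
        if entry["clean_keywords"]:
            # "path" key is an assumption; "clean_keywords" is shown in the diff
            print(entry.get("path"), entry["clean_keywords"])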
+64
@@ -199,6 +199,39 @@ class KeywordStripper:
r"sign\s+up\s+(now|today|free)",
]
# Force-remove entire subtitle blocks if these appear anywhere in a line.
# Partial matches are intentional (e.g. "OpenSubtitles.org").
SUBTITLE_FORCE_REMOVE = [
r"yts",
r"opensubtitles?",
]
_custom_force_remove_keywords: List[str] = []
# Labels used for reporting detected watermark keywords in clean-only scans
SUBTITLE_WATERMARK_LABELS = [
(r"yts\.mx|yts\.am|yts\.lt|yts\.ag|\byts\b", "YTS"),
(r"\byify\b", "YIFY"),
(r"\brarbg\b", "RARBG"),
(r"\beztv\b", "EZTV"),
(r"\bettv\b", "ETTV"),
(r"torrentgalaxy|\btgx\b", "TorrentGalaxy"),
(r"1337x", "1337x"),
(r"limetorrents?", "LimeTorrents"),
(r"\bevo\b", "EVO"),
(r"\bpsa\b", "PSA"),
(r"\bfgt\b", "FGT"),
(r"opensubtitles?", "OpenSubtitles"),
(r"sub\.?scene|subscene", "Subscene"),
(r"addic7ed", "Addic7ed"),
(r"podnapisi", "Podnapisi"),
(r"yifysubtitles?", "YIFY Subtitles"),
(r"legendas\.?tv", "LegendasTV"),
(r"shooter\.?cn", "ShooterCN"),
(r"subhd", "SubHD"),
(r"www\.[a-z0-9\-]+\.(com|org|net|io|tv|mx|am|lt|ag)|https?://", "URL"),
]
# Patterns that indicate an ENTIRE subtitle block should be removed
# (not just the matching text, but the whole block)
SUBTITLE_BLOCK_REMOVERS = [
@@ -234,6 +267,11 @@ class KeywordStripper:
def c(p):
return re.compile(p, re.IGNORECASE | re.VERBOSE)
custom_force_remove = [
re.escape(k) for k in cls._custom_force_remove_keywords if k
]
combined_force_remove = cls.SUBTITLE_FORCE_REMOVE + custom_force_remove
cls._compiled = {
"junk": c("|".join([
cls.QUALITY,
@@ -255,6 +293,9 @@ class KeywordStripper:
"subtitle_block_removers": [
re.compile(p, re.IGNORECASE | re.MULTILINE) for p in cls.SUBTITLE_BLOCK_REMOVERS
],
"subtitle_force_remove": [
re.compile(p, re.IGNORECASE) for p in combined_force_remove
],
}
return cls._compiled
@@ -348,6 +389,11 @@ class KeywordStripper:
if not line:
continue
# Hard kill-switch: if a line mentions these sources, drop the whole block.
for pattern in rx["subtitle_force_remove"]:
if pattern.search(line):
return True
# Check if this line matches any block remover pattern
is_ad_line = False
for pattern in rx["subtitle_block_removers"]:
@@ -469,6 +515,24 @@ class KeywordStripper:
return cleaned
def detect_subtitle_watermarks(self, text: str) -> List[str]:
"""Detect known subtitle watermark keywords in raw subtitle text."""
detected = []
for pattern, label in self.SUBTITLE_WATERMARK_LABELS:
if re.search(pattern, text, re.IGNORECASE):
detected.append(label)
for keyword in self._custom_force_remove_keywords:
if keyword and re.search(re.escape(keyword), text, re.IGNORECASE):
detected.append(keyword)
return detected
def set_force_remove_keywords(self, keywords: List[str]) -> None:
"""Set custom force-remove keywords and refresh regex cache."""
type(self)._custom_force_remove_keywords = [
k.strip() for k in (keywords or []) if k and k.strip()
]
type(self)._compiled = None
# -----------------------------
# SINGLETON HELPERS
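A sketch of how the new hooks compose (the sample text and custom keyword are illustrative):

from keyword_stripper import get_stripper

stripper = get_stripper()
stripper.set_force_remove_keywords(["MySubSite"])  # hypothetical custom keyword

text = "Downloaded from OpenSubtitles.org\nSubs by MySubSite"
print(stripper.detect_subtitle_watermarks(text))
# -> ["OpenSubtitles", "MySubSite"]

Because set_force_remove_keywords writes class-level state and clears the compiled-pattern cache, the custom keywords apply to every instance behind get_stripper(), not just the caller's.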
+112
@@ -1373,6 +1373,8 @@ class SubtitleProcessor:
# ─────────────────────────────────────────────────────────────
original = file_path.read_text(encoding="utf-8", errors="ignore")
subs = parse_srt(original)
stripper = get_stripper()
detected_keywords = stripper.detect_subtitle_watermarks(original)
if not subs:
return self._fail("No valid subtitle blocks found")
@@ -1540,6 +1542,116 @@ class SubtitleProcessor:
logger.error(f"Could not acquire lock for {file_path.name}: {e}")
return self._fail(f"File is being processed by another task: {e}")
def clean_file(
self,
file_path: str | Path,
clean_subtitle_content: bool = True,
) -> dict:
"""Clean ad/watermark content from a subtitle file without inserting plots."""
file_path = Path(file_path)
if not file_path.exists():
return self._fail("File not found")
if file_path.stat().st_size > self.MAX_SRT_BYTES:
return self._fail("Subtitle file too large")
try:
with file_lock(file_path, timeout=30.0):
original = file_path.read_text(encoding="utf-8", errors="ignore")
subs = parse_srt(original)
# Detect watermark keywords up front so both return paths can report them.
stripper = get_stripper()
detected_keywords = stripper.detect_subtitle_watermarks(original)
if not subs:
return self._fail("No valid subtitle blocks found")
original_blocks = subs
removed_count = 0
modified_count = 0
if clean_subtitle_content:
cleaned_blocks: List[SubtitleBlock] = []
for block in original_blocks:
text = block.text
if stripper.should_remove_subtitle_block(text):
removed_count += 1
continue
cleaned_text = stripper.clean_subtitle_text(text)
if not cleaned_text.strip():
removed_count += 1
continue
if cleaned_text != text:
modified_count += 1
cleaned_blocks.append(
SubtitleBlock(
block.index,
block.start_time,
block.end_time,
cleaned_text,
)
)
else:
cleaned_blocks = list(original_blocks)
sanitized = sanitize_all_blocks(cleaned_blocks)
if len(sanitized) < len(cleaned_blocks):
removed_count += len(cleaned_blocks) - len(sanitized)
if not sanitized:
return self._fail("No dialogue subtitles found after cleaning")
renumbered = [
SubtitleBlock(i + 1, b.start_time, b.end_time, b.text)
for i, b in enumerate(sanitized)
]
changed = len(renumbered) != len(original_blocks)
if not changed:
for updated, original_block in zip(renumbered, original_blocks):
if (
updated.start_time != original_block.start_time
or updated.end_time != original_block.end_time
or updated.text != original_block.text
):
changed = True
break
if not changed:
return {
"success": True,
"status": "Skipped",
"summary": "No changes needed",
"removed_blocks": 0,
"modified_blocks": 0,
"clean_keywords": detected_keywords,
}
tmp = file_path.with_suffix(".srt.tmp")
tmp.write_text(format_srt(renumbered), encoding="utf-8")
tmp.replace(file_path)
summary = (
f"Removed {removed_count} ad blocks, modified {modified_count} blocks"
if clean_subtitle_content
else f"Removed {removed_count} invalid blocks"
)
return {
"success": True,
"status": "Cleaned",
"summary": summary,
"removed_blocks": removed_count,
"modified_blocks": modified_count,
"clean_keywords": detected_keywords,
}
except FileLockError as e:
logger.error(f"Could not acquire lock for {file_path.name}: {e}")
return self._fail(f"File is being processed by another task: {e}")
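A hedged example of the clean-only entry point (SubtitleProcessor construction is an assumption; the result keys match the returns above):

processor = SubtitleProcessor()                  # constructor args assumed
result = processor.clean_file("/media/tv/example.srt")
if result["success"]:
    print(result["status"])                      # "Cleaned" or "Skipped"
    print(result["summary"])
    print(result["clean_keywords"])              # watermark labels found in the original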
# ========================================================
# Metadata fetching
# ========================================================