# NOTE(review): this comment header replaces web-file-viewer chrome that had
# been pasted into the module ("Files", "662 lines", "20 KiB",
# "Raw Permalink Blame History", and an ambiguous-Unicode warning) and did not
# parse as Python.  Provenance: sublogue/server/core/keyword_stripper.py.
# The viewer's ambiguous-Unicode warning refers to the Cyrillic 'т' (U+0442)
# in the "fgт" alternative of TORRENT_GROUPS below.
"""
Keyword stripper utility - removes common junk keywords from filenames and subtitle content
Optimised for torrent / subtitle garbage while preserving real titles and dialogue
"""
from __future__ import annotations
import re
import logging
from logging_utils import get_logger
from typing import Optional, List
logger = get_logger(__name__)
class KeywordStripper:
    """
    High-performance filename cleaner for movies & TV.

    Design goals:
    - Torrent / subtitle spam annihilation
    - Minimal false positives
    - Regex compiled once (cached at class level, see ``_compile``)
    - Fast enough for large libraries

    The triple-quoted pattern constants below are compiled with ``re.VERBOSE``,
    so their embedded whitespace/newlines are ignored by the regex engine.
    """

    # -----------------------------
    # CORE JUNK PATTERNS
    # -----------------------------
    # Video quality / source tags (resolutions, rip sources, HDR variants).
    QUALITY = r"""
    \b(
        480p|720p|1080p|2160p|4320p|
        4k|8k|
        hdr|hdr10|hdr10\+|dolby\s*vision|dv|
        bluray|blu[-\s]?ray|bdrip|brrip|bd|
        webrip|web[-\s]?dl|web|
        dvdrip|dvd|dvdscr|
        cam|ts|telesync|telecine|tc|
        hdrip|hdlight
    )\b
    """

    # Audio / video codec and bit-depth tags.
    CODECS = r"""
    \b(
        x264|x265|h\.?264|h\.?265|hevc|
        xvid|divx|
        aac|ac3|dts|truehd|atmos|
        dd5\.1|dd\+|
        flac|mp3|opus|
        8bit|10bit|hi10p
    )\b
    """

    # Release-group / tracker names.
    # NOTE(review): "fgт" contains a Cyrillic 'т' (U+0442) next to the ASCII
    # "fgt" alternative -- presumably deliberate homoglyph coverage for
    # spoofed tags; confirm before removing.
    # Fixed: the original pattern listed "ettv" three times (once plus a
    # duplicated "ettv|ettv|" line); duplicate alternation branches were dead.
    TORRENT_GROUPS = r"""
    \b(
        yts(\.mx)?|yify|
        rarbg|eztv|ettv|
        psa|ion10|fgт|fgt|
        tgx|torrentgalaxy|
        1337x|limetorrent|
        publichd|scene|
        ganool|evo
    )\b
    """

    # Subtitle-site credits and bare www.* domains appearing in filenames.
    SUBTITLE_ADS = r"""
    \b(
        opensubtitles|
        subscene|
        addic7ed|
        podnapisi|
        yifysubtitles|
        subtitles?\s*by|
        synced?\s*by|
        encoded?\s*by|
        resynced?\s*by
    )\b
    |
    www\.[a-z0-9\-]+\.(com|org|net)
    """

    # Language / dub / sub tags.
    LANGUAGES = r"""
    \b(
        eng|english|
        ita|italian|
        fra|french|
        spa|spanish|
        ger|german|
        multi|dubbed|
        vostfr|subfrench|
        subs?|subtitles?
    )\b
    """

    # Edition / release-status tags.
    EDITIONS = r"""
    \b(
        unrated|uncut|
        directors?\s*cut|
        extended|
        theatrical|
        imax|
        special\s*edition|
        limited|
        internal|
        proper|repack|real
    )\b
    """

    # -----------------------------
    # STRUCTURAL NOISE
    # -----------------------------
    # Any (), [], {} group and its contents (non-greedy, applied after junk
    # keyword removal so real bracketed titles lose as little as possible).
    BRACKETS = r"""
    [\[\(\{]
    .*?
    [\]\)\}]
    """
    # Dots/underscores/dashes used as word separators in release names.
    SEPARATORS = r"[._\-]+"
    MULTISPACE = r"\s+"
    # Four-digit years 1900-2099.
    YEAR_PATTERN = r"(19\d{2}|20\d{2})"
    # Accepted season/episode notations, tried in order.
    SEASON_EPISODE = [
        r"[Ss](\d{1,2})[Ee](\d{1,2})",
        r"(\d{1,2})x(\d{1,2})",
        r"Season\s*(\d{1,2})\s*Episode\s*(\d{1,2})",
    ]

    # -----------------------------
    # SUBTITLE CONTENT ADS/WATERMARKS
    # -----------------------------
    # These patterns are specifically for cleaning embedded ads from subtitle
    # TEXT.  They're more aggressive than filename patterns since we want to
    # remove entire lines.
    # Release group watermarks that appear in subtitle text.
    SUBTITLE_WATERMARKS = [
        # YTS and variants
        r"yts\.mx",
        r"yts\.am",
        r"yts\.lt",
        r"yts\.ag",
        r"\byts\b",
        r"\byify\b",
        # RARBG and other groups
        r"\brarbg\b",
        r"\beztv\b",
        r"\bettv\b",
        r"torrentgalaxy",
        r"\btgx\b",
        r"1337x",
        r"limetorrents?",
        r"\bevo\b",
        r"\bpsa\b",
        r"\bfgt\b",
        # Subtitle sites
        r"opensubtitles?",
        r"subscene",
        r"addic7ed",
        r"podnapisi",
        r"yifysubtitles?",
        r"sub\.?scene",
        r"legendas\.?tv",
        r"shooter\.?cn",
        r"subhd",
        # Generic "credited by" patterns
        r"downloaded\s+from",
        r"subtitles?\s+by",
        r"sync(?:ed|hronized)?\s+(?:and\s+)?correct(?:ed|ions?)?\s+by",
        r"ripped\s+by",
        r"encoded?\s+by",
        r"resynce?d?\s+by",
        r"improved\s+by",
        r"fixed\s+by",
        r"translated\s+by",
        r"captioned\s+by",
        r"support\s+us\s+and",
        r"get\s+more\s+subtitles",
        r"quality\s+subtitles",
        r"best\s+subtitles",
        r"free\s+subtitles",
        # URLs and domains
        r"www\.[a-z0-9\-]+\.(com|org|net|io|tv|mx|am|lt|ag)",
        r"https?://[^\s]+",
        # Social media handles that are clearly ads
        r"@yaborr",
        r"@sub_scene",
        r"follow\s+us\s+on",
        r"join\s+us\s+at",
        r"visit\s+us\s+at",
        # Promotional text
        r"advertise\s+here",
        r"membership\s+(is\s+)?free",
        r"become\s+a\s+member",
        r"register\s+(now|today|free)",
        r"sign\s+up\s+(now|today|free)",
    ]

    # Force-remove entire subtitle blocks if these appear anywhere in a line.
    # Partial matches are intentional (e.g. "OpenSubtitles.org").
    SUBTITLE_FORCE_REMOVE = [
        r"yts",
        r"opensubtitles?",
    ]

    # User-supplied keywords, merged (re.escape'd) into the force-remove set;
    # managed through set_force_remove_keywords().
    _custom_force_remove_keywords: List[str] = []

    # Labels used for reporting detected watermark keywords in clean-only scans.
    SUBTITLE_WATERMARK_LABELS = [
        (r"yts\.mx|yts\.am|yts\.lt|yts\.ag|\byts\b", "YTS"),
        (r"\byify\b", "YIFY"),
        (r"\brarbg\b", "RARBG"),
        (r"\beztv\b", "EZTV"),
        (r"\bettv\b", "ETTV"),
        (r"torrentgalaxy|\btgx\b", "TorrentGalaxy"),
        (r"1337x", "1337x"),
        (r"limetorrents?", "LimeTorrents"),
        (r"\bevo\b", "EVO"),
        (r"\bpsa\b", "PSA"),
        (r"\bfgt\b", "FGT"),
        (r"opensubtitles?", "OpenSubtitles"),
        (r"sub\.?scene|subscene", "Subscene"),
        (r"addic7ed", "Addic7ed"),
        (r"podnapisi", "Podnapisi"),
        (r"yifysubtitles?", "YIFY Subtitles"),
        (r"legendas\.?tv", "LegendasTV"),
        (r"shooter\.?cn", "ShooterCN"),
        (r"subhd", "SubHD"),
        (r"www\.[a-z0-9\-]+\.(com|org|net|io|tv|mx|am|lt|ag)|https?://", "URL"),
    ]

    # Patterns that indicate an ENTIRE subtitle block should be removed
    # (not just the matching text, but the whole block).
    SUBTITLE_BLOCK_REMOVERS = [
        # Pure promotional blocks
        r"^[\s\-_]*(?:www\.)?yts",
        r"^[\s\-_]*(?:www\.)?rarbg",
        r"^[\s\-_]*opensubtitles",
        r"^[\s\-_]*subscene",
        r"^[\s\-_]*downloaded\s+from",
        r"^[\s\-_]*subtitles?\s+by",
        r"^[\s\-_]*sync(?:ed)?\s+(?:and\s+)?correct",
        r"^[\s\-_]*support\s+us",
        r"^[\s\-_]*get\s+(?:more\s+)?subtitles",
        r"^[\s\-_]*quality\s+subtitles",
        r"^[\s\-_]*advertise",
        # ASCII art headers/footers (often used for ads)
        r"^[\s\-=_\*]{10,}$",
        # Empty after cleaning (effectively dead here: blank lines are skipped
        # before these patterns run in should_remove_subtitle_block)
        r"^\s*$",
    ]

    # -----------------------------
    # OCR / GARBAGE LINE DETECTION
    # -----------------------------
    # A line that is nothing but music-note glyphs and filler punctuation.
    GARBAGE_MUSIC_LINE = r"^[\s\[\]\(\)\{\}_\-\.\~\*]*(?:[♪♫♬♩]+)[\s\[\]\(\)\{\}_\-\.\~\*]*$"
    # SRT-style timecode fragment (HH:MM:SS,mmm or HH:MM:SS.mmm).
    GARBAGE_TIMECODE = r"\d{1,2}:\d{2}:\d{2}[,\.]\d{3}"

    # -----------------------------
    # COMPILED REGEX CACHE
    # -----------------------------
    # Class-level cache built by _compile(); reset to None by
    # set_force_remove_keywords() to force recompilation.
    _compiled: Optional[dict] = None

    @classmethod
    def _compile(cls):
        """Compile every pattern once and cache the dict on the class."""
        if cls._compiled:
            return cls._compiled

        def c(p):
            # Keyword patterns are written in VERBOSE style above.
            return re.compile(p, re.IGNORECASE | re.VERBOSE)

        # Merge user-configured keywords (escaped literally) into the
        # hard-coded force-remove set.
        custom_force_remove = [
            re.escape(k) for k in cls._custom_force_remove_keywords if k
        ]
        combined_force_remove = cls.SUBTITLE_FORCE_REMOVE + custom_force_remove
        cls._compiled = {
            "junk": c("|".join([
                cls.QUALITY,
                cls.CODECS,
                cls.TORRENT_GROUPS,
                cls.SUBTITLE_ADS,
                cls.LANGUAGES,
                cls.EDITIONS,
            ])),
            "brackets": c(cls.BRACKETS),
            "separators": re.compile(cls.SEPARATORS),
            "multispace": re.compile(cls.MULTISPACE),
            "year": re.compile(cls.YEAR_PATTERN),
            "season_episode": [re.compile(p, re.IGNORECASE) for p in cls.SEASON_EPISODE],
            # Subtitle content cleaning patterns
            "subtitle_watermarks": [
                re.compile(p, re.IGNORECASE) for p in cls.SUBTITLE_WATERMARKS
            ],
            "subtitle_block_removers": [
                re.compile(p, re.IGNORECASE | re.MULTILINE)
                for p in cls.SUBTITLE_BLOCK_REMOVERS
            ],
            "subtitle_force_remove": [
                re.compile(p, re.IGNORECASE) for p in combined_force_remove
            ],
            "garbage_music_line": re.compile(cls.GARBAGE_MUSIC_LINE),
            "garbage_timecode": re.compile(cls.GARBAGE_TIMECODE),
        }
        return cls._compiled

    # -----------------------------
    # PUBLIC API
    # -----------------------------
    def strip_keywords(self, title: str, preserve_year: bool = True) -> str:
        """
        Remove junk keywords, bracketed noise and separator characters.

        Args:
            title: Raw title or filename stem.
            preserve_year: When True, a 19xx/20xx year found in the original
                title is re-appended as " (YYYY)" if cleaning removed it.

        Returns:
            Cleaned, whitespace-normalised title.
        """
        rx = self._compile()
        original = title
        # Extract year early -- bracket/junk removal below may delete it
        # (e.g. "(2019)").
        year: Optional[str] = None
        if preserve_year:
            m = rx["year"].search(title)
            if m:
                year = m.group(1)
        # Remove obvious junk
        cleaned = rx["junk"].sub("", title)
        # Remove bracketed junk AFTER stripping known keywords
        cleaned = rx["brackets"].sub("", cleaned)
        # Normalize separators, then collapse runs of whitespace
        cleaned = rx["separators"].sub(" ", cleaned)
        cleaned = rx["multispace"].sub(" ", cleaned).strip()
        # Re-append year only if cleaning dropped it
        if preserve_year and year and year not in cleaned:
            cleaned = f"{cleaned} ({year})"
        logger.debug("KeywordStripper: '%s' -> '%s'", original, cleaned)
        return cleaned

    def extract_year(self, title: str) -> Optional[str]:
        """Return the first 19xx/20xx year in *title*, or None."""
        rx = self._compile()
        m = rx["year"].search(title)
        return m.group(1) if m else None

    def extract_season_episode(self, title: str):
        """
        Return ``(season, episode)`` as ints parsed from *title*, or
        ``(None, None)`` if no known S/E notation matches.
        """
        rx = self._compile()
        for p in rx["season_episode"]:
            m = p.search(title)
            if m:
                return int(m.group(1)), int(m.group(2))
        return None, None

    def clean_filename(self, filename: str, preserve_year: bool = True) -> dict:
        """
        Clean a media filename and extract metadata.

        Returns a dict with keys ``cleaned_title``, ``year``, ``season``,
        ``episode`` and ``is_series``.
        """
        # Drop the final extension (".mkv", ".srt", ...) before parsing.
        name = re.sub(r"\.[^.]+$", "", filename)
        season, episode = self.extract_season_episode(name)
        year = self.extract_year(name)
        cleaned = self.strip_keywords(name, preserve_year=preserve_year)
        return {
            "cleaned_title": cleaned,
            "year": year,
            "season": season,
            "episode": episode,
            "is_series": season is not None or episode is not None,
        }

    # -----------------------------
    # SUBTITLE CONTENT CLEANING
    # -----------------------------
    def should_remove_subtitle_block(self, text: str) -> bool:
        """
        Check if an entire subtitle block should be removed.

        Returns True if the block is purely promotional/ad content
        with no legitimate dialogue.

        Args:
            text: The subtitle text content

        Returns:
            True if block should be removed entirely
        """
        rx = self._compile()
        # Check each line of the subtitle
        lines = text.strip().split('\n')
        non_ad_lines = 0
        for line in lines:
            line = line.strip()
            if not line:
                continue
            # Hard kill-switch: if a line mentions these sources, drop the
            # whole block.
            for pattern in rx["subtitle_force_remove"]:
                if pattern.search(line):
                    return True
            # Check if this line matches any block remover pattern
            is_ad_line = False
            for pattern in rx["subtitle_block_removers"]:
                if pattern.search(line):
                    is_ad_line = True
                    break
            # Also check watermarks - if the entire line is just a watermark
            if not is_ad_line:
                temp_line = line
                for pattern in rx["subtitle_watermarks"]:
                    temp_line = pattern.sub("", temp_line)
                # If after removing watermarks, line is empty or just punctuation
                temp_line = re.sub(r'[\s\-_\.\,\!\?\:\;]+', '', temp_line)
                if not temp_line:
                    is_ad_line = True
            if not is_ad_line:
                non_ad_lines += 1
        # If no legitimate content remains, remove the block
        return non_ad_lines == 0

    def clean_subtitle_text(self, text: str) -> str:
        """
        Clean watermarks and ads from subtitle text while preserving dialogue.

        This is more surgical than should_remove_subtitle_block() - it removes
        specific ad text but keeps the rest of the subtitle intact.

        Args:
            text: The subtitle text content

        Returns:
            Cleaned text with ads removed, or empty string if nothing remains
        """
        return self._clean_subtitle_text(text, remove_watermarks=True, remove_garbage=False)

    def clean_subtitle_text_with_options(
        self,
        text: str,
        remove_watermarks: bool = True,
        remove_garbage: bool = False,
    ) -> str:
        """Like clean_subtitle_text(), with both cleaning stages toggleable."""
        return self._clean_subtitle_text(
            text,
            remove_watermarks=remove_watermarks,
            remove_garbage=remove_garbage,
        )

    def _is_timecode_line(self, line: str) -> bool:
        """True if *line* is only timecodes plus arrow/number punctuation."""
        rx = self._compile()
        if not rx["garbage_timecode"].search(line):
            return False
        # Remove the timecodes, then any residual digits/arrows/punctuation;
        # a genuine dialogue line leaves visible characters behind.
        stripped = rx["garbage_timecode"].sub("", line)
        stripped = re.sub(r"[\s0-9:\->,\.\[\]]+", "", stripped)
        return stripped == ""

    def _is_music_line(self, line: str) -> bool:
        """True if *line* contains nothing but music notes and filler."""
        rx = self._compile()
        return bool(rx["garbage_music_line"].match(line.strip()))

    def _normalize_line(self, line: str) -> str:
        """Lower-case and collapse whitespace, for duplicate detection."""
        return re.sub(r"\s+", " ", line.strip()).lower()

    def _clean_subtitle_text(
        self,
        text: str,
        remove_watermarks: bool = True,
        remove_garbage: bool = False,
    ) -> str:
        """
        Shared implementation for subtitle text cleaning.

        Args:
            text: The subtitle text content.
            remove_watermarks: Strip watermark/ad substrings from each line.
            remove_garbage: Also drop music-only/timecode lines and
                consecutive duplicates (after normalisation).

        Returns:
            Cleaned text; lines left without any alphanumeric character are
            dropped entirely.
        """
        rx = self._compile()
        original = text
        # Process line by line to handle multi-line subtitles
        lines = text.split('\n')
        cleaned_lines = []
        seen_lines = set()
        for line in lines:
            if remove_garbage:
                if self._is_music_line(line) or self._is_timecode_line(line):
                    continue
            cleaned_line = line
            # Remove watermark patterns
            if remove_watermarks:
                for pattern in rx["subtitle_watermarks"]:
                    cleaned_line = pattern.sub("", cleaned_line)
            # Clean up resulting whitespace
            cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
            if not cleaned_line:
                continue
            if remove_garbage:
                # Deduplicate normalised lines across the whole text
                normalized = self._normalize_line(cleaned_line)
                if normalized and normalized in seen_lines:
                    continue
                seen_lines.add(normalized)
            cleaned_lines.append(cleaned_line)
        result = '\n'.join(cleaned_lines)
        # Final cleanup - remove lines that are just punctuation/dashes
        result_lines = result.split('\n')
        result_lines = [l for l in result_lines if re.search(r'[a-zA-Z0-9]', l)]
        result = '\n'.join(result_lines)
        if result != original:
            logger.debug("Cleaned subtitle text: '%s' -> '%s'", original[:50], result[:50])
        return result

    def clean_subtitle_blocks(
        self,
        blocks: List[dict],
        remove_watermarks: bool = True,
        remove_garbage: bool = False,
    ) -> List[dict]:
        """
        Clean a list of subtitle blocks, removing ads and watermarks.

        This processes each block:
        1. Checks if the entire block should be removed (pure ad content)
        2. If not, cleans watermarks from the text

        Args:
            blocks: List of subtitle block dicts with 'text' key
            remove_watermarks: Drop pure-ad blocks and strip watermark text.
            remove_garbage: Also remove OCR garbage lines / duplicates.

        Returns:
            Cleaned list with ad blocks removed and watermarks stripped
        """
        cleaned = []
        removed_count = 0
        modified_count = 0
        for block in blocks:
            text = block.get("text", "")
            # Check if entire block should be removed
            if remove_watermarks and self.should_remove_subtitle_block(text):
                removed_count += 1
                logger.debug("Removing ad block: '%s'", text[:50])
                continue
            # Clean the text
            cleaned_text = self._clean_subtitle_text(
                text,
                remove_watermarks=remove_watermarks,
                remove_garbage=remove_garbage,
            )
            # Skip if cleaning resulted in empty text
            if not cleaned_text.strip():
                removed_count += 1
                continue
            # Track if we modified the text
            if cleaned_text != text:
                modified_count += 1
            # Create new block with cleaned text (shallow copy keeps other keys)
            cleaned_block = block.copy()
            cleaned_block["text"] = cleaned_text
            cleaned.append(cleaned_block)
        if removed_count > 0 or modified_count > 0:
            logger.info(
                "Subtitle cleaning: removed %d ad blocks, modified %d blocks",
                removed_count, modified_count
            )
        return cleaned

    def detect_subtitle_watermarks(self, text: str) -> List[str]:
        """Detect known subtitle watermark keywords in raw subtitle text."""
        detected = []
        for pattern, label in self.SUBTITLE_WATERMARK_LABELS:
            if re.search(pattern, text, re.IGNORECASE):
                detected.append(label)
        # Custom keywords are reported under their own literal text.
        for keyword in self._custom_force_remove_keywords:
            if keyword and re.search(re.escape(keyword), text, re.IGNORECASE):
                detected.append(keyword)
        return detected

    def detect_garbage_labels(self, block_texts: List[str]) -> List[str]:
        """Detect OCR/garbage patterns in subtitle blocks.

        Returns a sorted list of human-readable labels; scanning stops early
        once all three label kinds have been seen.
        """
        labels = set()
        for text in block_texts:
            lines = text.split("\n")
            seen = set()  # duplicate detection is per block
            for line in lines:
                if self._is_music_line(line):
                    labels.add("Music-only lines")
                if self._is_timecode_line(line):
                    labels.add("OCR timecodes")
                normalized = self._normalize_line(line)
                if normalized:
                    if normalized in seen:
                        labels.add("Duplicate lines")
                    else:
                        seen.add(normalized)
            if len(labels) >= 3:
                break
        return sorted(labels)

    def set_force_remove_keywords(self, keywords: List[str]) -> None:
        """Set custom force-remove keywords and refresh regex cache."""
        # Stored on the class so all instances share the configuration.
        type(self)._custom_force_remove_keywords = [
            k.strip() for k in (keywords or []) if k and k.strip()
        ]
        # Invalidate the cache; next _compile() call rebuilds everything.
        type(self)._compiled = None
# -----------------------------
# SINGLETON HELPERS
# -----------------------------
# Lazily-created module-wide stripper shared by the convenience helpers below.
_default_stripper: Optional[KeywordStripper] = None
def get_stripper() -> KeywordStripper:
    """Return the process-wide KeywordStripper, creating it on first use."""
    global _default_stripper
    if _default_stripper is not None:
        return _default_stripper
    _default_stripper = KeywordStripper()
    return _default_stripper
def clean_title(title: str, preserve_year: bool = True) -> str:
    """Module-level convenience wrapper around KeywordStripper.strip_keywords."""
    stripper = get_stripper()
    return stripper.strip_keywords(title, preserve_year)
def clean_filename(filename: str, preserve_year: bool = True) -> dict:
    """Module-level convenience wrapper around KeywordStripper.clean_filename."""
    stripper = get_stripper()
    return stripper.clean_filename(filename, preserve_year)
def clean_subtitle_content(
    text: str,
    remove_watermarks: bool = True,
    remove_garbage: bool = False,
) -> str:
    """Clean watermarks/ads and optional OCR garbage from subtitle text."""
    stripper = get_stripper()
    return stripper.clean_subtitle_text_with_options(
        text,
        remove_watermarks=remove_watermarks,
        remove_garbage=remove_garbage,
    )
def should_remove_subtitle(text: str) -> bool:
    """Check if a subtitle block should be removed entirely (pure ad)."""
    stripper = get_stripper()
    return stripper.should_remove_subtitle_block(text)