1.1.0 - automations, clean only mode, bug fixes

This commit is contained in:
ponzischeme89
2026-01-19 02:10:08 +13:00
parent 93e8b38e24
commit 9345ac4331
25 changed files with 2690 additions and 499 deletions
+21 -6
View File
@@ -18,6 +18,7 @@ logger = get_logger("FileScanner")
import sys
sys.path.insert(0, str(Path(__file__).parent))
from subtitle_processor import parse_srt, SUBLOGUE_SENTINEL, SUBLOGUE_TOKEN_PATTERN
from keyword_stripper import get_stripper
class FileScanner:
@@ -45,6 +46,7 @@ class FileScanner:
directory_path: str | Path,
batch_size: int = DEFAULT_BATCH_SIZE,
follow_symlinks: bool = False,
detect_cleanup_keywords: bool = False,
) -> Generator[List[Dict], None, None]:
"""
Recursively scan a directory tree for .srt files.
@@ -106,8 +108,11 @@ class FileScanner:
# Plot detection
# --------------------------------------------
content = None
try:
plot_marker_count = cls._count_plot_markers(file_path)
if detect_cleanup_keywords:
content = file_path.read_text(encoding="utf-8", errors="ignore")
plot_marker_count = cls._count_plot_markers(file_path, content=content)
has_plot = plot_marker_count > 0
logger.debug(
"Plot check for %s: %s",
@@ -126,7 +131,7 @@ class FileScanner:
if has_plot:
try:
metadata = cls._extract_metadata(file_path)
metadata = cls._extract_metadata(file_path, content=content)
logger.debug(
"Extracted metadata from %s: %s",
file_path.name,
@@ -138,6 +143,13 @@ class FileScanner:
file_path.name, e
)
clean_keywords = []
if detect_cleanup_keywords and content:
try:
clean_keywords = get_stripper().detect_subtitle_watermarks(content)
except Exception as e:
logger.debug("Cleanup keyword detection failed: %s", e)
status = "Has Plot" if has_plot else "Not Loaded"
if plot_marker_count > 1:
status = "Duplicate Plot"
@@ -156,6 +168,7 @@ class FileScanner:
"imdb_rating": metadata.get("imdb_rating"),
"rating": metadata.get("imdb_rating"),
"runtime": metadata.get("runtime"),
"clean_keywords": clean_keywords,
"selected": False,
})
@@ -216,14 +229,15 @@ class FileScanner:
)
@classmethod
def _count_plot_markers(cls, file_path: Path) -> int:
def _count_plot_markers(cls, file_path: Path, content: str | None = None) -> int:
"""
Count Sublogue plot markers to detect duplicates.
"""
logger.debug("Scanning for plot markers in %s", file_path.name)
try:
content = file_path.read_text(encoding="utf-8", errors="ignore")
if content is None:
content = file_path.read_text(encoding="utf-8", errors="ignore")
lower_content = content.lower()
generated_count = lower_content.count("generated by sublogue")
if generated_count > 0:
@@ -237,14 +251,15 @@ class FileScanner:
return 0
@classmethod
def _extract_metadata(cls, file_path: Path) -> Dict:
def _extract_metadata(cls, file_path: Path, content: str | None = None) -> Dict:
"""
Extract title, year, rating, runtime, and plot
from Sublogue-generated subtitles.
"""
logger.debug("Extracting metadata from %s", file_path.name)
content = file_path.read_text(encoding="utf-8", errors="ignore")
if content is None:
content = file_path.read_text(encoding="utf-8", errors="ignore")
blocks = parse_srt(content)
metadata = {