1.1.0 - automations, clean-only mode, bug fixes
This commit is contained in:
@@ -18,6 +18,7 @@ logger = get_logger("FileScanner")
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from subtitle_processor import parse_srt, SUBLOGUE_SENTINEL, SUBLOGUE_TOKEN_PATTERN
|
||||
from keyword_stripper import get_stripper
|
||||
|
||||
|
||||
class FileScanner:
|
||||
@@ -45,6 +46,7 @@ class FileScanner:
|
||||
directory_path: str | Path,
|
||||
batch_size: int = DEFAULT_BATCH_SIZE,
|
||||
follow_symlinks: bool = False,
|
||||
detect_cleanup_keywords: bool = False,
|
||||
) -> Generator[List[Dict], None, None]:
|
||||
"""
|
||||
Recursively scan a directory tree for .srt files.
|
||||
@@ -106,8 +108,11 @@ class FileScanner:
|
||||
# Plot detection
|
||||
# --------------------------------------------
|
||||
|
||||
content = None
|
||||
try:
|
||||
plot_marker_count = cls._count_plot_markers(file_path)
|
||||
if detect_cleanup_keywords:
|
||||
content = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||
plot_marker_count = cls._count_plot_markers(file_path, content=content)
|
||||
has_plot = plot_marker_count > 0
|
||||
logger.debug(
|
||||
"Plot check for %s: %s",
|
||||
@@ -126,7 +131,7 @@ class FileScanner:
|
||||
|
||||
if has_plot:
|
||||
try:
|
||||
metadata = cls._extract_metadata(file_path)
|
||||
metadata = cls._extract_metadata(file_path, content=content)
|
||||
logger.debug(
|
||||
"Extracted metadata from %s: %s",
|
||||
file_path.name,
|
||||
@@ -138,6 +143,13 @@ class FileScanner:
|
||||
file_path.name, e
|
||||
)
|
||||
|
||||
clean_keywords = []
|
||||
if detect_cleanup_keywords and content:
|
||||
try:
|
||||
clean_keywords = get_stripper().detect_subtitle_watermarks(content)
|
||||
except Exception as e:
|
||||
logger.debug("Cleanup keyword detection failed: %s", e)
|
||||
|
||||
status = "Has Plot" if has_plot else "Not Loaded"
|
||||
if plot_marker_count > 1:
|
||||
status = "Duplicate Plot"
|
||||
@@ -156,6 +168,7 @@ class FileScanner:
|
||||
"imdb_rating": metadata.get("imdb_rating"),
|
||||
"rating": metadata.get("imdb_rating"),
|
||||
"runtime": metadata.get("runtime"),
|
||||
"clean_keywords": clean_keywords,
|
||||
"selected": False,
|
||||
})
|
||||
|
||||
@@ -216,14 +229,15 @@ class FileScanner:
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _count_plot_markers(cls, file_path: Path) -> int:
|
||||
def _count_plot_markers(cls, file_path: Path, content: str | None = None) -> int:
|
||||
"""
|
||||
Count Sublogue plot markers to detect duplicates.
|
||||
"""
|
||||
logger.debug("Scanning for plot markers in %s", file_path.name)
|
||||
|
||||
try:
|
||||
content = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||
if content is None:
|
||||
content = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||
lower_content = content.lower()
|
||||
generated_count = lower_content.count("generated by sublogue")
|
||||
if generated_count > 0:
|
||||
@@ -237,14 +251,15 @@ class FileScanner:
|
||||
return 0
|
||||
|
||||
@classmethod
|
||||
def _extract_metadata(cls, file_path: Path) -> Dict:
|
||||
def _extract_metadata(cls, file_path: Path, content: str | None = None) -> Dict:
|
||||
"""
|
||||
Extract title, year, rating, runtime, and plot
|
||||
from Sublogue-generated subtitles.
|
||||
"""
|
||||
logger.debug("Extracting metadata from %s", file_path.name)
|
||||
|
||||
content = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||
if content is None:
|
||||
content = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||
blocks = parse_srt(content)
|
||||
|
||||
metadata = {
|
||||
|
||||
Reference in New Issue
Block a user