import logging from logging_utils import get_logger import os import re from pathlib import Path from typing import Generator, List, Dict # ------------------------------------------------------------ # Logging configuration # ------------------------------------------------------------ logger = get_logger("FileScanner") # ------------------------------------------------------------ # Import subtitle parser # ------------------------------------------------------------ import sys sys.path.insert(0, str(Path(__file__).parent)) from subtitle_processor import parse_srt, SUBLOGUE_SENTINEL, SUBLOGUE_TOKEN_PATTERN class FileScanner: """ Efficient, disk-friendly subtitle scanner. - Recursive (os.scandir-based) - Streams file reads - Batches results - Extensive logging for observability """ SUPPORTED_EXTENSIONS = {".srt"} MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024 # 5 MB PLOT_SCAN_LINES = 50 DEFAULT_BATCH_SIZE = 100 # -------------------------------------------------------- # Public API # -------------------------------------------------------- @classmethod def scan_directory( cls, directory_path: str | Path, batch_size: int = DEFAULT_BATCH_SIZE, follow_symlinks: bool = False, ) -> Generator[List[Dict], None, None]: """ Recursively scan a directory tree for .srt files. Yields batches of metadata dictionaries. """ root = Path(directory_path) logger.info("Starting subtitle scan") logger.info("Root directory : %s", root) logger.info("Batch size : %s", batch_size) logger.info("Follow symlinks : %s", follow_symlinks) if not root.exists(): logger.error("Scan failed: path does not exist (%s)", root) raise ValueError(f"Directory does not exist: {directory_path}") if not root.is_dir(): logger.error("Scan failed: not a directory (%s)", root) raise ValueError(f"Invalid directory: {directory_path}") batch: List[Dict] = [] total_seen = 0 total_srt = 0 total_skipped = 0 for file_path in cls._walk_files(root, follow_symlinks): total_seen += 1 if file_path.suffix.lower() not in cls.SUPPORTED_EXTENSIONS: logger.debug("Ignoring non-subtitle file: %s", file_path) continue total_srt += 1 logger.debug("Found subtitle file: %s", file_path) # -------------------------------------------- # Stat / size guard # -------------------------------------------- try: stat = file_path.stat() except OSError as e: total_skipped += 1 logger.warning( "Skipping unreadable file: %s (%s)", file_path, e ) continue if stat.st_size > cls.MAX_FILE_SIZE_BYTES: total_skipped += 1 logger.warning( "Skipping large subtitle file (%d bytes): %s", stat.st_size, file_path ) continue # -------------------------------------------- # Plot detection # -------------------------------------------- try: plot_marker_count = cls._count_plot_markers(file_path) has_plot = plot_marker_count > 0 logger.debug( "Plot check for %s: %s", file_path.name, "FOUND" if has_plot else "NOT FOUND" ) except Exception as e: total_skipped += 1 logger.error( "Plot scan failed for %s: %s", file_path, e ) continue metadata = {} if has_plot: try: metadata = cls._extract_metadata(file_path) logger.debug( "Extracted metadata from %s: %s", file_path.name, {k: v for k, v in metadata.items() if v} ) except Exception as e: logger.warning( "Metadata extraction failed for %s: %s", file_path.name, e ) status = "Has Plot" if has_plot else "Not Loaded" if plot_marker_count > 1: status = "Duplicate Plot" batch.append({ "path": str(file_path), "name": file_path.name, "has_plot": has_plot, "plot_marker_count": plot_marker_count, "duplicate_plot": plot_marker_count > 1, "status": status, "summary": metadata.get("summary", ""), "plot": metadata.get("summary", ""), "title": metadata.get("title"), "year": metadata.get("year"), "imdb_rating": metadata.get("imdb_rating"), "rating": metadata.get("imdb_rating"), "runtime": metadata.get("runtime"), "selected": False, }) if len(batch) >= batch_size: logger.info( "Yielding batch (%d items, %d total files scanned)", len(batch), total_seen ) yield batch batch = [] if batch: logger.info( "Yielding final batch (%d items)", len(batch) ) yield batch logger.info("Subtitle scan completed") logger.info("Files visited : %d", total_seen) logger.info("Subtitle files found : %d", total_srt) logger.info("Files skipped : %d", total_skipped) # -------------------------------------------------------- # Internal helpers # -------------------------------------------------------- @staticmethod def _walk_files(root: Path, follow_symlinks: bool): """ Fast iterative recursive directory walk using os.scandir. """ logger.debug("Beginning recursive walk at %s", root) stack = [root] while stack: current = stack.pop() logger.debug("Scanning directory: %s", current) try: with os.scandir(current) as entries: for entry in entries: try: if entry.is_dir(follow_symlinks=follow_symlinks): stack.append(Path(entry.path)) elif entry.is_file(): yield Path(entry.path) except OSError as e: logger.debug( "Skipping entry due to OS error: %s (%s)", entry.path, e ) except OSError as e: logger.warning( "Cannot access directory: %s (%s)", current, e ) @classmethod def _count_plot_markers(cls, file_path: Path) -> int: """ Count Sublogue plot markers to detect duplicates. """ logger.debug("Scanning for plot markers in %s", file_path.name) try: content = file_path.read_text(encoding="utf-8", errors="ignore") lower_content = content.lower() generated_count = lower_content.count("generated by sublogue") if generated_count > 0: return generated_count return content.count(SUBLOGUE_SENTINEL) except Exception as e: logger.error( "Error reading file during plot scan: %s (%s)", file_path, e ) return 0 @classmethod def _extract_metadata(cls, file_path: Path) -> Dict: """ Extract title, year, rating, runtime, and plot from Sublogue-generated subtitles. """ logger.debug("Extracting metadata from %s", file_path.name) content = file_path.read_text(encoding="utf-8", errors="ignore") blocks = parse_srt(content) metadata = { "title": None, "year": None, "imdb_rating": None, "runtime": None, "summary": "" } if len(blocks) < 2: logger.debug("Not enough subtitle blocks for metadata extraction") return metadata # -------------------------------------------- # Plot block (index 1) # -------------------------------------------- plot_text = blocks[1].text plot_text = plot_text.split("Generated by Sublogue")[0].strip() plot_text = SUBLOGUE_TOKEN_PATTERN.sub("", plot_text).strip() metadata["summary"] = plot_text # -------------------------------------------- # Header block (index 0) # -------------------------------------------- header_lines = blocks[0].text.split("\n") if header_lines: first_line = header_lines[0].strip() if first_line == SUBLOGUE_SENTINEL and len(header_lines) > 1: first_line = header_lines[1].strip() year_match = re.search(r"\((\d{4})\)", first_line) if year_match: metadata["year"] = year_match.group(1) metadata["title"] = first_line[:year_match.start()].strip() else: metadata["title"] = first_line.strip() if len(header_lines) > 1: second_line = header_lines[1] rating_match = re.search(r"IMDb:\s*([^\s]+)", second_line) if rating_match: metadata["imdb_rating"] = rating_match.group(1) runtime_match = re.search(r"⏱\s*(.+)", second_line) if runtime_match: metadata["runtime"] = runtime_match.group(1).strip() logger.debug("Metadata extracted: %s", metadata) return metadata