# server/core/file_scanner.py

import logging
from logging_utils import get_logger
import os
import re
from pathlib import Path
from typing import Generator, List, Dict

# ------------------------------------------------------------
# Logging configuration
# ------------------------------------------------------------

logger = get_logger("FileScanner")

# ------------------------------------------------------------
# Import subtitle parser
# ------------------------------------------------------------

import sys
sys.path.insert(0, str(Path(__file__).parent))
from subtitle_processor import parse_srt, SUBLOGUE_SENTINEL, SUBLOGUE_TOKEN_PATTERN


class FileScanner:
    """
    Recursive subtitle (.srt) scanner that reports its findings in batches.

    - Walks the directory tree iteratively with os.scandir
    - Skips files larger than MAX_FILE_SIZE_BYTES before reading them
    - Reads each remaining subtitle file in full to look for Sublogue markers
    - Yields results in batches
    - Extensive logging for observability
    """

    # File suffixes (compared lower-cased) that are treated as subtitles.
    SUPPORTED_EXTENSIONS = {".srt"}
    # Subtitle files above this size are skipped without being read.
    MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB
    # NOTE(review): not referenced by any method visible in this class -
    # confirm whether it is still needed.
    PLOT_SCAN_LINES = 50
    # Default number of records collected before scan_directory yields a batch.
    DEFAULT_BATCH_SIZE = 100

    # --------------------------------------------------------
    # Public API
    # --------------------------------------------------------

    @classmethod
    def scan_directory(
        cls,
        directory_path: str | Path,
        batch_size: int = DEFAULT_BATCH_SIZE,
        follow_symlinks: bool = False,
    ) -> Generator[List[Dict], None, None]:
        """
        Recursively scan a directory tree for .srt files.
        Yields batches of metadata dictionaries.
        """
        root = Path(directory_path)

        logger.info("Starting subtitle scan")
        logger.info("Root directory      : %s", root)
        logger.info("Batch size          : %s", batch_size)
        logger.info("Follow symlinks     : %s", follow_symlinks)

        if not root.exists():
            logger.error("Scan failed: path does not exist (%s)", root)
            raise ValueError(f"Directory does not exist: {directory_path}")

        if not root.is_dir():
            logger.error("Scan failed: not a directory (%s)", root)
            raise ValueError(f"Invalid directory: {directory_path}")

        batch: List[Dict] = []
        total_seen = 0
        total_srt = 0
        total_skipped = 0

        for file_path in cls._walk_files(root, follow_symlinks):
            total_seen += 1

            if file_path.suffix.lower() not in cls.SUPPORTED_EXTENSIONS:
                logger.debug("Ignoring non-subtitle file: %s", file_path)
                continue

            total_srt += 1
            logger.debug("Found subtitle file: %s", file_path)

            # --------------------------------------------
            # Stat / size guard
            # --------------------------------------------

            try:
                stat = file_path.stat()
            except OSError as e:
                total_skipped += 1
                logger.warning(
                    "Skipping unreadable file: %s (%s)",
                    file_path, e
                )
                continue

            if stat.st_size > cls.MAX_FILE_SIZE_BYTES:
                total_skipped += 1
                logger.warning(
                    "Skipping large subtitle file (%d bytes): %s",
                    stat.st_size, file_path
                )
                continue

            # --------------------------------------------
            # Plot detection
            # --------------------------------------------

            try:
                plot_marker_count = cls._count_plot_markers(file_path)
                has_plot = plot_marker_count > 0
                logger.debug(
                    "Plot check for %s: %s",
                    file_path.name,
                    "FOUND" if has_plot else "NOT FOUND"
                )
            except Exception as e:
                total_skipped += 1
                logger.error(
                    "Plot scan failed for %s: %s",
                    file_path, e
                )
                continue

            metadata = {}

            if has_plot:
                try:
                    metadata = cls._extract_metadata(file_path)
                    logger.debug(
                        "Extracted metadata from %s: %s",
                        file_path.name,
                        {k: v for k, v in metadata.items() if v}
                    )
                except Exception as e:
                    logger.warning(
                        "Metadata extraction failed for %s: %s",
                        file_path.name, e
                    )

            batch.append({
                "path": str(file_path),
                "name": file_path.name,
                "has_plot": has_plot,
                "plot_marker_count": plot_marker_count,
                "duplicate_plot": plot_marker_count > 1,
                "status": "Has Plot" if has_plot else "Not Loaded",
                "summary": metadata.get("summary", ""),
                "plot": metadata.get("summary", ""),
                "title": metadata.get("title"),
                "year": metadata.get("year"),
                "imdb_rating": metadata.get("imdb_rating"),
                "rating": metadata.get("imdb_rating"),
                "runtime": metadata.get("runtime"),
                "selected": False,
            })

            if len(batch) >= batch_size:
                logger.info(
                    "Yielding batch (%d items, %d total files scanned)",
                    len(batch),
                    total_seen
                )
                yield batch
                batch = []

        if batch:
            logger.info(
                "Yielding final batch (%d items)",
                len(batch)
            )
            yield batch

        logger.info("Subtitle scan completed")
        logger.info("Files visited        : %d", total_seen)
        logger.info("Subtitle files found : %d", total_srt)
        logger.info("Files skipped        : %d", total_skipped)

    # --------------------------------------------------------
    # Internal helpers
    # --------------------------------------------------------

    @staticmethod
    def _walk_files(root: Path, follow_symlinks: bool):
        """
        Fast iterative recursive directory walk using os.scandir.
        """
        logger.debug("Beginning recursive walk at %s", root)
        stack = [root]

        while stack:
            current = stack.pop()
            logger.debug("Scanning directory: %s", current)

            try:
                with os.scandir(current) as entries:
                    for entry in entries:
                        try:
                            if entry.is_dir(follow_symlinks=follow_symlinks):
                                stack.append(Path(entry.path))
                            elif entry.is_file():
                                yield Path(entry.path)
                        except OSError as e:
                            logger.debug(
                                "Skipping entry due to OS error: %s (%s)",
                                entry.path, e
                            )
            except OSError as e:
                logger.warning(
                    "Cannot access directory: %s (%s)",
                    current, e
                )

    @classmethod
    def _count_plot_markers(cls, file_path: Path) -> int:
        """
        Count Sublogue plot markers to detect duplicates.
        """
        logger.debug("Scanning for plot markers in %s", file_path.name)

        try:
            content = file_path.read_text(encoding="utf-8", errors="ignore")
            lower_content = content.lower()
            generated_count = lower_content.count("generated by sublogue")
            if generated_count > 0:
                return generated_count
            return content.count(SUBLOGUE_SENTINEL)
        except Exception as e:
            logger.error(
                "Error reading file during plot scan: %s (%s)",
                file_path, e
            )
            return 0

    @classmethod
    def _extract_metadata(cls, file_path: Path) -> Dict:
        """
        Extract title, year, rating, runtime, and plot
        from Sublogue-generated subtitles.

        The first parsed block is treated as the header and the second as
        the plot. The header may start with a line equal to
        SUBLOGUE_SENTINEL; that line is dropped before parsing, so the title
        line and the rating/runtime line are located the same way whether
        or not the sentinel is present.

        Args:
            file_path: Subtitle file to read (UTF-8, undecodable bytes
                dropped).

        Returns:
            A dict with the keys ``title``, ``year``, ``imdb_rating``,
            ``runtime`` (each ``None`` when not found) and ``summary``
            (empty string when not found). When the file has fewer than two
            blocks, every field keeps its default.

        Raises:
            OSError: If the file cannot be read. Anything raised by
                ``parse_srt`` also propagates.
        """
        logger.debug("Extracting metadata from %s", file_path.name)

        content = file_path.read_text(encoding="utf-8", errors="ignore")
        blocks = parse_srt(content)

        metadata = {
            "title": None,
            "year": None,
            "imdb_rating": None,
            "runtime": None,
            "summary": ""
        }

        if len(blocks) < 2:
            logger.debug("Not enough subtitle blocks for metadata extraction")
            return metadata

        # --------------------------------------------
        # Plot block (index 1)
        # --------------------------------------------

        # Keep only the text before the "Generated by Sublogue" footer, then
        # remove anything matching SUBLOGUE_TOKEN_PATTERN.
        plot_text = blocks[1].text
        plot_text = plot_text.split("Generated by Sublogue")[0].strip()
        plot_text = SUBLOGUE_TOKEN_PATTERN.sub("", plot_text).strip()
        metadata["summary"] = plot_text

        # --------------------------------------------
        # Header block (index 0)
        # --------------------------------------------

        header_lines = blocks[0].text.split("\n")

        # A leading sentinel line pushes the real header down by one line.
        # Drop it so the indices below mean the same thing in both layouts.
        # A header consisting of the sentinel alone is left untouched.
        sentinel_led = (
            len(header_lines) > 1
            and header_lines[0].strip() == SUBLOGUE_SENTINEL
        )
        if sentinel_led:
            header_lines = header_lines[1:]

        title_line = header_lines[0].strip()
        year_match = re.search(r"\((\d{4})\)", title_line)
        if year_match:
            metadata["year"] = year_match.group(1)
            metadata["title"] = title_line[:year_match.start()].strip()
        else:
            metadata["title"] = title_line

        # Rating and runtime are looked up on the line after the title.
        # Sentinel-led headers used to be searched on the title line itself;
        # keep that as a fallback so nothing found before is lost.
        detail_lines = header_lines[1:2]
        if sentinel_led:
            detail_lines.append(header_lines[0])

        for line in detail_lines:
            if metadata["imdb_rating"] is None:
                rating_match = re.search(r"IMDb:\s*([^\s]+)", line)
                if rating_match:
                    metadata["imdb_rating"] = rating_match.group(1)

            if metadata["runtime"] is None:
                runtime_match = re.search(r"⏱\s*(.+)", line)
                if runtime_match:
                    metadata["runtime"] = runtime_match.group(1).strip()

        logger.debug("Metadata extracted: %s", metadata)
        return metadata
Initial commit 2026-01-17 21:49:22 +13:00			`import logging`
1.0.0.8 logging updates 2026-01-18 23:01:03 +13:00			`from logging_utils import get_logger`
Initial commit 2026-01-17 21:49:22 +13:00			`import os`
			`import re`
			`from pathlib import Path`
			`from typing import Generator, List, Dict`

			`# ------------------------------------------------------------`
			`# Logging configuration`
			`# ------------------------------------------------------------`

1.0.0.8 logging updates 2026-01-18 23:01:03 +13:00			`logger = get_logger("FileScanner")`
Initial commit 2026-01-17 21:49:22 +13:00
			`# ------------------------------------------------------------`
			`# Import subtitle parser`
			`# ------------------------------------------------------------`

			`import sys`
			`sys.path.insert(0, str(Path(__file__).parent))`
1.0.0.7 - Matching improves, added library page. Removed schedule scans support 2026-01-18 22:29:51 +13:00			`from subtitle_processor import parse_srt, SUBLOGUE_SENTINEL, SUBLOGUE_TOKEN_PATTERN`
Initial commit 2026-01-17 21:49:22 +13:00

			`class FileScanner:`
			`"""`
			`Efficient, disk-friendly subtitle scanner.`

			`- Recursive (os.scandir-based)`
			`- Streams file reads`
			`- Batches results`
			`- Extensive logging for observability`
			`"""`

			`SUPPORTED_EXTENSIONS = {".srt"}`
			`MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024 # 5 MB`
			`PLOT_SCAN_LINES = 50`
			`DEFAULT_BATCH_SIZE = 100`

			`# --------------------------------------------------------`
			`# Public API`
			`# --------------------------------------------------------`

			`@classmethod`
			`def scan_directory(`
			`cls,`
			`directory_path: str \| Path,`
			`batch_size: int = DEFAULT_BATCH_SIZE,`
			`follow_symlinks: bool = False,`
			`) -> Generator[List[Dict], None, None]:`
			`"""`
			`Recursively scan a directory tree for .srt files.`
			`Yields batches of metadata dictionaries.`
			`"""`
			`root = Path(directory_path)`

			`logger.info("Starting subtitle scan")`
			`logger.info("Root directory : %s", root)`
			`logger.info("Batch size : %s", batch_size)`
			`logger.info("Follow symlinks : %s", follow_symlinks)`

			`if not root.exists():`
			`logger.error("Scan failed: path does not exist (%s)", root)`
			`raise ValueError(f"Directory does not exist: {directory_path}")`

			`if not root.is_dir():`
			`logger.error("Scan failed: not a directory (%s)", root)`
			`raise ValueError(f"Invalid directory: {directory_path}")`

			`batch: List[Dict] = []`
			`total_seen = 0`
			`total_srt = 0`
			`total_skipped = 0`

			`for file_path in cls._walk_files(root, follow_symlinks):`
			`total_seen += 1`

			`if file_path.suffix.lower() not in cls.SUPPORTED_EXTENSIONS:`
			`logger.debug("Ignoring non-subtitle file: %s", file_path)`
			`continue`

			`total_srt += 1`
			`logger.debug("Found subtitle file: %s", file_path)`

			`# --------------------------------------------`
			`# Stat / size guard`
			`# --------------------------------------------`

			`try:`
			`stat = file_path.stat()`
			`except OSError as e:`
			`total_skipped += 1`
			`logger.warning(`
			`"Skipping unreadable file: %s (%s)",`
			`file_path, e`
			`)`
			`continue`

			`if stat.st_size > cls.MAX_FILE_SIZE_BYTES:`
			`total_skipped += 1`
			`logger.warning(`
			`"Skipping large subtitle file (%d bytes): %s",`
			`stat.st_size, file_path`
			`)`
			`continue`

			`# --------------------------------------------`
			`# Plot detection`
			`# --------------------------------------------`

			`try:`
1.0.0.7 - Matching improves, added library page. Removed schedule scans support 2026-01-18 22:29:51 +13:00			`plot_marker_count = cls._count_plot_markers(file_path)`
			`has_plot = plot_marker_count > 0`
Initial commit 2026-01-17 21:49:22 +13:00			`logger.debug(`
			`"Plot check for %s: %s",`
			`file_path.name,`
			`"FOUND" if has_plot else "NOT FOUND"`
			`)`
			`except Exception as e:`
			`total_skipped += 1`
			`logger.error(`
			`"Plot scan failed for %s: %s",`
			`file_path, e`
			`)`
			`continue`

			`metadata = {}`

			`if has_plot:`
			`try:`
			`metadata = cls._extract_metadata(file_path)`
			`logger.debug(`
			`"Extracted metadata from %s: %s",`
			`file_path.name,`
			`{k: v for k, v in metadata.items() if v}`
			`)`
			`except Exception as e:`
			`logger.warning(`
			`"Metadata extraction failed for %s: %s",`
			`file_path.name, e`
			`)`

			`batch.append({`
			`"path": str(file_path),`
			`"name": file_path.name,`
			`"has_plot": has_plot,`
1.0.0.7 - Matching improves, added library page. Removed schedule scans support 2026-01-18 22:29:51 +13:00			`"plot_marker_count": plot_marker_count,`
			`"duplicate_plot": plot_marker_count > 1,`
Initial commit 2026-01-17 21:49:22 +13:00			`"status": "Has Plot" if has_plot else "Not Loaded",`
			`"summary": metadata.get("summary", ""),`
			`"plot": metadata.get("summary", ""),`
			`"title": metadata.get("title"),`
			`"year": metadata.get("year"),`
			`"imdb_rating": metadata.get("imdb_rating"),`
			`"rating": metadata.get("imdb_rating"),`
			`"runtime": metadata.get("runtime"),`
			`"selected": False,`
			`})`

			`if len(batch) >= batch_size:`
			`logger.info(`
			`"Yielding batch (%d items, %d total files scanned)",`
			`len(batch),`
			`total_seen`
			`)`
			`yield batch`
			`batch = []`

			`if batch:`
			`logger.info(`
			`"Yielding final batch (%d items)",`
			`len(batch)`
			`)`
			`yield batch`

			`logger.info("Subtitle scan completed")`
			`logger.info("Files visited : %d", total_seen)`
			`logger.info("Subtitle files found : %d", total_srt)`
			`logger.info("Files skipped : %d", total_skipped)`

			`# --------------------------------------------------------`
			`# Internal helpers`
			`# --------------------------------------------------------`

			`@staticmethod`
			`def _walk_files(root: Path, follow_symlinks: bool):`
			`"""`
			`Fast iterative recursive directory walk using os.scandir.`
			`"""`
			`logger.debug("Beginning recursive walk at %s", root)`
			`stack = [root]`

			`while stack:`
			`current = stack.pop()`
			`logger.debug("Scanning directory: %s", current)`

			`try:`
			`with os.scandir(current) as entries:`
			`for entry in entries:`
			`try:`
			`if entry.is_dir(follow_symlinks=follow_symlinks):`
			`stack.append(Path(entry.path))`
			`elif entry.is_file():`
			`yield Path(entry.path)`
			`except OSError as e:`
			`logger.debug(`
			`"Skipping entry due to OS error: %s (%s)",`
			`entry.path, e`
			`)`
			`except OSError as e:`
			`logger.warning(`
			`"Cannot access directory: %s (%s)",`
			`current, e`
			`)`

			`@classmethod`
1.0.0.7 - Matching improves, added library page. Removed schedule scans support 2026-01-18 22:29:51 +13:00			`def _count_plot_markers(cls, file_path: Path) -> int:`
Initial commit 2026-01-17 21:49:22 +13:00			`"""`
1.0.0.7 - Matching improves, added library page. Removed schedule scans support 2026-01-18 22:29:51 +13:00			`Count Sublogue plot markers to detect duplicates.`
Initial commit 2026-01-17 21:49:22 +13:00			`"""`
1.0.0.7 - Matching improves, added library page. Removed schedule scans support 2026-01-18 22:29:51 +13:00			`logger.debug("Scanning for plot markers in %s", file_path.name)`
Initial commit 2026-01-17 21:49:22 +13:00
			`try:`
1.0.0.7 - Matching improves, added library page. Removed schedule scans support 2026-01-18 22:29:51 +13:00			`content = file_path.read_text(encoding="utf-8", errors="ignore")`
			`lower_content = content.lower()`
			`generated_count = lower_content.count("generated by sublogue")`
			`if generated_count > 0:`
			`return generated_count`
			`return content.count(SUBLOGUE_SENTINEL)`
Initial commit 2026-01-17 21:49:22 +13:00			`except Exception as e:`
			`logger.error(`
			`"Error reading file during plot scan: %s (%s)",`
			`file_path, e`
			`)`
1.0.0.7 - Matching improves, added library page. Removed schedule scans support 2026-01-18 22:29:51 +13:00			`return 0`
Initial commit 2026-01-17 21:49:22 +13:00
			`@classmethod`
			`def _extract_metadata(cls, file_path: Path) -> Dict:`
			`"""`
			`Extract title, year, rating, runtime, and plot`
			`from Sublogue-generated subtitles.`
			`"""`
			`logger.debug("Extracting metadata from %s", file_path.name)`

			`content = file_path.read_text(encoding="utf-8", errors="ignore")`
			`blocks = parse_srt(content)`

			`metadata = {`
			`"title": None,`
			`"year": None,`
			`"imdb_rating": None,`
			`"runtime": None,`
			`"summary": ""`
			`}`

			`if len(blocks) < 2:`
			`logger.debug("Not enough subtitle blocks for metadata extraction")`
			`return metadata`

			`# --------------------------------------------`
			`# Plot block (index 1)`
			`# --------------------------------------------`

			`plot_text = blocks[1].text`
			`plot_text = plot_text.split("Generated by Sublogue")[0].strip()`
1.0.0.7 - Matching improves, added library page. Removed schedule scans support 2026-01-18 22:29:51 +13:00			`plot_text = SUBLOGUE_TOKEN_PATTERN.sub("", plot_text).strip()`
Initial commit 2026-01-17 21:49:22 +13:00			`metadata["summary"] = plot_text`

			`# --------------------------------------------`
			`# Header block (index 0)`
			`# --------------------------------------------`

			`header_lines = blocks[0].text.split("\n")`

			`if header_lines:`
1.0.0.7 - Matching improves, added library page. Removed schedule scans support 2026-01-18 22:29:51 +13:00			`first_line = header_lines[0].strip()`
			`if first_line == SUBLOGUE_SENTINEL and len(header_lines) > 1:`
			`first_line = header_lines[1].strip()`
Initial commit 2026-01-17 21:49:22 +13:00			`year_match = re.search(r"\((\d{4})\)", first_line)`
			`if year_match:`
			`metadata["year"] = year_match.group(1)`
			`metadata["title"] = first_line[:year_match.start()].strip()`
			`else:`
			`metadata["title"] = first_line.strip()`

			`if len(header_lines) > 1:`
			`second_line = header_lines[1]`

			`rating_match = re.search(r"IMDb:\s*([^\s]+)", second_line)`
			`if rating_match:`
			`metadata["imdb_rating"] = rating_match.group(1)`

			`runtime_match = re.search(r"⏱\s*(.+)", second_line)`
			`if runtime_match:`
			`metadata["runtime"] = runtime_match.group(1).strip()`

			`logger.debug("Metadata extracted: %s", metadata)`
			`return metadata`