Initial commit

ponzischeme89
2026-01-17 21:49:22 +13:00
commit 3ad3d9bfe0
118 changed files with 18586 additions and 0 deletions
Binary file not shown.
+1376
File diff suppressed because it is too large.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
+67
@@ -0,0 +1,67 @@
"""
Configuration manager - handles settings persistence
"""
import json
import logging
from pathlib import Path
logging.basicConfig(level=logging.INFO)
class ConfigManager:
"""Manages application settings with JSON persistence"""
def __init__(self, file_path="settings.json"):
self.file_path = Path(file_path)
self.settings = {
"api_key": "",
"default_directory": "",
"duration": 40,
"cleaning_patterns": [
r"\(\d{4}\)",
r"\[\d{4}\]",
r"\.\d{4}\.",
r"\.(19|20)\d{2}",
r"\.[a-z]{2,3}\b",
r"\.(?:720p|1080p|2160p|480p|HDRip|BRRip|BluRay|WEBRip|WEB-DL)",
r"\.(?:x264|x265|HEVC|AAC|AC3|DTS)"
]
}
self.load_settings()
def load_settings(self):
"""Load settings from disk"""
if self.file_path.exists():
try:
with open(self.file_path, "r") as f:
self.settings.update(json.load(f))
logging.info("Settings loaded successfully")
except Exception as e:
logging.error(f"Error loading settings: {e}")
def save_settings(self):
"""Save settings to disk"""
try:
with open(self.file_path, "w") as f:
json.dump(self.settings, f, indent=2)
logging.info("Settings saved successfully")
except Exception as e:
logging.error(f"Error saving settings: {e}")
def get(self, key, default=None):
"""Get a setting value"""
return self.settings.get(key, default)
def set(self, key, value):
"""Set a setting value"""
self.settings[key] = value
logging.info(f"Setting updated: {key}")
def get_all(self):
"""Get all settings"""
return self.settings.copy()
def update_multiple(self, updates):
"""Update multiple settings at once"""
self.settings.update(updates)
logging.info(f"Updated {len(updates)} settings")
+828
@@ -0,0 +1,828 @@
"""
Database layer using SQLAlchemy with SQLite
Handles persistent storage for settings, runs, and history
"""
from datetime import datetime
from pathlib import Path
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, Float, Text, ForeignKey
from sqlalchemy.orm import declarative_base, sessionmaker, relationship, scoped_session
import json
import logging
logger = logging.getLogger(__name__)
Base = declarative_base()
class Settings(Base):
"""Settings table - stores application configuration"""
__tablename__ = 'settings'
id = Column(Integer, primary_key=True)
key = Column(String(100), unique=True, nullable=False, index=True)
value = Column(Text, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
def __repr__(self):
return f"<Settings(key='{self.key}', value='{self.value}')>"
class ProcessingRun(Base):
"""Processing runs table - stores each batch processing session"""
__tablename__ = 'processing_runs'
id = Column(Integer, primary_key=True)
started_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
completed_at = Column(DateTime)
total_files = Column(Integer, default=0)
successful_files = Column(Integer, default=0)
failed_files = Column(Integer, default=0)
duration_seconds = Column(Float)
status = Column(String(50), default='in_progress') # in_progress, completed, failed
# Relationship to file results
file_results = relationship("FileResult", back_populates="run", cascade="all, delete-orphan")
def __repr__(self):
return f"<ProcessingRun(id={self.id}, started_at='{self.started_at}', status='{self.status}')>"
class FileResult(Base):
"""File results table - stores individual file processing results"""
__tablename__ = 'file_results'
id = Column(Integer, primary_key=True)
run_id = Column(Integer, ForeignKey('processing_runs.id'), nullable=False, index=True)
file_path = Column(String(500), nullable=False, index=True)
file_name = Column(String(255), nullable=False)
success = Column(Boolean, default=False)
status = Column(String(100))
summary = Column(Text)
error_message = Column(Text)
processed_at = Column(DateTime, default=datetime.utcnow, nullable=False)
duration = Column(Integer) # subtitle duration in seconds
# Relationship back to run
run = relationship("ProcessingRun", back_populates="file_results")
def __repr__(self):
return f"<FileResult(id={self.id}, file_name='{self.file_name}', success={self.success})>"
class ScanHistory(Base):
"""Scan history table - stores directory scan history"""
__tablename__ = 'scan_history'
id = Column(Integer, primary_key=True)
directory = Column(String(500), nullable=False, index=True)
scanned_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
files_found = Column(Integer, default=0)
files_with_plot = Column(Integer, default=0)
scan_duration_ms = Column(Integer)
def __repr__(self):
return f"<ScanHistory(id={self.id}, directory='{self.directory}', files_found={self.files_found})>"
class ScheduledScan(Base):
"""Scheduled scans table - stores scheduled scan jobs and results"""
__tablename__ = 'scheduled_scans'
id = Column(Integer, primary_key=True)
directory = Column(String(500), nullable=False, index=True)
scheduled_for = Column(DateTime, nullable=False, index=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
started_at = Column(DateTime)
completed_at = Column(DateTime)
status = Column(String(50), default='scheduled', index=True) # scheduled, running, completed, cancelled, failed
files_found = Column(Integer, default=0)
files_with_plot = Column(Integer, default=0)
scan_duration_ms = Column(Integer)
error_message = Column(Text)
def __repr__(self):
return f"<ScheduledScan(id={self.id}, directory='{self.directory}', status='{self.status}')>"
class ApiUsage(Base):
"""API usage tracking table - monitors API calls per integration"""
__tablename__ = 'api_usage'
id = Column(Integer, primary_key=True)
provider = Column(String(50), nullable=False, index=True) # omdb, tmdb, tvmaze
endpoint = Column(String(200)) # Specific endpoint called
timestamp = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
success = Column(Boolean, default=True)
response_time_ms = Column(Integer)
call_count = Column(Integer, default=1) # Number of API calls in this batch
def __repr__(self):
return f"<ApiUsage(id={self.id}, provider='{self.provider}', calls={self.call_count}, timestamp='{self.timestamp}')>"
class SuggestedMatch(Base):
"""Suggested matches table - stores auto-matched titles for files"""
__tablename__ = 'suggested_matches'
id = Column(Integer, primary_key=True)
file_path = Column(String(500), nullable=False, unique=True, index=True)
file_name = Column(String(255), nullable=False)
matched_title = Column(String(255), nullable=False)
matched_year = Column(String(10))
matched_imdb_id = Column(String(50))
match_data = Column(Text) # JSON blob with full match data
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
def __repr__(self):
return f"<SuggestedMatch(id={self.id}, file_name='{self.file_name}', matched_title='{self.matched_title}')>"
class DatabaseManager:
"""Manages database connections and operations"""
def __init__(self, db_path="sublogue.db"):
"""Initialize database connection"""
self.db_path = Path(db_path)
self.engine = create_engine(f'sqlite:///{self.db_path}', echo=False)
self.Session = scoped_session(sessionmaker(bind=self.engine))
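        # scoped_session hands each thread its own session, so worker threads
        # (scans, scheduled jobs) can call these methods concurrently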
# Create tables if they don't exist
Base.metadata.create_all(self.engine)
logger.info(f"Database initialized at {self.db_path}")
def get_session(self):
"""Get a new database session"""
return self.Session()
def close_session(self):
"""Close the session"""
self.Session.remove()
# ============ SETTINGS OPERATIONS ============
def get_setting(self, key, default=None):
"""Get a setting value"""
session = self.get_session()
try:
setting = session.query(Settings).filter_by(key=key).first()
if setting:
try:
return json.loads(setting.value)
except json.JSONDecodeError:
return setting.value
return default
finally:
session.close()
def set_setting(self, key, value):
"""Set a setting value"""
session = self.get_session()
try:
setting = session.query(Settings).filter_by(key=key).first()
            # Always store values as JSON so they round-trip with their
            # original types (a raw "true" would otherwise load back as a bool)
            json_value = json.dumps(value)
if setting:
setting.value = json_value
setting.updated_at = datetime.utcnow()
else:
setting = Settings(key=key, value=json_value)
session.add(setting)
session.commit()
logger.info(f"Setting updated: {key}")
except Exception as e:
session.rollback()
logger.error(f"Error setting value: {e}")
raise
finally:
session.close()
def get_all_settings(self):
"""Get all settings as a dictionary"""
session = self.get_session()
try:
settings = session.query(Settings).all()
result = {}
for setting in settings:
try:
result[setting.key] = json.loads(setting.value)
except json.JSONDecodeError:
result[setting.key] = setting.value
return result
finally:
session.close()
def update_settings(self, settings_dict):
"""Update multiple settings at once"""
for key, value in settings_dict.items():
self.set_setting(key, value)
# ============ PROCESSING RUN OPERATIONS ============
def create_run(self, total_files):
"""Create a new processing run"""
session = self.get_session()
try:
run = ProcessingRun(
total_files=total_files,
status='in_progress'
)
session.add(run)
session.commit()
run_id = run.id
logger.info(f"Created processing run {run_id}")
return run_id
except Exception as e:
session.rollback()
logger.error(f"Error creating run: {e}")
raise
finally:
session.close()
def complete_run(self, run_id, successful_files, failed_files):
"""Mark a run as completed"""
session = self.get_session()
try:
run = session.query(ProcessingRun).get(run_id)
if run:
run.completed_at = datetime.utcnow()
run.successful_files = successful_files
run.failed_files = failed_files
run.status = 'completed'
if run.started_at:
duration = (run.completed_at - run.started_at).total_seconds()
run.duration_seconds = duration
session.commit()
logger.info(f"Completed run {run_id}")
except Exception as e:
session.rollback()
logger.error(f"Error completing run: {e}")
raise
finally:
session.close()
def add_file_result(self, run_id, file_path, success, status, summary="", error_message="", duration=40):
"""Add a file processing result"""
session = self.get_session()
try:
result = FileResult(
run_id=run_id,
file_path=file_path,
file_name=Path(file_path).name,
success=success,
status=status,
summary=summary,
error_message=error_message,
duration=duration
)
session.add(result)
session.commit()
except Exception as e:
session.rollback()
logger.error(f"Error adding file result: {e}")
raise
finally:
session.close()
def get_run_history(self, limit=50):
"""Get processing run history"""
session = self.get_session()
try:
runs = session.query(ProcessingRun).order_by(
ProcessingRun.started_at.desc()
).limit(limit).all()
result = []
for run in runs:
result.append({
'id': run.id,
'started_at': run.started_at.isoformat() if run.started_at else None,
'completed_at': run.completed_at.isoformat() if run.completed_at else None,
'total_files': run.total_files,
'successful_files': run.successful_files,
'failed_files': run.failed_files,
'duration_seconds': run.duration_seconds,
'status': run.status
})
return result
finally:
session.close()
def get_run_details(self, run_id):
"""Get detailed information about a specific run"""
session = self.get_session()
try:
run = session.query(ProcessingRun).get(run_id)
if not run:
return None
file_results = []
for result in run.file_results:
file_results.append({
'id': result.id,
'file_path': result.file_path,
'file_name': result.file_name,
'success': result.success,
'status': result.status,
'summary': result.summary,
'error_message': result.error_message,
'processed_at': result.processed_at.isoformat() if result.processed_at else None,
'duration': result.duration
})
return {
'id': run.id,
'started_at': run.started_at.isoformat() if run.started_at else None,
'completed_at': run.completed_at.isoformat() if run.completed_at else None,
'total_files': run.total_files,
'successful_files': run.successful_files,
'failed_files': run.failed_files,
'duration_seconds': run.duration_seconds,
'status': run.status,
'file_results': file_results
}
finally:
session.close()
# ============ SCAN HISTORY OPERATIONS ============
def add_scan_history(self, directory, files_found, files_with_plot, scan_duration_ms):
"""Add a scan history entry"""
session = self.get_session()
try:
scan = ScanHistory(
directory=directory,
files_found=files_found,
files_with_plot=files_with_plot,
scan_duration_ms=scan_duration_ms
)
session.add(scan)
session.commit()
logger.info(f"Scan history saved for {directory}")
except Exception as e:
session.rollback()
logger.error(f"Error saving scan history: {e}")
raise
finally:
session.close()
def get_scan_history(self, limit=50):
"""Get scan history"""
session = self.get_session()
try:
scans = session.query(ScanHistory).order_by(
ScanHistory.scanned_at.desc()
).limit(limit).all()
result = []
for scan in scans:
result.append({
'id': scan.id,
'directory': scan.directory,
'scanned_at': scan.scanned_at.isoformat() if scan.scanned_at else None,
'files_found': scan.files_found,
'files_with_plot': scan.files_with_plot,
'scan_duration_ms': scan.scan_duration_ms
})
return result
finally:
session.close()
# ============ SCHEDULED SCAN OPERATIONS ============
def create_scheduled_scan(self, directory, scheduled_for):
"""Create a scheduled scan entry"""
session = self.get_session()
try:
scheduled = ScheduledScan(
directory=directory,
scheduled_for=scheduled_for,
status='scheduled'
)
session.add(scheduled)
session.commit()
return scheduled.id
except Exception as e:
session.rollback()
logger.error(f"Error creating scheduled scan: {e}")
raise
finally:
session.close()
def get_scheduled_scans(self, limit=50, status=None):
"""Get scheduled scans"""
session = self.get_session()
try:
query = session.query(ScheduledScan)
if status:
query = query.filter_by(status=status)
scans = query.order_by(ScheduledScan.scheduled_for.desc()).limit(limit).all()
result = []
for scan in scans:
result.append({
'id': scan.id,
'directory': scan.directory,
'scheduled_for': scan.scheduled_for.isoformat() if scan.scheduled_for else None,
'created_at': scan.created_at.isoformat() if scan.created_at else None,
'started_at': scan.started_at.isoformat() if scan.started_at else None,
'completed_at': scan.completed_at.isoformat() if scan.completed_at else None,
'status': scan.status,
'files_found': scan.files_found,
'files_with_plot': scan.files_with_plot,
'scan_duration_ms': scan.scan_duration_ms,
'error_message': scan.error_message
})
return result
finally:
session.close()
def get_scheduled_scan(self, scan_id):
"""Get a single scheduled scan by id"""
session = self.get_session()
try:
scan = session.query(ScheduledScan).get(scan_id)
if not scan:
return None
return {
'id': scan.id,
'directory': scan.directory,
'scheduled_for': scan.scheduled_for.isoformat() if scan.scheduled_for else None,
'created_at': scan.created_at.isoformat() if scan.created_at else None,
'started_at': scan.started_at.isoformat() if scan.started_at else None,
'completed_at': scan.completed_at.isoformat() if scan.completed_at else None,
'status': scan.status,
'files_found': scan.files_found,
'files_with_plot': scan.files_with_plot,
'scan_duration_ms': scan.scan_duration_ms,
'error_message': scan.error_message
}
finally:
session.close()
def get_due_scheduled_scans(self, now):
"""Get scheduled scans that are due to run"""
session = self.get_session()
try:
scans = session.query(ScheduledScan).filter(
ScheduledScan.status == 'scheduled',
ScheduledScan.scheduled_for <= now
).order_by(ScheduledScan.scheduled_for.asc()).all()
return [scan.id for scan in scans]
finally:
session.close()
def mark_scheduled_scan_running(self, scan_id, started_at=None):
"""Mark a scheduled scan as running"""
session = self.get_session()
try:
scan = session.query(ScheduledScan).get(scan_id)
if not scan or scan.status != 'scheduled':
return False
scan.status = 'running'
scan.started_at = started_at or datetime.utcnow()
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error marking scheduled scan running: {e}")
return False
finally:
session.close()
def complete_scheduled_scan(self, scan_id, files_found, files_with_plot, scan_duration_ms, completed_at=None):
"""Mark a scheduled scan as completed"""
session = self.get_session()
try:
scan = session.query(ScheduledScan).get(scan_id)
if not scan:
return False
scan.status = 'completed'
scan.completed_at = completed_at or datetime.utcnow()
scan.files_found = files_found
scan.files_with_plot = files_with_plot
scan.scan_duration_ms = scan_duration_ms
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error completing scheduled scan: {e}")
return False
finally:
session.close()
def fail_scheduled_scan(self, scan_id, error_message, completed_at=None):
"""Mark a scheduled scan as failed"""
session = self.get_session()
try:
scan = session.query(ScheduledScan).get(scan_id)
if not scan:
return False
scan.status = 'failed'
scan.completed_at = completed_at or datetime.utcnow()
scan.error_message = error_message
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error failing scheduled scan: {e}")
return False
finally:
session.close()
def cancel_scheduled_scan(self, scan_id):
"""Cancel a scheduled scan"""
session = self.get_session()
try:
scan = session.query(ScheduledScan).get(scan_id)
if not scan or scan.status != 'scheduled':
return False
scan.status = 'cancelled'
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error cancelling scheduled scan: {e}")
return False
finally:
session.close()
# ============ API USAGE OPERATIONS ============
def track_api_call(self, provider, endpoint=None, success=True, response_time_ms=None, call_count=1):
"""Track API call(s) for usage monitoring
Args:
provider: API provider name (omdb, tmdb, tvmaze)
endpoint: Specific endpoint called
success: Whether the call(s) succeeded
response_time_ms: Total response time in milliseconds
call_count: Number of API calls made (for batched operations)
"""
session = self.get_session()
try:
usage = ApiUsage(
provider=provider,
endpoint=endpoint,
success=success,
response_time_ms=response_time_ms,
call_count=call_count
)
session.add(usage)
session.commit()
logger.debug(f"Tracked {call_count} API call(s) to {provider}")
except Exception as e:
session.rollback()
logger.error(f"Error tracking API call: {e}")
finally:
session.close()
def get_api_usage_24h(self, provider):
"""Get API call count for the last 24 hours (sums call_count field)"""
session = self.get_session()
try:
from datetime import timedelta
from sqlalchemy import func
cutoff_time = datetime.utcnow() - timedelta(hours=24)
# Sum the call_count column to get total API calls
result = session.query(func.coalesce(func.sum(ApiUsage.call_count), 0)).filter(
ApiUsage.provider == provider,
ApiUsage.timestamp >= cutoff_time
).scalar()
return result or 0
finally:
session.close()
def check_api_limit(self, provider, limit=1000):
"""Check if API usage is under the limit"""
current_usage = self.get_api_usage_24h(provider)
return {
'under_limit': current_usage < limit,
'current_usage': current_usage,
'limit': limit,
'remaining': max(0, limit - current_usage)
}
def get_usage_stats(self, provider):
"""Get detailed usage statistics for a provider"""
session = self.get_session()
try:
from datetime import timedelta
cutoff_time = datetime.utcnow() - timedelta(hours=24)
# Get 24h usage records
usage_24h = session.query(ApiUsage).filter(
ApiUsage.provider == provider,
ApiUsage.timestamp >= cutoff_time
).all()
# Sum call_count for accurate totals
total_calls = sum(u.call_count or 1 for u in usage_24h)
successful_calls = sum(u.call_count or 1 for u in usage_24h if u.success)
failed_calls = total_calls - successful_calls
            # Average response time weighted by call count, computed only over
            # records that actually recorded a time
            timed = [u for u in usage_24h if u.response_time_ms is not None]
            timed_calls = sum(u.call_count or 1 for u in timed)
            weighted_times = [u.response_time_ms * (u.call_count or 1) for u in timed]
            avg_response_time = sum(weighted_times) / timed_calls if timed_calls else 0
# Find oldest call in 24h window to calculate reset time
oldest_call = min([u.timestamp for u in usage_24h]) if usage_24h else datetime.utcnow()
reset_time = oldest_call + timedelta(hours=24)
return {
'provider': provider,
'total_calls_24h': total_calls,
'successful_calls': successful_calls,
'failed_calls': failed_calls,
'avg_response_time_ms': round(avg_response_time, 2),
'reset_time': reset_time.isoformat(),
'limit': 1000,
'remaining': max(0, 1000 - total_calls)
}
finally:
session.close()
def get_all_usage_stats(self):
"""Get usage statistics for all providers"""
providers = ['omdb', 'tmdb', 'tvmaze']
return {provider: self.get_usage_stats(provider) for provider in providers}
# ============ SUGGESTED MATCHES OPERATIONS ============
def save_suggested_match(self, file_path, file_name, match_data):
"""Save or update a suggested match for a file"""
session = self.get_session()
try:
# Check if match already exists
existing = session.query(SuggestedMatch).filter_by(file_path=file_path).first()
if existing:
# Update existing match
existing.matched_title = match_data.get('title', '')
existing.matched_year = match_data.get('year', '')
existing.matched_imdb_id = match_data.get('imdb_id', '')
existing.match_data = json.dumps(match_data)
existing.updated_at = datetime.utcnow()
else:
# Create new match
new_match = SuggestedMatch(
file_path=file_path,
file_name=file_name,
matched_title=match_data.get('title', ''),
matched_year=match_data.get('year', ''),
matched_imdb_id=match_data.get('imdb_id', ''),
match_data=json.dumps(match_data)
)
session.add(new_match)
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error saving suggested match: {e}")
return False
finally:
session.close()
def get_suggested_match(self, file_path):
"""Get suggested match for a specific file"""
session = self.get_session()
try:
match = session.query(SuggestedMatch).filter_by(file_path=file_path).first()
if match:
return {
'file_path': match.file_path,
'file_name': match.file_name,
'matched_title': match.matched_title,
'matched_year': match.matched_year,
'matched_imdb_id': match.matched_imdb_id,
'match_data': json.loads(match.match_data) if match.match_data else {},
'created_at': match.created_at.isoformat(),
'updated_at': match.updated_at.isoformat()
}
return None
finally:
session.close()
def get_suggested_matches_for_directory(self, directory):
"""Get all suggested matches for files in a directory"""
session = self.get_session()
try:
matches = session.query(SuggestedMatch).filter(
SuggestedMatch.file_path.like(f"{directory}%")
).all()
return {
match.file_path: json.loads(match.match_data) if match.match_data else {}
for match in matches
}
finally:
session.close()
def delete_suggested_match(self, file_path):
"""Delete a suggested match for a file"""
session = self.get_session()
try:
match = session.query(SuggestedMatch).filter_by(file_path=file_path).first()
if match:
session.delete(match)
session.commit()
return True
return False
except Exception as e:
session.rollback()
logger.error(f"Error deleting suggested match: {e}")
return False
finally:
session.close()
def clear_all_suggested_matches(self):
"""Clear all suggested matches"""
session = self.get_session()
try:
session.query(SuggestedMatch).delete()
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error clearing suggested matches: {e}")
return False
finally:
session.close()
# ============ MAINTENANCE OPERATIONS ============
def clear_settings(self, keep_api_keys=False):
"""Clear all settings, optionally keeping API keys"""
session = self.get_session()
try:
keep_values = {}
if keep_api_keys:
for key in ("omdb_api_key", "tmdb_api_key", "api_key"):
setting = session.query(Settings).filter_by(key=key).first()
if setting:
keep_values[key] = setting.value
session.query(Settings).delete()
session.commit()
if keep_values:
for key, value in keep_values.items():
session.add(Settings(key=key, value=value))
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error clearing settings: {e}")
return False
finally:
session.close()
def clear_history_and_logs(self):
"""Clear processing runs, scan history, scheduled scans, and API usage logs"""
session = self.get_session()
try:
session.query(FileResult).delete()
session.query(ProcessingRun).delete()
session.query(ScanHistory).delete()
session.query(ScheduledScan).delete()
session.query(ApiUsage).delete()
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error clearing history and logs: {e}")
return False
finally:
session.close()
# ============ STATISTICS ============
def get_statistics(self):
"""Get overall statistics"""
session = self.get_session()
try:
total_runs = session.query(ProcessingRun).count()
completed_runs = session.query(ProcessingRun).filter_by(status='completed').count()
total_files_processed = session.query(FileResult).count()
successful_files = session.query(FileResult).filter_by(success=True).count()
return {
'total_runs': total_runs,
'completed_runs': completed_runs,
'total_files_processed': total_files_processed,
'successful_files': successful_files,
'failed_files': total_files_processed - successful_files
}
finally:
session.close()
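# Illustrative usage (a sketch; paths and counts are examples):
#     db = DatabaseManager("sublogue.db")
#     run_id = db.create_run(total_files=10)
#     db.add_file_result(run_id, "/media/movie.srt", True, "Has Plot")
#     db.complete_run(run_id, successful_files=9, failed_files=1)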
+302
@@ -0,0 +1,302 @@
import logging
import os
import re
from pathlib import Path
from typing import Generator, List, Dict
# ------------------------------------------------------------
# Logging configuration
# ------------------------------------------------------------
logger = logging.getLogger("FileScanner")
logger.setLevel(logging.INFO) # Change to DEBUG for deep tracing
if not logger.handlers:  # avoid duplicate handlers if the module is imported twice
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)
# ------------------------------------------------------------
# Import subtitle parser
# ------------------------------------------------------------
import sys
sys.path.insert(0, str(Path(__file__).parent))
from subtitle_processor import parse_srt
class FileScanner:
"""
Efficient, disk-friendly subtitle scanner.
- Recursive (os.scandir-based)
- Streams file reads
- Batches results
- Extensive logging for observability
"""
SUPPORTED_EXTENSIONS = {".srt"}
MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024 # 5 MB
PLOT_SCAN_LINES = 50
DEFAULT_BATCH_SIZE = 100
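    # _check_has_plot reads at most PLOT_SCAN_LINES lines per file, and files
    # over MAX_FILE_SIZE_BYTES are skipped outright, keeping scans disk-friendly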
# --------------------------------------------------------
# Public API
# --------------------------------------------------------
@classmethod
def scan_directory(
cls,
directory_path: str | Path,
batch_size: int = DEFAULT_BATCH_SIZE,
follow_symlinks: bool = False,
) -> Generator[List[Dict], None, None]:
"""
Recursively scan a directory tree for .srt files.
Yields batches of metadata dictionaries.
"""
root = Path(directory_path)
logger.info("Starting subtitle scan")
logger.info("Root directory : %s", root)
logger.info("Batch size : %s", batch_size)
logger.info("Follow symlinks : %s", follow_symlinks)
if not root.exists():
logger.error("Scan failed: path does not exist (%s)", root)
raise ValueError(f"Directory does not exist: {directory_path}")
if not root.is_dir():
logger.error("Scan failed: not a directory (%s)", root)
raise ValueError(f"Invalid directory: {directory_path}")
batch: List[Dict] = []
total_seen = 0
total_srt = 0
total_skipped = 0
for file_path in cls._walk_files(root, follow_symlinks):
total_seen += 1
if file_path.suffix.lower() not in cls.SUPPORTED_EXTENSIONS:
logger.debug("Ignoring non-subtitle file: %s", file_path)
continue
total_srt += 1
logger.debug("Found subtitle file: %s", file_path)
# --------------------------------------------
# Stat / size guard
# --------------------------------------------
try:
stat = file_path.stat()
except OSError as e:
total_skipped += 1
logger.warning(
"Skipping unreadable file: %s (%s)",
file_path, e
)
continue
if stat.st_size > cls.MAX_FILE_SIZE_BYTES:
total_skipped += 1
logger.warning(
"Skipping large subtitle file (%d bytes): %s",
stat.st_size, file_path
)
continue
# --------------------------------------------
# Plot detection
# --------------------------------------------
try:
has_plot = cls._check_has_plot(file_path)
logger.debug(
"Plot check for %s: %s",
file_path.name,
"FOUND" if has_plot else "NOT FOUND"
)
except Exception as e:
total_skipped += 1
logger.error(
"Plot scan failed for %s: %s",
file_path, e
)
continue
metadata = {}
if has_plot:
try:
metadata = cls._extract_metadata(file_path)
logger.debug(
"Extracted metadata from %s: %s",
file_path.name,
{k: v for k, v in metadata.items() if v}
)
except Exception as e:
logger.warning(
"Metadata extraction failed for %s: %s",
file_path.name, e
)
batch.append({
"path": str(file_path),
"name": file_path.name,
"has_plot": has_plot,
"status": "Has Plot" if has_plot else "Not Loaded",
"summary": metadata.get("summary", ""),
"plot": metadata.get("summary", ""),
"title": metadata.get("title"),
"year": metadata.get("year"),
"imdb_rating": metadata.get("imdb_rating"),
"rating": metadata.get("imdb_rating"),
"runtime": metadata.get("runtime"),
"selected": False,
})
if len(batch) >= batch_size:
logger.info(
"Yielding batch (%d items, %d total files scanned)",
len(batch),
total_seen
)
yield batch
batch = []
if batch:
logger.info(
"Yielding final batch (%d items)",
len(batch)
)
yield batch
logger.info("Subtitle scan completed")
logger.info("Files visited : %d", total_seen)
logger.info("Subtitle files found : %d", total_srt)
logger.info("Files skipped : %d", total_skipped)
# --------------------------------------------------------
# Internal helpers
# --------------------------------------------------------
@staticmethod
def _walk_files(root: Path, follow_symlinks: bool):
"""
Fast iterative recursive directory walk using os.scandir.
"""
logger.debug("Beginning recursive walk at %s", root)
stack = [root]
while stack:
current = stack.pop()
logger.debug("Scanning directory: %s", current)
try:
with os.scandir(current) as entries:
for entry in entries:
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
stack.append(Path(entry.path))
elif entry.is_file():
yield Path(entry.path)
except OSError as e:
logger.debug(
"Skipping entry due to OS error: %s (%s)",
entry.path, e
)
except OSError as e:
logger.warning(
"Cannot access directory: %s (%s)",
current, e
)
@classmethod
def _check_has_plot(cls, file_path: Path) -> bool:
"""
Check first N lines for Sublogue signature.
"""
logger.debug("Scanning for plot marker in %s", file_path.name)
try:
with file_path.open("r", encoding="utf-8", errors="ignore") as f:
for i, line in enumerate(f):
if i >= cls.PLOT_SCAN_LINES:
break
if "generated by sublogue" in line.lower():
logger.debug(
"Plot marker found in %s (line %d)",
file_path.name, i + 1
)
return True
except Exception as e:
logger.error(
"Error reading file during plot scan: %s (%s)",
file_path, e
)
return False
@classmethod
def _extract_metadata(cls, file_path: Path) -> Dict:
"""
Extract title, year, rating, runtime, and plot
from Sublogue-generated subtitles.
"""
logger.debug("Extracting metadata from %s", file_path.name)
content = file_path.read_text(encoding="utf-8", errors="ignore")
blocks = parse_srt(content)
metadata = {
"title": None,
"year": None,
"imdb_rating": None,
"runtime": None,
"summary": ""
}
if len(blocks) < 2:
logger.debug("Not enough subtitle blocks for metadata extraction")
return metadata
# --------------------------------------------
# Plot block (index 1)
# --------------------------------------------
plot_text = blocks[1].text
plot_text = plot_text.split("Generated by Sublogue")[0].strip()
metadata["summary"] = plot_text
# --------------------------------------------
# Header block (index 0)
# --------------------------------------------
header_lines = blocks[0].text.split("\n")
if header_lines:
first_line = header_lines[0]
year_match = re.search(r"\((\d{4})\)", first_line)
if year_match:
metadata["year"] = year_match.group(1)
metadata["title"] = first_line[:year_match.start()].strip()
else:
metadata["title"] = first_line.strip()
if len(header_lines) > 1:
second_line = header_lines[1]
rating_match = re.search(r"IMDb:\s*([^\s]+)", second_line)
if rating_match:
metadata["imdb_rating"] = rating_match.group(1)
            # The runtime is assumed to follow the IMDb rating on this line
            runtime_match = re.search(r"IMDb:\s*\S+\s+(.+)", second_line)
            if runtime_match:
                metadata["runtime"] = runtime_match.group(1).strip()
logger.debug("Metadata extracted: %s", metadata)
return metadata
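# Illustrative usage (a sketch; the directory is an example):
#     for batch in FileScanner.scan_directory("/media/subtitles", batch_size=50):
#         for item in batch:
#             print(item["name"], item["status"])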
+501
@@ -0,0 +1,501 @@
"""
Keyword stripper utility - removes common junk keywords from filenames and subtitle content
Optimised for torrent / subtitle garbage while preserving real titles and dialogue
"""
from __future__ import annotations
import re
import logging
from typing import Optional, List
logger = logging.getLogger(__name__)
class KeywordStripper:
"""
High-performance filename cleaner for movies & TV.
Design goals:
- Torrent / subtitle spam annihilation
- Minimal false positives
- Regex compiled once
- Fast enough for large libraries
"""
# -----------------------------
# CORE JUNK PATTERNS
# -----------------------------
QUALITY = r"""
\b(
480p|720p|1080p|2160p|4320p|
4k|8k|
hdr|hdr10|hdr10\+|dolby\s*vision|dv|
bluray|blu[-\s]?ray|bdrip|brrip|bd|
webrip|web[-\s]?dl|web|
dvdrip|dvd|dvdscr|
cam|ts|telesync|telecine|tc|
hdrip|hdlight
)\b
"""
CODECS = r"""
\b(
x264|x265|h\.?264|h\.?265|hevc|
xvid|divx|
aac|ac3|dts|truehd|atmos|
dd5\.1|dd\+|
flac|mp3|opus|
8bit|10bit|hi10p
)\b
"""
TORRENT_GROUPS = r"""
\b(
yts(\.mx)?|yify|
rarbg|eztv|ettv|
psa|ion10|fgт|fgt|
tgx|torrentgalaxy|
1337x|limetorrent|
publichd|scene|
ganool|evo
)\b
"""
SUBTITLE_ADS = r"""
\b(
opensubtitles|
subscene|
addic7ed|
podnapisi|
yifysubtitles|
subtitles?\s*by|
synced?\s*by|
encoded?\s*by|
resynced?\s*by
)\b
|
www\.[a-z0-9\-]+\.(com|org|net)
"""
LANGUAGES = r"""
\b(
eng|english|
ita|italian|
fra|french|
spa|spanish|
ger|german|
multi|dubbed|
vostfr|subfrench|
subs?|subtitles?
)\b
"""
EDITIONS = r"""
\b(
unrated|uncut|
directors?\s*cut|
extended|
theatrical|
imax|
special\s*edition|
limited|
internal|
proper|repack|real
)\b
"""
# -----------------------------
# STRUCTURAL NOISE
# -----------------------------
BRACKETS = r"""
[\[\(\{]
.*?
[\]\)\}]
"""
SEPARATORS = r"[._\-]+"
MULTISPACE = r"\s+"
YEAR_PATTERN = r"(19\d{2}|20\d{2})"
SEASON_EPISODE = [
r"[Ss](\d{1,2})[Ee](\d{1,2})",
r"(\d{1,2})x(\d{1,2})",
r"Season\s*(\d{1,2})\s*Episode\s*(\d{1,2})",
]
# -----------------------------
# SUBTITLE CONTENT ADS/WATERMARKS
# -----------------------------
# These patterns are specifically for cleaning embedded ads from subtitle TEXT
# They're more aggressive than filename patterns since we want to remove entire lines
# Release group watermarks that appear in subtitle text
SUBTITLE_WATERMARKS = [
# YTS and variants
r"yts\.mx",
r"yts\.am",
r"yts\.lt",
r"yts\.ag",
r"\byts\b",
r"\byify\b",
# RARBG and other groups
r"\brarbg\b",
r"\beztv\b",
r"\bettv\b",
r"torrentgalaxy",
r"\btgx\b",
r"1337x",
r"limetorrents?",
r"\bevo\b",
r"\bpsa\b",
r"\bfgt\b",
# Subtitle sites
r"opensubtitles?",
r"subscene",
r"addic7ed",
r"podnapisi",
r"yifysubtitles?",
r"sub\.?scene",
r"legendas\.?tv",
r"shooter\.?cn",
r"subhd",
# Generic patterns
r"downloaded\s+from",
r"subtitles?\s+by",
r"sync(?:ed|hronized)?\s+(?:and\s+)?correct(?:ed|ions?)?\s+by",
r"ripped\s+by",
r"encoded?\s+by",
r"resynce?d?\s+by",
r"improved\s+by",
r"fixed\s+by",
r"translated\s+by",
r"captioned\s+by",
r"support\s+us\s+and",
r"get\s+more\s+subtitles",
r"quality\s+subtitles",
r"best\s+subtitles",
r"free\s+subtitles",
# URLs and domains
r"www\.[a-z0-9\-]+\.(com|org|net|io|tv|mx|am|lt|ag)",
r"https?://[^\s]+",
# Social media handles that are clearly ads
r"@yaborr",
r"@sub_scene",
r"follow\s+us\s+on",
r"join\s+us\s+at",
r"visit\s+us\s+at",
# Promotional text
r"advertise\s+here",
r"membership\s+(is\s+)?free",
r"become\s+a\s+member",
r"register\s+(now|today|free)",
r"sign\s+up\s+(now|today|free)",
]
# Patterns that indicate an ENTIRE subtitle block should be removed
# (not just the matching text, but the whole block)
SUBTITLE_BLOCK_REMOVERS = [
# Pure promotional blocks
r"^[\s\-_]*(?:www\.)?yts",
r"^[\s\-_]*(?:www\.)?rarbg",
r"^[\s\-_]*opensubtitles",
r"^[\s\-_]*subscene",
r"^[\s\-_]*downloaded\s+from",
r"^[\s\-_]*subtitles?\s+by",
r"^[\s\-_]*sync(?:ed)?\s+(?:and\s+)?correct",
r"^[\s\-_]*support\s+us",
r"^[\s\-_]*get\s+(?:more\s+)?subtitles",
r"^[\s\-_]*quality\s+subtitles",
r"^[\s\-_]*advertise",
# ASCII art headers/footers (often used for ads)
r"^[\s\-=_\*]{10,}$",
# Empty after cleaning
r"^\s*$",
]
# -----------------------------
# COMPILED REGEX CACHE
# -----------------------------
_compiled = None
@classmethod
def _compile(cls):
if cls._compiled:
return cls._compiled
def c(p):
return re.compile(p, re.IGNORECASE | re.VERBOSE)
cls._compiled = {
"junk": c("|".join([
cls.QUALITY,
cls.CODECS,
cls.TORRENT_GROUPS,
cls.SUBTITLE_ADS,
cls.LANGUAGES,
cls.EDITIONS,
])),
"brackets": c(cls.BRACKETS),
"separators": re.compile(cls.SEPARATORS),
"multispace": re.compile(cls.MULTISPACE),
"year": re.compile(cls.YEAR_PATTERN),
"season_episode": [re.compile(p, re.IGNORECASE) for p in cls.SEASON_EPISODE],
# Subtitle content cleaning patterns
"subtitle_watermarks": [
re.compile(p, re.IGNORECASE) for p in cls.SUBTITLE_WATERMARKS
],
"subtitle_block_removers": [
re.compile(p, re.IGNORECASE | re.MULTILINE) for p in cls.SUBTITLE_BLOCK_REMOVERS
],
}
return cls._compiled
# -----------------------------
# PUBLIC API
# -----------------------------
def strip_keywords(self, title: str, preserve_year: bool = True) -> str:
rx = self._compile()
original = title
# Extract year early
year: Optional[str] = None
if preserve_year:
m = rx["year"].search(title)
if m:
year = m.group(1)
# Remove obvious junk
cleaned = rx["junk"].sub("", title)
# Remove bracketed junk AFTER stripping known keywords
cleaned = rx["brackets"].sub("", cleaned)
# Normalize separators
cleaned = rx["separators"].sub(" ", cleaned)
cleaned = rx["multispace"].sub(" ", cleaned).strip()
# Re-append year
if preserve_year and year and year not in cleaned:
cleaned = f"{cleaned} ({year})"
logger.debug("KeywordStripper: '%s' -> '%s'", original, cleaned)
return cleaned
def extract_year(self, title: str) -> Optional[str]:
rx = self._compile()
m = rx["year"].search(title)
return m.group(1) if m else None
def extract_season_episode(self, title: str):
rx = self._compile()
for p in rx["season_episode"]:
m = p.search(title)
if m:
return int(m.group(1)), int(m.group(2))
return None, None
def clean_filename(self, filename: str, preserve_year: bool = True) -> dict:
name = re.sub(r"\.[^.]+$", "", filename)
season, episode = self.extract_season_episode(name)
year = self.extract_year(name)
cleaned = self.strip_keywords(name, preserve_year=preserve_year)
return {
"cleaned_title": cleaned,
"year": year,
"season": season,
"episode": episode,
"is_series": season is not None or episode is not None,
}
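    # Illustrative example (hypothetical filename):
    #     clean_filename("The.Matrix.1999.1080p.BluRay.x264-YIFY.srt")
    #     -> {"cleaned_title": "The Matrix 1999", "year": "1999",
    #         "season": None, "episode": None, "is_series": False}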
# -----------------------------
# SUBTITLE CONTENT CLEANING
# -----------------------------
def should_remove_subtitle_block(self, text: str) -> bool:
"""
Check if an entire subtitle block should be removed.
Returns True if the block is purely promotional/ad content
with no legitimate dialogue.
Args:
text: The subtitle text content
Returns:
True if block should be removed entirely
"""
rx = self._compile()
# Check each line of the subtitle
lines = text.strip().split('\n')
non_ad_lines = 0
for line in lines:
line = line.strip()
if not line:
continue
# Check if this line matches any block remover pattern
is_ad_line = False
for pattern in rx["subtitle_block_removers"]:
if pattern.search(line):
is_ad_line = True
break
# Also check watermarks - if the entire line is just a watermark
if not is_ad_line:
temp_line = line
for pattern in rx["subtitle_watermarks"]:
temp_line = pattern.sub("", temp_line)
# If after removing watermarks, line is empty or just punctuation
temp_line = re.sub(r'[\s\-_\.\,\!\?\:\;]+', '', temp_line)
if not temp_line:
is_ad_line = True
if not is_ad_line:
non_ad_lines += 1
# If no legitimate content remains, remove the block
return non_ad_lines == 0
def clean_subtitle_text(self, text: str) -> str:
"""
Clean watermarks and ads from subtitle text while preserving dialogue.
This is more surgical than should_remove_subtitle_block() - it removes
specific ad text but keeps the rest of the subtitle intact.
Args:
text: The subtitle text content
Returns:
Cleaned text with ads removed, or empty string if nothing remains
"""
rx = self._compile()
original = text
# Process line by line to handle multi-line subtitles
lines = text.split('\n')
cleaned_lines = []
for line in lines:
cleaned_line = line
# Remove watermark patterns
for pattern in rx["subtitle_watermarks"]:
cleaned_line = pattern.sub("", cleaned_line)
# Clean up resulting whitespace
cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
# Only keep lines that have content after cleaning
if cleaned_line:
cleaned_lines.append(cleaned_line)
result = '\n'.join(cleaned_lines)
# Final cleanup - remove lines that are just punctuation/dashes
result_lines = result.split('\n')
result_lines = [l for l in result_lines if re.search(r'[a-zA-Z0-9]', l)]
result = '\n'.join(result_lines)
if result != original:
logger.debug("Cleaned subtitle text: '%s' -> '%s'", original[:50], result[:50])
return result
def clean_subtitle_blocks(self, blocks: List[dict]) -> List[dict]:
"""
Clean a list of subtitle blocks, removing ads and watermarks.
This processes each block:
1. Checks if the entire block should be removed (pure ad content)
2. If not, cleans watermarks from the text
Args:
blocks: List of subtitle block dicts with 'text' key
Returns:
Cleaned list with ad blocks removed and watermarks stripped
"""
cleaned = []
removed_count = 0
modified_count = 0
for block in blocks:
text = block.get("text", "")
# Check if entire block should be removed
if self.should_remove_subtitle_block(text):
removed_count += 1
logger.debug("Removing ad block: '%s'", text[:50])
continue
# Clean the text
cleaned_text = self.clean_subtitle_text(text)
# Skip if cleaning resulted in empty text
if not cleaned_text.strip():
removed_count += 1
continue
# Track if we modified the text
if cleaned_text != text:
modified_count += 1
# Create new block with cleaned text
cleaned_block = block.copy()
cleaned_block["text"] = cleaned_text
cleaned.append(cleaned_block)
if removed_count > 0 or modified_count > 0:
logger.info(
"Subtitle cleaning: removed %d ad blocks, modified %d blocks",
removed_count, modified_count
)
return cleaned
# -----------------------------
# SINGLETON HELPERS
# -----------------------------
_default_stripper: Optional[KeywordStripper] = None
def get_stripper() -> KeywordStripper:
global _default_stripper
if _default_stripper is None:
_default_stripper = KeywordStripper()
return _default_stripper
def clean_title(title: str, preserve_year: bool = True) -> str:
return get_stripper().strip_keywords(title, preserve_year)
def clean_filename(filename: str, preserve_year: bool = True) -> dict:
return get_stripper().clean_filename(filename, preserve_year)
def clean_subtitle_content(text: str) -> str:
"""Clean watermarks and ads from subtitle text."""
return get_stripper().clean_subtitle_text(text)
def should_remove_subtitle(text: str) -> bool:
"""Check if a subtitle block should be removed entirely (pure ad)."""
return get_stripper().should_remove_subtitle_block(text)
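# Illustrative example (hypothetical ad block):
#     should_remove_subtitle("Downloaded from\nwww.YTS.MX")  # -> True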
+243
@@ -0,0 +1,243 @@
"""
OMDb API client - async movie metadata fetching
Includes:
- Rate limiting (requests / second)
- Concurrency limiting
- Request de-duplication
- Shared aiohttp session
"""
from __future__ import annotations
import asyncio
import aiohttp
import logging
import time
from typing import Dict, Optional
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RateLimiter:
"""
Simple async rate limiter (token bucket-ish).
Limits how often requests may start.
"""
    def __init__(self, rate_per_second: float):
        if rate_per_second <= 0:
            raise ValueError("rate_per_second must be positive")
        self._interval = 1.0 / rate_per_second
        self._lock = asyncio.Lock()
        self._last_call = 0.0
async def wait(self):
async with self._lock:
now = time.monotonic()
delta = now - self._last_call
if delta < self._interval:
await asyncio.sleep(self._interval - delta)
self._last_call = time.monotonic()
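    # e.g. rate_per_second=2.0 -> at most one request start every 0.5 s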
class OMDbClient:
"""Async, rate-limited OMDb client"""
BASE_URL = "https://www.omdbapi.com/"
def __init__(
self,
api_key: str,
*,
max_concurrent: int = 5,
rate_limit_per_sec: float = 2.0,
db_manager=None,
timeout: int = 15,
):
self.api_key = api_key
self.db_manager = db_manager
self.semaphore = asyncio.Semaphore(max_concurrent)
self.rate_limiter = RateLimiter(rate_limit_per_sec)
self._session: Optional[aiohttp.ClientSession] = None
self._inflight: Dict[str, asyncio.Future] = {}
self._timeout = aiohttp.ClientTimeout(total=timeout)
# -----------------------------
# Session management
# -----------------------------
async def _get_session(self) -> aiohttp.ClientSession:
if self._session is None or self._session.closed:
self._session = aiohttp.ClientSession(timeout=self._timeout)
return self._session
async def close(self):
if self._session and not self._session.closed:
await self._session.close()
# -----------------------------
# Public API
# -----------------------------
async def fetch_summary(
self,
title: str,
media_type: str = "movie",
year: Optional[str] = None,
season: Optional[int] = None,
episode: Optional[int] = None,
) -> Optional[dict]:
"""
Fetch summary from OMDb.
Automatically deduplicates identical in-flight requests.
Args:
title: Movie/show title to search
media_type: "movie" or "series"
year: Year of release (improves match accuracy)
season: Season number (for series)
episode: Episode number (for series)
"""
key = self._make_cache_key(title, media_type, year, season, episode)
# Deduplicate in-flight requests
if key in self._inflight:
logger.debug("Awaiting inflight OMDb request: %s", key)
return await self._inflight[key]
loop = asyncio.get_running_loop()
future = loop.create_future()
self._inflight[key] = future
try:
result = await self._fetch_summary_internal(
title, media_type, year, season, episode
)
future.set_result(result)
return result
except Exception as e:
future.set_exception(e)
raise
finally:
self._inflight.pop(key, None)
# -----------------------------
# Internal fetch logic
# -----------------------------
async def _fetch_summary_internal(
self,
title: str,
media_type: str,
year: Optional[str],
season: Optional[int],
episode: Optional[int],
) -> Optional[dict]:
logger.info("Fetching OMDb summary for: %s (year=%s)", title, year)
async with self.semaphore:
await self.rate_limiter.wait()
params = {
"apikey": self.api_key,
"t": title,
"plot": "short",
}
# Add year for better matching accuracy
if year:
params["y"] = year
if media_type == "series":
params["type"] = "series"
if season is not None:
params["Season"] = season
if episode is not None:
params["Episode"] = episode
session = await self._get_session()
start = time.monotonic()
try:
async with session.get(self.BASE_URL, params=params) as resp:
elapsed_ms = int((time.monotonic() - start) * 1000)
if resp.status != 200:
self._track(False, title, elapsed_ms)
logger.error("OMDb HTTP %s for '%s'", resp.status, title)
return None
data = await resp.json()
if data.get("Response") != "True":
self._track(False, title, elapsed_ms)
logger.warning("OMDb error for '%s': %s", title, data.get("Error"))
return None
self._track(True, title, elapsed_ms)
# Validate year match if year was specified
if year:
returned_year = data.get("Year", "")
# Handle year ranges like "2020-2023" for series
if "-" in returned_year:
returned_year = returned_year.split("-")[0]
if returned_year and returned_year != year:
                            logger.warning(
                                "Year mismatch for '%s': requested %s, got %s ('%s'). Rejecting result.",
                                title, year, returned_year, data.get("Title"),
                            )
return None
return self._parse_response(data)
except asyncio.TimeoutError:
logger.error("OMDb timeout for '%s'", title)
return None
except aiohttp.ClientError as e:
logger.error("OMDb network error for '%s': %s", title, e)
return None
# -----------------------------
# Helpers
# -----------------------------
def _parse_response(self, data: dict) -> dict:
rt_rating = "N/A"
for r in data.get("Ratings", []):
if r.get("Source") == "Rotten Tomatoes":
rt_rating = r.get("Value")
break
return {
"plot": data.get("Plot", "No plot available"),
"rotten_tomatoes": rt_rating,
"title": data.get("Title"),
"year": data.get("Year"),
"media_type": data.get("Type"),
"imdb_rating": data.get("imdbRating", "N/A"),
"runtime": data.get("Runtime", "N/A"),
# Additional metadata fields
"director": data.get("Director", "N/A"),
"actors": data.get("Actors", "N/A"),
"released": data.get("Released", "N/A"),
"genre": data.get("Genre", "N/A"),
}
def _track(self, success: bool, title: str, response_time_ms: int):
if not self.db_manager:
return
self.db_manager.track_api_call(
provider="omdb",
endpoint=f"/?t={title}",
success=success,
response_time_ms=response_time_ms,
)
@staticmethod
def _make_cache_key(title, media_type, year, season, episode) -> str:
return f"{title.lower()}|{media_type}|{year}|{season}|{episode}"
File diff suppressed because it is too large.
+321
@@ -0,0 +1,321 @@
"""
TMDb API client - async movie and TV series metadata fetching
"""
import asyncio
import aiohttp
import logging
import time
logging.basicConfig(level=logging.INFO)
class TMDbClient:
"""Async client for The Movie Database (TMDb) API"""
def __init__(self, api_key, db_manager=None):
self.api_key = api_key
self.base_url = "https://api.themoviedb.org/3"
self.semaphore = asyncio.Semaphore(5) # Limit concurrent requests
self.db_manager = db_manager
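        # Note: each request below opens a fresh aiohttp session; a shared
        # session (as in the OMDb client) would be cheaper for large batches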
async def search_movie(self, title, year=None):
"""
Search for a movie by title
Args:
title: Movie title to search
year: Optional year to narrow search
Returns:
dict: Movie data or None if not found
"""
async with self.semaphore:
url = f"{self.base_url}/search/movie"
params = {
"api_key": self.api_key,
"query": title
}
if year:
params["year"] = year
try:
start_time = time.time()
async with aiohttp.ClientSession() as session:
async with session.get(url, params=params) as response:
response_time_ms = int((time.time() - start_time) * 1000)
if response.status != 200:
logging.error(f"TMDb HTTP error {response.status} for movie '{title}'")
# Track failed API call
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/movie',
success=False,
response_time_ms=response_time_ms
)
return None
data = await response.json()
if data.get("results") and len(data["results"]) > 0:
# Track successful API call
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/movie',
success=True,
response_time_ms=response_time_ms
)
return data["results"][0] # Return first match
logging.warning(f"No TMDb results for movie '{title}'")
# Track failed API call (no results)
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/movie',
success=False,
response_time_ms=response_time_ms
)
return None
except Exception as e:
logging.error(f"Error searching TMDb for movie '{title}': {e}")
return None
async def search_tv(self, title, year=None):
"""
Search for a TV series by title
Args:
title: TV series title to search
year: Optional year to narrow search
Returns:
dict: TV series data or None if not found
"""
async with self.semaphore:
url = f"{self.base_url}/search/tv"
params = {
"api_key": self.api_key,
"query": title
}
if year:
params["first_air_date_year"] = year
try:
start_time = time.time()
async with aiohttp.ClientSession() as session:
async with session.get(url, params=params) as response:
response_time_ms = int((time.time() - start_time) * 1000)
if response.status != 200:
logging.error(f"TMDb HTTP error {response.status} for TV '{title}'")
# Track failed API call
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/tv',
success=False,
response_time_ms=response_time_ms
)
return None
data = await response.json()
if data.get("results") and len(data["results"]) > 0:
# Track successful API call
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/tv',
success=True,
response_time_ms=response_time_ms
)
return data["results"][0] # Return first match
logging.warning(f"No TMDb results for TV series '{title}'")
# Track failed API call (no results)
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/tv',
success=False,
response_time_ms=response_time_ms
)
return None
except Exception as e:
logging.error(f"Error searching TMDb for TV '{title}': {e}")
return None
async def get_movie_details(self, movie_id):
"""
Get detailed movie information
Args:
movie_id: TMDb movie ID
Returns:
dict: Detailed movie data
"""
async with self.semaphore:
url = f"{self.base_url}/movie/{movie_id}"
params = {"api_key": self.api_key}
try:
async with aiohttp.ClientSession() as session:
async with session.get(url, params=params) as response:
if response.status != 200:
logging.error(f"TMDb HTTP error {response.status} for movie ID {movie_id}")
return None
return await response.json()
except Exception as e:
logging.error(f"Error getting TMDb movie details for ID {movie_id}: {e}")
return None
async def get_tv_details(self, tv_id):
"""
Get detailed TV series information
Args:
tv_id: TMDb TV series ID
Returns:
dict: Detailed TV series data
"""
async with self.semaphore:
url = f"{self.base_url}/tv/{tv_id}"
params = {"api_key": self.api_key}
try:
async with aiohttp.ClientSession() as session:
async with session.get(url, params=params) as response:
if response.status != 200:
logging.error(f"TMDb HTTP error {response.status} for TV ID {tv_id}")
return None
return await response.json()
except Exception as e:
logging.error(f"Error getting TMDb TV details for ID {tv_id}: {e}")
return None
async def get_tv_season(self, tv_id, season_number):
"""
Get TV season information
Args:
tv_id: TMDb TV series ID
season_number: Season number
Returns:
dict: Season data including episodes
"""
async with self.semaphore:
url = f"{self.base_url}/tv/{tv_id}/season/{season_number}"
params = {"api_key": self.api_key}
try:
async with aiohttp.ClientSession() as session:
async with session.get(url, params=params) as response:
if response.status != 200:
logging.error(f"TMDb HTTP error {response.status} for TV {tv_id} season {season_number}")
return None
return await response.json()
except Exception as e:
logging.error(f"Error getting TMDb season data: {e}")
return None
async def fetch_summary(self, title, media_type="movie", year=None, season=None, episode=None):
"""
Fetch summary for movie or TV series
Args:
title: Title to search
media_type: "movie" or "tv"
year: Optional year
season: Optional season number (for TV)
episode: Optional episode number (for TV)
Returns:
dict: {plot, title, year, media_type, rating} or None if not found
"""
logging.info(f"Fetching TMDb summary for: {title} (type: {media_type})")
try:
if media_type == "tv":
# Search for TV series
search_result = await self.search_tv(title, year)
if not search_result:
return None
tv_id = search_result["id"]
# Get detailed TV info
tv_details = await self.get_tv_details(tv_id)
if not tv_details:
return None
plot = tv_details.get("overview", "No plot available")
# If specific season/episode requested, try to get that plot
if season is not None:
season_data = await self.get_tv_season(tv_id, season)
if season_data and episode is not None:
episodes = season_data.get("episodes", [])
for ep in episodes:
if ep.get("episode_number") == episode:
plot = ep.get("overview", plot)
break
# Get episode runtime (usually consistent across series)
runtime = "N/A"
episode_run_time = tv_details.get("episode_run_time", [])
if episode_run_time and len(episode_run_time) > 0:
runtime = f"{episode_run_time[0]} min"
return {
"plot": plot,
"title": tv_details.get("name", title),
"year": tv_details.get("first_air_date", "")[:4] if tv_details.get("first_air_date") else "N/A",
"media_type": "tv",
"rating": f"{tv_details.get('vote_average', 0):.1f}/10",
"runtime": runtime
}
else: # movie
# Search for movie
search_result = await self.search_movie(title, year)
if not search_result:
return None
movie_id = search_result["id"]
# Get detailed movie info
movie_details = await self.get_movie_details(movie_id)
if not movie_details:
return None
# Get runtime
runtime = "N/A"
if movie_details.get("runtime"):
runtime = f"{movie_details.get('runtime')} min"
return {
"plot": movie_details.get("overview", "No plot available"),
"title": movie_details.get("title", title),
"year": movie_details.get("release_date", "")[:4] if movie_details.get("release_date") else "N/A",
"media_type": "movie",
"rating": f"{movie_details.get('vote_average', 0):.1f}/10",
"runtime": runtime
}
except Exception as e:
logging.error(f"Error fetching TMDb summary for '{title}': {e}")
return None
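# Usage sketch. The client class name and constructor below are assumptions
# (a class named TMDbClient taking an api_key); adjust to the actual
# definitions. fetch_summary is a coroutine, so it needs an event loop:
#
#     import asyncio
#
#     async def demo():
#         client = TMDbClient(api_key="YOUR_TMDB_KEY")  # hypothetical name/signature
#         movie = await client.fetch_summary("Inception", media_type="movie", year="2010")
#         show = await client.fetch_summary("Breaking Bad", media_type="tv", season=1, episode=1)
#         print(movie, show, sep="\n")
#
#     asyncio.run(demo())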
+135
View File
@@ -0,0 +1,135 @@
"""
TVmaze API client - async TV metadata fetching
"""
import aiohttp
import logging
import re
import time
logger = logging.getLogger(__name__)
class TVMazeClient:
"""Async client for the TVmaze API (no API key required)."""
BASE_URL = "https://api.tvmaze.com"
def __init__(self, db_manager=None, timeout=15):
self.db_manager = db_manager
self._timeout = aiohttp.ClientTimeout(total=timeout)
async def fetch_summary(self, title, year=None, season=None, episode=None):
"""
Fetch summary for a TV series (optionally episode-specific).
Args:
title: Series title to search
year: Optional year to validate match
season: Optional season number
            episode: Optional episode number
        Returns:
            dict: {plot, title, year, media_type, imdb_rating, rotten_tomatoes, runtime},
            or None if the show could not be matched
        """
show = await self._fetch_show(title, year)
if not show:
return None
plot = self._strip_html(show.get("summary")) or "No plot available"
if season is not None and episode is not None:
episode_data = await self._fetch_episode(show.get("id"), season, episode)
if episode_data:
episode_summary = self._strip_html(episode_data.get("summary"))
if episode_summary:
plot = episode_summary
rating = show.get("rating", {}).get("average")
imdb_rating = f"{rating:.1f}" if isinstance(rating, (int, float)) else "N/A"
runtime_value = show.get("runtime") or show.get("averageRuntime")
runtime = f"{runtime_value} min" if runtime_value else "N/A"
premiered = show.get("premiered") or ""
year_value = premiered[:4] if premiered else "N/A"
return {
"plot": plot,
"title": show.get("name", title),
"year": year_value,
"media_type": "tv",
"imdb_rating": imdb_rating,
"rotten_tomatoes": "N/A",
"runtime": runtime,
}
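        # rotten_tomatoes is fixed to "N/A" above because TVmaze supplies no
        # Rotten Tomatoes data; the key is kept, presumably for schema parity
        # with the other metadata providers used by the app.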
async def _fetch_show(self, title, year):
params = {"q": title}
url = f"{self.BASE_URL}/singlesearch/shows"
start_time = time.time()
try:
async with aiohttp.ClientSession(timeout=self._timeout) as session:
async with session.get(url, params=params) as response:
response_time_ms = int((time.time() - start_time) * 1000)
if response.status != 200:
self._track(False, "/singlesearch/shows", response_time_ms)
logger.error("TVmaze HTTP %s for '%s'", response.status, title)
return None
data = await response.json()
self._track(True, "/singlesearch/shows", response_time_ms)
if year:
premiered = data.get("premiered") or ""
                        if premiered and premiered[:4] != str(year):
logger.warning(
"TVmaze year mismatch for '%s': requested %s, got %s",
title,
year,
premiered[:4],
)
return None
return data
except Exception as e:
logger.error("TVmaze error for '%s': %s", title, e)
return None
async def _fetch_episode(self, show_id, season, episode):
if not show_id:
return None
params = {"season": season, "number": episode}
url = f"{self.BASE_URL}/shows/{show_id}/episodebynumber"
start_time = time.time()
try:
async with aiohttp.ClientSession(timeout=self._timeout) as session:
async with session.get(url, params=params) as response:
response_time_ms = int((time.time() - start_time) * 1000)
if response.status != 200:
self._track(False, "/shows/{id}/episodebynumber", response_time_ms)
return None
data = await response.json()
self._track(True, "/shows/{id}/episodebynumber", response_time_ms)
return data
except Exception as e:
logger.error("TVmaze episode error for show %s: %s", show_id, e)
return None
def _track(self, success, endpoint, response_time_ms):
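        # Best-effort usage telemetry: forwards to db_manager.track_api_call,
        # and silently no-ops when the client was created without a db_manager.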
if not self.db_manager:
return
self.db_manager.track_api_call(
provider="tvmaze",
endpoint=endpoint,
success=success,
response_time_ms=response_time_ms,
)
@staticmethod
def _strip_html(text):
if not text:
return ""
return re.sub(r"<[^>]+>", "", text).strip()
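# Usage sketch (runs a live request against api.tvmaze.com; no API key needed):
#
#     import asyncio
#
#     async def demo():
#         client = TVMazeClient()  # optionally pass a db_manager for call tracking
#         result = await client.fetch_summary("Breaking Bad", season=1, episode=1)
#         if result:
#             print(result["title"], result["year"], result["plot"][:80])
#
#     asyncio.run(demo())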
+13
View File
@@ -0,0 +1,13 @@
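# Install all dependencies with: pip install -r requirements.txt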
# Core dependencies
flask>=3.0.0
flask-cors>=4.0.0
aiohttp>=3.9.0
# Database
sqlalchemy>=2.0.0
# Environment
python-dotenv>=1.0.0
# Production WSGI server
gunicorn>=21.0.0
Binary file not shown.
+220
View File
@@ -0,0 +1,220 @@
"""
Test to verify zero timing drift guarantee for subtitle processing.
This test demonstrates that after injecting plot metadata:
1. All original subtitle timestamps remain byte-for-byte identical
2. First dialogue text appears at exactly the same timestamp
3. No subtitle blocks are renumbered incorrectly
"""
import sys
import io
# Fix Windows console encoding issues
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
from core.subtitle_processor import parse_srt, build_intro_blocks, strip_existing_plot_blocks, format_srt
# Test SRT content with first subtitle at 10 seconds
test_srt = """1
00:00:10,000 --> 00:00:12,500
Hello sir
2
00:00:13,000 --> 00:00:15,000
How are you today?
3
00:00:16,000 --> 00:00:18,500
I'm doing great, thanks!
"""
# Mock movie data
mock_movie = {
"title": "Test Movie",
"year": "2025",
"imdb_rating": "8.5",
"runtime": "120 min",
"plot": "A thrilling story about testing subtitle timing preservation in automated systems."
}
def test_timing_preservation():
"""Test that original subtitle timing is preserved exactly"""
# Parse original
original_blocks = parse_srt(test_srt)
print("=" * 60)
print("ORIGINAL SUBTITLES")
print("=" * 60)
for block in original_blocks:
print(f"Block {block.index}: {block.start_time}ms - {block.end_time}ms")
text_preview = block.text.replace("\n", " ")[:50]
print(f" Text: {text_preview}...")
# Build intro blocks that fit before first subtitle (10,000 ms)
intro_blocks = build_intro_blocks(
mock_movie,
mock_movie["plot"],
first_subtitle_start_ms=original_blocks[0].start_time,
min_safe_gap_ms=1000
)
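    # min_safe_gap_ms=1000 presumably reserves at least a 1 s buffer before the
    # first dialogue cue; the actual gap is asserted explicitly further down.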
print("\n" + "=" * 60)
print("INJECTED PLOT BLOCKS")
print("=" * 60)
for block in intro_blocks:
print(f"Block: {block.start_time}ms - {block.end_time}ms")
text_preview = block.text.encode('ascii', 'replace').decode('ascii')[:50]
print(f" Text: {text_preview}...")
# Combine and renumber (like the real processor does)
final = intro_blocks + original_blocks
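    # type(block)(...) rebuilds each block via its own class (e.g. a dataclass or
    # namedtuple), assigning fresh 1-based indices without touching timing or text.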
renumbered = [
type(block)(
index=i + 1,
start_time=block.start_time,
end_time=block.end_time,
text=block.text
)
for i, block in enumerate(final)
]
print("\n" + "=" * 60)
print("FINAL OUTPUT (AFTER INJECTION)")
print("=" * 60)
for block in renumbered:
print(f"Block {block.index}: {block.start_time}ms - {block.end_time}ms")
text_preview = block.text.replace("\n", " ")[:50]
print(f" Text: {text_preview}...")
# Verify timing preservation
print("\n" + "=" * 60)
print("VERIFICATION")
print("=" * 60)
# Find first dialogue subtitle in final output (skip intro blocks)
dialogue_blocks = [b for b in renumbered if b.start_time >= original_blocks[0].start_time]
assert len(dialogue_blocks) == len(original_blocks), \
f"Block count mismatch: {len(dialogue_blocks)} vs {len(original_blocks)}"
for i, (original, final_block) in enumerate(zip(original_blocks, dialogue_blocks)):
print(f"\nDialogue Block {i+1}:")
print(f" Original timing: {original.start_time}ms - {original.end_time}ms")
print(f" Final timing: {final_block.start_time}ms - {final_block.end_time}ms")
print(f" Text match: {original.text == final_block.text}")
assert original.start_time == final_block.start_time, \
f"Start time changed! {original.start_time} != {final_block.start_time}"
assert original.end_time == final_block.end_time, \
f"End time changed! {original.end_time} != {final_block.end_time}"
assert original.text == final_block.text, \
f"Text changed!"
# Verify intro blocks don't overlap with first subtitle
last_intro_block = intro_blocks[-1]
first_dialogue_start = original_blocks[0].start_time
print(f"\nGap between plot and dialogue:")
print(f" Last plot block ends: {last_intro_block.end_time}ms")
print(f" First dialogue starts: {first_dialogue_start}ms")
print(f" Gap: {first_dialogue_start - last_intro_block.end_time}ms")
assert last_intro_block.end_time < first_dialogue_start, \
"Plot blocks overlap with dialogue!"
print("\n" + "=" * 60)
print("✅ ALL TESTS PASSED - ZERO TIMING DRIFT CONFIRMED")
print("=" * 60)
def test_edge_case_early_subtitle():
"""Test case where first subtitle starts very early (1 second)"""
early_srt = """1
00:00:01,000 --> 00:00:03,000
Early dialogue
2
00:00:04,000 --> 00:00:06,000
More early dialogue
"""
original_blocks = parse_srt(early_srt)
print("\n" + "=" * 60)
print("EDGE CASE: EARLY SUBTITLE AT 1 SECOND")
print("=" * 60)
intro_blocks = build_intro_blocks(
mock_movie,
mock_movie["plot"],
first_subtitle_start_ms=original_blocks[0].start_time,
min_safe_gap_ms=1000
)
print(f"First dialogue at: {original_blocks[0].start_time}ms")
print(f"Number of intro blocks: {len(intro_blocks)}")
for block in intro_blocks:
print(f" Plot block: {block.start_time}ms - {block.end_time}ms")
# Should use zero-duration blocks since not enough time
if original_blocks[0].start_time < 2000:
assert all(b.start_time == 0 and b.end_time == 0 for b in intro_blocks), \
"Should use zero-duration blocks for very early subtitles"
print("✅ Correctly using zero-duration blocks")
print("=" * 60)
def test_idempotency():
"""Test that running the operation twice doesn't duplicate plot blocks"""
print("\n" + "=" * 60)
print("IDEMPOTENCY TEST")
print("=" * 60)
# First pass: add plot blocks
original_blocks = parse_srt(test_srt)
intro_blocks = build_intro_blocks(
mock_movie,
mock_movie["plot"],
first_subtitle_start_ms=original_blocks[0].start_time,
min_safe_gap_ms=1000
)
first_pass = intro_blocks + original_blocks
first_pass_srt = format_srt([
type(b)(index=i+1, start_time=b.start_time, end_time=b.end_time, text=b.text)
for i, b in enumerate(first_pass)
])
print(f"After first pass: {len(first_pass)} blocks")
# Second pass: should strip existing plot blocks
second_pass_parsed = parse_srt(first_pass_srt)
print(f"Parsed second pass: {len(second_pass_parsed)} blocks")
for i, block in enumerate(second_pass_parsed):
text_preview = block.text.replace("\n", " ")[:60]
print(f" Block {i+1}: {block.start_time}-{block.end_time}ms | {text_preview}")
second_pass_cleaned = strip_existing_plot_blocks(second_pass_parsed)
print(f"\nAfter stripping plot blocks: {len(second_pass_cleaned)} blocks")
for i, block in enumerate(second_pass_cleaned):
text_preview = block.text.replace("\n", " ")[:60]
print(f" Block {i+1}: {block.start_time}-{block.end_time}ms | {text_preview}")
# Should have same number as original dialogue
assert len(second_pass_cleaned) == len(original_blocks), \
f"Failed to strip plot blocks! Expected {len(original_blocks)}, got {len(second_pass_cleaned)}"
print("\n✅ Idempotency verified - plot blocks correctly stripped")
print("=" * 60)
if __name__ == "__main__":
test_timing_preservation()
test_edge_case_early_subtitle()
test_idempotency()
print("\n🎉 All tests passed! Zero timing drift guaranteed.")