Initial commit
@@ -0,0 +1,67 @@
"""
Configuration manager - handles settings persistence
"""
import json
import logging
from pathlib import Path

logging.basicConfig(level=logging.INFO)


class ConfigManager:
    """Manages application settings with JSON persistence"""

    def __init__(self, file_path="settings.json"):
        self.file_path = Path(file_path)
        self.settings = {
            "api_key": "",
            "default_directory": "",
            "duration": 40,
            "cleaning_patterns": [
                r"\(\d{4}\)",
                r"\[\d{4}\]",
                r"\.\d{4}\.",
                r"\.(19|20)\d{2}",
                r"\.[a-z]{2,3}\b",
                r"\.(?:720p|1080p|2160p|480p|HDRip|BRRip|BluRay|WEBRip|WEB-DL)",
                r"\.(?:x264|x265|HEVC|AAC|AC3|DTS)"
            ]
        }
        self.load_settings()

    def load_settings(self):
        """Load settings from disk"""
        if self.file_path.exists():
            try:
                with open(self.file_path, "r") as f:
                    self.settings.update(json.load(f))
                logging.info("Settings loaded successfully")
            except Exception as e:
                logging.error(f"Error loading settings: {e}")

    def save_settings(self):
        """Save settings to disk"""
        try:
            with open(self.file_path, "w") as f:
                json.dump(self.settings, f, indent=2)
            logging.info("Settings saved successfully")
        except Exception as e:
            logging.error(f"Error saving settings: {e}")

    def get(self, key, default=None):
        """Get a setting value"""
        return self.settings.get(key, default)

    def set(self, key, value):
        """Set a setting value"""
        self.settings[key] = value
        logging.info(f"Setting updated: {key}")

    def get_all(self):
        """Get all settings"""
        return self.settings.copy()

    def update_multiple(self, updates):
        """Update multiple settings at once"""
        self.settings.update(updates)
        logging.info(f"Updated {len(updates)} settings")
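A minimal usage sketch for the class above (editor's illustration, not part of the committed file); the "settings.json" path and the "duration" key are the defaults shown in __init__:

# Sketch: change one setting, persist it, and read it back.
config = ConfigManager("settings.json")
config.set("duration", 60)      # overrides the default of 40
config.save_settings()
print(ConfigManager("settings.json").get("duration"))  # -> 60 after the save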
@@ -0,0 +1,828 @@
"""
Database layer using SQLAlchemy with SQLite
Handles persistent storage for settings, runs, and history
"""
from datetime import datetime
from pathlib import Path
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, Float, Text, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship, scoped_session
import json
import logging

logger = logging.getLogger(__name__)

Base = declarative_base()


class Settings(Base):
    """Settings table - stores application configuration"""
    __tablename__ = 'settings'

    id = Column(Integer, primary_key=True)
    key = Column(String(100), unique=True, nullable=False, index=True)
    value = Column(Text, nullable=False)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    def __repr__(self):
        return f"<Settings(key='{self.key}', value='{self.value}')>"


class ProcessingRun(Base):
    """Processing runs table - stores each batch processing session"""
    __tablename__ = 'processing_runs'

    id = Column(Integer, primary_key=True)
    started_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
    completed_at = Column(DateTime)
    total_files = Column(Integer, default=0)
    successful_files = Column(Integer, default=0)
    failed_files = Column(Integer, default=0)
    duration_seconds = Column(Float)
    status = Column(String(50), default='in_progress')  # in_progress, completed, failed

    # Relationship to file results
    file_results = relationship("FileResult", back_populates="run", cascade="all, delete-orphan")

    def __repr__(self):
        return f"<ProcessingRun(id={self.id}, started_at='{self.started_at}', status='{self.status}')>"


class FileResult(Base):
    """File results table - stores individual file processing results"""
    __tablename__ = 'file_results'

    id = Column(Integer, primary_key=True)
    run_id = Column(Integer, ForeignKey('processing_runs.id'), nullable=False, index=True)
    file_path = Column(String(500), nullable=False, index=True)
    file_name = Column(String(255), nullable=False)
    success = Column(Boolean, default=False)
    status = Column(String(100))
    summary = Column(Text)
    error_message = Column(Text)
    processed_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    duration = Column(Integer)  # subtitle duration in seconds

    # Relationship back to run
    run = relationship("ProcessingRun", back_populates="file_results")

    def __repr__(self):
        return f"<FileResult(id={self.id}, file_name='{self.file_name}', success={self.success})>"


class ScanHistory(Base):
    """Scan history table - stores directory scan history"""
    __tablename__ = 'scan_history'

    id = Column(Integer, primary_key=True)
    directory = Column(String(500), nullable=False, index=True)
    scanned_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
    files_found = Column(Integer, default=0)
    files_with_plot = Column(Integer, default=0)
    scan_duration_ms = Column(Integer)

    def __repr__(self):
        return f"<ScanHistory(id={self.id}, directory='{self.directory}', files_found={self.files_found})>"


class ScheduledScan(Base):
    """Scheduled scans table - stores scheduled scan jobs and results"""
    __tablename__ = 'scheduled_scans'

    id = Column(Integer, primary_key=True)
    directory = Column(String(500), nullable=False, index=True)
    scheduled_for = Column(DateTime, nullable=False, index=True)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    started_at = Column(DateTime)
    completed_at = Column(DateTime)
    status = Column(String(50), default='scheduled', index=True)  # scheduled, running, completed, cancelled, failed
    files_found = Column(Integer, default=0)
    files_with_plot = Column(Integer, default=0)
    scan_duration_ms = Column(Integer)
    error_message = Column(Text)

    def __repr__(self):
        return f"<ScheduledScan(id={self.id}, directory='{self.directory}', status='{self.status}')>"


class ApiUsage(Base):
    """API usage tracking table - monitors API calls per integration"""
    __tablename__ = 'api_usage'

    id = Column(Integer, primary_key=True)
    provider = Column(String(50), nullable=False, index=True)  # omdb, tmdb, tvmaze
    endpoint = Column(String(200))  # Specific endpoint called
    timestamp = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
    success = Column(Boolean, default=True)
    response_time_ms = Column(Integer)
    call_count = Column(Integer, default=1)  # Number of API calls in this batch

    def __repr__(self):
        return f"<ApiUsage(id={self.id}, provider='{self.provider}', calls={self.call_count}, timestamp='{self.timestamp}')>"


class SuggestedMatch(Base):
    """Suggested matches table - stores auto-matched titles for files"""
    __tablename__ = 'suggested_matches'

    id = Column(Integer, primary_key=True)
    file_path = Column(String(500), nullable=False, unique=True, index=True)
    file_name = Column(String(255), nullable=False)
    matched_title = Column(String(255), nullable=False)
    matched_year = Column(String(10))
    matched_imdb_id = Column(String(50))
    match_data = Column(Text)  # JSON blob with full match data
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    def __repr__(self):
        return f"<SuggestedMatch(id={self.id}, file_name='{self.file_name}', matched_title='{self.matched_title}')>"


class DatabaseManager:
    """Manages database connections and operations"""

    def __init__(self, db_path="sublogue.db"):
        """Initialize database connection"""
        self.db_path = Path(db_path)
        self.engine = create_engine(f'sqlite:///{self.db_path}', echo=False)
        self.Session = scoped_session(sessionmaker(bind=self.engine))

        # Create tables if they don't exist
        Base.metadata.create_all(self.engine)
        logger.info(f"Database initialized at {self.db_path}")

    def get_session(self):
        """Get a new database session"""
        return self.Session()

    def close_session(self):
        """Close the session"""
        self.Session.remove()

    # ============ SETTINGS OPERATIONS ============

    def get_setting(self, key, default=None):
        """Get a setting value"""
        session = self.get_session()
        try:
            setting = session.query(Settings).filter_by(key=key).first()
            if setting:
                try:
                    return json.loads(setting.value)
                except json.JSONDecodeError:
                    return setting.value
            return default
        finally:
            session.close()

    def set_setting(self, key, value):
        """Set a setting value"""
        session = self.get_session()
        try:
            setting = session.query(Settings).filter_by(key=key).first()

            # Convert value to JSON string
            json_value = json.dumps(value) if not isinstance(value, str) else value

            if setting:
                setting.value = json_value
                setting.updated_at = datetime.utcnow()
            else:
                setting = Settings(key=key, value=json_value)
                session.add(setting)

            session.commit()
            logger.info(f"Setting updated: {key}")
        except Exception as e:
            session.rollback()
            logger.error(f"Error setting value: {e}")
            raise
        finally:
            session.close()

    def get_all_settings(self):
        """Get all settings as a dictionary"""
        session = self.get_session()
        try:
            settings = session.query(Settings).all()
            result = {}
            for setting in settings:
                try:
                    result[setting.key] = json.loads(setting.value)
                except json.JSONDecodeError:
                    result[setting.key] = setting.value
            return result
        finally:
            session.close()

    def update_settings(self, settings_dict):
        """Update multiple settings at once"""
        for key, value in settings_dict.items():
            self.set_setting(key, value)

    # ============ PROCESSING RUN OPERATIONS ============

    def create_run(self, total_files):
        """Create a new processing run"""
        session = self.get_session()
        try:
            run = ProcessingRun(
                total_files=total_files,
                status='in_progress'
            )
            session.add(run)
            session.commit()
            run_id = run.id
            logger.info(f"Created processing run {run_id}")
            return run_id
        except Exception as e:
            session.rollback()
            logger.error(f"Error creating run: {e}")
            raise
        finally:
            session.close()

    def complete_run(self, run_id, successful_files, failed_files):
        """Mark a run as completed"""
        session = self.get_session()
        try:
            run = session.query(ProcessingRun).get(run_id)
            if run:
                run.completed_at = datetime.utcnow()
                run.successful_files = successful_files
                run.failed_files = failed_files
                run.status = 'completed'

                if run.started_at:
                    duration = (run.completed_at - run.started_at).total_seconds()
                    run.duration_seconds = duration

                session.commit()
                logger.info(f"Completed run {run_id}")
        except Exception as e:
            session.rollback()
            logger.error(f"Error completing run: {e}")
            raise
        finally:
            session.close()

    def add_file_result(self, run_id, file_path, success, status, summary="", error_message="", duration=40):
        """Add a file processing result"""
        session = self.get_session()
        try:
            result = FileResult(
                run_id=run_id,
                file_path=file_path,
                file_name=Path(file_path).name,
                success=success,
                status=status,
                summary=summary,
                error_message=error_message,
                duration=duration
            )
            session.add(result)
            session.commit()
        except Exception as e:
            session.rollback()
            logger.error(f"Error adding file result: {e}")
            raise
        finally:
            session.close()

    def get_run_history(self, limit=50):
        """Get processing run history"""
        session = self.get_session()
        try:
            runs = session.query(ProcessingRun).order_by(
                ProcessingRun.started_at.desc()
            ).limit(limit).all()

            result = []
            for run in runs:
                result.append({
                    'id': run.id,
                    'started_at': run.started_at.isoformat() if run.started_at else None,
                    'completed_at': run.completed_at.isoformat() if run.completed_at else None,
                    'total_files': run.total_files,
                    'successful_files': run.successful_files,
                    'failed_files': run.failed_files,
                    'duration_seconds': run.duration_seconds,
                    'status': run.status
                })
            return result
        finally:
            session.close()

    def get_run_details(self, run_id):
        """Get detailed information about a specific run"""
        session = self.get_session()
        try:
            run = session.query(ProcessingRun).get(run_id)
            if not run:
                return None

            file_results = []
            for result in run.file_results:
                file_results.append({
                    'id': result.id,
                    'file_path': result.file_path,
                    'file_name': result.file_name,
                    'success': result.success,
                    'status': result.status,
                    'summary': result.summary,
                    'error_message': result.error_message,
                    'processed_at': result.processed_at.isoformat() if result.processed_at else None,
                    'duration': result.duration
                })

            return {
                'id': run.id,
                'started_at': run.started_at.isoformat() if run.started_at else None,
                'completed_at': run.completed_at.isoformat() if run.completed_at else None,
                'total_files': run.total_files,
                'successful_files': run.successful_files,
                'failed_files': run.failed_files,
                'duration_seconds': run.duration_seconds,
                'status': run.status,
                'file_results': file_results
            }
        finally:
            session.close()

    # ============ SCAN HISTORY OPERATIONS ============

    def add_scan_history(self, directory, files_found, files_with_plot, scan_duration_ms):
        """Add a scan history entry"""
        session = self.get_session()
        try:
            scan = ScanHistory(
                directory=directory,
                files_found=files_found,
                files_with_plot=files_with_plot,
                scan_duration_ms=scan_duration_ms
            )
            session.add(scan)
            session.commit()
            logger.info(f"Scan history saved for {directory}")
        except Exception as e:
            session.rollback()
            logger.error(f"Error saving scan history: {e}")
            raise
        finally:
            session.close()

    def get_scan_history(self, limit=50):
        """Get scan history"""
        session = self.get_session()
        try:
            scans = session.query(ScanHistory).order_by(
                ScanHistory.scanned_at.desc()
            ).limit(limit).all()

            result = []
            for scan in scans:
                result.append({
                    'id': scan.id,
                    'directory': scan.directory,
                    'scanned_at': scan.scanned_at.isoformat() if scan.scanned_at else None,
                    'files_found': scan.files_found,
                    'files_with_plot': scan.files_with_plot,
                    'scan_duration_ms': scan.scan_duration_ms
                })
            return result
        finally:
            session.close()

    # ============ SCHEDULED SCAN OPERATIONS ============

    def create_scheduled_scan(self, directory, scheduled_for):
        """Create a scheduled scan entry"""
        session = self.get_session()
        try:
            scheduled = ScheduledScan(
                directory=directory,
                scheduled_for=scheduled_for,
                status='scheduled'
            )
            session.add(scheduled)
            session.commit()
            return scheduled.id
        except Exception as e:
            session.rollback()
            logger.error(f"Error creating scheduled scan: {e}")
            raise
        finally:
            session.close()

    def get_scheduled_scans(self, limit=50, status=None):
        """Get scheduled scans"""
        session = self.get_session()
        try:
            query = session.query(ScheduledScan)
            if status:
                query = query.filter_by(status=status)
            scans = query.order_by(ScheduledScan.scheduled_for.desc()).limit(limit).all()

            result = []
            for scan in scans:
                result.append({
                    'id': scan.id,
                    'directory': scan.directory,
                    'scheduled_for': scan.scheduled_for.isoformat() if scan.scheduled_for else None,
                    'created_at': scan.created_at.isoformat() if scan.created_at else None,
                    'started_at': scan.started_at.isoformat() if scan.started_at else None,
                    'completed_at': scan.completed_at.isoformat() if scan.completed_at else None,
                    'status': scan.status,
                    'files_found': scan.files_found,
                    'files_with_plot': scan.files_with_plot,
                    'scan_duration_ms': scan.scan_duration_ms,
                    'error_message': scan.error_message
                })
            return result
        finally:
            session.close()

    def get_scheduled_scan(self, scan_id):
        """Get a single scheduled scan by id"""
        session = self.get_session()
        try:
            scan = session.query(ScheduledScan).get(scan_id)
            if not scan:
                return None
            return {
                'id': scan.id,
                'directory': scan.directory,
                'scheduled_for': scan.scheduled_for.isoformat() if scan.scheduled_for else None,
                'created_at': scan.created_at.isoformat() if scan.created_at else None,
                'started_at': scan.started_at.isoformat() if scan.started_at else None,
                'completed_at': scan.completed_at.isoformat() if scan.completed_at else None,
                'status': scan.status,
                'files_found': scan.files_found,
                'files_with_plot': scan.files_with_plot,
                'scan_duration_ms': scan.scan_duration_ms,
                'error_message': scan.error_message
            }
        finally:
            session.close()

    def get_due_scheduled_scans(self, now):
        """Get scheduled scans that are due to run"""
        session = self.get_session()
        try:
            scans = session.query(ScheduledScan).filter(
                ScheduledScan.status == 'scheduled',
                ScheduledScan.scheduled_for <= now
            ).order_by(ScheduledScan.scheduled_for.asc()).all()
            return [scan.id for scan in scans]
        finally:
            session.close()

    def mark_scheduled_scan_running(self, scan_id, started_at=None):
        """Mark a scheduled scan as running"""
        session = self.get_session()
        try:
            scan = session.query(ScheduledScan).get(scan_id)
            if not scan or scan.status != 'scheduled':
                return False
            scan.status = 'running'
            scan.started_at = started_at or datetime.utcnow()
            session.commit()
            return True
        except Exception as e:
            session.rollback()
            logger.error(f"Error marking scheduled scan running: {e}")
            return False
        finally:
            session.close()

    def complete_scheduled_scan(self, scan_id, files_found, files_with_plot, scan_duration_ms, completed_at=None):
        """Mark a scheduled scan as completed"""
        session = self.get_session()
        try:
            scan = session.query(ScheduledScan).get(scan_id)
            if not scan:
                return False
            scan.status = 'completed'
            scan.completed_at = completed_at or datetime.utcnow()
            scan.files_found = files_found
            scan.files_with_plot = files_with_plot
            scan.scan_duration_ms = scan_duration_ms
            session.commit()
            return True
        except Exception as e:
            session.rollback()
            logger.error(f"Error completing scheduled scan: {e}")
            return False
        finally:
            session.close()

    def fail_scheduled_scan(self, scan_id, error_message, completed_at=None):
        """Mark a scheduled scan as failed"""
        session = self.get_session()
        try:
            scan = session.query(ScheduledScan).get(scan_id)
            if not scan:
                return False
            scan.status = 'failed'
            scan.completed_at = completed_at or datetime.utcnow()
            scan.error_message = error_message
            session.commit()
            return True
        except Exception as e:
            session.rollback()
            logger.error(f"Error failing scheduled scan: {e}")
            return False
        finally:
            session.close()

    def cancel_scheduled_scan(self, scan_id):
        """Cancel a scheduled scan"""
        session = self.get_session()
        try:
            scan = session.query(ScheduledScan).get(scan_id)
            if not scan or scan.status != 'scheduled':
                return False
            scan.status = 'cancelled'
            session.commit()
            return True
        except Exception as e:
            session.rollback()
            logger.error(f"Error cancelling scheduled scan: {e}")
            return False
        finally:
            session.close()

    # ============ API USAGE OPERATIONS ============

    def track_api_call(self, provider, endpoint=None, success=True, response_time_ms=None, call_count=1):
        """Track API call(s) for usage monitoring

        Args:
            provider: API provider name (omdb, tmdb, tvmaze)
            endpoint: Specific endpoint called
            success: Whether the call(s) succeeded
            response_time_ms: Total response time in milliseconds
            call_count: Number of API calls made (for batched operations)
        """
        session = self.get_session()
        try:
            usage = ApiUsage(
                provider=provider,
                endpoint=endpoint,
                success=success,
                response_time_ms=response_time_ms,
                call_count=call_count
            )
            session.add(usage)
            session.commit()
            logger.debug(f"Tracked {call_count} API call(s) to {provider}")
        except Exception as e:
            session.rollback()
            logger.error(f"Error tracking API call: {e}")
        finally:
            session.close()

    def get_api_usage_24h(self, provider):
        """Get API call count for the last 24 hours (sums call_count field)"""
        session = self.get_session()
        try:
            from datetime import timedelta
            from sqlalchemy import func

            cutoff_time = datetime.utcnow() - timedelta(hours=24)

            # Sum the call_count column to get total API calls
            result = session.query(func.coalesce(func.sum(ApiUsage.call_count), 0)).filter(
                ApiUsage.provider == provider,
                ApiUsage.timestamp >= cutoff_time
            ).scalar()

            return result or 0
        finally:
            session.close()

    def check_api_limit(self, provider, limit=1000):
        """Check if API usage is under the limit"""
        current_usage = self.get_api_usage_24h(provider)
        return {
            'under_limit': current_usage < limit,
            'current_usage': current_usage,
            'limit': limit,
            'remaining': max(0, limit - current_usage)
        }

    def get_usage_stats(self, provider):
        """Get detailed usage statistics for a provider"""
        session = self.get_session()
        try:
            from datetime import timedelta
            cutoff_time = datetime.utcnow() - timedelta(hours=24)

            # Get 24h usage records
            usage_24h = session.query(ApiUsage).filter(
                ApiUsage.provider == provider,
                ApiUsage.timestamp >= cutoff_time
            ).all()

            # Sum call_count for accurate totals
            total_calls = sum(u.call_count or 1 for u in usage_24h)
            successful_calls = sum(u.call_count or 1 for u in usage_24h if u.success)
            failed_calls = total_calls - successful_calls

            # Calculate average response time (weighted by call count)
            weighted_times = [(u.response_time_ms or 0) * (u.call_count or 1) for u in usage_24h if u.response_time_ms is not None]
            avg_response_time = sum(weighted_times) / total_calls if total_calls > 0 and weighted_times else 0

            # Find oldest call in 24h window to calculate reset time
            oldest_call = min([u.timestamp for u in usage_24h]) if usage_24h else datetime.utcnow()
            reset_time = oldest_call + timedelta(hours=24)

            return {
                'provider': provider,
                'total_calls_24h': total_calls,
                'successful_calls': successful_calls,
                'failed_calls': failed_calls,
                'avg_response_time_ms': round(avg_response_time, 2),
                'reset_time': reset_time.isoformat(),
                'limit': 1000,
                'remaining': max(0, 1000 - total_calls)
            }
        finally:
            session.close()

    def get_all_usage_stats(self):
        """Get usage statistics for all providers"""
        providers = ['omdb', 'tmdb', 'tvmaze']
        return {provider: self.get_usage_stats(provider) for provider in providers}

    # ============ SUGGESTED MATCHES OPERATIONS ============

    def save_suggested_match(self, file_path, file_name, match_data):
        """Save or update a suggested match for a file"""
        session = self.get_session()
        try:
            # Check if match already exists
            existing = session.query(SuggestedMatch).filter_by(file_path=file_path).first()

            if existing:
                # Update existing match
                existing.matched_title = match_data.get('title', '')
                existing.matched_year = match_data.get('year', '')
                existing.matched_imdb_id = match_data.get('imdb_id', '')
                existing.match_data = json.dumps(match_data)
                existing.updated_at = datetime.utcnow()
            else:
                # Create new match
                new_match = SuggestedMatch(
                    file_path=file_path,
                    file_name=file_name,
                    matched_title=match_data.get('title', ''),
                    matched_year=match_data.get('year', ''),
                    matched_imdb_id=match_data.get('imdb_id', ''),
                    match_data=json.dumps(match_data)
                )
                session.add(new_match)

            session.commit()
            return True
        except Exception as e:
            session.rollback()
            logger.error(f"Error saving suggested match: {e}")
            return False
        finally:
            session.close()

    def get_suggested_match(self, file_path):
        """Get suggested match for a specific file"""
        session = self.get_session()
        try:
            match = session.query(SuggestedMatch).filter_by(file_path=file_path).first()
            if match:
                return {
                    'file_path': match.file_path,
                    'file_name': match.file_name,
                    'matched_title': match.matched_title,
                    'matched_year': match.matched_year,
                    'matched_imdb_id': match.matched_imdb_id,
                    'match_data': json.loads(match.match_data) if match.match_data else {},
                    'created_at': match.created_at.isoformat(),
                    'updated_at': match.updated_at.isoformat()
                }
            return None
        finally:
            session.close()

    def get_suggested_matches_for_directory(self, directory):
        """Get all suggested matches for files in a directory"""
        session = self.get_session()
        try:
            matches = session.query(SuggestedMatch).filter(
                SuggestedMatch.file_path.like(f"{directory}%")
            ).all()

            return {
                match.file_path: json.loads(match.match_data) if match.match_data else {}
                for match in matches
            }
        finally:
            session.close()

    def delete_suggested_match(self, file_path):
        """Delete a suggested match for a file"""
        session = self.get_session()
        try:
            match = session.query(SuggestedMatch).filter_by(file_path=file_path).first()
            if match:
                session.delete(match)
                session.commit()
                return True
            return False
        except Exception as e:
            session.rollback()
            logger.error(f"Error deleting suggested match: {e}")
            return False
        finally:
            session.close()

    def clear_all_suggested_matches(self):
        """Clear all suggested matches"""
        session = self.get_session()
        try:
            session.query(SuggestedMatch).delete()
            session.commit()
            return True
        except Exception as e:
            session.rollback()
            logger.error(f"Error clearing suggested matches: {e}")
            return False
        finally:
            session.close()

    # ============ MAINTENANCE OPERATIONS ============

    def clear_settings(self, keep_api_keys=False):
        """Clear all settings, optionally keeping API keys"""
        session = self.get_session()
        try:
            keep_values = {}
            if keep_api_keys:
                for key in ("omdb_api_key", "tmdb_api_key", "api_key"):
                    setting = session.query(Settings).filter_by(key=key).first()
                    if setting:
                        keep_values[key] = setting.value

            session.query(Settings).delete()
            session.commit()

            if keep_values:
                for key, value in keep_values.items():
                    session.add(Settings(key=key, value=value))
                session.commit()

            return True
        except Exception as e:
            session.rollback()
            logger.error(f"Error clearing settings: {e}")
            return False
        finally:
            session.close()

    def clear_history_and_logs(self):
        """Clear processing runs, scan history, scheduled scans, and API usage logs"""
        session = self.get_session()
        try:
            session.query(FileResult).delete()
            session.query(ProcessingRun).delete()
            session.query(ScanHistory).delete()
            session.query(ScheduledScan).delete()
            session.query(ApiUsage).delete()
            session.commit()
            return True
        except Exception as e:
            session.rollback()
            logger.error(f"Error clearing history and logs: {e}")
            return False
        finally:
            session.close()

    # ============ STATISTICS ============

    def get_statistics(self):
        """Get overall statistics"""
        session = self.get_session()
        try:
            total_runs = session.query(ProcessingRun).count()
            completed_runs = session.query(ProcessingRun).filter_by(status='completed').count()
            total_files_processed = session.query(FileResult).count()
            successful_files = session.query(FileResult).filter_by(success=True).count()

            return {
                'total_runs': total_runs,
                'completed_runs': completed_runs,
                'total_files_processed': total_files_processed,
                'successful_files': successful_files,
                'failed_files': total_files_processed - successful_files
            }
        finally:
            session.close()
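A short sketch of how the run-tracking methods above compose (editor's illustration, not part of the commit); the database path is the default and the file paths and counts are made up:

# Sketch: record a two-file processing batch and read back its status.
db = DatabaseManager("sublogue.db")
run_id = db.create_run(total_files=2)
db.add_file_result(run_id, "/media/example_a.srt", success=True, status="Processed")
db.add_file_result(run_id, "/media/example_b.srt", success=False, status="Failed",
                   error_message="no metadata match")
db.complete_run(run_id, successful_files=1, failed_files=1)
print(db.get_run_details(run_id)["status"])  # -> 'completed'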
@@ -0,0 +1,302 @@
import logging
import os
import re
from pathlib import Path
from typing import Generator, List, Dict

# ------------------------------------------------------------
# Logging configuration
# ------------------------------------------------------------

logger = logging.getLogger("FileScanner")
logger.setLevel(logging.INFO)  # Change to DEBUG for deep tracing

handler = logging.StreamHandler()
formatter = logging.Formatter(
    "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
)
handler.setFormatter(formatter)
logger.addHandler(handler)

# ------------------------------------------------------------
# Import subtitle parser
# ------------------------------------------------------------

import sys
sys.path.insert(0, str(Path(__file__).parent))
from subtitle_processor import parse_srt


class FileScanner:
    """
    Efficient, disk-friendly subtitle scanner.

    - Recursive (os.scandir-based)
    - Streams file reads
    - Batches results
    - Extensive logging for observability
    """

    SUPPORTED_EXTENSIONS = {".srt"}
    MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB
    PLOT_SCAN_LINES = 50
    DEFAULT_BATCH_SIZE = 100

    # --------------------------------------------------------
    # Public API
    # --------------------------------------------------------

    @classmethod
    def scan_directory(
        cls,
        directory_path: str | Path,
        batch_size: int = DEFAULT_BATCH_SIZE,
        follow_symlinks: bool = False,
    ) -> Generator[List[Dict], None, None]:
        """
        Recursively scan a directory tree for .srt files.
        Yields batches of metadata dictionaries.
        """
        root = Path(directory_path)

        logger.info("Starting subtitle scan")
        logger.info("Root directory : %s", root)
        logger.info("Batch size : %s", batch_size)
        logger.info("Follow symlinks : %s", follow_symlinks)

        if not root.exists():
            logger.error("Scan failed: path does not exist (%s)", root)
            raise ValueError(f"Directory does not exist: {directory_path}")

        if not root.is_dir():
            logger.error("Scan failed: not a directory (%s)", root)
            raise ValueError(f"Invalid directory: {directory_path}")

        batch: List[Dict] = []
        total_seen = 0
        total_srt = 0
        total_skipped = 0

        for file_path in cls._walk_files(root, follow_symlinks):
            total_seen += 1

            if file_path.suffix.lower() not in cls.SUPPORTED_EXTENSIONS:
                logger.debug("Ignoring non-subtitle file: %s", file_path)
                continue

            total_srt += 1
            logger.debug("Found subtitle file: %s", file_path)

            # --------------------------------------------
            # Stat / size guard
            # --------------------------------------------

            try:
                stat = file_path.stat()
            except OSError as e:
                total_skipped += 1
                logger.warning(
                    "Skipping unreadable file: %s (%s)",
                    file_path, e
                )
                continue

            if stat.st_size > cls.MAX_FILE_SIZE_BYTES:
                total_skipped += 1
                logger.warning(
                    "Skipping large subtitle file (%d bytes): %s",
                    stat.st_size, file_path
                )
                continue

            # --------------------------------------------
            # Plot detection
            # --------------------------------------------

            try:
                has_plot = cls._check_has_plot(file_path)
                logger.debug(
                    "Plot check for %s: %s",
                    file_path.name,
                    "FOUND" if has_plot else "NOT FOUND"
                )
            except Exception as e:
                total_skipped += 1
                logger.error(
                    "Plot scan failed for %s: %s",
                    file_path, e
                )
                continue

            metadata = {}

            if has_plot:
                try:
                    metadata = cls._extract_metadata(file_path)
                    logger.debug(
                        "Extracted metadata from %s: %s",
                        file_path.name,
                        {k: v for k, v in metadata.items() if v}
                    )
                except Exception as e:
                    logger.warning(
                        "Metadata extraction failed for %s: %s",
                        file_path.name, e
                    )

            batch.append({
                "path": str(file_path),
                "name": file_path.name,
                "has_plot": has_plot,
                "status": "Has Plot" if has_plot else "Not Loaded",
                "summary": metadata.get("summary", ""),
                "plot": metadata.get("summary", ""),
                "title": metadata.get("title"),
                "year": metadata.get("year"),
                "imdb_rating": metadata.get("imdb_rating"),
                "rating": metadata.get("imdb_rating"),
                "runtime": metadata.get("runtime"),
                "selected": False,
            })

            if len(batch) >= batch_size:
                logger.info(
                    "Yielding batch (%d items, %d total files scanned)",
                    len(batch),
                    total_seen
                )
                yield batch
                batch = []

        if batch:
            logger.info(
                "Yielding final batch (%d items)",
                len(batch)
            )
            yield batch

        logger.info("Subtitle scan completed")
        logger.info("Files visited : %d", total_seen)
        logger.info("Subtitle files found : %d", total_srt)
        logger.info("Files skipped : %d", total_skipped)

    # --------------------------------------------------------
    # Internal helpers
    # --------------------------------------------------------

    @staticmethod
    def _walk_files(root: Path, follow_symlinks: bool):
        """
        Fast iterative recursive directory walk using os.scandir.
        """
        logger.debug("Beginning recursive walk at %s", root)
        stack = [root]

        while stack:
            current = stack.pop()
            logger.debug("Scanning directory: %s", current)

            try:
                with os.scandir(current) as entries:
                    for entry in entries:
                        try:
                            if entry.is_dir(follow_symlinks=follow_symlinks):
                                stack.append(Path(entry.path))
                            elif entry.is_file():
                                yield Path(entry.path)
                        except OSError as e:
                            logger.debug(
                                "Skipping entry due to OS error: %s (%s)",
                                entry.path, e
                            )
            except OSError as e:
                logger.warning(
                    "Cannot access directory: %s (%s)",
                    current, e
                )

    @classmethod
    def _check_has_plot(cls, file_path: Path) -> bool:
        """
        Check first N lines for Sublogue signature.
        """
        logger.debug("Scanning for plot marker in %s", file_path.name)

        try:
            with file_path.open("r", encoding="utf-8", errors="ignore") as f:
                for i, line in enumerate(f):
                    if i >= cls.PLOT_SCAN_LINES:
                        break
                    if "generated by sublogue" in line.lower():
                        logger.debug(
                            "Plot marker found in %s (line %d)",
                            file_path.name, i + 1
                        )
                        return True
        except Exception as e:
            logger.error(
                "Error reading file during plot scan: %s (%s)",
                file_path, e
            )

        return False

    @classmethod
    def _extract_metadata(cls, file_path: Path) -> Dict:
        """
        Extract title, year, rating, runtime, and plot
        from Sublogue-generated subtitles.
        """
        logger.debug("Extracting metadata from %s", file_path.name)

        content = file_path.read_text(encoding="utf-8", errors="ignore")
        blocks = parse_srt(content)

        metadata = {
            "title": None,
            "year": None,
            "imdb_rating": None,
            "runtime": None,
            "summary": ""
        }

        if len(blocks) < 2:
            logger.debug("Not enough subtitle blocks for metadata extraction")
            return metadata

        # --------------------------------------------
        # Plot block (index 1)
        # --------------------------------------------

        plot_text = blocks[1].text
        plot_text = plot_text.split("Generated by Sublogue")[0].strip()
        metadata["summary"] = plot_text

        # --------------------------------------------
        # Header block (index 0)
        # --------------------------------------------

        header_lines = blocks[0].text.split("\n")

        if header_lines:
            first_line = header_lines[0]
            year_match = re.search(r"\((\d{4})\)", first_line)
            if year_match:
                metadata["year"] = year_match.group(1)
                metadata["title"] = first_line[:year_match.start()].strip()
            else:
                metadata["title"] = first_line.strip()

        if len(header_lines) > 1:
            second_line = header_lines[1]

            rating_match = re.search(r"IMDb:\s*([^\s]+)", second_line)
            if rating_match:
                metadata["imdb_rating"] = rating_match.group(1)

            runtime_match = re.search(r"⏱\s*(.+)", second_line)
            if runtime_match:
                metadata["runtime"] = runtime_match.group(1).strip()

        logger.debug("Metadata extracted: %s", metadata)
        return metadata
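A brief sketch of consuming the batched generator above (editor's illustration; the directory path is a placeholder):

# Sketch: walk a subtitle library and report Sublogue-tagged files per batch.
for batch in FileScanner.scan_directory("/media/subtitles", batch_size=100):
    tagged = [item for item in batch if item["has_plot"]]
    print(f"{len(tagged)}/{len(batch)} files in this batch already carry a plot header")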
@@ -0,0 +1,501 @@
"""
Keyword stripper utility - removes common junk keywords from filenames and subtitle content
Optimised for torrent / subtitle garbage while preserving real titles and dialogue
"""

from __future__ import annotations

import re
import logging
from typing import Optional, List

logger = logging.getLogger(__name__)


class KeywordStripper:
    """
    High-performance filename cleaner for movies & TV.

    Design goals:
    - Torrent / subtitle spam annihilation
    - Minimal false positives
    - Regex compiled once
    - Fast enough for large libraries
    """

    # -----------------------------
    # CORE JUNK PATTERNS
    # -----------------------------

    QUALITY = r"""
        \b(
            480p|720p|1080p|2160p|4320p|
            4k|8k|
            hdr|hdr10|hdr10\+|dolby\s*vision|dv|
            bluray|blu[-\s]?ray|bdrip|brrip|bd|
            webrip|web[-\s]?dl|web|
            dvdrip|dvd|dvdscr|
            cam|ts|telesync|telecine|tc|
            hdrip|hdlight
        )\b
    """

    CODECS = r"""
        \b(
            x264|x265|h\.?264|h\.?265|hevc|
            xvid|divx|
            aac|ac3|dts|truehd|atmos|
            dd5\.1|dd\+|
            flac|mp3|opus|
            8bit|10bit|hi10p
        )\b
    """

    TORRENT_GROUPS = r"""
        \b(
            yts(\.mx)?|yify|
            rarbg|eztv|ettv|
            psa|ion10|fgт|fgt|
            tgx|torrentgalaxy|
            1337x|limetorrent|
            ettv|ettv|
            publichd|scene|
            ganool|evo
        )\b
    """

    SUBTITLE_ADS = r"""
        \b(
            opensubtitles|
            subscene|
            addic7ed|
            podnapisi|
            yifysubtitles|
            subtitles?\s*by|
            synced?\s*by|
            encoded?\s*by|
            resynced?\s*by
        )\b
        |
        www\.[a-z0-9\-]+\.(com|org|net)
    """

    LANGUAGES = r"""
        \b(
            eng|english|
            ita|italian|
            fra|french|
            spa|spanish|
            ger|german|
            multi|dubbed|
            vostfr|subfrench|
            subs?|subtitles?
        )\b
    """

    EDITIONS = r"""
        \b(
            unrated|uncut|
            directors?\s*cut|
            extended|
            theatrical|
            imax|
            special\s*edition|
            limited|
            internal|
            proper|repack|real
        )\b
    """

    # -----------------------------
    # STRUCTURAL NOISE
    # -----------------------------

    BRACKETS = r"""
        [\[\(\{]
        .*?
        [\]\)\}]
    """

    SEPARATORS = r"[._\-]+"

    MULTISPACE = r"\s+"

    YEAR_PATTERN = r"(19\d{2}|20\d{2})"

    SEASON_EPISODE = [
        r"[Ss](\d{1,2})[Ee](\d{1,2})",
        r"(\d{1,2})x(\d{1,2})",
        r"Season\s*(\d{1,2})\s*Episode\s*(\d{1,2})",
    ]

    # -----------------------------
    # SUBTITLE CONTENT ADS/WATERMARKS
    # -----------------------------
    # These patterns are specifically for cleaning embedded ads from subtitle TEXT
    # They're more aggressive than filename patterns since we want to remove entire lines

    # Release group watermarks that appear in subtitle text
    SUBTITLE_WATERMARKS = [
        # YTS and variants
        r"yts\.mx",
        r"yts\.am",
        r"yts\.lt",
        r"yts\.ag",
        r"\byts\b",
        r"\byify\b",
        # RARBG and other groups
        r"\brarbg\b",
        r"\beztv\b",
        r"\bettv\b",
        r"torrentgalaxy",
        r"\btgx\b",
        r"1337x",
        r"limetorrents?",
        r"\bevo\b",
        r"\bpsa\b",
        r"\bfgt\b",
        # Subtitle sites
        r"opensubtitles?",
        r"subscene",
        r"addic7ed",
        r"podnapisi",
        r"yifysubtitles?",
        r"sub\.?scene",
        r"legendas\.?tv",
        r"shooter\.?cn",
        r"subhd",
        # Generic patterns
        r"downloaded\s+from",
        r"subtitles?\s+by",
        r"sync(?:ed|hronized)?\s+(?:and\s+)?correct(?:ed|ions?)?\s+by",
        r"ripped\s+by",
        r"encoded?\s+by",
        r"resynce?d?\s+by",
        r"improved\s+by",
        r"fixed\s+by",
        r"translated\s+by",
        r"captioned\s+by",
        r"support\s+us\s+and",
        r"get\s+more\s+subtitles",
        r"quality\s+subtitles",
        r"best\s+subtitles",
        r"free\s+subtitles",
        # URLs and domains
        r"www\.[a-z0-9\-]+\.(com|org|net|io|tv|mx|am|lt|ag)",
        r"https?://[^\s]+",
        # Social media handles that are clearly ads
        r"@yaborr",
        r"@sub_scene",
        r"follow\s+us\s+on",
        r"join\s+us\s+at",
        r"visit\s+us\s+at",
        # Promotional text
        r"advertise\s+here",
        r"membership\s+(is\s+)?free",
        r"become\s+a\s+member",
        r"register\s+(now|today|free)",
        r"sign\s+up\s+(now|today|free)",
    ]

    # Patterns that indicate an ENTIRE subtitle block should be removed
    # (not just the matching text, but the whole block)
    SUBTITLE_BLOCK_REMOVERS = [
        # Pure promotional blocks
        r"^[\s\-_]*(?:www\.)?yts",
        r"^[\s\-_]*(?:www\.)?rarbg",
        r"^[\s\-_]*opensubtitles",
        r"^[\s\-_]*subscene",
        r"^[\s\-_]*downloaded\s+from",
        r"^[\s\-_]*subtitles?\s+by",
        r"^[\s\-_]*sync(?:ed)?\s+(?:and\s+)?correct",
        r"^[\s\-_]*support\s+us",
        r"^[\s\-_]*get\s+(?:more\s+)?subtitles",
        r"^[\s\-_]*quality\s+subtitles",
        r"^[\s\-_]*advertise",
        # ASCII art headers/footers (often used for ads)
        r"^[\s\-=_\*]{10,}$",
        # Empty after cleaning
        r"^\s*$",
    ]

    # -----------------------------
    # COMPILED REGEX CACHE
    # -----------------------------

    _compiled = None

    @classmethod
    def _compile(cls):
        if cls._compiled:
            return cls._compiled

        def c(p):
            return re.compile(p, re.IGNORECASE | re.VERBOSE)

        cls._compiled = {
            "junk": c("|".join([
                cls.QUALITY,
                cls.CODECS,
                cls.TORRENT_GROUPS,
                cls.SUBTITLE_ADS,
                cls.LANGUAGES,
                cls.EDITIONS,
            ])),
            "brackets": c(cls.BRACKETS),
            "separators": re.compile(cls.SEPARATORS),
            "multispace": re.compile(cls.MULTISPACE),
            "year": re.compile(cls.YEAR_PATTERN),
            "season_episode": [re.compile(p, re.IGNORECASE) for p in cls.SEASON_EPISODE],
            # Subtitle content cleaning patterns
            "subtitle_watermarks": [
                re.compile(p, re.IGNORECASE) for p in cls.SUBTITLE_WATERMARKS
            ],
            "subtitle_block_removers": [
                re.compile(p, re.IGNORECASE | re.MULTILINE) for p in cls.SUBTITLE_BLOCK_REMOVERS
            ],
        }

        return cls._compiled

    # -----------------------------
    # PUBLIC API
    # -----------------------------

    def strip_keywords(self, title: str, preserve_year: bool = True) -> str:
        rx = self._compile()

        original = title

        # Extract year early
        year: Optional[str] = None
        if preserve_year:
            m = rx["year"].search(title)
            if m:
                year = m.group(1)

        # Remove obvious junk
        cleaned = rx["junk"].sub("", title)

        # Remove bracketed junk AFTER stripping known keywords
        cleaned = rx["brackets"].sub("", cleaned)

        # Normalize separators
        cleaned = rx["separators"].sub(" ", cleaned)
        cleaned = rx["multispace"].sub(" ", cleaned).strip()

        # Re-append year
        if preserve_year and year and year not in cleaned:
            cleaned = f"{cleaned} ({year})"

        logger.debug("KeywordStripper: '%s' -> '%s'", original, cleaned)
        return cleaned

    def extract_year(self, title: str) -> Optional[str]:
        rx = self._compile()
        m = rx["year"].search(title)
        return m.group(1) if m else None

    def extract_season_episode(self, title: str):
        rx = self._compile()
        for p in rx["season_episode"]:
            m = p.search(title)
            if m:
                return int(m.group(1)), int(m.group(2))
        return None, None

    def clean_filename(self, filename: str, preserve_year: bool = True) -> dict:
        name = re.sub(r"\.[^.]+$", "", filename)

        season, episode = self.extract_season_episode(name)
        year = self.extract_year(name)
        cleaned = self.strip_keywords(name, preserve_year=preserve_year)

        return {
            "cleaned_title": cleaned,
            "year": year,
            "season": season,
            "episode": episode,
            "is_series": season is not None or episode is not None,
        }

    # -----------------------------
    # SUBTITLE CONTENT CLEANING
    # -----------------------------

    def should_remove_subtitle_block(self, text: str) -> bool:
        """
        Check if an entire subtitle block should be removed.

        Returns True if the block is purely promotional/ad content
        with no legitimate dialogue.

        Args:
            text: The subtitle text content

        Returns:
            True if block should be removed entirely
        """
        rx = self._compile()

        # Check each line of the subtitle
        lines = text.strip().split('\n')
        non_ad_lines = 0

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Check if this line matches any block remover pattern
            is_ad_line = False
            for pattern in rx["subtitle_block_removers"]:
                if pattern.search(line):
                    is_ad_line = True
                    break

            # Also check watermarks - if the entire line is just a watermark
            if not is_ad_line:
                temp_line = line
                for pattern in rx["subtitle_watermarks"]:
                    temp_line = pattern.sub("", temp_line)
                # If after removing watermarks, line is empty or just punctuation
                temp_line = re.sub(r'[\s\-_\.\,\!\?\:\;]+', '', temp_line)
                if not temp_line:
                    is_ad_line = True

            if not is_ad_line:
                non_ad_lines += 1

        # If no legitimate content remains, remove the block
        return non_ad_lines == 0

    def clean_subtitle_text(self, text: str) -> str:
        """
        Clean watermarks and ads from subtitle text while preserving dialogue.

        This is more surgical than should_remove_subtitle_block() - it removes
        specific ad text but keeps the rest of the subtitle intact.

        Args:
            text: The subtitle text content

        Returns:
            Cleaned text with ads removed, or empty string if nothing remains
        """
        rx = self._compile()
        original = text

        # Process line by line to handle multi-line subtitles
        lines = text.split('\n')
        cleaned_lines = []

        for line in lines:
            cleaned_line = line

            # Remove watermark patterns
            for pattern in rx["subtitle_watermarks"]:
                cleaned_line = pattern.sub("", cleaned_line)

            # Clean up resulting whitespace
            cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()

            # Only keep lines that have content after cleaning
            if cleaned_line:
                cleaned_lines.append(cleaned_line)

        result = '\n'.join(cleaned_lines)

        # Final cleanup - remove lines that are just punctuation/dashes
        result_lines = result.split('\n')
        result_lines = [l for l in result_lines if re.search(r'[a-zA-Z0-9]', l)]
        result = '\n'.join(result_lines)

        if result != original:
            logger.debug("Cleaned subtitle text: '%s' -> '%s'", original[:50], result[:50])

        return result

    def clean_subtitle_blocks(self, blocks: List[dict]) -> List[dict]:
        """
        Clean a list of subtitle blocks, removing ads and watermarks.

        This processes each block:
        1. Checks if the entire block should be removed (pure ad content)
        2. If not, cleans watermarks from the text

        Args:
            blocks: List of subtitle block dicts with 'text' key

        Returns:
            Cleaned list with ad blocks removed and watermarks stripped
        """
        cleaned = []
        removed_count = 0
        modified_count = 0

        for block in blocks:
            text = block.get("text", "")

            # Check if entire block should be removed
            if self.should_remove_subtitle_block(text):
                removed_count += 1
                logger.debug("Removing ad block: '%s'", text[:50])
                continue

            # Clean the text
            cleaned_text = self.clean_subtitle_text(text)

            # Skip if cleaning resulted in empty text
            if not cleaned_text.strip():
                removed_count += 1
                continue

            # Track if we modified the text
            if cleaned_text != text:
                modified_count += 1

            # Create new block with cleaned text
            cleaned_block = block.copy()
            cleaned_block["text"] = cleaned_text
            cleaned.append(cleaned_block)

        if removed_count > 0 or modified_count > 0:
            logger.info(
                "Subtitle cleaning: removed %d ad blocks, modified %d blocks",
                removed_count, modified_count
            )

        return cleaned


# -----------------------------
# SINGLETON HELPERS
# -----------------------------

_default_stripper: Optional[KeywordStripper] = None


def get_stripper() -> KeywordStripper:
    global _default_stripper
    if _default_stripper is None:
        _default_stripper = KeywordStripper()
    return _default_stripper


def clean_title(title: str, preserve_year: bool = True) -> str:
    return get_stripper().strip_keywords(title, preserve_year)


def clean_filename(filename: str, preserve_year: bool = True) -> dict:
    return get_stripper().clean_filename(filename, preserve_year)


def clean_subtitle_content(text: str) -> str:
    """Clean watermarks and ads from subtitle text."""
    return get_stripper().clean_subtitle_text(text)


def should_remove_subtitle(text: str) -> bool:
    """Check if a subtitle block should be removed entirely (pure ad)."""
    return get_stripper().should_remove_subtitle_block(text)
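A quick sketch of the module-level helpers above (editor's illustration; the filename and the expected breakdown are assumptions, not recorded output):

# Sketch: clean a release-style filename into searchable fields.
info = clean_filename("Some.Movie.2019.1080p.BluRay.x264-YTS.mkv")
# Expected to yield roughly cleaned_title "Some Movie 2019", year "2019",
# season/episode None, is_series False.
title = clean_title("Another.Show.S01E02.720p.WEBRip", preserve_year=True)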
@@ -0,0 +1,243 @@
"""
OMDb API client - async movie metadata fetching
Includes:
- Rate limiting (requests / second)
- Concurrency limiting
- Request de-duplication
- Shared aiohttp session
"""

from __future__ import annotations

import asyncio
import aiohttp
import logging
import time
from typing import Dict, Optional

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class RateLimiter:
    """
    Simple async rate limiter (token bucket-ish).
    Limits how often requests may start.
    """

    def __init__(self, rate_per_second: float):
        self._interval = 1.0 / rate_per_second
        self._lock = asyncio.Lock()
        self._last_call = 0.0

    async def wait(self):
        async with self._lock:
            now = time.monotonic()
            delta = now - self._last_call
            if delta < self._interval:
                await asyncio.sleep(self._interval - delta)
            self._last_call = time.monotonic()
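# Editor's note (illustrative, not part of the committed file): with the
# default rate_limit_per_sec of 2.0 the limiter enforces a minimum spacing of
# 1.0 / 2.0 = 0.5 s between request starts, e.g.:
#
#     limiter = RateLimiter(2.0)
#     await limiter.wait()   # first call returns immediately
#     await limiter.wait()   # sleeps ~0.5 s, capping starts at 2 per second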
class OMDbClient:
    """Async, rate-limited OMDb client"""

    BASE_URL = "https://www.omdbapi.com/"

    def __init__(
        self,
        api_key: str,
        *,
        max_concurrent: int = 5,
        rate_limit_per_sec: float = 2.0,
        db_manager=None,
        timeout: int = 15,
    ):
        self.api_key = api_key
        self.db_manager = db_manager

        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.rate_limiter = RateLimiter(rate_limit_per_sec)

        self._session: Optional[aiohttp.ClientSession] = None
        self._inflight: Dict[str, asyncio.Future] = {}

        self._timeout = aiohttp.ClientTimeout(total=timeout)

    # -----------------------------
    # Session management
    # -----------------------------

    async def _get_session(self) -> aiohttp.ClientSession:
        if self._session is None or self._session.closed:
            self._session = aiohttp.ClientSession(timeout=self._timeout)
        return self._session

    async def close(self):
        if self._session and not self._session.closed:
            await self._session.close()

    # -----------------------------
    # Public API
    # -----------------------------

    async def fetch_summary(
        self,
        title: str,
        media_type: str = "movie",
        year: Optional[str] = None,
        season: Optional[int] = None,
        episode: Optional[int] = None,
    ) -> Optional[dict]:
        """
        Fetch summary from OMDb.
        Automatically deduplicates identical in-flight requests.

        Args:
            title: Movie/show title to search
            media_type: "movie" or "series"
            year: Year of release (improves match accuracy)
            season: Season number (for series)
            episode: Episode number (for series)
        """

        key = self._make_cache_key(title, media_type, year, season, episode)

        # Deduplicate in-flight requests
        if key in self._inflight:
            logger.debug("Awaiting inflight OMDb request: %s", key)
            return await self._inflight[key]

        loop = asyncio.get_running_loop()
        future = loop.create_future()
        self._inflight[key] = future

        try:
            result = await self._fetch_summary_internal(
                title, media_type, year, season, episode
            )
            future.set_result(result)
            return result
        except Exception as e:
            future.set_exception(e)
            raise
        finally:
            self._inflight.pop(key, None)

    # -----------------------------
    # Internal fetch logic
    # -----------------------------

    async def _fetch_summary_internal(
        self,
        title: str,
        media_type: str,
        year: Optional[str],
        season: Optional[int],
        episode: Optional[int],
    ) -> Optional[dict]:

        logger.info("Fetching OMDb summary for: %s (year=%s)", title, year)

        async with self.semaphore:
            await self.rate_limiter.wait()

            params = {
                "apikey": self.api_key,
                "t": title,
                "plot": "short",
            }

            # Add year for better matching accuracy
            if year:
                params["y"] = year

            if media_type == "series":
                params["type"] = "series"
                if season is not None:
                    params["Season"] = season
                if episode is not None:
                    params["Episode"] = episode

            session = await self._get_session()
            start = time.monotonic()

            try:
                async with session.get(self.BASE_URL, params=params) as resp:
                    elapsed_ms = int((time.monotonic() - start) * 1000)

                    if resp.status != 200:
                        self._track(False, title, elapsed_ms)
                        logger.error("OMDb HTTP %s for '%s'", resp.status, title)
                        return None
||||
|
||||
data = await resp.json()
|
||||
|
||||
if data.get("Response") != "True":
|
||||
self._track(False, title, elapsed_ms)
|
||||
logger.warning("OMDb error for '%s': %s", title, data.get("Error"))
|
||||
return None
|
||||
|
||||
self._track(True, title, elapsed_ms)
|
||||
|
||||
# Validate year match if year was specified
|
||||
if year:
|
||||
returned_year = data.get("Year", "")
|
||||
# Handle year ranges like "2020-2023" for series
|
||||
if "-" in returned_year:
|
||||
returned_year = returned_year.split("-")[0]
|
||||
if returned_year and returned_year != year:
|
||||
logger.warning(
|
||||
f"Year mismatch for '{title}': requested {year}, got {returned_year} "
|
||||
f"('{data.get('Title')}'). Rejecting result."
|
||||
)
|
||||
return None
|
||||
|
||||
return self._parse_response(data)
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
logger.error("OMDb timeout for '%s'", title)
|
||||
return None
|
||||
except aiohttp.ClientError as e:
|
||||
logger.error("OMDb network error for '%s': %s", title, e)
|
||||
return None
|
||||
|
||||
# -----------------------------
|
||||
# Helpers
|
||||
# -----------------------------
|
||||
|
||||
def _parse_response(self, data: dict) -> dict:
|
||||
rt_rating = "N/A"
|
||||
for r in data.get("Ratings", []):
|
||||
if r.get("Source") == "Rotten Tomatoes":
|
||||
rt_rating = r.get("Value")
|
||||
break
|
||||
|
||||
return {
|
||||
"plot": data.get("Plot", "No plot available"),
|
||||
"rotten_tomatoes": rt_rating,
|
||||
"title": data.get("Title"),
|
||||
"year": data.get("Year"),
|
||||
"media_type": data.get("Type"),
|
||||
"imdb_rating": data.get("imdbRating", "N/A"),
|
||||
"runtime": data.get("Runtime", "N/A"),
|
||||
# Additional metadata fields
|
||||
"director": data.get("Director", "N/A"),
|
||||
"actors": data.get("Actors", "N/A"),
|
||||
"released": data.get("Released", "N/A"),
|
||||
"genre": data.get("Genre", "N/A"),
|
||||
}
|
||||
|
||||
def _track(self, success: bool, title: str, response_time_ms: int):
|
||||
if not self.db_manager:
|
||||
return
|
||||
self.db_manager.track_api_call(
|
||||
provider="omdb",
|
||||
endpoint=f"/?t={title}",
|
||||
success=success,
|
||||
response_time_ms=response_time_ms,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _make_cache_key(title, media_type, year, season, episode) -> str:
|
||||
return f"{title.lower()}|{media_type}|{year}|{season}|{episode}"
|
||||
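A minimal usage sketch for the client above, assuming it is importable as core.omdb_client (path assumed) and that a real OMDb API key is supplied; it shows the shared session being closed and two identical concurrent lookups being de-duplicated in flight.

import asyncio

from core.omdb_client import OMDbClient  # import path assumed for illustration


async def main():
    client = OMDbClient(api_key="YOUR_OMDB_KEY", max_concurrent=5, rate_limit_per_sec=2.0)
    try:
        # Identical concurrent lookups share one HTTP request via _inflight.
        first, second = await asyncio.gather(
            client.fetch_summary("Heat", year="1995"),
            client.fetch_summary("Heat", year="1995"),
        )
        print(first == second, first and first["imdb_rating"])
    finally:
        await client.close()  # release the shared aiohttp session


asyncio.run(main())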
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,321 @@
"""
TMDb API client - async movie and TV series metadata fetching
"""
import asyncio
import aiohttp
import logging
import time

logging.basicConfig(level=logging.INFO)


class TMDbClient:
    """Async client for The Movie Database (TMDb) API"""

    def __init__(self, api_key, db_manager=None):
        self.api_key = api_key
        self.base_url = "https://api.themoviedb.org/3"
        self.semaphore = asyncio.Semaphore(5)  # Limit concurrent requests
        self.db_manager = db_manager

    async def search_movie(self, title, year=None):
        """
        Search for a movie by title

        Args:
            title: Movie title to search
            year: Optional year to narrow search

        Returns:
            dict: Movie data or None if not found
        """
        async with self.semaphore:
            url = f"{self.base_url}/search/movie"
            params = {
                "api_key": self.api_key,
                "query": title
            }
            if year:
                params["year"] = year

            try:
                start_time = time.time()
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, params=params) as response:
                        response_time_ms = int((time.time() - start_time) * 1000)

                        if response.status != 200:
                            logging.error(f"TMDb HTTP error {response.status} for movie '{title}'")
                            # Track failed API call
                            if self.db_manager:
                                self.db_manager.track_api_call(
                                    provider='tmdb',
                                    endpoint='/search/movie',
                                    success=False,
                                    response_time_ms=response_time_ms
                                )
                            return None

                        data = await response.json()

                        if data.get("results") and len(data["results"]) > 0:
                            # Track successful API call
                            if self.db_manager:
                                self.db_manager.track_api_call(
                                    provider='tmdb',
                                    endpoint='/search/movie',
                                    success=True,
                                    response_time_ms=response_time_ms
                                )
                            return data["results"][0]  # Return first match

                        logging.warning(f"No TMDb results for movie '{title}'")
                        # Track failed API call (no results)
                        if self.db_manager:
                            self.db_manager.track_api_call(
                                provider='tmdb',
                                endpoint='/search/movie',
                                success=False,
                                response_time_ms=response_time_ms
                            )
                        return None

            except Exception as e:
                logging.error(f"Error searching TMDb for movie '{title}': {e}")
                return None

    async def search_tv(self, title, year=None):
        """
        Search for a TV series by title

        Args:
            title: TV series title to search
            year: Optional year to narrow search

        Returns:
            dict: TV series data or None if not found
        """
        async with self.semaphore:
            url = f"{self.base_url}/search/tv"
            params = {
                "api_key": self.api_key,
                "query": title
            }
            if year:
                params["first_air_date_year"] = year

            try:
                start_time = time.time()
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, params=params) as response:
                        response_time_ms = int((time.time() - start_time) * 1000)

                        if response.status != 200:
                            logging.error(f"TMDb HTTP error {response.status} for TV '{title}'")
                            # Track failed API call
                            if self.db_manager:
                                self.db_manager.track_api_call(
                                    provider='tmdb',
                                    endpoint='/search/tv',
                                    success=False,
                                    response_time_ms=response_time_ms
                                )
                            return None

                        data = await response.json()

                        if data.get("results") and len(data["results"]) > 0:
                            # Track successful API call
                            if self.db_manager:
                                self.db_manager.track_api_call(
                                    provider='tmdb',
                                    endpoint='/search/tv',
                                    success=True,
                                    response_time_ms=response_time_ms
                                )
                            return data["results"][0]  # Return first match

                        logging.warning(f"No TMDb results for TV series '{title}'")
                        # Track failed API call (no results)
                        if self.db_manager:
                            self.db_manager.track_api_call(
                                provider='tmdb',
                                endpoint='/search/tv',
                                success=False,
                                response_time_ms=response_time_ms
                            )
                        return None

            except Exception as e:
                logging.error(f"Error searching TMDb for TV '{title}': {e}")
                return None

    async def get_movie_details(self, movie_id):
        """
        Get detailed movie information

        Args:
            movie_id: TMDb movie ID

        Returns:
            dict: Detailed movie data
        """
        async with self.semaphore:
            url = f"{self.base_url}/movie/{movie_id}"
            params = {"api_key": self.api_key}

            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, params=params) as response:
                        if response.status != 200:
                            logging.error(f"TMDb HTTP error {response.status} for movie ID {movie_id}")
                            return None

                        return await response.json()

            except Exception as e:
                logging.error(f"Error getting TMDb movie details for ID {movie_id}: {e}")
                return None

    async def get_tv_details(self, tv_id):
        """
        Get detailed TV series information

        Args:
            tv_id: TMDb TV series ID

        Returns:
            dict: Detailed TV series data
        """
        async with self.semaphore:
            url = f"{self.base_url}/tv/{tv_id}"
            params = {"api_key": self.api_key}

            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, params=params) as response:
                        if response.status != 200:
                            logging.error(f"TMDb HTTP error {response.status} for TV ID {tv_id}")
                            return None

                        return await response.json()

            except Exception as e:
                logging.error(f"Error getting TMDb TV details for ID {tv_id}: {e}")
                return None

    async def get_tv_season(self, tv_id, season_number):
        """
        Get TV season information

        Args:
            tv_id: TMDb TV series ID
            season_number: Season number

        Returns:
            dict: Season data including episodes
        """
        async with self.semaphore:
            url = f"{self.base_url}/tv/{tv_id}/season/{season_number}"
            params = {"api_key": self.api_key}

            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, params=params) as response:
                        if response.status != 200:
                            logging.error(f"TMDb HTTP error {response.status} for TV {tv_id} season {season_number}")
                            return None

                        return await response.json()

            except Exception as e:
                logging.error(f"Error getting TMDb season data: {e}")
                return None

    async def fetch_summary(self, title, media_type="movie", year=None, season=None, episode=None):
        """
        Fetch summary for movie or TV series

        Args:
            title: Title to search
            media_type: "movie" or "tv"
            year: Optional year
            season: Optional season number (for TV)
            episode: Optional episode number (for TV)

        Returns:
            dict: {plot, title, year, media_type, rating} or None if not found
        """
        logging.info(f"Fetching TMDb summary for: {title} (type: {media_type})")

        try:
            if media_type == "tv":
                # Search for TV series
                search_result = await self.search_tv(title, year)
                if not search_result:
                    return None

                tv_id = search_result["id"]

                # Get detailed TV info
                tv_details = await self.get_tv_details(tv_id)
                if not tv_details:
                    return None

                plot = tv_details.get("overview", "No plot available")

                # If specific season/episode requested, try to get that plot
                if season is not None:
                    season_data = await self.get_tv_season(tv_id, season)
                    if season_data and episode is not None:
                        episodes = season_data.get("episodes", [])
                        for ep in episodes:
                            if ep.get("episode_number") == episode:
                                plot = ep.get("overview", plot)
                                break

                # Get episode runtime (usually consistent across series)
                runtime = "N/A"
                episode_run_time = tv_details.get("episode_run_time", [])
                if episode_run_time and len(episode_run_time) > 0:
                    runtime = f"{episode_run_time[0]} min"

                return {
                    "plot": plot,
                    "title": tv_details.get("name", title),
                    "year": tv_details.get("first_air_date", "")[:4] if tv_details.get("first_air_date") else "N/A",
                    "media_type": "tv",
                    "rating": f"{tv_details.get('vote_average', 0):.1f}/10",
                    "runtime": runtime
                }

            else:  # movie
                # Search for movie
                search_result = await self.search_movie(title, year)
                if not search_result:
                    return None

                movie_id = search_result["id"]

                # Get detailed movie info
                movie_details = await self.get_movie_details(movie_id)
                if not movie_details:
                    return None

                # Get runtime
                runtime = "N/A"
                if movie_details.get("runtime"):
                    runtime = f"{movie_details.get('runtime')} min"

                return {
                    "plot": movie_details.get("overview", "No plot available"),
                    "title": movie_details.get("title", title),
                    "year": movie_details.get("release_date", "")[:4] if movie_details.get("release_date") else "N/A",
                    "media_type": "movie",
                    "rating": f"{movie_details.get('vote_average', 0):.1f}/10",
                    "runtime": runtime
                }

        except Exception as e:
            logging.error(f"Error fetching TMDb summary for '{title}': {e}")
            return None
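A minimal usage sketch for TMDbClient, assuming a valid TMDb API key and an importable module path (both assumed); unlike OMDbClient above, this client opens a fresh aiohttp session per request, so there is nothing to close explicitly.

import asyncio

from core.tmdb_client import TMDbClient  # import path assumed for illustration


async def main():
    client = TMDbClient(api_key="YOUR_TMDB_KEY")
    movie = await client.fetch_summary("Inception", media_type="movie", year="2010")
    show = await client.fetch_summary("Severance", media_type="tv", season=1, episode=2)
    print(movie and movie["rating"], movie and movie["runtime"])
    print(show and show["plot"][:100])


asyncio.run(main())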
@@ -0,0 +1,135 @@
"""
TVmaze API client - async TV metadata fetching
"""
import aiohttp
import logging
import re
import time

logger = logging.getLogger(__name__)


class TVMazeClient:
    """Async client for the TVmaze API (no API key required)."""

    BASE_URL = "https://api.tvmaze.com"

    def __init__(self, db_manager=None, timeout=15):
        self.db_manager = db_manager
        self._timeout = aiohttp.ClientTimeout(total=timeout)

    async def fetch_summary(self, title, year=None, season=None, episode=None):
        """
        Fetch summary for a TV series (optionally episode-specific).

        Args:
            title: Series title to search
            year: Optional year to validate match
            season: Optional season number
            episode: Optional episode number
        """
        show = await self._fetch_show(title, year)
        if not show:
            return None

        plot = self._strip_html(show.get("summary")) or "No plot available"

        if season is not None and episode is not None:
            episode_data = await self._fetch_episode(show.get("id"), season, episode)
            if episode_data:
                episode_summary = self._strip_html(episode_data.get("summary"))
                if episode_summary:
                    plot = episode_summary

        rating = show.get("rating", {}).get("average")
        imdb_rating = f"{rating:.1f}" if isinstance(rating, (int, float)) else "N/A"

        runtime_value = show.get("runtime") or show.get("averageRuntime")
        runtime = f"{runtime_value} min" if runtime_value else "N/A"

        premiered = show.get("premiered") or ""
        year_value = premiered[:4] if premiered else "N/A"

        return {
            "plot": plot,
            "title": show.get("name", title),
            "year": year_value,
            "media_type": "tv",
            "imdb_rating": imdb_rating,
            "rotten_tomatoes": "N/A",
            "runtime": runtime,
        }

    async def _fetch_show(self, title, year):
        params = {"q": title}
        url = f"{self.BASE_URL}/singlesearch/shows"

        start_time = time.time()
        try:
            async with aiohttp.ClientSession(timeout=self._timeout) as session:
                async with session.get(url, params=params) as response:
                    response_time_ms = int((time.time() - start_time) * 1000)

                    if response.status != 200:
                        self._track(False, "/singlesearch/shows", response_time_ms)
                        logger.error("TVmaze HTTP %s for '%s'", response.status, title)
                        return None

                    data = await response.json()
                    self._track(True, "/singlesearch/shows", response_time_ms)

                    if year:
                        premiered = data.get("premiered") or ""
                        if premiered and premiered[:4] != year:
                            logger.warning(
                                "TVmaze year mismatch for '%s': requested %s, got %s",
                                title,
                                year,
                                premiered[:4],
                            )
                            return None

                    return data
        except Exception as e:
            logger.error("TVmaze error for '%s': %s", title, e)
            return None

    async def _fetch_episode(self, show_id, season, episode):
        if not show_id:
            return None

        params = {"season": season, "number": episode}
        url = f"{self.BASE_URL}/shows/{show_id}/episodebynumber"

        start_time = time.time()
        try:
            async with aiohttp.ClientSession(timeout=self._timeout) as session:
                async with session.get(url, params=params) as response:
                    response_time_ms = int((time.time() - start_time) * 1000)

                    if response.status != 200:
                        self._track(False, "/shows/{id}/episodebynumber", response_time_ms)
                        return None

                    data = await response.json()
                    self._track(True, "/shows/{id}/episodebynumber", response_time_ms)
                    return data
        except Exception as e:
            logger.error("TVmaze episode error for show %s: %s", show_id, e)
            return None

    def _track(self, success, endpoint, response_time_ms):
        if not self.db_manager:
            return
        self.db_manager.track_api_call(
            provider="tvmaze",
            endpoint=endpoint,
            success=success,
            response_time_ms=response_time_ms,
        )

    @staticmethod
    def _strip_html(text):
        if not text:
            return ""
        return re.sub(r"<[^>]+>", "", text).strip()
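A minimal usage sketch for TVMazeClient; TVmaze needs no API key, and the import path is again an assumption.

import asyncio

from core.tvmaze_client import TVMazeClient  # import path assumed for illustration


async def main():
    client = TVMazeClient()
    # Falls back to the show-level summary when the episode has none.
    summary = await client.fetch_summary("Breaking Bad", season=1, episode=1)
    if summary:
        print(summary["title"], summary["year"], summary["imdb_rating"])
        print(summary["plot"][:120])


asyncio.run(main())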
@@ -0,0 +1,13 @@
# Core dependencies
flask>=3.0.0
flask-cors>=4.0.0
aiohttp>=3.9.0

# Database
sqlalchemy>=2.0.0

# Environment
python-dotenv>=1.0.0

# Production WSGI server
gunicorn>=21.0.0
Binary file not shown.
@@ -0,0 +1,220 @@
"""
Test to verify zero timing drift guarantee for subtitle processing.

This test demonstrates that after injecting plot metadata:
1. All original subtitle timestamps remain byte-for-byte identical
2. First dialogue text appears at exactly the same timestamp
3. No subtitle blocks are renumbered incorrectly
"""

import sys
import io

# Fix Windows console encoding issues
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

from core.subtitle_processor import parse_srt, build_intro_blocks, strip_existing_plot_blocks, format_srt

# Test SRT content with first subtitle at 10 seconds
test_srt = """1
00:00:10,000 --> 00:00:12,500
Hello sir

2
00:00:13,000 --> 00:00:15,000
How are you today?

3
00:00:16,000 --> 00:00:18,500
I'm doing great, thanks!
"""

# Mock movie data
mock_movie = {
    "title": "Test Movie",
    "year": "2025",
    "imdb_rating": "8.5",
    "runtime": "120 min",
    "plot": "A thrilling story about testing subtitle timing preservation in automated systems."
}


def test_timing_preservation():
    """Test that original subtitle timing is preserved exactly"""

    # Parse original
    original_blocks = parse_srt(test_srt)

    print("=" * 60)
    print("ORIGINAL SUBTITLES")
    print("=" * 60)
    for block in original_blocks:
        print(f"Block {block.index}: {block.start_time}ms - {block.end_time}ms")
        text_preview = block.text.replace("\n", " ")[:50]
        print(f"  Text: {text_preview}...")

    # Build intro blocks that fit before first subtitle (10,000 ms)
    intro_blocks = build_intro_blocks(
        mock_movie,
        mock_movie["plot"],
        first_subtitle_start_ms=original_blocks[0].start_time,
        min_safe_gap_ms=1000
    )

    print("\n" + "=" * 60)
    print("INJECTED PLOT BLOCKS")
    print("=" * 60)
    for block in intro_blocks:
        print(f"Block: {block.start_time}ms - {block.end_time}ms")
        text_preview = block.text.encode('ascii', 'replace').decode('ascii')[:50]
        print(f"  Text: {text_preview}...")

    # Combine and renumber (like the real processor does)
    final = intro_blocks + original_blocks
    renumbered = [
        type(block)(
            index=i + 1,
            start_time=block.start_time,
            end_time=block.end_time,
            text=block.text
        )
        for i, block in enumerate(final)
    ]

    print("\n" + "=" * 60)
    print("FINAL OUTPUT (AFTER INJECTION)")
    print("=" * 60)
    for block in renumbered:
        print(f"Block {block.index}: {block.start_time}ms - {block.end_time}ms")
        text_preview = block.text.replace("\n", " ")[:50]
        print(f"  Text: {text_preview}...")

    # Verify timing preservation
    print("\n" + "=" * 60)
    print("VERIFICATION")
    print("=" * 60)

    # Find first dialogue subtitle in final output (skip intro blocks)
    dialogue_blocks = [b for b in renumbered if b.start_time >= original_blocks[0].start_time]

    assert len(dialogue_blocks) == len(original_blocks), \
        f"Block count mismatch: {len(dialogue_blocks)} vs {len(original_blocks)}"

    for i, (original, final_block) in enumerate(zip(original_blocks, dialogue_blocks)):
        print(f"\nDialogue Block {i+1}:")
        print(f"  Original timing: {original.start_time}ms - {original.end_time}ms")
        print(f"  Final timing: {final_block.start_time}ms - {final_block.end_time}ms")
        print(f"  Text match: {original.text == final_block.text}")

        assert original.start_time == final_block.start_time, \
            f"Start time changed! {original.start_time} != {final_block.start_time}"
        assert original.end_time == final_block.end_time, \
            f"End time changed! {original.end_time} != {final_block.end_time}"
        assert original.text == final_block.text, \
            "Text changed!"

    # Verify intro blocks don't overlap with first subtitle
    last_intro_block = intro_blocks[-1]
    first_dialogue_start = original_blocks[0].start_time

    print("\nGap between plot and dialogue:")
    print(f"  Last plot block ends: {last_intro_block.end_time}ms")
    print(f"  First dialogue starts: {first_dialogue_start}ms")
    print(f"  Gap: {first_dialogue_start - last_intro_block.end_time}ms")

    assert last_intro_block.end_time < first_dialogue_start, \
        "Plot blocks overlap with dialogue!"

    print("\n" + "=" * 60)
    print("✅ ALL TESTS PASSED - ZERO TIMING DRIFT CONFIRMED")
    print("=" * 60)


def test_edge_case_early_subtitle():
    """Test case where first subtitle starts very early (1 second)"""

    early_srt = """1
00:00:01,000 --> 00:00:03,000
Early dialogue

2
00:00:04,000 --> 00:00:06,000
More early dialogue
"""

    original_blocks = parse_srt(early_srt)

    print("\n" + "=" * 60)
    print("EDGE CASE: EARLY SUBTITLE AT 1 SECOND")
    print("=" * 60)

    intro_blocks = build_intro_blocks(
        mock_movie,
        mock_movie["plot"],
        first_subtitle_start_ms=original_blocks[0].start_time,
        min_safe_gap_ms=1000
    )

    print(f"First dialogue at: {original_blocks[0].start_time}ms")
    print(f"Number of intro blocks: {len(intro_blocks)}")

    for block in intro_blocks:
        print(f"  Plot block: {block.start_time}ms - {block.end_time}ms")

    # Should use zero-duration blocks since not enough time
    if original_blocks[0].start_time < 2000:
        assert all(b.start_time == 0 and b.end_time == 0 for b in intro_blocks), \
            "Should use zero-duration blocks for very early subtitles"
        print("✅ Correctly using zero-duration blocks")

    print("=" * 60)


def test_idempotency():
    """Test that running the operation twice doesn't duplicate plot blocks"""

    print("\n" + "=" * 60)
    print("IDEMPOTENCY TEST")
    print("=" * 60)

    # First pass: add plot blocks
    original_blocks = parse_srt(test_srt)
    intro_blocks = build_intro_blocks(
        mock_movie,
        mock_movie["plot"],
        first_subtitle_start_ms=original_blocks[0].start_time,
        min_safe_gap_ms=1000
    )

    first_pass = intro_blocks + original_blocks
    first_pass_srt = format_srt([
        type(b)(index=i+1, start_time=b.start_time, end_time=b.end_time, text=b.text)
        for i, b in enumerate(first_pass)
    ])

    print(f"After first pass: {len(first_pass)} blocks")

    # Second pass: should strip existing plot blocks
    second_pass_parsed = parse_srt(first_pass_srt)

    print(f"Parsed second pass: {len(second_pass_parsed)} blocks")
    for i, block in enumerate(second_pass_parsed):
        text_preview = block.text.replace("\n", " ")[:60]
        print(f"  Block {i+1}: {block.start_time}-{block.end_time}ms | {text_preview}")

    second_pass_cleaned = strip_existing_plot_blocks(second_pass_parsed)

    print(f"\nAfter stripping plot blocks: {len(second_pass_cleaned)} blocks")
    for i, block in enumerate(second_pass_cleaned):
        text_preview = block.text.replace("\n", " ")[:60]
        print(f"  Block {i+1}: {block.start_time}-{block.end_time}ms | {text_preview}")

    # Should have same number as original dialogue
    assert len(second_pass_cleaned) == len(original_blocks), \
        f"Failed to strip plot blocks! Expected {len(original_blocks)}, got {len(second_pass_cleaned)}"

    print("\n✅ Idempotency verified - plot blocks correctly stripped")
    print("=" * 60)


if __name__ == "__main__":
    test_timing_preservation()
    test_edge_case_early_subtitle()
    test_idempotency()
    print("\n🎉 All tests passed! Zero timing drift guaranteed.")
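The three checks above follow pytest's test_* naming convention, so they can be collected by a test runner as well as executed directly as a script. A minimal sketch of a programmatic invocation, assuming the module is saved as tests/test_timing_drift.py (the actual filename is not visible in this diff):

import pytest  # optional runner; the module also works as a plain script

# File name assumed for illustration only.
raise SystemExit(pytest.main(["-s", "tests/test_timing_drift.py"]))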