Initial commit

ponzischeme89
2026-01-17 21:49:22 +13:00
commit 3ad3d9bfe0
118 changed files with 18586 additions and 0 deletions
Binary file not shown.
+1376
File diff suppressed because it is too large.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
+67
@@ -0,0 +1,67 @@
"""
Configuration manager - handles settings persistence
"""
import json
import logging
from pathlib import Path
logging.basicConfig(level=logging.INFO)
class ConfigManager:
"""Manages application settings with JSON persistence"""
def __init__(self, file_path="settings.json"):
self.file_path = Path(file_path)
self.settings = {
"api_key": "",
"default_directory": "",
"duration": 40,
"cleaning_patterns": [
r"\(\d{4}\)",
r"\[\d{4}\]",
r"\.\d{4}\.",
r"\.(19|20)\d{2}",
r"\.[a-z]{2,3}\b",
r"\.(?:720p|1080p|2160p|480p|HDRip|BRRip|BluRay|WEBRip|WEB-DL)",
r"\.(?:x264|x265|HEVC|AAC|AC3|DTS)"
]
}
self.load_settings()
def load_settings(self):
"""Load settings from disk"""
if self.file_path.exists():
try:
with open(self.file_path, "r") as f:
self.settings.update(json.load(f))
logging.info("Settings loaded successfully")
except Exception as e:
logging.error(f"Error loading settings: {e}")
def save_settings(self):
"""Save settings to disk"""
try:
with open(self.file_path, "w") as f:
json.dump(self.settings, f, indent=2)
logging.info("Settings saved successfully")
except Exception as e:
logging.error(f"Error saving settings: {e}")
def get(self, key, default=None):
"""Get a setting value"""
return self.settings.get(key, default)
def set(self, key, value):
"""Set a setting value"""
self.settings[key] = value
logging.info(f"Setting updated: {key}")
def get_all(self):
"""Get all settings"""
return self.settings.copy()
def update_multiple(self, updates):
"""Update multiple settings at once"""
self.settings.update(updates)
logging.info(f"Updated {len(updates)} settings")
+828
@@ -0,0 +1,828 @@
"""
Database layer using SQLAlchemy with SQLite
Handles persistent storage for settings, runs, and history
"""
from datetime import datetime
from pathlib import Path
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, Float, Text, ForeignKey
from sqlalchemy.orm import declarative_base, sessionmaker, relationship, scoped_session
import json
import logging
logger = logging.getLogger(__name__)
Base = declarative_base()
class Settings(Base):
"""Settings table - stores application configuration"""
__tablename__ = 'settings'
id = Column(Integer, primary_key=True)
key = Column(String(100), unique=True, nullable=False, index=True)
value = Column(Text, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
def __repr__(self):
return f"<Settings(key='{self.key}', value='{self.value}')>"
class ProcessingRun(Base):
"""Processing runs table - stores each batch processing session"""
__tablename__ = 'processing_runs'
id = Column(Integer, primary_key=True)
started_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
completed_at = Column(DateTime)
total_files = Column(Integer, default=0)
successful_files = Column(Integer, default=0)
failed_files = Column(Integer, default=0)
duration_seconds = Column(Float)
status = Column(String(50), default='in_progress') # in_progress, completed, failed
# Relationship to file results
file_results = relationship("FileResult", back_populates="run", cascade="all, delete-orphan")
def __repr__(self):
return f"<ProcessingRun(id={self.id}, started_at='{self.started_at}', status='{self.status}')>"
class FileResult(Base):
"""File results table - stores individual file processing results"""
__tablename__ = 'file_results'
id = Column(Integer, primary_key=True)
run_id = Column(Integer, ForeignKey('processing_runs.id'), nullable=False, index=True)
file_path = Column(String(500), nullable=False, index=True)
file_name = Column(String(255), nullable=False)
success = Column(Boolean, default=False)
status = Column(String(100))
summary = Column(Text)
error_message = Column(Text)
processed_at = Column(DateTime, default=datetime.utcnow, nullable=False)
duration = Column(Integer) # subtitle duration in seconds
# Relationship back to run
run = relationship("ProcessingRun", back_populates="file_results")
def __repr__(self):
return f"<FileResult(id={self.id}, file_name='{self.file_name}', success={self.success})>"
class ScanHistory(Base):
"""Scan history table - stores directory scan history"""
__tablename__ = 'scan_history'
id = Column(Integer, primary_key=True)
directory = Column(String(500), nullable=False, index=True)
scanned_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
files_found = Column(Integer, default=0)
files_with_plot = Column(Integer, default=0)
scan_duration_ms = Column(Integer)
def __repr__(self):
return f"<ScanHistory(id={self.id}, directory='{self.directory}', files_found={self.files_found})>"
class ScheduledScan(Base):
"""Scheduled scans table - stores scheduled scan jobs and results"""
__tablename__ = 'scheduled_scans'
id = Column(Integer, primary_key=True)
directory = Column(String(500), nullable=False, index=True)
scheduled_for = Column(DateTime, nullable=False, index=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
started_at = Column(DateTime)
completed_at = Column(DateTime)
status = Column(String(50), default='scheduled', index=True) # scheduled, running, completed, cancelled, failed
files_found = Column(Integer, default=0)
files_with_plot = Column(Integer, default=0)
scan_duration_ms = Column(Integer)
error_message = Column(Text)
def __repr__(self):
return f"<ScheduledScan(id={self.id}, directory='{self.directory}', status='{self.status}')>"
class ApiUsage(Base):
"""API usage tracking table - monitors API calls per integration"""
__tablename__ = 'api_usage'
id = Column(Integer, primary_key=True)
provider = Column(String(50), nullable=False, index=True) # omdb, tmdb, tvmaze
endpoint = Column(String(200)) # Specific endpoint called
timestamp = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
success = Column(Boolean, default=True)
response_time_ms = Column(Integer)
call_count = Column(Integer, default=1) # Number of API calls in this batch
def __repr__(self):
return f"<ApiUsage(id={self.id}, provider='{self.provider}', calls={self.call_count}, timestamp='{self.timestamp}')>"
class SuggestedMatch(Base):
"""Suggested matches table - stores auto-matched titles for files"""
__tablename__ = 'suggested_matches'
id = Column(Integer, primary_key=True)
file_path = Column(String(500), nullable=False, unique=True, index=True)
file_name = Column(String(255), nullable=False)
matched_title = Column(String(255), nullable=False)
matched_year = Column(String(10))
matched_imdb_id = Column(String(50))
match_data = Column(Text) # JSON blob with full match data
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
def __repr__(self):
return f"<SuggestedMatch(id={self.id}, file_name='{self.file_name}', matched_title='{self.matched_title}')>"
class DatabaseManager:
"""Manages database connections and operations"""
def __init__(self, db_path="sublogue.db"):
"""Initialize database connection"""
self.db_path = Path(db_path)
self.engine = create_engine(f'sqlite:///{self.db_path}', echo=False)
self.Session = scoped_session(sessionmaker(bind=self.engine))
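        # scoped_session hands each thread its own session, so worker threads
        # (scans, scheduled jobs) can call these methods concurrently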
# Create tables if they don't exist
Base.metadata.create_all(self.engine)
logger.info(f"Database initialized at {self.db_path}")
def get_session(self):
"""Get a new database session"""
return self.Session()
def close_session(self):
"""Close the session"""
self.Session.remove()
# ============ SETTINGS OPERATIONS ============
def get_setting(self, key, default=None):
"""Get a setting value"""
session = self.get_session()
try:
setting = session.query(Settings).filter_by(key=key).first()
if setting:
try:
return json.loads(setting.value)
except json.JSONDecodeError:
return setting.value
return default
finally:
session.close()
def set_setting(self, key, value):
"""Set a setting value"""
session = self.get_session()
try:
setting = session.query(Settings).filter_by(key=key).first()
            # Always store values as JSON so they round-trip with their
            # original types (a raw "true" would otherwise load back as a bool)
            json_value = json.dumps(value)
if setting:
setting.value = json_value
setting.updated_at = datetime.utcnow()
else:
setting = Settings(key=key, value=json_value)
session.add(setting)
session.commit()
logger.info(f"Setting updated: {key}")
except Exception as e:
session.rollback()
logger.error(f"Error setting value: {e}")
raise
finally:
session.close()
def get_all_settings(self):
"""Get all settings as a dictionary"""
session = self.get_session()
try:
settings = session.query(Settings).all()
result = {}
for setting in settings:
try:
result[setting.key] = json.loads(setting.value)
except json.JSONDecodeError:
result[setting.key] = setting.value
return result
finally:
session.close()
def update_settings(self, settings_dict):
"""Update multiple settings at once"""
for key, value in settings_dict.items():
self.set_setting(key, value)
# ============ PROCESSING RUN OPERATIONS ============
def create_run(self, total_files):
"""Create a new processing run"""
session = self.get_session()
try:
run = ProcessingRun(
total_files=total_files,
status='in_progress'
)
session.add(run)
session.commit()
run_id = run.id
logger.info(f"Created processing run {run_id}")
return run_id
except Exception as e:
session.rollback()
logger.error(f"Error creating run: {e}")
raise
finally:
session.close()
def complete_run(self, run_id, successful_files, failed_files):
"""Mark a run as completed"""
session = self.get_session()
try:
run = session.query(ProcessingRun).get(run_id)
if run:
run.completed_at = datetime.utcnow()
run.successful_files = successful_files
run.failed_files = failed_files
run.status = 'completed'
if run.started_at:
duration = (run.completed_at - run.started_at).total_seconds()
run.duration_seconds = duration
session.commit()
logger.info(f"Completed run {run_id}")
except Exception as e:
session.rollback()
logger.error(f"Error completing run: {e}")
raise
finally:
session.close()
def add_file_result(self, run_id, file_path, success, status, summary="", error_message="", duration=40):
"""Add a file processing result"""
session = self.get_session()
try:
result = FileResult(
run_id=run_id,
file_path=file_path,
file_name=Path(file_path).name,
success=success,
status=status,
summary=summary,
error_message=error_message,
duration=duration
)
session.add(result)
session.commit()
except Exception as e:
session.rollback()
logger.error(f"Error adding file result: {e}")
raise
finally:
session.close()
def get_run_history(self, limit=50):
"""Get processing run history"""
session = self.get_session()
try:
runs = session.query(ProcessingRun).order_by(
ProcessingRun.started_at.desc()
).limit(limit).all()
result = []
for run in runs:
result.append({
'id': run.id,
'started_at': run.started_at.isoformat() if run.started_at else None,
'completed_at': run.completed_at.isoformat() if run.completed_at else None,
'total_files': run.total_files,
'successful_files': run.successful_files,
'failed_files': run.failed_files,
'duration_seconds': run.duration_seconds,
'status': run.status
})
return result
finally:
session.close()
def get_run_details(self, run_id):
"""Get detailed information about a specific run"""
session = self.get_session()
try:
run = session.query(ProcessingRun).get(run_id)
if not run:
return None
file_results = []
for result in run.file_results:
file_results.append({
'id': result.id,
'file_path': result.file_path,
'file_name': result.file_name,
'success': result.success,
'status': result.status,
'summary': result.summary,
'error_message': result.error_message,
'processed_at': result.processed_at.isoformat() if result.processed_at else None,
'duration': result.duration
})
return {
'id': run.id,
'started_at': run.started_at.isoformat() if run.started_at else None,
'completed_at': run.completed_at.isoformat() if run.completed_at else None,
'total_files': run.total_files,
'successful_files': run.successful_files,
'failed_files': run.failed_files,
'duration_seconds': run.duration_seconds,
'status': run.status,
'file_results': file_results
}
finally:
session.close()
# ============ SCAN HISTORY OPERATIONS ============
def add_scan_history(self, directory, files_found, files_with_plot, scan_duration_ms):
"""Add a scan history entry"""
session = self.get_session()
try:
scan = ScanHistory(
directory=directory,
files_found=files_found,
files_with_plot=files_with_plot,
scan_duration_ms=scan_duration_ms
)
session.add(scan)
session.commit()
logger.info(f"Scan history saved for {directory}")
except Exception as e:
session.rollback()
logger.error(f"Error saving scan history: {e}")
raise
finally:
session.close()
def get_scan_history(self, limit=50):
"""Get scan history"""
session = self.get_session()
try:
scans = session.query(ScanHistory).order_by(
ScanHistory.scanned_at.desc()
).limit(limit).all()
result = []
for scan in scans:
result.append({
'id': scan.id,
'directory': scan.directory,
'scanned_at': scan.scanned_at.isoformat() if scan.scanned_at else None,
'files_found': scan.files_found,
'files_with_plot': scan.files_with_plot,
'scan_duration_ms': scan.scan_duration_ms
})
return result
finally:
session.close()
# ============ SCHEDULED SCAN OPERATIONS ============
def create_scheduled_scan(self, directory, scheduled_for):
"""Create a scheduled scan entry"""
session = self.get_session()
try:
scheduled = ScheduledScan(
directory=directory,
scheduled_for=scheduled_for,
status='scheduled'
)
session.add(scheduled)
session.commit()
return scheduled.id
except Exception as e:
session.rollback()
logger.error(f"Error creating scheduled scan: {e}")
raise
finally:
session.close()
def get_scheduled_scans(self, limit=50, status=None):
"""Get scheduled scans"""
session = self.get_session()
try:
query = session.query(ScheduledScan)
if status:
query = query.filter_by(status=status)
scans = query.order_by(ScheduledScan.scheduled_for.desc()).limit(limit).all()
result = []
for scan in scans:
result.append({
'id': scan.id,
'directory': scan.directory,
'scheduled_for': scan.scheduled_for.isoformat() if scan.scheduled_for else None,
'created_at': scan.created_at.isoformat() if scan.created_at else None,
'started_at': scan.started_at.isoformat() if scan.started_at else None,
'completed_at': scan.completed_at.isoformat() if scan.completed_at else None,
'status': scan.status,
'files_found': scan.files_found,
'files_with_plot': scan.files_with_plot,
'scan_duration_ms': scan.scan_duration_ms,
'error_message': scan.error_message
})
return result
finally:
session.close()
def get_scheduled_scan(self, scan_id):
"""Get a single scheduled scan by id"""
session = self.get_session()
try:
scan = session.query(ScheduledScan).get(scan_id)
if not scan:
return None
return {
'id': scan.id,
'directory': scan.directory,
'scheduled_for': scan.scheduled_for.isoformat() if scan.scheduled_for else None,
'created_at': scan.created_at.isoformat() if scan.created_at else None,
'started_at': scan.started_at.isoformat() if scan.started_at else None,
'completed_at': scan.completed_at.isoformat() if scan.completed_at else None,
'status': scan.status,
'files_found': scan.files_found,
'files_with_plot': scan.files_with_plot,
'scan_duration_ms': scan.scan_duration_ms,
'error_message': scan.error_message
}
finally:
session.close()
def get_due_scheduled_scans(self, now):
"""Get scheduled scans that are due to run"""
session = self.get_session()
try:
scans = session.query(ScheduledScan).filter(
ScheduledScan.status == 'scheduled',
ScheduledScan.scheduled_for <= now
).order_by(ScheduledScan.scheduled_for.asc()).all()
return [scan.id for scan in scans]
finally:
session.close()
def mark_scheduled_scan_running(self, scan_id, started_at=None):
"""Mark a scheduled scan as running"""
session = self.get_session()
try:
scan = session.query(ScheduledScan).get(scan_id)
if not scan or scan.status != 'scheduled':
return False
scan.status = 'running'
scan.started_at = started_at or datetime.utcnow()
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error marking scheduled scan running: {e}")
return False
finally:
session.close()
def complete_scheduled_scan(self, scan_id, files_found, files_with_plot, scan_duration_ms, completed_at=None):
"""Mark a scheduled scan as completed"""
session = self.get_session()
try:
scan = session.query(ScheduledScan).get(scan_id)
if not scan:
return False
scan.status = 'completed'
scan.completed_at = completed_at or datetime.utcnow()
scan.files_found = files_found
scan.files_with_plot = files_with_plot
scan.scan_duration_ms = scan_duration_ms
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error completing scheduled scan: {e}")
return False
finally:
session.close()
def fail_scheduled_scan(self, scan_id, error_message, completed_at=None):
"""Mark a scheduled scan as failed"""
session = self.get_session()
try:
scan = session.query(ScheduledScan).get(scan_id)
if not scan:
return False
scan.status = 'failed'
scan.completed_at = completed_at or datetime.utcnow()
scan.error_message = error_message
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error failing scheduled scan: {e}")
return False
finally:
session.close()
def cancel_scheduled_scan(self, scan_id):
"""Cancel a scheduled scan"""
session = self.get_session()
try:
scan = session.query(ScheduledScan).get(scan_id)
if not scan or scan.status != 'scheduled':
return False
scan.status = 'cancelled'
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error cancelling scheduled scan: {e}")
return False
finally:
session.close()
# ============ API USAGE OPERATIONS ============
def track_api_call(self, provider, endpoint=None, success=True, response_time_ms=None, call_count=1):
"""Track API call(s) for usage monitoring
Args:
provider: API provider name (omdb, tmdb, tvmaze)
endpoint: Specific endpoint called
success: Whether the call(s) succeeded
response_time_ms: Total response time in milliseconds
call_count: Number of API calls made (for batched operations)
"""
session = self.get_session()
try:
usage = ApiUsage(
provider=provider,
endpoint=endpoint,
success=success,
response_time_ms=response_time_ms,
call_count=call_count
)
session.add(usage)
session.commit()
logger.debug(f"Tracked {call_count} API call(s) to {provider}")
except Exception as e:
session.rollback()
logger.error(f"Error tracking API call: {e}")
finally:
session.close()
def get_api_usage_24h(self, provider):
"""Get API call count for the last 24 hours (sums call_count field)"""
session = self.get_session()
try:
from datetime import timedelta
from sqlalchemy import func
cutoff_time = datetime.utcnow() - timedelta(hours=24)
# Sum the call_count column to get total API calls
result = session.query(func.coalesce(func.sum(ApiUsage.call_count), 0)).filter(
ApiUsage.provider == provider,
ApiUsage.timestamp >= cutoff_time
).scalar()
return result or 0
finally:
session.close()
def check_api_limit(self, provider, limit=1000):
"""Check if API usage is under the limit"""
current_usage = self.get_api_usage_24h(provider)
return {
'under_limit': current_usage < limit,
'current_usage': current_usage,
'limit': limit,
'remaining': max(0, limit - current_usage)
}
def get_usage_stats(self, provider):
"""Get detailed usage statistics for a provider"""
session = self.get_session()
try:
from datetime import timedelta
cutoff_time = datetime.utcnow() - timedelta(hours=24)
# Get 24h usage records
usage_24h = session.query(ApiUsage).filter(
ApiUsage.provider == provider,
ApiUsage.timestamp >= cutoff_time
).all()
# Sum call_count for accurate totals
total_calls = sum(u.call_count or 1 for u in usage_24h)
successful_calls = sum(u.call_count or 1 for u in usage_24h if u.success)
failed_calls = total_calls - successful_calls
            # Average response time weighted by call count, computed only over
            # records that actually recorded a time
            timed = [u for u in usage_24h if u.response_time_ms is not None]
            timed_calls = sum(u.call_count or 1 for u in timed)
            weighted_times = [u.response_time_ms * (u.call_count or 1) for u in timed]
            avg_response_time = sum(weighted_times) / timed_calls if timed_calls else 0
# Find oldest call in 24h window to calculate reset time
oldest_call = min([u.timestamp for u in usage_24h]) if usage_24h else datetime.utcnow()
reset_time = oldest_call + timedelta(hours=24)
return {
'provider': provider,
'total_calls_24h': total_calls,
'successful_calls': successful_calls,
'failed_calls': failed_calls,
'avg_response_time_ms': round(avg_response_time, 2),
'reset_time': reset_time.isoformat(),
'limit': 1000,
'remaining': max(0, 1000 - total_calls)
}
finally:
session.close()
def get_all_usage_stats(self):
"""Get usage statistics for all providers"""
providers = ['omdb', 'tmdb', 'tvmaze']
return {provider: self.get_usage_stats(provider) for provider in providers}
# ============ SUGGESTED MATCHES OPERATIONS ============
def save_suggested_match(self, file_path, file_name, match_data):
"""Save or update a suggested match for a file"""
session = self.get_session()
try:
# Check if match already exists
existing = session.query(SuggestedMatch).filter_by(file_path=file_path).first()
if existing:
# Update existing match
existing.matched_title = match_data.get('title', '')
existing.matched_year = match_data.get('year', '')
existing.matched_imdb_id = match_data.get('imdb_id', '')
existing.match_data = json.dumps(match_data)
existing.updated_at = datetime.utcnow()
else:
# Create new match
new_match = SuggestedMatch(
file_path=file_path,
file_name=file_name,
matched_title=match_data.get('title', ''),
matched_year=match_data.get('year', ''),
matched_imdb_id=match_data.get('imdb_id', ''),
match_data=json.dumps(match_data)
)
session.add(new_match)
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error saving suggested match: {e}")
return False
finally:
session.close()
def get_suggested_match(self, file_path):
"""Get suggested match for a specific file"""
session = self.get_session()
try:
match = session.query(SuggestedMatch).filter_by(file_path=file_path).first()
if match:
return {
'file_path': match.file_path,
'file_name': match.file_name,
'matched_title': match.matched_title,
'matched_year': match.matched_year,
'matched_imdb_id': match.matched_imdb_id,
'match_data': json.loads(match.match_data) if match.match_data else {},
'created_at': match.created_at.isoformat(),
'updated_at': match.updated_at.isoformat()
}
return None
finally:
session.close()
def get_suggested_matches_for_directory(self, directory):
"""Get all suggested matches for files in a directory"""
session = self.get_session()
try:
matches = session.query(SuggestedMatch).filter(
SuggestedMatch.file_path.like(f"{directory}%")
).all()
return {
match.file_path: json.loads(match.match_data) if match.match_data else {}
for match in matches
}
finally:
session.close()
def delete_suggested_match(self, file_path):
"""Delete a suggested match for a file"""
session = self.get_session()
try:
match = session.query(SuggestedMatch).filter_by(file_path=file_path).first()
if match:
session.delete(match)
session.commit()
return True
return False
except Exception as e:
session.rollback()
logger.error(f"Error deleting suggested match: {e}")
return False
finally:
session.close()
def clear_all_suggested_matches(self):
"""Clear all suggested matches"""
session = self.get_session()
try:
session.query(SuggestedMatch).delete()
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error clearing suggested matches: {e}")
return False
finally:
session.close()
# ============ MAINTENANCE OPERATIONS ============
def clear_settings(self, keep_api_keys=False):
"""Clear all settings, optionally keeping API keys"""
session = self.get_session()
try:
keep_values = {}
if keep_api_keys:
for key in ("omdb_api_key", "tmdb_api_key", "api_key"):
setting = session.query(Settings).filter_by(key=key).first()
if setting:
keep_values[key] = setting.value
session.query(Settings).delete()
session.commit()
if keep_values:
for key, value in keep_values.items():
session.add(Settings(key=key, value=value))
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error clearing settings: {e}")
return False
finally:
session.close()
def clear_history_and_logs(self):
"""Clear processing runs, scan history, scheduled scans, and API usage logs"""
session = self.get_session()
try:
session.query(FileResult).delete()
session.query(ProcessingRun).delete()
session.query(ScanHistory).delete()
session.query(ScheduledScan).delete()
session.query(ApiUsage).delete()
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error clearing history and logs: {e}")
return False
finally:
session.close()
# ============ STATISTICS ============
def get_statistics(self):
"""Get overall statistics"""
session = self.get_session()
try:
total_runs = session.query(ProcessingRun).count()
completed_runs = session.query(ProcessingRun).filter_by(status='completed').count()
total_files_processed = session.query(FileResult).count()
successful_files = session.query(FileResult).filter_by(success=True).count()
return {
'total_runs': total_runs,
'completed_runs': completed_runs,
'total_files_processed': total_files_processed,
'successful_files': successful_files,
'failed_files': total_files_processed - successful_files
}
finally:
session.close()
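# Illustrative usage (a sketch; paths and counts are examples):
#     db = DatabaseManager("sublogue.db")
#     run_id = db.create_run(total_files=10)
#     db.add_file_result(run_id, "/media/movie.srt", True, "Has Plot")
#     db.complete_run(run_id, successful_files=9, failed_files=1)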
+302
@@ -0,0 +1,302 @@
import logging
import os
import re
from pathlib import Path
from typing import Generator, List, Dict
# ------------------------------------------------------------
# Logging configuration
# ------------------------------------------------------------
logger = logging.getLogger("FileScanner")
logger.setLevel(logging.INFO) # Change to DEBUG for deep tracing
if not logger.handlers:  # avoid duplicate handlers if the module is imported twice
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)
# ------------------------------------------------------------
# Import subtitle parser
# ------------------------------------------------------------
import sys
sys.path.insert(0, str(Path(__file__).parent))
from subtitle_processor import parse_srt
class FileScanner:
"""
Efficient, disk-friendly subtitle scanner.
- Recursive (os.scandir-based)
- Streams file reads
- Batches results
- Extensive logging for observability
"""
SUPPORTED_EXTENSIONS = {".srt"}
MAX_FILE_SIZE_BYTES = 5 * 1024 * 1024 # 5 MB
PLOT_SCAN_LINES = 50
DEFAULT_BATCH_SIZE = 100
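    # _check_has_plot reads at most PLOT_SCAN_LINES lines per file, and files
    # over MAX_FILE_SIZE_BYTES are skipped outright, keeping scans disk-friendly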
# --------------------------------------------------------
# Public API
# --------------------------------------------------------
@classmethod
def scan_directory(
cls,
directory_path: str | Path,
batch_size: int = DEFAULT_BATCH_SIZE,
follow_symlinks: bool = False,
) -> Generator[List[Dict], None, None]:
"""
Recursively scan a directory tree for .srt files.
Yields batches of metadata dictionaries.
"""
root = Path(directory_path)
logger.info("Starting subtitle scan")
logger.info("Root directory : %s", root)
logger.info("Batch size : %s", batch_size)
logger.info("Follow symlinks : %s", follow_symlinks)
if not root.exists():
logger.error("Scan failed: path does not exist (%s)", root)
raise ValueError(f"Directory does not exist: {directory_path}")
if not root.is_dir():
logger.error("Scan failed: not a directory (%s)", root)
raise ValueError(f"Invalid directory: {directory_path}")
batch: List[Dict] = []
total_seen = 0
total_srt = 0
total_skipped = 0
for file_path in cls._walk_files(root, follow_symlinks):
total_seen += 1
if file_path.suffix.lower() not in cls.SUPPORTED_EXTENSIONS:
logger.debug("Ignoring non-subtitle file: %s", file_path)
continue
total_srt += 1
logger.debug("Found subtitle file: %s", file_path)
# --------------------------------------------
# Stat / size guard
# --------------------------------------------
try:
stat = file_path.stat()
except OSError as e:
total_skipped += 1
logger.warning(
"Skipping unreadable file: %s (%s)",
file_path, e
)
continue
if stat.st_size > cls.MAX_FILE_SIZE_BYTES:
total_skipped += 1
logger.warning(
"Skipping large subtitle file (%d bytes): %s",
stat.st_size, file_path
)
continue
# --------------------------------------------
# Plot detection
# --------------------------------------------
try:
has_plot = cls._check_has_plot(file_path)
logger.debug(
"Plot check for %s: %s",
file_path.name,
"FOUND" if has_plot else "NOT FOUND"
)
except Exception as e:
total_skipped += 1
logger.error(
"Plot scan failed for %s: %s",
file_path, e
)
continue
metadata = {}
if has_plot:
try:
metadata = cls._extract_metadata(file_path)
logger.debug(
"Extracted metadata from %s: %s",
file_path.name,
{k: v for k, v in metadata.items() if v}
)
except Exception as e:
logger.warning(
"Metadata extraction failed for %s: %s",
file_path.name, e
)
batch.append({
"path": str(file_path),
"name": file_path.name,
"has_plot": has_plot,
"status": "Has Plot" if has_plot else "Not Loaded",
"summary": metadata.get("summary", ""),
"plot": metadata.get("summary", ""),
"title": metadata.get("title"),
"year": metadata.get("year"),
"imdb_rating": metadata.get("imdb_rating"),
"rating": metadata.get("imdb_rating"),
"runtime": metadata.get("runtime"),
"selected": False,
})
if len(batch) >= batch_size:
logger.info(
"Yielding batch (%d items, %d total files scanned)",
len(batch),
total_seen
)
yield batch
batch = []
if batch:
logger.info(
"Yielding final batch (%d items)",
len(batch)
)
yield batch
logger.info("Subtitle scan completed")
logger.info("Files visited : %d", total_seen)
logger.info("Subtitle files found : %d", total_srt)
logger.info("Files skipped : %d", total_skipped)
# --------------------------------------------------------
# Internal helpers
# --------------------------------------------------------
@staticmethod
def _walk_files(root: Path, follow_symlinks: bool):
"""
Fast iterative recursive directory walk using os.scandir.
"""
logger.debug("Beginning recursive walk at %s", root)
stack = [root]
while stack:
current = stack.pop()
logger.debug("Scanning directory: %s", current)
try:
with os.scandir(current) as entries:
for entry in entries:
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
stack.append(Path(entry.path))
elif entry.is_file():
yield Path(entry.path)
except OSError as e:
logger.debug(
"Skipping entry due to OS error: %s (%s)",
entry.path, e
)
except OSError as e:
logger.warning(
"Cannot access directory: %s (%s)",
current, e
)
@classmethod
def _check_has_plot(cls, file_path: Path) -> bool:
"""
Check first N lines for Sublogue signature.
"""
logger.debug("Scanning for plot marker in %s", file_path.name)
try:
with file_path.open("r", encoding="utf-8", errors="ignore") as f:
for i, line in enumerate(f):
if i >= cls.PLOT_SCAN_LINES:
break
if "generated by sublogue" in line.lower():
logger.debug(
"Plot marker found in %s (line %d)",
file_path.name, i + 1
)
return True
except Exception as e:
logger.error(
"Error reading file during plot scan: %s (%s)",
file_path, e
)
return False
@classmethod
def _extract_metadata(cls, file_path: Path) -> Dict:
"""
Extract title, year, rating, runtime, and plot
from Sublogue-generated subtitles.
"""
logger.debug("Extracting metadata from %s", file_path.name)
content = file_path.read_text(encoding="utf-8", errors="ignore")
blocks = parse_srt(content)
metadata = {
"title": None,
"year": None,
"imdb_rating": None,
"runtime": None,
"summary": ""
}
if len(blocks) < 2:
logger.debug("Not enough subtitle blocks for metadata extraction")
return metadata
# --------------------------------------------
# Plot block (index 1)
# --------------------------------------------
plot_text = blocks[1].text
plot_text = plot_text.split("Generated by Sublogue")[0].strip()
metadata["summary"] = plot_text
# --------------------------------------------
# Header block (index 0)
# --------------------------------------------
header_lines = blocks[0].text.split("\n")
if header_lines:
first_line = header_lines[0]
year_match = re.search(r"\((\d{4})\)", first_line)
if year_match:
metadata["year"] = year_match.group(1)
metadata["title"] = first_line[:year_match.start()].strip()
else:
metadata["title"] = first_line.strip()
if len(header_lines) > 1:
second_line = header_lines[1]
rating_match = re.search(r"IMDb:\s*([^\s]+)", second_line)
if rating_match:
metadata["imdb_rating"] = rating_match.group(1)
            # The runtime is assumed to follow the IMDb rating on this line
            runtime_match = re.search(r"IMDb:\s*\S+\s+(.+)", second_line)
            if runtime_match:
                metadata["runtime"] = runtime_match.group(1).strip()
logger.debug("Metadata extracted: %s", metadata)
return metadata
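# Illustrative usage (a sketch; the directory is an example):
#     for batch in FileScanner.scan_directory("/media/subtitles", batch_size=50):
#         for item in batch:
#             print(item["name"], item["status"])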
+501
@@ -0,0 +1,501 @@
"""
Keyword stripper utility - removes common junk keywords from filenames and subtitle content
Optimised for torrent / subtitle garbage while preserving real titles and dialogue
"""
from __future__ import annotations
import re
import logging
from typing import Optional, List
logger = logging.getLogger(__name__)
class KeywordStripper:
"""
High-performance filename cleaner for movies & TV.
Design goals:
- Torrent / subtitle spam annihilation
- Minimal false positives
- Regex compiled once
- Fast enough for large libraries
"""
# -----------------------------
# CORE JUNK PATTERNS
# -----------------------------
QUALITY = r"""
\b(
480p|720p|1080p|2160p|4320p|
4k|8k|
hdr|hdr10|hdr10\+|dolby\s*vision|dv|
bluray|blu[-\s]?ray|bdrip|brrip|bd|
webrip|web[-\s]?dl|web|
dvdrip|dvd|dvdscr|
cam|ts|telesync|telecine|tc|
hdrip|hdlight
)\b
"""
CODECS = r"""
\b(
x264|x265|h\.?264|h\.?265|hevc|
xvid|divx|
aac|ac3|dts|truehd|atmos|
dd5\.1|dd\+|
flac|mp3|opus|
8bit|10bit|hi10p
)\b
"""
TORRENT_GROUPS = r"""
\b(
yts(\.mx)?|yify|
rarbg|eztv|ettv|
psa|ion10|fgт|fgt|
tgx|torrentgalaxy|
1337x|limetorrent|
publichd|scene|
ganool|evo
)\b
"""
SUBTITLE_ADS = r"""
\b(
opensubtitles|
subscene|
addic7ed|
podnapisi|
yifysubtitles|
subtitles?\s*by|
synced?\s*by|
encoded?\s*by|
resynced?\s*by
)\b
|
www\.[a-z0-9\-]+\.(com|org|net)
"""
LANGUAGES = r"""
\b(
eng|english|
ita|italian|
fra|french|
spa|spanish|
ger|german|
multi|dubbed|
vostfr|subfrench|
subs?|subtitles?
)\b
"""
EDITIONS = r"""
\b(
unrated|uncut|
directors?\s*cut|
extended|
theatrical|
imax|
special\s*edition|
limited|
internal|
proper|repack|real
)\b
"""
# -----------------------------
# STRUCTURAL NOISE
# -----------------------------
BRACKETS = r"""
[\[\(\{]
.*?
[\]\)\}]
"""
SEPARATORS = r"[._\-]+"
MULTISPACE = r"\s+"
YEAR_PATTERN = r"(19\d{2}|20\d{2})"
SEASON_EPISODE = [
r"[Ss](\d{1,2})[Ee](\d{1,2})",
r"(\d{1,2})x(\d{1,2})",
r"Season\s*(\d{1,2})\s*Episode\s*(\d{1,2})",
]
# -----------------------------
# SUBTITLE CONTENT ADS/WATERMARKS
# -----------------------------
# These patterns are specifically for cleaning embedded ads from subtitle TEXT
# They're more aggressive than filename patterns since we want to remove entire lines
# Release group watermarks that appear in subtitle text
SUBTITLE_WATERMARKS = [
# YTS and variants
r"yts\.mx",
r"yts\.am",
r"yts\.lt",
r"yts\.ag",
r"\byts\b",
r"\byify\b",
# RARBG and other groups
r"\brarbg\b",
r"\beztv\b",
r"\bettv\b",
r"torrentgalaxy",
r"\btgx\b",
r"1337x",
r"limetorrents?",
r"\bevo\b",
r"\bpsa\b",
r"\bfgt\b",
# Subtitle sites
r"opensubtitles?",
r"subscene",
r"addic7ed",
r"podnapisi",
r"yifysubtitles?",
r"sub\.?scene",
r"legendas\.?tv",
r"shooter\.?cn",
r"subhd",
# Generic patterns
r"downloaded\s+from",
r"subtitles?\s+by",
r"sync(?:ed|hronized)?\s+(?:and\s+)?correct(?:ed|ions?)?\s+by",
r"ripped\s+by",
r"encoded?\s+by",
r"resynce?d?\s+by",
r"improved\s+by",
r"fixed\s+by",
r"translated\s+by",
r"captioned\s+by",
r"support\s+us\s+and",
r"get\s+more\s+subtitles",
r"quality\s+subtitles",
r"best\s+subtitles",
r"free\s+subtitles",
# URLs and domains
r"www\.[a-z0-9\-]+\.(com|org|net|io|tv|mx|am|lt|ag)",
r"https?://[^\s]+",
# Social media handles that are clearly ads
r"@yaborr",
r"@sub_scene",
r"follow\s+us\s+on",
r"join\s+us\s+at",
r"visit\s+us\s+at",
# Promotional text
r"advertise\s+here",
r"membership\s+(is\s+)?free",
r"become\s+a\s+member",
r"register\s+(now|today|free)",
r"sign\s+up\s+(now|today|free)",
]
# Patterns that indicate an ENTIRE subtitle block should be removed
# (not just the matching text, but the whole block)
SUBTITLE_BLOCK_REMOVERS = [
# Pure promotional blocks
r"^[\s\-_]*(?:www\.)?yts",
r"^[\s\-_]*(?:www\.)?rarbg",
r"^[\s\-_]*opensubtitles",
r"^[\s\-_]*subscene",
r"^[\s\-_]*downloaded\s+from",
r"^[\s\-_]*subtitles?\s+by",
r"^[\s\-_]*sync(?:ed)?\s+(?:and\s+)?correct",
r"^[\s\-_]*support\s+us",
r"^[\s\-_]*get\s+(?:more\s+)?subtitles",
r"^[\s\-_]*quality\s+subtitles",
r"^[\s\-_]*advertise",
# ASCII art headers/footers (often used for ads)
r"^[\s\-=_\*]{10,}$",
# Empty after cleaning
r"^\s*$",
]
# -----------------------------
# COMPILED REGEX CACHE
# -----------------------------
_compiled = None
@classmethod
def _compile(cls):
if cls._compiled:
return cls._compiled
def c(p):
return re.compile(p, re.IGNORECASE | re.VERBOSE)
cls._compiled = {
"junk": c("|".join([
cls.QUALITY,
cls.CODECS,
cls.TORRENT_GROUPS,
cls.SUBTITLE_ADS,
cls.LANGUAGES,
cls.EDITIONS,
])),
"brackets": c(cls.BRACKETS),
"separators": re.compile(cls.SEPARATORS),
"multispace": re.compile(cls.MULTISPACE),
"year": re.compile(cls.YEAR_PATTERN),
"season_episode": [re.compile(p, re.IGNORECASE) for p in cls.SEASON_EPISODE],
# Subtitle content cleaning patterns
"subtitle_watermarks": [
re.compile(p, re.IGNORECASE) for p in cls.SUBTITLE_WATERMARKS
],
"subtitle_block_removers": [
re.compile(p, re.IGNORECASE | re.MULTILINE) for p in cls.SUBTITLE_BLOCK_REMOVERS
],
}
return cls._compiled
# -----------------------------
# PUBLIC API
# -----------------------------
def strip_keywords(self, title: str, preserve_year: bool = True) -> str:
rx = self._compile()
original = title
# Extract year early
year: Optional[str] = None
if preserve_year:
m = rx["year"].search(title)
if m:
year = m.group(1)
# Remove obvious junk
cleaned = rx["junk"].sub("", title)
# Remove bracketed junk AFTER stripping known keywords
cleaned = rx["brackets"].sub("", cleaned)
# Normalize separators
cleaned = rx["separators"].sub(" ", cleaned)
cleaned = rx["multispace"].sub(" ", cleaned).strip()
# Re-append year
if preserve_year and year and year not in cleaned:
cleaned = f"{cleaned} ({year})"
logger.debug("KeywordStripper: '%s' -> '%s'", original, cleaned)
return cleaned
def extract_year(self, title: str) -> Optional[str]:
rx = self._compile()
m = rx["year"].search(title)
return m.group(1) if m else None
def extract_season_episode(self, title: str):
rx = self._compile()
for p in rx["season_episode"]:
m = p.search(title)
if m:
return int(m.group(1)), int(m.group(2))
return None, None
def clean_filename(self, filename: str, preserve_year: bool = True) -> dict:
name = re.sub(r"\.[^.]+$", "", filename)
season, episode = self.extract_season_episode(name)
year = self.extract_year(name)
cleaned = self.strip_keywords(name, preserve_year=preserve_year)
return {
"cleaned_title": cleaned,
"year": year,
"season": season,
"episode": episode,
"is_series": season is not None or episode is not None,
}
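    # Illustrative example (hypothetical filename):
    #     clean_filename("The.Matrix.1999.1080p.BluRay.x264-YIFY.srt")
    #     -> {"cleaned_title": "The Matrix 1999", "year": "1999",
    #         "season": None, "episode": None, "is_series": False}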
# -----------------------------
# SUBTITLE CONTENT CLEANING
# -----------------------------
def should_remove_subtitle_block(self, text: str) -> bool:
"""
Check if an entire subtitle block should be removed.
Returns True if the block is purely promotional/ad content
with no legitimate dialogue.
Args:
text: The subtitle text content
Returns:
True if block should be removed entirely
"""
rx = self._compile()
# Check each line of the subtitle
lines = text.strip().split('\n')
non_ad_lines = 0
for line in lines:
line = line.strip()
if not line:
continue
# Check if this line matches any block remover pattern
is_ad_line = False
for pattern in rx["subtitle_block_removers"]:
if pattern.search(line):
is_ad_line = True
break
# Also check watermarks - if the entire line is just a watermark
if not is_ad_line:
temp_line = line
for pattern in rx["subtitle_watermarks"]:
temp_line = pattern.sub("", temp_line)
# If after removing watermarks, line is empty or just punctuation
temp_line = re.sub(r'[\s\-_\.\,\!\?\:\;]+', '', temp_line)
if not temp_line:
is_ad_line = True
if not is_ad_line:
non_ad_lines += 1
# If no legitimate content remains, remove the block
return non_ad_lines == 0
def clean_subtitle_text(self, text: str) -> str:
"""
Clean watermarks and ads from subtitle text while preserving dialogue.
This is more surgical than should_remove_subtitle_block() - it removes
specific ad text but keeps the rest of the subtitle intact.
Args:
text: The subtitle text content
Returns:
Cleaned text with ads removed, or empty string if nothing remains
"""
rx = self._compile()
original = text
# Process line by line to handle multi-line subtitles
lines = text.split('\n')
cleaned_lines = []
for line in lines:
cleaned_line = line
# Remove watermark patterns
for pattern in rx["subtitle_watermarks"]:
cleaned_line = pattern.sub("", cleaned_line)
# Clean up resulting whitespace
cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
# Only keep lines that have content after cleaning
if cleaned_line:
cleaned_lines.append(cleaned_line)
result = '\n'.join(cleaned_lines)
# Final cleanup - remove lines that are just punctuation/dashes
result_lines = result.split('\n')
result_lines = [l for l in result_lines if re.search(r'[a-zA-Z0-9]', l)]
result = '\n'.join(result_lines)
if result != original:
logger.debug("Cleaned subtitle text: '%s' -> '%s'", original[:50], result[:50])
return result
def clean_subtitle_blocks(self, blocks: List[dict]) -> List[dict]:
"""
Clean a list of subtitle blocks, removing ads and watermarks.
This processes each block:
1. Checks if the entire block should be removed (pure ad content)
2. If not, cleans watermarks from the text
Args:
blocks: List of subtitle block dicts with 'text' key
Returns:
Cleaned list with ad blocks removed and watermarks stripped
"""
cleaned = []
removed_count = 0
modified_count = 0
for block in blocks:
text = block.get("text", "")
# Check if entire block should be removed
if self.should_remove_subtitle_block(text):
removed_count += 1
logger.debug("Removing ad block: '%s'", text[:50])
continue
# Clean the text
cleaned_text = self.clean_subtitle_text(text)
# Skip if cleaning resulted in empty text
if not cleaned_text.strip():
removed_count += 1
continue
# Track if we modified the text
if cleaned_text != text:
modified_count += 1
# Create new block with cleaned text
cleaned_block = block.copy()
cleaned_block["text"] = cleaned_text
cleaned.append(cleaned_block)
if removed_count > 0 or modified_count > 0:
logger.info(
"Subtitle cleaning: removed %d ad blocks, modified %d blocks",
removed_count, modified_count
)
return cleaned
# -----------------------------
# SINGLETON HELPERS
# -----------------------------
_default_stripper: Optional[KeywordStripper] = None
def get_stripper() -> KeywordStripper:
global _default_stripper
if _default_stripper is None:
_default_stripper = KeywordStripper()
return _default_stripper
def clean_title(title: str, preserve_year: bool = True) -> str:
return get_stripper().strip_keywords(title, preserve_year)
def clean_filename(filename: str, preserve_year: bool = True) -> dict:
return get_stripper().clean_filename(filename, preserve_year)
def clean_subtitle_content(text: str) -> str:
"""Clean watermarks and ads from subtitle text."""
return get_stripper().clean_subtitle_text(text)
def should_remove_subtitle(text: str) -> bool:
"""Check if a subtitle block should be removed entirely (pure ad)."""
return get_stripper().should_remove_subtitle_block(text)
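# Illustrative example (hypothetical ad block):
#     should_remove_subtitle("Downloaded from\nwww.YTS.MX")  # -> True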
+243
@@ -0,0 +1,243 @@
"""
OMDb API client - async movie metadata fetching
Includes:
- Rate limiting (requests / second)
- Concurrency limiting
- Request de-duplication
- Shared aiohttp session
"""
from __future__ import annotations
import asyncio
import aiohttp
import logging
import time
from typing import Dict, Optional
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RateLimiter:
"""
Simple async rate limiter (token bucket-ish).
Limits how often requests may start.
"""
    def __init__(self, rate_per_second: float):
        if rate_per_second <= 0:
            raise ValueError("rate_per_second must be positive")
        self._interval = 1.0 / rate_per_second
        self._lock = asyncio.Lock()
        self._last_call = 0.0
async def wait(self):
async with self._lock:
now = time.monotonic()
delta = now - self._last_call
if delta < self._interval:
await asyncio.sleep(self._interval - delta)
self._last_call = time.monotonic()
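    # e.g. rate_per_second=2.0 -> at most one request start every 0.5 s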
class OMDbClient:
"""Async, rate-limited OMDb client"""
BASE_URL = "https://www.omdbapi.com/"
def __init__(
self,
api_key: str,
*,
max_concurrent: int = 5,
rate_limit_per_sec: float = 2.0,
db_manager=None,
timeout: int = 15,
):
self.api_key = api_key
self.db_manager = db_manager
self.semaphore = asyncio.Semaphore(max_concurrent)
self.rate_limiter = RateLimiter(rate_limit_per_sec)
self._session: Optional[aiohttp.ClientSession] = None
self._inflight: Dict[str, asyncio.Future] = {}
self._timeout = aiohttp.ClientTimeout(total=timeout)
# -----------------------------
# Session management
# -----------------------------
async def _get_session(self) -> aiohttp.ClientSession:
if self._session is None or self._session.closed:
self._session = aiohttp.ClientSession(timeout=self._timeout)
return self._session
async def close(self):
if self._session and not self._session.closed:
await self._session.close()
# -----------------------------
# Public API
# -----------------------------
async def fetch_summary(
self,
title: str,
media_type: str = "movie",
year: Optional[str] = None,
season: Optional[int] = None,
episode: Optional[int] = None,
) -> Optional[dict]:
"""
Fetch summary from OMDb.
Automatically deduplicates identical in-flight requests.
Args:
title: Movie/show title to search
media_type: "movie" or "series"
year: Year of release (improves match accuracy)
season: Season number (for series)
episode: Episode number (for series)
"""
key = self._make_cache_key(title, media_type, year, season, episode)
# Deduplicate in-flight requests
if key in self._inflight:
logger.debug("Awaiting inflight OMDb request: %s", key)
return await self._inflight[key]
loop = asyncio.get_running_loop()
future = loop.create_future()
self._inflight[key] = future
try:
result = await self._fetch_summary_internal(
title, media_type, year, season, episode
)
future.set_result(result)
return result
except Exception as e:
future.set_exception(e)
raise
finally:
self._inflight.pop(key, None)
# -----------------------------
# Internal fetch logic
# -----------------------------
async def _fetch_summary_internal(
self,
title: str,
media_type: str,
year: Optional[str],
season: Optional[int],
episode: Optional[int],
) -> Optional[dict]:
logger.info("Fetching OMDb summary for: %s (year=%s)", title, year)
async with self.semaphore:
await self.rate_limiter.wait()
params = {
"apikey": self.api_key,
"t": title,
"plot": "short",
}
# Add year for better matching accuracy
if year:
params["y"] = year
if media_type == "series":
params["type"] = "series"
if season is not None:
params["Season"] = season
if episode is not None:
params["Episode"] = episode
session = await self._get_session()
start = time.monotonic()
try:
async with session.get(self.BASE_URL, params=params) as resp:
elapsed_ms = int((time.monotonic() - start) * 1000)
if resp.status != 200:
self._track(False, title, elapsed_ms)
logger.error("OMDb HTTP %s for '%s'", resp.status, title)
return None
data = await resp.json()
if data.get("Response") != "True":
self._track(False, title, elapsed_ms)
logger.warning("OMDb error for '%s': %s", title, data.get("Error"))
return None
self._track(True, title, elapsed_ms)
# Validate year match if year was specified
if year:
returned_year = data.get("Year", "")
# Handle year ranges like "2020-2023" for series
if "-" in returned_year:
returned_year = returned_year.split("-")[0]
if returned_year and returned_year != year:
                            logger.warning(
                                "Year mismatch for '%s': requested %s, got %s ('%s'). Rejecting result.",
                                title, year, returned_year, data.get("Title"),
                            )
return None
return self._parse_response(data)
except asyncio.TimeoutError:
logger.error("OMDb timeout for '%s'", title)
return None
except aiohttp.ClientError as e:
logger.error("OMDb network error for '%s': %s", title, e)
return None
# -----------------------------
# Helpers
# -----------------------------
def _parse_response(self, data: dict) -> dict:
rt_rating = "N/A"
for r in data.get("Ratings", []):
if r.get("Source") == "Rotten Tomatoes":
rt_rating = r.get("Value")
break
return {
"plot": data.get("Plot", "No plot available"),
"rotten_tomatoes": rt_rating,
"title": data.get("Title"),
"year": data.get("Year"),
"media_type": data.get("Type"),
"imdb_rating": data.get("imdbRating", "N/A"),
"runtime": data.get("Runtime", "N/A"),
# Additional metadata fields
"director": data.get("Director", "N/A"),
"actors": data.get("Actors", "N/A"),
"released": data.get("Released", "N/A"),
"genre": data.get("Genre", "N/A"),
}
def _track(self, success: bool, title: str, response_time_ms: int):
if not self.db_manager:
return
self.db_manager.track_api_call(
provider="omdb",
endpoint=f"/?t={title}",
success=success,
response_time_ms=response_time_ms,
)
@staticmethod
def _make_cache_key(title, media_type, year, season, episode) -> str:
return f"{title.lower()}|{media_type}|{year}|{season}|{episode}"
File diff suppressed because it is too large.
+321
@@ -0,0 +1,321 @@
"""
TMDb API client - async movie and TV series metadata fetching
"""
import asyncio
import aiohttp
import logging
import time
logging.basicConfig(level=logging.INFO)
class TMDbClient:
"""Async client for The Movie Database (TMDb) API"""
def __init__(self, api_key, db_manager=None):
self.api_key = api_key
self.base_url = "https://api.themoviedb.org/3"
self.semaphore = asyncio.Semaphore(5) # Limit concurrent requests
self.db_manager = db_manager
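        # Note: each request below opens a fresh aiohttp session; a shared
        # session (as in the OMDb client) would be cheaper for large batches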
async def search_movie(self, title, year=None):
"""
Search for a movie by title
Args:
title: Movie title to search
year: Optional year to narrow search
Returns:
dict: Movie data or None if not found
"""
async with self.semaphore:
url = f"{self.base_url}/search/movie"
params = {
"api_key": self.api_key,
"query": title
}
if year:
params["year"] = year
try:
start_time = time.time()
async with aiohttp.ClientSession() as session:
async with session.get(url, params=params) as response:
response_time_ms = int((time.time() - start_time) * 1000)
if response.status != 200:
logging.error(f"TMDb HTTP error {response.status} for movie '{title}'")
# Track failed API call
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/movie',
success=False,
response_time_ms=response_time_ms
)
return None
data = await response.json()
if data.get("results") and len(data["results"]) > 0:
# Track successful API call
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/movie',
success=True,
response_time_ms=response_time_ms
)
return data["results"][0] # Return first match
logging.warning(f"No TMDb results for movie '{title}'")
# Track failed API call (no results)
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/movie',
success=False,
response_time_ms=response_time_ms
)
return None
except Exception as e:
logging.error(f"Error searching TMDb for movie '{title}': {e}")
return None
async def search_tv(self, title, year=None):
"""
Search for a TV series by title
Args:
title: TV series title to search
year: Optional year to narrow search
Returns:
dict: TV series data or None if not found
"""
async with self.semaphore:
url = f"{self.base_url}/search/tv"
params = {
"api_key": self.api_key,
"query": title
}
if year:
params["first_air_date_year"] = year
try:
start_time = time.time()
async with aiohttp.ClientSession() as session:
async with session.get(url, params=params) as response:
response_time_ms = int((time.time() - start_time) * 1000)
if response.status != 200:
logging.error(f"TMDb HTTP error {response.status} for TV '{title}'")
# Track failed API call
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/tv',
success=False,
response_time_ms=response_time_ms
)
return None
data = await response.json()
if data.get("results") and len(data["results"]) > 0:
# Track successful API call
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/tv',
success=True,
response_time_ms=response_time_ms
)
return data["results"][0] # Return first match
logging.warning(f"No TMDb results for TV series '{title}'")
# Track failed API call (no results)
if self.db_manager:
self.db_manager.track_api_call(
provider='tmdb',
endpoint='/search/tv',
success=False,
response_time_ms=response_time_ms
)
return None
except Exception as e:
logging.error(f"Error searching TMDb for TV '{title}': {e}")
return None
async def get_movie_details(self, movie_id):
"""
Get detailed movie information
Args:
movie_id: TMDb movie ID
Returns:
dict: Detailed movie data
"""
async with self.semaphore:
url = f"{self.base_url}/movie/{movie_id}"
params = {"api_key": self.api_key}
try:
async with aiohttp.ClientSession() as session:
async with session.get(url, params=params) as response:
if response.status != 200:
logging.error(f"TMDb HTTP error {response.status} for movie ID {movie_id}")
return None
return await response.json()
except Exception as e:
logging.error(f"Error getting TMDb movie details for ID {movie_id}: {e}")
return None
async def get_tv_details(self, tv_id):
"""
Get detailed TV series information
Args:
tv_id: TMDb TV series ID
Returns:
dict: Detailed TV series data
"""
async with self.semaphore:
url = f"{self.base_url}/tv/{tv_id}"
params = {"api_key": self.api_key}
try:
async with aiohttp.ClientSession() as session:
async with session.get(url, params=params) as response:
if response.status != 200:
logging.error(f"TMDb HTTP error {response.status} for TV ID {tv_id}")
return None
return await response.json()
except Exception as e:
logging.error(f"Error getting TMDb TV details for ID {tv_id}: {e}")
return None
async def get_tv_season(self, tv_id, season_number):
"""
Get TV season information
Args:
tv_id: TMDb TV series ID
season_number: Season number
Returns:
dict: Season data including episodes
"""
async with self.semaphore:
url = f"{self.base_url}/tv/{tv_id}/season/{season_number}"
params = {"api_key": self.api_key}
try:
async with aiohttp.ClientSession() as session:
async with session.get(url, params=params) as response:
if response.status != 200:
logging.error(f"TMDb HTTP error {response.status} for TV {tv_id} season {season_number}")
return None
return await response.json()
except Exception as e:
logging.error(f"Error getting TMDb season data: {e}")
return None
async def fetch_summary(self, title, media_type="movie", year=None, season=None, episode=None):
"""
Fetch summary for movie or TV series
Args:
title: Title to search
media_type: "movie" or "tv"
year: Optional year
season: Optional season number (for TV)
episode: Optional episode number (for TV)
Returns:
dict: {plot, title, year, media_type, rating} or None if not found
"""
logging.info(f"Fetching TMDb summary for: {title} (type: {media_type})")
try:
if media_type == "tv":
# Search for TV series
search_result = await self.search_tv(title, year)
if not search_result:
return None
tv_id = search_result["id"]
# Get detailed TV info
tv_details = await self.get_tv_details(tv_id)
if not tv_details:
return None
plot = tv_details.get("overview", "No plot available")
# If specific season/episode requested, try to get that plot
if season is not None:
season_data = await self.get_tv_season(tv_id, season)
if season_data and episode is not None:
episodes = season_data.get("episodes", [])
for ep in episodes:
if ep.get("episode_number") == episode:
plot = ep.get("overview", plot)
break
# Get episode runtime (usually consistent across series)
runtime = "N/A"
episode_run_time = tv_details.get("episode_run_time", [])
if episode_run_time and len(episode_run_time) > 0:
runtime = f"{episode_run_time[0]} min"
return {
"plot": plot,
"title": tv_details.get("name", title),
"year": tv_details.get("first_air_date", "")[:4] if tv_details.get("first_air_date") else "N/A",
"media_type": "tv",
"rating": f"{tv_details.get('vote_average', 0):.1f}/10",
"runtime": runtime
}
else: # movie
# Search for movie
search_result = await self.search_movie(title, year)
if not search_result:
return None
movie_id = search_result["id"]
# Get detailed movie info
movie_details = await self.get_movie_details(movie_id)
if not movie_details:
return None
# Get runtime
runtime = "N/A"
if movie_details.get("runtime"):
runtime = f"{movie_details.get('runtime')} min"
return {
"plot": movie_details.get("overview", "No plot available"),
"title": movie_details.get("title", title),
"year": movie_details.get("release_date", "")[:4] if movie_details.get("release_date") else "N/A",
"media_type": "movie",
"rating": f"{movie_details.get('vote_average', 0):.1f}/10",
"runtime": runtime
}
except Exception as e:
logging.error(f"Error fetching TMDb summary for '{title}': {e}")
return None
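# Usage sketch. The client class name and constructor below are assumptions
# (a class named TMDbClient taking an api_key); adjust to the actual
# definitions. fetch_summary is a coroutine, so it needs an event loop:
#
#     import asyncio
#
#     async def demo():
#         client = TMDbClient(api_key="YOUR_TMDB_KEY")  # hypothetical name/signature
#         movie = await client.fetch_summary("Inception", media_type="movie", year="2010")
#         show = await client.fetch_summary("Breaking Bad", media_type="tv", season=1, episode=1)
#         print(movie, show, sep="\n")
#
#     asyncio.run(demo())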
+135
View File
@@ -0,0 +1,135 @@
"""
TVmaze API client - async TV metadata fetching
"""
import aiohttp
import logging
import re
import time
logger = logging.getLogger(__name__)
class TVMazeClient:
"""Async client for the TVmaze API (no API key required)."""
BASE_URL = "https://api.tvmaze.com"
def __init__(self, db_manager=None, timeout=15):
self.db_manager = db_manager
self._timeout = aiohttp.ClientTimeout(total=timeout)
async def fetch_summary(self, title, year=None, season=None, episode=None):
"""
Fetch summary for a TV series (optionally episode-specific).
Args:
title: Series title to search
year: Optional year to validate match
season: Optional season number
            episode: Optional episode number
        Returns:
            dict: {plot, title, year, media_type, imdb_rating, rotten_tomatoes, runtime},
            or None if the show could not be matched
        """
show = await self._fetch_show(title, year)
if not show:
return None
plot = self._strip_html(show.get("summary")) or "No plot available"
if season is not None and episode is not None:
episode_data = await self._fetch_episode(show.get("id"), season, episode)
if episode_data:
episode_summary = self._strip_html(episode_data.get("summary"))
if episode_summary:
plot = episode_summary
rating = show.get("rating", {}).get("average")
imdb_rating = f"{rating:.1f}" if isinstance(rating, (int, float)) else "N/A"
runtime_value = show.get("runtime") or show.get("averageRuntime")
runtime = f"{runtime_value} min" if runtime_value else "N/A"
premiered = show.get("premiered") or ""
year_value = premiered[:4] if premiered else "N/A"
return {
"plot": plot,
"title": show.get("name", title),
"year": year_value,
"media_type": "tv",
"imdb_rating": imdb_rating,
"rotten_tomatoes": "N/A",
"runtime": runtime,
}
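        # rotten_tomatoes is fixed to "N/A" above because TVmaze supplies no
        # Rotten Tomatoes data; the key is kept, presumably for schema parity
        # with the other metadata providers used by the app.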
async def _fetch_show(self, title, year):
params = {"q": title}
url = f"{self.BASE_URL}/singlesearch/shows"
start_time = time.time()
try:
async with aiohttp.ClientSession(timeout=self._timeout) as session:
async with session.get(url, params=params) as response:
response_time_ms = int((time.time() - start_time) * 1000)
if response.status != 200:
self._track(False, "/singlesearch/shows", response_time_ms)
logger.error("TVmaze HTTP %s for '%s'", response.status, title)
return None
data = await response.json()
self._track(True, "/singlesearch/shows", response_time_ms)
if year:
premiered = data.get("premiered") or ""
                        if premiered and premiered[:4] != str(year):
logger.warning(
"TVmaze year mismatch for '%s': requested %s, got %s",
title,
year,
premiered[:4],
)
return None
return data
except Exception as e:
logger.error("TVmaze error for '%s': %s", title, e)
return None
async def _fetch_episode(self, show_id, season, episode):
if not show_id:
return None
params = {"season": season, "number": episode}
url = f"{self.BASE_URL}/shows/{show_id}/episodebynumber"
start_time = time.time()
try:
async with aiohttp.ClientSession(timeout=self._timeout) as session:
async with session.get(url, params=params) as response:
response_time_ms = int((time.time() - start_time) * 1000)
if response.status != 200:
self._track(False, "/shows/{id}/episodebynumber", response_time_ms)
return None
data = await response.json()
self._track(True, "/shows/{id}/episodebynumber", response_time_ms)
return data
except Exception as e:
logger.error("TVmaze episode error for show %s: %s", show_id, e)
return None
def _track(self, success, endpoint, response_time_ms):
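        # Best-effort usage telemetry: forwards to db_manager.track_api_call,
        # and silently no-ops when the client was created without a db_manager.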
if not self.db_manager:
return
self.db_manager.track_api_call(
provider="tvmaze",
endpoint=endpoint,
success=success,
response_time_ms=response_time_ms,
)
@staticmethod
def _strip_html(text):
if not text:
return ""
return re.sub(r"<[^>]+>", "", text).strip()
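# Usage sketch (runs a live request against api.tvmaze.com; no API key needed):
#
#     import asyncio
#
#     async def demo():
#         client = TVMazeClient()  # optionally pass a db_manager for call tracking
#         result = await client.fetch_summary("Breaking Bad", season=1, episode=1)
#         if result:
#             print(result["title"], result["year"], result["plot"][:80])
#
#     asyncio.run(demo())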
+13
View File
@@ -0,0 +1,13 @@
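# Install all dependencies with: pip install -r requirements.txt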
# Core dependencies
flask>=3.0.0
flask-cors>=4.0.0
aiohttp>=3.9.0
# Database
sqlalchemy>=2.0.0
# Environment
python-dotenv>=1.0.0
# Production WSGI server
gunicorn>=21.0.0
Binary file not shown.
+220
View File
@@ -0,0 +1,220 @@
"""
Test to verify zero timing drift guarantee for subtitle processing.
This test demonstrates that after injecting plot metadata:
1. All original subtitle timestamps remain byte-for-byte identical
2. First dialogue text appears at exactly the same timestamp
3. No subtitle blocks are renumbered incorrectly
"""
import sys
import io
# Fix Windows console encoding issues
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
from core.subtitle_processor import parse_srt, build_intro_blocks, strip_existing_plot_blocks, format_srt
# Test SRT content with first subtitle at 10 seconds
test_srt = """1
00:00:10,000 --> 00:00:12,500
Hello sir
2
00:00:13,000 --> 00:00:15,000
How are you today?
3
00:00:16,000 --> 00:00:18,500
I'm doing great, thanks!
"""
# Mock movie data
mock_movie = {
"title": "Test Movie",
"year": "2025",
"imdb_rating": "8.5",
"runtime": "120 min",
"plot": "A thrilling story about testing subtitle timing preservation in automated systems."
}
def test_timing_preservation():
"""Test that original subtitle timing is preserved exactly"""
# Parse original
original_blocks = parse_srt(test_srt)
print("=" * 60)
print("ORIGINAL SUBTITLES")
print("=" * 60)
for block in original_blocks:
print(f"Block {block.index}: {block.start_time}ms - {block.end_time}ms")
text_preview = block.text.replace("\n", " ")[:50]
print(f" Text: {text_preview}...")
# Build intro blocks that fit before first subtitle (10,000 ms)
intro_blocks = build_intro_blocks(
mock_movie,
mock_movie["plot"],
first_subtitle_start_ms=original_blocks[0].start_time,
min_safe_gap_ms=1000
)
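    # min_safe_gap_ms=1000 presumably reserves at least a 1 s buffer before the
    # first dialogue cue; the actual gap is asserted explicitly further down.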
print("\n" + "=" * 60)
print("INJECTED PLOT BLOCKS")
print("=" * 60)
for block in intro_blocks:
print(f"Block: {block.start_time}ms - {block.end_time}ms")
text_preview = block.text.encode('ascii', 'replace').decode('ascii')[:50]
print(f" Text: {text_preview}...")
# Combine and renumber (like the real processor does)
final = intro_blocks + original_blocks
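    # type(block)(...) rebuilds each block via its own class (e.g. a dataclass or
    # namedtuple), assigning fresh 1-based indices without touching timing or text.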
renumbered = [
type(block)(
index=i + 1,
start_time=block.start_time,
end_time=block.end_time,
text=block.text
)
for i, block in enumerate(final)
]
print("\n" + "=" * 60)
print("FINAL OUTPUT (AFTER INJECTION)")
print("=" * 60)
for block in renumbered:
print(f"Block {block.index}: {block.start_time}ms - {block.end_time}ms")
text_preview = block.text.replace("\n", " ")[:50]
print(f" Text: {text_preview}...")
# Verify timing preservation
print("\n" + "=" * 60)
print("VERIFICATION")
print("=" * 60)
# Find first dialogue subtitle in final output (skip intro blocks)
dialogue_blocks = [b for b in renumbered if b.start_time >= original_blocks[0].start_time]
assert len(dialogue_blocks) == len(original_blocks), \
f"Block count mismatch: {len(dialogue_blocks)} vs {len(original_blocks)}"
for i, (original, final_block) in enumerate(zip(original_blocks, dialogue_blocks)):
print(f"\nDialogue Block {i+1}:")
print(f" Original timing: {original.start_time}ms - {original.end_time}ms")
print(f" Final timing: {final_block.start_time}ms - {final_block.end_time}ms")
print(f" Text match: {original.text == final_block.text}")
assert original.start_time == final_block.start_time, \
f"Start time changed! {original.start_time} != {final_block.start_time}"
assert original.end_time == final_block.end_time, \
f"End time changed! {original.end_time} != {final_block.end_time}"
assert original.text == final_block.text, \
f"Text changed!"
# Verify intro blocks don't overlap with first subtitle
last_intro_block = intro_blocks[-1]
first_dialogue_start = original_blocks[0].start_time
print(f"\nGap between plot and dialogue:")
print(f" Last plot block ends: {last_intro_block.end_time}ms")
print(f" First dialogue starts: {first_dialogue_start}ms")
print(f" Gap: {first_dialogue_start - last_intro_block.end_time}ms")
assert last_intro_block.end_time < first_dialogue_start, \
"Plot blocks overlap with dialogue!"
print("\n" + "=" * 60)
print("✅ ALL TESTS PASSED - ZERO TIMING DRIFT CONFIRMED")
print("=" * 60)
def test_edge_case_early_subtitle():
"""Test case where first subtitle starts very early (1 second)"""
early_srt = """1
00:00:01,000 --> 00:00:03,000
Early dialogue
2
00:00:04,000 --> 00:00:06,000
More early dialogue
"""
original_blocks = parse_srt(early_srt)
print("\n" + "=" * 60)
print("EDGE CASE: EARLY SUBTITLE AT 1 SECOND")
print("=" * 60)
intro_blocks = build_intro_blocks(
mock_movie,
mock_movie["plot"],
first_subtitle_start_ms=original_blocks[0].start_time,
min_safe_gap_ms=1000
)
print(f"First dialogue at: {original_blocks[0].start_time}ms")
print(f"Number of intro blocks: {len(intro_blocks)}")
for block in intro_blocks:
print(f" Plot block: {block.start_time}ms - {block.end_time}ms")
# Should use zero-duration blocks since not enough time
if original_blocks[0].start_time < 2000:
assert all(b.start_time == 0 and b.end_time == 0 for b in intro_blocks), \
"Should use zero-duration blocks for very early subtitles"
print("✅ Correctly using zero-duration blocks")
print("=" * 60)
def test_idempotency():
"""Test that running the operation twice doesn't duplicate plot blocks"""
print("\n" + "=" * 60)
print("IDEMPOTENCY TEST")
print("=" * 60)
# First pass: add plot blocks
original_blocks = parse_srt(test_srt)
intro_blocks = build_intro_blocks(
mock_movie,
mock_movie["plot"],
first_subtitle_start_ms=original_blocks[0].start_time,
min_safe_gap_ms=1000
)
first_pass = intro_blocks + original_blocks
first_pass_srt = format_srt([
type(b)(index=i+1, start_time=b.start_time, end_time=b.end_time, text=b.text)
for i, b in enumerate(first_pass)
])
print(f"After first pass: {len(first_pass)} blocks")
# Second pass: should strip existing plot blocks
second_pass_parsed = parse_srt(first_pass_srt)
print(f"Parsed second pass: {len(second_pass_parsed)} blocks")
for i, block in enumerate(second_pass_parsed):
text_preview = block.text.replace("\n", " ")[:60]
print(f" Block {i+1}: {block.start_time}-{block.end_time}ms | {text_preview}")
second_pass_cleaned = strip_existing_plot_blocks(second_pass_parsed)
print(f"\nAfter stripping plot blocks: {len(second_pass_cleaned)} blocks")
for i, block in enumerate(second_pass_cleaned):
text_preview = block.text.replace("\n", " ")[:60]
print(f" Block {i+1}: {block.start_time}-{block.end_time}ms | {text_preview}")
# Should have same number as original dialogue
assert len(second_pass_cleaned) == len(original_blocks), \
f"Failed to strip plot blocks! Expected {len(original_blocks)}, got {len(second_pass_cleaned)}"
print("\n✅ Idempotency verified - plot blocks correctly stripped")
print("=" * 60)
if __name__ == "__main__":
test_timing_preservation()
test_edge_case_early_subtitle()
test_idempotency()
print("\n🎉 All tests passed! Zero timing drift guaranteed.")