1.1.0 - automations, clean only mode, bug fixes

ponzischeme89
2026-01-19 02:10:08 +13:00
parent 93e8b38e24
commit 9345ac4331
25 changed files with 2690 additions and 499 deletions
+140
@@ -184,6 +184,40 @@ class FolderRule(Base):
return f"<FolderRule(id={self.id}, directory='{self.directory}')>"
class AutomationRule(Base):
"""Automation rules for scheduled tasks"""
__tablename__ = 'automation_rules'
id = Column(String(64), primary_key=True)
name = Column(String(255), nullable=False)
schedule = Column(String(100), nullable=False)
enabled = Column(Boolean, default=True, nullable=False)
patterns = Column(Text, nullable=False) # JSON list
target_folders = Column(Text, nullable=False) # JSON list
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
def __repr__(self):
return f"<AutomationRule(id='{self.id}', name='{self.name}', enabled={self.enabled})>"
class AutomationLog(Base):
"""Automation run log entries"""
__tablename__ = 'automation_logs'
id = Column(Integer, primary_key=True)
rule_id = Column(String(64), nullable=False, index=True)
file_path = Column(String(500), nullable=False)
modified = Column(Boolean, default=False)
removed_lines = Column(Integer, default=0)
dry_run = Column(Boolean, default=False)
error_message = Column(Text)
run_at = Column(DateTime, default=datetime.utcnow, nullable=False)
def __repr__(self):
return f"<AutomationLog(rule_id='{self.rule_id}', file_path='{self.file_path}')>"
class DatabaseManager:
"""Manages database connections and operations"""
@@ -1033,6 +1067,112 @@ class DatabaseManager:
finally:
session.close()
# ============ AUTOMATION RULES OPERATIONS ============
def get_automation_rules(self):
"""Get all automation rules"""
session = self.get_session()
try:
rules = session.query(AutomationRule).order_by(AutomationRule.created_at.asc()).all()
result = []
for rule in rules:
result.append({
"id": rule.id,
"name": rule.name,
"schedule": rule.schedule,
"enabled": rule.enabled,
"patterns": json.loads(rule.patterns) if rule.patterns else [],
"target_folders": json.loads(rule.target_folders) if rule.target_folders else [],
"created_at": rule.created_at.isoformat() if rule.created_at else None,
"updated_at": rule.updated_at.isoformat() if rule.updated_at else None
})
return result
finally:
session.close()
def get_automation_rule(self, rule_id):
"""Get a single automation rule"""
session = self.get_session()
try:
rule = session.query(AutomationRule).filter_by(id=rule_id).first()
if not rule:
return None
return {
"id": rule.id,
"name": rule.name,
"schedule": rule.schedule,
"enabled": rule.enabled,
"patterns": json.loads(rule.patterns) if rule.patterns else [],
"target_folders": json.loads(rule.target_folders) if rule.target_folders else [],
"created_at": rule.created_at.isoformat() if rule.created_at else None,
"updated_at": rule.updated_at.isoformat() if rule.updated_at else None
}
finally:
session.close()
def upsert_automation_rule(self, rule_data):
"""Create or update an automation rule"""
session = self.get_session()
try:
rule_id = rule_data["id"]
rule = session.query(AutomationRule).filter_by(id=rule_id).first()
if not rule:
rule = AutomationRule(id=rule_id)
session.add(rule)
rule.name = rule_data["name"]
rule.schedule = rule_data["schedule"]
rule.enabled = bool(rule_data.get("enabled", True))
rule.patterns = json.dumps(rule_data.get("patterns", []))
rule.target_folders = json.dumps(rule_data.get("target_folders", []))
session.commit()
return True
except Exception as e:
session.rollback()
logger.error(f"Error saving automation rule: {e}")
return False
finally:
session.close()
def delete_automation_rule(self, rule_id):
"""Delete an automation rule"""
session = self.get_session()
try:
rule = session.query(AutomationRule).filter_by(id=rule_id).first()
if rule:
session.delete(rule)
session.commit()
return True
return False
except Exception as e:
session.rollback()
logger.error(f"Error deleting automation rule: {e}")
return False
finally:
session.close()
def add_automation_log(self, rule_id, file_path, modified, removed_lines, dry_run=False, error_message=None):
"""Add an automation log entry"""
session = self.get_session()
try:
entry = AutomationLog(
rule_id=rule_id,
file_path=file_path,
modified=bool(modified),
removed_lines=int(removed_lines or 0),
dry_run=bool(dry_run),
error_message=error_message
)
session.add(entry)
session.commit()
except Exception as e:
session.rollback()
logger.error(f"Error saving automation log: {e}")
raise
finally:
session.close()
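Together these methods form the automation-rules CRUD surface. A hedged usage sketch (the DatabaseManager constructor arguments are not shown in this diff and are assumed):

db = DatabaseManager()  # constructor signature is an assumption

db.upsert_automation_rule({
    "id": "nightly-clean",                    # hypothetical id
    "name": "Nightly cleanup",
    "schedule": "0 3 * * *",
    "enabled": True,
    "patterns": ["*.srt"],
    "target_folders": ["/media/tv"],
})
rule = db.get_automation_rule("nightly-clean")   # dict, or None if missing
db.add_automation_log(
    rule_id="nightly-clean",
    file_path="/media/tv/example.srt",
    modified=True,
    removed_lines=3,
)
db.delete_automation_rule("nightly-clean")       # True only if a row was deleted

Note that upsert_automation_rule indexes "id", "name", and "schedule" directly, so those keys are required (KeyError otherwise); the remaining keys have defaults.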
# ============ MAINTENANCE OPERATIONS ============
def clear_settings(self, keep_api_keys=False):
+21 -6
@@ -18,6 +18,7 @@ logger = get_logger("FileScanner")
import sys
sys.path.insert(0, str(Path(__file__).parent))
from subtitle_processor import parse_srt, SUBLOGUE_SENTINEL, SUBLOGUE_TOKEN_PATTERN
from keyword_stripper import get_stripper
class FileScanner:
@@ -45,6 +46,7 @@ class FileScanner:
directory_path: str | Path,
batch_size: int = DEFAULT_BATCH_SIZE,
follow_symlinks: bool = False,
detect_cleanup_keywords: bool = False,
) -> Generator[List[Dict], None, None]:
"""
Recursively scan a directory tree for .srt files.
@@ -106,8 +108,11 @@ class FileScanner:
# Plot detection
# --------------------------------------------
content = None
try:
plot_marker_count = cls._count_plot_markers(file_path)
if detect_cleanup_keywords:
content = file_path.read_text(encoding="utf-8", errors="ignore")
plot_marker_count = cls._count_plot_markers(file_path, content=content)
has_plot = plot_marker_count > 0
logger.debug(
"Plot check for %s: %s",
@@ -126,7 +131,7 @@ class FileScanner:
if has_plot:
try:
metadata = cls._extract_metadata(file_path)
metadata = cls._extract_metadata(file_path, content=content)
logger.debug(
"Extracted metadata from %s: %s",
file_path.name,
@@ -138,6 +143,13 @@ class FileScanner:
file_path.name, e
)
clean_keywords = []
if detect_cleanup_keywords and content:
try:
clean_keywords = get_stripper().detect_subtitle_watermarks(content)
except Exception as e:
logger.debug("Cleanup keyword detection failed: %s", e)
status = "Has Plot" if has_plot else "Not Loaded"
if plot_marker_count > 1:
status = "Duplicate Plot"
@@ -156,6 +168,7 @@ class FileScanner:
"imdb_rating": metadata.get("imdb_rating"),
"rating": metadata.get("imdb_rating"),
"runtime": metadata.get("runtime"),
"clean_keywords": clean_keywords,
"selected": False,
})
@@ -216,14 +229,15 @@ class FileScanner:
)
@classmethod
def _count_plot_markers(cls, file_path: Path) -> int:
def _count_plot_markers(cls, file_path: Path, content: str | None = None) -> int:
"""
Count Sublogue plot markers to detect duplicates.
"""
logger.debug("Scanning for plot markers in %s", file_path.name)
try:
content = file_path.read_text(encoding="utf-8", errors="ignore")
if content is None:
content = file_path.read_text(encoding="utf-8", errors="ignore")
lower_content = content.lower()
generated_count = lower_content.count("generated by sublogue")
if generated_count > 0:
@@ -237,14 +251,15 @@ class FileScanner:
return 0
@classmethod
def _extract_metadata(cls, file_path: Path) -> Dict:
def _extract_metadata(cls, file_path: Path, content: str | None = None) -> Dict:
"""
Extract title, year, rating, runtime, and plot
from Sublogue-generated subtitles.
"""
logger.debug("Extracting metadata from %s", file_path.name)
content = file_path.read_text(encoding="utf-8", errors="ignore")
if content is None:
content = file_path.read_text(encoding="utf-8", errors="ignore")
blocks = parse_srt(content)
metadata = {
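With detect_cleanup_keywords=True the file is read once and the content is threaded through _count_plot_markers, _extract_metadata, and the watermark detector, avoiding repeated reads of the same file. A hedged sketch of driving the generator (the method name scan_directory is an assumption; only its parameters are visible in this hunk):

for batch in FileScanner.scan_directory(       # name assumed, see above
    "/media/tv",
    detect_cleanup_keywords=True,
):
    for entry in batch:
        if entry["clean_keywords"]:
            # "path" key is an assumption; "clean_keywords" is shown in the diff
            print(entry.get("path"), entry["clean_keywords"])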
+64
@@ -199,6 +199,39 @@ class KeywordStripper:
r"sign\s+up\s+(now|today|free)",
]
# Force-remove entire subtitle blocks if these appear anywhere in a line.
# Partial matches are intentional (e.g. "OpenSubtitles.org").
SUBTITLE_FORCE_REMOVE = [
r"yts",
r"opensubtitles?",
]
_custom_force_remove_keywords: List[str] = []
# Labels used for reporting detected watermark keywords in clean-only scans
SUBTITLE_WATERMARK_LABELS = [
(r"yts\.mx|yts\.am|yts\.lt|yts\.ag|\byts\b", "YTS"),
(r"\byify\b", "YIFY"),
(r"\brarbg\b", "RARBG"),
(r"\beztv\b", "EZTV"),
(r"\bettv\b", "ETTV"),
(r"torrentgalaxy|\btgx\b", "TorrentGalaxy"),
(r"1337x", "1337x"),
(r"limetorrents?", "LimeTorrents"),
(r"\bevo\b", "EVO"),
(r"\bpsa\b", "PSA"),
(r"\bfgt\b", "FGT"),
(r"opensubtitles?", "OpenSubtitles"),
(r"sub\.?scene|subscene", "Subscene"),
(r"addic7ed", "Addic7ed"),
(r"podnapisi", "Podnapisi"),
(r"yifysubtitles?", "YIFY Subtitles"),
(r"legendas\.?tv", "LegendasTV"),
(r"shooter\.?cn", "ShooterCN"),
(r"subhd", "SubHD"),
(r"www\.[a-z0-9\-]+\.(com|org|net|io|tv|mx|am|lt|ag)|https?://", "URL"),
]
# Patterns that indicate an ENTIRE subtitle block should be removed
# (not just the matching text, but the whole block)
SUBTITLE_BLOCK_REMOVERS = [
@@ -234,6 +267,11 @@ class KeywordStripper:
def c(p):
return re.compile(p, re.IGNORECASE | re.VERBOSE)
custom_force_remove = [
re.escape(k) for k in cls._custom_force_remove_keywords if k
]
combined_force_remove = cls.SUBTITLE_FORCE_REMOVE + custom_force_remove
cls._compiled = {
"junk": c("|".join([
cls.QUALITY,
@@ -255,6 +293,9 @@ class KeywordStripper:
"subtitle_block_removers": [
re.compile(p, re.IGNORECASE | re.MULTILINE) for p in cls.SUBTITLE_BLOCK_REMOVERS
],
"subtitle_force_remove": [
re.compile(p, re.IGNORECASE) for p in combined_force_remove
],
}
return cls._compiled
@@ -348,6 +389,11 @@ class KeywordStripper:
if not line:
continue
# Hard kill-switch: if a line mentions these sources, drop the whole block.
for pattern in rx["subtitle_force_remove"]:
if pattern.search(line):
return True
# Check if this line matches any block remover pattern
is_ad_line = False
for pattern in rx["subtitle_block_removers"]:
@@ -469,6 +515,24 @@ class KeywordStripper:
return cleaned
def detect_subtitle_watermarks(self, text: str) -> List[str]:
"""Detect known subtitle watermark keywords in raw subtitle text."""
detected = []
for pattern, label in self.SUBTITLE_WATERMARK_LABELS:
if re.search(pattern, text, re.IGNORECASE):
detected.append(label)
for keyword in self._custom_force_remove_keywords:
if keyword and re.search(re.escape(keyword), text, re.IGNORECASE):
detected.append(keyword)
return detected
def set_force_remove_keywords(self, keywords: List[str]) -> None:
"""Set custom force-remove keywords and refresh regex cache."""
type(self)._custom_force_remove_keywords = [
k.strip() for k in (keywords or []) if k and k.strip()
]
type(self)._compiled = None
# -----------------------------
# SINGLETON HELPERS
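A sketch of how the new hooks compose (the sample text and custom keyword are illustrative):

from keyword_stripper import get_stripper

stripper = get_stripper()
stripper.set_force_remove_keywords(["MySubSite"])  # hypothetical custom keyword

text = "Downloaded from OpenSubtitles.org\nSubs by MySubSite"
print(stripper.detect_subtitle_watermarks(text))
# -> ["OpenSubtitles", "MySubSite"]

Because set_force_remove_keywords writes class-level state and clears the compiled-pattern cache, the custom keywords apply to every instance behind get_stripper(), not just the caller's.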
+112
@@ -1373,6 +1373,8 @@ class SubtitleProcessor:
# ─────────────────────────────────────────────────────────────
original = file_path.read_text(encoding="utf-8", errors="ignore")
subs = parse_srt(original)
stripper = get_stripper()
detected_keywords = stripper.detect_subtitle_watermarks(original)
if not subs:
return self._fail("No valid subtitle blocks found")
@@ -1540,6 +1542,116 @@ class SubtitleProcessor:
logger.error(f"Could not acquire lock for {file_path.name}: {e}")
return self._fail(f"File is being processed by another task: {e}")
def clean_file(
self,
file_path: str | Path,
clean_subtitle_content: bool = True,
) -> dict:
"""Clean ad/watermark content from a subtitle file without inserting plots."""
file_path = Path(file_path)
if not file_path.exists():
return self._fail("File not found")
if file_path.stat().st_size > self.MAX_SRT_BYTES:
return self._fail("Subtitle file too large")
try:
with file_lock(file_path, timeout=30.0):
original = file_path.read_text(encoding="utf-8", errors="ignore")
subs = parse_srt(original)
# Detect watermark keywords up front so both return paths can report them.
stripper = get_stripper()
detected_keywords = stripper.detect_subtitle_watermarks(original)
if not subs:
return self._fail("No valid subtitle blocks found")
original_blocks = subs
removed_count = 0
modified_count = 0
if clean_subtitle_content:
cleaned_blocks: List[SubtitleBlock] = []
for block in original_blocks:
text = block.text
if stripper.should_remove_subtitle_block(text):
removed_count += 1
continue
cleaned_text = stripper.clean_subtitle_text(text)
if not cleaned_text.strip():
removed_count += 1
continue
if cleaned_text != text:
modified_count += 1
cleaned_blocks.append(
SubtitleBlock(
block.index,
block.start_time,
block.end_time,
cleaned_text,
)
)
else:
cleaned_blocks = list(original_blocks)
sanitized = sanitize_all_blocks(cleaned_blocks)
if len(sanitized) < len(cleaned_blocks):
removed_count += len(cleaned_blocks) - len(sanitized)
if not sanitized:
return self._fail("No dialogue subtitles found after cleaning")
renumbered = [
SubtitleBlock(i + 1, b.start_time, b.end_time, b.text)
for i, b in enumerate(sanitized)
]
changed = len(renumbered) != len(original_blocks)
if not changed:
for updated, original_block in zip(renumbered, original_blocks):
if (
updated.start_time != original_block.start_time
or updated.end_time != original_block.end_time
or updated.text != original_block.text
):
changed = True
break
if not changed:
return {
"success": True,
"status": "Skipped",
"summary": "No changes needed",
"removed_blocks": 0,
"modified_blocks": 0,
"clean_keywords": detected_keywords,
}
tmp = file_path.with_suffix(".srt.tmp")
tmp.write_text(format_srt(renumbered), encoding="utf-8")
tmp.replace(file_path)
summary = (
f"Removed {removed_count} ad blocks, modified {modified_count} blocks"
if clean_subtitle_content
else f"Removed {removed_count} invalid blocks"
)
return {
"success": True,
"status": "Cleaned",
"summary": summary,
"removed_blocks": removed_count,
"modified_blocks": modified_count,
"clean_keywords": detected_keywords,
}
except FileLockError as e:
logger.error(f"Could not acquire lock for {file_path.name}: {e}")
return self._fail(f"File is being processed by another task: {e}")
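A hedged example of the clean-only entry point (SubtitleProcessor construction is an assumption; the result keys match the returns above):

processor = SubtitleProcessor()                  # constructor args assumed
result = processor.clean_file("/media/tv/example.srt")
if result["success"]:
    print(result["status"])                      # "Cleaned" or "Skipped"
    print(result["summary"])
    print(result["clean_keywords"])              # watermark labels found in the original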
# ========================================================
# Metadata fetching
# ========================================================