1.1.0 - automations, clean only mode, bug fixes
@@ -184,6 +184,40 @@ class FolderRule(Base):
         return f"<FolderRule(id={self.id}, directory='{self.directory}')>"
 
 
+class AutomationRule(Base):
+    """Automation rules for scheduled tasks"""
+    __tablename__ = 'automation_rules'
+
+    id = Column(String(64), primary_key=True)
+    name = Column(String(255), nullable=False)
+    schedule = Column(String(100), nullable=False)
+    enabled = Column(Boolean, default=True, nullable=False)
+    patterns = Column(Text, nullable=False)  # JSON list
+    target_folders = Column(Text, nullable=False)  # JSON list
+    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+
+    def __repr__(self):
+        return f"<AutomationRule(id='{self.id}', name='{self.name}', enabled={self.enabled})>"
+
+
+class AutomationLog(Base):
+    """Automation run log entries"""
+    __tablename__ = 'automation_logs'
+
+    id = Column(Integer, primary_key=True)
+    rule_id = Column(String(64), nullable=False, index=True)
+    file_path = Column(String(500), nullable=False)
+    modified = Column(Boolean, default=False)
+    removed_lines = Column(Integer, default=0)
+    dry_run = Column(Boolean, default=False)
+    error_message = Column(Text)
+    run_at = Column(DateTime, default=datetime.utcnow, nullable=False)
+
+    def __repr__(self):
+        return f"<AutomationLog(rule_id='{self.rule_id}', file_path='{self.file_path}')>"
+
+
 class DatabaseManager:
     """Manages database connections and operations"""
 
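
As a quick sanity check, a minimal sketch of the new models against a throwaway SQLite engine; the `database` module name and the cron-style `schedule` string are assumptions, not part of this commit:

    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker
    from database import Base, AutomationRule   # module path assumed

    engine = create_engine("sqlite:///:memory:")   # throwaway DB for illustration
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    session.add(AutomationRule(
        id="rule-1",
        name="Nightly cleanup",
        schedule="0 2 * * *",            # schedule format is app-defined; cron is a guess
        patterns='["*.srt"]',            # stored as JSON text, per the column comments
        target_folders='["/media/tv"]',
    ))
    session.commit()
    print(session.query(AutomationRule).one())   # <AutomationRule(id='rule-1', ...)>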
@@ -1033,6 +1067,112 @@ class DatabaseManager:
         finally:
             session.close()
 
+    # ============ AUTOMATION RULES OPERATIONS ============
+
+    def get_automation_rules(self):
+        """Get all automation rules"""
+        session = self.get_session()
+        try:
+            rules = session.query(AutomationRule).order_by(AutomationRule.created_at.asc()).all()
+            result = []
+            for rule in rules:
+                result.append({
+                    "id": rule.id,
+                    "name": rule.name,
+                    "schedule": rule.schedule,
+                    "enabled": rule.enabled,
+                    "patterns": json.loads(rule.patterns) if rule.patterns else [],
+                    "target_folders": json.loads(rule.target_folders) if rule.target_folders else [],
+                    "created_at": rule.created_at.isoformat() if rule.created_at else None,
+                    "updated_at": rule.updated_at.isoformat() if rule.updated_at else None
+                })
+            return result
+        finally:
+            session.close()
+
+    def get_automation_rule(self, rule_id):
+        """Get a single automation rule"""
+        session = self.get_session()
+        try:
+            rule = session.query(AutomationRule).filter_by(id=rule_id).first()
+            if not rule:
+                return None
+            return {
+                "id": rule.id,
+                "name": rule.name,
+                "schedule": rule.schedule,
+                "enabled": rule.enabled,
+                "patterns": json.loads(rule.patterns) if rule.patterns else [],
+                "target_folders": json.loads(rule.target_folders) if rule.target_folders else [],
+                "created_at": rule.created_at.isoformat() if rule.created_at else None,
+                "updated_at": rule.updated_at.isoformat() if rule.updated_at else None
+            }
+        finally:
+            session.close()
+
+    def upsert_automation_rule(self, rule_data):
+        """Create or update an automation rule"""
+        session = self.get_session()
+        try:
+            rule_id = rule_data["id"]
+            rule = session.query(AutomationRule).filter_by(id=rule_id).first()
+            if not rule:
+                rule = AutomationRule(id=rule_id)
+                session.add(rule)
+
+            rule.name = rule_data["name"]
+            rule.schedule = rule_data["schedule"]
+            rule.enabled = bool(rule_data.get("enabled", True))
+            rule.patterns = json.dumps(rule_data.get("patterns", []))
+            rule.target_folders = json.dumps(rule_data.get("target_folders", []))
+
+            session.commit()
+            return True
+        except Exception as e:
+            session.rollback()
+            logger.error(f"Error saving automation rule: {e}")
+            return False
+        finally:
+            session.close()
+
+    def delete_automation_rule(self, rule_id):
+        """Delete an automation rule"""
+        session = self.get_session()
+        try:
+            rule = session.query(AutomationRule).filter_by(id=rule_id).first()
+            if rule:
+                session.delete(rule)
+                session.commit()
+                return True
+            return False
+        except Exception as e:
+            session.rollback()
+            logger.error(f"Error deleting automation rule: {e}")
+            return False
+        finally:
+            session.close()
+
+    def add_automation_log(self, rule_id, file_path, modified, removed_lines, dry_run=False, error_message=None):
+        """Add an automation log entry"""
+        session = self.get_session()
+        try:
+            entry = AutomationLog(
+                rule_id=rule_id,
+                file_path=file_path,
+                modified=bool(modified),
+                removed_lines=int(removed_lines or 0),
+                dry_run=bool(dry_run),
+                error_message=error_message
+            )
+            session.add(entry)
+            session.commit()
+        except Exception as e:
+            session.rollback()
+            logger.error(f"Error saving automation log: {e}")
+            raise
+        finally:
+            session.close()
+
     # ============ MAINTENANCE OPERATIONS ============
 
     def clear_settings(self, keep_api_keys=False):
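
A hedged usage sketch of the new operations; how a `DatabaseManager` is constructed is outside this diff, so the bare constructor below is an assumption:

    db = DatabaseManager()  # construction details not shown in this commit

    db.upsert_automation_rule({
        "id": "rule-1",
        "name": "Nightly cleanup",
        "schedule": "0 2 * * *",
        "patterns": ["*.srt"],             # JSON-encoded on write, decoded on read
        "target_folders": ["/media/tv"],
    })

    rule = db.get_automation_rule("rule-1")    # dict with decoded lists, or None
    for r in db.get_automation_rules():        # ordered by created_at ascending
        print(r["name"], r["enabled"])

    # Log one file's outcome; unlike the rule operations, this raises on DB errors.
    db.add_automation_log("rule-1", "/media/tv/show.srt", modified=True, removed_lines=3)

    db.delete_automation_rule("rule-1")        # True if a row was deleted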
@@ -18,6 +18,7 @@ logger = get_logger("FileScanner")
 import sys
 sys.path.insert(0, str(Path(__file__).parent))
 from subtitle_processor import parse_srt, SUBLOGUE_SENTINEL, SUBLOGUE_TOKEN_PATTERN
+from keyword_stripper import get_stripper
 
 
 class FileScanner:
@@ -45,6 +46,7 @@ class FileScanner:
         directory_path: str | Path,
         batch_size: int = DEFAULT_BATCH_SIZE,
         follow_symlinks: bool = False,
+        detect_cleanup_keywords: bool = False,
     ) -> Generator[List[Dict], None, None]:
         """
         Recursively scan a directory tree for .srt files.
@@ -106,8 +108,11 @@ class FileScanner:
             # Plot detection
             # --------------------------------------------
 
+            content = None
             try:
-                plot_marker_count = cls._count_plot_markers(file_path)
+                if detect_cleanup_keywords:
+                    content = file_path.read_text(encoding="utf-8", errors="ignore")
+                plot_marker_count = cls._count_plot_markers(file_path, content=content)
                 has_plot = plot_marker_count > 0
                 logger.debug(
                     "Plot check for %s: %s",
@@ -126,7 +131,7 @@ class FileScanner:
 
             if has_plot:
                 try:
-                    metadata = cls._extract_metadata(file_path)
+                    metadata = cls._extract_metadata(file_path, content=content)
                     logger.debug(
                         "Extracted metadata from %s: %s",
                         file_path.name,
@@ -138,6 +143,13 @@ class FileScanner:
                         file_path.name, e
                     )
 
+            clean_keywords = []
+            if detect_cleanup_keywords and content:
+                try:
+                    clean_keywords = get_stripper().detect_subtitle_watermarks(content)
+                except Exception as e:
+                    logger.debug("Cleanup keyword detection failed: %s", e)
+
             status = "Has Plot" if has_plot else "Not Loaded"
             if plot_marker_count > 1:
                 status = "Duplicate Plot"
@@ -156,6 +168,7 @@ class FileScanner:
                 "imdb_rating": metadata.get("imdb_rating"),
                 "rating": metadata.get("imdb_rating"),
                 "runtime": metadata.get("runtime"),
+                "clean_keywords": clean_keywords,
                 "selected": False,
             })
 
@@ -216,14 +229,15 @@ class FileScanner:
             )
 
     @classmethod
-    def _count_plot_markers(cls, file_path: Path) -> int:
+    def _count_plot_markers(cls, file_path: Path, content: str | None = None) -> int:
         """
         Count Sublogue plot markers to detect duplicates.
         """
         logger.debug("Scanning for plot markers in %s", file_path.name)
 
         try:
-            content = file_path.read_text(encoding="utf-8", errors="ignore")
+            if content is None:
+                content = file_path.read_text(encoding="utf-8", errors="ignore")
             lower_content = content.lower()
             generated_count = lower_content.count("generated by sublogue")
             if generated_count > 0:
@@ -237,14 +251,15 @@ class FileScanner:
         return 0
 
     @classmethod
-    def _extract_metadata(cls, file_path: Path) -> Dict:
+    def _extract_metadata(cls, file_path: Path, content: str | None = None) -> Dict:
         """
         Extract title, year, rating, runtime, and plot
         from Sublogue-generated subtitles.
         """
         logger.debug("Extracting metadata from %s", file_path.name)
 
-        content = file_path.read_text(encoding="utf-8", errors="ignore")
+        if content is None:
+            content = file_path.read_text(encoding="utf-8", errors="ignore")
         blocks = parse_srt(content)
 
         metadata = {
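
To illustrate the new flag end to end; the scanning generator's name is not visible in this diff, so `scan_directory` below is a placeholder, and only keys shown in the hunks above are relied on:

    # "scan_directory" is a guessed name for the classmethod whose signature is shown above.
    for batch in FileScanner.scan_directory("/media/tv", detect_cleanup_keywords=True):
        for entry in batch:
            # With the flag on, each file is read once and that content feeds both
            # plot-marker counting and watermark detection.
            if entry["clean_keywords"]:
                print(entry["clean_keywords"])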
@@ -199,6 +199,39 @@ class KeywordStripper:
         r"sign\s+up\s+(now|today|free)",
     ]
 
+    # Force-remove entire subtitle blocks if these appear anywhere in a line.
+    # Partial matches are intentional (e.g. "OpenSubtitles.org").
+    SUBTITLE_FORCE_REMOVE = [
+        r"yts",
+        r"opensubtitles?",
+    ]
+
+    _custom_force_remove_keywords: List[str] = []
+
+    # Labels used for reporting detected watermark keywords in clean-only scans
+    SUBTITLE_WATERMARK_LABELS = [
+        (r"yts\.mx|yts\.am|yts\.lt|yts\.ag|\byts\b", "YTS"),
+        (r"\byify\b", "YIFY"),
+        (r"\brarbg\b", "RARBG"),
+        (r"\beztv\b", "EZTV"),
+        (r"\bettv\b", "ETTV"),
+        (r"torrentgalaxy|\btgx\b", "TorrentGalaxy"),
+        (r"1337x", "1337x"),
+        (r"limetorrents?", "LimeTorrents"),
+        (r"\bevo\b", "EVO"),
+        (r"\bpsa\b", "PSA"),
+        (r"\bfgt\b", "FGT"),
+        (r"opensubtitles?", "OpenSubtitles"),
+        (r"sub\.?scene|subscene", "Subscene"),
+        (r"addic7ed", "Addic7ed"),
+        (r"podnapisi", "Podnapisi"),
+        (r"yifysubtitles?", "YIFY Subtitles"),
+        (r"legendas\.?tv", "LegendasTV"),
+        (r"shooter\.?cn", "ShooterCN"),
+        (r"subhd", "SubHD"),
+        (r"www\.[a-z0-9\-]+\.(com|org|net|io|tv|mx|am|lt|ag)|https?://", "URL"),
+    ]
+
     # Patterns that indicate an ENTIRE subtitle block should be removed
     # (not just the matching text, but the whole block)
     SUBTITLE_BLOCK_REMOVERS = [
@@ -234,6 +267,11 @@ class KeywordStripper:
         def c(p):
             return re.compile(p, re.IGNORECASE | re.VERBOSE)
 
+        custom_force_remove = [
+            re.escape(k) for k in cls._custom_force_remove_keywords if k
+        ]
+        combined_force_remove = cls.SUBTITLE_FORCE_REMOVE + custom_force_remove
+
         cls._compiled = {
             "junk": c("|".join([
                 cls.QUALITY,
@@ -255,6 +293,9 @@ class KeywordStripper:
             "subtitle_block_removers": [
                 re.compile(p, re.IGNORECASE | re.MULTILINE) for p in cls.SUBTITLE_BLOCK_REMOVERS
             ],
+            "subtitle_force_remove": [
+                re.compile(p, re.IGNORECASE) for p in combined_force_remove
+            ],
         }
 
         return cls._compiled
@@ -348,6 +389,11 @@ class KeywordStripper:
             if not line:
                 continue
 
+            # Hard kill-switch: if a line mentions these sources, drop the whole block.
+            for pattern in rx["subtitle_force_remove"]:
+                if pattern.search(line):
+                    return True
+
             # Check if this line matches any block remover pattern
             is_ad_line = False
             for pattern in rx["subtitle_block_removers"]:
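
The kill-switch in action, via the singleton accessor this module already exports (a sketch; other block-remover patterns may also trigger removal):

    stripper = get_stripper()
    # Partial matches drop the whole block, e.g. "OpenSubtitles.org" matches "opensubtitles?".
    assert stripper.should_remove_subtitle_block("Downloaded from OpenSubtitles.org")
    assert not stripper.should_remove_subtitle_block("An ordinary line of dialogue.")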
@@ -469,6 +515,24 @@ class KeywordStripper:
 
         return cleaned
 
+    def detect_subtitle_watermarks(self, text: str) -> List[str]:
+        """Detect known subtitle watermark keywords in raw subtitle text."""
+        detected = []
+        for pattern, label in self.SUBTITLE_WATERMARK_LABELS:
+            if re.search(pattern, text, re.IGNORECASE):
+                detected.append(label)
+        for keyword in self._custom_force_remove_keywords:
+            if keyword and re.search(re.escape(keyword), text, re.IGNORECASE):
+                detected.append(keyword)
+        return detected
+
+    def set_force_remove_keywords(self, keywords: List[str]) -> None:
+        """Set custom force-remove keywords and refresh regex cache."""
+        type(self)._custom_force_remove_keywords = [
+            k.strip() for k in (keywords or []) if k and k.strip()
+        ]
+        type(self)._compiled = None
+
 
 # -----------------------------
 # SINGLETON HELPERS
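
A short sketch of the two new hooks together; the sample strings are invented:

    stripper = get_stripper()

    text = "Downloaded from YTS.MX\nSync by ACME-Subs\nwww.example.com"
    print(stripper.detect_subtitle_watermarks(text))   # ['YTS', 'URL']

    # Custom keywords join both detection and force-removal, and setting them
    # invalidates the compiled-regex cache so the next clean pass recompiles.
    stripper.set_force_remove_keywords(["ACME-Subs"])
    print(stripper.detect_subtitle_watermarks(text))   # ['YTS', 'URL', 'ACME-Subs']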
@@ -1373,6 +1373,8 @@ class SubtitleProcessor:
             # ─────────────────────────────────────────────────────────────
             original = file_path.read_text(encoding="utf-8", errors="ignore")
             subs = parse_srt(original)
+            stripper = get_stripper()
+            detected_keywords = stripper.detect_subtitle_watermarks(original)
 
             if not subs:
                 return self._fail("No valid subtitle blocks found")
@@ -1540,6 +1542,116 @@ class SubtitleProcessor:
             logger.error(f"Could not acquire lock for {file_path.name}: {e}")
             return self._fail(f"File is being processed by another task: {e}")
 
+    def clean_file(
+        self,
+        file_path: str | Path,
+        clean_subtitle_content: bool = True,
+    ) -> dict:
+        """Clean ad/watermark content from a subtitle file without inserting plots."""
+        file_path = Path(file_path)
+
+        if not file_path.exists():
+            return self._fail("File not found")
+
+        if file_path.stat().st_size > self.MAX_SRT_BYTES:
+            return self._fail("Subtitle file too large")
+
+        try:
+            with file_lock(file_path, timeout=30.0):
+                original = file_path.read_text(encoding="utf-8", errors="ignore")
+                subs = parse_srt(original)
+                stripper = get_stripper()  # needed by the block checks below
+                detected_keywords = stripper.detect_subtitle_watermarks(original)
+
+                if not subs:
+                    return self._fail("No valid subtitle blocks found")
+
+                original_blocks = subs
+                removed_count = 0
+                modified_count = 0
+
+                if clean_subtitle_content:
+                    cleaned_blocks: List[SubtitleBlock] = []
+
+                    for block in original_blocks:
+                        text = block.text
+                        if stripper.should_remove_subtitle_block(text):
+                            removed_count += 1
+                            continue
+
+                        cleaned_text = stripper.clean_subtitle_text(text)
+                        if not cleaned_text.strip():
+                            removed_count += 1
+                            continue
+
+                        if cleaned_text != text:
+                            modified_count += 1
+
+                        cleaned_blocks.append(
+                            SubtitleBlock(
+                                block.index,
+                                block.start_time,
+                                block.end_time,
+                                cleaned_text,
+                            )
+                        )
+                else:
+                    cleaned_blocks = list(original_blocks)
+
+                sanitized = sanitize_all_blocks(cleaned_blocks)
+                if len(sanitized) < len(cleaned_blocks):
+                    removed_count += len(cleaned_blocks) - len(sanitized)
+
+                if not sanitized:
+                    return self._fail("No dialogue subtitles found after cleaning")
+
+                renumbered = [
+                    SubtitleBlock(i + 1, b.start_time, b.end_time, b.text)
+                    for i, b in enumerate(sanitized)
+                ]
+
+                changed = len(renumbered) != len(original_blocks)
+                if not changed:
+                    for updated, original_block in zip(renumbered, original_blocks):
+                        if (
+                            updated.start_time != original_block.start_time
+                            or updated.end_time != original_block.end_time
+                            or updated.text != original_block.text
+                        ):
+                            changed = True
+                            break
+
+                if not changed:
+                    return {
+                        "success": True,
+                        "status": "Skipped",
+                        "summary": "No changes needed",
+                        "removed_blocks": 0,
+                        "modified_blocks": 0,
+                        "clean_keywords": detected_keywords,
+                    }
+
+                tmp = file_path.with_suffix(".srt.tmp")
+                tmp.write_text(format_srt(renumbered), encoding="utf-8")
+                tmp.replace(file_path)
+
+                summary = (
+                    f"Removed {removed_count} ad blocks, modified {modified_count} blocks"
+                    if clean_subtitle_content
+                    else "Cleaned subtitle content"
+                )
+
+                return {
+                    "success": True,
+                    "status": "Cleaned",
+                    "summary": summary,
+                    "removed_blocks": removed_count,
+                    "modified_blocks": modified_count,
+                    "clean_keywords": detected_keywords,
+                }
+
+        except FileLockError as e:
+            logger.error(f"Could not acquire lock for {file_path.name}: {e}")
+            return self._fail(f"File is being processed by another task: {e}")
+
     # ========================================================
     # Metadata fetching
     # ========================================================
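
Lastly, a hedged sketch of clean-only mode; SubtitleProcessor's constructor arguments are not part of this diff:

    processor = SubtitleProcessor()   # constructor args, if any, not shown here
    result = processor.clean_file("/media/tv/show.srt")

    if result["success"]:
        # status is "Cleaned" when the file was rewritten, "Skipped" when unchanged.
        print(result["status"], "-", result["summary"])
        print("Watermarks seen:", result["clean_keywords"])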