1.1.1 - adds OCR garbage removal

This commit is contained in:
2026-01-27 10:37:38 +13:00
parent 0a3fcf7532
commit c9d55fbee8
3 changed files with 319 additions and 51 deletions
+153 -40
View File
@@ -1267,6 +1267,7 @@ class SubtitleProcessor:
format_options: SubtitleFormatOptions = None,
strip_keywords: bool = True,
clean_subtitle_content: bool = True,
clean_subtitle_garbage: bool = False,
insertion_position: str = "start",
preferred_source: str | None = None,
language: str | None = None,
@@ -1294,6 +1295,8 @@ class SubtitleProcessor:
IMPORTANT: This ONLY affects the title lookup, NOT the subtitle content or timing.
clean_subtitle_content: If True, remove embedded ads/watermarks (YTS, RARBG, etc.)
from inside subtitle text. This cleans the actual dialogue content.
clean_subtitle_garbage: If True, remove OCR garbage like timecodes, music-only
lines, and duplicate lines inside a subtitle block.
"""
file_path = Path(file_path)
@@ -1375,6 +1378,7 @@ class SubtitleProcessor:
subs = parse_srt(original)
stripper = get_stripper()
detected_keywords = stripper.detect_subtitle_watermarks(original)
detected_keywords += stripper.detect_garbage_labels([b.text for b in subs])
if not subs:
return self._fail("No valid subtitle blocks found")
@@ -1395,7 +1399,7 @@ class SubtitleProcessor:
# This removes things like "YTS", "RARBG", "OpenSubtitles" etc.
# from inside the actual subtitle text
# ─────────────────────────────────────────────────────────────
if clean_subtitle_content:
if clean_subtitle_content or clean_subtitle_garbage:
stripper = get_stripper()
original_count = len(clean_subs)
@@ -1406,7 +1410,11 @@ class SubtitleProcessor:
]
# Clean the content
cleaned_dicts = stripper.clean_subtitle_blocks(blocks_as_dicts)
cleaned_dicts = stripper.clean_subtitle_blocks(
blocks_as_dicts,
remove_watermarks=clean_subtitle_content,
remove_garbage=clean_subtitle_garbage,
)
# Convert back to SubtitleBlock
clean_subs = [
@@ -1414,10 +1422,10 @@ class SubtitleProcessor:
for d in cleaned_dicts
]
removed_ads = original_count - len(clean_subs)
if removed_ads > 0:
removed_blocks = original_count - len(clean_subs)
if removed_blocks > 0:
logger.info(
f"Removed {removed_ads} ad/watermark subtitle blocks from {file_path.name}"
f"Removed {removed_blocks} cleaned subtitle blocks from {file_path.name}"
)
if not clean_subs:
@@ -1546,6 +1554,7 @@ class SubtitleProcessor:
self,
file_path: str | Path,
clean_subtitle_content: bool = True,
clean_subtitle_garbage: bool = False,
) -> dict:
"""Clean ad/watermark content from a subtitle file without inserting plots."""
file_path = Path(file_path)
@@ -1557,6 +1566,7 @@ class SubtitleProcessor:
return self._fail("Subtitle file too large")
try:
stripper = get_stripper()
with file_lock(file_path, timeout=30.0):
original = file_path.read_text(encoding="utf-8", errors="ignore")
subs = parse_srt(original)
@@ -1565,40 +1575,16 @@ class SubtitleProcessor:
return self._fail("No valid subtitle blocks found")
original_blocks = subs
removed_count = 0
modified_count = 0
detected_keywords = stripper.detect_subtitle_watermarks(original)
detected_keywords += stripper.detect_garbage_labels(
[b.text for b in original_blocks]
)
if clean_subtitle_content:
cleaned_blocks: List[SubtitleBlock] = []
for block in original_blocks:
text = block.text
if stripper.should_remove_subtitle_block(text):
removed_count += 1
continue
cleaned_text = stripper.clean_subtitle_text(text)
if not cleaned_text.strip():
removed_count += 1
continue
if cleaned_text != text:
modified_count += 1
cleaned_blocks.append(
SubtitleBlock(
block.index,
block.start_time,
block.end_time,
cleaned_text,
)
)
else:
cleaned_blocks = list(original_blocks)
sanitized = sanitize_all_blocks(cleaned_blocks)
if len(sanitized) < len(cleaned_blocks):
removed_count += len(cleaned_blocks) - len(sanitized)
sanitized, removed_count, modified_count = self._clean_blocks_for_content(
original_blocks,
clean_subtitle_content=clean_subtitle_content,
clean_subtitle_garbage=clean_subtitle_garbage,
)
if not sanitized:
return self._fail("No dialogue subtitles found after cleaning")
@@ -1634,8 +1620,8 @@ class SubtitleProcessor:
tmp.replace(file_path)
summary = (
f"Removed {removed_count} ad blocks, modified {modified_count} blocks"
if clean_subtitle_content
f"Removed {removed_count} blocks, modified {modified_count} blocks"
if (clean_subtitle_content or clean_subtitle_garbage)
else "Cleaned subtitle content"
)
@@ -1652,6 +1638,133 @@ class SubtitleProcessor:
logger.error(f"Could not acquire lock for {file_path.name}: {e}")
return self._fail(f"File is being processed by another task: {e}")
def preview_clean_file(
self,
file_path: str | Path,
clean_subtitle_content: bool = True,
clean_subtitle_garbage: bool = False,
max_changes: int = 80,
) -> dict:
"""Preview cleaning changes without modifying the file."""
file_path = Path(file_path)
if not file_path.exists():
return self._fail("File not found")
if file_path.stat().st_size > self.MAX_SRT_BYTES:
return self._fail("Subtitle file too large")
try:
with file_lock(file_path, timeout=10.0):
original = file_path.read_text(encoding="utf-8", errors="ignore")
subs = parse_srt(original)
if not subs:
return self._fail("No valid subtitle blocks found")
sanitized, removed_count, modified_count = self._clean_blocks_for_content(
subs,
clean_subtitle_content=clean_subtitle_content,
clean_subtitle_garbage=clean_subtitle_garbage,
)
cleaned_map = {}
for block in sanitized:
key = (block.start_time, block.end_time)
cleaned_map.setdefault(key, []).append(block)
changes = []
for block in subs:
key = (block.start_time, block.end_time)
updated = None
if key in cleaned_map and cleaned_map[key]:
updated = cleaned_map[key].pop(0)
if updated is None:
changes.append({
"type": "removed",
"start_ms": block.start_time,
"end_ms": block.end_time,
"timecode": f"{_ms_to_timecode(block.start_time)}{_ms_to_timecode(block.end_time)}",
"before": block.text,
"after": "",
})
elif updated.text != block.text:
changes.append({
"type": "modified",
"start_ms": block.start_time,
"end_ms": block.end_time,
"timecode": f"{_ms_to_timecode(block.start_time)}{_ms_to_timecode(block.end_time)}",
"before": block.text,
"after": updated.text,
})
if len(changes) >= max_changes:
break
return {
"success": True,
"summary": f"Removed {removed_count} blocks, modified {modified_count} blocks",
"removed_blocks": removed_count,
"modified_blocks": modified_count,
"total_changed_blocks": removed_count + modified_count,
"changes_truncated": (removed_count + modified_count) > len(changes),
"changes": changes,
}
except FileLockError as e:
logger.error(f"Could not acquire lock for {file_path.name}: {e}")
return self._fail(f"File is being processed by another task: {e}")
def _clean_blocks_for_content(
self,
original_blocks: List[SubtitleBlock],
clean_subtitle_content: bool,
clean_subtitle_garbage: bool,
) -> tuple[List[SubtitleBlock], int, int]:
stripper = get_stripper()
cleaned_blocks: List[SubtitleBlock] = []
removed_count = 0
modified_count = 0
for block in original_blocks:
text = block.text
if clean_subtitle_content and stripper.should_remove_subtitle_block(text):
removed_count += 1
continue
if clean_subtitle_content or clean_subtitle_garbage:
cleaned_text = stripper.clean_subtitle_text_with_options(
text,
remove_watermarks=clean_subtitle_content,
remove_garbage=clean_subtitle_garbage,
)
else:
cleaned_text = text
if not cleaned_text.strip():
removed_count += 1
continue
if cleaned_text != text:
modified_count += 1
cleaned_blocks.append(
SubtitleBlock(
block.index,
block.start_time,
block.end_time,
cleaned_text,
)
)
sanitized = sanitize_all_blocks(cleaned_blocks)
if len(sanitized) < len(cleaned_blocks):
removed_count += len(cleaned_blocks) - len(sanitized)
return sanitized, removed_count, modified_count
# ========================================================
# Metadata fetching
# ========================================================