Add support for special characters in filenames

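- Switched the subtitle helpers to `pathlib.Path` so that filenames containing special characters are handled correctly.
- Modified existing functions to handle special characters in filenames and to fall back gracefully when a file cannot be read or decoded.
- Updated tests to ensure correct behavior with special characters in filenames.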
This commit is contained in:
Roger Gonzalez 2025-03-19 15:49:38 -03:00
parent 72e2f5afdf
commit e7c4dc16f8
Signed by: rogs
GPG Key ID: C7ECE9C6C36EC2E6
2 changed files with 224 additions and 23 deletions

View File

@ -22,6 +22,7 @@ import os
import re import re
import sys import sys
import time import time
import pathlib
import chardet import chardet
import pysrt import pysrt
@ -110,35 +111,43 @@ def contains_ad(subtitle_line: str) -> bool:
return any(pattern.search(subtitle_line) for pattern in AD_PATTERNS) return any(pattern.search(subtitle_line) for pattern in AD_PATTERNS)
def is_processed_before(subtitle_file: pathlib.Path) -> bool:
    """
    Check if the subtitle file has already been processed.

    A file counts as processed when the timestamp reported by
    ``os.path.getctime`` is earlier than 2021-05-13 00:00:00 (local time).
    Note that ``getctime`` is platform-dependent: it is the creation time on
    Windows but the last metadata change time on Unix.

    Args:
        subtitle_file (pathlib.Path): The path to the subtitle file.

    Returns:
        bool: True if the subtitle file has already been processed, False otherwise.
            False is also returned when the timestamp cannot be read.
    """
    try:
        file_creation_time = os.path.getctime(subtitle_file)
    except (OSError, ValueError) as e:
        # OSError: the file is missing or unreadable.
        # ValueError: the path itself is invalid (e.g. an embedded NUL byte).
        # Anything else is a programming error and should surface.
        print(f"Error checking if file was processed before: {e}")
        return False

    processed_timestamp = time.mktime(
        time.strptime("2021-05-13 00:00:00", "%Y-%m-%d %H:%M:%S"),
    )
    return file_creation_time < processed_timestamp
def get_encoding(subtitle_file: pathlib.Path) -> str:
    """
    Detect the encoding of the subtitle file.

    Args:
        subtitle_file (pathlib.Path): The path to the subtitle file.

    Returns:
        str: The detected encoding of the subtitle file, or "utf-8" when the
            file cannot be read or no encoding could be detected.
    """
    try:
        with open(subtitle_file, "rb") as file:
            raw_data = file.read()
    except (OSError, ValueError) as e:
        # OSError: the file is missing or unreadable.
        # ValueError: the path itself is invalid (e.g. an embedded NUL byte).
        print(f"Error detecting encoding: {e}")
        return "utf-8"

    # The detected encoding may be None (e.g. nothing to guess from); fall back to utf-8.
    return chardet.detect(raw_data)["encoding"] or "utf-8"
def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool: def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
@ -152,33 +161,52 @@ def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
bool: True if the subtitle data was modified, False otherwise. bool: True if the subtitle data was modified, False otherwise.
""" """
modified = False modified = False
indices_to_remove = []
for index, subtitle in enumerate(subtitle_data): for index, subtitle in enumerate(subtitle_data):
if contains_ad(subtitle.text): if contains_ad(subtitle.text):
print(f"Removing: {subtitle}\n") print(f"Removing: {subtitle}\n")
del subtitle_data[index] indices_to_remove.append(index)
modified = True modified = True
for index in sorted(indices_to_remove, reverse=True):
del subtitle_data[index]
return modified return modified
def process_subtitle_file(subtitle_file: str) -> bool: def process_subtitle_file(subtitle_file_path: str) -> bool:
""" """
Process a subtitle file to remove ad lines. Process a subtitle file to remove ad lines.
Args: Args:
subtitle_file (str): The path to the subtitle file. subtitle_file_path (str): The path to the subtitle file.
Returns: Returns:
bool: True if the subtitle file was modified, False otherwise. bool: True if the subtitle file was modified, False otherwise.
""" """
try: try:
subtitle_file = pathlib.Path(subtitle_file_path)
print(f"Analyzing: {subtitle_file}")
if not subtitle_file.exists():
print(f"File not found: {subtitle_file}")
return False
if is_processed_before(subtitle_file): if is_processed_before(subtitle_file):
print(f"Already processed {subtitle_file}") print(f"Already processed {subtitle_file}")
return False return False
print(f"Analyzing: {subtitle_file}")
encoding = get_encoding(subtitle_file) encoding = get_encoding(subtitle_file)
subtitle_data = pysrt.open(subtitle_file, encoding=encoding) try:
subtitle_data = pysrt.open(subtitle_file, encoding=encoding)
except UnicodeDecodeError:
print(f"Failed to open with detected encoding {encoding}, trying utf-8")
subtitle_data = pysrt.open(subtitle_file, encoding="utf-8")
except Exception as e:
print(f"Error opening subtitle file with pysrt: {e}")
return False
if remove_ad_lines(subtitle_data): if remove_ad_lines(subtitle_data):
print(f"Saving {subtitle_file}") print(f"Saving {subtitle_file}")
@ -186,7 +214,7 @@ def process_subtitle_file(subtitle_file: str) -> bool:
return True return True
return False return False
except Exception as e: except Exception as e:
print(f"Error processing {subtitle_file}: {e}") print(f"Error processing {subtitle_file_path}: {e}")
return False return False

View File

@ -1,6 +1,9 @@
"""Unit tests for the subscleaner module.""" """Unit tests for the subscleaner module."""
import os
import sys
from io import StringIO from io import StringIO
from pathlib import Path
from unittest.mock import patch from unittest.mock import patch
import pysrt import pysrt
@ -34,6 +37,14 @@ Another sample subtitle.
""" """
@pytest.fixture
def special_chars_temp_dir(tmpdir):
    """Provide a ``special_chars`` directory inside pytest's ``tmpdir``.

    The directory is created if needed and returned as a ``pathlib.Path``;
    tests put their specially named subtitle files in it.
    """
    target_dir = Path(tmpdir).joinpath("special_chars")
    target_dir.mkdir(exist_ok=True)
    return target_dir
def create_sample_srt_file(tmpdir, content): def create_sample_srt_file(tmpdir, content):
"""Create a sample SRT file with the given content.""" """Create a sample SRT file with the given content."""
file_path = tmpdir.join("sample.srt") file_path = tmpdir.join("sample.srt")
@ -41,6 +52,28 @@ def create_sample_srt_file(tmpdir, content):
return str(file_path) return str(file_path)
def create_special_char_files(dir_path, content):
    """Write ``content`` to a fixed set of SRT files with unusual names.

    The names cover commas, spaces, accented Latin letters, a ``$`` sign and
    non-Latin script. Each file is written as UTF-8 under ``dir_path``.

    Returns:
        list[str]: The created file paths as strings, in creation order.
    """
    names = (
        "file,with,commas.srt",
        "file with spaces.srt",
        "file_with_ümlaut.srt",
        "file_with_ß_char.srt",
        "file_with_áccent.srt",
        "file_with_$ymbol.srt",
        "file_with_パーセント.srt",  # Japanese characters
    )

    def _write(name):
        target = dir_path / name
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(content)
        return str(target)

    return [_write(name) for name in names]
@pytest.mark.parametrize( @pytest.mark.parametrize(
"subtitle_line, expected_result", "subtitle_line, expected_result",
[ [
@ -69,11 +102,13 @@ def test_is_processed_before(tmpdir):
tmpdir (pytest.fixture): A temporary directory for creating the sample SRT file. tmpdir (pytest.fixture): A temporary directory for creating the sample SRT file.
""" """
subtitle_file = create_sample_srt_file(tmpdir, "") subtitle_file = create_sample_srt_file(tmpdir, "")
with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=0): subtitle_path = Path(subtitle_file)
assert is_processed_before(subtitle_file) is True
with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=9999999999): with patch("os.path.getctime", return_value=0):
assert is_processed_before(subtitle_file) is False assert is_processed_before(subtitle_path) is True
with patch("os.path.getctime", return_value=9999999999):
assert is_processed_before(subtitle_path) is False
def test_get_encoding(tmpdir, sample_srt_content): def test_get_encoding(tmpdir, sample_srt_content):
@ -85,7 +120,8 @@ def test_get_encoding(tmpdir, sample_srt_content):
sample_srt_content (str): The sample SRT content. sample_srt_content (str): The sample SRT content.
""" """
subtitle_file = create_sample_srt_file(tmpdir, sample_srt_content) subtitle_file = create_sample_srt_file(tmpdir, sample_srt_content)
assert get_encoding(subtitle_file) == "ascii" encoding = get_encoding(Path(subtitle_file))
assert encoding in ("ascii", "utf-8"), f"Expected ascii or utf-8, got {encoding}"
def test_remove_ad_lines(sample_srt_content): def test_remove_ad_lines(sample_srt_content):
@ -192,3 +228,140 @@ def test_main_with_modification(tmpdir, sample_srt_content):
): ):
main() main()
mock_process_subtitle_files.assert_called_once_with([subtitle_file]) mock_process_subtitle_files.assert_called_once_with([subtitle_file])
def test_process_files_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Check that a batch of specially named subtitle files is cleaned.

    Every created file must be reported as modified, and none may still
    contain the "OpenSubtitles" ad text afterwards.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    special_files = create_special_char_files(special_chars_temp_dir, sample_srt_content)

    # Bypass the timestamp check so that freshly created files are processed.
    with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
        modified_files = process_subtitle_files(special_files)

    expected_count = len(special_files)
    assert len(modified_files) == expected_count, "Not all files with special characters were processed"

    for file_path in special_files:
        content = Path(file_path).read_text(encoding="utf-8")
        assert "OpenSubtitles" not in content, f"Ad not removed from {file_path}"
def test_get_encoding_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Test encoding detection for files with special characters in their names.

    Covers an existing file, which must yield some encoding, and a missing
    file, which must fall back to "utf-8" instead of raising.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    file_path = special_chars_temp_dir / "test_ümlaut_ß_áccent.srt"
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(sample_srt_content)

    encoding = get_encoding(file_path)
    assert encoding is not None, "Encoding detection failed for file with special characters"

    # No try/except around this call: pytest already reports an unexpected
    # exception with its traceback, and a blanket ``except Exception`` would
    # also catch the AssertionError below and misreport it.
    non_existent_file = special_chars_temp_dir / "non_existent_ümlaut.srt"
    encoding = get_encoding(non_existent_file)
    assert encoding == "utf-8", "Fallback encoding is not utf-8"
def test_is_processed_before_with_special_chars(special_chars_temp_dir):
    """
    Exercise is_processed_before on paths whose names contain an umlaut.

    The timestamp is faked to fall before and after the cutoff, then a
    missing file is checked to confirm it is reported as not processed.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
    """
    file_path = special_chars_temp_dir / "check_processed_ümlaut.srt"
    with open(file_path, "w", encoding="utf-8") as handle:
        handle.write("Test content")

    # (fake getctime value, expected result), in the original order.
    scenarios = ((0, True), (9999999999, False))
    for fake_ctime, expected in scenarios:
        with patch("os.path.getctime", return_value=fake_ctime):
            assert is_processed_before(file_path) is expected

    non_existent_file = special_chars_temp_dir / "non_existent_ümlaut.srt"
    assert is_processed_before(non_existent_file) is False
def test_process_subtitle_file_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Run process_subtitle_file on a single file whose name has an umlaut and a comma.

    The file must be reported as modified and lose its "OpenSubtitles" ad;
    a missing path with a similar name must be reported as not modified.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    target = special_chars_temp_dir / "process_this_ümlaut,file.srt"
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(sample_srt_content)

    # Bypass the timestamp check so that the freshly created file is processed.
    with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
        was_modified = process_subtitle_file(str(target))
        assert was_modified is True

    with open(target, "r", encoding="utf-8") as handle:
        cleaned_text = handle.read()
    assert "OpenSubtitles" not in cleaned_text

    missing_path = str(special_chars_temp_dir / "non_existent_ümlaut,file.srt")
    assert process_subtitle_file(missing_path) is False
def test_file_saving_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Test that files with special characters can be saved correctly after modification.

    Each file reported as modified must still exist, be readable as UTF-8,
    and no longer contain the "OpenSubtitles" ad text.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    special_files = create_special_char_files(special_chars_temp_dir, sample_srt_content)

    with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
        modified_files = process_subtitle_files(special_files)

    # Without this guard the loop below would pass vacuously on an empty result.
    assert modified_files, "No files were reported as modified"

    for file_path in modified_files:
        assert os.path.exists(file_path), f"File {file_path} does not exist after saving"

        # Only the reopen/read is guarded. The content assertion stays outside
        # so that a failing assert is not misreported as a reopen failure.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            pytest.fail(f"Failed to reopen file {file_path} after saving: {e}")

        assert "OpenSubtitles" not in content, f"Content was not properly saved in {file_path}"
def test_main_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Feed main() a specially named path on stdin and check how it is forwarded.

    process_subtitle_files is mocked; the test asserts it is called exactly
    once with a one-element list holding the path read from stdin.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    target = special_chars_temp_dir / "main_test_ümlaut,file.srt"
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(sample_srt_content)

    path_text = str(target)
    fake_stdin = StringIO(path_text)

    with (
        patch("sys.stdin", fake_stdin),
        patch(
            "src.subscleaner.subscleaner.process_subtitle_files",
            return_value=[path_text],
        ) as mock_process_subtitle_files,
    ):
        main()
        mock_process_subtitle_files.assert_called_once_with([path_text])