Add support for special characters in filenames #5

Merged
rogs merged 3 commits from fix-special-characters-issue-on-windows into master 2025-03-19 15:56:37 -03:00
3 changed files with 224 additions and 24 deletions

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "subscleaner"
version = "1.1.5"
version = "1.2.0"
description = "Remove advertisements from subtitle files"
authors = ["Roger Gonzalez <roger@rogs.me>"]
license = "GPL-3.0-or-later"

View File

@ -19,6 +19,7 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import os
import pathlib
import re
import sys
import time
@ -110,35 +111,43 @@ def contains_ad(subtitle_line: str) -> bool:
return any(pattern.search(subtitle_line) for pattern in AD_PATTERNS)
def is_processed_before(subtitle_file: str) -> bool:
def is_processed_before(subtitle_file: pathlib.Path) -> bool:
"""
Check if the subtitle file has already been processed.
Args:
subtitle_file (str): The path to the subtitle file.
subtitle_file (pathlib.Path): The path to the subtitle file.
Returns:
bool: True if the subtitle file has already been processed, False otherwise.
"""
file_creation_time = os.path.getctime(subtitle_file)
processed_timestamp = time.mktime(
time.strptime("2021-05-13 00:00:00", "%Y-%m-%d %H:%M:%S"),
)
return file_creation_time < processed_timestamp
try:
file_creation_time = os.path.getctime(subtitle_file)
processed_timestamp = time.mktime(
time.strptime("2021-05-13 00:00:00", "%Y-%m-%d %H:%M:%S"),
)
return file_creation_time < processed_timestamp
except Exception as e:
print(f"Error checking if file was processed before: {e}")
return False
def get_encoding(subtitle_file: str) -> str:
def get_encoding(subtitle_file: pathlib.Path) -> str:
"""
Detect the encoding of the subtitle file.
Args:
subtitle_file (str): The path to the subtitle file.
subtitle_file (pathlib.Path): The path to the subtitle file.
Returns:
str: The detected encoding of the subtitle file.
"""
with open(subtitle_file, "rb") as file:
return chardet.detect(file.read())["encoding"]
try:
with open(subtitle_file, "rb") as file:
return chardet.detect(file.read())["encoding"] or "utf-8"
except Exception as e:
print(f"Error detecting encoding: {e}")
return "utf-8"
def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
@ -152,33 +161,52 @@ def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
bool: True if the subtitle data was modified, False otherwise.
"""
modified = False
indices_to_remove = []
for index, subtitle in enumerate(subtitle_data):
if contains_ad(subtitle.text):
print(f"Removing: {subtitle}\n")
del subtitle_data[index]
indices_to_remove.append(index)
modified = True
for index in sorted(indices_to_remove, reverse=True):
del subtitle_data[index]
return modified
def process_subtitle_file(subtitle_file: str) -> bool:
def process_subtitle_file(subtitle_file_path: str) -> bool:
"""
Process a subtitle file to remove ad lines.
Args:
subtitle_file (str): The path to the subtitle file.
subtitle_file_path (str): The path to the subtitle file.
Returns:
bool: True if the subtitle file was modified, False otherwise.
"""
try:
subtitle_file = pathlib.Path(subtitle_file_path)
print(f"Analyzing: {subtitle_file}")
if not subtitle_file.exists():
print(f"File not found: {subtitle_file}")
return False
if is_processed_before(subtitle_file):
print(f"Already processed {subtitle_file}")
return False
print(f"Analyzing: {subtitle_file}")
encoding = get_encoding(subtitle_file)
subtitle_data = pysrt.open(subtitle_file, encoding=encoding)
try:
subtitle_data = pysrt.open(subtitle_file, encoding=encoding)
except UnicodeDecodeError:
print(f"Failed to open with detected encoding {encoding}, trying utf-8")
subtitle_data = pysrt.open(subtitle_file, encoding="utf-8")
except Exception as e:
print(f"Error opening subtitle file with pysrt: {e}")
return False
if remove_ad_lines(subtitle_data):
print(f"Saving {subtitle_file}")
@ -186,7 +214,7 @@ def process_subtitle_file(subtitle_file: str) -> bool:
return True
return False
except Exception as e:
print(f"Error processing {subtitle_file}: {e}")
print(f"Error processing {subtitle_file_path}: {e}")
return False

View File

@ -1,6 +1,8 @@
"""Unit tests for the subscleaner module."""
import os
from io import StringIO
from pathlib import Path
from unittest.mock import patch
import pysrt
@ -34,6 +36,14 @@ Another sample subtitle.
"""
@pytest.fixture
def special_chars_temp_dir(tmpdir):
"""Create a temporary directory with special character filenames."""
special_chars_dir = Path(tmpdir) / "special_chars"
special_chars_dir.mkdir(exist_ok=True)
return special_chars_dir
def create_sample_srt_file(tmpdir, content):
"""Create a sample SRT file with the given content."""
file_path = tmpdir.join("sample.srt")
@ -41,6 +51,28 @@ def create_sample_srt_file(tmpdir, content):
return str(file_path)
def create_special_char_files(dir_path, content):
"""Create sample SRT files with special characters in their names."""
special_filenames = [
"file,with,commas.srt",
"file with spaces.srt",
"file_with_ümlaut.srt",
"file_with_ß_char.srt",
"file_with_áccent.srt",
"file_with_$ymbol.srt",
"file_with_パーセント.srt", # Japanese characters
]
created_files = []
for filename in special_filenames:
file_path = dir_path / filename
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)
created_files.append(str(file_path))
return created_files
@pytest.mark.parametrize(
"subtitle_line, expected_result",
[
@ -69,11 +101,13 @@ def test_is_processed_before(tmpdir):
tmpdir (pytest.fixture): A temporary directory for creating the sample SRT file.
"""
subtitle_file = create_sample_srt_file(tmpdir, "")
with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=0):
assert is_processed_before(subtitle_file) is True
subtitle_path = Path(subtitle_file)
with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=9999999999):
assert is_processed_before(subtitle_file) is False
with patch("os.path.getctime", return_value=0):
assert is_processed_before(subtitle_path) is True
with patch("os.path.getctime", return_value=9999999999):
assert is_processed_before(subtitle_path) is False
def test_get_encoding(tmpdir, sample_srt_content):
@ -85,7 +119,8 @@ def test_get_encoding(tmpdir, sample_srt_content):
sample_srt_content (str): The sample SRT content.
"""
subtitle_file = create_sample_srt_file(tmpdir, sample_srt_content)
assert get_encoding(subtitle_file) == "ascii"
encoding = get_encoding(Path(subtitle_file))
assert encoding in ("ascii", "utf-8"), f"Expected ascii or utf-8, got {encoding}"
def test_remove_ad_lines(sample_srt_content):
@ -192,3 +227,140 @@ def test_main_with_modification(tmpdir, sample_srt_content):
):
main()
mock_process_subtitle_files.assert_called_once_with([subtitle_file])
def test_process_files_with_special_chars(special_chars_temp_dir, sample_srt_content):
"""
Test processing subtitle files with special characters in their names.
Args:
special_chars_temp_dir: Temporary directory for special character files
sample_srt_content: Sample SRT content
"""
special_files = create_special_char_files(special_chars_temp_dir, sample_srt_content)
with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
modified_files = process_subtitle_files(special_files)
assert len(modified_files) == len(special_files), "Not all files with special characters were processed"
for file_path in special_files:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
assert "OpenSubtitles" not in content, f"Ad not removed from {file_path}"
def test_get_encoding_with_special_chars(special_chars_temp_dir, sample_srt_content):
"""
Test encoding detection for files with special characters in their names.
Args:
special_chars_temp_dir: Temporary directory for special character files
sample_srt_content: Sample SRT content
"""
file_path = special_chars_temp_dir / "test_ümlaut_ß_áccent.srt"
with open(file_path, "w", encoding="utf-8") as f:
f.write(sample_srt_content)
encoding = get_encoding(file_path)
assert encoding is not None, "Encoding detection failed for file with special characters"
non_existent_file = special_chars_temp_dir / "non_existent_ümlaut.srt"
try:
encoding = get_encoding(non_existent_file)
assert encoding == "utf-8", "Fallback encoding is not utf-8"
except Exception as e:
pytest.fail(f"get_encoding raised {e} with non-existent file")
def test_is_processed_before_with_special_chars(special_chars_temp_dir):
"""
Test is_processed_before function with special character filenames.
Args:
special_chars_temp_dir: Temporary directory for special character files
"""
file_path = special_chars_temp_dir / "check_processed_ümlaut.srt"
with open(file_path, "w", encoding="utf-8") as f:
f.write("Test content")
with patch("os.path.getctime", return_value=0):
assert is_processed_before(file_path) is True
with patch("os.path.getctime", return_value=9999999999):
assert is_processed_before(file_path) is False
non_existent_file = special_chars_temp_dir / "non_existent_ümlaut.srt"
assert is_processed_before(non_existent_file) is False
def test_process_subtitle_file_with_special_chars(special_chars_temp_dir, sample_srt_content):
"""
Test process_subtitle_file function with special character filenames.
Args:
special_chars_temp_dir: Temporary directory for special character files
sample_srt_content: Sample SRT content
"""
file_path = special_chars_temp_dir / "process_this_ümlaut,file.srt"
with open(file_path, "w", encoding="utf-8") as f:
f.write(sample_srt_content)
with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
assert process_subtitle_file(str(file_path)) is True
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
assert "OpenSubtitles" not in content
non_existent_file = str(special_chars_temp_dir / "non_existent_ümlaut,file.srt")
assert process_subtitle_file(non_existent_file) is False
def test_file_saving_with_special_chars(special_chars_temp_dir, sample_srt_content):
"""
Test that files with special characters can be saved correctly after modification.
Args:
special_chars_temp_dir: Temporary directory for special character files
sample_srt_content: Sample SRT content
"""
special_files = create_special_char_files(special_chars_temp_dir, sample_srt_content)
with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
modified_files = process_subtitle_files(special_files)
for file_path in modified_files:
assert os.path.exists(file_path), f"File {file_path} does not exist after saving"
try:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
assert "OpenSubtitles" not in content, f"Content was not properly saved in {file_path}"
except Exception as e:
pytest.fail(f"Failed to reopen file {file_path} after saving: {e}")
def test_main_with_special_chars(special_chars_temp_dir, sample_srt_content):
"""
Test the main function with filenames containing special characters.
Args:
special_chars_temp_dir: Temporary directory for special character files
sample_srt_content: Sample SRT content
"""
file_path = special_chars_temp_dir / "main_test_ümlaut,file.srt"
with open(file_path, "w", encoding="utf-8") as f:
f.write(sample_srt_content)
stdin_content = str(file_path)
with (
patch("sys.stdin", StringIO(stdin_content)),
patch(
"src.subscleaner.subscleaner.process_subtitle_files",
return_value=[str(file_path)],
) as mock_process_subtitle_files,
):
main()
mock_process_subtitle_files.assert_called_once_with([str(file_path)])