Merge branch 'fix-special-characters-issue-on-windows' into 'master'

Add support for special characters in filenames. Closes #2. See merge request rogs/subscleaner!2
2025-03-19 15:56:36 -03:00 · 2025-03-19 15:56:36 -03:00 · cf619272d3
commit cf619272d3
parent 72e2f5afdf 9d1b6c2342
3 changed files with 224 additions and 24 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "subscleaner"
-version = "1.1.5"
+version = "1.2.0"
 description = "Remove advertisements from subtitle files"
 authors = ["Roger Gonzalez <roger@rogs.me>"]
 license = "GPL-3.0-or-later"
--- a/src/subscleaner/subscleaner.py
+++ b/src/subscleaner/subscleaner.py
@ -19,6 +19,7 @@ along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """
 import os
 import pathlib
 import re
 import sys
 import time
@ -110,35 +111,43 @@ def contains_ad(subtitle_line: str) -> bool:
    return any(pattern.search(subtitle_line) for pattern in AD_PATTERNS)
-def is_processed_before(subtitle_file: str) -> bool:
+def is_processed_before(subtitle_file: pathlib.Path) -> bool:
    """
    Check if the subtitle file has already been processed.
    Args:
-        subtitle_file (str): The path to the subtitle file.
+        subtitle_file (pathlib.Path): The path to the subtitle file.
    Returns:
        bool: True if the subtitle file has already been processed, False otherwise.
    """
-    file_creation_time = os.path.getctime(subtitle_file)
+    try:
-    processed_timestamp = time.mktime(
+        file_creation_time = os.path.getctime(subtitle_file)
-        time.strptime("2021-05-13 00:00:00", "%Y-%m-%d %H:%M:%S"),
+        processed_timestamp = time.mktime(
-    )
+            time.strptime("2021-05-13 00:00:00", "%Y-%m-%d %H:%M:%S"),
-    return file_creation_time < processed_timestamp
+        )
        return file_creation_time < processed_timestamp
    except Exception as e:
        print(f"Error checking if file was processed before: {e}")
        return False
-def get_encoding(subtitle_file: str) -> str:
+def get_encoding(subtitle_file: pathlib.Path) -> str:
    """
    Detect the encoding of the subtitle file.
    Args:
-        subtitle_file (str): The path to the subtitle file.
+        subtitle_file (pathlib.Path): The path to the subtitle file.
    Returns:
        str: The detected encoding of the subtitle file.
    """
-    with open(subtitle_file, "rb") as file:
+    try:
-        return chardet.detect(file.read())["encoding"]
+        with open(subtitle_file, "rb") as file:
            return chardet.detect(file.read())["encoding"] or "utf-8"
    except Exception as e:
        print(f"Error detecting encoding: {e}")
        return "utf-8"
 def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
@ -152,33 +161,52 @@ def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
        bool: True if the subtitle data was modified, False otherwise.
    """
    modified = False
    indices_to_remove = []
    for index, subtitle in enumerate(subtitle_data):
        if contains_ad(subtitle.text):
            print(f"Removing: {subtitle}\n")
-            del subtitle_data[index]
+            indices_to_remove.append(index)
            modified = True
    for index in sorted(indices_to_remove, reverse=True):
        del subtitle_data[index]
    return modified
-def process_subtitle_file(subtitle_file: str) -> bool:
+def process_subtitle_file(subtitle_file_path: str) -> bool:
    """
    Process a subtitle file to remove ad lines.
    Args:
-        subtitle_file (str): The path to the subtitle file.
+        subtitle_file_path (str): The path to the subtitle file.
    Returns:
        bool: True if the subtitle file was modified, False otherwise.
    """
    try:
        subtitle_file = pathlib.Path(subtitle_file_path)
        print(f"Analyzing: {subtitle_file}")
        if not subtitle_file.exists():
            print(f"File not found: {subtitle_file}")
            return False
        if is_processed_before(subtitle_file):
            print(f"Already processed {subtitle_file}")
            return False
        print(f"Analyzing: {subtitle_file}")
        encoding = get_encoding(subtitle_file)
-        subtitle_data = pysrt.open(subtitle_file, encoding=encoding)
+        try:
            subtitle_data = pysrt.open(subtitle_file, encoding=encoding)
        except UnicodeDecodeError:
            print(f"Failed to open with detected encoding {encoding}, trying utf-8")
            subtitle_data = pysrt.open(subtitle_file, encoding="utf-8")
        except Exception as e:
            print(f"Error opening subtitle file with pysrt: {e}")
            return False
        if remove_ad_lines(subtitle_data):
            print(f"Saving {subtitle_file}")
@ -186,7 +214,7 @@ def process_subtitle_file(subtitle_file: str) -> bool:
            return True
        return False
    except Exception as e:
-        print(f"Error processing {subtitle_file}: {e}")
+        print(f"Error processing {subtitle_file_path}: {e}")
        return False
--- a/tests/test_subscleaner.py
+++ b/tests/test_subscleaner.py
@ -1,6 +1,8 @@
 """Unit tests for the subscleaner module."""
 import os
 from io import StringIO
 from pathlib import Path
 from unittest.mock import patch
 import pysrt
@ -34,6 +36,14 @@ Another sample subtitle.
 """
@pytest.fixture
 def special_chars_temp_dir(tmpdir):
    """Create a temporary directory with special character filenames."""
    special_chars_dir = Path(tmpdir) / "special_chars"
    special_chars_dir.mkdir(exist_ok=True)
    return special_chars_dir
 def create_sample_srt_file(tmpdir, content):
    """Create a sample SRT file with the given content."""
    file_path = tmpdir.join("sample.srt")
@ -41,6 +51,28 @@ def create_sample_srt_file(tmpdir, content):
    return str(file_path)
 def create_special_char_files(dir_path, content):
    """
    Write the same SRT content to several files whose names contain special characters.
    Args:
        dir_path: Directory in which the files are created (must support the "/" operator).
        content: Text written to every file, encoded as UTF-8.
    Returns:
        list: The created file paths as strings, in creation order.
    """
    names = (
        "file,with,commas.srt",
        "file with spaces.srt",
        "file_with_ümlaut.srt",
        "file_with_ß_char.srt",
        "file_with_áccent.srt",
        "file_with_$ymbol.srt",
        "file_with_パーセント.srt",  # Japanese characters
    )
    targets = [dir_path / name for name in names]
    for target in targets:
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(content)
    return [str(target) for target in targets]
@pytest.mark.parametrize(
    "subtitle_line, expected_result",
    [
@ -69,11 +101,13 @@ def test_is_processed_before(tmpdir):
        tmpdir (pytest.fixture): A temporary directory for creating the sample SRT file.
    """
    subtitle_file = create_sample_srt_file(tmpdir, "")
-    with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=0):
+    subtitle_path = Path(subtitle_file)
        assert is_processed_before(subtitle_file) is True
-    with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=9999999999):
+    with patch("os.path.getctime", return_value=0):
-        assert is_processed_before(subtitle_file) is False
+        assert is_processed_before(subtitle_path) is True
    with patch("os.path.getctime", return_value=9999999999):
        assert is_processed_before(subtitle_path) is False
 def test_get_encoding(tmpdir, sample_srt_content):
@ -85,7 +119,8 @@ def test_get_encoding(tmpdir, sample_srt_content):
        sample_srt_content (str): The sample SRT content.
    """
    subtitle_file = create_sample_srt_file(tmpdir, sample_srt_content)
-    assert get_encoding(subtitle_file) == "ascii"
+    encoding = get_encoding(Path(subtitle_file))
    assert encoding in ("ascii", "utf-8"), f"Expected ascii or utf-8, got {encoding}"
 def test_remove_ad_lines(sample_srt_content):
@ -192,3 +227,140 @@ def test_main_with_modification(tmpdir, sample_srt_content):
    ):
        main()
        mock_process_subtitle_files.assert_called_once_with([subtitle_file])
 def test_process_files_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Verify that every specially-named subtitle file is reported as modified and cleaned.
    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    srt_paths = create_special_char_files(special_chars_temp_dir, sample_srt_content)
    # Force the already-processed check to report False for every file.
    with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
        cleaned = process_subtitle_files(srt_paths)
    assert len(cleaned) == len(srt_paths), "Not all files with special characters were processed"
    for file_path in srt_paths:
        text = Path(file_path).read_text(encoding="utf-8")
        assert "OpenSubtitles" not in text, f"Ad not removed from {file_path}"
 def test_get_encoding_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Test encoding detection for files with special characters in their names.
    Covers an existing file, for which some encoding must be reported, and a missing
    file, for which get_encoding must return the "utf-8" fallback instead of raising.
    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    file_path = special_chars_temp_dir / "test_ümlaut_ß_áccent.srt"
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(sample_srt_content)
    encoding = get_encoding(file_path)
    assert encoding is not None, "Encoding detection failed for file with special characters"
    non_existent_file = special_chars_temp_dir / "non_existent_ümlaut.srt"
    # Only the call is guarded. AssertionError is a subclass of Exception, so an assert
    # inside the try block would be caught and misreported as "get_encoding raised ...".
    try:
        fallback_encoding = get_encoding(non_existent_file)
    except Exception as e:
        pytest.fail(f"get_encoding raised {e} with non-existent file")
    assert fallback_encoding == "utf-8", "Fallback encoding is not utf-8"
 def test_is_processed_before_with_special_chars(special_chars_temp_dir):
    """
    Check is_processed_before against a specially-named file and a missing one.
    Args:
        special_chars_temp_dir: Temporary directory for special character files
    """
    existing = special_chars_temp_dir / "check_processed_ümlaut.srt"
    existing.write_text("Test content", encoding="utf-8")
    # The creation time is mocked: first a very old timestamp, then a far-future one.
    for fake_ctime, expected in ((0, True), (9999999999, False)):
        with patch("os.path.getctime", return_value=fake_ctime):
            assert is_processed_before(existing) is expected
    # A file that does not exist must be reported as not processed.
    missing = special_chars_temp_dir / "non_existent_ümlaut.srt"
    assert is_processed_before(missing) is False
 def test_process_subtitle_file_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Exercise process_subtitle_file on a specially-named file and on a missing one.
    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    target = special_chars_temp_dir / "process_this_ümlaut,file.srt"
    target.write_text(sample_srt_content, encoding="utf-8")
    # Force the already-processed check to report False so the file is handled.
    with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
        was_modified = process_subtitle_file(str(target))
        assert was_modified is True
    remaining_text = target.read_text(encoding="utf-8")
    assert "OpenSubtitles" not in remaining_text
    # A path that does not exist must be reported as unmodified.
    missing = str(special_chars_temp_dir / "non_existent_ümlaut,file.srt")
    assert process_subtitle_file(missing) is False
 def test_file_saving_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Test that files with special characters can be saved correctly after modification.
    Every path reported as modified must still exist, be readable as UTF-8 and no
    longer contain the ad text.
    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    special_files = create_special_char_files(special_chars_temp_dir, sample_srt_content)
    with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
        modified_files = process_subtitle_files(special_files)
    # Guard against a vacuous pass: with no modified files the loop below checks nothing.
    assert modified_files, "No files with special characters were reported as modified"
    for file_path in modified_files:
        assert os.path.exists(file_path), f"File {file_path} does not exist after saving"
        # Only the read is guarded. AssertionError is a subclass of Exception, so a content
        # assertion inside the try block would be misreported as a failure to reopen the file.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except Exception as e:
            pytest.fail(f"Failed to reopen file {file_path} after saving: {e}")
        assert "OpenSubtitles" not in content, f"Content was not properly saved in {file_path}"
 def test_main_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Run main() with a specially-named file path supplied on stdin.
    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    srt_file = special_chars_temp_dir / "main_test_ümlaut,file.srt"
    srt_file.write_text(sample_srt_content, encoding="utf-8")
    expected_path = str(srt_file)
    # stdin is replaced with the file path and the processing function is mocked out,
    # so only the hand-off of the path from main() to process_subtitle_files is checked.
    stdin_patch = patch("sys.stdin", StringIO(expected_path))
    process_patch = patch(
        "src.subscleaner.subscleaner.process_subtitle_files",
        return_value=[expected_path],
    )
    with stdin_patch, process_patch as process_mock:
        main()
    process_mock.assert_called_once_with([expected_path])