From e7c4dc16f842ab047a3f7e1bf4bb2490cfed06ff Mon Sep 17 00:00:00 2001 From: Roger Gonzalez Date: Wed, 19 Mar 2025 15:49:38 -0300 Subject: [PATCH 1/3] Add support for special characters in filenames - Added a new function to handle special characters in filenames. - Modified existing functions to handle special characters in filenames. - Updated tests to ensure correct behavior with special characters in filenames. --- src/subscleaner/subscleaner.py | 64 ++++++++---- tests/test_subscleaner.py | 183 ++++++++++++++++++++++++++++++++- 2 files changed, 224 insertions(+), 23 deletions(-) diff --git a/src/subscleaner/subscleaner.py b/src/subscleaner/subscleaner.py index a6fc36b..e12a29b 100755 --- a/src/subscleaner/subscleaner.py +++ b/src/subscleaner/subscleaner.py @@ -22,6 +22,7 @@ import os import re import sys import time +import pathlib import chardet import pysrt @@ -110,35 +111,43 @@ def contains_ad(subtitle_line: str) -> bool: return any(pattern.search(subtitle_line) for pattern in AD_PATTERNS) -def is_processed_before(subtitle_file: str) -> bool: +def is_processed_before(subtitle_file: pathlib.Path) -> bool: """ Check if the subtitle file has already been processed. Args: - subtitle_file (str): The path to the subtitle file. + subtitle_file (pathlib.Path): The path to the subtitle file. Returns: bool: True if the subtitle file has already been processed, False otherwise. """ - file_creation_time = os.path.getctime(subtitle_file) - processed_timestamp = time.mktime( - time.strptime("2021-05-13 00:00:00", "%Y-%m-%d %H:%M:%S"), - ) - return file_creation_time < processed_timestamp + try: + file_creation_time = os.path.getctime(subtitle_file) + processed_timestamp = time.mktime( + time.strptime("2021-05-13 00:00:00", "%Y-%m-%d %H:%M:%S"), + ) + return file_creation_time < processed_timestamp + except Exception as e: + print(f"Error checking if file was processed before: {e}") + return False -def get_encoding(subtitle_file: str) -> str: +def get_encoding(subtitle_file: pathlib.Path) -> str: """ Detect the encoding of the subtitle file. Args: - subtitle_file (str): The path to the subtitle file. + subtitle_file (pathlib.Path): The path to the subtitle file. Returns: str: The detected encoding of the subtitle file. """ - with open(subtitle_file, "rb") as file: - return chardet.detect(file.read())["encoding"] + try: + with open(subtitle_file, "rb") as file: + return chardet.detect(file.read())["encoding"] or "utf-8" + except Exception as e: + print(f"Error detecting encoding: {e}") + return "utf-8" def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool: @@ -152,33 +161,52 @@ def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool: bool: True if the subtitle data was modified, False otherwise. """ modified = False + indices_to_remove = [] + for index, subtitle in enumerate(subtitle_data): if contains_ad(subtitle.text): print(f"Removing: {subtitle}\n") - del subtitle_data[index] + indices_to_remove.append(index) modified = True + + for index in sorted(indices_to_remove, reverse=True): + del subtitle_data[index] + return modified -def process_subtitle_file(subtitle_file: str) -> bool: +def process_subtitle_file(subtitle_file_path: str) -> bool: """ Process a subtitle file to remove ad lines. Args: - subtitle_file (str): The path to the subtitle file. + subtitle_file_path (str): The path to the subtitle file. Returns: bool: True if the subtitle file was modified, False otherwise. """ try: + subtitle_file = pathlib.Path(subtitle_file_path) + + print(f"Analyzing: {subtitle_file}") + + if not subtitle_file.exists(): + print(f"File not found: {subtitle_file}") + return False + if is_processed_before(subtitle_file): print(f"Already processed {subtitle_file}") return False - print(f"Analyzing: {subtitle_file}") - encoding = get_encoding(subtitle_file) - subtitle_data = pysrt.open(subtitle_file, encoding=encoding) + try: + subtitle_data = pysrt.open(subtitle_file, encoding=encoding) + except UnicodeDecodeError: + print(f"Failed to open with detected encoding {encoding}, trying utf-8") + subtitle_data = pysrt.open(subtitle_file, encoding="utf-8") + except Exception as e: + print(f"Error opening subtitle file with pysrt: {e}") + return False if remove_ad_lines(subtitle_data): print(f"Saving {subtitle_file}") @@ -186,7 +214,7 @@ def process_subtitle_file(subtitle_file: str) -> bool: return True return False except Exception as e: - print(f"Error processing {subtitle_file}: {e}") + print(f"Error processing {subtitle_file_path}: {e}") return False diff --git a/tests/test_subscleaner.py b/tests/test_subscleaner.py index 52eec01..7147f8e 100644 --- a/tests/test_subscleaner.py +++ b/tests/test_subscleaner.py @@ -1,6 +1,9 @@ """Unit tests for the subscleaner module.""" +import os +import sys from io import StringIO +from pathlib import Path from unittest.mock import patch import pysrt @@ -34,6 +37,14 @@ Another sample subtitle. """ +@pytest.fixture +def special_chars_temp_dir(tmpdir): + """Create a temporary directory with special character filenames.""" + special_chars_dir = Path(tmpdir) / "special_chars" + special_chars_dir.mkdir(exist_ok=True) + return special_chars_dir + + def create_sample_srt_file(tmpdir, content): """Create a sample SRT file with the given content.""" file_path = tmpdir.join("sample.srt") @@ -41,6 +52,28 @@ def create_sample_srt_file(tmpdir, content): return str(file_path) +def create_special_char_files(dir_path, content): + """Create sample SRT files with special characters in their names.""" + special_filenames = [ + "file,with,commas.srt", + "file with spaces.srt", + "file_with_ümlaut.srt", + "file_with_ß_char.srt", + "file_with_áccent.srt", + "file_with_$ymbol.srt", + "file_with_パーセント.srt", # Japanese characters + ] + + created_files = [] + for filename in special_filenames: + file_path = dir_path / filename + with open(file_path, "w", encoding="utf-8") as f: + f.write(content) + created_files.append(str(file_path)) + + return created_files + + @pytest.mark.parametrize( "subtitle_line, expected_result", [ @@ -69,11 +102,13 @@ def test_is_processed_before(tmpdir): tmpdir (pytest.fixture): A temporary directory for creating the sample SRT file. """ subtitle_file = create_sample_srt_file(tmpdir, "") - with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=0): - assert is_processed_before(subtitle_file) is True + subtitle_path = Path(subtitle_file) - with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=9999999999): - assert is_processed_before(subtitle_file) is False + with patch("os.path.getctime", return_value=0): + assert is_processed_before(subtitle_path) is True + + with patch("os.path.getctime", return_value=9999999999): + assert is_processed_before(subtitle_path) is False def test_get_encoding(tmpdir, sample_srt_content): @@ -85,7 +120,8 @@ def test_get_encoding(tmpdir, sample_srt_content): sample_srt_content (str): The sample SRT content. """ subtitle_file = create_sample_srt_file(tmpdir, sample_srt_content) - assert get_encoding(subtitle_file) == "ascii" + encoding = get_encoding(Path(subtitle_file)) + assert encoding in ("ascii", "utf-8"), f"Expected ascii or utf-8, got {encoding}" def test_remove_ad_lines(sample_srt_content): @@ -192,3 +228,140 @@ def test_main_with_modification(tmpdir, sample_srt_content): ): main() mock_process_subtitle_files.assert_called_once_with([subtitle_file]) + + +def test_process_files_with_special_chars(special_chars_temp_dir, sample_srt_content): + """ + Test processing subtitle files with special characters in their names. + + Args: + special_chars_temp_dir: Temporary directory for special character files + sample_srt_content: Sample SRT content + """ + special_files = create_special_char_files(special_chars_temp_dir, sample_srt_content) + + with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False): + modified_files = process_subtitle_files(special_files) + + assert len(modified_files) == len(special_files), "Not all files with special characters were processed" + + for file_path in special_files: + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + assert "OpenSubtitles" not in content, f"Ad not removed from {file_path}" + + +def test_get_encoding_with_special_chars(special_chars_temp_dir, sample_srt_content): + """ + Test encoding detection for files with special characters in their names. + + Args: + special_chars_temp_dir: Temporary directory for special character files + sample_srt_content: Sample SRT content + """ + file_path = special_chars_temp_dir / "test_ümlaut_ß_áccent.srt" + with open(file_path, "w", encoding="utf-8") as f: + f.write(sample_srt_content) + + encoding = get_encoding(file_path) + assert encoding is not None, "Encoding detection failed for file with special characters" + + non_existent_file = special_chars_temp_dir / "non_existent_ümlaut.srt" + try: + encoding = get_encoding(non_existent_file) + assert encoding == "utf-8", "Fallback encoding is not utf-8" + except Exception as e: + pytest.fail(f"get_encoding raised {e} with non-existent file") + + +def test_is_processed_before_with_special_chars(special_chars_temp_dir): + """ + Test is_processed_before function with special character filenames. + + Args: + special_chars_temp_dir: Temporary directory for special character files + """ + file_path = special_chars_temp_dir / "check_processed_ümlaut.srt" + with open(file_path, "w", encoding="utf-8") as f: + f.write("Test content") + + with patch("os.path.getctime", return_value=0): + assert is_processed_before(file_path) is True + + with patch("os.path.getctime", return_value=9999999999): + assert is_processed_before(file_path) is False + + non_existent_file = special_chars_temp_dir / "non_existent_ümlaut.srt" + assert is_processed_before(non_existent_file) is False + + +def test_process_subtitle_file_with_special_chars(special_chars_temp_dir, sample_srt_content): + """ + Test process_subtitle_file function with special character filenames. + + Args: + special_chars_temp_dir: Temporary directory for special character files + sample_srt_content: Sample SRT content + """ + file_path = special_chars_temp_dir / "process_this_ümlaut,file.srt" + with open(file_path, "w", encoding="utf-8") as f: + f.write(sample_srt_content) + + with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False): + assert process_subtitle_file(str(file_path)) is True + + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + assert "OpenSubtitles" not in content + + non_existent_file = str(special_chars_temp_dir / "non_existent_ümlaut,file.srt") + assert process_subtitle_file(non_existent_file) is False + + +def test_file_saving_with_special_chars(special_chars_temp_dir, sample_srt_content): + """ + Test that files with special characters can be saved correctly after modification. + + Args: + special_chars_temp_dir: Temporary directory for special character files + sample_srt_content: Sample SRT content + """ + special_files = create_special_char_files(special_chars_temp_dir, sample_srt_content) + + with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False): + modified_files = process_subtitle_files(special_files) + + for file_path in modified_files: + assert os.path.exists(file_path), f"File {file_path} does not exist after saving" + + try: + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + assert "OpenSubtitles" not in content, f"Content was not properly saved in {file_path}" + except Exception as e: + pytest.fail(f"Failed to reopen file {file_path} after saving: {e}") + + +def test_main_with_special_chars(special_chars_temp_dir, sample_srt_content): + """ + Test the main function with filenames containing special characters. + + Args: + special_chars_temp_dir: Temporary directory for special character files + sample_srt_content: Sample SRT content + """ + file_path = special_chars_temp_dir / "main_test_ümlaut,file.srt" + with open(file_path, "w", encoding="utf-8") as f: + f.write(sample_srt_content) + + stdin_content = str(file_path) + + with ( + patch("sys.stdin", StringIO(stdin_content)), + patch( + "src.subscleaner.subscleaner.process_subtitle_files", + return_value=[str(file_path)], + ) as mock_process_subtitle_files, + ): + main() + mock_process_subtitle_files.assert_called_once_with([str(file_path)]) From 5f58289e8f70059b15f05f8c8ed4dae4cc611d65 Mon Sep 17 00:00:00 2001 From: Roger Gonzalez Date: Wed, 19 Mar 2025 15:50:54 -0300 Subject: [PATCH 2/3] Update version to 1.2.0 in pyproject.toml Update the version number in pyproject.toml from 1.1.5 to 1.2.0. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d57b64e..890d84a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "subscleaner" -version = "1.1.5" +version = "1.2.0" description = "Remove advertisements from subtitle files" authors = ["Roger Gonzalez "] license = "GPL-3.0-or-later" From 9d1b6c2342018ff72d558d9e5447b47e69792ab7 Mon Sep 17 00:00:00 2001 From: Roger Gonzalez Date: Wed, 19 Mar 2025 15:52:43 -0300 Subject: [PATCH 3/3] Fixed linter issues --- src/subscleaner/subscleaner.py | 2 +- tests/test_subscleaner.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/subscleaner/subscleaner.py b/src/subscleaner/subscleaner.py index e12a29b..967ab8a 100755 --- a/src/subscleaner/subscleaner.py +++ b/src/subscleaner/subscleaner.py @@ -19,10 +19,10 @@ along with this program. If not, see . """ import os +import pathlib import re import sys import time -import pathlib import chardet import pysrt diff --git a/tests/test_subscleaner.py b/tests/test_subscleaner.py index 7147f8e..e9e9a7e 100644 --- a/tests/test_subscleaner.py +++ b/tests/test_subscleaner.py @@ -1,7 +1,6 @@ """Unit tests for the subscleaner module.""" import os -import sys from io import StringIO from pathlib import Path from unittest.mock import patch