Merge branch 'fix-special-characters-issue-on-windows' into 'master'
Add support for special characters in filenames. Closes #2. See merge request rogs/subscleaner!2.
This commit is contained in:
commit
cf619272d3
@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "subscleaner"
|
||||
version = "1.1.5"
|
||||
version = "1.2.0"
|
||||
description = "Remove advertisements from subtitle files"
|
||||
authors = ["Roger Gonzalez <roger@rogs.me>"]
|
||||
license = "GPL-3.0-or-later"
|
||||
|
@ -19,6 +19,7 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
@ -110,35 +111,43 @@ def contains_ad(subtitle_line: str) -> bool:
|
||||
return any(pattern.search(subtitle_line) for pattern in AD_PATTERNS)
|
||||
|
||||
|
||||
def is_processed_before(subtitle_file: pathlib.Path) -> bool:
    """
    Check if the subtitle file has already been processed.

    A file counts as processed when the timestamp reported by
    ``os.path.getctime`` is earlier than 2021-05-13 00:00:00 (interpreted
    by ``time.mktime`` as local time).

    Args:
        subtitle_file (pathlib.Path): The path to the subtitle file.

    Returns:
        bool: True if the subtitle file has already been processed, False otherwise.
            False is also returned (after printing the error) when the
            timestamp cannot be read, e.g. because the file does not exist.
    """
    # The cutoff does not depend on the file, so it is computed outside the
    # guarded region; only the filesystem call below is expected to fail.
    processed_timestamp = time.mktime(
        time.strptime("2021-05-13 00:00:00", "%Y-%m-%d %H:%M:%S"),
    )

    try:
        file_creation_time = os.path.getctime(subtitle_file)
    except (OSError, ValueError) as e:
        # OSError: missing or inaccessible file; ValueError: malformed path
        # such as one containing an embedded NUL character.
        print(f"Error checking if file was processed before: {e}")
        return False

    return file_creation_time < processed_timestamp
|
||||
|
||||
|
||||
def get_encoding(subtitle_file: pathlib.Path) -> str:
    """
    Detect the encoding of the subtitle file.

    Args:
        subtitle_file (pathlib.Path): The path to the subtitle file.

    Returns:
        str: The detected encoding of the subtitle file. Falls back to
            "utf-8" when the file cannot be read or when the detector
            reports no encoding.
    """
    # Only the file read is guarded, so a failure inside the detector is not
    # silently reported as a "utf-8" result.
    try:
        with open(subtitle_file, "rb") as file:
            raw_data = file.read()
    except (OSError, ValueError) as e:
        # OSError: missing or inaccessible file; ValueError: malformed path
        # such as one containing an embedded NUL character.
        print(f"Error detecting encoding: {e}")
        return "utf-8"

    # A falsy detection result (e.g. None) means no guess was made.
    return chardet.detect(raw_data)["encoding"] or "utf-8"
|
||||
|
||||
|
||||
def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
|
||||
@ -152,33 +161,52 @@ def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
|
||||
bool: True if the subtitle data was modified, False otherwise.
|
||||
"""
|
||||
modified = False
|
||||
indices_to_remove = []
|
||||
|
||||
for index, subtitle in enumerate(subtitle_data):
|
||||
if contains_ad(subtitle.text):
|
||||
print(f"Removing: {subtitle}\n")
|
||||
del subtitle_data[index]
|
||||
indices_to_remove.append(index)
|
||||
modified = True
|
||||
|
||||
for index in sorted(indices_to_remove, reverse=True):
|
||||
del subtitle_data[index]
|
||||
|
||||
return modified
|
||||
|
||||
|
||||
def process_subtitle_file(subtitle_file: str) -> bool:
|
||||
def process_subtitle_file(subtitle_file_path: str) -> bool:
|
||||
"""
|
||||
Process a subtitle file to remove ad lines.
|
||||
|
||||
Args:
|
||||
subtitle_file (str): The path to the subtitle file.
|
||||
subtitle_file_path (str): The path to the subtitle file.
|
||||
|
||||
Returns:
|
||||
bool: True if the subtitle file was modified, False otherwise.
|
||||
"""
|
||||
try:
|
||||
subtitle_file = pathlib.Path(subtitle_file_path)
|
||||
|
||||
print(f"Analyzing: {subtitle_file}")
|
||||
|
||||
if not subtitle_file.exists():
|
||||
print(f"File not found: {subtitle_file}")
|
||||
return False
|
||||
|
||||
if is_processed_before(subtitle_file):
|
||||
print(f"Already processed {subtitle_file}")
|
||||
return False
|
||||
|
||||
print(f"Analyzing: {subtitle_file}")
|
||||
|
||||
encoding = get_encoding(subtitle_file)
|
||||
subtitle_data = pysrt.open(subtitle_file, encoding=encoding)
|
||||
try:
|
||||
subtitle_data = pysrt.open(subtitle_file, encoding=encoding)
|
||||
except UnicodeDecodeError:
|
||||
print(f"Failed to open with detected encoding {encoding}, trying utf-8")
|
||||
subtitle_data = pysrt.open(subtitle_file, encoding="utf-8")
|
||||
except Exception as e:
|
||||
print(f"Error opening subtitle file with pysrt: {e}")
|
||||
return False
|
||||
|
||||
if remove_ad_lines(subtitle_data):
|
||||
print(f"Saving {subtitle_file}")
|
||||
@ -186,7 +214,7 @@ def process_subtitle_file(subtitle_file: str) -> bool:
|
||||
return True
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"Error processing {subtitle_file}: {e}")
|
||||
print(f"Error processing {subtitle_file_path}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
|
@ -1,6 +1,8 @@
|
||||
"""Unit tests for the subscleaner module."""
|
||||
|
||||
import os
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pysrt
|
||||
@ -34,6 +36,14 @@ Another sample subtitle.
|
||||
"""
|
||||
|
||||
|
||||
@pytest.fixture
def special_chars_temp_dir(tmpdir):
    """Return a "special_chars" directory under tmpdir for special character filenames."""
    target_dir = Path(tmpdir).joinpath("special_chars")
    # exist_ok keeps the fixture harmless if the directory is already there.
    target_dir.mkdir(exist_ok=True)
    return target_dir
|
||||
|
||||
|
||||
def create_sample_srt_file(tmpdir, content):
|
||||
"""Create a sample SRT file with the given content."""
|
||||
file_path = tmpdir.join("sample.srt")
|
||||
@ -41,6 +51,28 @@ def create_sample_srt_file(tmpdir, content):
|
||||
return str(file_path)
|
||||
|
||||
|
||||
def create_special_char_files(dir_path, content, filenames=None):
    """Create sample SRT files with special characters in their names.

    Args:
        dir_path: Directory in which the files are created. A ``str`` or any
            path-like object is accepted; the directory must already exist.
        content (str): Text written to every file, encoded as UTF-8.
        filenames: Optional iterable of file names to create. When omitted,
            a built-in set of names containing commas, spaces, accented
            letters, a dollar sign and Japanese characters is used.

    Returns:
        list: The paths of the created files as strings, in creation order.
    """
    if filenames is None:
        filenames = (
            "file,with,commas.srt",
            "file with spaces.srt",
            "file_with_ümlaut.srt",
            "file_with_ß_char.srt",
            "file_with_áccent.srt",
            "file_with_$ymbol.srt",
            "file_with_パーセント.srt",  # Japanese characters
        )

    # Coerce once so callers may pass a plain string as well as a Path.
    target_dir = Path(dir_path)

    created_files = []
    for filename in filenames:
        file_path = target_dir / filename
        file_path.write_text(content, encoding="utf-8")
        created_files.append(str(file_path))

    return created_files
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"subtitle_line, expected_result",
|
||||
[
|
||||
@ -69,11 +101,13 @@ def test_is_processed_before(tmpdir):
|
||||
tmpdir (pytest.fixture): A temporary directory for creating the sample SRT file.
|
||||
"""
|
||||
subtitle_file = create_sample_srt_file(tmpdir, "")
|
||||
with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=0):
|
||||
assert is_processed_before(subtitle_file) is True
|
||||
subtitle_path = Path(subtitle_file)
|
||||
|
||||
with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=9999999999):
|
||||
assert is_processed_before(subtitle_file) is False
|
||||
with patch("os.path.getctime", return_value=0):
|
||||
assert is_processed_before(subtitle_path) is True
|
||||
|
||||
with patch("os.path.getctime", return_value=9999999999):
|
||||
assert is_processed_before(subtitle_path) is False
|
||||
|
||||
|
||||
def test_get_encoding(tmpdir, sample_srt_content):
|
||||
@ -85,7 +119,8 @@ def test_get_encoding(tmpdir, sample_srt_content):
|
||||
sample_srt_content (str): The sample SRT content.
|
||||
"""
|
||||
subtitle_file = create_sample_srt_file(tmpdir, sample_srt_content)
|
||||
assert get_encoding(subtitle_file) == "ascii"
|
||||
encoding = get_encoding(Path(subtitle_file))
|
||||
assert encoding in ("ascii", "utf-8"), f"Expected ascii or utf-8, got {encoding}"
|
||||
|
||||
|
||||
def test_remove_ad_lines(sample_srt_content):
|
||||
@ -192,3 +227,140 @@ def test_main_with_modification(tmpdir, sample_srt_content):
|
||||
):
|
||||
main()
|
||||
mock_process_subtitle_files.assert_called_once_with([subtitle_file])
|
||||
|
||||
|
||||
def test_process_files_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Check that batch processing handles files whose names contain special characters.

    Every created file must be reported as modified and must no longer
    contain the "OpenSubtitles" text afterwards.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    created_paths = create_special_char_files(special_chars_temp_dir, sample_srt_content)

    # Force the "already processed" check to say no so every file is analyzed.
    skip_check = patch("src.subscleaner.subscleaner.is_processed_before", return_value=False)
    with skip_check:
        cleaned_paths = process_subtitle_files(created_paths)

    assert len(cleaned_paths) == len(created_paths), "Not all files with special characters were processed"

    for file_path in created_paths:
        remaining_text = Path(file_path).read_text(encoding="utf-8")
        assert "OpenSubtitles" not in remaining_text, f"Ad not removed from {file_path}"
|
||||
|
||||
|
||||
def test_get_encoding_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Test encoding detection for files with special characters in their names.

    Covers an existing file (some encoding must be returned) and a
    non-existent file (the call must not raise and must return "utf-8").

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    file_path = special_chars_temp_dir / "test_ümlaut_ß_áccent.srt"
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(sample_srt_content)

    encoding = get_encoding(file_path)
    assert encoding is not None, "Encoding detection failed for file with special characters"

    non_existent_file = special_chars_temp_dir / "non_existent_ümlaut.srt"
    # Only the call itself is guarded. AssertionError is an Exception, so an
    # assert inside this try would have a wrong fallback value misreported
    # as "get_encoding raised ...".
    try:
        encoding = get_encoding(non_existent_file)
    except Exception as e:
        pytest.fail(f"get_encoding raised {e} with non-existent file")

    assert encoding == "utf-8", "Fallback encoding is not utf-8"
|
||||
|
||||
|
||||
def test_is_processed_before_with_special_chars(special_chars_temp_dir):
    """
    Exercise is_processed_before on paths containing special characters.

    The creation time is faked to fall on either side of the cutoff, and a
    missing file is expected to be reported as not processed.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
    """
    existing_file = special_chars_temp_dir / "check_processed_ümlaut.srt"
    existing_file.write_text("Test content", encoding="utf-8")

    # (faked creation time, expected verdict): very old, then far in the future.
    for fake_ctime, expected in ((0, True), (9999999999, False)):
        with patch("os.path.getctime", return_value=fake_ctime):
            assert is_processed_before(existing_file) is expected

    missing_file = special_chars_temp_dir / "non_existent_ümlaut.srt"
    assert is_processed_before(missing_file) is False
|
||||
|
||||
|
||||
def test_process_subtitle_file_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Exercise process_subtitle_file on a path containing an umlaut and a comma.

    The existing file must be reported as modified and lose the
    "OpenSubtitles" text; a missing file must be reported as not modified.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    target = special_chars_temp_dir / "process_this_ümlaut,file.srt"
    target.write_text(sample_srt_content, encoding="utf-8")

    # Bypass the "already processed" check so the file is actually analyzed.
    with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
        was_modified = process_subtitle_file(str(target))
        assert was_modified is True

    saved_text = target.read_text(encoding="utf-8")
    assert "OpenSubtitles" not in saved_text

    missing = special_chars_temp_dir / "non_existent_ümlaut,file.srt"
    assert process_subtitle_file(str(missing)) is False
|
||||
|
||||
|
||||
def test_file_saving_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Test that files with special characters can be saved correctly after modification.

    Each file reported as modified must still exist, be readable as UTF-8,
    and no longer contain the "OpenSubtitles" text.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    special_files = create_special_char_files(special_chars_temp_dir, sample_srt_content)

    with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
        modified_files = process_subtitle_files(special_files)

    # Without this guard the loop below would pass vacuously when nothing
    # was reported as modified.
    assert modified_files, "No files with special characters were reported as modified"

    for file_path in modified_files:
        assert os.path.exists(file_path), f"File {file_path} does not exist after saving"

        # Only reopening and reading is guarded. The content assertion stays
        # outside so its failure is not misreported as a failure to reopen.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            pytest.fail(f"Failed to reopen file {file_path} after saving: {e}")

        assert "OpenSubtitles" not in content, f"Content was not properly saved in {file_path}"
|
||||
|
||||
|
||||
def test_main_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Check that main passes a special-character filename read from stdin on unchanged.

    stdin is replaced with the file path and process_subtitle_files is
    mocked, so only the hand-off from main is verified.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    srt_file = special_chars_temp_dir / "main_test_ümlaut,file.srt"
    srt_file.write_text(sample_srt_content, encoding="utf-8")

    path_text = str(srt_file)

    fake_stdin = patch("sys.stdin", StringIO(path_text))
    fake_processing = patch(
        "src.subscleaner.subscleaner.process_subtitle_files",
        return_value=[path_text],
    )
    with fake_stdin, fake_processing as mock_process_subtitle_files:
        main()
        mock_process_subtitle_files.assert_called_once_with([path_text])
|
||||
|
Loading…
x
Reference in New Issue
Block a user