Merge branch 'fix-special-characters-issue-on-windows' into 'master'
Add support for special characters in filenames. Closes #2. See merge request rogs/subscleaner!2.
This commit is contained in:
commit
cf619272d3
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "subscleaner"
|
name = "subscleaner"
|
||||||
version = "1.1.5"
|
version = "1.2.0"
|
||||||
description = "Remove advertisements from subtitle files"
|
description = "Remove advertisements from subtitle files"
|
||||||
authors = ["Roger Gonzalez <roger@rogs.me>"]
|
authors = ["Roger Gonzalez <roger@rogs.me>"]
|
||||||
license = "GPL-3.0-or-later"
|
license = "GPL-3.0-or-later"
|
||||||
|
@ -19,6 +19,7 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import pathlib
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
@ -110,35 +111,43 @@ def contains_ad(subtitle_line: str) -> bool:
|
|||||||
return any(pattern.search(subtitle_line) for pattern in AD_PATTERNS)
|
return any(pattern.search(subtitle_line) for pattern in AD_PATTERNS)
|
||||||
|
|
||||||
|
|
||||||
def is_processed_before(subtitle_file: pathlib.Path) -> bool:
    """
    Check if the subtitle file has already been processed.

    The file is treated as processed when the timestamp reported by
    ``os.path.getctime`` is earlier than 2021-05-13 00:00:00 (interpreted
    by ``time.mktime`` as local time).

    Args:
        subtitle_file (pathlib.Path): The path to the subtitle file.

    Returns:
        bool: True if the subtitle file has already been processed, False otherwise.
            False is also returned when the file timestamp cannot be read.
    """
    # The cutoff is a constant, so it is computed outside the guarded section.
    processed_timestamp = time.mktime(
        time.strptime("2021-05-13 00:00:00", "%Y-%m-%d %H:%M:%S"),
    )

    # Only the filesystem lookup is guarded, and only for the errors it is
    # expected to raise: OSError for a missing or inaccessible file, and
    # ValueError for a path the OS layer rejects (e.g. an embedded null byte).
    try:
        file_creation_time = os.path.getctime(subtitle_file)
    except (OSError, ValueError) as e:
        print(f"Error checking if file was processed before: {e}")
        return False

    return file_creation_time < processed_timestamp
|
||||||
|
|
||||||
|
|
||||||
def get_encoding(subtitle_file: pathlib.Path) -> str:
    """
    Detect the encoding of the subtitle file.

    Args:
        subtitle_file (pathlib.Path): The path to the subtitle file.

    Returns:
        str: The detected encoding of the subtitle file, or "utf-8" when the
            file cannot be read or the detector reports no encoding.
    """
    # Guard only the file read, and only for I/O-related errors, so that an
    # unexpected failure in the detection step is not silently hidden.
    try:
        with open(subtitle_file, "rb") as file:
            raw_data = file.read()
    except (OSError, ValueError) as e:
        print(f"Error detecting encoding: {e}")
        return "utf-8"

    # The detector's "encoding" entry may be empty/None; fall back to utf-8.
    return chardet.detect(raw_data)["encoding"] or "utf-8"
|
||||||
|
|
||||||
|
|
||||||
def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
|
def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
|
||||||
@ -152,33 +161,52 @@ def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
|
|||||||
bool: True if the subtitle data was modified, False otherwise.
|
bool: True if the subtitle data was modified, False otherwise.
|
||||||
"""
|
"""
|
||||||
modified = False
|
modified = False
|
||||||
|
indices_to_remove = []
|
||||||
|
|
||||||
for index, subtitle in enumerate(subtitle_data):
|
for index, subtitle in enumerate(subtitle_data):
|
||||||
if contains_ad(subtitle.text):
|
if contains_ad(subtitle.text):
|
||||||
print(f"Removing: {subtitle}\n")
|
print(f"Removing: {subtitle}\n")
|
||||||
del subtitle_data[index]
|
indices_to_remove.append(index)
|
||||||
modified = True
|
modified = True
|
||||||
|
|
||||||
|
for index in sorted(indices_to_remove, reverse=True):
|
||||||
|
del subtitle_data[index]
|
||||||
|
|
||||||
return modified
|
return modified
|
||||||
|
|
||||||
|
|
||||||
def process_subtitle_file(subtitle_file: str) -> bool:
|
def process_subtitle_file(subtitle_file_path: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Process a subtitle file to remove ad lines.
|
Process a subtitle file to remove ad lines.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
subtitle_file (str): The path to the subtitle file.
|
subtitle_file_path (str): The path to the subtitle file.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
bool: True if the subtitle file was modified, False otherwise.
|
bool: True if the subtitle file was modified, False otherwise.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
subtitle_file = pathlib.Path(subtitle_file_path)
|
||||||
|
|
||||||
|
print(f"Analyzing: {subtitle_file}")
|
||||||
|
|
||||||
|
if not subtitle_file.exists():
|
||||||
|
print(f"File not found: {subtitle_file}")
|
||||||
|
return False
|
||||||
|
|
||||||
if is_processed_before(subtitle_file):
|
if is_processed_before(subtitle_file):
|
||||||
print(f"Already processed {subtitle_file}")
|
print(f"Already processed {subtitle_file}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
print(f"Analyzing: {subtitle_file}")
|
|
||||||
|
|
||||||
encoding = get_encoding(subtitle_file)
|
encoding = get_encoding(subtitle_file)
|
||||||
subtitle_data = pysrt.open(subtitle_file, encoding=encoding)
|
try:
|
||||||
|
subtitle_data = pysrt.open(subtitle_file, encoding=encoding)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
print(f"Failed to open with detected encoding {encoding}, trying utf-8")
|
||||||
|
subtitle_data = pysrt.open(subtitle_file, encoding="utf-8")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error opening subtitle file with pysrt: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
if remove_ad_lines(subtitle_data):
|
if remove_ad_lines(subtitle_data):
|
||||||
print(f"Saving {subtitle_file}")
|
print(f"Saving {subtitle_file}")
|
||||||
@ -186,7 +214,7 @@ def process_subtitle_file(subtitle_file: str) -> bool:
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing {subtitle_file}: {e}")
|
print(f"Error processing {subtitle_file_path}: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
"""Unit tests for the subscleaner module."""
|
"""Unit tests for the subscleaner module."""
|
||||||
|
|
||||||
|
import os
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
|
from pathlib import Path
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pysrt
|
import pysrt
|
||||||
@ -34,6 +36,14 @@ Another sample subtitle.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def special_chars_temp_dir(tmpdir):
    """Return a "special_chars" directory under tmpdir for special character filenames."""
    target_dir = Path(tmpdir).joinpath("special_chars")
    target_dir.mkdir(exist_ok=True)
    return target_dir
|
||||||
|
|
||||||
|
|
||||||
def create_sample_srt_file(tmpdir, content):
|
def create_sample_srt_file(tmpdir, content):
|
||||||
"""Create a sample SRT file with the given content."""
|
"""Create a sample SRT file with the given content."""
|
||||||
file_path = tmpdir.join("sample.srt")
|
file_path = tmpdir.join("sample.srt")
|
||||||
@ -41,6 +51,28 @@ def create_sample_srt_file(tmpdir, content):
|
|||||||
return str(file_path)
|
return str(file_path)
|
||||||
|
|
||||||
|
|
||||||
|
def create_special_char_files(dir_path, content):
    """
    Create sample SRT files with special characters in their names.

    Every file receives the same content, written as UTF-8. The created
    paths are returned as strings, in the order the files were written.
    """
    special_filenames = (
        "file,with,commas.srt",
        "file with spaces.srt",
        "file_with_ümlaut.srt",
        "file_with_ß_char.srt",
        "file_with_áccent.srt",
        "file_with_$ymbol.srt",
        "file_with_パーセント.srt",  # Japanese characters
    )

    targets = [dir_path / name for name in special_filenames]
    for target in targets:
        with open(target, mode="w", encoding="utf-8") as handle:
            handle.write(content)

    return [str(target) for target in targets]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"subtitle_line, expected_result",
|
"subtitle_line, expected_result",
|
||||||
[
|
[
|
||||||
@ -69,11 +101,13 @@ def test_is_processed_before(tmpdir):
|
|||||||
tmpdir (pytest.fixture): A temporary directory for creating the sample SRT file.
|
tmpdir (pytest.fixture): A temporary directory for creating the sample SRT file.
|
||||||
"""
|
"""
|
||||||
subtitle_file = create_sample_srt_file(tmpdir, "")
|
subtitle_file = create_sample_srt_file(tmpdir, "")
|
||||||
with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=0):
|
subtitle_path = Path(subtitle_file)
|
||||||
assert is_processed_before(subtitle_file) is True
|
|
||||||
|
|
||||||
with patch("src.subscleaner.subscleaner.os.path.getctime", return_value=9999999999):
|
with patch("os.path.getctime", return_value=0):
|
||||||
assert is_processed_before(subtitle_file) is False
|
assert is_processed_before(subtitle_path) is True
|
||||||
|
|
||||||
|
with patch("os.path.getctime", return_value=9999999999):
|
||||||
|
assert is_processed_before(subtitle_path) is False
|
||||||
|
|
||||||
|
|
||||||
def test_get_encoding(tmpdir, sample_srt_content):
|
def test_get_encoding(tmpdir, sample_srt_content):
|
||||||
@ -85,7 +119,8 @@ def test_get_encoding(tmpdir, sample_srt_content):
|
|||||||
sample_srt_content (str): The sample SRT content.
|
sample_srt_content (str): The sample SRT content.
|
||||||
"""
|
"""
|
||||||
subtitle_file = create_sample_srt_file(tmpdir, sample_srt_content)
|
subtitle_file = create_sample_srt_file(tmpdir, sample_srt_content)
|
||||||
assert get_encoding(subtitle_file) == "ascii"
|
encoding = get_encoding(Path(subtitle_file))
|
||||||
|
assert encoding in ("ascii", "utf-8"), f"Expected ascii or utf-8, got {encoding}"
|
||||||
|
|
||||||
|
|
||||||
def test_remove_ad_lines(sample_srt_content):
|
def test_remove_ad_lines(sample_srt_content):
|
||||||
@ -192,3 +227,140 @@ def test_main_with_modification(tmpdir, sample_srt_content):
|
|||||||
):
|
):
|
||||||
main()
|
main()
|
||||||
mock_process_subtitle_files.assert_called_once_with([subtitle_file])
|
mock_process_subtitle_files.assert_called_once_with([subtitle_file])
|
||||||
|
|
||||||
|
|
||||||
|
def test_process_files_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Check that every file with a special character name is processed and cleaned.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    source_paths = create_special_char_files(special_chars_temp_dir, sample_srt_content)

    # Bypass the "already processed" check so every file is actually analyzed.
    with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
        processed = process_subtitle_files(source_paths)

    assert len(processed) == len(source_paths), "Not all files with special characters were processed"

    for file_path in source_paths:
        remaining_text = Path(file_path).read_text(encoding="utf-8")
        assert "OpenSubtitles" not in remaining_text, f"Ad not removed from {file_path}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_encoding_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Test encoding detection for files with special characters in their names.

    Covers both an existing file and a missing file, for which get_encoding
    is expected to fall back to "utf-8".

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    file_path = special_chars_temp_dir / "test_ümlaut_ß_áccent.srt"
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(sample_srt_content)

    encoding = get_encoding(file_path)
    assert encoding is not None, "Encoding detection failed for file with special characters"

    # No try/except around this check: a broad handler would also catch the
    # AssertionError below and misreport it as an exception from get_encoding.
    # An unexpected exception already fails the test with a full traceback.
    non_existent_file = special_chars_temp_dir / "non_existent_ümlaut.srt"
    assert get_encoding(non_existent_file) == "utf-8", "Fallback encoding is not utf-8"
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_processed_before_with_special_chars(special_chars_temp_dir):
    """
    Exercise is_processed_before on paths containing special characters.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
    """
    existing = special_chars_temp_dir / "check_processed_ümlaut.srt"
    existing.write_text("Test content", encoding="utf-8")

    # A very old timestamp counts as processed; a far-future one does not.
    for fake_ctime, expected in ((0, True), (9999999999, False)):
        with patch("os.path.getctime", return_value=fake_ctime):
            assert is_processed_before(existing) is expected

    missing = special_chars_temp_dir / "non_existent_ümlaut.srt"
    assert is_processed_before(missing) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_process_subtitle_file_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Exercise process_subtitle_file on a path containing special characters.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    target = special_chars_temp_dir / "process_this_ümlaut,file.srt"
    target.write_text(sample_srt_content, encoding="utf-8")

    # Bypass the "already processed" check so the file is actually analyzed.
    with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
        was_modified = process_subtitle_file(str(target))
    assert was_modified is True

    assert "OpenSubtitles" not in target.read_text(encoding="utf-8")

    missing = str(special_chars_temp_dir / "non_existent_ümlaut,file.srt")
    assert process_subtitle_file(missing) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_file_saving_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Test that files with special characters can be saved correctly after modification.

    Each path reported as modified must still exist, be readable as UTF-8,
    and no longer contain the ad text.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    special_files = create_special_char_files(special_chars_temp_dir, sample_srt_content)

    with patch("src.subscleaner.subscleaner.is_processed_before", return_value=False):
        modified_files = process_subtitle_files(special_files)

    # Without this check the loop below would pass vacuously if nothing was saved.
    assert modified_files, "No files with special characters were modified"

    for file_path in modified_files:
        assert os.path.exists(file_path), f"File {file_path} does not exist after saving"

        # No try/except around the read: a broad handler would also catch the
        # AssertionError below and misreport it as a failure to reopen the file.
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
        assert "OpenSubtitles" not in content, f"Content was not properly saved in {file_path}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_main_with_special_chars(special_chars_temp_dir, sample_srt_content):
    """
    Check that main passes a special character filename read from stdin to processing.

    Args:
        special_chars_temp_dir: Temporary directory for special character files
        sample_srt_content: Sample SRT content
    """
    target = special_chars_temp_dir / "main_test_ümlaut,file.srt"
    target.write_text(sample_srt_content, encoding="utf-8")
    expected_path = str(target)

    with (
        patch("sys.stdin", StringIO(expected_path)),
        patch(
            "src.subscleaner.subscleaner.process_subtitle_files",
            return_value=[expected_path],
        ) as mock_process_subtitle_files,
    ):
        main()
        mock_process_subtitle_files.assert_called_once_with([expected_path])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user