Added database cache functionality
This commit is contained in:
parent 518146097e
commit 6885a0c491
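
A quick way to peek at the cache this change creates is to query the processed_files table directly. A minimal sketch, assuming the --debug database location (subscleaner.db in the current working directory):

    import sqlite3

    # Inspect the cache created by init_db(); the processed_files schema
    # (file_path, file_hash, processed_at) is the one this commit defines.
    conn = sqlite3.connect("subscleaner.db")
    rows = conn.execute(
        "SELECT file_path, file_hash, processed_at FROM processed_files ORDER BY processed_at DESC",
    )
    for file_path, file_hash, processed_at in rows:
        print(f"{processed_at}  {file_hash[:8]}  {file_path}")
    conn.close()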
@@ -0,0 +1,3 @@
+"""Subscleaner package."""
+
+__version__ = "1.3.0"
@@ -18,14 +18,18 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <https://www.gnu.org/licenses/>.
 """

+import argparse
+import hashlib
 import os
 import pathlib
 import re
+import sqlite3
 import sys
 import time

 import chardet
 import pysrt
+from appdirs import user_data_dir

 AD_PATTERNS = [
     re.compile(r"\bnordvpn\b", re.IGNORECASE),
@@ -98,6 +102,118 @@ AD_PATTERNS = [
 ]


+def get_db_path(debug=False):
+    """
+    Get the path to the SQLite database.
+
+    Args:
+        debug (bool): If True, use the current directory for the database.
+
+    Returns:
+        pathlib.Path: The path to the database file.
+    """
+    if debug:
+        return pathlib.Path.cwd() / "subscleaner.db"
+
+    app_data_dir = pathlib.Path(user_data_dir("subscleaner", "subscleaner"))
+    app_data_dir.mkdir(parents=True, exist_ok=True)
+    return app_data_dir / "subscleaner.db"
+
+
+def init_db(db_path):
+    """
+    Initialize the database if it doesn't exist.
+
+    Args:
+        db_path (pathlib.Path): The path to the database file.
+    """
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+
+    cursor.execute("""
+        CREATE TABLE IF NOT EXISTS processed_files (
+            file_path TEXT PRIMARY KEY,
+            file_hash TEXT NOT NULL,
+            processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        )
+    """)
+
+    conn.commit()
+    conn.close()
+
+
+def get_file_hash(file_path):
+    """
+    Generate an MD5 hash of the file content.
+
+    Args:
+        file_path (pathlib.Path): The path to the file.
+
+    Returns:
+        str: The MD5 hash of the file content.
+    """
+    hash_md5 = hashlib.md5()
+    try:
+        with open(file_path, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                hash_md5.update(chunk)
+        return hash_md5.hexdigest()
+    except Exception as e:
+        print(f"Error generating hash for {file_path}: {e}")
+        return None
+
+
+def is_file_processed(db_path, file_path, file_hash):
+    """
+    Check if the file has been processed before.
+
+    Args:
+        db_path (pathlib.Path): The path to the database file.
+        file_path (str): The path to the file.
+        file_hash (str): The MD5 hash of the file content.
+
+    Returns:
+        bool: True if the file has been processed before, False otherwise.
+    """
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+
+    cursor.execute(
+        "SELECT file_hash FROM processed_files WHERE file_path = ?",
+        (str(file_path),),
+    )
+    result = cursor.fetchone()
+
+    conn.close()
+
+    if result is None:
+        return False
+
+    # If the hash has changed, the file has been modified
+    return result[0] == file_hash
+
+
+def mark_file_processed(db_path, file_path, file_hash):
+    """
+    Mark the file as processed in the database.
+
+    Args:
+        db_path (pathlib.Path): The path to the database file.
+        file_path (str): The path to the file.
+        file_hash (str): The MD5 hash of the file content.
+    """
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+
+    cursor.execute(
+        "INSERT OR REPLACE INTO processed_files (file_path, file_hash) VALUES (?, ?)",
+        (str(file_path), file_hash),
+    )
+
+    conn.commit()
+    conn.close()
+
+
 def contains_ad(subtitle_line: str) -> bool:
     """
     Check if the given subtitle line contains an ad.
@@ -175,62 +291,118 @@ def remove_ad_lines(subtitle_data: pysrt.SubRipFile) -> bool:
     return modified


-def process_subtitle_file(subtitle_file_path: str) -> bool:
+def is_already_processed(subtitle_file, db_path, file_hash, force=False):
     """
+    Check if the subtitle file has already been processed.
+
+    This function checks both the database and the timestamp to determine
+    if a file has already been processed.
+
+    Args:
+        subtitle_file (pathlib.Path): The path to the subtitle file.
+        db_path (pathlib.Path): The path to the database file.
+        file_hash (str): The MD5 hash of the file content.
+        force (bool): If True, ignore previous processing status.
+
+    Returns:
+        bool: True if the file has already been processed, False otherwise.
+    """
+    if force:
+        return False
+
+    # Check if the file is in the database with the same hash
+    if is_file_processed(db_path, str(subtitle_file), file_hash):
+        print(f"Already processed {subtitle_file} (hash match)")
+        return True
+
+    # Check based on timestamp
+    if is_processed_before(subtitle_file):
+        print(f"Already processed {subtitle_file} (timestamp check)")
+        # Still mark it in the database
+        mark_file_processed(db_path, str(subtitle_file), file_hash)
+        return True
+
+    return False
+
+
+def process_subtitle_file(subtitle_file_path: str, db_path, force=False) -> bool:
+    """
     Process a subtitle file to remove ad lines.

     Args:
         subtitle_file_path (str): The path to the subtitle file.
+        db_path (pathlib.Path): The path to the database file.
+        force (bool): If True, process the file even if it has been processed before.

     Returns:
         bool: True if the subtitle file was modified, False otherwise.
     """
     try:
         subtitle_file = pathlib.Path(subtitle_file_path)

         print(f"Analyzing: {subtitle_file}")

+        # Early validation checks
         if not subtitle_file.exists():
             print(f"File not found: {subtitle_file}")
             return False

-        if is_processed_before(subtitle_file):
-            print(f"Already processed {subtitle_file}")
+        # Get file hash and check if already processed
+        file_hash = get_file_hash(subtitle_file)
+        if file_hash is None or is_already_processed(subtitle_file, db_path, file_hash, force):
             return False

+        # Process the subtitle file
+        modified = False
         encoding = get_encoding(subtitle_file)

+        # Try to open the subtitle file
+        subtitle_data = None
         try:
             subtitle_data = pysrt.open(subtitle_file, encoding=encoding)
         except UnicodeDecodeError:
             print(f"Failed to open with detected encoding {encoding}, trying utf-8")
-            subtitle_data = pysrt.open(subtitle_file, encoding="utf-8")
+            try:
+                subtitle_data = pysrt.open(subtitle_file, encoding="utf-8")
+            except Exception as e:
+                print(f"Error opening subtitle file with pysrt: {e}")
+                return False
         except Exception as e:
             print(f"Error opening subtitle file with pysrt: {e}")
             return False

-        if remove_ad_lines(subtitle_data):
+        # Remove ad lines and save if modified
+        if subtitle_data and remove_ad_lines(subtitle_data):
             print(f"Saving {subtitle_file}")
             subtitle_data.save(subtitle_file)
-            return True
-        return False
+            # Update the hash after modification
+            new_hash = get_file_hash(subtitle_file)
+            mark_file_processed(db_path, str(subtitle_file), new_hash)
+            modified = True
+        else:
+            # Mark as processed even if no changes were made
+            mark_file_processed(db_path, str(subtitle_file), file_hash)
+
+        return modified
     except Exception as e:
         print(f"Error processing {subtitle_file_path}: {e}")
         return False


-def process_subtitle_files(subtitle_files: list[str]) -> list[str]:
+def process_subtitle_files(subtitle_files: list[str], db_path, force=False) -> list[str]:
     """
     Process multiple subtitle files to remove ad lines.

     Args:
         subtitle_files (list[str]): A list of subtitle file paths.
+        db_path (pathlib.Path): The path to the database file.
+        force (bool): If True, process files even if they have been processed before.

     Returns:
         list[str]: A list of modified subtitle file paths.
     """
     modified_files = []
     for subtitle_file in subtitle_files:
-        if process_subtitle_file(subtitle_file):
+        if process_subtitle_file(subtitle_file, db_path, force):
             modified_files.append(subtitle_file)
     return modified_files
@@ -243,9 +415,17 @@ def main():
     and print the result. Keep track of the modified files and print
     a summary at the end.
     """
+    parser = argparse.ArgumentParser(description="Remove advertisements from subtitle files.")
+    parser.add_argument("--debug", action="store_true", help="Use current directory for database")
+    parser.add_argument("--force", action="store_true", help="Process files even if they have been processed before")
+    args = parser.parse_args()
+
+    db_path = get_db_path(args.debug)
+    init_db(db_path)
+
     subtitle_files = [file_path.strip() for file_path in sys.stdin]
     print("Starting script")
-    modified_files = process_subtitle_files(subtitle_files)
+    modified_files = process_subtitle_files(subtitle_files, db_path, args.force)
     if modified_files:
         print(f"Modified {len(modified_files)} files")
     print("Done")
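
For reference, a minimal sketch of the cache round-trip these helpers implement; the import path is an assumption (adjust it to wherever the module actually lives), and example.srt is a hypothetical file:

    from subscleaner.subscleaner import (
        get_db_path,
        get_file_hash,
        init_db,
        is_file_processed,
        mark_file_processed,
    )

    db_path = get_db_path(debug=True)    # debug=True keeps subscleaner.db in the current directory
    init_db(db_path)                     # creates the processed_files table if it does not exist

    srt_path = "example.srt"             # hypothetical subtitle file
    file_hash = get_file_hash(srt_path)  # MD5 of the current contents, or None on error

    if file_hash and not is_file_processed(db_path, srt_path, file_hash):
        # ... remove ad lines here, as process_subtitle_file() does ...
        mark_file_processed(db_path, srt_path, get_file_hash(srt_path))  # store the post-edit hash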