cleanmedia/cleanmedia
2022-11-08 18:51:59 +01:00

167 lines
7.2 KiB
Python
Executable File

#!/usr/bin/env python3
# TODO: Sanity check: are there files on the file system that the db does not know about?
# TODO: Sanity check: are there thumbnails in the db that do not have corresponding media file entries?
# mediaapi_media_repository: media_id | media_origin | content_type | file_size_bytes | creation_ts | upload_name | base64hash | user_id
# mediaapi_thumbnail: media_id | media_origin | content_type | file_size_bytes | creation_ts | width | height | resize_method
from datetime import datetime, timedelta
from functools import cached_property
from pathlib import Path
import argparse, logging, typing
try:
import psycopg2
import yaml
except ImportError as e:
print("Please install psycopg2 and pyyaml")
exit(1)
#------------------------------------------------------------------------------------
class File:
    """Represents a file in our database together with (hopefully) a physical file and thumbnails.

    Each instance ties one ``media_id`` row to the on-disk directory derived
    from its ``base64hash``.
    """

    def __init__(self, media_repo: 'MediaRepository', media_id: str, creation_ts: int, base64hash: str):
        """Record the database fields describing one media file.

        Args:
            media_repo: The MediaRepository in which this file is recorded;
                its ``media_path`` and ``conn`` attributes are used later.
            media_id: The media id used as key in the database tables.
            creation_ts: Creation time in seconds since the epoch.
            base64hash: Hash string from which the on-disk location is derived.
        """
        # The MediaRepository in which this file is recorded
        self.repo = media_repo
        self.media_id = media_id
        # creation_ts is seconds since the epoch
        self.create_date = datetime.fromtimestamp(creation_ts)
        self.base64hash = base64hash

    @cached_property
    def fullpath(self):
        """Return the directory holding the "file" and all its thumbnails.

        The path is ``<media_path>/<hash[0]>/<hash[1]>/<hash[2:]>``. It is
        computed on first access and cached afterwards.

        Returns:
            The directory as a path object, or None if no usable hash is known.
        """
        digest = self.base64hash
        # A hash shorter than 3 characters leaves the last path segment(s)
        # empty, which would silently resolve to a shared parent directory.
        if not digest or len(digest) < 3:
            return None
        return self.repo.media_path / digest[0:1] / digest[1:2] / digest[2:]

    def delete(self):
        """Delete the on-disk directory (if any) and the db entries of this file.

        The database rows in ``mediaapi_thumbnail`` and
        ``mediaapi_media_repository`` are removed and committed even when no
        directory could be deleted.
        """
        if self.fullpath is None:
            logging.info(f"No known path for file id '{self.media_id}', cannot delete file.")
        elif not self.fullpath.is_dir():
            logging.debug(f"Path for file id '{self.media_id}' is not a directory or does not exist, not deleting.")
        else:
            for entry in self.fullpath.glob('*'):
                # note: this does not handle directories in fullpath
                entry.unlink()
            self.fullpath.rmdir()
            logging.debug(f"Deleted directory {self.fullpath}")

        with self.repo.conn.cursor() as cur:
            cur.execute("DELETE from mediaapi_thumbnail WHERE media_id=%s;", (self.media_id,))
            num_thumbnails = cur.rowcount
            cur.execute("DELETE from mediaapi_media_repository WHERE media_id=%s;", (self.media_id,))
            num_media = cur.rowcount
        self.repo.conn.commit()
        logging.debug(f"Deleted {num_media} + {num_thumbnails} db entries for media id {self.media_id}")

    def exists(self):
        """Return True if the media file itself exists on the file system."""
        if self.fullpath is None:
            return False
        return (self.fullpath / 'file').exists()

    def has_thumbnail(self):
        """Return the number of thumbnails associated with this file.

        Despite the name, the result is the row count from
        ``mediaapi_thumbnail`` (0 when there are none), not a bool.
        """
        with self.repo.conn.cursor() as cur:
            # Pass media_id as a bound parameter instead of formatting it
            # into the SQL text, matching the DELETE statements in delete().
            cur.execute("select COUNT(media_id) from mediaapi_thumbnail WHERE media_id=%s;", (self.media_id,))
            row = cur.fetchone()
        return row[0]
#----------------------------------------------------------------------
class MediaRepository:
    """A media directory on disk together with the PostgreSQL database describing it."""

    # Both spellings are valid scheme designators for PostgreSQL connection URIs.
    _POSTGRES_SCHEMES = ("postgres://", "postgresql://")

    def __init__(self, media_path: Path, connection_string: str):
        """Validate the media directory and open the database connection.

        Args:
            media_path: Base directory of the media store (a pathlib.Path).
            connection_string: PostgreSQL connection URI.

        Raises:
            Exception: if ``media_path`` is not an existing directory.
            ValueError: if ``connection_string`` is not a postgres URI.
        """
        # media_path is a pathlib.Path
        self.media_path = media_path
        if not self.media_path.is_absolute():
            # logging.warn is a deprecated alias; logging.warning is the supported name
            logging.warning("The media path is relative, make sure you run this script in the correct directory!")
        if not self.media_path.is_dir():
            raise Exception("The configured media dir cannot be found!")
        # psql db connection
        self.conn = None
        self.db_conn_string = connection_string
        self.connect_db()

    def connect_db(self):
        """Open the database connection and store it in ``self.conn``.

        Raises:
            ValueError: if the connection string is missing or does not start
                with ``postgres://`` or ``postgresql://``.
        """
        # postgresql://user:pass@localhost/database?params
        dsn = self.db_conn_string
        if dsn is None or not dsn.startswith(self._POSTGRES_SCHEMES):
            errstr = "DB connection not a postgres one"
            logging.error(errstr)
            raise ValueError(errstr)
        self.conn = psycopg2.connect(dsn)

    def get_remote_media(self):
        """Return a File object for every media row whose ``user_id`` is empty.

        Returns:
            A list of File instances (empty if the query yields no rows).
        """
        with self.conn.cursor() as cur:
            # media_id | media_origin | content_type | file_size_bytes | creation_ts | upload_name | base64hash | user_id
            cur.execute("select media_id, creation_ts, base64hash from mediaapi_media_repository WHERE user_id = '';")
            # creation_ts is ms since the epoch, so convert to seconds
            return [
                File(self, media_id, creation_ts // 1000, base64hash)
                for media_id, creation_ts, base64hash in cur.fetchall()
            ]
#--------------------------------------------------------------
def read_config(conf_file):
    """Read in the dendrite config file and return the media path and db creds.

    The connection string is taken from ``global.database.connection_string``
    when a ``global.database`` section exists, otherwise from
    ``media_api.connection_string``.

    Args:
        conf_file: Path of the YAML configuration file.

    Returns:
        A tuple ``(base_path, connection_string)`` where ``base_path`` is a
        pathlib.Path built from ``media_api.base_path``.

    Raises:
        SystemExit: with code 1 (after logging the reason) if the file is
            missing, the ``media_api`` section is absent, or the connection
            string or base path cannot be found.
    """
    try:
        with open(conf_file) as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        logging.error(f"Config file {conf_file} not found. Use the --help option to find out more.")
        raise SystemExit(1) from None

    # An empty file parses to None; report it as a missing section
    # instead of failing on the membership test.
    if not isinstance(config, dict) or "media_api" not in config:
        logging.error("Missing section media_api")
        raise SystemExit(1)

    if "global" in config and "database" in config["global"]:
        conn_str = config["global"]["database"].get("connection_string", None)
    else:
        logging.debug("No database section, so we need the media_api specific connection string")
        conn_str = config["media_api"].get("connection_string", None)

    if conn_str is None:
        logging.error("Did not find connection string to media database.")
        raise SystemExit(1)

    # Check for a missing value *before* wrapping it: Path(None) raises
    # TypeError, so testing the Path object for None can never succeed.
    raw_base_path = config["media_api"].get("base_path", None)
    if raw_base_path is None:
        logging.error("Missing base_path in media_api")
        raise SystemExit(1)

    return (Path(raw_base_path), conn_str)
def parse_options():
    """Parse the command line and set up logging.

    Recognised options are ``-c/--config`` (config file location, default
    ``config.yaml``), ``-n/--dryrun`` and ``-d/--debug``. Logging is configured
    at DEBUG level when ``--debug`` is given and at INFO level otherwise.

    Returns:
        The argparse namespace with ``config``, ``dryrun`` and ``debug``.
    """
    cli = argparse.ArgumentParser(
        prog='cleanmedia',
        description='Deletes 30 day old remote media files from dendrite servers',
    )
    cli.add_argument(
        '-c', '--config',
        default="config.yaml",
        help="location of the dendrite.yaml config file.",
    )
    cli.add_argument(
        '-n', '--dryrun',
        action='store_true',
        help="Dry run (don't actually modify any files).",
    )
    cli.add_argument(
        '-d', '--debug',
        action='store_true',
        help="Turn debug output on.",
    )
    options = cli.parse_args()

    # INFO is the default verbosity; --debug raises it to DEBUG
    verbosity = logging.DEBUG if options.debug else logging.INFO
    logging.basicConfig(level=verbosity, format='%(levelname)s - %(message)s')
    return options
if __name__ == '__main__':
    # Script entry point: read options and config, open the repository and
    # remove every remote media file created more than 30 days ago.
    args = parse_options()
    MEDIA_PATH, CONN_STR = read_config(args.config)
    mr = MediaRepository(MEDIA_PATH, CONN_STR)

    # Files created before this moment are eligible for deletion
    cleantime = datetime.today() - timedelta(days=30)

    files = mr.get_remote_media()
    for file in files:
        if file.create_date >= cleantime:
            # Too recent, leave it alone
            continue
        if not file.exists():
            logging.info(f"file id {file.media_id} does not physically exist (path {file.fullpath})")
        if args.dryrun:
            # Dry run: report only, modify nothing
            continue
        file.delete()