#!/usr/bin/env python3
""" Generate a report on images in the database. """

import sys
import os
import hashlib
from collections import defaultdict

# ---------------------------------------------------------------------

def _build_image_query( table_name, col_names ):
    """Build the SQL that lists every image stored for a table.

    The query selects from "<table_name>_image" and LEFT JOINs the parent table,
    so an image whose parent row is missing is still returned (with NULL columns).

    table_name: name of the parent table (e.g. "publisher").
    col_names:  parent columns to return; the first one is the ID column used for the join.
    """
    cols = ",".join( "{}.{}".format( table_name, c ) for c in col_names )
    return (
        "SELECT {cols}, image_data"
        " FROM {table}_image LEFT JOIN {table}"
        " ON {table}_image.{id_col} = {table}.{id_col}"
    ).format( cols=cols, table=table_name, id_col=col_names[0] )

def _row_to_dict( row ):
    """Convert a database result row into a plain dict keyed by column name."""
    # NOTE: Newer versions of SQLAlchemy expose the dict-like view of a row as "_mapping"
    # (and no longer accept dict(row)), while older versions only support dict(row).
    return dict( getattr( row, "_mapping", row ) )

def _get_pub_name( row ):
    """Return a publication's display name, with its edition appended (if it has one)."""
    name = row["pub_name"]
    if row["pub_edition"]:
        name += " ({})".format( row["pub_edition"] )
    return name

# ---------------------------------------------------------------------

def main():
    """Report on images in the database.

    Expects a single command-line argument (the database connection string). For each of
    the publisher, publication and article tables, prints the stored images (largest first),
    then lists any images that have identical content (detected by comparing MD5 hashes).
    If the argument count is wrong, prints the usage text and exits.
    """

    # parse the command line arguments
    if len(sys.argv) != 2:
        print( "Usage: {} <dbconn>".format( os.path.basename( __file__ ) ) )
        print( "  dbconn: database connection string e.g. \"sqlite:///~/asl-articles.db\"" )
        sys.exit( 0 )
    dbconn = sys.argv[1]

    # NOTE: SQLAlchemy is imported here, after the arguments have been checked, so that
    # the usage text can still be shown on a machine that doesn't have it installed.
    import sqlalchemy
    from sqlalchemy import text

    # initialize (maps an image hash to the names of everything that uses that image)
    image_hashes = defaultdict( list )

    def find_images( conn, table_name, col_names, get_name ):
        """Print a size-sorted list of a table's images, and record their hashes."""

        # find rows in the specified table that have images
        sql = _build_image_query( table_name, col_names )
        rows = [ _row_to_dict( row ) for row in conn.execute( text( sql ) ) ]

        # save the image hashes
        for row in rows:
            image_hash = hashlib.md5( row["image_data"] ).hexdigest()
            image_hashes[ image_hash ].append( get_name( row ) )

        # output the results (largest images first)
        report = sorted(
            ( [ len(row["image_data"]), row[col_names[0]], get_name(row) ] for row in rows ),
            key = lambda r: r[0], reverse=True
        )
        print( "=== {}s ({}) ===".format( table_name, len(report) ) )
        print()
        print( "{:>6}   {:>5}".format( "size", "ID" ) )
        for size, row_id, name in report:
            # NOTE: The ID is formatted as a string because the LEFT JOIN returns NULL for
            # an image whose parent row is missing, and None doesn't accept a width.
            print( "{:-6.1f} | {!s:>5} | {}".format( size/1024, row_id, name ) )
        print()

    # connect to the database, and look for images in each table
    # NOTE: The connection is closed when we leave the "with" block.
    engine = sqlalchemy.create_engine( dbconn )
    with engine.connect() as conn:
        find_images( conn, "publisher",
            [ "publ_id", "publ_name" ],
            lambda r: r["publ_name"]
        )
        find_images( conn, "publication",
            [ "pub_id", "pub_name", "pub_edition" ],
            _get_pub_name
        )
        find_images( conn, "article",
            [ "article_id", "article_title" ],
            lambda r: r["article_title"]
        )

    # report on any duplicate images
    for image_hash,images in image_hashes.items():
        if len(images) == 1:
            continue
        print( "Found duplicate images ({}):".format( image_hash ) )
        for image in images:
            print( "- {}".format( image ) )

# ---------------------------------------------------------------------

if __name__ == "__main__":
    main()