parent
e6760ce807
commit
898e34535d
@ -0,0 +1,92 @@ |
|||||||
|
#!/usr/bin/env python3 |
||||||
|
""" Geenrate a report on images in the database. """ |
||||||
|
|
||||||
|
import sys |
||||||
|
import os |
||||||
|
import hashlib |
||||||
|
from collections import defaultdict |
||||||
|
|
||||||
|
import sqlalchemy |
||||||
|
from sqlalchemy import text |
||||||
|
|
||||||
|
# --------------------------------------------------------------------- |
||||||
|
|
||||||
|
def main():
    """Report on images in the database.

    Expects exactly one command-line argument: a database connection string
    that is passed to sqlalchemy.create_engine(). For each of the publisher,
    publication and article tables, prints the images attached to them
    (largest first, sizes in KB), then lists any images whose content is
    byte-for-byte identical (detected by comparing MD5 digests).

    Prints a usage message and exits if the argument count is wrong.
    """

    # parse the command line arguments
    if len(sys.argv) != 2:
        # NOTE: we show the script's file name (not its directory), and only
        # the one argument that is actually accepted.
        print( "Usage: {} <dbconn>".format( os.path.basename(__file__) ) )
        print( " dbconn: database connection string e.g. \"sqlite:///~/asl-articles.db\"" )
        sys.exit( 0 )
    dbconn = sys.argv[1]

    # initialize
    engine = sqlalchemy.create_engine( dbconn )
    image_hashes = defaultdict( list ) # MD5 hex digest -> names of the rows using that image

    def find_images( conn, table_name, col_names, get_name ):
        """Report on the images attached to rows in a single table.

        conn:       open database connection.
        table_name: name of the parent table; images are read from "<table_name>_image".
        col_names:  columns to fetch from the parent table (the first one is
                    the ID column used to join the two tables).
        get_name:   callable that takes a row (as a dict) and returns its display name.

        Also records each image's hash in image_hashes (duplicate detection).
        """

        # find rows in the specified table that have images
        sql = (
            "SELECT {cols}, image_data"
            " FROM {table}_image LEFT JOIN {table}"
            " ON {table}_image.{id_col} = {table}.{id_col}"
        ).format(
            cols = ",".join( "{}.{}".format( table_name, c ) for c in col_names ),
            table=table_name, id_col=col_names[0]
        )
        # NOTE: SQLAlchemy 1.4+ rows expose their dict-like view as "_mapping"
        # (and 2.x rows can no longer be passed directly to dict()); older
        # versions return rows that are themselves dict-like.
        rows = [
            dict( getattr( row, "_mapping", row ) )
            for row in conn.execute( text( sql ) )
        ]

        # save the image hashes, and collect the details we want to report
        results = []
        for row in rows:
            image_data = row["image_data"]
            name = get_name( row )
            image_hash = hashlib.md5( image_data ).hexdigest()
            image_hashes[ image_hash ].append( name )
            results.append( [ len(image_data), row[col_names[0]], name ] )

        # output the results (largest images first)
        results.sort( key = lambda r: r[0], reverse=True )
        print( "=== {}s ({}) ===".format( table_name, len(results) ) )
        print()
        # NOTE: the gap between the headings is as wide as the " | " separator below.
        print( "{:>6}   {:>5}".format( "size", "ID" ) )
        for size, row_id, name in results:
            # NOTE: the ID is formatted via str() because the LEFT JOIN yields
            # None for an image with no parent row, and None doesn't accept
            # a width spec; integer IDs are still right-aligned.
            print( "{:-6.1f} | {!s:>5} | {}".format( size/1024, row_id, name ) )
        print()

    def get_pub_name( row ):
        """Return a publication's name, with its edition (if any) in parentheses."""
        name = row["pub_name"]
        if row["pub_edition"]:
            name += " ({})".format( row["pub_edition"] )
        return name

    # connect to the database, and look for images in each table
    # NOTE: the "with" block makes sure the connection is closed when we're done.
    with engine.connect() as conn:
        find_images( conn, "publisher",
            [ "publ_id", "publ_name" ],
            lambda r: r["publ_name"]
        )
        find_images( conn, "publication",
            [ "pub_id", "pub_name", "pub_edition" ],
            get_pub_name
        )
        find_images( conn, "article",
            [ "article_id", "article_title" ],
            lambda r: r["article_title"]
        )

    # report on any duplicate images
    for image_hash,images in image_hashes.items():
        if len(images) < 2:
            continue
        print( "Found duplicate images ({}):".format( image_hash ) )
        for image in images:
            print( "- {}".format( image ) )
||||||
|
|
||||||
|
# --------------------------------------------------------------------- |
||||||
|
|
||||||
|
# generate the report when this file is run as a script (but not when it's imported)
if __name__ == "__main__":
    main()
Loading…
Reference in new issue