Manage ASL magazines and their articles.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
asl-articles/asl_articles/db_report.py

149 lines
4.4 KiB

""" Generate the database report. """
import urllib.request
import urllib.error
import hashlib
from collections import defaultdict
from flask import request, jsonify, abort
from asl_articles import app, db
# ---------------------------------------------------------------------
@app.route( "/db-report/row-counts" )
def get_db_row_counts():
"""Get the database row counts."""
results = {}
for table_name in [
"publisher", "publication", "article", "author",
"publisher_image", "publication_image", "article_image",
"scenario"
]:
query = db.engine.execute( "SELECT count(*) FROM {}".format( table_name ) )
results[ table_name ] = query.scalar()
return jsonify( results )
# ---------------------------------------------------------------------
@app.route( "/db-report/links" )
def get_db_links():
"""Get all links in the database."""
# initialize
results = {}
def find_db_links( table_name, col_names ):
links = []
query = db.engine.execute( "SELECT * FROM {}".format( table_name ) )
for row in query:
url = row[ col_names[1] ]
if not url:
continue
obj_id = row[ col_names[0] ]
name = col_names[2]( row ) if callable( col_names[2] ) else row[ col_names[2] ]
links.append( [ obj_id, name, url ] )
results[ table_name ] = links
# find all links
find_db_links( "publisher", [
"publ_id", "publ_url", "publ_name"
] )
find_db_links( "publication", [
"pub_id", "pub_url", _get_pub_name
] )
find_db_links( "article", [
"article_id", "article_url", "article_title"
] )
return jsonify( results )
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@app.route( "/db-report/check-link", methods=["POST"] )
def check_db_link():
"""Check if a link appears to be working."""
url = request.args.get( "url" )
try:
req = urllib.request.Request( url, method="HEAD" )
with urllib.request.urlopen( req ) as resp:
resp_code = resp.code
except urllib.error.URLError as ex:
resp_code = getattr( ex, "code", None )
if not resp_code:
resp_code = 400
if resp_code != 200:
abort( resp_code )
return "ok"
# ---------------------------------------------------------------------
@app.route( "/db-report/images" )
def get_db_images():
"""Analyze the images stored in the database."""
# initialize
results = {}
image_hashes = defaultdict( list )
def find_images( table_name, col_names, get_name ):
# find rows in the specified table that have images
sql = "SELECT {cols}, image_data" \
" FROM {table}_image LEFT JOIN {table}" \
" ON {table}_image.{id_col} = {table}.{id_col}".format(
cols = ",".join( "{}.{}".format( table_name, c ) for c in col_names ),
table = table_name,
id_col = col_names[0]
)
rows = [
dict( row )
for row in db.engine.execute( sql )
]
# save the image hashes
for row in rows:
image_hash = hashlib.md5( row["image_data"] ).hexdigest()
image_hashes[ image_hash ].append( [
table_name, row[col_names[0]], get_name(row)
] )
# save the image sizes
image_sizes = [
[ len(row["image_data"]), row[col_names[0]], get_name(row) ]
for row in rows
]
image_sizes.sort( key = lambda r: r[0], reverse=True )
results[ table_name ] = image_sizes
# look for images in each table
find_images( "publisher",
[ "publ_id", "publ_name" ],
lambda row: row["publ_name"]
)
find_images( "publication",
[ "pub_id", "pub_name", "pub_edition" ],
_get_pub_name
)
find_images( "article",
[ "article_id", "article_title" ],
lambda row: row["article_title"]
)
# look for duplicate images
results["duplicates"] = {}
for image_hash, images in image_hashes.items():
if len(images) == 1:
continue
results["duplicates"][ image_hash ] = images
return results
# ---------------------------------------------------------------------
def _get_pub_name( row ):
"""Get a publication's display name."""
name = row["pub_name"]
if row["pub_edition"]:
name += " ({})".format( row["pub_edition"] )
return name