Manage ASL magazines and their articles.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
asl-articles/asl_articles/utils.py

233 lines
8.3 KiB

""" Helper utilities. """
import re
import configparser
import typing
import itertools
import logging
from flask import jsonify, abort
import lxml.html.clean
_html_whitelists = None
_startup_logger = logging.getLogger( "startup" )
_CONTROL_CHARS = list( ch for ch in itertools.chain( range(0,31+1), range(127,159+1) )
if ch not in (10,13)
)
# ---------------------------------------------------------------------
def get_request_args( vals, arg_names, log=None ):
"""Unload the arguments from a Flask request."""
arg_names = [ _parse_arg_name( k ) for k in arg_names ]
vals = { a[0]: vals.get( a[0] ) for a in arg_names }
vals = {
k: v.strip() if isinstance(v,str) else v
for k,v in vals.items()
}
if log:
log[0].debug( "%s", log[1] )
for a in arg_names:
log[0].debug( "- %s = %s", a[0], str(vals[a[0]]) )
# check for required arguments
required = [ a[0] for a in arg_names if a[1] ]
required = [ r for r in required if r not in vals or not vals[r] ]
if required:
abort( 400, "Missing required values: {}".format( ", ".join( required ) ) )
return vals
def clean_request_args( vals, fields, warnings, logger ):
"""Clean incoming data."""
cleaned = {}
for f in fields:
if f.endswith( "_url" ):
continue # nb: don't clean URL's
f = _parse_arg_name( f )[ 0 ]
if isinstance( vals[f], str ):
val2 = clean_html( vals[f] )
if val2 != vals[f]:
vals[f] = val2
cleaned[f] = val2
logger.debug( "Cleaned HTML: %s => %s", f, val2 )
warnings.append( "Some values had HTML cleaned up." )
return cleaned
def _parse_arg_name( arg_name ):
"""Parse a request argument name."""
if arg_name[0] == "*":
return ( arg_name[1:], True ) # required argument
return ( arg_name, False ) # optional argument
def make_ok_response( extras=None, updated=None, warnings=None ):
"""Generate a Flask 'success' response."""
resp = { "status": "OK" }
if extras:
resp.update( extras )
if updated:
resp[ "updated" ] = updated
if warnings:
resp[ "warnings" ] = list( set( warnings ) ) # nb: remove duplicate messages
return jsonify( resp )
# ---------------------------------------------------------------------
def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-many-locals,too-many-branches,too-many-statements
"""Sanitize HTML using a whitelist."""
# check if we need to do anything
if val is None:
return None
val = val.strip()
if not val:
return val
# fixup smart quotes and dashes
def replace_chars( val, ch, targets ):
for t in targets:
if isinstance( t, typing.Pattern ):
val = t.sub( ch, val )
else:
assert isinstance( t, str )
val = val.replace( t, ch )
return val
val = replace_chars( val, '"', [ "\u00ab", "\u00bb", "\u201c", "\u201d", "\u201e", "\u201f" ] )
val = replace_chars( val, "'", [ "\u2018", "\u2019", "\u201a", "\u201b", "\u2039", "\u203a" ] )
val = replace_chars( val, r"\1 - \2", [ re.compile( r"(\S+)\u2014(\S+)" ) ] )
val = replace_chars( val, "-", [ "\u2013", "\u2014" ] )
val = replace_chars( val, "...", [ "\u2026" ] )
# remove control characters
val = val.replace( "\t", " " )
val = "".join( ch for ch in val if ord(ch) not in _CONTROL_CHARS )
# FUDGE! lxml replaces HTML entities with their actual character :-/ It's possible to stop it from doing this,
# by passing in an ElementTree, which gives us an ElementTree back, and we can then control how it is serialized
# back into a string e.g.
# html = lxml.html.fromstring( val )
# html = cleaner.clean_html( html )
# val = lxml.html.tostring( html, encoding="ascii" ).decode( encoding="ascii" )
# but the original HTML entities are converted into numeric e.g. "è" => "è" :-/
# We hack around this by replacing all HTML entities with a special marker string, clean the HTML,
# then replace all the marker strings with their original HTML entities :-/
markers = {}
matches = list( re.finditer( "&[a-z][a-z0-9]+;", val ) )
matches = reversed( matches )
for n,mo in enumerate(matches):
marker = "[!${}$!]".format( n )
markers[ marker ] = mo.group()
val = val[:mo.start()] + marker + val[mo.end():]
# strip the HTML
args = {}
if allow_tags is None:
allow_tags = _html_whitelists.get( "tags" )
elif allow_tags == []:
allow_tags = [ "" ] # nb: this is how we remove everything :-/
if allow_tags:
args[ "allow_tags" ] = allow_tags
args[ "remove_unknown_tags" ] = None
if safe_attrs is None:
safe_attrs = _html_whitelists.get( "attrs" )
elif safe_attrs == []:
safe_attrs = [ "" ] # nb: this is how we remove everything :-/
if safe_attrs:
args[ "safe_attrs" ] = safe_attrs
cleaner = lxml.html.clean.Cleaner( **args )
buf = cleaner.clean_html( val )
# restore the HTML entities
for marker,entity in markers.items():
buf = buf.replace( marker, entity )
# clean up the results
while True:
buf = buf.strip()
prev_buf = buf
buf = re.sub( " +", " ", buf ) # nb: we don't use "\s+" to preserve newlines
buf = re.sub( r"^\s+", "", buf, re.MULTILINE )
buf = re.sub( r"\s+$", "", buf, re.MULTILINE )
for tag in ["body","div","span"]:
if buf.startswith( "<{}>".format(tag) ) and buf.endswith( "</{}>".format(tag) ):
buf = buf[ len(tag)+2 : -len(tag)-3 ]
if buf == prev_buf:
break
if buf.startswith( "<p>" ) and buf.endswith( "</p>" ):
buf2 = buf[ 3: -4 ]
if "<p>" not in buf2 and "</p>" not in buf2:
buf = buf2
return buf.strip()
def load_html_whitelists( app ):
"""Load the HTML whitelists."""
global _html_whitelists
assert _html_whitelists is None
def parse_whitelist( key ):
whitelist = app.config.get( key, "" )
whitelist = whitelist.replace( ",", " " )
whitelist = [ s.strip() for s in whitelist.split(" ") ]
whitelist = [ s for s in whitelist if s ]
_startup_logger.debug( "Configured %s: %s", key, whitelist )
return whitelist
_html_whitelists = {
"tags": parse_whitelist( "HTML_TAG_WHITELIST" ),
"attrs": parse_whitelist( "HTML_ATTR_WHITELIST" )
}
# ---------------------------------------------------------------------
def clean_tags( tags, warnings ):
"""Remove HTML from tags."""
cleaned_tags = [ clean_html( t, allow_tags=[], safe_attrs=[] ) for t in tags ]
if cleaned_tags != tags:
warnings.append( "Some values had HTML removed." )
return cleaned_tags
def encode_tags( tags ):
"""Encode tags prior to storing them in the database."""
if not tags:
return None
return "\n".join( tags )
def decode_tags( tags ):
"""Decode tags after loading them from the database."""
if not tags:
return None
return tags.split( "\n" )
# ---------------------------------------------------------------------
class AppConfigParser():
"""Wrapper around the standard ConfigParser."""
def __init__( self, fname ):
self._configparser = configparser.ConfigParser()
self._configparser.optionxform = str # preserve case for the keys :-/
self._configparser.read( fname )
def get_section( self, section_name ):
"""Read a section from the config."""
try:
return self._configparser.items( section_name )
except configparser.NoSectionError:
return []
# ---------------------------------------------------------------------
def apply_attrs( obj, vals ):
"""Update an object's attributes."""
for k,v in vals.items():
setattr( obj, k, v )
def to_bool( val ):
"""Interpret a value as a boolean."""
if val is None:
return None
val = str( val ).lower()
if val in ["yes","true","enabled","1"]:
return True
if val in ["no","false","disabled","0"]:
return False
return None
def squash_spaces( val ):
"""Squash multiple spaces down into a single space."""
return " ".join( val.split() )