diff --git a/asl_articles/utils.py b/asl_articles/utils.py index 4961afa..f65bdda 100644 --- a/asl_articles/utils.py +++ b/asl_articles/utils.py @@ -64,7 +64,7 @@ def make_ok_response( extras=None, updated=None, warnings=None ): # --------------------------------------------------------------------- -def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-many-branches +def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-many-locals,too-many-branches """Sanitize HTML using a whitelist.""" # check if we need to do anything @@ -88,6 +88,23 @@ def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-ma val = replace_chars( val, r"\1 - \2", [ re.compile( r"(\S+)\u2014(\S+)" ) ] ) val = replace_chars( val, "-", [ "\u2014" ] ) + # FUDGE! lxml replaces HTML entities with their actual character :-/ It's possible to stop it from doing this, + # by passing in an ElementTree, which gives us an ElementTree back, and we can then control how it is serialized + # back into a string e.g. + # html = lxml.html.fromstring( val ) + # html = cleaner.clean_html( html ) + # val = lxml.html.tostring( html, encoding="ascii" ).decode( encoding="ascii" ) + # but the original HTML entities are converted into numeric e.g. 
"&egrave;" => "&#232;" :-/ + # We hack around this by replacing all HTML entities with a special marker string, clean the HTML, + # then replace all the marker strings with their original HTML entities :-/ + markers = {} + matches = list( re.finditer( "&[a-z]+;", val ) ) + matches = reversed( matches ) + for n,mo in enumerate(matches): + marker = "[!${}$!]".format( n ) + markers[ marker ] = mo.group() + val = val[:mo.start()] + marker + val[mo.end():] + # strip the HTML args = {} if allow_tags is None: @@ -106,6 +123,10 @@ def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-ma cleaner = lxml.html.clean.Cleaner( **args ) buf = cleaner.clean_html( val ) + # restore the HTML entities + for marker,entity in markers.items(): + buf = buf.replace( marker, entity ) + # clean up the results while True: buf = buf.strip()