Preserve HTML entities when cleaning HTML.

master
Pacman Ghost 4 years ago
parent 5b9ea6636d
commit 666ea2be57
  1. 23
      asl_articles/utils.py

@@ -64,7 +64,7 @@ def make_ok_response( extras=None, updated=None, warnings=None ):
# ---------------------------------------------------------------------
def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-many-branches
def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-many-locals,too-many-branches
"""Sanitize HTML using a whitelist."""
# check if we need to do anything
@@ -88,6 +88,23 @@ def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-ma
val = replace_chars( val, r"\1 - \2", [ re.compile( r"(\S+)\u2014(\S+)" ) ] )
val = replace_chars( val, "-", [ "\u2014" ] )
# FUDGE! lxml replaces HTML entities with their actual character :-/ It's possible to stop it from doing this,
# by passing in an ElementTree, which gives us an ElementTree back, and we can then control how it is serialized
# back into a string e.g.
# html = lxml.html.fromstring( val )
# html = cleaner.clean_html( html )
# val = lxml.html.tostring( html, encoding="ascii" ).decode( encoding="ascii" )
# but the original HTML entities are converted into numeric e.g. "&egrave;" => "&#232;" :-/
# We hack around this by replacing all HTML entities with a special marker string, clean the HTML,
# then replace all the marker strings with their original HTML entities :-/
markers = {}
matches = list( re.finditer( "&[a-z]+;", val ) )
matches = reversed( matches )
for n,mo in enumerate(matches):
marker = "[!${}$!]".format( n )
markers[ marker ] = mo.group()
val = val[:mo.start()] + marker + val[mo.end():]
# strip the HTML
args = {}
if allow_tags is None:
@@ -106,6 +123,10 @@ def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-ma
cleaner = lxml.html.clean.Cleaner( **args )
buf = cleaner.clean_html( val )
# restore the HTML entities
for marker,entity in markers.items():
buf = buf.replace( marker, entity )
# clean up the results
while True:
buf = buf.strip()

Loading…
Cancel
Save