Remove control characters from content.

master
Pacman Ghost 4 years ago
parent 9d02ee2dcc
commit 8d3b886c9d
  1. 11
      asl_articles/utils.py

@ -2,6 +2,7 @@
import re
import typing
import itertools
import logging
from flask import jsonify, abort
@ -10,6 +11,10 @@ import lxml.html.clean
_html_whitelists = None
_startup_logger = logging.getLogger( "startup" )
# Code points of the characters that get stripped from content: the C0 controls (0-31),
# DEL (127) and the C1 controls (128-159), except for LF (10) and CR (13), which are kept.
# NOTE: This is a frozenset (not a list) because it is used for a membership test
# on every character of the content being cleaned, so lookups need to be O(1).
_CONTROL_CHARS = frozenset(
    ch for ch in itertools.chain( range(0,31+1), range(127,159+1) )
    if ch not in (10,13)
)
# ---------------------------------------------------------------------
def get_request_args( vals, arg_names, log=None ):
@ -64,7 +69,7 @@ def make_ok_response( extras=None, updated=None, warnings=None ):
# ---------------------------------------------------------------------
def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-many-locals,too-many-branches
def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-many-locals,too-many-branches,too-many-statements
"""Sanitize HTML using a whitelist."""
# check if we need to do anything
@ -89,6 +94,10 @@ def clean_html( val, allow_tags=None, safe_attrs=None ): #pylint: disable=too-ma
val = replace_chars( val, "-", [ "\u2013", "\u2014" ] )
val = replace_chars( val, "...", [ "\u2026" ] )
# remove control characters
val = val.replace( "\t", " " )
val = "".join( ch for ch in val if ord(ch) not in _CONTROL_CHARS )
# FUDGE! lxml replaces HTML entities with their actual character :-/ It's possible to stop it from doing this,
# by passing in an ElementTree, which gives us an ElementTree back, and we can then control how it is serialized
# back into a string e.g.

Loading…
Cancel
Save