Tightened up how we clean up stripped HTML.

master
Pacman Ghost 4 years ago
parent 867ad06bfd
commit 8276bb4b29
  1. 19
      asl_articles/utils.py

@@ -83,15 +83,16 @@ def clean_html( val ):
buf = cleaner.clean_html( val )
# clean up the results
buf = re.sub( r"\s+", " ", buf )
buf = re.sub( r"^\s+", "", buf, re.MULTILINE )
buf = re.sub( r"\s+$", "", buf, re.MULTILINE )
if buf.startswith( "<p>" ) and buf.endswith( "</p>" ):
buf = buf[3:-4]
if buf.startswith( "<div>" ) and buf.endswith( "</div>" ):
buf = buf[5:-6]
if buf.startswith( "<span>" ) and buf.endswith( "</span>" ):
buf = buf[6:-7]
while True:
prev_buf = buf
buf = re.sub( r"\s+", " ", buf )
buf = re.sub( r"^\s+", "", buf, re.MULTILINE )
buf = re.sub( r"\s+$", "", buf, re.MULTILINE )
for tag in ["body","p","div","span"]:
if buf.startswith( "<{}>".format(tag) ) and buf.endswith( "</{}>".format(tag) ):
buf = buf[ len(tag)+2 : -len(tag)-3 ]
if buf == prev_buf:
break
return buf.strip()
def load_html_whitelists( app ):

Loading…
Cancel
Save