Tightened up how we clean up stripped HTML.

4 years ago · 8276bb4b29
parent 867ad06bfd
commit 8276bb4b29
1 changed files with 10 additions and 9 deletions
--- a/asl_articles/utils.py
+++ b/asl_articles/utils.py
@@ -83,15 +83,16 @@ def clean_html( val ):
    buf = cleaner.clean_html( val )

    # clean up the results
-    buf = re.sub( r"\s+", " ", buf )
-    buf = re.sub( r"^\s+", "", buf, re.MULTILINE )
-    buf = re.sub( r"\s+$", "", buf, re.MULTILINE )
-    if buf.startswith( "<p>" ) and buf.endswith( "</p>" ):
-        buf = buf[3:-4]
-    if buf.startswith( "<div>" ) and buf.endswith( "</div>" ):
-        buf = buf[5:-6]
-    if buf.startswith( "<span>" ) and buf.endswith( "</span>" ):
-        buf = buf[6:-7]
+    while True:
+        prev_buf = buf
+        buf = re.sub( r"\s+", " ", buf )
+        buf = re.sub( r"^\s+", "", buf, re.MULTILINE )
+        buf = re.sub( r"\s+$", "", buf, re.MULTILINE )
+        for tag in ["body","p","div","span"]:
+            if buf.startswith( "<{}>".format(tag) ) and buf.endswith( "</{}>".format(tag) ):
+                buf = buf[ len(tag)+2 : -len(tag)-3 ]
+        if buf == prev_buf:
+            break
    return buf.strip()

 def load_html_whitelists( app ):