Replace smart quotes and dashes.

master
Pacman Ghost 4 years ago
parent c914976302
commit 7f4ae2676c
  1. 23
      asl_articles/tests/test_articles.py
  2. 19
      asl_articles/tests/test_publications.py
  3. 18
      asl_articles/tests/test_publishers.py
  4. 17
      asl_articles/utils.py

@ -304,30 +304,37 @@ def test_clean_html( webdriver, flask_app, dbconn ):
# initialize
init_tests( webdriver, flask_app, dbconn )
replace = [
"[\u00ab\u00bb\u201c\u201d\u201e\u201f foo\u2014bar \u2018\u2019\u201a\u201b\u2039\u203a]",
"[\"\"\"\"\"\" foo - bar '''''']"
]
# create an article with HTML content
create_article( {
"title": "title: <span style='boo!'> <b>bold</b> <xxx>xxx</xxx> <i>italic</i>",
"subtitle": "<i>italicized subtitle</i>",
"snippet": "bad stuff here: <script>HCF</script>"
"title": "title: <span style='boo!'> <b>bold</b> <xxx>xxx</xxx> <i>italic</i> {}".format( replace[0] ),
"subtitle": "<i>italicized subtitle</i> {}".format( replace[0] ),
"snippet": "bad stuff here: <script>HCF</script> {}".format( replace[0] )
}, toast_type="warning" )
# check that the HTML was cleaned
sr = check_search_result( None, _check_sr, [
"title: bold xxx italic", "italicized subtitle", "bad stuff here:", "", [], None
"title: bold xxx italic {}".format( replace[1] ),
"italicized subtitle {}".format( replace[1] ),
"bad stuff here: {}".format( replace[1] ),
"", [], None
] )
assert find_child( ".title", sr ).get_attribute( "innerHTML" ) \
== "title: <span> <b>bold</b> xxx <i>italic</i></span>"
== "title: <span> <b>bold</b> xxx <i>italic</i> {}</span>".format( replace[1] )
assert find_child( ".subtitle", sr ).get_attribute( "innerHTML" ) \
== "<i>italicized subtitle</i>"
assert check_toast( "warning", "Some values had HTML removed.", contains=True )
== "<i>italicized subtitle</i> {}".format( replace[1] )
assert check_toast( "warning", "Some values had HTML cleaned up.", contains=True )
# update the article with new HTML content
edit_article( sr, {
"title": "<div style='...'>updated</div>"
}, toast_type="warning" )
wait_for( 2, lambda: get_search_result_names() == ["updated"] )
assert check_toast( "warning", "Some values had HTML removed.", contains=True )
assert check_toast( "warning", "Some values had HTML cleaned up.", contains=True )
# ---------------------------------------------------------------------

@ -349,21 +349,28 @@ def test_clean_html( webdriver, flask_app, dbconn ):
# initialize
init_tests( webdriver, flask_app, dbconn )
replace = [
"[\u00ab\u00bb\u201c\u201d\u201e\u201f foo\u2014bar \u2018\u2019\u201a\u201b\u2039\u203a]",
"[\"\"\"\"\"\" foo - bar '''''']"
]
# create a publication with HTML content
create_publication( {
"name": "name: <span style='boo!'> <b>bold</b> <xxx>xxx</xxx> <i>italic</i>",
"name": "name: <span style='boo!'> <b>bold</b> <xxx>xxx</xxx> <i>italic</i> {}".format( replace[0] ),
"edition": "<i>2</i>",
"description": "bad stuff here: <script>HCF</script>"
"description": "bad stuff here: <script>HCF</script> {}".format( replace[0] )
}, toast_type="warning" )
# check that the HTML was cleaned
sr = check_search_result( None, _check_sr, [
"name: bold xxx italic", "2", "bad stuff here:", [], None
"name: bold xxx italic {}".format( replace[1] ),
"2",
"bad stuff here: {}".format( replace[1] ),
[], None
] )
assert find_child( ".name", sr ).get_attribute( "innerHTML" ) \
== "name: <span> <b>bold</b> xxx <i>italic</i></span> (<i>2</i>)"
assert check_toast( "warning", "Some values had HTML removed.", contains=True )
== "name: <span> <b>bold</b> xxx <i>italic</i> {}</span> (<i>2</i>)".format( replace[1] )
assert check_toast( "warning", "Some values had HTML cleaned up.", contains=True )
# update the publication with new HTML content
edit_publication( sr, {
@ -372,7 +379,7 @@ def test_clean_html( webdriver, flask_app, dbconn ):
results = get_search_results()
assert len(results) == 1
wait_for( 2, lambda: find_child( ".name", results[0] ).text == "updated (2)" )
assert check_toast( "warning", "Some values had HTML removed.", contains=True )
assert check_toast( "warning", "Some values had HTML cleaned up.", contains=True )
# ---------------------------------------------------------------------

@ -324,20 +324,26 @@ def test_clean_html( webdriver, flask_app, dbconn ):
# initialize
init_tests( webdriver, flask_app, dbconn )
replace = [
"[\u00ab\u00bb\u201c\u201d\u201e\u201f foo\u2014bar \u2018\u2019\u201a\u201b\u2039\u203a]",
"[\"\"\"\"\"\" foo - bar '''''']"
]
# create a publisher with HTML content
create_publisher( {
"name": "name: <span style='boo!'> <b>bold</b> <xxx>xxx</xxx> <i>italic</i>",
"description": "bad stuff here: <script>HCF</script>"
"name": "name: <span style='boo!'> <b>bold</b> <xxx>xxx</xxx> <i>italic</i> {}".format( replace[0] ),
"description": "bad stuff here: <script>HCF</script> {}".format( replace[0] )
}, toast_type="warning" )
# check that the HTML was cleaned
sr = check_search_result( None, _check_sr, [
"name: bold xxx italic", "bad stuff here:", None
"name: bold xxx italic {}".format( replace[1] ),
"bad stuff here: {}".format( replace[1] ),
None
] )
assert find_child( ".name", sr ).get_attribute( "innerHTML" ) \
== "name: <span> <b>bold</b> xxx <i>italic</i></span>"
assert check_toast( "warning", "Some values had HTML removed.", contains=True )
== "name: <span> <b>bold</b> xxx <i>italic</i> {}</span>".format( replace[1] )
assert check_toast( "warning", "Some values had HTML cleaned up.", contains=True )
# update the publisher with new HTML content
edit_publisher( sr, {
@ -346,7 +352,7 @@ def test_clean_html( webdriver, flask_app, dbconn ):
results = get_search_results()
assert len(results) == 1
wait_for( 2, lambda: find_child( ".name", sr ).text == "updated" )
assert check_toast( "warning", "Some values had HTML removed.", contains=True )
assert check_toast( "warning", "Some values had HTML cleaned up.", contains=True )
# ---------------------------------------------------------------------

@ -1,6 +1,7 @@
""" Helper utilities. """
import re
import typing
import logging
from flask import jsonify, abort
@ -41,7 +42,7 @@ def clean_request_args( vals, fields, warnings, logger ):
vals[f] = val2
cleaned[f] = val2
logger.debug( "Cleaned HTML: %s => %s", f, val2 )
warnings.append( "Some values had HTML removed." )
warnings.append( "Some values had HTML cleaned up." )
return cleaned
def _parse_arg_name( arg_name ):
@ -73,6 +74,20 @@ def clean_html( val, allow_tags=None, safe_attrs=None ):
if not val:
return val
# fixup smart quotes and dashes
def replace_chars( val, ch, targets ):
    """Replace every occurrence of each target in a string.

    val     -- the string to process.
    ch      -- the replacement text; for compiled-regex targets it is handed
               to Pattern.sub(), so it may contain group back-references.
    targets -- the things to replace, each one either a plain string
               (replaced literally) or a compiled regular expression.

    The targets are applied in order, each one to the result of the previous.
    Returns the updated string; raises TypeError for any other kind of target.
    """
    for t in targets:
        # NOTE: re.Pattern is the concrete class behind the deprecated typing.Pattern alias.
        if isinstance( t, re.Pattern ):
            val = t.sub( ch, val )
        elif isinstance( t, str ):
            val = val.replace( t, ch )
        else:
            # fail loudly, even when assertions are disabled (python -O)
            raise TypeError(
                "Replacement target must be a str or a compiled regex: {!r}".format( t )
            )
    return val
val = replace_chars( val, '"', [ "\u00ab", "\u00bb", "\u201c", "\u201d", "\u201e", "\u201f" ] )
val = replace_chars( val, "'", [ "\u2018", "\u2019", "\u201a", "\u201b", "\u2039", "\u203a" ] )
val = replace_chars( val, r"\1 - \2", [ re.compile( r"(\S+)\u2014(\S+)" ) ] )
val = replace_chars( val, "-", [ "\u2014" ] )
# strip the HTML
args = {}
if allow_tags is None:

Loading…
Cancel
Save