From 922479ff57d3fa5184133716bba8f3bde2f09fc9 Mon Sep 17 00:00:00 2001 From: Taka Date: Fri, 20 Mar 2020 10:25:37 +0000 Subject: [PATCH] Added integration with an eASLRB. --- asl_articles/config/site.cfg.example | 3 + asl_articles/search.py | 124 +++++++++++++++++++++++++-- asl_articles/tests/test_search.py | 54 +++++++++++- docker-compose.yml | 3 + run-containers.sh | 7 +- web/src/SearchResults.css | 1 + 6 files changed, 185 insertions(+), 7 deletions(-) diff --git a/asl_articles/config/site.cfg.example b/asl_articles/config/site.cfg.example index d69a1df..4f89fbd 100644 --- a/asl_articles/config/site.cfg.example +++ b/asl_articles/config/site.cfg.example @@ -7,3 +7,6 @@ DB_CONN_STRING = ... ; Base directory for external documents. EXTERNAL_DOCS_BASEDIR = ... + +; Base URL for the eASLRB. +ASLRB_BASE_URL = ... diff --git a/asl_articles/search.py b/asl_articles/search.py index 117b431..1cd5d9c 100644 --- a/asl_articles/search.py +++ b/asl_articles/search.py @@ -27,6 +27,7 @@ _logger = logging.getLogger( "search" ) _SQLITE_FTS_SPECIAL_CHARS = "+-#':/.@$" +# NOTE: The column order defined here is important, since we have to access row results by column index. 
_SEARCHABLE_COL_NAMES = [ "name", "name2", "description", "authors", "scenarios", "tags" ] _get_publisher_vals = lambda p: get_publisher_vals( p, True ) @@ -147,7 +148,9 @@ def search_publication( pub_id ): results = [ get_publication_vals( pub, True, True ) ] articles = sorted( pub.articles, key=get_article_sort_key ) for article in articles: - results.append( get_article_vals( article, True ) ) + article = get_article_vals( article, True ) + _create_aslrb_links( article ) + results.append( article ) return jsonify( results ) # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -158,9 +161,11 @@ def search_article( article_id ): article = Article.query.get( article_id ) if not article: return jsonify( [] ) - results = [ get_article_vals( article, True ) ] - if article.pub_id: - pub = Publication.query.get( article.pub_id ) + article = get_article_vals( article, True ) + _create_aslrb_links( article ) + results = [ article ] + if article["pub_id"]: + pub = Publication.query.get( article["pub_id"] ) if pub: results.append( get_publication_vals( pub, True, True ) ) return jsonify( results ) @@ -293,7 +298,8 @@ def _do_fts_search( fts_query_string, col_names, results=None ): #pylint: disabl # return highlighted versions of the content to the caller fields = _FIELD_MAPPINGS[ owner_type ] - for col_no,col_name in enumerate(["name","name2","description"]): + assert _SEARCHABLE_COL_NAMES[:3] == [ "name", "name2", "description" ] + for col_no,col_name in enumerate(_SEARCHABLE_COL_NAMES[:3]): field = fields.get( col_name ) if not field: continue @@ -309,6 +315,10 @@ def _do_fts_search( fts_query_string, col_names, results=None ): #pylint: disabl if row[7] and BEGIN_HILITE in row[7]: result[ "tags!" 
] = row[7].split( "\n" ) + # create links to the eASLRB + if owner_type == "article": + _create_aslrb_links( result ) + # add the result to the list results.append( result ) @@ -374,6 +384,110 @@ def _make_fts_query_string( query_string, search_aliases ): # --------------------------------------------------------------------- +# regex's that specify what a ruleid looks like +_RULEID_REGEXES = [ + re.compile( r"\b[A-Z]\d{0,3}\.\d{1,5}[A-Za-z]?\b" ), + # nb: while there are ruleid's like "C5", it's far more likely this is referring to a hex :-/ + #re.compile( r"\b[A-Z]\d{1,4}[A-Za-z]?\b" ), +] + +def _create_aslrb_links( article ): + """Create links to the ASLRB for ruleid's.""" + + # initialize + base_url = app.config.get( "ASLRB_BASE_URL", os.environ.get("ASLRB_BASE_URL") ) + if not base_url: + return + if "article_snippet!" in article: + snippet = article[ "article_snippet!" ] + else: + snippet = article[ "article_snippet" ] + + def make_link( startpos, endpos, ruleid, caption ): + nonlocal snippet + if ruleid: + link = "{}".format( + base_url, ruleid, caption + ) + snippet = snippet[:startpos] + link + snippet[endpos:] + else: + # NOTE: We can get here when a manually-created link has no ruleid e.g. because the content + # contains something that is incorrectly being detected as a ruleid, and the user has fixed it up. + snippet = snippet[:startpos] + caption + snippet[endpos:] + + # find ruleid's in the snippet and replace them with links to the ASLRB + matches = _find_aslrb_ruleids( snippet ) + for match in reversed(matches): + startpos, endpos, ruleid, caption = match + make_link( startpos, endpos, ruleid, caption ) + article[ "article_snippet!" 
] = snippet + +def _find_aslrb_ruleids( val ): #pylint: disable=too-many-branches + """Find ruleid's.""" + + # locate any manually-created links; format is "{:ruleid|caption:}" + # NOTE: The ruleid is optional, so that if something is incorrectly being detected as a ruleid, + # the user can disable the link by creating one of these with no ruleid. + manual = list( re.finditer( r"{:(.*?)\|(.+?):}", val ) ) + def is_manual( target ): + return any( + target.start() >= mo.start() and target.end() <= mo.end() + for mo in manual + ) + + # look for ruleid's + matches = [] + for regex in _RULEID_REGEXES: + for mo in regex.finditer( val ): + if is_manual( mo ): + continue # nb: ignore any ruleid's that are part of a manually-created link + matches.append( mo ) + + # FUDGE! Remove overlapping matches e.g. if we have "B1.23", we will have matches for "B1" and "B1.23". + matches2, prev_mo = [], None + matches.sort( key=lambda mo: mo.start() ) + for mo in matches: + if prev_mo and mo.start() == prev_mo.start() and len(mo.group()) < len(prev_mo.group()): + continue + matches2.append( mo ) + prev_mo = mo + + # extract the start/end positions of each match, ruleid and caption + matches = [ + [ mo.start(), mo.end(), mo.group(), mo.group() ] + for mo in matches2 + ] + + # NOTE: If we have something like "C1.23-.45", we want to link to "C1.23", + # but have the <a> tag wrap the whole thing. + # NOTE: This won't work if the user searched for "C1.23", since it will be wrapped + # in highlight markup. + for match in matches: + endpos = match[1] + if endpos == len(val) or val[endpos] != "-": + continue + nchars, allow_dot = 1, True + while endpos + nchars < len(val): + ch = val[ endpos + nchars ] + if ch.isdigit(): + nchars += 1 + elif ch == "."
and allow_dot: + nchars += 1 + allow_dot = False + else: + break + if nchars > 1: + match[1] += nchars + match[3] = val[ match[0] : match[1] ] + + # add any manually-created links + for mo in manual: + matches.append( [ mo.start(), mo.end(), mo.group(1), mo.group(2) ] ) + + return sorted( matches, key=lambda m: m[0] ) + +# --------------------------------------------------------------------- + def init_search( session, logger ): """Initialize the search engine.""" diff --git a/asl_articles/tests/test_search.py b/asl_articles/tests/test_search.py index 8f107b1..674e562 100644 --- a/asl_articles/tests/test_search.py +++ b/asl_articles/tests/test_search.py @@ -1,6 +1,6 @@ """ Test search operations. """ -from asl_articles.search import _load_search_aliases, _make_fts_query_string +from asl_articles.search import _load_search_aliases, _make_fts_query_string, _find_aslrb_ruleids from asl_articles.search import SEARCH_ALL from asl_articles.tests.test_publishers import create_publisher, edit_publisher @@ -584,6 +584,58 @@ def test_make_fts_query_string(): # --------------------------------------------------------------------- +def test_aslrb_links(): + """Test creating links to the ASLRB.""" + + def do_test( snippet, expected ): + matches = _find_aslrb_ruleids( snippet ) + if expected: + assert len(matches) == len(expected) + for match,exp in zip(matches,expected): + startpos, endpos, ruleid, caption = match + if isinstance( exp, str ): + assert exp == ruleid == caption + assert exp == snippet[ startpos : endpos ] + else: + assert isinstance( exp, tuple ) + assert exp[0] == ruleid + assert exp[1] == caption + else: + assert matches == [] + + # test detecting ruleid's + do_test( "A1.23", ["A1.23"] ) + do_test( " A1.23 ", ["A1.23"] ) + do_test( ".A1.23,", ["A1.23"] ) + do_test( "xA1.23,", None ) + do_test( "A1.23 B.4 C5. 
D6", ["A1.23","B.4"] ) + do_test( "A1.23 B.4,C5.;D6", ["A1.23","B.4"] ) + + # test ruleid ranges + do_test( "A1.23-", ["A1.23"] ) + do_test( "A1.23-4", [ ("A1.23","A1.23-4") ] ) + do_test( "A1.23-45", [ ("A1.23","A1.23-45") ] ) + do_test( "A1.23-.6", [ ("A1.23","A1.23-.6") ] ) + do_test( "A1.23-.6.7", [ ("A1.23","A1.23-.6") ] ) + + # test manually created links + do_test( "A1.23 Z9.99", + [ "A1.23", "Z9.99" ] + ) + do_test( "A1.23 {:D5.6|foo:} Z9.99", + [ "A1.23", ("D5.6","foo"), "Z9.99" ] + ) + do_test( "A1.23 {:|foo:} Z9.99", + [ "A1.23", ("","foo"), "Z9.99" ] + ) + # NOTE: Because the following manual link has no caption, it won't get detected as a manual link, + # and so the ruleid is detected as a normal ruleid. + do_test( "A1.23 {:D5.6|:} Z9.99", + [ "A1.23", "D5.6", "Z9.99" ] + ) + +# --------------------------------------------------------------------- + def _do_test_search( query, expected ): """Run a search and check the results.""" results = do_search( query ) diff --git a/docker-compose.yml b/docker-compose.yml index 0289070..a223190 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,6 +12,8 @@ # Similarly, EXTERNAL_DOCS_BASEDIR is the base directory for external documents that we want to link to, # but it needs to be set even if it's not being used :-/ # +## Similarly, ASLRB_BASE_URL is the base URL for an external eASLRB. +# # See the run-containers.sh script that manages all of this. version: "3" @@ -36,3 +38,4 @@ services: environment: - DBCONN - EXTERNAL_DOCS_BASEDIR + - ASLRB_BASE_URL diff --git a/run-containers.sh b/run-containers.sh index a3845b3..c4ca0b1 100755 --- a/run-containers.sh +++ b/run-containers.sh @@ -3,7 +3,7 @@ # parse the command-line arguments if [ -z "$1" ]; then - echo "Usage: `basename "$0"` " + echo "Usage: `basename "$0"` " echo " Build and launch the \"asl-articles\" containers, using the specified database e.g."
echo " ~/asl-articles.db (path to a SQLite database)" echo " postgresql://USER:PASS@host/dbname (database connection string)" @@ -11,6 +11,8 @@ if [ -z "$1" ]; then echo echo " If you want link articles to their original documents, specify a base directory for the documents." echo + echo " If you want to have links to an eASLRB, specify its base URL." + echo echo " The TAG env variable can also be set to specify which containers to run e.g." echo " TAG=testing ./run.sh /tmp/asl-articles.db" exit 0 @@ -35,6 +37,9 @@ else # FUDGE! This needs to be set, even if it's not being used :-/ export EXTERNAL_DOCS_BASEDIR=/dev/null fi +if [ ! -z "$3" ]; then + export ASLRB_BASE_URL=$3 +fi # initialize if [ "$TAG" == "testing" ]; then diff --git a/web/src/SearchResults.css b/web/src/SearchResults.css index 0f1d5e9..defb00b 100644 --- a/web/src/SearchResults.css +++ b/web/src/SearchResults.css @@ -29,6 +29,7 @@ .search-result .content { padding: 2px 5px ; font-size: 90% ; } .search-result .content p:not(:first-child) { margin-top: 0.25em ; } .search-result .content ul p, .search-result .content ol p { margin-top: 0.1em ; } +.search-result .content a.aslrb { color: #804040 ; text-decoration: none ; border-bottom: 1px dotted #804040 ; } .search-result .content .image { float: left ; margin: 0.25em 0.5em 0.5em 0 ; max-height: 8em ; max-width: 6em ; } .search-result .content .collapsible { margin-top:0.5em ; font-size: 90% ; color: #333 ; } .search-result .content .collapsible a { color: #333 ; text-decoration: none ; }