diff --git a/asl_rulebook2/utils.py b/asl_rulebook2/utils.py
index caeb264..5c9db95 100644
--- a/asl_rulebook2/utils.py
+++ b/asl_rulebook2/utils.py
@@ -1,8 +1,82 @@
 """ Miscellaneous utilities. """
 
+import os
 import pathlib
+import tempfile
 import re
 import math
+from io import StringIO
+from html.parser import HTMLParser
+
+# ---------------------------------------------------------------------
+
+class TempFile:
+    """Manage a temp file that can be closed while it's still being used."""
+
+    def __init__( self, mode="wb", extn=None, encoding=None ):
+        self.mode = mode
+        self.extn = extn
+        self.encoding = encoding
+        self.temp_file = None
+        self.name = None
+
+    def open( self ):
+        """Allocate a temp file."""
+        if self.encoding:
+            encoding = self.encoding
+        else:
+            encoding = "utf-8" if "b" not in self.mode else None
+        assert self.temp_file is None
+        self.temp_file = tempfile.NamedTemporaryFile(
+            mode = self.mode,
+            encoding = encoding,
+            suffix = self.extn,
+            delete = False
+        )
+        self.name = self.temp_file.name
+
+    def close( self, delete ):
+        """Close the temp file."""
+        self.temp_file.close()
+        if delete:
+            os.unlink( self.temp_file.name )
+
+    def write( self, data ):
+        """Write data to the temp file."""
+        self.temp_file.write( data )
+
+    def __enter__( self ):
+        """Enter the context manager."""
+        self.open()
+        return self
+
+    def __exit__( self, exc_type, exc_val, exc_tb ):
+        """Exit the context manager."""
+        self.close( delete=True )
+
+# ---------------------------------------------------------------------
+
+def strip_html( val ):
+    """Strip HTML."""
+
+    if not val:
+        return val
+
+    buf = StringIO()
+    class StripHtml( HTMLParser ):
+        """Strip HTML."""
+        def __init__( self ):
+            super().__init__()
+            self.strict = False
+        def handle_data( self, data ):
+            buf.write( data )
+        def error( self, message ):
+            pass
+
+    # strip HTML
+    html_stripper = StripHtml()
+    html_stripper.feed( val )
+    return buf.getvalue()
 
 # ---------------------------------------------------------------------
 
@@ -99,6 +173,10 @@ def append_text( buf, new ):
         buf += " "
     return buf + new
 
+def plural( n, name1, name2 ):
+    """Return the singular/plural form of a string."""
+    return "{} {}".format( n, name1 if n == 1 else name2 )
+
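+# NOTE: Usage sketches for the helpers above (illustrative only):
+#   with TempFile( mode="w", extn=".txt" ) as tmp:
+#       tmp.write( "hello" ) # nb: the temp file is deleted when the block exits
+#   strip_html( "<b>ELR</b>" ) #-> "ELR"
+#   plural( 2, "entry", "entries" ) #-> "2 entries"
+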
_fname, "Debug" ) +_load_config( "app.cfg", "System" ) +_load_config( "site.cfg", "Site Config" ) +_load_config( "debug.cfg", "Debug" ) # initialize logging -_fname = os.path.join( config_dir, "logging.yaml" ) +_fname = os.path.join( CONFIG_DIR, "logging.yaml" ) if os.path.isfile( _fname ): with open( _fname, "r", encoding="utf-8" ) as fp: try: diff --git a/asl_rulebook2/webapp/config/constants.py b/asl_rulebook2/webapp/config/constants.py index a49e889..35c818f 100644 --- a/asl_rulebook2/webapp/config/constants.py +++ b/asl_rulebook2/webapp/config/constants.py @@ -7,3 +7,4 @@ APP_VERSION = "v0.1" # nb: also update setup.py APP_DESCRIPTION = "Search engine for the ASL Rulebook." BASE_DIR = os.path.abspath( os.path.join( os.path.dirname(__file__), ".." ) ) +CONFIG_DIR = os.path.join( BASE_DIR, "config" ) diff --git a/asl_rulebook2/webapp/config/search-aliases.json b/asl_rulebook2/webapp/config/search-aliases.json new file mode 100644 index 0000000..05dc688 --- /dev/null +++ b/asl_rulebook2/webapp/config/search-aliases.json @@ -0,0 +1,25 @@ +{ + +"_comment_": "This file defines search aliases.", +"_comment_": "Keys that appear in a query string will match itself or any of its associated values.", +"_comment_": " e.g. searching for 'entrenchments' will actually search for 'entrenchments OR foxhole OR trench OR ditch'", +"_comment_": "These differ from search synonyms in that only the key word will trigger the replacement, not any word from the set.", +"_comment_": "A user-defined version of this file in the data directory will also be loaded.", + +"latw": [ + "atmm", "atr", "baz", "mol-p", "mol-projector", "piat", "pf", "pfk", "psk" +], +"fortification/foritifcations": [ + "cave", "a-t ditch", "foxhole", "sangar", "trench", "bunker", "minefield", "mines", "booby trap", "panji", "pillbox", "roadblock", "tetrahedron", "wire" +], +"entrenchment/entrenchments": [ + "foxhole", "trench", "ditch" +], +"vehicle/vehicles": [ + "tank", "halftrack", "half-track", "jeep", "carrier" +], +"illumination": [ + "tarshell", "illuminating round", "trip flare" +] + +} diff --git a/asl_rulebook2/webapp/config/search-replacements.json b/asl_rulebook2/webapp/config/search-replacements.json new file mode 100644 index 0000000..8faa469 --- /dev/null +++ b/asl_rulebook2/webapp/config/search-replacements.json @@ -0,0 +1,14 @@ +{ + +"_comment_": "This file defines search replacements.", +"_comment_": "Keys that appear in a query string will be replaced by the value.", +"_comment_": " e.g. searching for '1/2 MF' will actually search for '½ MF'", +"_comment_": "A user-defined version of this file in the data directory will also be loaded.", + +"1/2": "½", +"3/4": "¾", +"3/8": "⅜", +"5/8": "⅝", +"(r)": "®" + +} diff --git a/asl_rulebook2/webapp/config/search-synonyms.json b/asl_rulebook2/webapp/config/search-synonyms.json new file mode 100644 index 0000000..0ad12b2 --- /dev/null +++ b/asl_rulebook2/webapp/config/search-synonyms.json @@ -0,0 +1,51 @@ +[ + +"This file defines search synonyms.", +"If a word appears in a query string, it will match any of the words in its set.", +" e.g. 
diff --git a/asl_rulebook2/webapp/config/search-synonyms.json b/asl_rulebook2/webapp/config/search-synonyms.json
new file mode 100644
index 0000000..0ad12b2
--- /dev/null
+++ b/asl_rulebook2/webapp/config/search-synonyms.json
@@ -0,0 +1,51 @@
+[
+
+"This file defines search synonyms.",
+"If a word appears in a query string, it will match any of the words in its set.",
+"  e.g. searching for 'finn gun' will actually search for '(finn OR finnish) AND gun'",
+"These differ from search aliases in that any word from a set will trigger the replacement.",
+"A user-defined version of this file in the data directory will also be loaded.",
+
+[ "u.s.", "america", "american" ],
+[ "usmc", "marine" ],
+[ "finn", "finnish" ],
+[ "romania", "romanian" ],
+[ "hungary", "hungarian" ],
+[ "slovakia", "slovakian" ],
+[ "croatia", "croatian" ],
+[ "bulgaria", "bulgarian" ],
+
+[ "dc", "demo charge", "demolition charge" ],
+[ "ft", "flamethrower", "flame-thrower" ],
+[ "baz", "bazooka" ],
+[ "pf", "panzerfaust" ],
+[ "psk", "panzerschreck" ],
+[ "wp", "white phosphorus" ],
+[ "mol", "molotov cocktail" ],
+[ "ovr", "overrun" ],
+[ "cc", "close combat" ],
+[ "thh", "t-h hero", "tank-hunter hero" ],
+[ "scw", "shaped-charge weapon" ],
+[ "sw", "support weapon" ],
+[ "mg", "machinegun", "machine-gun", "machine gun" ],
+[ "firelane", "fire-lane", "fire lane" ],
+[ "firegroup", "fire-group", "fire group" ],
+[ "lc", "landing craft" ],
+[ "ht", "halftrack", "half-track" ],
+[ "wa", "wall advantage" ],
+[ "hob", "heat of battle" ],
+[ "cg", "campaign game" ],
+[ "pbm", "pbem" ],
+
+[ "rb", "red barricades" ],
+[ "votg", "valor of the guards" ],
+[ "kgp", "kampfgruppe peiper" ],
+[ "kgs", "kampfgruppe scherer" ],
+[ "brt", "br:t", "blood reef tarawa" ],
+[ "pb", "pegasus bridge" ],
+
+[ "ammo", "ammunition" ],
+[ "armor", "armour" ],
+[ "color", "colour" ]
+
+]
diff --git a/asl_rulebook2/webapp/content.py b/asl_rulebook2/webapp/content.py
index 1055126..0d020cf 100644
--- a/asl_rulebook2/webapp/content.py
+++ b/asl_rulebook2/webapp/content.py
@@ -2,6 +2,7 @@
 
 import os
 import io
+import json
 import glob
 
 from flask import jsonify, send_file, url_for, abort
@@ -13,7 +14,7 @@ content_docs = None
 
 # ---------------------------------------------------------------------
 
-def load_content_docs():
+def load_content_docs( logger ):
     """Load the content documents from the data directory."""
 
     # initialize
@@ -29,26 +30,32 @@ def load_content_docs():
             fname = os.path.join( dname, fname )
             if not os.path.isfile( fname ):
                 return
-            kwargs = {}
-            kwargs["mode"] = "rb" if binary else "r"
-            if not binary:
-                kwargs["encoding"] = "utf-8"
-            with open( fname, **kwargs ) as fp:
-                content_doc[ key ] = fp.read()
+            if binary:
+                with open( fname, mode="rb" ) as fp:
+                    data = fp.read()
+                logger.debug( "- Loaded \"%s\" file: #bytes=%d", key, len(data) )
+                content_doc[ key ] = data
+            else:
+                with open( fname, "r", encoding="utf-8" ) as fp:
+                    content_doc[ key ] = json.load( fp )
+                logger.debug( "- Loaded \"%s\" file.", key )
 
     # load each content doc
+    logger.info( "Loading content docs: %s", dname )
     fspec = os.path.join( dname, "*.index" )
     for fname in glob.glob( fspec ):
-        fname = os.path.basename( fname )
-        title = os.path.splitext( fname )[0]
+        fname2 = os.path.basename( fname )
+        logger.info( "- %s", fname2 )
+        title = os.path.splitext( fname2 )[0]
         content_doc = {
+            "_fname": fname,
             "doc_id": slugify( title ),
             "title": title,
         }
-        get_doc( content_doc, "index", fname )
-        get_doc( content_doc, "targets", change_extn(fname,".targets") )
-        get_doc( content_doc, "footnotes", change_extn(fname,".footnotes") )
-        get_doc( content_doc, "content", change_extn(fname,".pdf"), binary=True )
+        get_doc( content_doc, "index", fname2 )
+        get_doc( content_doc, "targets", change_extn(fname2,".targets") )
+        get_doc( content_doc, "footnotes", change_extn(fname2,".footnotes") )
+        get_doc( content_doc, "content", change_extn(fname2,".pdf"), binary=True )
         content_docs[ content_doc["doc_id"] ] = content_doc
 
 # ---------------------------------------------------------------------
@@ -59,11 +66,13 @@ def get_content_docs():
     resp = {}
     for cdoc in content_docs.values():
         cdoc2 = {
-            "docId": cdoc["doc_id"],
+            "doc_id": cdoc["doc_id"],
             "title": cdoc["title"],
         }
         if "content" in cdoc:
             cdoc2["url"] = url_for( "get_content", doc_id=cdoc["doc_id"] )
+        if "targets" in cdoc:
+            cdoc2["targets"] = cdoc["targets"]
         resp[ cdoc["doc_id"] ] = cdoc2
     return jsonify( resp )
diff --git a/asl_rulebook2/webapp/main.py b/asl_rulebook2/webapp/main.py
index 0df4676..85da6b3 100644
--- a/asl_rulebook2/webapp/main.py
+++ b/asl_rulebook2/webapp/main.py
@@ -9,6 +9,7 @@ from flask import render_template, jsonify, abort
 
 from asl_rulebook2.webapp import app, globvars, shutdown_event
 from asl_rulebook2.webapp.content import load_content_docs
+from asl_rulebook2.webapp.search import init_search
 from asl_rulebook2.webapp.utils import parse_int
 
 # ---------------------------------------------------------------------
@@ -20,7 +21,9 @@ def init_webapp():
     after that by the test suite, to reset the webapp before each test.
     """
     # initialize the webapp
-    load_content_docs()
+    logger = logging.getLogger( "startup" )
+    load_content_docs( logger )
+    init_search( logger )
 
 # ---------------------------------------------------------------------
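Content docs are discovered purely by file naming convention: for each "*.index" file, the
matching ".targets", ".footnotes" and ".pdf" files are picked up alongside it. For example
(hypothetical filenames), a data directory might contain:

    ASL Rulebook.index      (index entries, JSON)
    ASL Rulebook.targets    (JSON)
    ASL Rulebook.footnotes  (JSON)
    ASL Rulebook.pdf        (binary content, served to the front-end)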
-> ((U.S.)) + ] +] + +# these are used to separate ruleref's in the FTS table (internal use only) +_RULEREF_SEPARATOR = "-:-" + +_SEARCH_TERM_ADJUSTMENTS = None + +# --------------------------------------------------------------------- + +@app.route( "/search", methods=["POST"] ) +def search() : + """Run a search.""" + + # log the request + _logger.info( "SEARCH REQUEST:" ) + args = dict( request.form.items() ) + for key,val in args.items(): + _logger.info( "- %s: %s", key, val ) + + # run the search + try: + return _do_search( args ) + except Exception as exc: #pylint: disable=broad-except + msg = str( exc ) + if msg.startswith( "fts5: " ): + msg = msg[5:] # nb: this is a sqlite3.OperationalError + _logger.warning( "SEARCH ERROR: %s\n%s", args, traceback.format_exc() ) + return jsonify( { "error": msg } ) + +def _do_search( args ): + + def fixup_text( val ): + if val is None: + return None + for regex in _FIXUP_TEXT_REGEXES: + val = regex[0].sub( regex[1], val ) + return val + + # run the search + query_string = args[ "queryString" ].strip() + if query_string == "!:simulated-error:!": + raise RuntimeError( "Simulated error." ) # nb: for the test suite + fts_query_string, search_terms = _make_fts_query_string( query_string ) + _logger.debug( "FTS query string: %s", fts_query_string ) + conn = sqlite3.connect( _sqlite_path ) + def highlight( n ): + # NOTE: highlight() is an FTS extension function, and takes column numbers :-/ + return "highlight(searchable,{},'{}','{}')".format( n, _BEGIN_HIGHLIGHT, _END_HIGHLIGHT ) + sql = "SELECT rowid,doc_id,sr_type,rank,{},{},{},{} FROM searchable".format( + highlight(2), highlight(3), highlight(4), highlight(5) + ) + sql += " WHERE searchable MATCH ?" + sql += " ORDER BY rank" + curs = conn.execute( sql, + ( "{title subtitle content rulerefs}: " + fts_query_string, ) + ) + + def get_col( sr, key, val ): + if val: + sr[key] = fixup_text( val ) + + # get the results + results = [] + for row in curs: + if row[2] != "index": + _logger.error( "Unknown searchable row type (rowid=%d): %s", row[0], row[2] ) + continue + index_entry = _fts_index_entries[ row[0] ] + result = { + "doc_id": row[1], + "sr_type": row[2], + "_score": - row[3], + } + get_col( result, "title", row[4] ) + get_col( result, "subtitle", row[5] ) + get_col( result, "content", row[6] ) + if index_entry.get( "ruleids" ): + result["ruleids"] = index_entry["ruleids"] + if index_entry.get( "see_also" ): + result["see_also"] = index_entry["see_also"] + rulerefs = [ r.strip() for r in row[7].split(_RULEREF_SEPARATOR) ] if row[7] else [] + assert len(rulerefs) == len(index_entry.get("rulerefs",[])) + if rulerefs: + result[ "rulerefs" ] = [] + for i, ruleref in enumerate(rulerefs): + ruleref2 = {} + if "caption" in index_entry["rulerefs"][i]: + assert ruleref.replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ) \ + == index_entry["rulerefs"][i]["caption"] + ruleref2["caption"] = fixup_text( ruleref ) + if "ruleids" in index_entry["rulerefs"][i]: + ruleref2["ruleids"] = index_entry["rulerefs"][i]["ruleids"] + assert ruleref2 + result["rulerefs"].append( ruleref2 ) + results.append( result ) + + # fixup the results + results = _fixup_results_for_hash_terms( results, search_terms ) + + # adjust the sort order + results = _adjust_sort_order( results ) + + # return the results + _logger.debug( "Search results:" if len(results) > 0 else "Search results: none" ) + for result in results: + _logger.debug( "- %s (%.3f)", + result["title"].replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ), 
+ result["_score"] + ) + return jsonify( results ) + +# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +PASSTHROUGH_REGEXES = set([ + re.compile( r"\bAND\b" ), + re.compile( r"\bOR\b" ), + re.compile( r"\bNOT\b" ), + re.compile( r"\((?![Rr]\))" ), +]) + +def _make_fts_query_string( query_string ): + """Generate the SQLite query string. + + SQLite's MATCH function recognizes a lot of special characters, which need + to be enclosed in double-quotes to disable. + """ + + # check if this looks like a raw FTS query + if any( regex.search(query_string) for regex in PASSTHROUGH_REGEXES ): + return query_string.strip(), None + + # split the search string into words (taking quoted phrases into account) + ignore = app.config.get( "SQLITE_FTS_IGNORE_CHARS", ",;!?$" ) + query_string = "".join( ch for ch in query_string if ch not in ignore ) + terms = query_string.lower().split() + i = 0 + while True: + if i >= len(terms): + break + if i > 0 and terms[i-1].startswith( '"' ): + terms[i-1] += " {}".format( terms[i] ) + del terms[i] + if terms[i-1].startswith( '"' ) and terms[i-1].endswith( '"' ): + terms[i-1] = terms[i-1][1:-1] + continue + i += 1 + + # clean up quoted phrases + terms = [ t[1:] if t.startswith('"') else t for t in terms ] + terms = [ t[:-1] if t.endswith('"') else t for t in terms ] + terms = [ t.strip() for t in terms ] + terms = [ t for t in terms if t ] + + # adjust search terms + for term_no, term in enumerate(terms): + aliases = _SEARCH_TERM_ADJUSTMENTS.get( term ) + if not aliases: + continue + if isinstance( aliases, str ): + # the search term is replaced by a new one + terms[ term_no ] = aliases + elif isinstance( aliases, set ): + # the search term is replaced by multiple new ones (that will be OR'ed together) + # NOTE: We sort the terms so that the tests will work reliably. + terms[ term_no ] = sorted( aliases ) + else: + assert "Unknown search alias type: {}".format( type(aliases) ) + + # fixup each term + def has_special_char( term ): + """Check if the term contains any special characters.""" + for ch in term: + if ch in "*": + continue + if ch.isspace() or ch in string.punctuation: + return True + if ord(ch) < 32 or ord(ch) > 127: + return True + return False + def fixup_terms( terms ): + """Fixup a list of terms.""" + for term_no, term in enumerate(terms): + if isinstance( term, str ): + if has_special_char( term ): + terms[term_no] = '"{}"'.format( term ) + else: + fixup_terms( term ) + fixup_terms( terms ) + + # return the final FTS query string + def term_string( term ): + if isinstance( term, str ): + return term + assert isinstance( term, list ) + return "( {} )".format( " OR ".join( term ) ) + return " AND ".join( term_string(t) for t in terms ), terms + +# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +def _fixup_results_for_hash_terms( results, search_terms ): + """Fixup search results for search terms that end with a hash. + + SQLite doesn't handle search terms that end with a hash particularly well. + We correct highlighted search terms in fixup_text(), but searching for e.g. "US#" + will also match "use" and "using" - we remove such results here. + """ + + # figure out which search terms end with a hash + # NOTE: We don't bother descending down into sub-terms. 
+
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+def _fixup_results_for_hash_terms( results, search_terms ):
+    """Fixup search results for search terms that end with a hash.
+
+    SQLite doesn't handle search terms that end with a hash particularly well.
+    We correct highlighted search terms in fixup_text(), but searching for e.g. "US#"
+    will also match "use" and "using" - we remove such results here.
+    """
+
+    # figure out which search terms end with a hash
+    # NOTE: We don't bother descending down into sub-terms.
+    if not search_terms:
+        return results
+    terms = [
+        t[1:-1] for t in search_terms
+        if isinstance(t,str) and t.startswith('"') and t.endswith('"')
+    ]
+    terms = [
+        t[:-1].lower() for t in terms
+        if isinstance(t,str) and t.endswith("#")
+    ]
+    if not terms:
+        return results
+    if "us" in terms:
+        terms.extend( [ "use", "used", "using", "user" ] )
+
+    def keep( sr ):
+        # remove every incorrectly matched search term (e.g. ((K)) when searching for "K#")
+        buf = json.dumps( sr ).lower()
+        for term in terms:
+            buf = buf.replace( "{}{}{}".format( _BEGIN_HIGHLIGHT, term, _END_HIGHLIGHT ), "_removed_" )
+        # we keep this search result if there are still some highlighted search terms
+        return _BEGIN_HIGHLIGHT in buf
+
+    return [
+        result for result in results if keep(result)
+    ]
+
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+def _adjust_sort_order( results ):
+    """Adjust the sort order of the search results."""
+
+    results2 = []
+    def extract_sr( func ):
+        # move results that pass the filter function to the new list
+        i = 0
+        while True:
+            if i >= len(results):
+                break
+            # NOTE: We never prefer small entries (i.e. those that have no ruleref's),
+            # e.g. those that only contain a "see also".
+            if func( results[i] ) and len(results[i].get("rulerefs",[])) > 0:
+                results2.append( results[i] )
+                del results[i]
+            else:
+                i += 1
+
+    def get( sr, key ):
+        val = sr.get( key )
+        return val if val else ""
+
+    # prefer search results whose title is an exact match
+    extract_sr(
+        lambda sr: get(sr,"title").startswith( _BEGIN_HIGHLIGHT ) and get(sr,"title").endswith( _END_HIGHLIGHT )
+    )
+    # prefer search results whose title starts with a match
+    extract_sr(
+        lambda sr: get(sr,"title").startswith( _BEGIN_HIGHLIGHT )
+    )
+    # prefer search results that have a match in the title
+    extract_sr(
+        lambda sr: _BEGIN_HIGHLIGHT in get(sr,"title")
+    )
+    # prefer search results that have a match in the subtitle
+    extract_sr(
+        lambda sr: _BEGIN_HIGHLIGHT in get(sr,"subtitle")
+    )
+
+    # include any remaining search results
+    results2.extend( results )
+
+    return results2
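+
+# NOTE: For example, after searching for "cc", a result titled "!@:CC:@!" (an exact match)
+# is moved ahead of one titled "!@:CC:@!Ph" (title starts with a match), which is moved
+# ahead of one that merely contains "!@:CC:@!" somewhere in its title (titles illustrative).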
) + conn.execute( "DELETE FROM searchable" ) + curs = conn.cursor() + for cdoc in webapp_content.content_docs.values(): + logger.info( "- Loading index file: %s", cdoc["_fname"] ) + nrows = 0 + for index_entry in cdoc["index"]: + rulerefs = _RULEREF_SEPARATOR.join( r.get("caption","") for r in index_entry.get("rulerefs",[]) ) + # NOTE: We should really strip content before adding it to the search index, otherwise any HTML tags + # will need to be included in search terms. However, this means that the content returned by a query + # will be this stripped content. We could go back to the original data to get the original HTML content, + # but that means we would lose the highlighting of search terms that SQLite gives us. We opt to insert + # the original content, since none of it should contain HTML, anyway. + curs.execute( + "INSERT INTO searchable (doc_id,sr_type,title,subtitle,content,rulerefs) VALUES (?,?,?,?,?,?)", ( + cdoc["doc_id"], "index", + index_entry.get("title"), index_entry.get("subtitle"), index_entry.get("content"), rulerefs + ) ) + _fts_index_entries[ curs.lastrowid ] = index_entry + index_entry["_fts_rowid"] = curs.lastrowid + nrows += 1 + conn.commit() + logger.info( " - Loaded %s.", plural(nrows,"index entry","index entries"), ) + assert len(_fts_index_entries) == _get_row_count( conn, "searchable" ) + + # load the search config + load_search_config( logger ) + +# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +def load_search_config( logger ): + """Load the search config.""" + + # initialize + global _SEARCH_TERM_ADJUSTMENTS + _SEARCH_TERM_ADJUSTMENTS = {} + + def add_search_term_adjustment( key, vals ): + # make sure everything is lower-case + key = key.lower() + if isinstance( vals, str ): + vals = vals.lower() + elif isinstance( vals, set ): + vals = set( v.lower() for v in vals ) + else: + assert "Unknown search alias type: {}".format( type(vals) ) + # add new the search term adjustment + if key not in _SEARCH_TERM_ADJUSTMENTS: + _SEARCH_TERM_ADJUSTMENTS[ key ] = vals + else: + # found a multiple definition - try to do something sensible + logger.warning( " - Duplicate search alias: %s\n- current aliases = %s\n- new aliases = %s", key, + _SEARCH_TERM_ADJUSTMENTS[key], vals + ) + if isinstance( _SEARCH_TERM_ADJUSTMENTS[key], str ): + _SEARCH_TERM_ADJUSTMENTS[ key ] = vals + else: + assert isinstance( _SEARCH_TERM_ADJUSTMENTS[key], set ) + _SEARCH_TERM_ADJUSTMENTS[ key ].update( vals ) + + # load the search replacements + def load_search_replacements( fname ): + if not os.path.isfile( fname ): + return + logger.info( "Loading search replacements: %s", fname ) + with open( fname, "r", encoding="utf-8" ) as fp: + data = json.load( fp ) + nitems = 0 + for key, val in data.items(): + if key.startswith( "_" ): + continue # nb: ignore comments + logger.debug( "- %s -> %s", key, val ) + add_search_term_adjustment( key, val ) + nitems += 1 + logger.info( "- Loaded %s.", plural(nitems,"search replacement","search replacements") ) + load_search_replacements( make_config_path( "search-replacements.json" ) ) + load_search_replacements( make_data_path( "search-replacements.json" ) ) + + # load the search aliases + def load_search_aliases( fname ): + if not os.path.isfile( fname ): + return + logger.info( "Loading search aliases: %s", fname ) + with open( fname, "r", encoding="utf-8" ) as fp: + data = json.load( fp ) + nitems = 0 + for keys, aliases in data.items(): + if keys.startswith( "_" ): + continue # nb: ignore comments + logger.debug( "- %s -> %s", 
keys, " ; ".join(aliases) ) + for key in keys.split( "/" ): + add_search_term_adjustment( key, set( itertools.chain( aliases, [key] ) ) ) + nitems += 1 + logger.info( "- Loaded %s.", plural(nitems,"search aliases","search aliases") ) + load_search_aliases( make_config_path( "search-aliases.json" ) ) + load_search_aliases( make_data_path( "search-aliases.json" ) ) + + # load the search synonyms + def load_search_synonyms( fname ): + if not os.path.isfile( fname ): + return + logger.info( "Loading search synonyms: %s", fname ) + with open( fname, "r", encoding="utf-8" ) as fp: + data = json.load( fp ) + nitems = 0 + for synonyms in data: + if isinstance( synonyms, str ): + continue # nb: ignore comments + logger.debug( "- %s", " ; ".join(synonyms) ) + synonyms = set( synonyms ) + for term in synonyms: + add_search_term_adjustment( term, synonyms ) + nitems += 1 + logger.info( "- Loaded %s.", plural(nitems,"search synonym","search synonyms") ) + load_search_synonyms( make_config_path( "search-synonyms.json" ) ) + load_search_synonyms( make_data_path( "search-synonyms.json" ) ) + +# --------------------------------------------------------------------- + +def _get_row_count( conn, table_name ): + """Get the number of rows in a table.""" + cur = conn.execute( "SELECT count(*) FROM {}".format( table_name ) ) + return cur.fetchone()[0] diff --git a/asl_rulebook2/webapp/static/ContentPane.js b/asl_rulebook2/webapp/static/ContentPane.js index 398eb60..53fa05e 100644 --- a/asl_rulebook2/webapp/static/ContentPane.js +++ b/asl_rulebook2/webapp/static/ContentPane.js @@ -8,13 +8,13 @@ gMainApp.component( "content-pane", { template: ` - + `, mounted() { - gEventBus.on( "show-content-doc", (docId) => { + gEventBus.on( "show-target", (docId, target) => { //eslint-disable-line no-unused-vars this.$refs.tabbedPages.activateTab( docId ) ; // nb: tabId == docId } ) ; }, @@ -27,14 +27,37 @@ gMainApp.component( "content-doc", { props: [ "doc" ], data() { return { + target: null, noContent: gUrlParams.get( "no-content" ), } ; }, template: ` -
diff --git a/asl_rulebook2/webapp/static/ContentPane.js b/asl_rulebook2/webapp/static/ContentPane.js
index 398eb60..53fa05e 100644
--- a/asl_rulebook2/webapp/static/ContentPane.js
+++ b/asl_rulebook2/webapp/static/ContentPane.js
@@ -8,13 +8,13 @@ gMainApp.component( "content-pane", {
 
     template: `
-
+
 `,
 
     mounted() {
-        gEventBus.on( "show-content-doc", (docId) => {
+        gEventBus.on( "show-target", (docId, target) => { //eslint-disable-line no-unused-vars
             this.$refs.tabbedPages.activateTab( docId ) ; // nb: tabId == docId
         } ) ;
     },
@@ -27,14 +27,37 @@ gMainApp.component( "content-doc", {
 
     props: [ "doc" ],
 
     data() { return {
+        target: null,
         noContent: gUrlParams.get( "no-content" ),
    } ; },
 
     template: `
-
-
             Content disabled.
-