diff --git a/asl_rulebook2/utils.py b/asl_rulebook2/utils.py
index caeb264..5c9db95 100644
--- a/asl_rulebook2/utils.py
+++ b/asl_rulebook2/utils.py
@@ -1,8 +1,82 @@
""" Miscellaneous utilities. """
+import os
import pathlib
+import tempfile
import re
import math
+from io import StringIO
+from html.parser import HTMLParser
+
+# ---------------------------------------------------------------------
+
+class TempFile:
+    """Manage a temp file that can be closed while it's still being used.
+
+    The file is created with delete=False, so it remains on disk (and reachable
+    via the `name` attribute) after it has been closed, until close() is called
+    with delete=True. When used as a context manager, the file is always deleted
+    on exit.
+    """
+
+    def __init__( self, mode="wb", extn=None, encoding=None ):
+        """Record the settings (the file itself is not created until open() is called).
+
+        mode:     mode to open the temp file with
+        extn:     suffix to give the temp file's name (if any)
+        encoding: text encoding to use (defaults to UTF-8 for text-mode files)
+        """
+        self.mode = mode
+        self.extn = extn
+        self.encoding = encoding
+        self.temp_file = None
+        self.name = None
+
+    def open( self ):
+        """Allocate a temp file."""
+        if self.encoding:
+            encoding = self.encoding
+        else:
+            # nb: binary files must not be given an encoding
+            encoding = "utf-8" if "b" not in self.mode else None
+        assert self.temp_file is None
+        self.temp_file = tempfile.NamedTemporaryFile(
+            mode = self.mode,
+            encoding = encoding,
+            suffix = self.extn,
+            delete = False
+        )
+        self.name = self.temp_file.name
+
+    def close( self, delete ):
+        """Close the temp file, and optionally delete it from disk.
+
+        This can be called more than once e.g. with delete=False to flush the file
+        so that something else can read it, then later with delete=True to clean up.
+        """
+        if self.temp_file is None:
+            return # nb: the temp file was never opened
+        self.temp_file.close()
+        if delete:
+            try:
+                os.unlink( self.temp_file.name )
+            except FileNotFoundError:
+                pass # nb: the file has already been deleted (e.g. by an earlier call)
+
+    def write( self, data ):
+        """Write data to the temp file."""
+        self.temp_file.write( data )
+
+    def __enter__( self ):
+        """Enter the context manager."""
+        self.open()
+        return self
+
+    def __exit__( self, exc_type, exc_val, exc_tb ):
+        """Exit the context manager."""
+        self.close( delete=True )
+
+# ---------------------------------------------------------------------
+
+def strip_html( val ):
+    """Strip HTML.
+
+    Returns the text content of val, with all HTML tags removed. Empty values
+    (None or an empty string) are returned as-is.
+    """
+
+    if not val:
+        return val
+
+    buf = StringIO()
+    class StripHtml( HTMLParser ):
+        """Collect the text content of the HTML, ignoring the tags."""
+        def handle_data( self, data ):
+            buf.write( data )
+        def error( self, message ):
+            pass # nb: ignore parse errors
+
+    # strip HTML
+    html_stripper = StripHtml()
+    html_stripper.feed( val )
+    # NOTE: The parser holds back trailing text that might be an incomplete character reference
+    # (e.g. the "&T" in "AT&T") until it's told there is no more input, so we must close it.
+    html_stripper.close()
+    return buf.getvalue()
# ---------------------------------------------------------------------
@@ -99,6 +173,10 @@ def append_text( buf, new ):
buf += " "
return buf + new
+def plural( n, name1, name2 ):
+    """Format a count with its noun, using the singular (name1) or plural (name2) form."""
+    name = name1 if n == 1 else name2
+    return "{} {}".format( n, name )
+
def remove_quotes( val ):
"""Remove enclosing quotes from a string."""
if val[0] in ('"',"'") and val[-1] == val[0]:
diff --git a/asl_rulebook2/webapp/__init__.py b/asl_rulebook2/webapp/__init__.py
index 1d3c473..19b3120 100644
--- a/asl_rulebook2/webapp/__init__.py
+++ b/asl_rulebook2/webapp/__init__.py
@@ -11,7 +11,7 @@ from flask import Flask
import flask.cli
import yaml
-from asl_rulebook2.webapp.config.constants import BASE_DIR
+from asl_rulebook2.webapp.config.constants import BASE_DIR, CONFIG_DIR
shutdown_event = threading.Event()
@@ -19,6 +19,7 @@ shutdown_event = threading.Event()
def _load_config( fname, section ):
"""Load config settings from a file."""
+ fname = os.path.join( CONFIG_DIR, fname )
if not os.path.isfile( fname ):
return
config_parser = configparser.ConfigParser()
@@ -50,21 +51,12 @@ flask.cli.show_server_banner = lambda *args: None
app = Flask( __name__ )
# load the application configuration
-config_dir = os.path.join( BASE_DIR, "config" )
-_fname = os.path.join( config_dir, "app.cfg" )
-_load_config( _fname, "System" )
-
-# load any site configuration
-_fname = os.path.join( config_dir, "site.cfg" )
-_load_config( _fname, "Site Config" )
-
-# load any debug configuration
-_fname = os.path.join( config_dir, "debug.cfg" )
-if os.path.isfile( _fname ) :
- _load_config( _fname, "Debug" )
+_load_config( "app.cfg", "System" )
+_load_config( "site.cfg", "Site Config" )
+_load_config( "debug.cfg", "Debug" )
# initialize logging
-_fname = os.path.join( config_dir, "logging.yaml" )
+_fname = os.path.join( CONFIG_DIR, "logging.yaml" )
if os.path.isfile( _fname ):
with open( _fname, "r", encoding="utf-8" ) as fp:
try:
diff --git a/asl_rulebook2/webapp/config/constants.py b/asl_rulebook2/webapp/config/constants.py
index a49e889..35c818f 100644
--- a/asl_rulebook2/webapp/config/constants.py
+++ b/asl_rulebook2/webapp/config/constants.py
@@ -7,3 +7,4 @@ APP_VERSION = "v0.1" # nb: also update setup.py
APP_DESCRIPTION = "Search engine for the ASL Rulebook."
BASE_DIR = os.path.abspath( os.path.join( os.path.dirname(__file__), ".." ) )
+CONFIG_DIR = os.path.join( BASE_DIR, "config" )
diff --git a/asl_rulebook2/webapp/config/search-aliases.json b/asl_rulebook2/webapp/config/search-aliases.json
new file mode 100644
index 0000000..05dc688
--- /dev/null
+++ b/asl_rulebook2/webapp/config/search-aliases.json
@@ -0,0 +1,25 @@
+{
+
+"_comment_": "This file defines search aliases.",
+"_comment_": "A key that appears in a query string will match itself or any of its associated values.",
+"_comment_": " e.g. searching for 'entrenchments' will actually search for 'entrenchments OR foxhole OR trench OR ditch'",
+"_comment_": "These differ from search synonyms in that only the key word will trigger the replacement, not any word from the set.",
+"_comment_": "A user-defined version of this file in the data directory will also be loaded.",
+
+"latw": [
+ "atmm", "atr", "baz", "mol-p", "mol-projector", "piat", "pf", "pfk", "psk"
+],
+"fortification/fortifications": [
+ "cave", "a-t ditch", "foxhole", "sangar", "trench", "bunker", "minefield", "mines", "booby trap", "panji", "pillbox", "roadblock", "tetrahedron", "wire"
+],
+"entrenchment/entrenchments": [
+ "foxhole", "trench", "ditch"
+],
+"vehicle/vehicles": [
+ "tank", "halftrack", "half-track", "jeep", "carrier"
+],
+"illumination": [
+ "tarshell", "illuminating round", "trip flare"
+]
+
+}
diff --git a/asl_rulebook2/webapp/config/search-replacements.json b/asl_rulebook2/webapp/config/search-replacements.json
new file mode 100644
index 0000000..8faa469
--- /dev/null
+++ b/asl_rulebook2/webapp/config/search-replacements.json
@@ -0,0 +1,14 @@
+{
+
+"_comment_": "This file defines search replacements.",
+"_comment_": "Keys that appear in a query string will be replaced by the value.",
+"_comment_": " e.g. searching for '1/2 MF' will actually search for '½ MF'",
+"_comment_": "A user-defined version of this file in the data directory will also be loaded.",
+
+"1/2": "½",
+"3/4": "¾",
+"3/8": "⅜",
+"5/8": "⅝",
+"(r)": "®"
+
+}
diff --git a/asl_rulebook2/webapp/config/search-synonyms.json b/asl_rulebook2/webapp/config/search-synonyms.json
new file mode 100644
index 0000000..0ad12b2
--- /dev/null
+++ b/asl_rulebook2/webapp/config/search-synonyms.json
@@ -0,0 +1,51 @@
+[
+
+"This file defines search synonyms.",
+"If a word appears in a query string, it will match any of the words in its set.",
+" e.g. searching for 'finn gun' will actually search for '(finn OR finnish) AND gun'",
+"These differ from search aliases in that any word from a set will trigger the replacement.",
+"A user-defined version of this file in the data directory will also be loaded.",
+
+[ "u.s.", "america", "american" ],
+[ "usmc", "marine" ],
+[ "finn", "finnish" ],
+[ "romania", "romanian" ],
+[ "hungary", "hungarian" ],
+[ "slovakia", "slovakian" ],
+[ "croatia", "croatian" ],
+[ "bulgaria", "bulgarian" ],
+
+[ "dc", "demo charge", "demolition charge" ],
+[ "ft", "flamethrower", "flame-thrower" ],
+[ "baz", "bazooka" ],
+[ "pf", "panzerfaust" ],
+[ "psk", "panzerschreck" ],
+[ "wp", "white phosphorus" ],
+[ "mol", "molotov cocktail" ],
+[ "ovr", "overrun" ],
+[ "cc", "close combat" ],
+[ "thh", "t-h hero", "tank-hunter hero" ],
+[ "scw", "shaped-charge weapon" ],
+[ "sw", "support weapon" ],
+[ "mg", "machinegun", "machine-gun", "machine gun" ],
+[ "firelane", "fire-lane", "fire lane" ],
+[ "firegroup", "fire-group", "fire group" ],
+[ "lc", "landing craft" ],
+[ "ht", "halftrack", "half-track" ],
+[ "wa", "wall advantage" ],
+[ "hob", "heat of battle" ],
+[ "cg", "campaign game" ],
+[ "pbm", "pbem" ],
+
+[ "rb", "red barricades" ],
+[ "votg", "valor of the guards" ],
+[ "kgp", "kampfgruppe peiper" ],
+[ "kgs", "kampfgruppe scherer" ],
+[ "brt", "br:t", "blood reef tarawa" ],
+[ "pb", "pegasus bridge" ],
+
+[ "ammo", "ammunition" ],
+[ "armor", "armour" ],
+[ "color", "colour" ]
+
+]
diff --git a/asl_rulebook2/webapp/content.py b/asl_rulebook2/webapp/content.py
index 1055126..0d020cf 100644
--- a/asl_rulebook2/webapp/content.py
+++ b/asl_rulebook2/webapp/content.py
@@ -2,6 +2,7 @@
import os
import io
+import json
import glob
from flask import jsonify, send_file, url_for, abort
@@ -13,7 +14,7 @@ content_docs = None
# ---------------------------------------------------------------------
-def load_content_docs():
+def load_content_docs( logger ):
"""Load the content documents from the data directory."""
# initialize
@@ -29,26 +30,32 @@ def load_content_docs():
fname = os.path.join( dname, fname )
if not os.path.isfile( fname ):
return
- kwargs = {}
- kwargs["mode"] = "rb" if binary else "r"
- if not binary:
- kwargs["encoding"] = "utf-8"
- with open( fname, **kwargs ) as fp:
- content_doc[ key ] = fp.read()
+ if binary:
+ with open( fname, mode="rb" ) as fp:
+ data = fp.read()
+ logger.debug( "- Loaded \"%s\" file: #bytes=%d", key, len(data) )
+ content_doc[ key ] = data
+ else:
+ with open( fname, "r", encoding="utf-8" ) as fp:
+ content_doc[ key ] = json.load( fp )
+ logger.debug( "- Loaded \"%s\" file.", key )
# load each content doc
+ logger.info( "Loading content docs: %s", dname )
fspec = os.path.join( dname, "*.index" )
for fname in glob.glob( fspec ):
- fname = os.path.basename( fname )
- title = os.path.splitext( fname )[0]
+ fname2 = os.path.basename( fname )
+ logger.info( "- %s", fname2 )
+ title = os.path.splitext( fname2 )[0]
content_doc = {
+ "_fname": fname,
"doc_id": slugify( title ),
"title": title,
}
- get_doc( content_doc, "index", fname )
- get_doc( content_doc, "targets", change_extn(fname,".targets") )
- get_doc( content_doc, "footnotes", change_extn(fname,".footnotes") )
- get_doc( content_doc, "content", change_extn(fname,".pdf"), binary=True )
+ get_doc( content_doc, "index", fname2 )
+ get_doc( content_doc, "targets", change_extn(fname2,".targets") )
+ get_doc( content_doc, "footnotes", change_extn(fname2,".footnotes") )
+ get_doc( content_doc, "content", change_extn(fname2,".pdf"), binary=True )
content_docs[ content_doc["doc_id"] ] = content_doc
# ---------------------------------------------------------------------
@@ -59,11 +66,13 @@ def get_content_docs():
resp = {}
for cdoc in content_docs.values():
cdoc2 = {
- "docId": cdoc["doc_id"],
+ "doc_id": cdoc["doc_id"],
"title": cdoc["title"],
}
if "content" in cdoc:
cdoc2["url"] = url_for( "get_content", doc_id=cdoc["doc_id"] )
+ if "targets" in cdoc:
+ cdoc2["targets"] = cdoc["targets"]
resp[ cdoc["doc_id"] ] = cdoc2
return jsonify( resp )
diff --git a/asl_rulebook2/webapp/main.py b/asl_rulebook2/webapp/main.py
index 0df4676..85da6b3 100644
--- a/asl_rulebook2/webapp/main.py
+++ b/asl_rulebook2/webapp/main.py
@@ -9,6 +9,7 @@ from flask import render_template, jsonify, abort
from asl_rulebook2.webapp import app, globvars, shutdown_event
from asl_rulebook2.webapp.content import load_content_docs
+from asl_rulebook2.webapp.search import init_search
from asl_rulebook2.webapp.utils import parse_int
# ---------------------------------------------------------------------
@@ -20,7 +21,9 @@ def init_webapp():
after that by the test suite, to reset the webapp before each test.
"""
# initialize the webapp
- load_content_docs()
+ logger = logging.getLogger( "startup" )
+ load_content_docs( logger )
+ init_search( logger )
# ---------------------------------------------------------------------
diff --git a/asl_rulebook2/webapp/search.py b/asl_rulebook2/webapp/search.py
new file mode 100644
index 0000000..2d88c78
--- /dev/null
+++ b/asl_rulebook2/webapp/search.py
@@ -0,0 +1,475 @@
+""" Manage the search engine. """
+
+import os
+import sqlite3
+import json
+import re
+import itertools
+import string
+import tempfile
+import logging
+import traceback
+
+from flask import request, jsonify
+
+from asl_rulebook2.utils import plural
+from asl_rulebook2.webapp import app
+from asl_rulebook2.webapp import content as webapp_content
+from asl_rulebook2.webapp.utils import make_config_path, make_data_path
+
+# path to the SQLite database that holds the search index (set by init_search())
+_sqlite_path = None
+# maps FTS rowid's to their index entry (populated by init_search())
+_fts_index_entries= None
+
+_logger = logging.getLogger( "search" )
+
+# these are used to highlight search matches (nb: the front-end looks for these)
+_BEGIN_HIGHLIGHT = "!@:"
+_END_HIGHLIGHT = ":@!"
+
+# NOTE: These regex's fix up content returned to us by the SQLite search engine (typically problems
+# with highlighting search terms). Each entry is a [ compiled regex, replacement ] pair, and the examples
+# below use (( and )) to represent the begin/end highlight markers.
+_FIXUP_TEXT_REGEXES = [
+    [ re.compile( fixup[0].format( _BEGIN_HIGHLIGHT, _END_HIGHLIGHT ) ),
+      fixup[1].format( _BEGIN_HIGHLIGHT, _END_HIGHLIGHT )
+    ]
+    for fixup in [
+        [ r"&{}(.+?){};", r"{}&\g<1>;{}" ], # HTML entities e.g. &((frac12)); -> ((&frac12;))
+        [ r"{}(.+?){}#", r"{}\g<1>#{}" ], # e.g. ((TH))# -> ((TH#))
+        [ r"{}U\.S{}\.", "{}U.S.{}" ], # ((U.S)). -> ((U.S.))
+    ]
+]
+
+# these are used to separate ruleref's in the FTS table (internal use only)
+_RULEREF_SEPARATOR = "-:-"
+
+# maps a search term to its replacement (a string) or its set of alternatives (populated by load_search_config())
+_SEARCH_TERM_ADJUSTMENTS = None
+
+# ---------------------------------------------------------------------
+
+@app.route( "/search", methods=["POST"] )
+def search() :
+    """Run a search.
+
+    The query is taken from the POST'ed form data. Returns the search results as JSON,
+    or a JSON object with an "error" message if the search failed.
+    """
+
+    # log the request
+    _logger.info( "SEARCH REQUEST:" )
+    args = dict( request.form.items() )
+    for key,val in args.items():
+        _logger.info( "- %s: %s", key, val )
+
+    # run the search
+    try:
+        return _do_search( args )
+    except Exception as exc: #pylint: disable=broad-except
+        msg = str( exc )
+        prefix = "fts5: "
+        if msg.startswith( prefix ):
+            # nb: this is a sqlite3.OperationalError - we remove the whole prefix (including the space)
+            msg = msg[ len(prefix) : ]
+        _logger.warning( "SEARCH ERROR: %s\n%s", args, traceback.format_exc() )
+        return jsonify( { "error": msg } )
+
+def _do_search( args ):
+    """Run a search against the FTS index and return the results as a JSON response.
+
+    args must contain the user's query under the "queryString" key. Errors (e.g. a malformed
+    FTS query) are raised as exceptions, and are expected to be handled by the caller.
+    """
+
+    def fixup_text( val ):
+        """Fix up problems with how SQLite highlighted the search terms."""
+        if val is None:
+            return None
+        for regex in _FIXUP_TEXT_REGEXES:
+            val = regex[0].sub( regex[1], val )
+        return val
+
+    # prepare the query
+    query_string = args[ "queryString" ].strip()
+    if query_string == "!:simulated-error:!":
+        raise RuntimeError( "Simulated error." ) # nb: for the test suite
+    fts_query_string, search_terms = _make_fts_query_string( query_string )
+    _logger.debug( "FTS query string: %s", fts_query_string )
+    def highlight( n ):
+        # NOTE: highlight() is an FTS extension function, and takes column numbers :-/
+        return "highlight(searchable,{},'{}','{}')".format( n, _BEGIN_HIGHLIGHT, _END_HIGHLIGHT )
+    sql = "SELECT rowid,doc_id,sr_type,rank,{},{},{},{} FROM searchable".format(
+        highlight(2), highlight(3), highlight(4), highlight(5)
+    )
+    sql += " WHERE searchable MATCH ?"
+    sql += " ORDER BY rank"
+
+    # run the search
+    # NOTE: We read all the rows up-front, so that the database connection is always closed.
+    conn = sqlite3.connect( _sqlite_path )
+    try:
+        rows = conn.execute( sql,
+            ( "{title subtitle content rulerefs}: " + fts_query_string, )
+        ).fetchall()
+    finally:
+        conn.close()
+
+    def get_col( sr, key, val ):
+        # nb: empty columns are left out of the search result
+        if val:
+            sr[key] = fixup_text( val )
+
+    # get the results
+    results = []
+    for row in rows:
+        if row[2] != "index":
+            _logger.error( "Unknown searchable row type (rowid=%d): %s", row[0], row[2] )
+            continue
+        index_entry = _fts_index_entries[ row[0] ]
+        result = {
+            "doc_id": row[1],
+            "sr_type": row[2],
+            "_score": - row[3],
+        }
+        get_col( result, "title", row[4] )
+        get_col( result, "subtitle", row[5] )
+        get_col( result, "content", row[6] )
+        if index_entry.get( "ruleids" ):
+            result["ruleids"] = index_entry["ruleids"]
+        if index_entry.get( "see_also" ):
+            result["see_also"] = index_entry["see_also"]
+        # NOTE: The ruleref captions were joined together when they were added to the search index,
+        # so we split them up again here, and match them up with the original ruleref's.
+        rulerefs = [ r.strip() for r in row[7].split(_RULEREF_SEPARATOR) ] if row[7] else []
+        assert len(rulerefs) == len(index_entry.get("rulerefs",[]))
+        if rulerefs:
+            result[ "rulerefs" ] = []
+            for i, ruleref in enumerate(rulerefs):
+                ruleref2 = {}
+                if "caption" in index_entry["rulerefs"][i]:
+                    assert ruleref.replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ) \
+                        == index_entry["rulerefs"][i]["caption"]
+                    ruleref2["caption"] = fixup_text( ruleref )
+                if "ruleids" in index_entry["rulerefs"][i]:
+                    ruleref2["ruleids"] = index_entry["rulerefs"][i]["ruleids"]
+                assert ruleref2
+                result["rulerefs"].append( ruleref2 )
+        results.append( result )
+
+    # fixup the results
+    results = _fixup_results_for_hash_terms( results, search_terms )
+
+    # adjust the sort order
+    results = _adjust_sort_order( results )
+
+    # return the results
+    _logger.debug( "Search results:" if len(results) > 0 else "Search results: none" )
+    for result in results:
+        # nb: a search result won't have a title if the index entry didn't have one
+        _logger.debug( "- %s (%.3f)",
+            result.get( "title", "" ).replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ),
+            result["_score"]
+        )
+    return jsonify( results )
+
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+# NOTE: A query string that matches any of these is treated as a raw FTS query.
+PASSTHROUGH_REGEXES = {
+    re.compile( pattern ) for pattern in (
+        r"\bAND\b",
+        r"\bOR\b",
+        r"\bNOT\b",
+        r"\((?![Rr]\))", # nb: an opening parenthesis, unless it's the start of "(r)"
+    )
+}
+
+def _make_fts_query_string( query_string ):
+    """Generate the SQLite query string.
+
+    SQLite's MATCH function recognizes a lot of special characters, which need
+    to be enclosed in double-quotes to disable.
+
+    Returns a ( FTS query string, search terms ) tuple. If the query looks like a raw FTS query,
+    it is passed through as-is and the search terms are None, otherwise the search terms are
+    returned as a list, where each entry is a string, or a list of strings that are OR'ed together.
+    """
+
+    # check if this looks like a raw FTS query
+    if any( regex.search(query_string) for regex in PASSTHROUGH_REGEXES ):
+        return query_string.strip(), None
+
+    # split the search string into words (taking quoted phrases into account)
+    ignore = app.config.get( "SQLITE_FTS_IGNORE_CHARS", ",;!?$" )
+    query_string = "".join( ch for ch in query_string if ch not in ignore )
+    terms = query_string.lower().split()
+    def is_open_phrase( term ):
+        """Check if a term is the start of a quoted phrase that hasn't been closed yet."""
+        # NOTE: A one-word quoted phrase (e.g. "foo") is already closed, and so must not
+        # swallow the words that follow it.
+        return term.startswith( '"' ) and not ( len(term) > 1 and term.endswith( '"' ) )
+    i = 0
+    while i < len(terms):
+        if i > 0 and is_open_phrase( terms[i-1] ):
+            # the previous term is an unfinished quoted phrase - add this word to it
+            terms[i-1] += " {}".format( terms[i] )
+            del terms[i]
+            if terms[i-1].endswith( '"' ):
+                # the quoted phrase is now complete
+                terms[i-1] = terms[i-1][1:-1]
+            continue
+        i += 1
+
+    # clean up quoted phrases
+    terms = [ t[1:] if t.startswith('"') else t for t in terms ]
+    terms = [ t[:-1] if t.endswith('"') else t for t in terms ]
+    terms = [ t.strip() for t in terms ]
+    terms = [ t for t in terms if t ]
+
+    # adjust search terms
+    for term_no, term in enumerate(terms):
+        aliases = _SEARCH_TERM_ADJUSTMENTS.get( term )
+        if not aliases:
+            continue
+        if isinstance( aliases, str ):
+            # the search term is replaced by a new one
+            terms[ term_no ] = aliases
+        elif isinstance( aliases, set ):
+            # the search term is replaced by multiple new ones (that will be OR'ed together)
+            # NOTE: We sort the terms so that the tests will work reliably.
+            terms[ term_no ] = sorted( aliases )
+        else:
+            # nb: asserting on a (non-empty) string always passes, so we raise an error explicitly
+            raise TypeError( "Unknown search alias type: {}".format( type(aliases) ) )
+
+    # fixup each term
+    def has_special_char( term ):
+        """Check if the term contains any special characters."""
+        for ch in term:
+            if ch in "*":
+                continue # nb: this lets prefix queries through
+            if ch.isspace() or ch in string.punctuation:
+                return True
+            if ord(ch) < 32 or ord(ch) > 127:
+                return True
+        return False
+    def fixup_terms( terms ):
+        """Fixup a list of terms."""
+        for term_no, term in enumerate(terms):
+            if isinstance( term, str ):
+                if has_special_char( term ):
+                    terms[term_no] = '"{}"'.format( term )
+            else:
+                fixup_terms( term )
+    fixup_terms( terms )
+
+    # return the final FTS query string
+    def term_string( term ):
+        if isinstance( term, str ):
+            return term
+        assert isinstance( term, list )
+        return "( {} )".format( " OR ".join( term ) )
+    return " AND ".join( term_string(t) for t in terms ), terms
+
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+def _fixup_results_for_hash_terms( results, search_terms ):
+    """Remove search results that were only matched because of a search term ending with a hash.
+
+    SQLite doesn't handle search terms that end with a hash particularly well.
+    Highlighted search terms are corrected in fixup_text(), but searching for e.g. "US#"
+    will also match "use" and "using", and those results are removed here.
+    """
+
+    # NOTE: We don't bother descending down into sub-terms.
+    if not search_terms:
+        return results
+
+    # figure out which search terms end with a hash
+    hash_terms = []
+    for term in search_terms:
+        if not isinstance( term, str ):
+            continue
+        if not ( term.startswith('"') and term.endswith('"') ):
+            continue
+        unquoted = term[1:-1]
+        if unquoted.endswith( "#" ):
+            hash_terms.append( unquoted[:-1].lower() )
+    if not hash_terms:
+        return results
+    if "us" in hash_terms:
+        hash_terms.extend( [ "use", "used", "using", "user" ] )
+
+    def has_valid_match( sr ):
+        # remove every incorrectly matched search term (e.g. ((K)) when searching for "K#")
+        buf = json.dumps( sr ).lower()
+        for term in hash_terms:
+            bad_match = "{}{}{}".format( _BEGIN_HIGHLIGHT, term, _END_HIGHLIGHT )
+            buf = buf.replace( bad_match, "_removed_" )
+        # the search result is only worth keeping if there are still some highlighted search terms
+        return _BEGIN_HIGHLIGHT in buf
+
+    return list( filter( has_valid_match, results ) )
+
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+def _adjust_sort_order( results ):
+    """Re-order the search results, so that the better matches appear first.
+
+    NOTE: Search results that get moved up are also removed from the list passed in.
+    """
+
+    def field( sr, key ):
+        return sr.get( key ) or ""
+
+    def has_rulerefs( sr ):
+        return len( sr.get( "rulerefs", [] ) ) > 0
+
+    # these define which search results we prefer (most-preferred first)
+    preferences = [
+        # search results whose title is an exact match
+        lambda sr: field(sr,"title").startswith( _BEGIN_HIGHLIGHT ) and field(sr,"title").endswith( _END_HIGHLIGHT ),
+        # search results whose title starts with a match
+        lambda sr: field(sr,"title").startswith( _BEGIN_HIGHLIGHT ),
+        # search results that have a match in the title
+        lambda sr: _BEGIN_HIGHLIGHT in field(sr,"title"),
+        # search results that have a match in the subtitle
+        lambda sr: _BEGIN_HIGHLIGHT in field(sr,"subtitle"),
+    ]
+
+    ordered = []
+    for is_preferred in preferences:
+        # move the search results that pass this filter to the new list
+        remaining = []
+        for sr in results:
+            # NOTE: We never prefer small entries (i.e. those that have no ruleref's)
+            # e.g. those that only contain a "see also".
+            if is_preferred( sr ) and has_rulerefs( sr ):
+                ordered.append( sr )
+            else:
+                remaining.append( sr )
+        results[:] = remaining
+
+    # include any remaining search results
+    ordered.extend( results )
+
+    return ordered
+
+# ---------------------------------------------------------------------
+
+def init_search( logger ):
+    """Initialize the search engine.
+
+    This creates the SQLite search index, loads it with the index entries of each
+    content doc, then loads the search config. Progress is reported to the logger.
+    """
+
+    # initialize
+    global _fts_index_entries
+    _fts_index_entries = {}
+
+    # initialize the database
+    global _sqlite_path
+    _sqlite_path = app.config.get( "SQLITE_PATH" )
+    if not _sqlite_path:
+        # FUDGE! We should be able to create a shared, in-memory database using this:
+        #   file::XYZ:?mode=memory&cache=shared
+        # but it doesn't seem to work (on Linux) and ends up creating a file with this name :-/
+        # We manually create a temp file, which has to have the same name each time, so that we don't
+        # keep creating a new database each time we start up. Sigh...
+        _sqlite_path = os.path.join( tempfile.gettempdir(), "asl-rulebook2.searchdb" )
+    # NOTE: We always start with a new database, since the search table is created unconditionally below.
+    if os.path.isfile( _sqlite_path ):
+        os.unlink( _sqlite_path )
+    logger.info( "Creating the search index: %s", _sqlite_path )
+    conn = sqlite3.connect( _sqlite_path )
+    try:
+        # NOTE: Storing everything in a single table allows FTS to rank search results based on
+        # the overall content, and also lets us do AND/OR queries across all searchable content.
+        conn.execute(
+            "CREATE VIRTUAL TABLE searchable USING fts5"
+            " ( doc_id, sr_type, title, subtitle, content, rulerefs, tokenize='porter unicode61' )"
+        )
+
+        # load the searchable content
+        logger.info( "Loading the search index..." )
+        conn.execute( "DELETE FROM searchable" )
+        curs = conn.cursor()
+        for cdoc in webapp_content.content_docs.values():
+            logger.info( "- Loading index file: %s", cdoc["_fname"] )
+            nrows = 0
+            for index_entry in cdoc["index"]:
+                rulerefs = _RULEREF_SEPARATOR.join( r.get("caption","") for r in index_entry.get("rulerefs",[]) )
+                # NOTE: We should really strip content before adding it to the search index, otherwise any HTML tags
+                # will need to be included in search terms. However, this means that the content returned by a query
+                # will be this stripped content. We could go back to the original data to get the original HTML content,
+                # but that means we would lose the highlighting of search terms that SQLite gives us. We opt to insert
+                # the original content, since none of it should contain HTML, anyway.
+                curs.execute(
+                    "INSERT INTO searchable (doc_id,sr_type,title,subtitle,content,rulerefs) VALUES (?,?,?,?,?,?)", (
+                        cdoc["doc_id"], "index",
+                        index_entry.get("title"), index_entry.get("subtitle"), index_entry.get("content"), rulerefs
+                ) )
+                # nb: search results are mapped back to their index entry via the rowid
+                _fts_index_entries[ curs.lastrowid ] = index_entry
+                index_entry["_fts_rowid"] = curs.lastrowid
+                nrows += 1
+            conn.commit()
+            logger.info( " - Loaded %s.", plural(nrows,"index entry","index entries") )
+        assert len(_fts_index_entries) == _get_row_count( conn, "searchable" )
+    finally:
+        # nb: searches open their own connection to the database
+        conn.close()
+
+    # load the search config
+    load_search_config( logger )
+
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+def load_search_config( logger ):
+    """Load the search config.
+
+    This loads the search replacements, aliases and synonyms, first from the config directory,
+    then from the data directory, and records them as search term adjustments, where each search term
+    maps to its replacement (a string) or to a set of terms that will be OR'ed together.
+    """
+
+    # initialize
+    global _SEARCH_TERM_ADJUSTMENTS
+    _SEARCH_TERM_ADJUSTMENTS = {}
+
+    def add_search_term_adjustment( key, vals ):
+        # make sure everything is lower-case
+        key = key.lower()
+        if isinstance( vals, str ):
+            vals = vals.lower()
+        elif isinstance( vals, set ):
+            vals = set( v.lower() for v in vals )
+        else:
+            # nb: asserting on a (non-empty) string always passes, so we raise an error explicitly
+            raise TypeError( "Unknown search alias type: {}".format( type(vals) ) )
+        # add the new search term adjustment
+        if key not in _SEARCH_TERM_ADJUSTMENTS:
+            _SEARCH_TERM_ADJUSTMENTS[ key ] = vals
+            return
+        # found a multiple definition - try to do something sensible
+        curr_vals = _SEARCH_TERM_ADJUSTMENTS[ key ]
+        logger.warning( " - Duplicate search alias: %s\n- current aliases = %s\n- new aliases = %s", key,
+            curr_vals, vals
+        )
+        if isinstance( curr_vals, set ) and isinstance( vals, set ):
+            curr_vals.update( vals )
+        else:
+            # NOTE: We can't merge a string with a set (we would add each of its characters),
+            # so the new definition replaces the current one.
+            _SEARCH_TERM_ADJUSTMENTS[ key ] = vals
+
+    # load the search replacements
+    def load_search_replacements( fname ):
+        if not os.path.isfile( fname ):
+            return
+        logger.info( "Loading search replacements: %s", fname )
+        with open( fname, "r", encoding="utf-8" ) as fp:
+            data = json.load( fp )
+        nitems = 0
+        for key, val in data.items():
+            if key.startswith( "_" ):
+                continue # nb: ignore comments
+            logger.debug( "- %s -> %s", key, val )
+            add_search_term_adjustment( key, val )
+            nitems += 1
+        logger.info( "- Loaded %s.", plural(nitems,"search replacement","search replacements") )
+    load_search_replacements( make_config_path( "search-replacements.json" ) )
+    load_search_replacements( make_data_path( "search-replacements.json" ) )
+
+    # load the search aliases
+    def load_search_aliases( fname ):
+        if not os.path.isfile( fname ):
+            return
+        logger.info( "Loading search aliases: %s", fname )
+        with open( fname, "r", encoding="utf-8" ) as fp:
+            data = json.load( fp )
+        nitems = 0
+        for keys, aliases in data.items():
+            if keys.startswith( "_" ):
+                continue # nb: ignore comments
+            logger.debug( "- %s -> %s", keys, " ; ".join(aliases) )
+            # nb: a key can hold several variations of a word (separated by slashes), and each one
+            # will match itself or any of the aliases
+            for key in keys.split( "/" ):
+                add_search_term_adjustment( key, set( itertools.chain( aliases, [key] ) ) )
+            nitems += 1
+        logger.info( "- Loaded %s.", plural(nitems,"search alias","search aliases") )
+    load_search_aliases( make_config_path( "search-aliases.json" ) )
+    load_search_aliases( make_data_path( "search-aliases.json" ) )
+
+    # load the search synonyms
+    def load_search_synonyms( fname ):
+        if not os.path.isfile( fname ):
+            return
+        logger.info( "Loading search synonyms: %s", fname )
+        with open( fname, "r", encoding="utf-8" ) as fp:
+            data = json.load( fp )
+        nitems = 0
+        for synonyms in data:
+            if isinstance( synonyms, str ):
+                continue # nb: ignore comments
+            logger.debug( "- %s", " ; ".join(synonyms) )
+            # nb: every word in the set will match any word in the set
+            synonyms = set( synonyms )
+            for term in synonyms:
+                add_search_term_adjustment( term, synonyms )
+            nitems += 1
+        logger.info( "- Loaded %s.", plural(nitems,"search synonym","search synonyms") )
+    load_search_synonyms( make_config_path( "search-synonyms.json" ) )
+    load_search_synonyms( make_data_path( "search-synonyms.json" ) )
+
+# ---------------------------------------------------------------------
+
+def _get_row_count( conn, table_name ):
+    """Return the number of rows in the specified table."""
+    sql = "SELECT count(*) FROM {}".format( table_name )
+    row = conn.execute( sql ).fetchone()
+    return row[0]
diff --git a/asl_rulebook2/webapp/static/ContentPane.js b/asl_rulebook2/webapp/static/ContentPane.js
index 398eb60..53fa05e 100644
--- a/asl_rulebook2/webapp/static/ContentPane.js
+++ b/asl_rulebook2/webapp/static/ContentPane.js
@@ -8,13 +8,13 @@ gMainApp.component( "content-pane", {
template: `