Implemented a basic search engine.

3 years ago · b387871bbe
parent 9d2495aa64
commit b387871bbe
27 changed files with 1311 additions and 105 deletions
--- a/asl_rulebook2/utils.py
+++ b/asl_rulebook2/utils.py
@ -1,8 +1,82 @@
 """ Miscellaneous utilities. """

+import os
 import pathlib
+import tempfile
 import re
 import math
+from io import StringIO
+from html.parser import HTMLParser
+
+# ---------------------------------------------------------------------
+
+class TempFile:
+    """Wrap a named temp file that stays on disk after being closed, until it is deleted."""
+
+    def __init__( self, mode="wb", extn=None, encoding=None ):
+        # nb: nothing is allocated until open() is called
+        self.mode, self.extn, self.encoding = mode, extn, encoding
+        self.temp_file = None
+        self.name = None
+
+    def open( self ):
+        """Create the underlying temp file and record its name."""
+        assert self.temp_file is None
+        # nb: text-mode files default to UTF-8, binary-mode files are not given an encoding
+        encoding = self.encoding or ( None if "b" in self.mode else "utf-8" )
+        self.temp_file = tempfile.NamedTemporaryFile(
+            mode=self.mode, encoding=encoding, suffix=self.extn,
+            delete=False # nb: the file is removed by close(), not when the handle is closed
+        )
+        self.name = self.temp_file.name
+
+    def close( self, delete ):
+        """Close the temp file, optionally removing it from disk."""
+        self.temp_file.close()
+        if not delete:
+            return
+        os.unlink( self.temp_file.name )
+
+    def write( self, data ):
+        """Pass data through to the underlying temp file."""
+        self.temp_file.write( data )
+
+    def __enter__( self ):
+        """Allocate the temp file when entering a "with" block."""
+        self.open()
+        return self
+
+    def __exit__( self, exc_type, exc_val, exc_tb ):
+        """Close and delete the temp file when leaving a "with" block."""
+        self.close( delete=True )
+
+# ---------------------------------------------------------------------
+
+def strip_html( val ):
+    """Strip HTML markup from a string, returning only its text content.
+
+    Falsy values (e.g. None or an empty string) are returned unchanged.
+    """
+
+    if not val:
+        return val
+
+    buf = StringIO()
+    class StripHtml( HTMLParser ):
+        """Collect the text content, discarding all markup."""
+        def __init__( self ):
+            super().__init__()
+            self.strict = False
+        def handle_data( self, data ):
+            buf.write( data )
+        def error( self, message ):
+            pass
+
+    # strip HTML
+    html_stripper = StripHtml()
+    html_stripper.feed( val )
+    # NOTE: We must close the parser to flush any text it is still buffering. It holds back
+    # trailing text that contains an unterminated "&" (e.g. "AT&T"), in case more input
+    # is coming, and that text would otherwise be silently dropped.
+    html_stripper.close()
+    return buf.getvalue()

 # ---------------------------------------------------------------------

@ -99,6 +173,10 @@ def append_text( buf, new ):
            buf += " "
    return buf + new

+def plural( n, name1, name2 ):
+    """Format a count with its noun, using name1 for exactly one item and name2 otherwise."""
+    noun = name2 if n != 1 else name1
+    return "{} {}".format( n, noun )
+
 def remove_quotes( val ):
    """Remove enclosing quotes from a string."""
    if val[0] in ('"',"'") and val[-1] == val[0]:
--- a/asl_rulebook2/webapp/init.py
+++ b/asl_rulebook2/webapp/init.py
@ -11,7 +11,7 @@ from flask import Flask
 import flask.cli
 import yaml

-from asl_rulebook2.webapp.config.constants import BASE_DIR
+from asl_rulebook2.webapp.config.constants import BASE_DIR, CONFIG_DIR

 shutdown_event = threading.Event()

@ -19,6 +19,7 @@ shutdown_event = threading.Event()

 def _load_config( fname, section ):
    """Load config settings from a file."""
+    fname = os.path.join( CONFIG_DIR, fname )
    if not os.path.isfile( fname ):
        return
    config_parser = configparser.ConfigParser()
@ -50,21 +51,12 @@ flask.cli.show_server_banner = lambda *args: None
 app = Flask( __name__ )

 # load the application configuration
-config_dir = os.path.join( BASE_DIR, "config" )
-_fname = os.path.join( config_dir, "app.cfg" )
-_load_config( _fname, "System" )
-
-# load any site configuration
-_fname = os.path.join( config_dir, "site.cfg" )
-_load_config( _fname, "Site Config" )
-
-# load any debug configuration
-_fname = os.path.join( config_dir, "debug.cfg" )
-if os.path.isfile( _fname ) :
-    _load_config( _fname, "Debug" )
+_load_config( "app.cfg", "System" )
+_load_config( "site.cfg", "Site Config" )
+_load_config( "debug.cfg", "Debug" )

 # initialize logging
-_fname = os.path.join( config_dir, "logging.yaml" )
+_fname = os.path.join( CONFIG_DIR, "logging.yaml" )
 if os.path.isfile( _fname ):
    with open( _fname, "r", encoding="utf-8" ) as fp:
        try:
--- a/asl_rulebook2/webapp/config/constants.py
+++ b/asl_rulebook2/webapp/config/constants.py
@ -7,3 +7,4 @@ APP_VERSION = "v0.1" # nb: also update setup.py
 APP_DESCRIPTION = "Search engine for the ASL Rulebook."

 BASE_DIR = os.path.abspath( os.path.join( os.path.dirname(__file__), ".." ) )
+CONFIG_DIR = os.path.join( BASE_DIR, "config" )
--- a/asl_rulebook2/webapp/config/search-aliases.json
+++ b/asl_rulebook2/webapp/config/search-aliases.json
@ -0,0 +1,25 @@
+{
+
+"_comment_": "This file defines search aliases.",
+"_comment_": "A key that appears in a query string will match itself or any of its associated values.",
+"_comment_": "  e.g. searching for 'entrenchments' will actually search for 'entrenchments OR foxhole OR trench OR ditch'",
+"_comment_": "These differ from search synonyms in that only the key word will trigger the replacement, not any word from the set.",
+"_comment_": "A user-defined version of this file in the data directory will also be loaded.",
+
+"latw": [
+    "atmm", "atr", "baz", "mol-p", "mol-projector", "piat", "pf", "pfk", "psk"
+],
+"fortification/fortifications": [
+    "cave", "a-t ditch", "foxhole", "sangar", "trench", "bunker", "minefield", "mines", "booby trap", "panji", "pillbox", "roadblock", "tetrahedron", "wire"
+],
+"entrenchment/entrenchments": [
+    "foxhole", "trench", "ditch"
+],
+"vehicle/vehicles": [
+    "tank", "halftrack", "half-track", "jeep", "carrier"
+],
+"illumination": [
+    "tarshell", "illuminating round", "trip flare"
+]
+
+}
--- a/asl_rulebook2/webapp/config/search-replacements.json
+++ b/asl_rulebook2/webapp/config/search-replacements.json
@ -0,0 +1,14 @@
+{
+
+"_comment_": "This file defines search replacements.",
+"_comment_": "Keys that appear in a query string will be replaced by the value.",
+"_comment_": "  e.g. searching for '1/2 MF' will actually search for '&frac12; MF'",
+"_comment_": "A user-defined version of this file in the data directory will also be loaded.",
+
+"1/2": "&frac12;",
+"3/4": "&frac34;",
+"3/8": "&frac38;",
+"5/8": "&frac58;",
+"(r)": "&reg;"
+
+}
--- a/asl_rulebook2/webapp/config/search-synonyms.json
+++ b/asl_rulebook2/webapp/config/search-synonyms.json
@ -0,0 +1,51 @@
+[
+
+"This file defines search synonyms.",
+"If a word appears in a query string, it will match any of the words in its set.",
+"  e.g. searching for 'finn gun' will actually search for '(finn OR finnish) AND gun'",
+"These differ from search aliases in that any word from a set will trigger the replacement.",
+"A user-defined version of this file in the data directory will also be loaded.",
+
+[ "u.s.", "america", "american" ],
+[ "usmc", "marine" ],
+[ "finn", "finnish" ],
+[ "romania", "romanian" ],
+[ "hungary", "hungarian" ],
+[ "slovakia", "slovakian" ],
+[ "croatia", "croatian" ],
+[ "bulgaria", "bulgarian" ],
+
+[ "dc", "demo charge", "demolition charge" ],
+[ "ft", "flamethrower", "flame-thrower" ],
+[ "baz", "bazooka" ],
+[ "pf", "panzerfaust" ],
+[ "psk", "panzerschreck" ],
+[ "wp", "white phosphorus" ],
+[ "mol", "molotov cocktail" ],
+[ "ovr", "overrun" ],
+[ "cc", "close combat" ],
+[ "thh", "t-h hero", "tank-hunter hero" ],
+[ "scw", "shaped-charge weapon" ],
+[ "sw", "support weapon" ],
+[ "mg", "machinegun", "machine-gun", "machine gun" ],
+[ "firelane", "fire-lane", "fire lane" ],
+[ "firegroup", "fire-group", "fire group" ],
+[ "lc", "landing craft" ],
+[ "ht", "halftrack", "half-track" ],
+[ "wa", "wall advantage" ],
+[ "hob", "heat of battle" ],
+[ "cg", "campaign game" ],
+[ "pbm", "pbem" ],
+
+[ "rb", "red barricades" ],
+[ "votg", "valor of the guards" ],
+[ "kgp", "kampfgruppe peiper" ],
+[ "kgs", "kampfgruppe scherer" ],
+[ "brt", "br:t", "blood reef tarawa" ],
+[ "pb", "pegasus bridge" ],
+
+[ "ammo", "ammunition" ],
+[ "armor", "armour" ],
+[ "color", "colour" ]
+
+]
--- a/asl_rulebook2/webapp/content.py
+++ b/asl_rulebook2/webapp/content.py
@ -2,6 +2,7 @@

 import os
 import io
+import json
 import glob

 from flask import jsonify, send_file, url_for, abort
@ -13,7 +14,7 @@ content_docs = None

 # ---------------------------------------------------------------------

-def load_content_docs():
+def load_content_docs( logger ):
    """Load the content documents from the data directory."""

    # initialize
@ -29,26 +30,32 @@ def load_content_docs():
        fname = os.path.join( dname, fname )
        if not os.path.isfile( fname ):
            return
-        kwargs = {}
-        kwargs["mode"] = "rb" if binary else "r"
-        if not binary:
-            kwargs["encoding"] = "utf-8"
-        with open( fname, **kwargs ) as fp:
-            content_doc[ key ] = fp.read()
+        if binary:
+            with open( fname, mode="rb" ) as fp:
+                data = fp.read()
+            logger.debug( "- Loaded \"%s\" file: #bytes=%d", key, len(data) )
+            content_doc[ key ] = data
+        else:
+            with open( fname, "r", encoding="utf-8" ) as fp:
+                content_doc[ key ] = json.load( fp )
+            logger.debug( "- Loaded \"%s\" file.", key )

    # load each content doc
+    logger.info( "Loading content docs: %s", dname )
    fspec = os.path.join( dname, "*.index" )
    for fname in glob.glob( fspec ):
-        fname = os.path.basename( fname )
-        title = os.path.splitext( fname )[0]
+        fname2 = os.path.basename( fname )
+        logger.info( "- %s", fname2 )
+        title = os.path.splitext( fname2 )[0]
        content_doc = {
+            "_fname": fname,
            "doc_id": slugify( title ),
            "title": title,
        }
-        get_doc( content_doc, "index", fname )
-        get_doc( content_doc, "targets", change_extn(fname,".targets") )
-        get_doc( content_doc, "footnotes", change_extn(fname,".footnotes") )
-        get_doc( content_doc, "content", change_extn(fname,".pdf"), binary=True )
+        get_doc( content_doc, "index", fname2 )
+        get_doc( content_doc, "targets", change_extn(fname2,".targets") )
+        get_doc( content_doc, "footnotes", change_extn(fname2,".footnotes") )
+        get_doc( content_doc, "content", change_extn(fname2,".pdf"), binary=True )
        content_docs[ content_doc["doc_id"] ] = content_doc

 # ---------------------------------------------------------------------
@ -59,11 +66,13 @@ def get_content_docs():
    resp = {}
    for cdoc in content_docs.values():
        cdoc2 = {
-            "docId": cdoc["doc_id"],
+            "doc_id": cdoc["doc_id"],
            "title": cdoc["title"],
        }
        if "content" in cdoc:
            cdoc2["url"] = url_for( "get_content", doc_id=cdoc["doc_id"] )
+        if "targets" in cdoc:
+            cdoc2["targets"] = cdoc["targets"]
        resp[ cdoc["doc_id"] ] = cdoc2
    return jsonify( resp )

--- a/asl_rulebook2/webapp/main.py
+++ b/asl_rulebook2/webapp/main.py
@ -9,6 +9,7 @@ from flask import render_template, jsonify, abort

 from asl_rulebook2.webapp import app, globvars, shutdown_event
 from asl_rulebook2.webapp.content import load_content_docs
+from asl_rulebook2.webapp.search import init_search
 from asl_rulebook2.webapp.utils import parse_int

 # ---------------------------------------------------------------------
@ -20,7 +21,9 @@ def init_webapp():
    after that by the test suite, to reset the webapp before each test.
    """
    # initialize the webapp
-    load_content_docs()
+    logger = logging.getLogger( "startup" )
+    load_content_docs( logger )
+    init_search( logger )

 # ---------------------------------------------------------------------

--- a/asl_rulebook2/webapp/search.py
+++ b/asl_rulebook2/webapp/search.py
@ -0,0 +1,475 @@
+""" Manage the search engine. """
+
+import os
+import sqlite3
+import json
+import re
+import itertools
+import string
+import tempfile
+import logging
+import traceback
+
+from flask import request, jsonify
+
+from asl_rulebook2.utils import plural
+from asl_rulebook2.webapp import app
+from asl_rulebook2.webapp import content as webapp_content
+from asl_rulebook2.webapp.utils import make_config_path, make_data_path
+
+_sqlite_path = None
+_fts_index_entries= None
+
+_logger = logging.getLogger( "search" )
+
+# these are used to highlight search matches (nb: the front-end looks for these)
+_BEGIN_HIGHLIGHT = "!@:"
+_END_HIGHLIGHT = ":@!"
+
+# NOTE: These regexes fix up content returned to us by the SQLite search engine (typically problems
+# with highlighting search terms). Each entry is a [ compiled regex, replacement ] pair, built by
+# substituting the begin/end highlight markers into the templates below. In the examples,
+# "((" and "))" stand for the begin/end highlight markers.
+_FIXUP_TEXT_REGEXES = [
+    [ re.compile( fixup[0].format( _BEGIN_HIGHLIGHT, _END_HIGHLIGHT ) ),
+      fixup[1].format( _BEGIN_HIGHLIGHT, _END_HIGHLIGHT )
+    ]
+    for fixup in [
+        [ r"&{}(.+?){};", r"{}&\g<1>;{}" ], # HTML entities e.g. &((frac12)); -> ((&frac12;))
+        [ r"{}(.+?){}#", r"{}\g<1>#{}" ], # e.g. ((TH))# -> ((TH#))
+        [ r"{}U\.S{}\.", "{}U.S.{}" ], # ((U.S)). -> ((U.S.))
+    ]
+]
+
+# these are used to separate ruleref's in the FTS table (internal use only)
+_RULEREF_SEPARATOR = "-:-"
+
+_SEARCH_TERM_ADJUSTMENTS = None
+
+# ---------------------------------------------------------------------
+
+@app.route( "/search", methods=["POST"] )
+def search() :
+    """Run a search.
+
+    The search parameters are read from the POST'ed form fields. Returns the JSON search results or,
+    if anything goes wrong, a JSON object that contains an "error" message.
+    """
+
+    # log the request
+    _logger.info( "SEARCH REQUEST:" )
+    args = dict( request.form.items() )
+    for key,val in args.items():
+        _logger.info( "- %s: %s", key, val )
+
+    # run the search
+    try:
+        return _do_search( args )
+    except Exception as exc: #pylint: disable=broad-except
+        msg = str( exc )
+        # NOTE: FTS errors come back as a sqlite3.OperationalError. We remove the entire prefix
+        # (including its trailing space), so that the message doesn't start with a space.
+        prefix = "fts5: "
+        if msg.startswith( prefix ):
+            msg = msg[ len(prefix) : ]
+        _logger.warning( "SEARCH ERROR: %s\n%s", args, traceback.format_exc() )
+        return jsonify( { "error": msg } )
+
+def _do_search( args ):
+    """Run a search against the FTS index.
+
+    args holds the request parameters (the search query is in "queryString").
+    Returns a JSON response containing the list of search results.
+    """
+
+    def fixup_text( val ):
+        # correct problems with how SQLite highlighted the search terms
+        if val is None:
+            return None
+        for regex in _FIXUP_TEXT_REGEXES:
+            val = regex[0].sub( regex[1], val )
+        return val
+
+    def highlight( n ):
+        # NOTE: highlight() is an FTS extension function, and takes column numbers :-/
+        return "highlight(searchable,{},'{}','{}')".format( n, _BEGIN_HIGHLIGHT, _END_HIGHLIGHT )
+
+    def get_col( sr, key, val ):
+        # nb: columns that have no value are left out of the search result
+        if val:
+            sr[key] = fixup_text( val )
+
+    # prepare the search
+    query_string = args[ "queryString" ].strip()
+    if query_string == "!:simulated-error:!":
+        raise RuntimeError( "Simulated error." ) # nb: for the test suite
+    fts_query_string, search_terms = _make_fts_query_string( query_string )
+    _logger.debug( "FTS query string: %s", fts_query_string )
+    # nb: columns 2-5 of the "searchable" table are title, subtitle, content and rulerefs
+    sql = "SELECT rowid,doc_id,sr_type,rank,{},{},{},{} FROM searchable".format(
+        highlight(2), highlight(3), highlight(4), highlight(5)
+    )
+    sql += " WHERE searchable MATCH ?"
+    sql += " ORDER BY rank"
+
+    # run the search
+    # NOTE: We make sure the database connection gets closed, even if the search fails.
+    results = []
+    conn = sqlite3.connect( _sqlite_path )
+    try:
+        curs = conn.execute( sql,
+            ( "{title subtitle content rulerefs}: " + fts_query_string, )
+        )
+        # get the results
+        for row in curs:
+            if row[2] != "index":
+                _logger.error( "Unknown searchable row type (rowid=%d): %s", row[0], row[2] )
+                continue
+            index_entry = _fts_index_entries[ row[0] ]
+            result = {
+                "doc_id": row[1],
+                "sr_type": row[2],
+                "_score": - row[3],
+            }
+            get_col( result, "title", row[4] )
+            get_col( result, "subtitle", row[5] )
+            get_col( result, "content", row[6] )
+            if index_entry.get( "ruleids" ):
+                result["ruleids"] = index_entry["ruleids"]
+            if index_entry.get( "see_also" ):
+                result["see_also"] = index_entry["see_also"]
+            # nb: the ruleref captions were joined together when they were added to the FTS table
+            rulerefs = [ r.strip() for r in row[7].split(_RULEREF_SEPARATOR) ] if row[7] else []
+            assert len(rulerefs) == len(index_entry.get("rulerefs",[]))
+            if rulerefs:
+                result[ "rulerefs" ] = []
+                for i, ruleref in enumerate(rulerefs):
+                    ruleref2 = {}
+                    if "caption" in index_entry["rulerefs"][i]:
+                        assert ruleref.replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ) \
+                               == index_entry["rulerefs"][i]["caption"]
+                        ruleref2["caption"] = fixup_text( ruleref )
+                    if "ruleids" in index_entry["rulerefs"][i]:
+                        ruleref2["ruleids"] = index_entry["rulerefs"][i]["ruleids"]
+                    assert ruleref2
+                    result["rulerefs"].append( ruleref2 )
+            results.append( result )
+    finally:
+        conn.close()
+
+    # fixup the results
+    results = _fixup_results_for_hash_terms( results, search_terms )
+
+    # adjust the sort order
+    results = _adjust_sort_order( results )
+
+    # return the results
+    _logger.debug( "Search results:" if len(results) > 0 else "Search results: none" )
+    for result in results:
+        # nb: a search result won't have a title if the index entry doesn't have one
+        _logger.debug( "- %s (%.3f)",
+           result.get( "title", "" ).replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ),
+           result["_score"]
+        )
+    return jsonify( results )
+
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+# NOTE: If a query string matches any of these, it is assumed to be a raw FTS query,
+# and is passed through to SQLite as-is.
+PASSTHROUGH_REGEXES = set([
+    re.compile( r"\bAND\b" ),
+    re.compile( r"\bOR\b" ),
+    re.compile( r"\bNOT\b" ),
+    re.compile( r"\((?![Rr]\))" ), # nb: an opening parenthesis, unless it's the start of "(r)" or "(R)"
+])
+
+def _make_fts_query_string( query_string ):
+    """Generate the SQLite query string.
+
+    SQLite's MATCH function recognizes a lot of special characters, which need
+    to be enclosed in double-quotes to disable.
+
+    Returns a tuple containing the FTS query string, and the list of search terms it was built from.
+    If the query looks like a raw FTS query, it is returned as-is, and the search terms are None.
+    """
+
+    # check if this looks like a raw FTS query
+    if any( regex.search(query_string) for regex in PASSTHROUGH_REGEXES ):
+        return query_string.strip(), None
+
+    # split the search string into words (taking quoted phrases into account)
+    ignore = app.config.get( "SQLITE_FTS_IGNORE_CHARS", ",;!?$" )
+    query_string = "".join( ch for ch in query_string if ch not in ignore )
+    terms = query_string.lower().split()
+    def is_open_phrase( term ):
+        """Check if the term starts a quoted phrase that hasn't been closed yet."""
+        # NOTE: A quoted single word (e.g. "foo") is already closed, and must not swallow
+        # the word that follows it. A lone double-quote opens a phrase, it doesn't close one.
+        return term.startswith( '"' ) and not ( len(term) > 1 and term.endswith( '"' ) )
+    i = 1
+    while i < len(terms):
+        if is_open_phrase( terms[i-1] ):
+            # the previous term is an unterminated quoted phrase - append this word to it
+            terms[i-1] += " {}".format( terms[i] )
+            del terms[i]
+            if terms[i-1].startswith( '"' ) and terms[i-1].endswith( '"' ):
+                terms[i-1] = terms[i-1][1:-1]
+            continue
+        i += 1
+
+    # clean up quoted phrases
+    terms = [ t[1:] if t.startswith('"') else t for t in terms ]
+    terms = [ t[:-1] if t.endswith('"') else t for t in terms ]
+    terms = [ t.strip() for t in terms ]
+    terms = [ t for t in terms if t ]
+
+    # adjust search terms
+    for term_no, term in enumerate(terms):
+        aliases = _SEARCH_TERM_ADJUSTMENTS.get( term )
+        if not aliases:
+            continue
+        if isinstance( aliases, str ):
+            # the search term is replaced by a new one
+            terms[ term_no ] = aliases
+        elif isinstance( aliases, set ):
+            # the search term is replaced by multiple new ones (that will be OR'ed together)
+            # NOTE: We sort the terms so that the tests will work reliably.
+            terms[ term_no ] = sorted( aliases )
+        else:
+            # NOTE: Asserting on the message string would always pass, so we raise explicitly.
+            raise AssertionError( "Unknown search alias type: {}".format( type(aliases) ) )
+
+    # fixup each term
+    def has_special_char( term ):
+        """Check if the term contains any special characters."""
+        for ch in term:
+            if ch in "*":
+                continue
+            if ch.isspace() or ch in string.punctuation:
+                return True
+            if ord(ch) < 32 or ord(ch) > 127:
+                return True
+        return False
+    def fixup_terms( terms ):
+        """Fixup a list of terms."""
+        for term_no, term in enumerate(terms):
+            if isinstance( term, str ):
+                if has_special_char( term ):
+                    terms[term_no] = '"{}"'.format( term )
+            else:
+                fixup_terms( term )
+    fixup_terms( terms )
+
+    # return the final FTS query string
+    def term_string( term ):
+        # nb: a list of terms is OR'ed together
+        if isinstance( term, str ):
+            return term
+        assert isinstance( term, list )
+        return "( {} )".format( " OR ".join( term ) )
+    return " AND ".join( term_string(t) for t in terms ), terms
+
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+def _fixup_results_for_hash_terms( results, search_terms ):
+    """Remove search results that only matched because a trailing hash was ignored.
+
+    SQLite doesn't handle search terms that end with a hash particularly well.
+    Highlighted search terms are corrected in fixup_text(), but searching for e.g. "US#"
+    will also match "use" and "using" - such results are removed here.
+    """
+
+    # figure out which search terms end with a hash
+    # NOTE: Only top-level terms are checked, we don't descend down into sub-terms.
+    if not search_terms:
+        return results
+    hash_terms = []
+    for term in search_terms:
+        if not isinstance( term, str ):
+            continue
+        if not ( term.startswith( '"' ) and term.endswith( '"' ) ):
+            continue
+        term = term[1:-1]
+        if term.endswith( "#" ):
+            hash_terms.append( term[:-1].lower() )
+    if not hash_terms:
+        return results
+    if "us" in hash_terms:
+        hash_terms.extend( [ "use", "used", "using", "user" ] )
+
+    # these are the incorrectly highlighted matches (e.g. ((K)) when searching for "K#")
+    bogus_matches = [
+        "{}{}{}".format( _BEGIN_HIGHLIGHT, term, _END_HIGHLIGHT )
+        for term in hash_terms
+    ]
+
+    def has_real_match( sr ):
+        # discount every incorrectly matched search term
+        text = json.dumps( sr ).lower()
+        for bogus in bogus_matches:
+            text = text.replace( bogus, "_removed_" )
+        # the search result is kept if some highlighted search terms still remain
+        return _BEGIN_HIGHLIGHT in text
+
+    return [ sr for sr in results if has_real_match( sr ) ]
+
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+def _adjust_sort_order( results ):
+    """Re-order the search results, so that the preferred ones come first.
+
+    NOTE: The preferred search results are removed from the caller's list.
+    """
+
+    def title( sr ):
+        return sr.get( "title" ) or ""
+    def subtitle( sr ):
+        return sr.get( "subtitle" ) or ""
+
+    # these filters are applied in order of preference
+    filters = [
+        # prefer search results whose title is an exact match
+        lambda sr: title(sr).startswith( _BEGIN_HIGHLIGHT ) and title(sr).endswith( _END_HIGHLIGHT ),
+        # prefer search results whose title starts with a match
+        lambda sr: title(sr).startswith( _BEGIN_HIGHLIGHT ),
+        # prefer search results that have a match in the title
+        lambda sr: _BEGIN_HIGHLIGHT in title(sr),
+        # prefer search results that have a match in the subtitle
+        lambda sr: _BEGIN_HIGHLIGHT in subtitle(sr),
+    ]
+
+    preferred = []
+    for is_preferred in filters:
+        # move the results that pass this filter to the new list
+        remaining = []
+        for sr in results:
+            # NOTE: We never prefer small entries (i.e. those that have no rulerefs)
+            # e.g. those that only contain a "see also".
+            if is_preferred( sr ) and len( sr.get("rulerefs",[]) ) > 0:
+                preferred.append( sr )
+            else:
+                remaining.append( sr )
+        results[:] = remaining
+
+    # include any remaining search results
+    preferred.extend( results )
+
+    return preferred
+
+# ---------------------------------------------------------------------
+
+def init_search( logger ):
+    """Initialize the search engine.
+
+    This re-creates the SQLite FTS database, loads it with the index entries from each content doc,
+    then loads the search config. Progress is reported via the specified logger.
+    """
+
+    # initialize
+    global _fts_index_entries
+    _fts_index_entries = {}
+
+    # initialize the database
+    global _sqlite_path
+    _sqlite_path = app.config.get( "SQLITE_PATH" )
+    if not _sqlite_path:
+        # FUDGE! We should be able to create a shared, in-memory database using this:
+        #   file::XYZ:?mode=memory&cache=shared
+        # but it doesn't seem to work (on Linux) and ends up creating a file with this name :-/
+        # We manually create a temp file, which has to have the same name each time, so that we don't
+        # keep creating a new database each time we start up. Sigh...
+        _sqlite_path = os.path.join( tempfile.gettempdir(), "asl-rulebook2.searchdb" )
+    if os.path.isfile( _sqlite_path ):
+        os.unlink( _sqlite_path )
+    logger.info( "Creating the search index: %s", _sqlite_path )
+    conn = sqlite3.connect( _sqlite_path )
+    try:
+        # NOTE: Storing everything in a single table allows FTS to rank search results based on
+        # the overall content, and also lets us do AND/OR queries across all searchable content.
+        conn.execute(
+            "CREATE VIRTUAL TABLE searchable USING fts5"
+            " ( doc_id, sr_type, title, subtitle, content, rulerefs, tokenize='porter unicode61' )"
+        )
+
+        # load the searchable content
+        logger.info( "Loading the search index..." )
+        conn.execute( "DELETE FROM searchable" )
+        curs = conn.cursor()
+        for cdoc in webapp_content.content_docs.values():
+            logger.info( "- Loading index file: %s", cdoc["_fname"] )
+            nrows = 0
+            for index_entry in cdoc["index"]:
+                rulerefs = _RULEREF_SEPARATOR.join( r.get("caption","") for r in index_entry.get("rulerefs",[]) )
+                # NOTE: We should really strip content before adding it to the search index, otherwise any HTML tags
+                # will need to be included in search terms. However, this means that the content returned by a query
+                # will be this stripped content. We could go back to the original data to get the original HTML content,
+                # but that means we would lose the highlighting of search terms that SQLite gives us. We opt to insert
+                # the original content, since none of it should contain HTML, anyway.
+                curs.execute(
+                    "INSERT INTO searchable (doc_id,sr_type,title,subtitle,content,rulerefs) VALUES (?,?,?,?,?,?)", (
+                        cdoc["doc_id"], "index",
+                        index_entry.get("title"), index_entry.get("subtitle"), index_entry.get("content"), rulerefs
+                ) )
+                # nb: we remember which index entry each FTS row came from
+                _fts_index_entries[ curs.lastrowid ] = index_entry
+                index_entry["_fts_rowid"] = curs.lastrowid
+                nrows += 1
+            conn.commit()
+            logger.info( "  - Loaded %s.", plural(nrows,"index entry","index entries") )
+        assert len(_fts_index_entries) == _get_row_count( conn, "searchable" )
+    finally:
+        # NOTE: Searches open their own connection to the database, so we release this one.
+        conn.close()
+
+    # load the search config
+    load_search_config( logger )
+
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+def load_search_config( logger ):
+    """Load the search config.
+
+    This rebuilds the table of search term adjustments from the search replacements, aliases and synonyms
+    files. For each, the file in the config directory is loaded first, then any file of the same name
+    in the data directory. Files that don't exist are ignored.
+    """
+
+    # initialize
+    global _SEARCH_TERM_ADJUSTMENTS
+    _SEARCH_TERM_ADJUSTMENTS = {}
+
+    def add_search_term_adjustment( key, vals ):
+        # make sure everything is lower-case
+        key = key.lower()
+        if isinstance( vals, str ):
+            vals = vals.lower()
+        elif isinstance( vals, set ):
+            vals = set( v.lower() for v in vals )
+        else:
+            # NOTE: Asserting on the message string would always pass, so a value of the wrong type
+            # (e.g. from a user-defined file) would be silently accepted. We skip it instead.
+            logger.warning( "  - Unknown search alias type: %s (key=%s)", type(vals), key )
+            return
+        # add the new search term adjustment
+        if key not in _SEARCH_TERM_ADJUSTMENTS:
+            _SEARCH_TERM_ADJUSTMENTS[ key ] = vals
+        else:
+            # found a multiple definition - try to do something sensible
+            logger.warning( "  - Duplicate search alias: %s\n- current aliases = %s\n- new aliases = %s", key,
+                _SEARCH_TERM_ADJUSTMENTS[key], vals
+            )
+            if isinstance( _SEARCH_TERM_ADJUSTMENTS[key], str ):
+                _SEARCH_TERM_ADJUSTMENTS[ key ] = vals
+            else:
+                assert isinstance( _SEARCH_TERM_ADJUSTMENTS[key], set )
+                _SEARCH_TERM_ADJUSTMENTS[ key ].update( vals )
+
+    # load the search replacements
+    def load_search_replacements( fname ):
+        if not os.path.isfile( fname ):
+            return
+        logger.info( "Loading search replacements: %s", fname )
+        with open( fname, "r", encoding="utf-8" ) as fp:
+            data = json.load( fp )
+        nitems = 0
+        for key, val in data.items():
+            if key.startswith( "_" ):
+                continue # nb: ignore comments
+            logger.debug( "- %s -> %s", key, val )
+            add_search_term_adjustment( key, val )
+            nitems += 1
+        logger.info( "- Loaded %s.", plural(nitems,"search replacement","search replacements") )
+    load_search_replacements( make_config_path( "search-replacements.json" ) )
+    load_search_replacements( make_data_path( "search-replacements.json" ) )
+
+    # load the search aliases
+    def load_search_aliases( fname ):
+        if not os.path.isfile( fname ):
+            return
+        logger.info( "Loading search aliases: %s", fname )
+        with open( fname, "r", encoding="utf-8" ) as fp:
+            data = json.load( fp )
+        nitems = 0
+        for keys, aliases in data.items():
+            if keys.startswith( "_" ):
+                continue # nb: ignore comments
+            logger.debug( "- %s -> %s", keys, " ; ".join(aliases) )
+            # nb: a key can contain multiple words (separated by "/"), each one matches itself or any alias
+            for key in keys.split( "/" ):
+                add_search_term_adjustment( key, set( itertools.chain( aliases, [key] ) ) )
+            nitems += 1
+        logger.info( "- Loaded %s.", plural(nitems,"search alias","search aliases") )
+    load_search_aliases( make_config_path( "search-aliases.json" ) )
+    load_search_aliases( make_data_path( "search-aliases.json" ) )
+
+    # load the search synonyms
+    def load_search_synonyms( fname ):
+        if not os.path.isfile( fname ):
+            return
+        logger.info( "Loading search synonyms: %s", fname )
+        with open( fname, "r", encoding="utf-8" ) as fp:
+            data = json.load( fp )
+        nitems = 0
+        for synonyms in data:
+            if isinstance( synonyms, str ):
+                continue # nb: ignore comments
+            logger.debug( "- %s", " ; ".join(synonyms) )
+            synonyms = set( synonyms )
+            # nb: every word in the set will match any word in the set
+            for term in synonyms:
+                add_search_term_adjustment( term, synonyms )
+            nitems += 1
+        logger.info( "- Loaded %s.", plural(nitems,"search synonym","search synonyms") )
+    load_search_synonyms( make_config_path( "search-synonyms.json" ) )
+    load_search_synonyms( make_data_path( "search-synonyms.json" ) )
+
+# ---------------------------------------------------------------------
+
+def _get_row_count( conn, table_name ):
+    """Return how many rows the specified table contains."""
+    # nb: the table name can't be passed as a bound parameter, so it's inserted into the SQL
+    sql = "SELECT count(*) FROM {}".format( table_name )
+    row = conn.execute( sql ).fetchone()
+    return row[0]
--- a/asl_rulebook2/webapp/static/ContentPane.js
+++ b/asl_rulebook2/webapp/static/ContentPane.js
@ -8,13 +8,13 @@ gMainApp.component( "content-pane", {

    template: `
 <tabbed-pages ref="tabbedPages">
-    <tabbed-page v-for="doc in contentDocs" :tabId=doc.docId :caption=doc.title >
+    <tabbed-page v-for="doc in contentDocs" :tabId=doc.doc_id :caption=doc.title :key=doc.doc_id >
        <content-doc :doc=doc />
    </tabbed-page>
 </tabbed-pages>`,

    mounted() {
-        gEventBus.on( "show-content-doc", (docId) => {
+        gEventBus.on( "show-target", (docId, target) => { //eslint-disable-line no-unused-vars
            this.$refs.tabbedPages.activateTab( docId ) ; // nb: tabId == docId
        } ) ;
    },
@ -27,14 +27,37 @@ gMainApp.component( "content-doc", {

    props: [ "doc" ],
    data() { return {
+        target: null,
        noContent: gUrlParams.get( "no-content" ),
    } ; },

    template: `
-<div class="content-doc">
-    <div v-if=noContent class="disabled"> Content disabled. </div>
-    <iframe v-else-if=doc.url :src=doc.url />
+<div class="content-doc" :data-target=target >
+    <div v-if=noContent class="disabled"> Content disabled. <div v-if=target>target = {{target}}</div> </div>
+    <iframe v-else-if=doc.url :src=makeDocUrl />
    <div v-else class="disabled"> No content. </div>
 </div>`,

+    created() {
+        gEventBus.on( "show-target", (docId, target) => {
+            if ( docId != this.doc.doc_id )
+                return ;
+            // FUDGE! We give the tab time to show itself before we scroll to the target.
+            setTimeout( () => {
+                this.target = target ;
+            }, 50 ) ;
+        } ) ;
+    },
+
+    computed: {
+
+        makeDocUrl() {
+            let url = this.doc.url ;
+            if ( this.target )
+                url += "#nameddest=" + this.target ;
+            return url ;
+        }
+
+    },
+
 } ) ;
--- a/asl_rulebook2/webapp/static/MainApp.js
+++ b/asl_rulebook2/webapp/static/MainApp.js
@ -12,6 +12,10 @@ $(document).ready( () => {
    gMainApp.mount( "#main-app" ) ;
 } ) ;

+// FUDGE! Can't seem to get access to the content docs via gMainApp, so we make them available
+// to the rest of the program via this global variable :-/
+export let gContentDocs = null ;
+
 // --------------------------------------------------------------------

 gMainApp.component( "main-app", {
@ -47,23 +51,27 @@ gMainApp.component( "main-app", {

    methods: {

-        getContentDocs: (self) => new Promise( (resolve, reject) => {
-            // get the content docs
-            $.getJSON( gGetContentDocsUrl, (resp) => { //eslint-disable-line no-undef
-                self.contentDocs = resp ;
-                let docIds = Object.keys( resp ) ;
-                if ( docIds.length > 0 ) {
-                    Vue.nextTick( () => {
-                        gEventBus.emit( "show-content-doc", docIds[0] ) ; // FIXME! which one do we choose?
-                    } ) ;
-                }
-                resolve() ;
-            } ).fail( (xhr, status, errorMsg) => {
-                const msg = "Couldn't get the content docs." ;
-                showErrorMsg( msg + " <div class='pre'>" + errorMsg + "</div>" ) ;
-                reject( msg )
+        getContentDocs( self ) {
+            return new Promise( (resolve, reject) => {
+                // get the content docs
+                $.getJSON( gGetContentDocsUrl, (resp) => { //eslint-disable-line no-undef
+                    if ( gUrlParams.get( "add-empty-doc" ) )
+                        resp["empty"] = { "doc_id": "empty", "title": "Empty document" } ; // nb: for testing porpoises
+                    gContentDocs = self.contentDocs = resp ;
+                    let docIds = Object.keys( resp ) ;
+                    if ( docIds.length > 0 ) {
+                        Vue.nextTick( () => {
+                            gEventBus.emit( "show-target", docIds[0], null ) ; // FIXME! which one do we choose?
+                        } ) ;
+                    }
+                    resolve() ;
+                } ).fail( (xhr, status, errorMsg) => {
+                    const msg = "Couldn't get the content docs." ;
+                    showErrorMsg( msg + " <div class='pre'>" + errorMsg + "</div>" ) ;
+                    reject( msg )
+                } ) ;
            } ) ;
-        } ),
+        },

    },

--- a/asl_rulebook2/webapp/static/NavPane.js
+++ b/asl_rulebook2/webapp/static/NavPane.js
@ -4,17 +4,28 @@ import { gMainApp, gEventBus } from "./MainApp.js" ;

 gMainApp.component( "nav-pane", {

+    data() { return {
+        seqNo: 0, // nb: for the test suite
+    } ; },
+
    template: `
 <tabbed-pages>
    <tabbed-page tabId="search" caption="Search" data-display="flex" >
        <search-box id="search-box" @search=onSearch />
-        <search-results id="search-results" />
+        <search-results id="search-results" :data-seqno=seqNo />
    </tabbed-page>
 </tabbed-pages>`,

+    mounted() {
+        gEventBus.on( "search-done", () => {
+            // notify the test suite that the search results are now available
+            this.seqNo += 1 ;
+        } ) ;
+    },
+
    methods: {

-        onSearch: (queryString) => {
+        onSearch( queryString ) {
            gEventBus.emit( "search", queryString ) ;
        },

--- a/asl_rulebook2/webapp/static/SearchPane.js
+++ b/asl_rulebook2/webapp/static/SearchPane.js
@ -1,5 +1,5 @@
 import { gMainApp, gEventBus } from "./MainApp.js" ;
-import { IndexSearchResult } from "./SearchResult.js" ;
+import { fixupSearchHilites } from "./utils.js" ;

 // --------------------------------------------------------------------

@ -30,7 +30,7 @@ gMainApp.component( "search-box", {
    },

    methods: {
-        onKeyUp: function( evt ) {
+        onKeyUp( evt ) {
            if ( evt.keyCode == 13 )
                this.$refs["submit"].click() ;
        }
@ -43,12 +43,15 @@ gMainApp.component( "search-box", {
 gMainApp.component( "search-results", {

    data() { return {
-        searchResults: [],
+        searchResults: null,
+        errorMsg: null,
    } ; },

    template: `<div>
-<div v-for="sr in searchResults" :key=sr.key >
-    <index-sr v-if="sr.srType == 'index'" :sr=sr />
+<div v-if=errorMsg class="error"> Search error: <div class="pre"> {{errorMsg}} </div> </div>
+<div v-else-if="searchResults != null && searchResults.length == 0" class="no-results"> Nothing was found. </div>
+<div v-else v-for="sr in searchResults" :key=sr.key >
+    <index-sr v-if="sr.sr_type == 'index'" :sr=sr />
    <div v-else> ??? </div>
 </div>
 </div>`,
@ -60,22 +63,39 @@ gMainApp.component( "search-results", {
    methods: {

        onSearch( queryString ) {
-            // generate some dummy search results
-            let searchResults = [] ;
-            for ( let i=0 ; i < queryString.length ; ++i ) {
-                let buf = [ "Search result #" + (1+i) ] ;
-                let nItems = Math.floor( Math.sqrt( 100 * Math.random() ) ) - 1 ;
-                if ( nItems > 0 ) {
-                    buf.push( "<ul style='padding-left:1em;'>" ) ;
-                    for ( let j=0 ; j < nItems ; ++j )
-                        buf.push( "<li> item " + (1+j) ) ;
-                    buf.push( "</ul>" ) ;
+            // submit the search request
+            const onError = (errorMsg) => {
+                this.errorMsg = errorMsg ;
+                Vue.nextTick( () => {
+                    gEventBus.emit( "search-done" ) ;
+                } ) ;
+            } ;
+            this.errorMsg = null ;
+            $.ajax( { url: gSearchUrl, type: "POST", //eslint-disable-line no-undef
+                data: { queryString: queryString },
+                dataType: "json",
+            } ).done( (resp) => {
+                // check if there was an error
+                if ( resp.error ) {
+                    onError( resp.error ) ;
+                    return ;
                }
-                searchResults.push(
-                    new IndexSearchResult( i, buf.join("") )
-                ) ;
-            }
-            this.searchResults = searchResults ;
+                // adjust highlighted text
+                resp.forEach( (sr) => {
+                    [ "title", "subtitle", "content" ].forEach( function( key ) {
+                        if ( sr[key] )
+                            sr[key] = fixupSearchHilites( sr[key] ) ;
+                    } ) ;
+                } ) ;
+                // load the search results into the UI
+                this.$el.scrollTop = 0;
+                this.searchResults = resp ;
+                Vue.nextTick( () => {
+                    gEventBus.emit( "search-done" ) ;
+                } ) ;
+            } ).fail( (xhr, status, errorMsg) => {
+                onError( errorMsg ) ;
+            } ) ;
        },

    },
--- a/asl_rulebook2/webapp/static/SearchResult.js
+++ b/asl_rulebook2/webapp/static/SearchResult.js
@ -1,23 +1,80 @@
-import { gMainApp } from "./MainApp.js" ;
+import { gMainApp, gEventBus, gContentDocs } from "./MainApp.js" ;
+import { fixupSearchHilites } from "./utils.js" ;

 // --------------------------------------------------------------------

-export class IndexSearchResult {
-    constructor( key, content ) {
-        this.key = key ;
-        this.srType = "index" ;
-        this.content = content ;
-    }
-}
-
-// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
 gMainApp.component( "index-sr", {

    props: [ "sr" ],

+    // NOTE: This component shows a single "index" search result: its title/subtitle, content,
+    // "see also" line, ruleid's and ruleref's (each part is optional).
+    // nb: the ruleref's are keyed by their position in the list, since v-for keys must be
+    // primitive values (and a ruleref is an object).
    template: `
-<div class="sr index-sr" v-html=sr.content />
-`,
+<div class="sr index-sr" >
+    <div v-if="sr.title || sr.subtitle" class="title" >
+        <span v-if=sr.title class="title" v-html=sr.title />
+        <span v-if=sr.subtitle class="subtitle" v-html=sr.subtitle />
+    </div>
+    <div class="body">
+        <div v-if=sr.content class="content" v-html=sr.content />
+        <div v-if=makeSeeAlso v-html=makeSeeAlso class="see-also" />
+        <div v-if=sr.ruleids class="ruleids" >
+            <ruleid v-for="rid in sr.ruleids" :docId=sr.doc_id :ruleId=rid :key=rid />
+        </div>
+        <ul v-if=sr.rulerefs class="rulerefs" >
+            <li v-for="(rref, rrefIdx) in sr.rulerefs" :key=rrefIdx >
+                <span v-if=rref.caption class="caption" v-html=fixupHilites(rref.caption) />
+                <ruleid v-for="rid in rref.ruleids" :docId=sr.doc_id :ruleId=rid :key=rid />
+            </li>
+        </ul>
+    </div>
+</div>`,
+
+    computed: {
+        makeSeeAlso() {
+            // generate the "see also" line (or null if the search result has nothing to show)
+            // nb: an empty list is truthy, so we also check its length
+            if ( this.sr.see_also && this.sr.see_also.length > 0 )
+                return "See also: " + this.sr.see_also.join( ", " ) ;
+            return null ;
+        },
+    },
+
+    methods: {
+        fixupHilites( val ) {
+            // convert the search engine's highlight markers to HTML
+            return fixupSearchHilites( val ) ;
+        },
+    },
+
+} ) ;
+
+// --------------------------------------------------------------------
+
+gMainApp.component( "ruleid", {
+
+    // NOTE: This component shows a ruleid, as a clickable link if it's a known target
+    // in the content doc it belongs to (identified by docId).
+    props: [ "docId", "ruleId" ],
+    data() { return {
+        target: null, // nb: gets set if the ruleid is a known target
+    } ; },
+
+    template: `<span class="ruleid" v-bind:class="{unknown:!target}">[<a v-if=target @click=onClick>{{ruleId}}</a><span v-else>{{ruleId}}</span>]</span>`,
+
+    created() {
+        // figure out which rule is being referenced
+        // NOTE: For ruleid's of the form "A12.3-.4", we want to target "A12.3".
+        let pos = this.ruleId.indexOf( "-" ) ;
+        let ruleId = (pos >= 0) ? this.ruleId.substring( 0, pos ) : this.ruleId ;
+        // check if the rule is one we know about
+        // nb: gContentDocs is null until the content docs have been loaded
+        let contentDoc = gContentDocs ? gContentDocs[ this.docId ] : null ;
+        if ( contentDoc && contentDoc.targets && contentDoc.targets[ ruleId ] )
+            this.target = ruleId ;
+    },
+
+    methods: {
+        onClick() {
+            // show the target
+            gEventBus.emit( "show-target", this.docId, this.target ) ;
+        },
+    },

 } ) ;
--- a/asl_rulebook2/webapp/static/TabbedPages.js
+++ b/asl_rulebook2/webapp/static/TabbedPages.js
@ -13,7 +13,7 @@ gMainApp.component( "tabbed-pages", {
 <div class="tabbed-pages">
    <slot />
    <div class="tab-strip">
-        <div v-for="tab in tabs" :data-tabid=tab.tabId @click=onTabClicked class="tab" v-bind:class="{'active': tab.tabId == activeTabId}" >
+        <div v-for="tab in tabs" :data-tabid=tab.tabId @click=onTabClicked class="tab" v-bind:class="{'active': tab.tabId == activeTabId}" :key=tab.tabId >
            {{tab.caption}}
        </div>
    </div>
@ -44,12 +44,12 @@ gMainApp.component( "tabbed-pages", {

    methods: {

-        onTabClicked: function( evt ) {
+        onTabClicked( evt ) {
            // activate the selected tab
            this.activateTab( evt.target.dataset.tabid ) ;
        },

-        activateTab: function( tabId ) {
+        activateTab( tabId ) {
            // activate the specified tab
            this.activeTabId = tabId ;
            $( this.$el ).find( ".tabbed-page" ).each( function() {
--- a/asl_rulebook2/webapp/static/css/SearchPane.css
+++ b/asl_rulebook2/webapp/static/css/SearchPane.css
@ -6,3 +6,5 @@

 /* search results */
 #search-results { flex-grow: -1 ; margin: 8px 0 2px 0 ; overflow-y: auto ; }
+#search-results .no-results { font-style: italic ; color: #666 ; }
+#search-results .error .pre { font-family: monospace ; margin: 0.25em 0 0 0.5em ; }
--- a/asl_rulebook2/webapp/static/css/SearchResult.css
+++ b/asl_rulebook2/webapp/static/css/SearchResult.css
@ -1 +1,13 @@
-#search-results .sr { margin: 0 10px 2px 0 ; border: 1px dotted #666 ; padding: 5px ; }
+#search-results .sr { margin: 0 10px 2px 0 ; padding: 5px ; }
+#search-results .sr .hilite { padding: 0 2px ; background: #ffa ; }
+
+#search-results .index-sr .title { background: #e0e0e0 ; border-bottom: 1px solid #ccc ; padding: 2px 5px ; font-weight: bold ; }
+#search-results .index-sr .subtitle { padding: 2px 5px ; font-weight: normal ; font-size: 80% ; font-style: italic ; }
+#search-results .index-sr .body { padding: 2px 5px 0 5px ; font-size: 80% ; }
+#search-results .index-sr .content { color: #444 ; }
+#search-results .index-sr .see-also { color: #444 ; }
+#search-results .index-sr ul.rulerefs { margin-left: 1.2em ; }
+#search-results .index-sr ul.rulerefs .caption { padding-right: 0.5em ; }
+#search-results .index-sr .ruleid { margin-right: 0.25em ; font-style: italic ; color: #444 ; }
+#search-results .index-sr .ruleid.unknown {  color: #888 ; }
+#search-results .index-sr .ruleid a { cursor: pointer ; }
--- a/asl_rulebook2/webapp/static/utils.js
+++ b/asl_rulebook2/webapp/static/utils.js
@ -1,3 +1,21 @@
+// --------------------------------------------------------------------
+
+const _HILITE_REGEXES = [ /!@:/g, /:@!/g ] ;
+
+export function fixupSearchHilites( val )
+{
+    // NOTE: The search engine brackets each matched search term in the content it returns
+    // with a pair of special markers. We convert those markers to HTML span's here.
+    // nb: null/undefined are passed straight back to the caller.
+    if ( val === null || val === undefined )
+        return val ;
+    const [ openMarker, closeMarker ] = _HILITE_REGEXES ;
+    let buf = val.replace( openMarker, "<span class='hilite'>" ) ;
+    buf = buf.replace( closeMarker, "</span>" ) ;
+    return buf ;
+}
+
+// --------------------------------------------------------------------
+
 export function showInfoMsg( msg ) { _doShowNotificationMsg( "notice", msg ) ; }
 export function showWarningMsg( msg ) { _doShowNotificationMsg( "warning", msg ) ; }
 export function showErrorMsg( msg ) { _doShowNotificationMsg( "error", msg ) ; }
--- a/asl_rulebook2/webapp/templates/index.html
+++ b/asl_rulebook2/webapp/templates/index.html
@ -43,6 +43,7 @@

 <script>
 gGetContentDocsUrl = "{{ url_for( 'get_content_docs') }}" ;
+gSearchUrl = "{{ url_for( 'search' ) }}" ;
 </script>

 <script type="module" src="{{ url_for( 'static', filename='MainApp.js' ) }}"></script>
--- a/asl_rulebook2/webapp/tests/fixtures/simple/simple.index
+++ b/asl_rulebook2/webapp/tests/fixtures/simple/simple.index
@ -11,7 +11,7 @@
 { "title": "Backblast",
  "ruleids": [ "C13.8" ],
  "rulerefs": [
-    { "caption": "Huts", "ruleids": [ "G5.62" ] },
+    { "caption": "HEAT", "ruleids": [ "C13.8" ] },
    { "caption": "RCL", "ruleids": [ "C12.3-.4" ] }
  ]
 },
@ -31,11 +31,9 @@
  "content": "Also known as \"running <em>really</em> fast.\"",
  "rulerefs": [
    { "caption": "ENEMY Guard Automatic Action", "ruleids": [ "S6.303" ] },
-    { "caption": "Manhandling", "ruleids": [ "C10.3" ] },
-    { "caption": "NA for Pathfinders", "ruleids": [ "T1.2" ] },
-    { "caption": "S? NA", "ruleids": [ "S3.321" ] },
-    { "caption": "Water Shortage", "ruleids": [ "RCG21" ] },
-    { "caption": "Wire NA", "ruleids": [ "B26.46" ] }
+    { "ruleids": [ "C10.3" ] },
+    { "caption": "NA in Advance Phase", "ruleids": [ "A4.7" ] },
+    { "caption": "'S?' is \"&lt;NA&gt;\"" }
  ]
 },

@ -54,6 +52,7 @@
 },

 { "title": "Firepower",
+  "content": "The U.S. has lots of this.",
  "ruleids": [ "A1.21" ],
  "see_also": [ "FP" ]
 },
@ -71,6 +70,12 @@

 { "title": "Identity, Vehicular",
  "ruleids": [ "D1.4" ]
+},
+
+{ "title": "HTML ti<u>tl</u>e",
+  "subtitle": "HTML subti<u>tl</u>e",
+  "content": "HTML con<u>ten</u>t: 2&frac34; MP",
+  "see_also": [ "HTML see-<u>al</u>so" ]
 }

 ]
--- a/asl_rulebook2/webapp/tests/fixtures/simple/simple.pdf
+++ b/asl_rulebook2/webapp/tests/fixtures/simple/simple.pdf
--- a/asl_rulebook2/webapp/tests/fixtures/simple/simple.targets
+++ b/asl_rulebook2/webapp/tests/fixtures/simple/simple.targets
@ -1,15 +1,15 @@
 {

-"A4.7": { "caption": "ADVANCE PHASE", "page_no": 1, "pos": [72,702] },
-"C13.8": { "caption": "BACKBLAST", "page_no": 1, "pos": [72,404] },
-"A3.8": { "caption": "CLOSE COMBAT PHASE (CCPh)", "page_no": 1, "pos": [72.97] },
+"A4.7": { "caption": "ADVANCE PHASE", "page_no": 1, "pos": [72,718] },
+"C13.8": { "caption": "BACKBLAST", "page_no": 1, "pos": [72,503] },
+"A3.8": { "caption": "CLOSE COMBAT PHASE (CCPh)", "page_no": 1, "pos": [72,292] },

-"A4.5": { "caption": "DOUBLE TIME", "page_no": 2, "pos": [72,702] },
-"A19.1": { "caption": "EXPERIENCE LEVEL RATING (ELR)", "page_no": 2, "pos": [72.404] },
-"A1.21": { "caption": "FIREPOWER (FP)", "page_no": 2, "pos": [72,97] },
+"A4.5": { "caption": "DOUBLE TIME", "page_no": 2, "pos": [72,718] },
+"A19.1": { "caption": "EXPERIENCE LEVEL RATING (ELR)", "page_no": 2, "pos": [72,503] },
+"A1.21": { "caption": "FIREPOWER (FP)", "page_no": 2, "pos": [72,292] },

-"A1.21": { "caption": "FIREPOWER (FP)", "page_no": 3, "pos": [72,702] },
-"E11.21": { "caption": "GAPS", "page_no": 3, "pos":[72,404] },
-"C8.3": { "caption": "HEAT (H)", "page_no": 3, "pos": [72,97] }
+"E11.21": { "caption": "GAPS", "page_no": 3, "pos":[72,718] },
+"C8.3": { "caption": "HEAT (H)", "page_no": 3, "pos": [72,503] },
+"D1.4": { "caption": "IDENTITY & GROUND PRESSURE", "page_no": 3, "pos": [72,292] }

 }
--- a/asl_rulebook2/webapp/tests/test_search.py
+++ b/asl_rulebook2/webapp/tests/test_search.py
@ -0,0 +1,298 @@
+""" Test search. """
+
+import re
+import logging
+
+from selenium.webdriver.common.keys import Keys
+
+from asl_rulebook2.utils import strip_html
+from asl_rulebook2.webapp.search import load_search_config, _make_fts_query_string
+from asl_rulebook2.webapp.tests.utils import init_webapp, select_tabbed_page, get_classes, \
+    wait_for, find_child, find_children
+
+# ---------------------------------------------------------------------
+
+def test_search( webapp, webdriver ):
+    """Test the basic search scenarios: nothing found, a failed search, and real results."""
+
+    # initialize
+    webapp.control_tests.set_data_dir( "simple" )
+    init_webapp( webapp, webdriver )
+
+    # a query that matches nothing should report that nothing was found
+    assert _do_search( "oogah, boogah!" ) is None
+
+    # a search that fails should show the error message
+    error_msg = _do_search( "!:simulated-error:!" )
+    assert "Simulated error." in error_msg
+
+    # a query that matches more than one index entry
+    expected_ccph = {
+        "sr_type": "index",
+        "title": "CCPh", "subtitle": "Close Combat Phase",
+        "ruleids": [ "A3.8" ],
+        "rulerefs": [
+            { "caption": "((ENEMY)) Attacks", "ruleids": [ "S11.5" ] },
+            { "caption": "dropping SW before CC", "ruleids": [ "A4.43" ] },
+        ]
+    }
+    expected_double_time = {
+        "sr_type": "index",
+        "title": "Double Time",
+        "content": "Also known as \"running really fast.\"",
+        "see_also": [ "CX" ],
+        "ruleids": [ "A4.5-.51", "S6.222" ],
+        "rulerefs": [
+            { "caption": "((ENEMY)) Guard Automatic Action", "ruleids": [ "S6.303" ] },
+            { "ruleids": [ "C10.3" ] },
+            { "caption": "NA in Advance Phase", "ruleids": [ "A4.7" ] },
+            { "caption": "'S?' is \"<NA>\"" },
+        ]
+    }
+    assert _do_search( "enemy" ) == [ expected_ccph, expected_double_time ]
+
+    # a query that matches a single index entry
+    expected_gaps = {
+        "sr_type": "index",
+        "title": "((Gaps)), Convoy",
+        "ruleids": [ "E11.21" ],
+    }
+    assert _do_search( "gap" ) == [ expected_gaps ]
+
+# ---------------------------------------------------------------------
+
+def test_content_fixup( webapp, webdriver ):
+    """Test that content returned by the search engine is fixed up correctly."""
+
+    # initialize
+    webapp.control_tests.set_data_dir( "simple" )
+    init_webapp( webapp, webdriver )
+
+    def do_test( query_string, key, expected ):
+        # nb: each of these queries is expected to find exactly one search result
+        results = _do_search( query_string )
+        assert len(results) == 1
+        assert results[0][key] == expected
+
+    # search for a fraction
+    do_test( "3/4", "content", "HTML content: 2((\u00be)) MP" )
+
+    # search for something that ends with a hash
+    do_test( "H#", "title", "((H#))" )
+
+    # search for "U.S."
+    do_test( "U.S.", "content", "The ((U.S.)) has lots of this." )
+
+# ---------------------------------------------------------------------
+
+def test_targets( webapp, webdriver ):
+    """Test that clicking on a ruleid in the search results shows its target."""
+
+    # initialize
+    webapp.control_tests.set_data_dir( "simple" )
+    init_webapp( webapp, webdriver, no_content=1, add_empty_doc=1 )
+
+    def get_active_tab_id():
+        # return the ID of the content tab that is currently active
+        tab = find_child( "#content .tab-strip .tab.active" )
+        return tab.get_attribute( "data-tabid" )
+
+    def get_current_target():
+        # return the target currently set for the "simple" content doc
+        doc = find_child( "#content .tabbed-page[data-tabid='simple'] .content-doc" )
+        return doc.get_attribute( "data-target" )
+
+    def do_test( query_string, sel, expected ):
+        # start off with the dummy document selected
+        select_tabbed_page( "#content", "empty" )
+        # do the search, then click on a target
+        _do_search( query_string )
+        find_child( "#search-results {}".format( sel ) ).click()
+        # wait for the "simple" tab to be activated, and the expected target to be set
+        wait_for( 2,
+            lambda: get_active_tab_id() == "simple" and get_current_target() == expected
+        )
+
+    # do the tests
+    do_test( "CC", ".sr .ruleids .ruleid a", "A3.8" )
+    do_test( "time", ".sr .rulerefs .ruleid a", "A4.7" )
+
+# ---------------------------------------------------------------------
+
+def test_make_fts_query_string():
+    """Test converting what the user typed into an FTS query string."""
+
+    # initialize
+    load_search_config( logging.getLogger("_unknown_") )
+
+    # NOTE: Each test case is the user's query string, followed by the expected FTS query string.
+    test_cases = [
+        # some query strings
+        ( "", "" ),
+        ( "hello", "hello" ),
+        ( "  hello,  world!  ", "hello AND world" ),
+        ( "foo 1+2 A-T K# bar", 'foo AND "1+2" AND "a-t" AND "k#" AND bar' ),
+        ( "a'b a''b", "\"a'b\" AND \"a''b\"" ),
+        ( 'foo "set dc" bar', 'foo AND "set dc" AND bar' ),
+        # some quoted phrases
+        ( '""', '' ),
+        ( ' " " ', '' ),
+        ( '"hello world"', '"hello world"' ),
+        ( '  foo  "hello  world"  bar  ', 'foo AND "hello world" AND bar' ),
+        ( ' foo " xyz " bar ', 'foo AND xyz AND bar' ),
+        ( ' foo " xyz 123 " bar ', 'foo AND "xyz 123" AND bar' ),
+        # some incorrectly quoted phrases
+        ( '"', '' ),
+        ( ' " " " ', '' ),
+        ( ' a "b c d e', 'a AND "b c d e"' ),
+        ( ' a b" c d e ', 'a AND b AND c AND d AND e' ),
+        # pass-through
+        ( "AND", "AND" ),
+        ( " OR", "OR" ),
+        ( "OR ", "OR" ),
+        ( "foo OR bar", "foo OR bar" ),
+        ( "(a OR b)", "(a OR b)" ),
+        # search replacements
+        ( "1/2 3/4 3/8 5/8", '"&frac12;" AND "&frac34;" AND "&frac38;" AND "&frac58;"' ),
+        ( "(r)", '"&reg;"' ),
+        # search aliases
+        ( "entrenchment", "( ditch OR entrenchment OR foxhole OR trench )" ),
+        ( "entrenchments", "( ditch OR entrenchments OR foxhole OR trench )" ),
+        ( "foxhole", "foxhole" ),
+        # search synonyms
+        ( "armor", "( armor OR armour )" ),
+        ( "american big armor", '( america OR american OR "u.s." ) AND big AND ( armor OR armour )' ),
+    ]
+
+    # check each test case
+    for query, expected in test_cases:
+        fts_query_string, _ = _make_fts_query_string(query)
+        assert fts_query_string == expected
+
+# ---------------------------------------------------------------------
+
+def _do_search( query_string ):
+    """Submit a search via the UI, and unload whatever gets shown in response.
+
+    Returns the error message (a string) if the search failed, None if nothing was found,
+    otherwise the list of search results.
+    """
+
+    def get_seq_no():
+        # nb: attribute values are returned as strings, but these need to be compared
+        # numerically (as strings, "10" is not greater than "9").
+        return int( find_child( "#search-results" ).get_attribute( "data-seqno" ) )
+
+    # submit the search
+    select_tabbed_page( "#nav", "search" )
+    elem = find_child( "input#query-string" )
+    elem.clear()
+    elem.send_keys( query_string )
+    seq_no = get_seq_no()
+    elem.send_keys( Keys.RETURN )
+
+    # unload the results
+    # nb: the UI bumps the sequence number once the search results are available
+    wait_for( 2, lambda: get_seq_no() > seq_no )
+    elem = find_child( "#search-results .error" )
+    if elem:
+        return elem.text # nb: string = error message
+    elem = find_child( "#search-results .no-results" )
+    if elem:
+        assert elem.text == "Nothing was found."
+        return None # nb: None = no results
+    results = _unload_search_results()
+    assert isinstance( results, list ) # nb: list = search results
+    return results
+
+def _unload_search_results():
+    """Unload the search results currently shown in the UI, as a list of dict's."""
+
+    def get_elem_text( elem ):
+        """Return the element's content as text, with highlights shown as ((...))."""
+        html = elem.get_attribute( "innerHTML" )
+        # nb: the highlights must be converted before the HTML tags are stripped out
+        html = re.sub(
+            r'<span class="hilite">(.*?)</span>',
+            lambda mo: "((" + mo.group(1) + "))",
+            html
+        )
+        return strip_html( html.strip() )
+
+    def unload_elem( result, key, elem ):
+        """Save the element's text under the specified key (returns True if there was any)."""
+        text = get_elem_text( elem ) if elem else None
+        if not text:
+            return False
+        result[key] = text
+        return True
+
+    def unload_ruleids( result, key, parent ):
+        """Save the ruleid's found under the parent element (if there are any)."""
+        if not parent:
+            return
+        ruleids = []
+        for elem in find_children( ".ruleid", parent ):
+            text = get_elem_text( elem )
+            # nb: ruleid's are shown in the UI wrapped in square brackets
+            assert text.startswith( "[" ) and text.endswith( "]" )
+            ruleids.append( text[1:-1] )
+        if ruleids:
+            result[key] = ruleids
+
+    def unload_rulerefs( result, key, parent ):
+        """Save the ruleref's found under the parent element (if there are any)."""
+        if not parent:
+            return
+        rulerefs = []
+        for item in find_children( "li", parent ):
+            ruleref = {}
+            unload_elem( ruleref, "caption", find_child(".caption",item) )
+            unload_ruleids( ruleref, "ruleids", item )
+            rulerefs.append( ruleref )
+        if rulerefs:
+            result[key] = rulerefs
+
+    def unload_index_sr( sr ): #pylint: disable=possibly-unused-variable
+        """Unload an "index" search result."""
+        result = {}
+        for key, sel in [ ("title","span.title"), ("subtitle",".subtitle"), ("content",".content") ]:
+            unload_elem( result, key, find_child(sel,sr) )
+        if unload_elem( result, "see_also", find_child(".see-also",sr) ):
+            # convert the "See also: a, b, c" line back into a list
+            prefix = "See also:"
+            assert result["see_also"].startswith( prefix )
+            result["see_also"] = [ s.strip() for s in result["see_also"][len(prefix):].split( "," ) ]
+        unload_ruleids( result, "ruleids", find_child(".ruleids",sr) )
+        unload_rulerefs( result, "rulerefs", find_child(".rulerefs",sr) )
+        return result
+
+    # unload the search results
+    results = []
+    for elem in find_children( "#search-results .sr"):
+        # nb: each search result has 2 classes: "sr", and one that identifies its type (e.g. "index-sr")
+        classes = get_classes( elem )
+        classes.remove( "sr" )
+        assert len(classes) == 1 and classes[0].endswith( "-sr" )
+        sr_type = classes[0][:-3]
+        # nb: the function to unload this type of search result is looked up by name
+        unload_func = locals()[ "unload_{}_sr".format( sr_type ) ]
+        result = unload_func( elem )
+        result["sr_type"] = sr_type
+        results.append( result )
+
+    return results
--- a/asl_rulebook2/webapp/tests/utils.py
+++ b/asl_rulebook2/webapp/tests/utils.py
@ -17,6 +17,10 @@ def init_webapp( webapp, webdriver, **options ):
    global _webapp, _webdriver
    _webapp = webapp
    _webdriver = webdriver
+    options = {
+        key.replace("_","-"): val
+        for key, val in options.items()
+    }

    # load the webapp
    if get_pytest_option("webdriver") == "chrome" and get_pytest_option("headless"):
@ -39,6 +43,18 @@ def _wait_for_webapp():

 # ---------------------------------------------------------------------

+def select_tabbed_page( parent_sel, tab_id ):
+    """Click on a tab, then wait for its page to be shown."""
+    container = find_child( ".tabbed-pages", find_child(parent_sel) )
+    tab_sel = ".tab-strip .tab[data-tabid='{}']".format( tab_id )
+    page_sel = ".tabbed-page[data-tabid='{}']".format( tab_id )
+    find_child( tab_sel, container ).click()
+    def is_page_visible():
+        # nb: the page has to exist *and* be visible
+        page = find_child( page_sel, container )
+        return page and page.is_displayed()
+    wait_for( 2, is_page_visible )
+
+# ---------------------------------------------------------------------
+
 def get_nav_panels():
    """Get the available nav panels."""
    return _get_tab_ids( "#nav .tab-strip" )
@ -72,6 +88,11 @@ def find_children( sel, parent=None ):
    except NoSuchElementException:
        return None

+def get_classes( elem ):
+    """Get the element's classes, as a list of names.
+
+    An empty list is returned if the element has no classes.
+    """
+    # nb: Selenium returns None for an attribute that isn't there
+    classes = elem.get_attribute( "class" )
+    return classes.split() if classes else []
+
 # ---------------------------------------------------------------------

 def wait_for( timeout, func ):
--- a/asl_rulebook2/webapp/utils.py
+++ b/asl_rulebook2/webapp/utils.py
@ -1,8 +1,24 @@
 """Helper functions."""

+import os
 import pathlib
 import re

+from asl_rulebook2.webapp import app, CONFIG_DIR
+
+# ---------------------------------------------------------------------
+
+def make_data_path( path ):
+    """Generate a path relative to the data directory (None if no data directory is configured)."""
+    data_dir = app.config.get( "DATA_DIR" )
+    return os.path.join( data_dir, path ) if data_dir else None
+
+def make_config_path( path ):
+    """Return the location of the specified file within the config directory."""
+    fname = os.path.join( CONFIG_DIR, path )
+    return fname
+
 # ---------------------------------------------------------------------

 def change_extn( fname, extn ):
--- a/bin/add_pdf_dests.py
+++ b/bin/add_pdf_dests.py
@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+""" Add named destinations to a PDF file. """
+
+import subprocess
+import json
+import time
+import datetime
+
+import click
+
+from asl_rulebook2.utils import TempFile
+
+# ---------------------------------------------------------------------
+
+def _escape_ps_string( val ):
+    """Escape a value so that it can be embedded in a PostScript string literal."""
+    # nb: backslashes must be escaped first, otherwise we would escape our own escapes
+    for ch in ( "\\", "(", ")" ):
+        val = val.replace( ch, "\\" + ch )
+    return val
+
+@click.command()
+@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
+@click.option( "--title", help="Document title." )
+@click.option( "--targets","-t","targets_fname", required=True, type=click.Path(dir_okay=False),
+    help="Target definition file."
+)
+@click.option( "--yoffset", default=5, help="Offset to add to y co-ordinates." )
+@click.option( "--output","-o","output_fname", required=True, type=click.Path(dir_okay=False),
+    help="Output PDF file."
+)
+@click.option( "--gs","gs_path", default="gs",  help="Path to the Ghostscript executable." )
+def main( pdf_file, title, targets_fname, yoffset, output_fname, gs_path ):
+    """Add named destinations to a PDF file."""
+
+    # NOTE: We generate a file of pdfmark's (one named destination per target, plus the document
+    # title, if one was specified), then get Ghostscript to combine it with the input PDF.
+
+    # load the targets
+    # nb: each ruleid maps to an object that has a "page_no" and a "pos" (an x/y pair)
+    with open( targets_fname, "r", encoding="utf-8" ) as fp:
+        targets = json.load( fp )
+
+    with TempFile( mode="w" ) as temp_file:
+
+        # generate the pdfmarks
+        print( "Generating the pdfmarks..." )
+        if title:
+            # nb: the title is embedded in a PostScript string, so any parentheses
+            # and backslashes in it must be escaped
+            print( "[ /Title ({})".format( _escape_ps_string( title ) ), file=temp_file )
+        else:
+            print( "[", file=temp_file )
+        print( "  /DOCINFO pdfmark", file=temp_file )
+        print( file=temp_file )
+        for ruleid, target in targets.items():
+            xpos, ypos = target["pos"]
+            print( "[ /Dest /{} /Page {} /View [/XYZ {} {}] /DEST pdfmark".format(
+                ruleid, target["page_no"], xpos, ypos+yoffset
+            ), file=temp_file )
+        print( file=temp_file )
+        # nb: we close the temp file now, so that everything has been written out before
+        # Ghostscript reads it (the file is deleted when we leave the "with" block).
+        temp_file.close( delete=False )
+
+        # generate the pdfmark'ed document
+        print( "Generating the pdfmark'ed document..." )
+        print( "- {} => {}".format( pdf_file, output_fname ) )
+        args = [ gs_path, "-q", "-dBATCH", "-dNOPAUSE", "-sDEVICE=pdfwrite" ]
+        args.extend( [ "-o", output_fname ] )
+        args.extend( [ "-f", pdf_file ] )
+        args.append( temp_file.name )
+        start_time = time.time()
+        subprocess.run( args, check=True )
+        elapsed_time = time.time() - start_time
+        print( "- Elapsed time: {}".format( datetime.timedelta(seconds=int(elapsed_time)) ) )
+
+# ---------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main() #pylint: disable=no-value-for-parameter
--- a/bin/extract_pages.py
+++ b/bin/extract_pages.py
@ -10,7 +10,7 @@ from asl_rulebook2.utils import parse_page_numbers

@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
-@click.option( "--output","-o","output_fname", required=True, type=click.Path(dir_okay=False), help="Output PDF file" )
+@click.option( "--output","-o","output_fname", required=True, type=click.Path(dir_okay=False), help="Output PDF file." )
@click.option( "--pages","-p", help="Page(s) to dump (e.g. 2,5,9-15)." )
 def main( pdf_file, output_fname, pages ):
    """Extract pages from a PDF."""