asl-rulebook2/asl_rulebook2/webapp/search.py

""" Manage the search engine. """

import os
import sqlite3
import json
import re
import itertools
import string
import copy
import time
import tempfile
import logging
import traceback

from flask import request, jsonify
import lxml.html

from asl_rulebook2.utils import plural
from asl_rulebook2.webapp import app
import asl_rulebook2.webapp.startup as webapp_startup
from asl_rulebook2.webapp.content import tag_ruleids
from asl_rulebook2.webapp.utils import make_config_path, make_data_path, split_strip

# the path to the SQLite database that holds the search index (set by init_search())
_sqlite_path = None
# maps each search result type to a { rowid: in-memory object } table (set by init_search())
_fts_index = None

_logger = logging.getLogger( "search" )

# these are used to highlight search matches (nb: the front-end looks for these)
# NOTE: The comments below show these markers as "((" and "))", to make the examples easier to read.
_BEGIN_HIGHLIGHT = "!@:"
_END_HIGHLIGHT = ":@!"

# NOTE: These regexes fix up content returned to us by the SQLite search engine (typically problems
# with highlighting search terms). Each entry is a [ compiled regex, replacement ] pair.
_FIXUP_TEXT_REGEXES = [
    [ re.compile( fixup[0].format( _BEGIN_HIGHLIGHT, _END_HIGHLIGHT ) ),
      fixup[1].format( _BEGIN_HIGHLIGHT, _END_HIGHLIGHT )
    ]
    for fixup in [
        [ r"&{}(.+?){};", r"{}&\g<1>;{}" ], # HTML entities e.g. &((frac12)); -> ((&frac12;))
        [ r"{}(.+?){}#", r"{}\g<1>#{}" ], # e.g. ((TH))# -> ((TH#))
        [ r"{}U\.S{}\.", "{}U.S.{}" ], # ((U.S)). -> ((U.S.))
    ]
]

# NOTE: This regex identifies highlight markers that SQLite has inadvertently inserted *inside* an HTML tag,
# because it is treating the searchable content as plain-text, and not HTML. There could be multiple cases
# of this within a single tag, so we identify any such tag first, then do a simple search-and-replace
# to remove the highlight markers.
# NOTE: The content has cases of naked <'s e.g. "move < 2 MP", so we need to be careful not to get tripped up
# by these (which is why the regex requires a non-whitespace character immediately after the "<").
_HILITES_INSIDE_HTML_TAG_REGEX = re.compile(
    r"\<\S[^>]*?{}.*?\>".format( _BEGIN_HIGHLIGHT )
)

# this is used to separate ruleref captions in the FTS table
_RULEREF_SEPARATOR = "-:-"

# these are used to separate Q+A fields in the FTS table:
# - each content block is stored as "question :-: answer 1 :-: answer 2 ..."
# - content blocks are then joined together using the content separator
# - a placeholder is stored for content blocks that have no question
_QA_CONTENT_SEPERATOR = " !=! "
_QA_FIELD_SEPARATOR = " :-: "
_NO_QA_QUESTION = "_??_"

# maps a (lower-case) search term to its replacement (a string), or to a set of terms
# that will be OR'ed together (set by load_search_config())
_SEARCH_TERM_ADJUSTMENTS = None

# ---------------------------------------------------------------------

@app.route( "/search", methods=["POST"] )
def search() :
    """Run a search.

    The search parameters are read from the POST'ed form fields.

    Returns a JSON response: the search results if the search worked, otherwise
    an object of the form { "error": message }.
    """

    # log the request
    _logger.info( "SEARCH REQUEST:" )
    args = dict( request.form.items() )
    for key, val in args.items():
        _logger.info( "- %s: %s", key, val )

    # run the search
    # NOTE: We can't use the search index nor in-memory data structures if the "fix content" thread
    # is still running (and possibly updating them). However, the tasks running in that thread
    # relinquish the lock regularly, to give the user a chance to jump in and grab it here, if they
    # want to do a search while that thread is still running.
    with webapp_startup.fixup_content_lock:
        try:
            return _do_search( args )
        except Exception as exc: #pylint: disable=broad-except
            msg = str( exc )
            # NOTE: Errors reported by the FTS engine (these are sqlite3.OperationalError's) have
            # a prefix that is of no interest to the user. We remove the *entire* prefix, including
            # the space after the colon, so that the message doesn't start with a stray space.
            prefix = "fts5: "
            if msg.startswith( prefix ):
                msg = msg[ len(prefix) : ]
            _logger.warning( "SEARCH ERROR: %s\n%s", args, traceback.format_exc() )
            return jsonify( { "error": msg } )

def _do_search( args ):
    """Run a search and return the results as a JSON response.

    args is a dict of the request parameters; the query is read from "queryString".

    Raises a RuntimeError if the query string is missing or empty. Errors reported
    by SQLite (e.g. for a malformed FTS query) are propagated to the caller.
    """

    # check the query string
    # NOTE: We treat a missing parameter the same as an empty one, so that the caller
    # gets a meaningful error message (instead of a raw KeyError).
    query_string = args.get( "queryString", "" ).strip()
    if query_string == "!:simulated-error:!":
        raise RuntimeError( "Simulated error." ) # nb: for the test suite
    if not query_string:
        raise RuntimeError( "Missing query string." )
    fts_query_string, search_terms = _make_fts_query_string( query_string )
    _logger.debug( "FTS query string: %s", fts_query_string )

    # generate the SQL query
    def highlight( n ):
        # NOTE: highlight() is an FTS extension function, and takes column numbers :-/
        return "highlight(searchable,{},'{}','{}')".format( n, _BEGIN_HIGHLIGHT, _END_HIGHLIGHT )
    sql = "SELECT rowid, sr_type, cset_id, rank, {}, {}, {}, {} FROM searchable".format(
        highlight(2), highlight(3), highlight(4), highlight(5)
    )
    sql += " WHERE searchable MATCH ?"
    sql += " ORDER BY rank"

    def remove_bad_hilites( val ):
        # remove highlight markers that SQLite may have incorrectly inserted into a value
        if val is None:
            return None
        matches = list( _HILITES_INSIDE_HTML_TAG_REGEX.finditer( val ) )
        # nb: we work backwards, so that the offsets of the earlier matches remain valid
        for mo in reversed( matches ):
            match = mo.group().replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" )
            val = val[:mo.start()] + match + val[mo.end():]
        return val

    # run the query and get the results
    # NOTE: We make sure the database connection is always closed, even if something goes wrong
    # (otherwise we leak a connection for every search).
    results = []
    conn = sqlite3.connect( _sqlite_path )
    try:
        curs = conn.execute( sql,
            ( "{title subtitle content rulerefs}: " + fts_query_string, )
        )
        for row in curs:
            row = list( row )
            # nb: columns 4-7 are the highlighted title, subtitle, content and rulerefs
            for col_no in range( 4, 7+1 ):
                row[col_no] = remove_bad_hilites( row[col_no] )
            sr_type = row[1]
            if sr_type == "index":
                result = _unload_index_sr( row )
            elif sr_type == "qa":
                result = _unload_qa_sr( row )
            elif sr_type in ( "errata", "user-anno" ):
                result = _unload_anno_sr( row, sr_type )
            elif sr_type == "asop-entry":
                result = _unload_asop_entry_sr( row )
            else:
                _logger.error( "Unknown searchable row type (rowid=%d): %s", row[0], sr_type )
                continue
            if not result:
                continue
            result.update( {
                "sr_type": sr_type,
                "_score": - row[3], # nb: FTS ranks are negative (more negative = better match)
            } )
            results.append( result )
    finally:
        conn.close()

    # fixup the results
    results = _fixup_results_for_hash_terms( results, search_terms )

    # adjust the sort order
    results = _adjust_sort_order( results )

    # return the results
    if _logger.isEnabledFor( logging.DEBUG ):
        _logger.debug( "Search results:" if len(results) > 0 else "Search results: none" )
        for result in results:
            # NOTE: Not every search result has a title/caption or a single FTS row ID (e.g. ASOP
            # entries are returned as their parent section, which only has a list of row ID's),
            # so we need to be careful here - logging must never cause a search to fail.
            title = result.get( "title" ) or result.get( "caption" ) or "???"
            _logger.debug( "- %s: %s (%.3f)",
                result.get( "_fts_rowid", "???" ),
                title.replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ),
                result["_score"]
            )
    return jsonify( results )

def _unload_index_sr( row ):
    """Convert a database row for an index entry into a search result.

    The result is a deep copy of our in-memory index entry, with the searchable fields
    replaced by the values from the database row (which have the search terms highlighted).
    """
    # nb: our copy of the index entry must remain unchanged, so we work on a clone
    orig_entry = _fts_index["index"][ row[0] ]
    sr = copy.deepcopy( orig_entry )
    sr[ "cset_id" ] = row[2]
    for key, col_no in ( ("title",4), ("subtitle",5), ("content",6) ):
        _get_result_col( sr, key, row[col_no] )

    # unpack the ruleref captions (these were joined together when they were added to the database)
    captions = split_strip( row[7], _RULEREF_SEPARATOR ) if row[7] else []
    assert len(captions) == len(orig_entry.get("rulerefs",[]))
    if not captions:
        return sr

    # rebuild the ruleref's, using the highlighted captions
    sr[ "rulerefs" ] = []
    for ruleref_no, caption in enumerate( captions ):
        orig_ruleref = orig_entry["rulerefs"][ ruleref_no ]
        new_ruleref = {}
        if "caption" in orig_ruleref:
            # nb: without the highlight markers, the caption should be what we originally stored
            assert caption.replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ) \
                   == orig_ruleref["caption"].strip()
            new_ruleref["caption"] = _fixup_text( caption )
        if "ruleids" in orig_ruleref:
            new_ruleref["ruleids"] = orig_ruleref["ruleids"]
        assert new_ruleref
        sr["rulerefs"].append( new_ruleref )

    return sr

def _unload_qa_sr( row ):
    """Convert a database row for a Q+A entry into a search result.

    Returns a deep copy of our in-memory Q+A entry, with the caption, questions and answers
    replaced by the values from the database row (which have the search terms highlighted),
    or None if the row doesn't line up with the in-memory entry.
    """
    # nb: our copy of the Q+A entry must remain unchanged, so we work on a clone
    orig_entry = _fts_index["qa"][ row[0] ]
    sr = copy.deepcopy( orig_entry )
    if row[4]:
        sr["caption"] = row[4]

    # unpack the content blocks (these were joined together when they were added to the database)
    hilited_blocks = split_strip( row[6], _QA_CONTENT_SEPERATOR ) if row[6] else []
    orig_blocks = orig_entry.get( "content", [] )
    if len(hilited_blocks) != len(orig_blocks):
        _logger.error( "Mismatched # content's for Q+A entry: %s", orig_entry )
        return None

    # transfer the highlighted question/answers for each content block
    for block_no, orig_block in enumerate( orig_blocks ):
        # nb: fields = question + answer 1 + answer 2 + ...
        fields = split_strip( hilited_blocks[block_no], _QA_FIELD_SEPARATOR )
        answers = orig_block.get( "answers", [] )
        if len(fields) != 1 + len(answers):
            _logger.error( "Mismatched # answers for content %d: %s\n- answers = %s", block_no, orig_entry, answers )
            return None
        question, hilited_answers = fields[0], fields[1:]
        sr_block = sr["content"][ block_no ]
        if question != _NO_QA_QUESTION:
            sr_block["question"] = question
        for answer_no, hilited_answer in enumerate( hilited_answers ):
            sr_block["answers"][ answer_no ][0] = hilited_answer

    return sr

def _unload_anno_sr( row, atype ):
    """Convert a database row for an annotation (of the specified type) into a search result."""
    # nb: our copy of the annotation must remain unchanged, so we return a modified clone
    sr = copy.deepcopy( _fts_index[atype][ row[0] ] )
    # replace the content with the value from the search index (which has the highlighting)
    _get_result_col( sr, "content", row[6] )
    return sr

def _unload_asop_entry_sr( row ):
    """Convert a database row for an ASOP entry into a search result.

    The result is a copy of the entry's parent section, with the content replaced
    by the (highlighted) entry from the database row.
    """
    # nb: the first item stored for each row is the parent section (which must remain unchanged)
    parent_section = _fts_index["asop-entry"][ row[0] ][0]
    sr = copy.deepcopy( parent_section )
    _get_result_col( sr, "content", row[6] )
    return sr

def _fixup_text( val ):
    """Apply each of the fix-up regexes to a text value retrieved from the search index.

    None is passed through unchanged.
    """
    if val is None:
        return None
    for pattern, replacement in _FIXUP_TEXT_REGEXES:
        val = pattern.sub( replacement, val )
    return val

def _get_result_col( sr, key, val ):
    """Store a (fixed-up) database column in a search result.

    Nothing is stored if the column has no value.
    """
    if not val:
        return
    sr[ key ] = _fixup_text( val )

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# NOTE: If a query string matches any of these, it is passed through to SQLite as a raw FTS query.
PASSTHROUGH_REGEXES = {
    re.compile( pattern ) for pattern in (
        r"\bAND\b",
        r"\bOR\b",
        r"\bNOT\b", # nb: this is a binary operator i.e. x NOT y = x && !y
        r"\((?![Rr]\))", # nb: an opening parenthesis, unless it is part of "(r)" or "(R)"
    )
}

def _make_fts_query_string( query_string ):
    """Generate the SQLite query string.

    SQLite's MATCH function recognizes a lot of special characters, which need
    to be enclosed in double-quotes to disable.

    Returns a ( FTS query string, search terms ) tuple. The search terms are a list, where each
    item is either a string, or a list of strings that are OR'ed together. If the query string
    looks like a raw FTS query, it is returned as-is (stripped), and the search terms are None.
    """

    # check if this looks like a raw FTS query
    if any( regex.search(query_string) for regex in PASSTHROUGH_REGEXES ):
        return query_string.strip(), None

    # split the search string into words (taking quoted phrases into account)
    ignore = app.config.get( "SQLITE_FTS_IGNORE_CHARS", ",;!?$" )
    query_string = "".join( ch for ch in query_string if ch not in ignore )
    terms = query_string.lower().split()
    def is_open_phrase( term ):
        # NOTE: A term that starts with a double-quote opens a quoted phrase, unless the quote
        # has already been closed i.e. this is a single quoted word (e.g. "foo"). Without this check,
        # a quoted word would swallow the term that follows it, and generate a malformed query.
        return term.startswith( '"' ) and not ( len(term) > 1 and term.endswith( '"' ) )
    i = 1
    while i < len(terms):
        if is_open_phrase( terms[i-1] ):
            # the previous term is an unclosed quoted phrase - add the current term to it
            terms[i-1] += " {}".format( terms[i] )
            del terms[i]
            if terms[i-1].endswith( '"' ):
                # the quoted phrase is now closed
                terms[i-1] = terms[i-1][1:-1]
        else:
            i += 1

    # clean up quoted phrases
    terms = [ t[1:] if t.startswith('"') else t for t in terms ]
    terms = [ t[:-1] if t.endswith('"') else t for t in terms ]
    terms = [ t.strip() for t in terms ]
    terms = [ t for t in terms if t ]

    # adjust search terms
    # nb: we allow for the search config not having been loaded
    adjustments = _SEARCH_TERM_ADJUSTMENTS or {}
    for term_no, term in enumerate(terms):
        aliases = adjustments.get( term )
        if not aliases:
            continue
        if isinstance( aliases, str ):
            # the search term is replaced by a new one
            terms[ term_no ] = aliases
        elif isinstance( aliases, set ):
            # the search term is replaced by multiple new ones (that will be OR'ed together)
            # NOTE: We sort the terms so that the tests will work reliably.
            terms[ term_no ] = sorted( aliases )
        else:
            # NOTE: We leave the search term unchanged, but we do want to know about it.
            _logger.warning( "Unknown search alias type for \"%s\": %s", term, type(aliases) )

    # fixup each term
    def has_special_char( term ):
        """Check if the term contains any special characters."""
        for ch in term:
            if ch in "*":
                continue # nb: this lets the user do prefix searches
            if ch.isspace() or ch in string.punctuation:
                return True
            if ord(ch) < 32 or ord(ch) > 127:
                return True
        return False
    def fixup_terms( terms ):
        """Fixup a list of terms."""
        for term_no, term in enumerate(terms):
            if isinstance( term, str ):
                if has_special_char( term ):
                    # NOTE: FTS strings escape embedded double-quotes by doubling them up.
                    terms[term_no] = '"{}"'.format( term.replace( '"', '""' ) )
            else:
                fixup_terms( term )
    fixup_terms( terms )

    # return the final FTS query string
    def term_string( term ):
        if isinstance( term, str ):
            return term
        assert isinstance( term, list )
        return "( {} )".format( " OR ".join( term ) )
    return " AND ".join( term_string(t) for t in terms ), terms

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _fixup_results_for_hash_terms( results, search_terms ):
    """Remove search results that were incorrectly matched by a search term ending with a hash.

    SQLite doesn't handle search terms that end with a hash particularly well.
    We correct highlighted search terms in _fixup_text(), but searching for e.g. "US#"
    will also match "use" and "using" - we remove such results here.
    """

    if not search_terms:
        return results

    # figure out which search terms end with a hash (these will have been quoted)
    # NOTE: We don't bother descending down into sub-terms.
    hash_terms = []
    for term in search_terms:
        if not isinstance( term, str ):
            continue
        if not ( term.startswith('"') and term.endswith('"') ):
            continue
        unquoted = term[1:-1]
        if unquoted.endswith( "#" ):
            hash_terms.append( unquoted[:-1].lower() )
    if not hash_terms:
        return results
    if "us" in hash_terms:
        hash_terms.extend( [ "use", "used", "using", "user" ] )

    # these are the incorrect matches e.g. ((K)) when searching for "K#"
    bad_hilites = [
        "{}{}{}".format( _BEGIN_HIGHLIGHT, term, _END_HIGHLIGHT )
        for term in hash_terms
    ]

    def keep( sr ):
        # remove every incorrectly matched search term...
        dump = json.dumps( sr ).lower()
        for hilite in bad_hilites:
            dump = dump.replace( hilite, "_removed_" )
        # ...and keep the search result if there are still some highlighted search terms
        return _BEGIN_HIGHLIGHT in dump

    return list( filter( keep, results ) )

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _adjust_sort_order( results ):
    """Re-order the search results, so that the preferred ones appear first.

    Returns a new list of search results (nb: the list passed in is modified, and will
    be left holding the search results that were not preferred).
    """

    def field( sr, key ):
        # nb: missing/empty fields are treated as an empty string
        return sr.get( key ) or ""

    # these identify the search results we prefer (most preferred first)
    preferences = [
        # search results whose title is an exact match
        lambda sr: field(sr,"title").startswith( _BEGIN_HIGHLIGHT ) and field(sr,"title").endswith( _END_HIGHLIGHT ),
        # search results whose title starts with a match
        lambda sr: field(sr,"title").startswith( _BEGIN_HIGHLIGHT ),
        # search results that have a match in the title
        lambda sr: _BEGIN_HIGHLIGHT in field(sr,"title"),
        # search results that have a match in the subtitle
        lambda sr: _BEGIN_HIGHLIGHT in field(sr,"subtitle"),
    ]

    preferred = []
    for is_preferred in preferences:
        # move the search results that pass this test to the new list
        leftover = []
        for sr in results:
            # NOTE: We never prefer small entries (i.e. those that have no ruleref's)
            # e.g. those that only contain a "see also".
            if is_preferred( sr ) and len(sr.get("rulerefs",[])) > 0:
                preferred.append( sr )
            else:
                leftover.append( sr )
        results[:] = leftover

    # include any remaining search results
    preferred.extend( results )

    return preferred

# ---------------------------------------------------------------------

def init_search( content_sets, qa, errata, user_anno, asop, asop_content, startup_msgs, logger ):
    """Initialize the search engine.

    This (re)creates the SQLite database that holds the search index, adds each kind
    of searchable content that has been provided (content set indexes, Q+A, errata,
    user annotations, ASOP), then loads the search config.
    """

    # initialize
    global _fts_index
    _fts_index = { "index": {}, "qa": {}, "errata": {}, "user-anno": {}, "asop-entry": {} }

    # initialize the database
    global _sqlite_path
    _sqlite_path = app.config.get( "SQLITE_PATH" )
    if not _sqlite_path:
        # FUDGE! We should be able to create a shared, in-memory database using this:
        #   file::XYZ:?mode=memory&cache=shared
        # but it doesn't seem to work (on Linux) and ends up creating a file with this name :-/
        # We manually create a temp file, which has to have the same name each time, so that we don't
        # keep creating a new database each time we start up. Sigh...
        _sqlite_path = os.path.join( tempfile.gettempdir(), "asl-rulebook2.searchdb" )
    if os.path.isfile( _sqlite_path ):
        os.unlink( _sqlite_path )
    logger.info( "Creating the search index: %s", _sqlite_path )
    # NOTE: Everything else that accesses the database (searches, fixing up content) opens its own
    # connection, so we make sure that this one is closed when we're done (even if something goes wrong).
    conn = sqlite3.connect( _sqlite_path )
    try:
        # NOTE: Storing everything in a single table allows FTS to rank search results based on
        # the overall content, and also lets us do AND/OR queries across all searchable content.
        conn.execute(
            "CREATE VIRTUAL TABLE searchable USING fts5"
            " ( sr_type, cset_id, title, subtitle, content, rulerefs, tokenize='porter unicode61' )"
        )

        # initialize the search index
        logger.info( "Building the search index..." )
        conn.execute( "DELETE FROM searchable" )
        curs = conn.cursor()
        if content_sets:
            _init_content_sets( conn, curs, content_sets, logger )
        if qa:
            _init_qa( curs, qa, logger )
        if errata:
            _init_errata( curs, errata, logger )
        if user_anno:
            _init_user_anno( curs, user_anno, logger )
        if asop:
            _init_asop( curs, asop, asop_content, logger )
        conn.commit()
    finally:
        conn.close()

    # load the search config
    load_search_config( startup_msgs, logger )

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _init_content_sets( conn, curs, content_sets, logger ):
    """Add the index entries for each content set to the search index.

    A task is also registered to fix up the searchable content at a later time.
    """

    sr_type = "index"
    insert_sql = "INSERT INTO searchable" \
        " ( sr_type, cset_id, title, subtitle, content, rulerefs )" \
        " VALUES ( ?, ?, ?, ?, ?, ? )"

    def make_fields( index_entry ):
        # nb: these are the database columns that get updated when the content is fixed up
        return { key: index_entry.get( key ) for key in ( "subtitle", "content" ) }

    # add the index entries to the search index
    for cset in content_sets.values():
        logger.info( "- Adding index file: %s", cset["index_fname"] )
        nrows = 0
        for index_entry in cset["index"]:
            captions = ( r.get("caption","") for r in index_entry.get("rulerefs",[]) )
            # NOTE: We should really strip content before adding it to the search index, otherwise any HTML tags
            # will need to be included in search terms. However, this means that the content returned by a query
            # will be this stripped content. We could go back to the original data to get the original HTML content,
            # but that means we would lose the highlighting of search terms that SQLite gives us. We opt to insert
            # the original content, since none of it should contain HTML, anyway.
            fields = make_fields( index_entry )
            curs.execute( insert_sql, (
                sr_type, cset["cset_id"], index_entry.get("title"),
                fields["subtitle"], fields["content"],
                _RULEREF_SEPARATOR.join( captions )
            ) )
            # remember which database row belongs to which index entry
            rowid = curs.lastrowid
            _fts_index[sr_type][ rowid ] = index_entry
            index_entry["_fts_rowid"] = rowid
            nrows += 1
        logger.info( "  - Added %s.", plural(nrows,"index entry","index entries"),  )
    assert len(_fts_index[sr_type]) == _get_row_count( conn, "searchable" )

    # register a task to fixup the content
    def fixup_index_entry( rowid, cset_id ):
        index_entry = _fts_index[ sr_type ][ rowid ]
        for key in ( "subtitle", "content" ):
            _tag_ruleids_in_field( index_entry, key, cset_id )
        return index_entry
    from asl_rulebook2.webapp.startup import add_fixup_content_task
    add_fixup_content_task( "index searchable content",
        lambda: _fixup_searchable_content( sr_type, fixup_index_entry, make_fields )
    )

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _init_qa( curs, qa, logger ):
    """Add the Q+A entries to the search index.

    A task is also registered to fix up the searchable content at a later time.
    """

    sr_type = "qa"

    def make_fields( qa_entry ):
        # nb: each content block is flattened to "question :-: answer 1 :-: answer 2 ..."
        blocks = []
        for content in qa_entry.get( "content", [] ):
            parts = [ content.get( "question", _NO_QA_QUESTION ) ]
            # NOTE: We don't really want to index answers, since they are mostly not very useful (e.g. "Yes."),
            # but we do so in order to get highlighting for those cases where they contain a search term.
            parts.extend( answer[0] for answer in content.get( "answers", [] ) )
            blocks.append( _QA_FIELD_SEPARATOR.join( parts ) )
        return {
            "title": qa_entry.get( "caption" ),
            "content": _QA_CONTENT_SEPERATOR.join( blocks ),
        }

    # add the Q+A entries to the search index
    logger.info( "- Adding the Q+A." )
    nrows = 0
    for qa_entry in itertools.chain.from_iterable( qa.values() ):
        fields = make_fields( qa_entry )
        curs.execute(
            "INSERT INTO searchable ( sr_type, title, content ) VALUES ( ?, ?, ? )",
            ( sr_type, fields["title"], fields["content"] )
        )
        # remember which database row belongs to which Q+A entry
        rowid = curs.lastrowid
        _fts_index[sr_type][ rowid ] = qa_entry
        qa_entry["_fts_rowid"] = rowid
        nrows += 1
    logger.info( "  - Added %s.", plural(nrows,"Q+A entry","Q+A entries"),  )

    # register a task to fixup the content
    def fixup_qa( rowid, cset_id ):
        qa_entry = _fts_index[ sr_type ][ rowid ]
        _tag_ruleids_in_field( qa_entry, "caption", cset_id )
        for content in qa_entry.get( "content", [] ):
            _tag_ruleids_in_field( content, "question", cset_id )
            for answer in content.get( "answers", [] ):
                # nb: the text of each answer is its first item
                _tag_ruleids_in_field( answer, 0, cset_id )
        return qa_entry
    from asl_rulebook2.webapp.startup import add_fixup_content_task
    add_fixup_content_task( "Q+A searchable content",
        lambda: _fixup_searchable_content( sr_type, fixup_qa, make_fields )
    )

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _init_errata( curs, errata, logger ):
    """Index the errata, so that they can be searched."""
    logger.info( "- Adding the errata." )
    count = _do_init_anno( curs, errata, "errata" )
    summary = plural( count, "errata entry", "errata entries" )
    logger.info( "  - Added %s.", summary )

def _init_user_anno( curs, user_anno, logger ):
    """Index the user-defined annotations, so that they can be searched."""
    logger.info( "- Adding the annotations." )
    count = _do_init_anno( curs, user_anno, "user-anno" )
    summary = plural( count, "annotation", "annotations" )
    logger.info( "  - Added %s.", summary )

def _do_init_anno( curs, anno, atype ):
    """Add annotations (of the specified type) to the search index.

    A task is also registered to fix up the searchable content at a later time.

    Returns the number of annotations that were added.
    """

    sr_type = atype
    insert_sql = "INSERT INTO searchable ( sr_type, content ) VALUES ( ?, ? )"

    def make_fields( anno ):
        # nb: only the content of an annotation is searchable
        return { "content": anno.get( "content" ) }

    # add the annotations to the search index
    nrows = 0
    for ruleid in anno:
        for entry in anno[ ruleid ]:
            curs.execute( insert_sql, ( sr_type, make_fields( entry )["content"] ) )
            # remember which database row belongs to which annotation
            rowid = curs.lastrowid
            _fts_index[sr_type][ rowid ] = entry
            entry["_fts_rowid"] = rowid
            nrows += 1

    # register a task to fixup the content
    def fixup_anno( rowid, cset_id ):
        entry = _fts_index[ sr_type ][ rowid ]
        _tag_ruleids_in_field( entry, "content", cset_id )
        return entry
    from asl_rulebook2.webapp.startup import add_fixup_content_task
    add_fixup_content_task( atype+" searchable content",
        lambda: _fixup_searchable_content( sr_type, fixup_anno, make_fields )
    )

    return nrows

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _init_asop( curs, asop, asop_content, logger ):
    """Add the ASOP to the search index.

    The content of each ASOP section is split up into individual entries, each of which
    is added to the search index as a separate row. A task is also registered to fix up
    the searchable content (and the in-memory ASOP content) at a later time.
    """

    logger.info( "- Adding the ASOP." )
    sr_type = "asop-entry"
    fixup_chapters, fixup_sections = [], []
    nentries = 0
    for chapter in asop.get( "chapters", [] ):
        fixup_chapters.append( chapter )
        for section in chapter.get( "sections", [] ):
            content = asop_content.get( section["section_id"] )
            if not content:
                continue
            fixup_sections.append( section )
            entries = _extract_section_entries( content )
            # NOTE: The way we manage the FTS index for ASOP entries is a little different to normal,
            # since they don't exist as individual entities (this is the only place where they do,
            # so that we can return them as individual search results). Each database row points
            # to the parent section, and the section has a list of FTS rows for its child entries.
            section[ "_fts_rowids" ] = []
            for entry in entries:
                curs.execute(
                    "INSERT INTO searchable ( sr_type, content ) VALUES ( ?, ? )", (
                    sr_type, entry
                ) )
                _fts_index[sr_type][ curs.lastrowid ] = [ section, entry ]
                section[ "_fts_rowids" ].append( curs.lastrowid )
                # nb: we count each entry added to the search index (not each section)
                nentries += 1
    logger.info( "  - Added %s.", plural(nentries,"entry","entries") )

    # register a task to fixup the content
    def fixup_content():
        _fixup_searchable_content( sr_type, fixup_entry, make_fields )
        # we also need to fixup the in-memory data structures
        cset_id = None
        for chapter in fixup_chapters:
            _tag_ruleids_in_field( chapter, "preamble", cset_id )
        for section in fixup_sections:
            _tag_ruleids_in_field( asop_content, section["section_id"], cset_id )
    def fixup_entry( rowid, cset_id ):
        # NOTE: This removes the entry's text from our in-memory table, leaving just
        # the parent section (which is all we need to unload a search result).
        entry = _fts_index[ sr_type ][ rowid ].pop()
        entry = tag_ruleids( entry, cset_id )
        return entry
    def make_fields( entry ):
        return { "content": entry }
    from asl_rulebook2.webapp.startup import add_fixup_content_task
    add_fixup_content_task( "ASOP searchable content", fixup_content )

def _extract_section_entries( content ):
    """Split the content of a section into its individual entries.

    Each entry is returned as a HTML string. If no entries can be found, the entire
    content is returned as a single entry.
    """
    root = lxml.html.fragment_fromstring( "<div> {} </div>".format( content ) )
    entries = [
        lxml.html.tostring( elem ).decode( "utf-8" )
        for elem in root.xpath( ".//div[contains(@class,'entry')]" )
        # nb: the XPath does a sub-string match, so we need to check for the exact class name
        if "entry" in elem.attrib["class"].split()
    ]
    if entries:
        return entries
    # NOTE: If the content hasn't been divided into entries, we return the whole thing as
    # one big entry, which will kinda suck as a search result if it's big, but it's better
    # than not seeing anything at all.
    return [ content ]

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def load_search_config( startup_msgs, logger ):
    """Load the search config.

    This loads the search replacements, aliases and synonyms (the default ones first,
    then the user-defined ones), and uses them to rebuild the table of search term
    adjustments. Files that don't exist are ignored; files that can't be loaded are
    reported via startup_msgs.
    """

    # initialize
    global _SEARCH_TERM_ADJUSTMENTS
    _SEARCH_TERM_ADJUSTMENTS = {}

    def add_search_term_adjustment( key, vals ):
        # make sure everything is lower-case
        key = key.lower()
        if isinstance( vals, str ):
            vals = vals.lower()
        elif isinstance( vals, set ):
            vals = set( v.lower() for v in vals )
        else:
            # NOTE: Search terms can only be adjusted using a string or a set, so anything else
            # would never be used - we ignore it, but let the user know about it.
            logger.warning( "  - Ignoring search alias (unknown type %s): %s -> %s", type(vals), key, vals )
            return
        # add the new search term adjustment
        if key not in _SEARCH_TERM_ADJUSTMENTS:
            _SEARCH_TERM_ADJUSTMENTS[ key ] = vals
            return
        # found a multiple definition - try to do something sensible
        current = _SEARCH_TERM_ADJUSTMENTS[ key ]
        logger.warning( "  - Duplicate search alias: %s\n- current aliases = %s\n- new aliases = %s", key,
            current, vals
        )
        if isinstance( current, set ) and isinstance( vals, set ):
            # merge the two sets of aliases
            current.update( vals )
        else:
            # NOTE: We can't merge a string with a set (updating a set with a string would add
            # each individual character), so the new definition replaces the old one.
            _SEARCH_TERM_ADJUSTMENTS[ key ] = vals

    def load_json( fname, ftype, desc ):
        # load a JSON config file (returns None if it doesn't exist, or couldn't be loaded)
        if fname is None or not os.path.isfile( fname ):
            return None
        logger.info( "Loading %s search %s: %s", ftype, desc, fname )
        try:
            with open( fname, "r", encoding="utf-8" ) as fp:
                return json.load( fp )
        except Exception as ex: #pylint: disable=broad-except
            startup_msgs.warning( "Can't load {} search {}.".format( ftype, desc ), str(ex) )
            return None

    # load the search replacements
    def load_search_replacements( fname, ftype ):
        data = load_json( fname, ftype, "replacements" )
        if data is None:
            return
        nitems = 0
        for key, val in data.items():
            if key.startswith( "_" ):
                continue # nb: ignore comments
            logger.debug( "- %s -> %s", key, val )
            add_search_term_adjustment( key, val )
            nitems += 1
        logger.info( "- Loaded %s.", plural(nitems,"search replacement","search replacements") )
    load_search_replacements( make_config_path( "search-replacements.json" ), "default" )
    load_search_replacements( make_data_path( "search-replacements.json" ), "user" )

    # load the search aliases
    def load_search_aliases( fname, ftype ):
        data = load_json( fname, ftype, "aliases" )
        if data is None:
            return
        nitems = 0
        for keys, aliases in data.items():
            if keys.startswith( "_" ):
                continue # nb: ignore comments
            logger.debug( "- %s -> %s", keys, " ; ".join(aliases) )
            # nb: multiple keys can share the same aliases e.g. "key1/key2"
            for key in keys.split( "/" ):
                add_search_term_adjustment( key, set( itertools.chain( aliases, [key] ) ) )
            nitems += 1
        logger.info( "- Loaded %s.", plural(nitems,"search alias","search aliases") )
    load_search_aliases( make_config_path( "search-aliases.json" ), "default" )
    load_search_aliases( make_data_path( "search-aliases.json" ), "user" )

    # load the search synonyms
    def load_search_synonyms( fname, ftype ):
        data = load_json( fname, ftype, "synonyms" )
        if data is None:
            return
        nitems = 0
        for synonyms in data:
            if isinstance( synonyms, str ):
                continue # nb: ignore comments
            logger.debug( "- %s", " ; ".join(synonyms) )
            # nb: every term in a group of synonyms maps to the entire group
            synonyms = set( synonyms )
            for term in synonyms:
                add_search_term_adjustment( term, synonyms )
            nitems += 1
        logger.info( "- Loaded %s.", plural(nitems,"search synonym","search synonyms") )
    load_search_synonyms( make_config_path( "search-synonyms.json" ), "default" )
    load_search_synonyms( make_data_path( "search-synonyms.json" ), "user" )

# ---------------------------------------------------------------------

def _fixup_searchable_content( sr_type, fixup_row, make_fields ):
    """Fixup the searchable content for the specified search result type.

    For each database row of the specified type:
    - fixup_row( rowid, cset_id ) is called to fix up (and return) the in-memory object
    - make_fields( obj ) is called to get the { column name: value } pairs to update in the row

    Returns a string describing how many rows were updated.
    """

    # NOTE: We make sure the database connection is always closed, even if something goes wrong
    # (otherwise a connection is leaked every time a fixup task is run).
    conn = sqlite3.connect( _sqlite_path )
    try:

        # locate the rows we're going to fixup
        # NOTE: The set of rows in the searchable table never changes after it has been built,
        # so we don't need the lock.
        curs = conn.cursor()
        query = curs.execute( "SELECT rowid, cset_id FROM searchable WHERE sr_type=?",
            ( sr_type, )
        )
        content_rows = query.fetchall()

        # update the searchable content in each row
        nrows = 0
        last_commit_time = time.time()
        for rowid, cset_id in content_rows:

            # NOTE: The fixup_row() callback will usually be using _tag_ruleids_in_field(), which manages
            # the lock; otherwise the callback needs to do it itself. We don't want to invoke this callback
            # inside the lock since it can be quite slow; _tag_ruleids_in_field() holds the lock for the
            # minimum amount of time.
            new_row = fixup_row( rowid, cset_id )

            with webapp_startup.fixup_content_lock:
                # NOTE: The make_fields() callback will usually be accessing the fields we want to fixup,
                # so we need to protect them with the lock.
                fields = make_fields( new_row )
                # NOTE: We update the row inside the lock to prevent "database is locked" errors, if the user
                # tries to do a search while this is happening.
                # NOTE: Column names can't be bound as parameters, but they come from our own
                # make_fields() callbacks; the values (and the row ID) are always bound.
                sql = "UPDATE searchable SET {} WHERE rowid=?".format(
                    ", ".join( "{}=?".format( f ) for f in fields )
                )
                curs.execute( sql, tuple(fields.values()) + ( rowid, ) )
                nrows += 1

            # commit the changes regularly (so that they are available to the front-end)
            if time.time() - last_commit_time >= 1:
                conn.commit()
                last_commit_time = time.time()

        # commit the last block of updates
        conn.commit()

    finally:
        conn.close()

    return plural( nrows, "row", "rows" )

def _tag_ruleids_in_field( obj, key, cset_id ):
    """Tag the ruleid's in a field of an object (or an item in a list).

    Nothing is done if the key is not an integer and is not present in the object.
    """
    if not isinstance( key, int ) and key not in obj:
        return
    # NOTE: The data structures we use to manage all the in-memory objects never change after
    # they have been loaded, so the only thread-safety we need to worry about is when we read
    # the original value from an object, and when we update it with a new value. The actual process
    # of tagging ruleid's in a piece of content is done outside the lock, since it's quite slow.
    lock = webapp_startup.fixup_content_lock
    with lock:
        orig_val = obj[ key ]
    tagged_val = tag_ruleids( orig_val, cset_id )
    with lock:
        obj[ key ] = tagged_val

def _get_row_count( conn, table_name ):
    """Get the number of rows in a table."""
    cur = conn.execute( "SELECT count(*) FROM {}".format( table_name ) )
    return cur.fetchone()[0]