""" Manage the search engine. """

import os
import shutil
import threading
import sqlite3
import hashlib
import io
import json
import re
import itertools
import string
import copy
import time
import tempfile
import logging
import traceback

from flask import request, jsonify
import lxml.html

from asl_rulebook2.utils import plural
from asl_rulebook2.webapp import app
from asl_rulebook2.webapp import startup as webapp_startup
from asl_rulebook2.webapp.content import tag_ruleids
from asl_rulebook2.webapp.utils import make_config_path, make_data_path, split_strip

_searchdb_fname = None
_cached_searchdb_fname = None
_fts_index = None
_fixup_content_lock = threading.Lock()

_logger = logging.getLogger( "search" )

# these are used to highlight search matches (nb: the front-end looks for these)
_BEGIN_HIGHLIGHT = "!@:"
_END_HIGHLIGHT = ":@!"

# NOTE: These regex's fix up content returned to us by the SQLite search engine (typically problems
# with highlighting search terms).
_FIXUP_TEXT_REGEXES = [
    [ re.compile( fixup[0].format( _BEGIN_HIGHLIGHT, _END_HIGHLIGHT ) ),
      fixup[1].format( _BEGIN_HIGHLIGHT, _END_HIGHLIGHT )
    ]
    for fixup in [
        [ r"&{}(.+?){};", r"{}&\g<1>;{}" ], # HTML entities e.g. &((frac12)); -> ((&frac12;))
        [ r"{}(.+?){}#", r"{}\g<1>#{}" ],   # e.g. ((TH))# -> ((TH#))
        [ r"{}U\.S{}\.", "{}U.S.{}" ],      # ((U.S)). -> ((U.S.))
    ]
]

# NOTE: This regex identifies highlight markers that SQLite has inadvertently inserted *inside* an HTML tag,
# because it is treating the searchable content as plain-text, and not HTML. There could be multiple cases
# of this within a single tag, so we identify any such tag first, then do a simple search-and-replace
# to remove the highlight markers.
# NOTE: The content has cases of naked <'s e.g. "move < 2 MP", so we need to be careful not to get tripped up
# by these.
_HILITES_INSIDE_HTML_TAG_REGEX = re.compile(
    r"\<\S[^>]*?{}.*?\>".format( _BEGIN_HIGHLIGHT )
)
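
# Illustrative example (the ruleid and attribute are made up): if SQLite inserts highlight markers
# inside a tag, e.g. <span data-ruleid="!@:A11.1:@!">, remove_bad_hilites() (defined in _do_search())
# strips the markers from that tag, while highlighting in the surrounding text is left untouched.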

# these are used to separate ruleref's in the FTS table
_RULEREF_SEPARATOR = "-:-"

# these are used to separate Q+A fields in the FTS table
_QA_CONTENT_SEPERATOR = " !=! "
_QA_FIELD_SEPARATOR = " :-: "
_NO_QA_QUESTION = "_??_"
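
# Illustrative example: a Q+A entry with two question/answer blocks is flattened into a single
# "content" column roughly like this (see make_fields() in _init_qa()):
#   "question 1 :-: answer 1a :-: answer 1b !=! _??_ :-: answer 2a"
# where "_??_" stands in for a missing question.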

_SEARCH_TERM_ADJUSTMENTS = None

# ---------------------------------------------------------------------

@app.route( "/search", methods=["POST"] )
def search():
    """Run a search."""

    # log the request
    _logger.info( "SEARCH REQUEST:" )
    args = dict( request.form.items() )
    for key, val in args.items():
        _logger.info( "- %s: %s", key, val )

    # run the search
    # NOTE: We can't use the search index nor in-memory data structures if the startup tasks thread
    # is still running (and possibly updating them, as it fixes up content). However, the tasks running
    # in that thread relinquish the lock regularly, to give the user a chance to jump in and grab it here,
    # if they want to do a search while that thread is still running.
    with _fixup_content_lock:
        try:
            return _do_search( args )
        except Exception as exc: #pylint: disable=broad-except
            msg = str( exc )
            if msg.startswith( "fts5: " ):
                msg = msg[5:] # nb: this is a sqlite3.OperationalError
            _logger.warning( "SEARCH ERROR: %s\n%s", args, traceback.format_exc() )
            return jsonify( { "error": msg } )
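
# Illustrative usage (the host/port depend on how the webapp is deployed; only the "queryString"
# form field is read by this endpoint):
#   curl -X POST -d "queryString=CC" http://localhost:5020/search
# The response is a JSON list of search results, or {"error": "..."} if something went wrong.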

def _do_search( args ):

    # run the search
    query_string = args[ "queryString" ].strip()
    if query_string == "!:simulated-error:!":
        raise RuntimeError( "Simulated error." ) # nb: for the test suite
    if not query_string:
        raise RuntimeError( "Missing query string." )
    fts_query_string, search_terms = _make_fts_query_string( query_string )
    _logger.debug( "FTS query string: %s", fts_query_string )
    conn = sqlite3.connect( _searchdb_fname )
    def highlight( n ):
        # NOTE: highlight() is an FTS extension function, and takes column numbers :-/
        return "highlight(searchable,{},'{}','{}')".format( n, _BEGIN_HIGHLIGHT, _END_HIGHLIGHT )
    sql = "SELECT rowid, sr_type, cset_id, rank, {}, {}, {}, {} FROM searchable".format(
        highlight(2), highlight(3), highlight(4), highlight(5)
    )
    sql += " WHERE searchable MATCH ?"
    sql += " ORDER BY rank"
    curs = conn.execute( sql,
        ( "{title subtitle content rulerefs}: " + fts_query_string, )
    )

    def remove_bad_hilites( val ):
        # remove highlight markers that SQLite may have incorrectly inserted into a value
        if val is None:
            return None
        matches = list( _HILITES_INSIDE_HTML_TAG_REGEX.finditer( val ) )
        for mo in reversed( matches ):
            match = mo.group().replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" )
            val = val[:mo.start()] + match + val[mo.end():]
        return val

    # get the results
    results = []
    for row in curs:
        row = list( row )
        for col_no in range( 4, 7+1 ):
            row[col_no] = remove_bad_hilites( row[col_no] )
        if row[1] == "index":
            result = _unload_index_sr( row )
        elif row[1] == "qa":
            result = _unload_qa_sr( row )
        elif row[1] == "errata":
            result = _unload_anno_sr( row, "errata" )
        elif row[1] == "user-anno":
            result = _unload_anno_sr( row, "user-anno" )
        elif row[1] == "asop-entry":
            result = _unload_asop_entry_sr( row )
        else:
            _logger.error( "Unknown searchable row type (rowid=%d): %s", row[0], row[1] )
            continue
        if not result:
            continue
        result.update( {
            "sr_type": row[1],
            "_score": - row[3],
        } )
        results.append( result )

    # fixup the results
    results = _fixup_results_for_hash_terms( results, search_terms )

    # adjust the sort order
    results = _adjust_sort_order( results )

    # return the results
    if _logger.isEnabledFor( logging.DEBUG ):
        _logger.debug( "Search results:" if len(results) > 0 else "Search results: none" )
        for result in results:
            title = result.get( "title", result.get("caption","???") )
            _logger.debug( "- %s: %s (%.3f)",
                result["_fts_rowid"],
                title.replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ),
                result["_score"]
            )
    return jsonify( results )

def _unload_index_sr( row ):
    """Unload an index search result from the database."""
    index_entry = _fts_index["index"][ row[0] ] # nb: our copy of the index entry (must remain unchanged)
    result = copy.deepcopy( index_entry ) # nb: the index entry we will return to the caller
    result[ "cset_id" ] = row[2]
    _get_result_col( result, "title", row[4] )
    _get_result_col( result, "subtitle", row[5] )
    _get_result_col( result, "content", row[6] )
    rulerefs = split_strip( row[7], _RULEREF_SEPARATOR ) if row[7] else []
    assert len(rulerefs) == len(index_entry.get("rulerefs",[]))
    if rulerefs:
        result[ "rulerefs" ] = []
        for i, ruleref in enumerate(rulerefs):
            ruleref2 = {}
            if "caption" in index_entry["rulerefs"][i]:
                assert ruleref.replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ) \
                       == index_entry["rulerefs"][i]["caption"].strip()
                ruleref2["caption"] = _fixup_text( ruleref )
            if "ruleids" in index_entry["rulerefs"][i]:
                ruleref2["ruleids"] = index_entry["rulerefs"][i]["ruleids"]
            assert ruleref2
            result["rulerefs"].append( ruleref2 )
    return result

def _unload_qa_sr( row ):
    """Unload a Q+A search result from the database."""
    qa_entry = _fts_index["qa"][ row[0] ] # nb: our copy of the Q+A entry (must remain unchanged)
    result = copy.deepcopy( qa_entry ) # nb: the Q+A entry we will return to the caller (will be changed)
    # replace the content in the Q+A entry we will return to the caller with the values
    # from the search index (which will have search term highlighting)
    if row[4]:
        result["caption"] = row[4]
    sr_content = split_strip( row[6], _QA_CONTENT_SEPERATOR ) if row[6] else []
    qa_entry_content = qa_entry.get( "content", [] )
    if len(sr_content) != len(qa_entry_content):
        _logger.error( "Mismatched # content's for Q+A entry: %s", qa_entry )
        return None
    for content_no, content in enumerate( qa_entry_content ):
        fields = split_strip( sr_content[content_no], _QA_FIELD_SEPARATOR )
        answers = content.get( "answers", [] )
        if len(fields) - 1 != len(answers): # nb: fields = question + answer 1 + answer 2 + ...
            _logger.error( "Mismatched # answers for content %d: %s\n- answers = %s", content_no, qa_entry, answers )
            return None
        if fields[0] != _NO_QA_QUESTION:
            result["content"][content_no]["question"] = fields[0]
        for answer_no, _ in enumerate(answers):
            result["content"][content_no]["answers"][answer_no][0] = fields[ 1+answer_no ]
    return result

def _unload_anno_sr( row, atype ):
    """Unload an annotation search result from the database."""
    anno = _fts_index[atype][ row[0] ] # nb: our copy of the annotation (must remain unchanged)
    result = copy.deepcopy( anno ) # nb: the annotation we will return to the caller (will be changed)
    _get_result_col( result, "content", row[6] )
    return result

def _unload_asop_entry_sr( row ):
    """Unload an ASOP entry search result from the database."""
    section = _fts_index["asop-entry"][ row[0] ][0] # nb: our copy of the ASOP section (must remain unchanged)
    result = copy.deepcopy( section ) # nb: the ASOP section we will return to the caller (will be changed)
    _get_result_col( result, "content", row[6] )
    return result

def _fixup_text( val ):
    """Fix-up a text value retrieved from the search index."""
    if val is None:
        return None
    for regex in _FIXUP_TEXT_REGEXES:
        val = regex[0].sub( regex[1], val )
    return val

def _get_result_col( sr, key, val ):
    """Get a column from a search result."""
    if val:
        sr[ key ] = _fixup_text( val )

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

PASSTHROUGH_REGEXES = set([
    re.compile( r"\bAND\b" ),
    re.compile( r"\bOR\b" ),
    re.compile( r"\bNOT\b" ), # nb: this is a binary operator i.e. x NOT y = x && !y
    re.compile( r"\((?![Rr]\))" ),
])
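
# Illustrative example: a query such as 'sniper AND NOT "king tiger"' matches one of the patterns
# above and is passed straight through to FTS, with no term rewriting; the (?![Rr]\)) look-ahead
# means a lone "(r)" or "(R)" in a query does not trigger this passthrough behaviour.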

def _make_fts_query_string( query_string ):
    """Generate the SQLite query string.

    SQLite's MATCH function recognizes a lot of special characters, which need
    to be enclosed in double-quotes to disable them.
    """

    # check if this looks like a raw FTS query
    if any( regex.search(query_string) for regex in PASSTHROUGH_REGEXES ):
        return query_string.strip(), None

    # split the search string into words (taking quoted phrases into account)
    ignore = app.config.get( "SQLITE_FTS_IGNORE_CHARS", ",;!?$" )
    query_string = "".join( ch for ch in query_string if ch not in ignore )
    terms = query_string.lower().split()
    i = 0
    while True:
        if i >= len(terms):
            break
        if i > 0 and terms[i-1].startswith( '"' ):
            terms[i-1] += " {}".format( terms[i] )
            del terms[i]
            if terms[i-1].startswith( '"' ) and terms[i-1].endswith( '"' ):
                terms[i-1] = terms[i-1][1:-1]
            continue
        i += 1

    # clean up quoted phrases
    terms = [ t[1:] if t.startswith('"') else t for t in terms ]
    terms = [ t[:-1] if t.endswith('"') else t for t in terms ]
    terms = [ t.strip() for t in terms ]
    terms = [ t for t in terms if t ]

    # adjust search terms
    for term_no, term in enumerate(terms):
        aliases = _SEARCH_TERM_ADJUSTMENTS.get( term )
        if not aliases:
            continue
        if isinstance( aliases, str ):
            # the search term is replaced by a new one
            terms[ term_no ] = aliases
        elif isinstance( aliases, set ):
            # the search term is replaced by multiple new ones (that will be OR'ed together)
            # NOTE: We sort the terms so that the tests will work reliably.
            terms[ term_no ] = sorted( aliases )
        else:
            assert False, "Unknown search alias type: {}".format( type(aliases) )

    # fixup each term
    def has_special_char( term ):
        """Check if the term contains any special characters."""
        for ch in term:
            if ch in "*":
                continue
            if ch.isspace() or ch in string.punctuation:
                return True
            if ord(ch) < 32 or ord(ch) > 127:
                return True
        return False
    def fixup_terms( terms ):
        """Fixup a list of terms."""
        for term_no, term in enumerate(terms):
            if isinstance( term, str ):
                if has_special_char( term ):
                    terms[term_no] = '"{}"'.format( term )
            else:
                fixup_terms( term )
    fixup_terms( terms )

    # return the final FTS query string
    def term_string( term ):
        if isinstance( term, str ):
            return term
        assert isinstance( term, list )
        return "( {} )".format( " OR ".join( term ) )
    return " AND ".join( term_string(t) for t in terms ), terms
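
# Example (illustrative, assuming no aliases/synonyms are configured for these terms):
#   _make_fts_query_string( 'MMC "multi-man counter"' )
#     -> ( 'mmc AND "multi-man counter"', [ 'mmc', '"multi-man counter"' ] )
# A term that has a set of aliases would instead appear as e.g. ( term1 OR term2 ) in the query string.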

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _fixup_results_for_hash_terms( results, search_terms ):
    """Fixup search results for search terms that end with a hash.

    SQLite doesn't handle search terms that end with a hash particularly well.
    We correct highlighted search terms in _fixup_text(), but searching for e.g. "US#"
    will also match "use" and "using" - we remove such results here.
    """

    # figure out which search terms end with a hash
    # NOTE: We don't bother descending down into sub-terms.
    if not search_terms:
        return results
    terms = [
        t[1:-1] for t in search_terms
        if isinstance(t,str) and t.startswith('"') and t.endswith('"')
    ]
    terms = [
        t[:-1].lower() for t in terms
        if isinstance(t,str) and t.endswith("#")
    ]
    if not terms:
        return results
    if "us" in terms:
        terms.extend( [ "use", "used", "using", "user" ] )

    def keep( sr ):
        # remove every incorrectly matched search term (e.g. ((K)) when searching for "K#")
        buf = json.dumps( sr ).lower()
        for term in terms:
            buf = buf.replace( "{}{}{}".format( _BEGIN_HIGHLIGHT, term, _END_HIGHLIGHT ), "_removed_" )
        # we keep this search result if there are still some highlighted search terms
        return _BEGIN_HIGHLIGHT in buf

    return [
        result for result in results if keep(result)
    ]
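
# Illustrative example: searching for "US#" produces the quoted term '"us#"', so results whose only
# highlighted hits are e.g. ((us)), ((use)) or ((using)) are dropped, while results that also
# highlight ((US#)) (or any other search term) are kept.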

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _adjust_sort_order( results ):
    """Adjust the sort order of the search results."""

    results2 = []
    def extract_sr( func ):
        # move results that pass the filter function to the new list
        i = 0
        while True:
            if i >= len(results):
                break
            # NOTE: We never prefer small entries (i.e. those that have no ruleref's),
            # e.g. those that only contain a "see also".
            if func( results[i] ) and len(results[i].get("rulerefs",[])) > 0:
                results2.append( results[i] )
                del results[i]
            else:
                i += 1

    def get( sr, key ):
        val = sr.get( key )
        return val if val else ""

    # prefer search results whose title is an exact match
    extract_sr(
        lambda sr: get(sr,"title").startswith( _BEGIN_HIGHLIGHT ) and get(sr,"title").endswith( _END_HIGHLIGHT )
    )
    # prefer search results whose title starts with a match
    extract_sr(
        lambda sr: get(sr,"title").startswith( _BEGIN_HIGHLIGHT )
    )
    # prefer search results that have a match in the title
    extract_sr(
        lambda sr: _BEGIN_HIGHLIGHT in get(sr,"title")
    )
    # prefer search results that have a match in the subtitle
    extract_sr(
        lambda sr: _BEGIN_HIGHLIGHT in get(sr,"subtitle")
    )

    # include any remaining search results
    results2.extend( results )

    return results2

# ---------------------------------------------------------------------

def init_search( content_sets, #pylint: disable=too-many-arguments
    qa, qa_fnames,
    errata, errata_fnames,
    user_anno, user_anno_fname,
    asop, asop_preambles, asop_content, asop_fnames,
    startup_msgs, logger
):
    """Initialize the search engine."""

    # initialize
    global _fts_index
    _fts_index = { "index": {}, "qa": {}, "errata": {}, "user-anno": {}, "asop-entry": {} }

    # locate the database
    global _searchdb_fname
    _searchdb_fname = app.config.get( "SEARCHDB" )
    if not _searchdb_fname:
        # FUDGE! We should be able to create a shared, in-memory database using this:
        #   file::XYZ:?mode=memory&cache=shared
        # but it doesn't seem to work (on Linux) and ends up creating a file with this name :-/
        # We manually create a temp file, which has to have the same name each time, so that we don't
        # keep creating a new database each time we start up. Sigh...
        _searchdb_fname = os.path.join( tempfile.gettempdir(), "asl-rulebook2.searchdb" )

    def init_searchdb():
        _init_searchdb( content_sets,
            qa, qa_fnames,
            errata, errata_fnames,
            user_anno, user_anno_fname,
            asop, asop_preambles, asop_content, asop_fnames,
            logger
        )

    # check if we should force the database to be built from a cached version
    # NOTE: This should only be done for running tests (to ensure that the database was built correctly).
    if app.config.get( "FORCE_CACHED_SEARCHDB" ):
        # initialize the database using a new cache file (this will force the creation of the cached version)
        fname = os.path.join( tempfile.gettempdir(), "asl-rulebook2.searchdb-forced_cache" )
        if os.path.isfile( fname ):
            os.unlink( fname )
        app.config[ "CACHED_SEARCHDB" ] = fname
        assert webapp_startup._startup_tasks == [] #pylint: disable=protected-access
        init_searchdb()
        webapp_startup._do_startup_tasks( False ) #pylint: disable=protected-access
        webapp_startup._startup_tasks = [] #pylint: disable=protected-access
        # NOTE: When we continue on from here, the database will be initialized again, using the cached version.

    # initialize the database
    init_searchdb()

    # load the search config
    load_search_config( startup_msgs, logger )

def _init_searchdb( content_sets, #pylint: disable=too-many-arguments
    qa, qa_fnames,
    errata, errata_fnames,
    user_anno, user_anno_fname,
    asop, asop_preambles, asop_content, asop_fnames,
    logger
):
    """Initialize the search database."""

    # NOTE: Building the database can be a slow process if there is a lot of content (e.g. Q+A), since we are
    # running many regex's over them, to identify ruleid's that should be converted to links. So, we offer
    # the option to take a copy of the database after it has been built, and use that the next time we run.
    # However, the initialization process is complicated, and we can't just use that cached database (e.g. because
    # we also need to update in-memory objects), so instead, we build the database in the normal way, but where
    # we would normally run the regex's, we instead grab the result from the cached database, and update
    # the in-memory objects as required (see _fixup_searchable_content()). This gives significantly faster times
    # for the startup tasks:
    #                      rebuild   cached
    #   vm-linux-dev2        2:04     0:01
    #   Raspberry Pi 4       4:11     0:01
    #   Banana Pi           17:59     0:08

    # check if there is a cached database
    global _cached_searchdb_fname
    _cached_searchdb_fname = None
    fname = app.config.get( "CACHED_SEARCHDB" )
    # NOTE: We treat an empty file as being not present, since files must exist to be able to mount them
    # into Docker (run-container.sh creates the file if it is being created for the first time).
    if fname and os.path.isfile( fname ) and os.path.getsize( fname ) > 0:
        # yup - compare the file hashes
        logger.debug( "Checking cached search database: %s", fname )
        with sqlite3.connect( fname ) as conn:
            conn.row_factory = sqlite3.Row
            curs = conn.cursor()
            query = curs.execute( "SELECT * from file_hash" )
            old_file_hashes = [ dict(row) for row in query ]
            logger.debug( "- cached hashes:\n%s", _dump_file_hashes( old_file_hashes, prefix=" " ) )
            curr_file_hashes = _make_file_hashes(
                content_sets, qa_fnames, errata_fnames, user_anno_fname, asop_fnames
            )
            logger.debug( "- curr. hashes:\n%s", _dump_file_hashes( curr_file_hashes, prefix=" " ) )
            if old_file_hashes == curr_file_hashes:
                # the file hashes are the same - flag that we should use the cached database
                logger.info( "Using cached search database: %s", fname )
                _cached_searchdb_fname = fname

    # initialize the database
    if os.path.isfile( _searchdb_fname ):
        os.unlink( _searchdb_fname )
    logger.info( "Creating the search index: %s", _searchdb_fname )
    conn = sqlite3.connect( _searchdb_fname )
    conn.execute( "PRAGMA journal = memory" )
    # NOTE: Storing everything in a single table allows FTS to rank search results based on
    # the overall content, and also lets us do AND/OR queries across all searchable content.
    conn.execute(
        "CREATE VIRTUAL TABLE searchable USING fts5"
        " ( sr_type, cset_id, title, subtitle, content, rulerefs, tokenize='porter unicode61' )"
    )

    # initialize the search index
    logger.info( "Building the search index..." )
    curs = conn.cursor()
    if content_sets:
        _init_content_sets( conn, curs, content_sets, logger )
    if qa:
        _init_qa( curs, qa, logger )
    if errata:
        _init_errata( curs, errata, logger )
    if user_anno:
        _init_user_anno( curs, user_anno, logger )
    if asop:
        _init_asop( curs, asop, asop_preambles, asop_content, logger )
    conn.commit()

    # save the file hashes
    logger.info( "Calculating file hashes..." )
    conn.execute( "CREATE TABLE file_hash ( ftype, fname, hash )" )
    file_hashes = _make_file_hashes(
        content_sets, qa_fnames, errata_fnames, user_anno_fname, asop_fnames
    )
    for fh in file_hashes:
        logger.debug( "- %s/%s = %s", fh["ftype"], fh["fname"], fh["hash"] )
        conn.execute( "INSERT INTO file_hash"
            " ( ftype, fname, hash )"
            " VALUES ( :ftype, :fname, :hash )",
            fh
        )
    conn.commit()

    # register a task for post-fixup processing
    fname = app.config.get( "CACHED_SEARCHDB" )
    if fname:
        def on_post_fixup():
            # check if the database was built using the cached version
            if _cached_searchdb_fname:
                # yup - validate what we built
                _check_searchdb( logger )
            else:
                # nope - save a copy of what we built (for next time)
                # NOTE: While VACUUM INTO is nice, it doesn't seem to work inside a Docker container,
                # and we can't use it anyway, since it may change rowid's :-(
                # NOTE: While SQLite sometimes creates additional files associated with the database:
                #   https://sqlite.org/tempfiles.html
                # I don't think any of these cases apply here, and we can just copy the database file itself.
                logger.info( "Saving a copy of the search database: %s", fname )
                shutil.copyfile( _searchdb_fname, fname )
        from asl_rulebook2.webapp.startup import _add_startup_task
        _add_startup_task( "post-fixup processing", on_post_fixup )

def _check_searchdb( logger ):
    """Compare the newly-built search database with the cached one."""

    with sqlite3.connect( _searchdb_fname ) as conn, sqlite3.connect( _cached_searchdb_fname ) as conn2:

        # check the number of rows
        nrows = _get_row_count( conn, "searchable" )
        nrows2 = _get_row_count( conn2, "searchable" )
        if nrows != nrows2:
            logger.error( "Searchable row count mismatch: got %d, expected %d", nrows, nrows2 )

        # check the row content
        query = "SELECT rowid, * FROM searchable ORDER BY rowid"
        curs = conn.execute( query )
        curs2 = conn2.execute( query )
        for _ in range( nrows ):
            row = curs.fetchone()
            row2 = curs2.fetchone()
            if row != row2:
                logger.error( "Search row mismatch:\n- got: %s\n- expected: %s", row, row2 )

    # NOTE: It would be nice to show an error balloon if we detected any problems here, but since
    # we are running in a startup task, it's too late (the UI will have already called $/startup-msgs).

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _init_content_sets( conn, curs, content_sets, logger ):
    """Add the content sets to the search index."""

    def make_fields( index_entry ):
        return {
            "subtitle": index_entry.get( "subtitle" ),
            "content": index_entry.get( "content" ),
        }

    # add the index entries to the search index
    # IMPORTANT! The insert order must be stable (so that we can match rows in the cached database by rowid).
    sr_type = "index"
    for cset_id in sorted( content_sets.keys() ):
        cset = content_sets[ cset_id ]
        logger.info( "- Adding index file: %s", cset["index_fname"] )
        nrows = 0
        assert isinstance( cset["index"], list )
        for index_entry in cset["index"]:
            rulerefs = _RULEREF_SEPARATOR.join( r.get("caption","") for r in index_entry.get("rulerefs",[]) )
            # NOTE: We should really strip content before adding it to the search index, otherwise any HTML tags
            # will need to be included in search terms. However, this means that the content returned by a query
            # will be this stripped content. We could go back to the original data to get the original HTML content,
            # but that means we would lose the highlighting of search terms that SQLite gives us. We opt to insert
            # the original content, since none of it should contain HTML, anyway.
            fields = make_fields( index_entry )
            curs.execute(
                "INSERT INTO searchable"
                " ( sr_type, cset_id, title, subtitle, content, rulerefs )"
                " VALUES ( ?, ?, ?, ?, ?, ? )", (
                    sr_type, cset["cset_id"],
                    index_entry.get("title"), fields["subtitle"], fields["content"], rulerefs
            ) )
            _fts_index[sr_type][ curs.lastrowid ] = index_entry
            index_entry["_fts_rowid"] = curs.lastrowid
            nrows += 1
        logger.info( " - Added %s.", plural(nrows,"index entry","index entries") )
    assert len(_fts_index[sr_type]) == _get_row_count( conn, "searchable" )

    # register a task to fixup the content
    def fixup_row( rowid, cset_id ):
        index_entry = _fts_index[ sr_type ][ rowid ]
        _tag_ruleids_in_field( index_entry, "subtitle", cset_id )
        _tag_ruleids_in_field( index_entry, "content", cset_id )
        return index_entry
    from asl_rulebook2.webapp.startup import _add_startup_task
    _add_startup_task( "fixup index searchable content",
        lambda: _fixup_searchable_content( sr_type, fixup_row, make_fields )
    )

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _init_qa( curs, qa, logger ):
    """Add the Q+A to the search index."""

    def make_fields( qa_entry ):
        buf = []
        for content in qa_entry.get( "content", [] ):
            buf2 = []
            buf2.append( content.get( "question", _NO_QA_QUESTION ) )
            # NOTE: We don't really want to index answers, since they are mostly not very useful (e.g. "Yes."),
            # but we do so in order to get highlighting for those cases where they contain a search term.
            for answer in content.get( "answers", [] ):
                buf2.append( answer[0] )
            buf.append( _QA_FIELD_SEPARATOR.join( buf2 ) )
        return {
            "title": qa_entry.get( "caption" ),
            "content": _QA_CONTENT_SEPERATOR.join( buf ),
        }

    def unload_fields( qa_entry, fields ):
        """Unload the Q+A entry's fields from the cached search database."""
        qa_entry["caption"] = fields["title"]
        contents = fields["content"].split( _QA_CONTENT_SEPERATOR )
        for content_no, content in enumerate( contents ):
            fields = content.split( _QA_FIELD_SEPARATOR )
            if fields[0] != _NO_QA_QUESTION:
                qa_entry["content"][content_no]["question"] = fields[0]
            for field_no in range( 1, len(fields) ):
                qa_entry["content"][content_no]["answers"][field_no-1][0] = fields[ field_no ]

    logger.info( "- Adding the Q+A." )
    nrows = 0
    sr_type = "qa"
    # IMPORTANT! The insert order must be stable (so that we can match rows in the cached database by rowid).
    for qa_key in sorted( qa.keys() ):
        qa_entries = qa[ qa_key ]
        assert isinstance( qa_entries, list )
        for qa_entry in qa_entries:
            fields = make_fields( qa_entry )
            curs.execute(
                "INSERT INTO searchable ( sr_type, title, content ) VALUES ( ?, ?, ? )", (
                    sr_type, fields["title"], fields["content"]
            ) )
            _fts_index[sr_type][ curs.lastrowid ] = qa_entry
            qa_entry["_fts_rowid"] = curs.lastrowid
            nrows += 1
    logger.info( " - Added %s.", plural(nrows,"Q+A entry","Q+A entries") )

    # register a task to fixup the content
    def fixup_row( rowid, cset_id ):
        qa_entry = _fts_index[ sr_type ][ rowid ]
        _tag_ruleids_in_field( qa_entry, "caption", cset_id )
        for content in qa_entry.get( "content", [] ):
            _tag_ruleids_in_field( content, "question", cset_id )
            for answer in content.get( "answers", [] ):
                _tag_ruleids_in_field( answer, 0, cset_id )
        return qa_entry
    from asl_rulebook2.webapp.startup import _add_startup_task
    _add_startup_task( "fixup Q+A searchable content",
        lambda: _fixup_searchable_content( sr_type, fixup_row, make_fields, unload_fields=unload_fields )
    )

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _init_errata( curs, errata, logger ):
    """Add the errata to the search index."""
    logger.info( "- Adding the errata." )
    nrows = _do_init_anno( curs, errata, "errata" )
    logger.info( " - Added %s.", plural(nrows,"errata entry","errata entries") )

def _init_user_anno( curs, user_anno, logger ):
    """Add the user-defined annotations to the search index."""
    logger.info( "- Adding the annotations." )
    nrows = _do_init_anno( curs, user_anno, "user-anno" )
    logger.info( " - Added %s.", plural(nrows,"annotation","annotations") )

def _do_init_anno( curs, anno, atype ):
    """Add annotations to the search index."""

    def make_fields( anno ):
        return {
            "content": anno.get( "content" ),
        }

    # add the annotations to the search index
    # IMPORTANT! The insert order must be stable (so that we can match rows in the cached database by rowid).
    nrows = 0
    sr_type = atype
    for ruleid in sorted( anno, key=str ):
        assert isinstance( anno[ruleid], list )
        for a in anno[ruleid]:
            fields = make_fields( a )
            curs.execute(
                "INSERT INTO searchable ( sr_type, content ) VALUES ( ?, ? )", (
                    sr_type, fields["content"]
            ) )
            _fts_index[sr_type][ curs.lastrowid ] = a
            a["_fts_rowid"] = curs.lastrowid
            nrows += 1

    # register a task to fixup the content
    def fixup_row( rowid, cset_id ):
        anno = _fts_index[ sr_type ][ rowid ]
        _tag_ruleids_in_field( anno, "content", cset_id )
        return anno
    from asl_rulebook2.webapp.startup import _add_startup_task
    _add_startup_task( "fixup {} searchable content".format( atype ),
        lambda: _fixup_searchable_content( sr_type, fixup_row, make_fields )
    )

    return nrows

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _init_asop( curs, asop, asop_preambles, asop_content, logger ):
    """Add the ASOP to the search index."""

    logger.info( "- Adding the ASOP." )
    sr_type = "asop-entry"
    fixup_sections = []
    nentries = 0
    # IMPORTANT! The insert order must be stable (so that we can match rows in the cached database by rowid).
    for chapter in asop.get( "chapters", [] ):
        for section in chapter.get( "sections", [] ):
            content = asop_content.get( section["section_id"] )
            if not content:
                continue
            fixup_sections.append( section )
            entries = _extract_section_entries( content )
            # NOTE: The way we manage the FTS index for ASOP entries is a little different to normal,
            # since they don't exist as individual entities (this is the only place where they do,
            # so that we can return them as individual search results). Each database row points
            # to the parent section, and the section has a list of FTS rows for its child entries.
            section[ "_fts_rowids" ] = []
            assert isinstance( entries, list )
            for entry in entries:
                curs.execute(
                    "INSERT INTO searchable ( sr_type, content ) VALUES ( ?, ? )", (
                        sr_type, entry
                ) )
                _fts_index[sr_type][ curs.lastrowid ] = [ section, entry ]
                section[ "_fts_rowids" ].append( curs.lastrowid )
                nentries += 1
    logger.info( " - Added %s.", plural(nentries,"entry","entries") )

    # register a task to fixup the content
    def fixup_content():
        _fixup_searchable_content( sr_type, fixup_row, make_fields )
        # we also need to fixup the in-memory data structures
        if _cached_searchdb_fname is None:
            cset_id = None
            # NOTE: ASOP sections are divided up into individual entries, and each entry stored as a separate
            # searchable row, which means that we would have to reconstitute the sections from these rows
            # when they are read back from a cached database. While it's maybe possible to do this, it's safer
            # to just store the fixed-up sections verbatim.
            with sqlite3.connect( _searchdb_fname ) as conn:
                conn.execute( "CREATE TABLE fixedup_asop_preamble ( chapter_id, content )" )
                conn.execute( "CREATE TABLE fixedup_asop_section ( section_id, content )" )
                for chapter_id in asop_preambles:
                    _tag_ruleids_in_field( asop_preambles, chapter_id, cset_id )
                    conn.execute( "INSERT INTO fixedup_asop_preamble ( chapter_id, content ) VALUES ( ?, ? )", (
                        chapter_id, asop_preambles[chapter_id]
                    ) )
                for section in fixup_sections:
                    section_id = section["section_id"]
                    _tag_ruleids_in_field( asop_content, section_id, cset_id )
                    conn.execute( "INSERT INTO fixedup_asop_section ( section_id, content ) VALUES ( ?, ? )", (
                        section_id, asop_content[section_id]
                    ) )
                conn.commit()
        else:
            # restore the fixed-up ASOP content into the in-memory objects
            with sqlite3.connect( _cached_searchdb_fname ) as conn:
                for row in conn.execute( "SELECT chapter_id, content FROM fixedup_asop_preamble" ):
                    asop_preambles[ row[0] ] = row[1]
                for row in conn.execute( "SELECT section_id, content FROM fixedup_asop_section" ):
                    asop_content[ row[0] ] = row[1]

    def fixup_row( rowid, cset_id ):
        entry = _fts_index[ sr_type ][ rowid ].pop()
        entry = tag_ruleids( entry, cset_id )
        return entry
    def make_fields( entry ):
        return { "content": entry }
    from asl_rulebook2.webapp.startup import _add_startup_task
    _add_startup_task( "fixup ASOP searchable content", fixup_content )

def _extract_section_entries( content ):
    """Separate out each entry from the section's content."""
    entries = []
    fragment = lxml.html.fragment_fromstring(
        "<div> {} </div>".format( content )
    )
    for elem in fragment.xpath( ".//div[contains(@class,'entry')]" ):
        if "entry" not in elem.attrib["class"].split():
            continue
        entry = lxml.html.tostring( elem )
        entries.append( entry.decode( "utf-8" ) )
    if not entries:
        # NOTE: If the content hasn't been divided into entries, we return the whole thing as
        # one big entry, which will kinda suck as a search result if it's big, but it's better
        # than not seeing anything at all.
        return [ content ]
    return entries
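
# Illustrative example: section content of the form
#   <div class="entry"> ... </div> <div class="entry"> ... </div>
# yields one searchable row per "entry" <div>; content without any such <div>'s is indexed as a single entry.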

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def _make_file_hashes( content_sets, qa_fnames, errata_fnames, user_anno_fname, asop_fnames ):
    """Generate hashes for the files that are used to populate the search index."""

    file_hashes = []
    def add_file( fh_type, fname ):
        with open( fname, "rb" ) as fp:
            hashval = hashlib.md5( fp.read() ).hexdigest()
        file_hashes.append( {
            "ftype": fh_type,
            "fname": os.path.basename( fname ),
            "hash": hashval
        } )

    # add each file to the table
    if content_sets:
        for cset_id, cset in content_sets.items():
            add_file( "index:{}".format(cset_id), cset["index_fname"] )
    if qa_fnames:
        for fname in qa_fnames:
            add_file( "q+a", fname )
    if errata_fnames:
        for fname in errata_fnames:
            add_file( "errata", fname )
    if user_anno_fname:
        add_file( "user-anno", user_anno_fname )
    if asop_fnames:
        for fname in asop_fnames:
            add_file( "asop", fname )

    file_hashes.sort(
        key = lambda row: ( row["ftype"], row["fname"] )
    )
    return file_hashes

def _dump_file_hashes( file_hashes, prefix="" ):
    """Dump file hashes."""
    if not file_hashes:
        return ""
    max_ftype_len = max( len(fh["ftype"]) for fh in file_hashes )
    max_fname_len = max( len(fh["fname"]) for fh in file_hashes )
    fmt = prefix + "{ftype:<%d} | {fname:<%d} | {hash}" % ( max_ftype_len, max_fname_len )
    buf = io.StringIO()
    for fh in file_hashes:
        print( fmt.format( **fh ), file=buf )
    return buf.getvalue().rstrip()
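
# Illustrative output (the file names and hashes are made up):
#   index:asl | asl-index.json | 9e107d9d372bb6826bd81d3542a419d6
#   q+a       | qa.json        | e4d909c290d0fb1ca068ffaddf22cbd0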

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def load_search_config( startup_msgs, logger ):
    """Load the search config."""

    # initialize
    global _SEARCH_TERM_ADJUSTMENTS
    _SEARCH_TERM_ADJUSTMENTS = {}

    def add_search_term_adjustment( key, vals ):
        # make sure everything is lower-case
        key = key.lower()
        if isinstance( vals, str ):
            vals = vals.lower()
        elif isinstance( vals, set ):
            vals = set( v.lower() for v in vals )
        else:
            assert False, "Unknown search alias type: {}".format( type(vals) )
        # add the new search term adjustment
        if key not in _SEARCH_TERM_ADJUSTMENTS:
            _SEARCH_TERM_ADJUSTMENTS[ key ] = vals
        else:
            # found a multiple definition - try to do something sensible
            logger.warning( " - Duplicate search alias: %s\n- current aliases = %s\n- new aliases = %s", key,
                _SEARCH_TERM_ADJUSTMENTS[key], vals
            )
            if isinstance( _SEARCH_TERM_ADJUSTMENTS[key], str ):
                _SEARCH_TERM_ADJUSTMENTS[ key ] = vals
            else:
                assert isinstance( _SEARCH_TERM_ADJUSTMENTS[key], set )
                _SEARCH_TERM_ADJUSTMENTS[ key ].update( vals )

    # load the search replacements
    def load_search_replacements( fname, ftype ):
        if fname is None or not os.path.isfile( fname ):
            return
        logger.info( "Loading %s search replacements: %s", ftype, fname )
        try:
            with open( fname, "r", encoding="utf-8" ) as fp:
                data = json.load( fp )
        except Exception as ex: #pylint: disable=broad-except
            startup_msgs.warning( "Can't load {} search replacements.".format( ftype ), str(ex) )
            return
        nitems = 0
        for key, val in data.items():
            if key.startswith( "_" ):
                continue # nb: ignore comments
            logger.debug( "- %s -> %s", key, val )
            add_search_term_adjustment( key, val )
            nitems += 1
        logger.info( "- Loaded %s.", plural(nitems,"search replacement","search replacements") )
    load_search_replacements( make_config_path( "search-replacements.json" ), "default" )
    load_search_replacements( make_data_path( "search-replacements.json" ), "user" )

    # load the search aliases
    def load_search_aliases( fname, ftype ):
        if fname is None or not os.path.isfile( fname ):
            return
        logger.info( "Loading %s search aliases: %s", ftype, fname )
        try:
            with open( fname, "r", encoding="utf-8" ) as fp:
                data = json.load( fp )
        except Exception as ex: #pylint: disable=broad-except
            startup_msgs.warning( "Can't load {} search aliases.".format( ftype ), str(ex) )
            return
        nitems = 0
        for keys, aliases in data.items():
            if keys.startswith( "_" ):
                continue # nb: ignore comments
            logger.debug( "- %s -> %s", keys, " ; ".join(aliases) )
            for key in keys.split( "/" ):
                add_search_term_adjustment( key, set( itertools.chain( aliases, [key] ) ) )
            nitems += 1
        logger.info( "- Loaded %s.", plural(nitems,"search alias","search aliases") )
    load_search_aliases( make_config_path( "search-aliases.json" ), "default" )
    load_search_aliases( make_data_path( "search-aliases.json" ), "user" )

    # load the search synonyms
    def load_search_synonyms( fname, ftype ):
        if fname is None or not os.path.isfile( fname ):
            return
        logger.info( "Loading %s search synonyms: %s", ftype, fname )
        try:
            with open( fname, "r", encoding="utf-8" ) as fp:
                data = json.load( fp )
        except Exception as ex: #pylint: disable=broad-except
            startup_msgs.warning( "Can't load {} search synonyms.".format( ftype ), str(ex) )
            return
        nitems = 0
        for synonyms in data:
            if isinstance( synonyms, str ):
                continue # nb: ignore comments
            logger.debug( "- %s", " ; ".join(synonyms) )
            synonyms = set( synonyms )
            for term in synonyms:
                add_search_term_adjustment( term, synonyms )
            nitems += 1
        logger.info( "- Loaded %s.", plural(nitems,"search synonym","search synonyms") )
    load_search_synonyms( make_config_path( "search-synonyms.json" ), "default" )
    load_search_synonyms( make_data_path( "search-synonyms.json" ), "user" )
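
# Illustrative config file formats (the entries are made up; keys starting with "_" are treated as comments):
#   search-replacements.json:  { "untl": "until" }                     -- replace one term with another
#   search-aliases.json:       { "mg/machinegun": [ "machine gun" ] }  -- each key matches itself plus its aliases
#   search-synonyms.json:      [ [ "armor", "armour" ] ]               -- every term matches the whole group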

# ---------------------------------------------------------------------

def _fixup_searchable_content( sr_type, fixup_row, make_fields, unload_fields=None ):
    """Fixup the searchable content for the specified search result type."""

    # initialize
    conn = sqlite3.connect( _searchdb_fname )
    conn.row_factory = sqlite3.Row
    curs = conn.cursor()

    # check if we have a cached database to retrieve values from
    cached_searchdb_conn = None
    if _cached_searchdb_fname:
        cached_searchdb_conn = sqlite3.connect( _cached_searchdb_fname )
        cached_searchdb_conn.row_factory = sqlite3.Row

    # update the searchable content in each row
    nrows = 0
    last_commit_time = time.time()
    query = conn.execute( "SELECT rowid, cset_id FROM searchable WHERE sr_type=?",
        ( sr_type, )
    )
    for row in query:

        # prepare the row
        row = dict( row )
        nrows += 1

        # fixup the searchable row
        if cached_searchdb_conn:
            # find the corresponding row in the cached database
            # IMPORTANT! This relies on the 2 rows having the same rowid.
            cached_row = dict( cached_searchdb_conn.execute(
                "SELECT * FROM searchable WHERE rowid=?", (row["rowid"],)
            ).fetchone() )
            _restore_cached_searchable_row( row, sr_type, make_fields, unload_fields, cached_row, curs )
        else:
            _fixup_searchable_row( row, fixup_row, make_fields, curs )

        # commit the changes regularly (so that they are available to the front-end)
        if time.time() - last_commit_time >= 1:
            conn.commit()
            last_commit_time = time.time()

    # commit the last block of updates
    conn.commit()

    return plural( nrows, "row", "rows" )

def _fixup_searchable_row( row, fixup_row, make_fields, curs ):
    """Fix up a single row in the searchable table."""

    # NOTE: The fixup_row() callback will usually be using _tag_ruleids_in_field(), which manages
    # the lock; otherwise the callback needs to do it itself. We don't want to invoke this callback
    # inside the lock since it can be quite slow; _tag_ruleids_in_field() holds the lock for the
    # minimum amount of time.
    new_row = fixup_row( row["rowid"], row["cset_id"] )

    with _fixup_content_lock:

        # NOTE: The make_fields() callback will usually be accessing the fields we want to fixup,
        # so we need to protect them with the lock.
        fields = make_fields( new_row )

        # NOTE: We update the row inside the lock to prevent "database is locked" errors, if the user
        # tries to do a search while this is happening.
        query = "UPDATE searchable SET {} WHERE rowid={}".format(
            ", ".join( "{}=?".format( f ) for f in fields ),
            row["rowid"]
        )
        curs.execute( query, tuple(fields.values()) )

def _restore_cached_searchable_row( row, sr_type, make_fields, unload_fields, cached_row, curs ):
    """Restore a searchable row from the cached database."""

    # get the in-memory object corresponding to the next searchable row
    obj = _fts_index[ sr_type ][ row["rowid"] ]
    fields = make_fields( obj )

    # figure out which fields need to be updated
    if sr_type == "asop-entry":
        # flag that the content field in the searchable row needs to be updated
        assert list( fields.keys() ) == [ "content" ]
        update_fields = { "content": cached_row["content"] }
        # NOTE: We can't update the in-memory ASOP sections here (since the searchable rows contain
        # individual section entries that have been separated out - see _extract_section_entries()),
        # so we do this in the "fixup asop" task.
    else:
        update_fields = [
            f for f in fields
            if obj.get( f ) != cached_row[f]
        ]

    # update the fields
    if update_fields:
        # NOTE: We need to update the in-memory objects to support $/rule-info.
        if sr_type in ("errata", "qa", "user-anno"):
            if unload_fields:
                # let the caller update the in-memory object
                unload_fields( obj, { f: cached_row[f] for f in fields } )
            else:
                # update the in-memory object ourself
                for field in update_fields:
                    obj[ field ] = cached_row[ field ]
        # update the searchable row
        with _fixup_content_lock:
            query = "UPDATE searchable SET {} WHERE rowid={}".format(
                ", ".join( "{}=?".format( f ) for f in update_fields ),
                row["rowid"]
            )
            curs.execute( query, tuple(
                cached_row[f] for f in update_fields
            ) )

_last_sleep_time = 0

def _tag_ruleids_in_field( obj, key, cset_id ):
    """Tag ruleid's in an optional field."""
    if not isinstance( key, int ) and key not in obj:
        return
    # NOTE: The data structures we use to manage all the in-memory objects never change after
    # they have been loaded, so the only thread-safety we need to worry about is when we read
    # the original value from an object, and when we update it with a new value. The actual process
    # of tagging ruleid's in a piece of content is done outside the lock, since it's quite slow.
    with _fixup_content_lock:
        val = obj[key]
    new_val = tag_ruleids( val, cset_id )
    with _fixup_content_lock:
        obj[key] = new_val
    # FUDGE! Give other threads a chance to run :-/
    global _last_sleep_time
    if time.time() - _last_sleep_time > 1:
        time.sleep( 0.1 )
        _last_sleep_time = time.time()

def _get_row_count( conn, table_name ):
    """Get the number of rows in a table."""
    cur = conn.execute( "SELECT count(*) FROM {}".format( table_name ) )
    return cur.fetchone()[0]