Implemented a basic search engine.

master
Pacman Ghost 3 years ago
parent 9d2495aa64
commit b387871bbe
  1. 78
      asl_rulebook2/utils.py
  2. 20
      asl_rulebook2/webapp/__init__.py
  3. 1
      asl_rulebook2/webapp/config/constants.py
  4. 25
      asl_rulebook2/webapp/config/search-aliases.json
  5. 14
      asl_rulebook2/webapp/config/search-replacements.json
  6. 51
      asl_rulebook2/webapp/config/search-synonyms.json
  7. 37
      asl_rulebook2/webapp/content.py
  8. 5
      asl_rulebook2/webapp/main.py
  9. 475
      asl_rulebook2/webapp/search.py
  10. 33
      asl_rulebook2/webapp/static/ContentPane.js
  11. 40
      asl_rulebook2/webapp/static/MainApp.js
  12. 15
      asl_rulebook2/webapp/static/NavPane.js
  13. 60
      asl_rulebook2/webapp/static/SearchPane.js
  14. 83
      asl_rulebook2/webapp/static/SearchResult.js
  15. 6
      asl_rulebook2/webapp/static/TabbedPages.js
  16. 2
      asl_rulebook2/webapp/static/css/SearchPane.css
  17. 14
      asl_rulebook2/webapp/static/css/SearchResult.css
  18. 18
      asl_rulebook2/webapp/static/utils.js
  19. 1
      asl_rulebook2/webapp/templates/index.html
  20. 17
      asl_rulebook2/webapp/tests/fixtures/simple/simple.index
  21. BIN
      asl_rulebook2/webapp/tests/fixtures/simple/simple.pdf
  22. 18
      asl_rulebook2/webapp/tests/fixtures/simple/simple.targets
  23. 298
      asl_rulebook2/webapp/tests/test_search.py
  24. 21
      asl_rulebook2/webapp/tests/utils.py
  25. 16
      asl_rulebook2/webapp/utils.py
  26. 66
      bin/add_pdf_dests.py
  27. 2
      bin/extract_pages.py

@ -1,8 +1,82 @@
""" Miscellaneous utilities. """
import os
import pathlib
import tempfile
import re
import math
from io import StringIO
from html.parser import HTMLParser
# ---------------------------------------------------------------------
class TempFile:
    """A named temp file that survives being closed.

    The underlying file is created with delete=False, so it can be closed
    (e.g. so that something else can open it by name) and only gets removed
    when close() is called with delete=True, or the context manager exits.
    """

    def __init__( self, mode="wb", extn=None, encoding=None ):
        # NOTE: Nothing is created on disk until open() is called.
        self.temp_file = None
        self.name = None
        self.mode = mode
        self.extn = extn
        self.encoding = encoding

    def __enter__( self ):
        """Create the temp file when entering the context manager."""
        self.open()
        return self

    def __exit__( self, exc_type, exc_val, exc_tb ):
        """Close and remove the temp file when leaving the context manager."""
        self.close( delete=True )

    def open( self ):
        """Create the temp file on disk, and remember its name."""
        assert self.temp_file is None
        # text-mode files default to UTF-8, binary-mode files have no encoding
        if self.encoding:
            file_encoding = self.encoding
        elif "b" in self.mode:
            file_encoding = None
        else:
            file_encoding = "utf-8"
        # nb: delete=False keeps the file around after it has been closed
        self.temp_file = tempfile.NamedTemporaryFile(
            mode=self.mode, encoding=file_encoding, suffix=self.extn, delete=False
        )
        self.name = self.temp_file.name

    def write( self, data ):
        """Append data to the temp file."""
        self.temp_file.write( data )

    def close( self, delete ):
        """Close the temp file, optionally removing it from disk as well."""
        self.temp_file.close()
        if not delete:
            return
        os.unlink( self.temp_file.name )
# ---------------------------------------------------------------------
def strip_html( val ):
    """Strip HTML markup from a string, returning only its text content.

    Falsy values (None, empty string) are returned unchanged. Character
    references (e.g. "&amp;") are decoded by the parser before the text
    is collected.
    """
    if not val:
        return val

    buf = StringIO()

    class StripHtml( HTMLParser ):
        """HTML parser that collects text nodes and discards everything else."""
        def handle_data( self, data ):
            buf.write( data )
        def error( self, message ):
            # nb: older versions of HTMLParser call this on bad markup; we just carry on
            pass

    # strip HTML
    html_stripper = StripHtml()
    html_stripper.feed( val )
    # NOTE: We must close the parser, otherwise text that is still being buffered (e.g. trailing text
    # that contains an "&" that might be the start of a character reference) will be silently dropped.
    html_stripper.close()
    return buf.getvalue()
# ---------------------------------------------------------------------
@ -99,6 +173,10 @@ def append_text( buf, new ):
buf += " "
return buf + new
def plural( n, name1, name2 ):
    """Format a count with its noun, using name1 for exactly one item and name2 otherwise."""
    noun = name2
    if n == 1:
        noun = name1
    return "{} {}".format( n, noun )
def remove_quotes( val ):
"""Remove enclosing quotes from a string."""
if val[0] in ('"',"'") and val[-1] == val[0]:

@ -11,7 +11,7 @@ from flask import Flask
import flask.cli
import yaml
from asl_rulebook2.webapp.config.constants import BASE_DIR
from asl_rulebook2.webapp.config.constants import BASE_DIR, CONFIG_DIR
shutdown_event = threading.Event()
@ -19,6 +19,7 @@ shutdown_event = threading.Event()
def _load_config( fname, section ):
"""Load config settings from a file."""
fname = os.path.join( CONFIG_DIR, fname )
if not os.path.isfile( fname ):
return
config_parser = configparser.ConfigParser()
@ -50,21 +51,12 @@ flask.cli.show_server_banner = lambda *args: None
app = Flask( __name__ )
# load the application configuration
config_dir = os.path.join( BASE_DIR, "config" )
_fname = os.path.join( config_dir, "app.cfg" )
_load_config( _fname, "System" )
# load any site configuration
_fname = os.path.join( config_dir, "site.cfg" )
_load_config( _fname, "Site Config" )
# load any debug configuration
_fname = os.path.join( config_dir, "debug.cfg" )
if os.path.isfile( _fname ) :
_load_config( _fname, "Debug" )
_load_config( "app.cfg", "System" )
_load_config( "site.cfg", "Site Config" )
_load_config( "debug.cfg", "Debug" )
# initialize logging
_fname = os.path.join( config_dir, "logging.yaml" )
_fname = os.path.join( CONFIG_DIR, "logging.yaml" )
if os.path.isfile( _fname ):
with open( _fname, "r", encoding="utf-8" ) as fp:
try:

@ -7,3 +7,4 @@ APP_VERSION = "v0.1" # nb: also update setup.py
APP_DESCRIPTION = "Search engine for the ASL Rulebook."
BASE_DIR = os.path.abspath( os.path.join( os.path.dirname(__file__), ".." ) )
CONFIG_DIR = os.path.join( BASE_DIR, "config" )

@ -0,0 +1,25 @@
{
"_comment_": "This file defines search aliases.",
"_comment_": "Keys that appear in a query string will match itself or any of its associated values.",
"_comment_": " e.g. searching for 'entrenchments' will actually search for 'entrenchments OR foxhole OR trench OR ditch'",
"_comment_": "These differ from search synonyms in that only the key word will trigger the replacement, not any word from the set.",
"_comment_": "A user-defined version of this file in the data directory will also be loaded.",
"latw": [
"atmm", "atr", "baz", "mol-p", "mol-projector", "piat", "pf", "pfk", "psk"
],
"fortification/fortifications": [
"cave", "a-t ditch", "foxhole", "sangar", "trench", "bunker", "minefield", "mines", "booby trap", "panji", "pillbox", "roadblock", "tetrahedron", "wire"
],
"entrenchment/entrenchments": [
"foxhole", "trench", "ditch"
],
"vehicle/vehicles": [
"tank", "halftrack", "half-track", "jeep", "carrier"
],
"illumination": [
"tarshell", "illuminating round", "trip flare"
]
}

@ -0,0 +1,14 @@
{
"_comment_": "This file defines search replacements.",
"_comment_": "Keys that appear in a query string will be replaced by the value.",
"_comment_": " e.g. searching for '1/2 MF' will actually search for '½ MF'",
"_comment_": "A user-defined version of this file in the data directory will also be loaded.",
"1/2": "½",
"3/4": "¾",
"3/8": "⅜",
"5/8": "⅝",
"(r)": "®"
}

@ -0,0 +1,51 @@
[
"This file defines search synonyms.",
"If a word appears in a query string, it will match any of the words in its set.",
" e.g. searching for 'finn gun' will actually search for '(finn OR finnish) AND gun'",
"These differ from search aliases in that any word from a set will trigger the replacement.",
"A user-defined version of this file in the data directory will also be loaded.",
[ "u.s.", "america", "american" ],
[ "usmc", "marine" ],
[ "finn", "finnish" ],
[ "romania", "romanian" ],
[ "hungary", "hungarian" ],
[ "slovakia", "slovakian" ],
[ "croatia", "croatian" ],
[ "bulgaria", "bulgarian" ],
[ "dc", "demo charge", "demolition charge" ],
[ "ft", "flamethrower", "flame-thrower" ],
[ "baz", "bazooka" ],
[ "pf", "panzerfaust" ],
[ "psk", "panzerschreck" ],
[ "wp", "white phosphorus" ],
[ "mol", "molotov cocktail" ],
[ "ovr", "overrun" ],
[ "cc", "close combat" ],
[ "thh", "t-h hero", "tank-hunter hero" ],
[ "scw", "shaped-charge weapon" ],
[ "sw", "support weapon" ],
[ "mg", "machinegun", "machine-gun", "machine gun" ],
[ "firelane", "fire-lane", "fire lane" ],
[ "firegroup", "fire-group", "fire group" ],
[ "lc", "landing craft" ],
[ "ht", "halftrack", "half-track" ],
[ "wa", "wall advantage" ],
[ "hob", "heat of battle" ],
[ "cg", "campaign game" ],
[ "pbm", "pbem" ],
[ "rb", "red barricades" ],
[ "votg", "valor of the guards" ],
[ "kgp", "kampfgruppe peiper" ],
[ "kgs", "kampfgruppe scherer" ],
[ "brt", "br:t", "blood reef tarawa" ],
[ "pb", "pegasus bridge" ],
[ "ammo", "ammunition" ],
[ "armor", "armour" ],
[ "color", "colour" ]
]

@ -2,6 +2,7 @@
import os
import io
import json
import glob
from flask import jsonify, send_file, url_for, abort
@ -13,7 +14,7 @@ content_docs = None
# ---------------------------------------------------------------------
def load_content_docs():
def load_content_docs( logger ):
"""Load the content documents from the data directory."""
# initialize
@ -29,26 +30,32 @@ def load_content_docs():
fname = os.path.join( dname, fname )
if not os.path.isfile( fname ):
return
kwargs = {}
kwargs["mode"] = "rb" if binary else "r"
if not binary:
kwargs["encoding"] = "utf-8"
with open( fname, **kwargs ) as fp:
content_doc[ key ] = fp.read()
if binary:
with open( fname, mode="rb" ) as fp:
data = fp.read()
logger.debug( "- Loaded \"%s\" file: #bytes=%d", key, len(data) )
content_doc[ key ] = data
else:
with open( fname, "r", encoding="utf-8" ) as fp:
content_doc[ key ] = json.load( fp )
logger.debug( "- Loaded \"%s\" file.", key )
# load each content doc
logger.info( "Loading content docs: %s", dname )
fspec = os.path.join( dname, "*.index" )
for fname in glob.glob( fspec ):
fname = os.path.basename( fname )
title = os.path.splitext( fname )[0]
fname2 = os.path.basename( fname )
logger.info( "- %s", fname2 )
title = os.path.splitext( fname2 )[0]
content_doc = {
"_fname": fname,
"doc_id": slugify( title ),
"title": title,
}
get_doc( content_doc, "index", fname )
get_doc( content_doc, "targets", change_extn(fname,".targets") )
get_doc( content_doc, "footnotes", change_extn(fname,".footnotes") )
get_doc( content_doc, "content", change_extn(fname,".pdf"), binary=True )
get_doc( content_doc, "index", fname2 )
get_doc( content_doc, "targets", change_extn(fname2,".targets") )
get_doc( content_doc, "footnotes", change_extn(fname2,".footnotes") )
get_doc( content_doc, "content", change_extn(fname2,".pdf"), binary=True )
content_docs[ content_doc["doc_id"] ] = content_doc
# ---------------------------------------------------------------------
@ -59,11 +66,13 @@ def get_content_docs():
resp = {}
for cdoc in content_docs.values():
cdoc2 = {
"docId": cdoc["doc_id"],
"doc_id": cdoc["doc_id"],
"title": cdoc["title"],
}
if "content" in cdoc:
cdoc2["url"] = url_for( "get_content", doc_id=cdoc["doc_id"] )
if "targets" in cdoc:
cdoc2["targets"] = cdoc["targets"]
resp[ cdoc["doc_id"] ] = cdoc2
return jsonify( resp )

@ -9,6 +9,7 @@ from flask import render_template, jsonify, abort
from asl_rulebook2.webapp import app, globvars, shutdown_event
from asl_rulebook2.webapp.content import load_content_docs
from asl_rulebook2.webapp.search import init_search
from asl_rulebook2.webapp.utils import parse_int
# ---------------------------------------------------------------------
@ -20,7 +21,9 @@ def init_webapp():
after that by the test suite, to reset the webapp before each test.
"""
# initialize the webapp
load_content_docs()
logger = logging.getLogger( "startup" )
load_content_docs( logger )
init_search( logger )
# ---------------------------------------------------------------------

@ -0,0 +1,475 @@
""" Manage the search engine. """
import os
import sqlite3
import json
import re
import itertools
import string
import tempfile
import logging
import traceback
from flask import request, jsonify
from asl_rulebook2.utils import plural
from asl_rulebook2.webapp import app
from asl_rulebook2.webapp import content as webapp_content
from asl_rulebook2.webapp.utils import make_config_path, make_data_path
_sqlite_path = None
_fts_index_entries= None
_logger = logging.getLogger( "search" )
# these are used to highlight search matches (nb: the front-end looks for these)
_BEGIN_HIGHLIGHT = "!@:"
_END_HIGHLIGHT = ":@!"
# NOTE: These regex's fix up content returned to us by the SQLite search engine (typically problems
# with highlighting search terms).
_FIXUP_TEXT_REGEXES = [
    # each entry is a [ compiled regex, replacement string ] pair, with the highlight markers
    # substituted into both the pattern and the replacement
    [ re.compile( fixup[0].format( _BEGIN_HIGHLIGHT, _END_HIGHLIGHT ) ),
      fixup[1].format( _BEGIN_HIGHLIGHT, _END_HIGHLIGHT )
    ]
    for fixup in [
        # nb: in the examples below, (( and )) stand for the begin/end highlight markers
        [ r"&{}(.+?){};", r"{}&\g<1>;{}" ], # HTML entities e.g. &((frac12)); -> ((&frac12;))
        [ r"{}(.+?){}#", r"{}\g<1>#{}" ], # e.g. ((TH))# -> ((TH#))
        [ r"{}U\.S{}\.", "{}U.S.{}" ], # ((U.S)). -> ((U.S.))
    ]
]
# these are used to separate ruleref's in the FTS table (internal use only)
_RULEREF_SEPARATOR = "-:-"
_SEARCH_TERM_ADJUSTMENTS = None
# ---------------------------------------------------------------------
@app.route( "/search", methods=["POST"] )
def search():
    """Run a search.

    The request's form fields are passed through to _do_search(). If the search
    fails for any reason, the error is logged and a JSON object of the form
    { "error": message } is returned instead of the search results.
    """

    # log the request
    _logger.info( "SEARCH REQUEST:" )
    args = dict( request.form.items() )
    for key, val in args.items():
        _logger.info( "- %s: %s", key, val )

    # run the search
    try:
        return _do_search( args )
    except Exception as exc: #pylint: disable=broad-except
        msg = str( exc )
        # nb: FTS errors are reported as a sqlite3.OperationalError, with this prefix
        prefix = "fts5: "
        if msg.startswith( prefix ):
            # NOTE: We remove the entire prefix (including the trailing space), so that
            # the message shown to the user doesn't start with a stray space.
            msg = msg[ len(prefix) : ]
        _logger.warning( "SEARCH ERROR: %s\n%s", args, traceback.format_exc() )
        return jsonify( { "error": msg } )
def _do_search( args ):
    """Run a search against the FTS index and return the results as a JSON response.

    args is a dict of request parameters; args["queryString"] is the user's query.
    Each result contains the doc ID, the search result type, a score, any highlighted
    title/subtitle/content, and the ruleids/see-also/rulerefs of the matching index entry.

    Raises RuntimeError for the special "simulated error" query (used by the test suite).
    Errors raised by SQLite (e.g. a malformed FTS query) are not caught here.
    """

    def fixup_text( val ):
        """Fix up problems in the text returned to us by SQLite."""
        if val is None:
            return None
        for regex in _FIXUP_TEXT_REGEXES:
            val = regex[0].sub( regex[1], val )
        return val

    def highlight( n ):
        # NOTE: highlight() is an FTS extension function, and takes column numbers :-/
        return "highlight(searchable,{},'{}','{}')".format( n, _BEGIN_HIGHLIGHT, _END_HIGHLIGHT )

    def get_col( sr, key, val ):
        """Store a (fixed-up) column value in the search result, if it has a value."""
        if val:
            sr[key] = fixup_text( val )

    # prepare the search
    query_string = args[ "queryString" ].strip()
    if query_string == "!:simulated-error:!":
        raise RuntimeError( "Simulated error." ) # nb: for the test suite
    fts_query_string, search_terms = _make_fts_query_string( query_string )
    _logger.debug( "FTS query string: %s", fts_query_string )
    sql = "SELECT rowid,doc_id,sr_type,rank,{},{},{},{} FROM searchable".format(
        highlight(2), highlight(3), highlight(4), highlight(5)
    )
    sql += " WHERE searchable MATCH ?"
    sql += " ORDER BY rank"

    # run the search
    # NOTE: We make sure the database connection is always closed, even if the query fails,
    # otherwise we leak a connection every time a search is done.
    results = []
    conn = sqlite3.connect( _sqlite_path )
    try:
        curs = conn.execute( sql,
            ( "{title subtitle content rulerefs}: " + fts_query_string, )
        )

        # get the results
        for row in curs:
            if row[2] != "index":
                _logger.error( "Unknown searchable row type (rowid=%d): %s", row[0], row[2] )
                continue
            index_entry = _fts_index_entries[ row[0] ]
            result = {
                "doc_id": row[1],
                "sr_type": row[2],
                "_score": - row[3],
            }
            get_col( result, "title", row[4] )
            get_col( result, "subtitle", row[5] )
            get_col( result, "content", row[6] )
            if index_entry.get( "ruleids" ):
                result["ruleids"] = index_entry["ruleids"]
            if index_entry.get( "see_also" ):
                result["see_also"] = index_entry["see_also"]
            # nb: the ruleref captions were joined together when the index entry was added to the table
            rulerefs = [ r.strip() for r in row[7].split(_RULEREF_SEPARATOR) ] if row[7] else []
            assert len(rulerefs) == len(index_entry.get("rulerefs",[]))
            if rulerefs:
                result[ "rulerefs" ] = []
            for i, ruleref in enumerate(rulerefs):
                ruleref2 = {}
                if "caption" in index_entry["rulerefs"][i]:
                    assert ruleref.replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ) \
                        == index_entry["rulerefs"][i]["caption"]
                    ruleref2["caption"] = fixup_text( ruleref )
                if "ruleids" in index_entry["rulerefs"][i]:
                    ruleref2["ruleids"] = index_entry["rulerefs"][i]["ruleids"]
                assert ruleref2
                result["rulerefs"].append( ruleref2 )
            results.append( result )
    finally:
        conn.close()

    # fixup the results
    results = _fixup_results_for_hash_terms( results, search_terms )

    # adjust the sort order
    results = _adjust_sort_order( results )

    # return the results
    _logger.debug( "Search results:" if len(results) > 0 else "Search results: none" )
    for result in results:
        # NOTE: A search result only has a title if the index entry had one.
        _logger.debug( "- %s (%.3f)",
            result.get( "title", "" ).replace( _BEGIN_HIGHLIGHT, "" ).replace( _END_HIGHLIGHT, "" ),
            result["_score"]
        )
    return jsonify( results )
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# NOTE: A query string that matches any of these is assumed to be a raw FTS query i.e. it contains
# an upper-case boolean operator, or an opening parenthesis (other than the one in "(r)").
PASSTHROUGH_REGEXES = {
    re.compile( pattern )
    for pattern in (
        r"\bAND\b",
        r"\bOR\b",
        r"\bNOT\b",
        r"\((?![Rr]\))",
    )
}
def _make_fts_query_string( query_string ):
    """Generate the SQLite query string.

    SQLite's MATCH function recognizes a lot of special characters, which need
    to be enclosed in double-quotes to disable.

    Returns a tuple (fts_query_string, terms):
    - If the query looks like a raw FTS query (see PASSTHROUGH_REGEXES), it is returned
      as-is (stripped), and terms is None.
    - Otherwise, the query is split into lower-case terms (quoted phrases are kept together),
      search replacements/aliases/synonyms are applied, terms with special characters are quoted,
      and everything is AND'ed together. terms is the final list of terms; a term that was
      expanded into alternatives is a nested list (these are OR'ed together).
    """

    # check if this looks like a raw FTS query
    if any( regex.search(query_string) for regex in PASSTHROUGH_REGEXES ):
        return query_string.strip(), None

    def is_open_phrase( term ):
        """Check if a term is the start of a quoted phrase that hasn't been closed yet."""
        if not term.startswith( '"' ):
            return False
        # NOTE: A term like "foo" (i.e. a single quoted word) is already closed, and must not
        # swallow the words that follow it.
        return len(term) == 1 or not term.endswith( '"' )

    # split the search string into words (taking quoted phrases into account)
    ignore = app.config.get( "SQLITE_FTS_IGNORE_CHARS", ",;!?$" )
    query_string = "".join( ch for ch in query_string if ch not in ignore )
    terms = query_string.lower().split()
    i = 1
    while i < len(terms):
        if is_open_phrase( terms[i-1] ):
            # the previous term is an open quoted phrase - add this word to it
            terms[i-1] += " {}".format( terms[i] )
            del terms[i]
            if terms[i-1].startswith( '"' ) and terms[i-1].endswith( '"' ):
                # the quoted phrase is now complete
                terms[i-1] = terms[i-1][1:-1]
            continue
        i += 1

    # clean up quoted phrases
    terms = [ t[1:] if t.startswith('"') else t for t in terms ]
    terms = [ t[:-1] if t.endswith('"') else t for t in terms ]
    terms = [ t.strip() for t in terms ]
    terms = [ t for t in terms if t ]

    # adjust search terms
    for term_no, term in enumerate(terms):
        aliases = _SEARCH_TERM_ADJUSTMENTS.get( term )
        if not aliases:
            continue
        if isinstance( aliases, str ):
            # the search term is replaced by a new one
            terms[ term_no ] = aliases
        elif isinstance( aliases, set ):
            # the search term is replaced by multiple new ones (that will be OR'ed together)
            # NOTE: We sort the terms so that the tests will work reliably.
            terms[ term_no ] = sorted( aliases )
        else:
            # NOTE: Asserting on a non-empty string always passes, so we raise explicitly.
            raise AssertionError( "Unknown search alias type: {}".format( type(aliases) ) )

    # fixup each term
    def has_special_char( term ):
        """Check if the term contains any special characters."""
        for ch in term:
            if ch in "*":
                continue # nb: we leave these alone
            if ch.isspace() or ch in string.punctuation:
                return True
            if ord(ch) < 32 or ord(ch) > 127:
                return True
        return False
    def fixup_terms( terms ):
        """Fixup a list of terms."""
        for term_no, term in enumerate(terms):
            if isinstance( term, str ):
                if has_special_char( term ):
                    terms[term_no] = '"{}"'.format( term )
            else:
                fixup_terms( term )
    fixup_terms( terms )

    # return the final FTS query string
    def term_string( term ):
        if isinstance( term, str ):
            return term
        assert isinstance( term, list )
        return "( {} )".format( " OR ".join( term ) )
    return " AND ".join( term_string(t) for t in terms ), terms
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def _fixup_results_for_hash_terms( results, search_terms ):
    """Remove search results that were only matched because of a search term ending with a hash.

    SQLite doesn't handle search terms that end with a hash particularly well.
    Highlighted search terms are corrected in fixup_text(), but a search for e.g. "US#"
    will also match "use" and "using" - such results are removed here.
    """

    # nothing to do if we have no search terms (e.g. for a raw FTS query)
    if not search_terms:
        return results

    # collect the (quoted) search terms that end with a hash, without the hash
    # NOTE: We don't bother descending down into sub-terms.
    hash_terms = []
    for search_term in search_terms:
        if not isinstance( search_term, str ):
            continue
        if not search_term.startswith( '"' ) or not search_term.endswith( '"' ):
            continue
        unquoted = search_term[1:-1]
        if unquoted.endswith( "#" ):
            hash_terms.append( unquoted[:-1].lower() )
    if not hash_terms:
        return results
    if "us" in hash_terms:
        hash_terms.extend( [ "use", "used", "using", "user" ] )

    # these are the highlighted terms that were matched incorrectly (e.g. ((K)) when searching for "K#")
    bad_matches = [
        "{}{}{}".format( _BEGIN_HIGHLIGHT, term, _END_HIGHLIGHT )
        for term in hash_terms
    ]

    def keep( sr ):
        # discard every incorrectly matched search term...
        dump = json.dumps( sr ).lower()
        for bad_match in bad_matches:
            dump = dump.replace( bad_match, "_removed_" )
        # ...and keep the search result only if something is still highlighted
        return _BEGIN_HIGHLIGHT in dump

    filtered = []
    for sr in results:
        if keep( sr ):
            filtered.append( sr )
    return filtered
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def _adjust_sort_order( results ):
    """Re-order the search results, moving the preferred ones to the front.

    Results are promoted in several passes (most preferred first); within each pass,
    and among the results that are never promoted, the incoming order is retained.
    NOTE: The list passed in is modified (promoted results are removed from it).
    """

    def text( sr, key ):
        return sr.get( key ) or ""

    def has_exact_title_match( sr ):
        title = text( sr, "title" )
        return title.startswith( _BEGIN_HIGHLIGHT ) and title.endswith( _END_HIGHLIGHT )

    def title_starts_with_match( sr ):
        return text( sr, "title" ).startswith( _BEGIN_HIGHLIGHT )

    def has_title_match( sr ):
        return _BEGIN_HIGHLIGHT in text( sr, "title" )

    def has_subtitle_match( sr ):
        return _BEGIN_HIGHLIGHT in text( sr, "subtitle" )

    # these are checked in order of preference
    preferences = [
        has_exact_title_match, # the title is an exact match
        title_starts_with_match, # the title starts with a match
        has_title_match, # there is a match somewhere in the title
        has_subtitle_match, # there is a match somewhere in the subtitle
    ]

    promoted = []
    for is_preferred in preferences:
        remaining = []
        for sr in results:
            # NOTE: We never prefer small entries (i.e. those that have no ruleref's)
            # e.g. those that only contain a "see also".
            if is_preferred( sr ) and len( sr.get("rulerefs",[]) ) > 0:
                promoted.append( sr )
            else:
                remaining.append( sr )
        # nb: update the caller's list in-place
        results[:] = remaining

    # include any remaining search results
    promoted.extend( results )
    return promoted
# ---------------------------------------------------------------------
def init_search( logger ):
    """Initialize the search engine.

    This (re)creates the SQLite database that holds the FTS index, loads every index entry
    of every content doc into it, then loads the search config. Progress is reported
    via the specified logger.
    """

    # initialize
    global _fts_index_entries
    _fts_index_entries = {}

    # initialize the database
    global _sqlite_path
    _sqlite_path = app.config.get( "SQLITE_PATH" )
    if not _sqlite_path:
        # FUDGE! We should be able to create a shared, in-memory database using this:
        #   file::XYZ:?mode=memory&cache=shared
        # but it doesn't seem to work (on Linux) and ends up creating a file with this name :-/
        # We manually create a temp file, which has to have the same name each time, so that we don't
        # keep creating a new database each time we start up. Sigh...
        _sqlite_path = os.path.join( tempfile.gettempdir(), "asl-rulebook2.searchdb" )
    if os.path.isfile( _sqlite_path ):
        os.unlink( _sqlite_path )

    logger.info( "Creating the search index: %s", _sqlite_path )
    conn = sqlite3.connect( _sqlite_path )
    # NOTE: We make sure the connection is always closed, otherwise we leak one every time
    # the search engine is initialized (searches open their own connection).
    try:
        # NOTE: Storing everything in a single table allows FTS to rank search results based on
        # the overall content, and also lets us do AND/OR queries across all searchable content.
        conn.execute(
            "CREATE VIRTUAL TABLE searchable USING fts5"
            " ( doc_id, sr_type, title, subtitle, content, rulerefs, tokenize='porter unicode61' )"
        )

        # load the searchable content
        logger.info( "Loading the search index..." )
        conn.execute( "DELETE FROM searchable" )
        curs = conn.cursor()
        for cdoc in webapp_content.content_docs.values():
            logger.info( "- Loading index file: %s", cdoc["_fname"] )
            nrows = 0
            for index_entry in cdoc["index"]:
                # nb: the ruleref captions are stored as a single string (search() splits them up again)
                rulerefs = _RULEREF_SEPARATOR.join( r.get("caption","") for r in index_entry.get("rulerefs",[]) )
                # NOTE: We should really strip content before adding it to the search index, otherwise any HTML tags
                # will need to be included in search terms. However, this means that the content returned by a query
                # will be this stripped content. We could go back to the original data to get the original HTML content,
                # but that means we would lose the highlighting of search terms that SQLite gives us. We opt to insert
                # the original content, since none of it should contain HTML, anyway.
                curs.execute(
                    "INSERT INTO searchable (doc_id,sr_type,title,subtitle,content,rulerefs) VALUES (?,?,?,?,?,?)", (
                        cdoc["doc_id"], "index",
                        index_entry.get("title"), index_entry.get("subtitle"), index_entry.get("content"), rulerefs
                ) )
                # nb: we remember which index entry each row came from, so that searches can look it up
                _fts_index_entries[ curs.lastrowid ] = index_entry
                index_entry["_fts_rowid"] = curs.lastrowid
                nrows += 1
            conn.commit()
            logger.info( " - Loaded %s.", plural(nrows,"index entry","index entries"), )
        assert len(_fts_index_entries) == _get_row_count( conn, "searchable" )
    finally:
        conn.close()

    # load the search config
    load_search_config( logger )
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def load_search_config( logger ):
    """Load the search config.

    This rebuilds the table of search term adjustments from the search replacements,
    search aliases and search synonyms files (in that order). For each kind, the file
    in the config directory is loaded first, then the one in the data directory;
    files that don't exist are skipped.

    Each key in the table is a lower-case search term, and its value is either:
    - a string (the search term is replaced by this string)
    - a set of strings (the search term is replaced by these, OR'ed together)
    """

    # initialize
    global _SEARCH_TERM_ADJUSTMENTS
    _SEARCH_TERM_ADJUSTMENTS = {}

    def add_search_term_adjustment( key, vals ):
        # make sure everything is lower-case
        key = key.lower()
        if isinstance( vals, str ):
            vals = vals.lower()
        elif isinstance( vals, set ):
            vals = set( v.lower() for v in vals )
        else:
            # NOTE: Asserting on a non-empty string always passes, so we raise explicitly.
            raise AssertionError( "Unknown search alias type: {}".format( type(vals) ) )
        # add the new search term adjustment
        if key not in _SEARCH_TERM_ADJUSTMENTS:
            _SEARCH_TERM_ADJUSTMENTS[ key ] = vals
            return
        # found a multiple definition - try to do something sensible
        current = _SEARCH_TERM_ADJUSTMENTS[ key ]
        logger.warning( " - Duplicate search alias: %s\n- current aliases = %s\n- new aliases = %s", key,
            current, vals
        )
        if isinstance( current, str ) or isinstance( vals, str ):
            # NOTE: We can only merge sets with each other (updating a set with a string
            # would add each individual character), so the new definition wins.
            _SEARCH_TERM_ADJUSTMENTS[ key ] = vals
        else:
            assert isinstance( current, set )
            current.update( vals )

    # load the search replacements
    def load_search_replacements( fname ):
        if not os.path.isfile( fname ):
            return
        logger.info( "Loading search replacements: %s", fname )
        with open( fname, "r", encoding="utf-8" ) as fp:
            data = json.load( fp )
        nitems = 0
        for key, val in data.items():
            if key.startswith( "_" ):
                continue # nb: ignore comments
            logger.debug( "- %s -> %s", key, val )
            add_search_term_adjustment( key, val )
            nitems += 1
        logger.info( "- Loaded %s.", plural(nitems,"search replacement","search replacements") )
    load_search_replacements( make_config_path( "search-replacements.json" ) )
    load_search_replacements( make_data_path( "search-replacements.json" ) )

    # load the search aliases
    def load_search_aliases( fname ):
        if not os.path.isfile( fname ):
            return
        logger.info( "Loading search aliases: %s", fname )
        with open( fname, "r", encoding="utf-8" ) as fp:
            data = json.load( fp )
        nitems = 0
        for keys, aliases in data.items():
            if keys.startswith( "_" ):
                continue # nb: ignore comments
            logger.debug( "- %s -> %s", keys, " ; ".join(aliases) )
            # nb: a key can hold several alternatives, separated by slashes; each one
            # matches itself, or any of the aliases
            for key in keys.split( "/" ):
                add_search_term_adjustment( key, set( itertools.chain( aliases, [key] ) ) )
            nitems += 1
        logger.info( "- Loaded %s.", plural(nitems,"search alias","search aliases") )
    load_search_aliases( make_config_path( "search-aliases.json" ) )
    load_search_aliases( make_data_path( "search-aliases.json" ) )

    # load the search synonyms
    def load_search_synonyms( fname ):
        if not os.path.isfile( fname ):
            return
        logger.info( "Loading search synonyms: %s", fname )
        with open( fname, "r", encoding="utf-8" ) as fp:
            data = json.load( fp )
        nitems = 0
        for synonyms in data:
            if isinstance( synonyms, str ):
                continue # nb: ignore comments
            logger.debug( "- %s", " ; ".join(synonyms) )
            # nb: every word in the set matches any word in the set
            synonyms = set( synonyms )
            for term in synonyms:
                add_search_term_adjustment( term, synonyms )
            nitems += 1
        logger.info( "- Loaded %s.", plural(nitems,"search synonym","search synonyms") )
    load_search_synonyms( make_config_path( "search-synonyms.json" ) )
    load_search_synonyms( make_data_path( "search-synonyms.json" ) )
# ---------------------------------------------------------------------
def _get_row_count( conn, table_name ):
"""Get the number of rows in a table."""
cur = conn.execute( "SELECT count(*) FROM {}".format( table_name ) )
return cur.fetchone()[0]

@ -8,13 +8,13 @@ gMainApp.component( "content-pane", {
template: `
<tabbed-pages ref="tabbedPages">
<tabbed-page v-for="doc in contentDocs" :tabId=doc.docId :caption=doc.title >
<tabbed-page v-for="doc in contentDocs" :tabId=doc.doc_id :caption=doc.title :key=doc.doc_id >
<content-doc :doc=doc />
</tabbed-page>
</tabbed-pages>`,
mounted() {
gEventBus.on( "show-content-doc", (docId) => {
gEventBus.on( "show-target", (docId, target) => { //eslint-disable-line no-unused-vars
this.$refs.tabbedPages.activateTab( docId ) ; // nb: tabId == docId
} ) ;
},
@ -27,14 +27,37 @@ gMainApp.component( "content-doc", {
props: [ "doc" ],
data() { return {
target: null,
noContent: gUrlParams.get( "no-content" ),
} ; },
template: `
<div class="content-doc">
<div v-if=noContent class="disabled"> Content disabled. </div>
<iframe v-else-if=doc.url :src=doc.url />
<div class="content-doc" :data-target=target >
<div v-if=noContent class="disabled"> Content disabled. <div v-if=target>target = {{target}}</div> </div>
<iframe v-else-if=doc.url :src=makeDocUrl />
<div v-else class="disabled"> No content. </div>
</div>`,
created() {
gEventBus.on( "show-target", (docId, target) => {
if ( docId != this.doc.doc_id )
return ;
// FUDGE! We give the tab time to show itself before we scroll to the target.
setTimeout( () => {
this.target = target ;
}, 50 ) ;
} ) ;
},
computed: {
makeDocUrl() {
let url = this.doc.url ;
if ( this.target )
url += "#nameddest=" + this.target ;
return url ;
}
},
} ) ;

@ -12,6 +12,10 @@ $(document).ready( () => {
gMainApp.mount( "#main-app" ) ;
} ) ;
// FUDGE! Can't seem to get access to the content docs via gMainApp, so we make them available
// to the rest of the program via this global variable :-/
export let gContentDocs = null ;
// --------------------------------------------------------------------
gMainApp.component( "main-app", {
@ -47,23 +51,27 @@ gMainApp.component( "main-app", {
methods: {
getContentDocs: (self) => new Promise( (resolve, reject) => {
// get the content docs
$.getJSON( gGetContentDocsUrl, (resp) => { //eslint-disable-line no-undef
self.contentDocs = resp ;
let docIds = Object.keys( resp ) ;
if ( docIds.length > 0 ) {
Vue.nextTick( () => {
gEventBus.emit( "show-content-doc", docIds[0] ) ; // FIXME! which one do we choose?
} ) ;
}
resolve() ;
} ).fail( (xhr, status, errorMsg) => {
const msg = "Couldn't get the content docs." ;
showErrorMsg( msg + " <div class='pre'>" + errorMsg + "</div>" ) ;
reject( msg )
getContentDocs( self ) {
return new Promise( (resolve, reject) => {
// get the content docs
$.getJSON( gGetContentDocsUrl, (resp) => { //eslint-disable-line no-undef
if ( gUrlParams.get( "add-empty-doc" ) )
resp["empty"] = { "doc_id": "empty", "title": "Empty document" } ; // nb: for testing porpoises
gContentDocs = self.contentDocs = resp ;
let docIds = Object.keys( resp ) ;
if ( docIds.length > 0 ) {
Vue.nextTick( () => {
gEventBus.emit( "show-target", docIds[0], null ) ; // FIXME! which one do we choose?
} ) ;
}
resolve() ;
} ).fail( (xhr, status, errorMsg) => {
const msg = "Couldn't get the content docs." ;
showErrorMsg( msg + " <div class='pre'>" + errorMsg + "</div>" ) ;
reject( msg )
} ) ;
} ) ;
} ),
},
},

@ -4,17 +4,28 @@ import { gMainApp, gEventBus } from "./MainApp.js" ;
gMainApp.component( "nav-pane", {
data() { return {
seqNo: 0, // nb: for the test suite
} ; },
template: `
<tabbed-pages>
<tabbed-page tabId="search" caption="Search" data-display="flex" >
<search-box id="search-box" @search=onSearch />
<search-results id="search-results" />
<search-results id="search-results" :data-seqno=seqNo />
</tabbed-page>
</tabbed-pages>`,
mounted() {
gEventBus.on( "search-done", () => {
// notify the test suite that the search results are now available
this.seqNo += 1 ;
} ) ;
},
methods: {
onSearch: (queryString) => {
onSearch( queryString ) {
gEventBus.emit( "search", queryString ) ;
},

@ -1,5 +1,5 @@
import { gMainApp, gEventBus } from "./MainApp.js" ;
import { IndexSearchResult } from "./SearchResult.js" ;
import { fixupSearchHilites } from "./utils.js" ;
// --------------------------------------------------------------------
@ -30,7 +30,7 @@ gMainApp.component( "search-box", {
},
methods: {
onKeyUp: function( evt ) {
onKeyUp( evt ) {
if ( evt.keyCode == 13 )
this.$refs["submit"].click() ;
}
@ -43,12 +43,15 @@ gMainApp.component( "search-box", {
gMainApp.component( "search-results", {
data() { return {
searchResults: [],
searchResults: null,
errorMsg: null,
} ; },
template: `<div>
<div v-for="sr in searchResults" :key=sr.key >
<index-sr v-if="sr.srType == 'index'" :sr=sr />
<div v-if=errorMsg class="error"> Search error: <div class="pre"> {{errorMsg}} </div> </div>
<div v-else-if="searchResults != null && searchResults.length == 0" class="no-results"> Nothing was found. </div>
<div v-else v-for="sr in searchResults" :key=sr.key >
<index-sr v-if="sr.sr_type == 'index'" :sr=sr />
<div v-else> ??? </div>
</div>
</div>`,
@ -60,22 +63,39 @@ gMainApp.component( "search-results", {
methods: {
onSearch( queryString ) {
// generate some dummy search results
let searchResults = [] ;
for ( let i=0 ; i < queryString.length ; ++i ) {
let buf = [ "Search result #" + (1+i) ] ;
let nItems = Math.floor( Math.sqrt( 100 * Math.random() ) ) - 1 ;
if ( nItems > 0 ) {
buf.push( "<ul style='padding-left:1em;'>" ) ;
for ( let j=0 ; j < nItems ; ++j )
buf.push( "<li> item " + (1+j) ) ;
buf.push( "</ul>" ) ;
// submit the search request
const onError = (errorMsg) => {
this.errorMsg = errorMsg ;
Vue.nextTick( () => {
gEventBus.emit( "search-done" ) ;
} ) ;
} ;
this.errorMsg = null ;
$.ajax( { url: gSearchUrl, type: "POST", //eslint-disable-line no-undef
data: { queryString: queryString },
dataType: "json",
} ).done( (resp) => {
// check if there was an error
if ( resp.error ) {
onError( resp.error ) ;
return ;
}
searchResults.push(
new IndexSearchResult( i, buf.join("") )
) ;
}
this.searchResults = searchResults ;
// adjust highlighted text
resp.forEach( (sr) => {
[ "title", "subtitle", "content" ].forEach( function( key ) {
if ( sr[key] )
sr[key] = fixupSearchHilites( sr[key] ) ;
} ) ;
} ) ;
// load the search results into the UI
this.$el.scrollTop = 0;
this.searchResults = resp ;
Vue.nextTick( () => {
gEventBus.emit( "search-done" ) ;
} ) ;
} ).fail( (xhr, status, errorMsg) => {
onError( errorMsg ) ;
} ) ;
},
},

@ -1,23 +1,80 @@
import { gMainApp } from "./MainApp.js" ;
import { gMainApp, gEventBus, gContentDocs } from "./MainApp.js" ;
import { fixupSearchHilites } from "./utils.js" ;
// --------------------------------------------------------------------
export class IndexSearchResult {
constructor( key, content ) {
this.key = key ;
this.srType = "index" ;
this.content = content ;
}
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
gMainApp.component( "index-sr", {
props: [ "sr" ],
template: `
<div class="sr index-sr" v-html=sr.content />
`,
<div class="sr index-sr" >
<div v-if="sr.title || sr.subtitle" class="title" >
<span v-if=sr.title class="title" v-html=sr.title />
<span v-if=sr.subtitle class="subtitle" v-html=sr.subtitle />
</div>
<div class="body">
<div v-if=sr.content class="content" v-html=sr.content />
<div v-if=makeSeeAlso v-html=makeSeeAlso class="see-also" />
<div v-if=sr.ruleids class="ruleids" >
<ruleid v-for="rid in sr.ruleids" :docId=sr.doc_id :ruleId=rid :key=rid />
</div>
<ul v-if=sr.rulerefs class="rulerefs" >
<li v-for="rref in sr.rulerefs" :key=rref >
<span v-if=rref.caption class="caption" v-html=fixupHilites(rref.caption) />
<ruleid v-for="rid in rref.ruleids" :docId=sr.doc_id :ruleId=rid :key=rid />
</li>
</ul>
</div>
</div>`,
computed: {
makeSeeAlso() {
if ( this.sr.see_also )
return "See also: " + this.sr.see_also.join( ", " ) ;
return null ;
},
},
methods: {
fixupHilites( val ) {
return fixupSearchHilites( val ) ;
},
},
} ) ;
// --------------------------------------------------------------------
gMainApp.component( "ruleid", {

    props: [ "docId", "ruleId" ],
    data() { return {
        target: null,
    } ; },

    template: `<span class="ruleid" v-bind:class="{unknown:!target}">[<a v-if=target @click=onClick>{{ruleId}}</a><span v-else>{{ruleId}}</span>]</span>`,

    created() {
        // NOTE: A ruleid can be a range (e.g. "A12.3-.4"), in which case we target the part
        // before the dash ("A12.3").
        const dashPos = this.ruleId.indexOf( "-" ) ;
        const targetId = dashPos >= 0 ? this.ruleId.substring( 0, dashPos ) : this.ruleId ;
        // we only record a target (and hence render a clickable link) if the content doc
        // has an entry for this rule
        const contentDoc = gContentDocs[ this.docId ] ;
        if ( ! contentDoc || ! contentDoc.targets )
            return ;
        if ( contentDoc.targets[ targetId ] )
            this.target = targetId ;
    },

    methods: {
        onClick() {
            // ask for the referenced rule to be shown
            gEventBus.emit( "show-target", this.docId, this.target ) ;
        },
    },

} ) ;

@ -13,7 +13,7 @@ gMainApp.component( "tabbed-pages", {
<div class="tabbed-pages">
<slot />
<div class="tab-strip">
<div v-for="tab in tabs" :data-tabid=tab.tabId @click=onTabClicked class="tab" v-bind:class="{'active': tab.tabId == activeTabId}" >
<div v-for="tab in tabs" :data-tabid=tab.tabId @click=onTabClicked class="tab" v-bind:class="{'active': tab.tabId == activeTabId}" :key=tab.tabId >
{{tab.caption}}
</div>
</div>
@ -44,12 +44,12 @@ gMainApp.component( "tabbed-pages", {
methods: {
onTabClicked: function( evt ) {
onTabClicked( evt ) {
// activate the selected tab
this.activateTab( evt.target.dataset.tabid ) ;
},
activateTab: function( tabId ) {
activateTab( tabId ) {
// activate the specified tab
this.activeTabId = tabId ;
$( this.$el ).find( ".tabbed-page" ).each( function() {

@ -6,3 +6,5 @@
/* search results */
/* NOTE: flex-grow must be non-negative (a negative value invalidates the declaration, leaving the
   default of 0), so we use 1 to let the results pane take up the remaining space. */
#search-results { flex-grow: 1 ; margin: 8px 0 2px 0 ; overflow-y: auto ; }
#search-results .no-results { font-style: italic ; color: #666 ; }
#search-results .error .pre { font-family: monospace ; margin: 0.25em 0 0 0.5em ; }

@ -1 +1,13 @@
#search-results .sr { margin: 0 10px 2px 0 ; border: 1px dotted #666 ; padding: 5px ; }
#search-results .sr { margin: 0 10px 2px 0 ; padding: 5px ; }
#search-results .sr .hilite { padding: 0 2px ; background: #ffa ; }
#search-results .index-sr .title { background: #e0e0e0 ; border-bottom: 1px solid #ccc ; padding: 2px 5px ; font-weight: bold ; }
#search-results .index-sr .subtitle { padding: 2px 5px ; font-weight: normal ; font-size: 80% ; font-style: italic ; }
#search-results .index-sr .body { padding: 2px 5px 0 5px ; font-size: 80% ; }
#search-results .index-sr .content { color: #444 ; }
#search-results .index-sr .see-also { color: #444 ; }
#search-results .index-sr ul.rulerefs { margin-left: 1.2em ; }
#search-results .index-sr ul.rulerefs .caption { padding-right: 0.5em ; }
#search-results .index-sr .ruleid { margin-right: 0.25em ; font-style: italic ; color: #444 ; }
#search-results .index-sr .ruleid.unknown { color: #888 ; }
#search-results .index-sr .ruleid a { cursor: pointer ; }

@ -1,3 +1,21 @@
// --------------------------------------------------------------------
// The markers used to flag highlighted text: [0] starts a highlight, [1] ends it.
const _HILITE_REGEXES = [ /!@:/g, /:@!/g ] ;

export function fixupSearchHilites( val )
{
    // NOTE: The search engine highlights search terms in the returned search content using special markers.
    // We convert those markers to HTML span's here.

    // null/undefined are passed straight through, so that callers don't need to check first
    if ( val === null )
        return val ;
    if ( val === undefined )
        return val ;

    // replace every start marker, then every end marker
    const [ startMarker, endMarker ] = _HILITE_REGEXES ;
    const withStartTags = val.replace( startMarker, "<span class='hilite'>" ) ;
    return withStartTags.replace( endMarker, "</span>" ) ;
}
// --------------------------------------------------------------------
export function showInfoMsg( msg ) { _doShowNotificationMsg( "notice", msg ) ; }
export function showWarningMsg( msg ) { _doShowNotificationMsg( "warning", msg ) ; }
export function showErrorMsg( msg ) { _doShowNotificationMsg( "error", msg ) ; }

@ -43,6 +43,7 @@
<script>
gGetContentDocsUrl = "{{ url_for( 'get_content_docs') }}" ;
gSearchUrl = "{{ url_for( 'search' ) }}" ;
</script>
<script type="module" src="{{ url_for( 'static', filename='MainApp.js' ) }}"></script>

@ -11,7 +11,7 @@
{ "title": "Backblast",
"ruleids": [ "C13.8" ],
"rulerefs": [
{ "caption": "Huts", "ruleids": [ "G5.62" ] },
{ "caption": "HEAT", "ruleids": [ "C13.8" ] },
{ "caption": "RCL", "ruleids": [ "C12.3-.4" ] }
]
},
@ -31,11 +31,9 @@
"content": "Also known as \"running <em>really</em> fast.\"",
"rulerefs": [
{ "caption": "ENEMY Guard Automatic Action", "ruleids": [ "S6.303" ] },
{ "caption": "Manhandling", "ruleids": [ "C10.3" ] },
{ "caption": "NA for Pathfinders", "ruleids": [ "T1.2" ] },
{ "caption": "S? NA", "ruleids": [ "S3.321" ] },
{ "caption": "Water Shortage", "ruleids": [ "RCG21" ] },
{ "caption": "Wire NA", "ruleids": [ "B26.46" ] }
{ "ruleids": [ "C10.3" ] },
{ "caption": "NA in Advance Phase", "ruleids": [ "A4.7" ] },
{ "caption": "'S?' is \"&lt;NA&gt;\"" }
]
},
@ -54,6 +52,7 @@
},
{ "title": "Firepower",
"content": "The U.S. has lots of this.",
"ruleids": [ "A1.21" ],
"see_also": [ "FP" ]
},
@ -71,6 +70,12 @@
{ "title": "Identity, Vehicular",
"ruleids": [ "D1.4" ]
},
{ "title": "HTML ti<u>tl</u>e",
"subtitle": "HTML subti<u>tl</u>e",
"content": "HTML con<u>ten</u>t: 2&frac34; MP",
"see_also": [ "HTML see-<u>al</u>so" ]
}
]

@ -1,15 +1,15 @@
{
"A4.7": { "caption": "ADVANCE PHASE", "page_no": 1, "pos": [72,702] },
"C13.8": { "caption": "BACKBLAST", "page_no": 1, "pos": [72,404] },
"A3.8": { "caption": "CLOSE COMBAT PHASE (CCPh)", "page_no": 1, "pos": [72.97] },
"A4.7": { "caption": "ADVANCE PHASE", "page_no": 1, "pos": [72,718] },
"C13.8": { "caption": "BACKBLAST", "page_no": 1, "pos": [72,503] },
"A3.8": { "caption": "CLOSE COMBAT PHASE (CCPh)", "page_no": 1, "pos": [72,292] },
"A4.5": { "caption": "DOUBLE TIME", "page_no": 2, "pos": [72,702] },
"A19.1": { "caption": "EXPERIENCE LEVEL RATING (ELR)", "page_no": 2, "pos": [72.404] },
"A1.21": { "caption": "FIREPOWER (FP)", "page_no": 2, "pos": [72,97] },
"A4.5": { "caption": "DOUBLE TIME", "page_no": 2, "pos": [72,718] },
"A19.1": { "caption": "EXPERIENCE LEVEL RATING (ELR)", "page_no": 2, "pos": [72,503] },
"A1.21": { "caption": "FIREPOWER (FP)", "page_no": 2, "pos": [72,292] },
"A1.21": { "caption": "FIREPOWER (FP)", "page_no": 3, "pos": [72,702] },
"E11.21": { "caption": "GAPS", "page_no": 3, "pos":[72,404] },
"C8.3": { "caption": "HEAT (H)", "page_no": 3, "pos": [72,97] }
"E11.21": { "caption": "GAPS", "page_no": 3, "pos":[72,718] },
"C8.3": { "caption": "HEAT (H)", "page_no": 3, "pos": [72,503] },
"D1.4": { "caption": "IDENTITY & GROUND PRESSURE", "page_no": 3, "pos": [72,292] }
}

@ -0,0 +1,298 @@
""" Test search. """
import re
import logging
from selenium.webdriver.common.keys import Keys
from asl_rulebook2.utils import strip_html
from asl_rulebook2.webapp.search import load_search_config, _make_fts_query_string
from asl_rulebook2.webapp.tests.utils import init_webapp, select_tabbed_page, get_classes, \
wait_for, find_child, find_children
# ---------------------------------------------------------------------
def test_search( webapp, webdriver ):
    """Test search.

    Checks how the UI reports a search that finds nothing, a search that fails,
    and searches that return results.
    """

    # initialize
    webapp.control_tests.set_data_dir( "simple" )
    init_webapp( webapp, webdriver )

    # test a search that finds nothing
    assert _do_search( "oogah, boogah!" ) is None

    # test error handling (nb: a string result is the error message)
    error_msg = _do_search( "!:simulated-error:!" )
    assert "Simulated error." in error_msg

    # do a search (nb: highlighted text is reported as "((...))")
    expected_ccph = {
        "sr_type": "index",
        "title": "CCPh", "subtitle": "Close Combat Phase",
        "ruleids": [ "A3.8" ],
        "rulerefs": [
            { "caption": "((ENEMY)) Attacks", "ruleids": [ "S11.5" ] },
            { "caption": "dropping SW before CC", "ruleids": [ "A4.43" ] },
        ]
    }
    expected_double_time = {
        "sr_type": "index",
        "title": "Double Time",
        "content": "Also known as \"running really fast.\"",
        "see_also": [ "CX" ],
        "ruleids": [ "A4.5-.51", "S6.222" ],
        "rulerefs": [
            { "caption": "((ENEMY)) Guard Automatic Action", "ruleids": [ "S6.303" ] },
            { "ruleids": [ "C10.3" ] },
            { "caption": "NA in Advance Phase", "ruleids": [ "A4.7" ] },
            { "caption": "'S?' is \"<NA>\"" },
        ]
    }
    assert _do_search( "enemy" ) == [ expected_ccph, expected_double_time ]

    # do another search
    expected_gaps = {
        "sr_type": "index",
        "title": "((Gaps)), Convoy",
        "ruleids": [ "E11.21" ],
    }
    assert _do_search( "gap" ) == [ expected_gaps ]
# ---------------------------------------------------------------------
def test_content_fixup( webapp, webdriver ):
    """Test fixing up of content returned by the search engine."""

    # initialize
    webapp.control_tests.set_data_dir( "simple" )
    init_webapp( webapp, webdriver )

    # each entry is: ( query string, field to check, expected value of that field )
    test_cases = [
        # search for a fraction
        ( "3/4", "content", "HTML content: 2((\u00be)) MP" ),
        # search for something that ends with a hash
        ( "H#", "title", "((H#))" ),
        # search for "U.S."
        ( "U.S.", "content", "The ((U.S.)) has lots of this." ),
    ]

    # each query should return exactly one search result, with the expected highlighting
    for query_string, key, expected in test_cases:
        results = _do_search( query_string )
        assert len(results) == 1
        assert results[0][key] == expected
# ---------------------------------------------------------------------
def test_targets( webapp, webdriver ):
    """Test clicking on search results."""

    # initialize
    webapp.control_tests.set_data_dir( "simple" )
    init_webapp( webapp, webdriver, no_content=1, add_empty_doc=1 )

    def is_target_showing( target ):
        """Check if the "simple" document is active, and showing the specified target."""
        # check the active tab
        active_tab = find_child( "#content .tab-strip .tab.active" )
        if active_tab.get_attribute( "data-tabid" ) != "simple":
            return False
        # check the current target
        content_doc = find_child( "#content .tabbed-page[data-tabid='simple'] .content-doc" )
        return content_doc.get_attribute( "data-target" ) == target

    def do_test( query_string, sel, expected ):
        """Search for something, click on a link in the results, then check what gets shown."""
        # select the dummy document, so that we can tell when the real one gets activated
        select_tabbed_page( "#content", "empty" )
        # do the search
        _do_search( query_string )
        # click on a target
        link = find_child( "#search-results {}".format( sel ) )
        link.click()
        # wait for the target to be shown
        wait_for( 2, lambda: is_target_showing( expected ) )

    # do the tests
    do_test( "CC", ".sr .ruleids .ruleid a", "A3.8" )
    do_test( "time", ".sr .rulerefs .ruleid a", "A4.7" )
# ---------------------------------------------------------------------
def test_make_fts_query_string():
    """Test generating the FTS query string."""

    # initialize
    load_search_config( logging.getLogger("_unknown_") )

    # each entry is: ( what the user typed in, the FTS query string we expect to be generated )
    test_cases = [

        # test some query strings
        ( "", "" ),
        ( "hello", "hello" ),
        ( " hello, world! ", "hello AND world" ),
        ( "foo 1+2 A-T K# bar", 'foo AND "1+2" AND "a-t" AND "k#" AND bar' ),
        ( "a'b a''b", "\"a'b\" AND \"a''b\"" ),
        ( 'foo "set dc" bar', 'foo AND "set dc" AND bar' ),

        # test some quoted phrases
        ( '""', '' ),
        ( ' " " ', '' ),
        ( '"hello world"', '"hello world"' ),
        ( ' foo "hello world" bar ', 'foo AND "hello world" AND bar' ),
        ( ' foo " xyz " bar ', 'foo AND xyz AND bar' ),
        ( ' foo " xyz 123 " bar ', 'foo AND "xyz 123" AND bar' ),

        # test some incorrectly quoted phrases
        ( '"', '' ),
        ( ' " " " ', '' ),
        ( ' a "b c d e', 'a AND "b c d e"' ),
        ( ' a b" c d e ', 'a AND b AND c AND d AND e' ),

        # test pass-through
        ( "AND", "AND" ),
        ( " OR", "OR" ),
        ( "OR ", "OR" ),
        ( "foo OR bar", "foo OR bar" ),
        ( "(a OR b)", "(a OR b)" ),

        # test search replacements
        ( "1/2 3/4 3/8 5/8", '"&frac12;" AND "&frac34;" AND "&frac38;" AND "&frac58;"' ),
        ( "(r)", '"&reg;"' ),

        # test search aliases
        ( "entrenchment", "( ditch OR entrenchment OR foxhole OR trench )" ),
        ( "entrenchments", "( ditch OR entrenchments OR foxhole OR trench )" ),
        ( "foxhole", "foxhole" ),

        # test search synonyms
        ( "armor", "( armor OR armour )" ),
        ( "american big armor", '( america OR american OR "u.s." ) AND big AND ( armor OR armour )' ),

    ]

    # check each query string (nb: we only care about the first value returned)
    for query, expected in test_cases:
        fts_query_string, _ = _make_fts_query_string( query )
        assert fts_query_string == expected
# ---------------------------------------------------------------------
def _do_search( query_string ):
    """Do a search.

    Types the query string into the search box, submits it, waits for the UI to signal
    that the search has finished, then unloads whatever is being shown.

    Returns one of:
    - a string, if an error was reported (the text of the error element)
    - None, if nothing was found
    - a list of search results
    """

    def get_seq_no():
        """Get the search sequence number (this gets bumped every time a search finishes)."""
        # NOTE: Attribute values are returned as strings, so we convert to an int, otherwise
        # comparing sequence numbers would be done lexically (e.g. "10" > "9" is False).
        # A missing attribute is treated as 0.
        val = find_child( "#search-results" ).get_attribute( "data-seqno" )
        return int( val ) if val else 0

    # submit the search
    select_tabbed_page( "#nav", "search" )
    elem = find_child( "input#query-string" )
    elem.clear()
    elem.send_keys( query_string )
    # NOTE: We get the sequence number *before* submitting, so that we can't miss the change.
    seq_no = get_seq_no()
    elem.send_keys( Keys.RETURN )

    # unload the results
    wait_for( 2, lambda: get_seq_no() > seq_no )
    elem = find_child( "#search-results .error" )
    if elem:
        return elem.text # nb: string = error message
    elem = find_child( "#search-results .no-results" )
    if elem:
        assert elem.text == "Nothing was found."
        return None # nb: None = no results
    results = _unload_search_results()
    assert isinstance( results, list ) # nb: list = search results
    return results
def _unload_search_results():
    """Unload the search results.

    Walks the search results currently shown in the UI, and returns them as a list of dict's,
    one per search result. Each dict has an "sr_type" key (taken from the element's CSS class),
    plus whatever fields were found for it. Highlighted text is reported as "((...))".
    """

    def unload_elem( result, key, elem ):
        """Unload a single element."""
        # NOTE: We return a flag indicating whether anything was stored in the result
        # (nothing is stored if the element is missing, or has no text).
        if not elem:
            return False
        elem_text = get_elem_text( elem )
        if not elem_text:
            return False
        result[key] = elem_text
        return True

    def get_elem_text( elem ):
        """Get the element's text content."""
        val = elem.get_attribute( "innerHTML" )
        # change how highlighted content is represented
        # NOTE: We process the matches in reverse order, so that replacing one doesn't
        # invalidate the start/end positions of those before it.
        matches = list( re.finditer( r'<span class="hilite">(.*?)</span>', val ) )
        for mo in reversed(matches):
            val = val[:mo.start()] + "((" + mo.group(1) + "))" + val[mo.end():]
        # remove HTML tags
        return strip_html( val.strip() )

    def unload_ruleids( result, key, parent ):
        """Unload a list of ruleid's."""
        if not parent:
            return
        ruleids = []
        for elem in find_children( ".ruleid", parent ):
            ruleid = get_elem_text( elem )
            # each ruleid is shown wrapped in square brackets, which we strip off
            assert ruleid.startswith( "[" ) and ruleid.endswith( "]" )
            ruleids.append( ruleid[1:-1] )
        # nb: the key is only set if we found something
        if ruleids:
            result[key] = ruleids

    def unload_rulerefs( result, key, parent ):
        """Unload a list of ruleref's."""
        if not parent:
            return
        rulerefs = []
        for elem in find_children( "li", parent ):
            # nb: the caption and ruleid's are both optional
            ruleref = {}
            unload_elem( ruleref, "caption", find_child(".caption",elem) )
            unload_ruleids( ruleref, "ruleids", elem )
            rulerefs.append( ruleref )
        # nb: the key is only set if we found something
        if rulerefs:
            result[key] = rulerefs

    # NOTE: This function is looked up by name (via locals()) below, so its name must be
    # of the form "unload_XXX_sr", where XXX is the search result type.
    def unload_index_sr( sr ): #pylint: disable=possibly-unused-variable
        """Unload an "index" search result."""
        result = {}
        unload_elem( result, "title", find_child("span.title",sr) )
        unload_elem( result, "subtitle", find_child(".subtitle",sr) )
        unload_elem( result, "content", find_child(".content",sr) )
        if unload_elem( result, "see_also", find_child(".see-also",sr) ):
            # convert the "See also: a, b, c" text into a list (nb: 9 = length of the "See also:" prefix)
            assert result["see_also"].startswith( "See also:" )
            result["see_also"] = [ s.strip() for s in result["see_also"][9:].split( "," ) ]
        unload_ruleids( result, "ruleids", find_child(".ruleids",sr) )
        unload_rulerefs( result, "rulerefs", find_child(".rulerefs",sr) )
        return result

    # unload the search results
    results = []
    for sr in find_children( "#search-results .sr"):
        # figure out what type of search result we have (from its "XXX-sr" CSS class)
        classes = get_classes( sr )
        classes.remove( "sr" )
        assert len(classes) == 1 and classes[0].endswith( "-sr" )
        sr_type = classes[0][:-3]
        # dispatch to the function that knows how to unload this type of search result
        func = locals()[ "unload_{}_sr".format( sr_type ) ]
        sr = func( sr )
        sr["sr_type"] = sr_type
        results.append( sr )

    return results

@ -17,6 +17,10 @@ def init_webapp( webapp, webdriver, **options ):
global _webapp, _webdriver
_webapp = webapp
_webdriver = webdriver
options = {
key.replace("_","-"): val
for key, val in options.items()
}
# load the webapp
if get_pytest_option("webdriver") == "chrome" and get_pytest_option("headless"):
@ -39,6 +43,18 @@ def _wait_for_webapp():
# ---------------------------------------------------------------------
def select_tabbed_page( parent_sel, tab_id ):
    """Select a tabbed page.

    Clicks on the tab for the specified page (within the element identified by parent_sel),
    then waits for that page to be displayed.
    """

    # locate the tabbed pages, and click on the requested tab
    container = find_child( ".tabbed-pages", find_child(parent_sel) )
    tab_sel = ".tab-strip .tab[data-tabid='{}']".format( tab_id )
    find_child( tab_sel, container ).click()

    # wait for the page to be displayed
    page_sel = ".tabbed-page[data-tabid='{}']".format( tab_id )
    def is_page_displayed():
        page = find_child( page_sel, container )
        if not page:
            return page
        return page.is_displayed()
    wait_for( 2, is_page_displayed )
# ---------------------------------------------------------------------
def get_nav_panels():
"""Get the available nav panels."""
return _get_tab_ids( "#nav .tab-strip" )
@ -72,6 +88,11 @@ def find_children( sel, parent=None ):
except NoSuchElementException:
return None
def get_classes( elem ):
    """Get the element's classes.

    Returns the individual class names in the element's "class" attribute, as a list.
    An element that has no "class" attribute (or an empty one) gives an empty list.
    """
    # NOTE: get_attribute() returns None if the attribute isn't present, so we guard against
    # that, rather than failing with an AttributeError.
    classes = elem.get_attribute( "class" )
    return classes.split() if classes else []
# ---------------------------------------------------------------------
def wait_for( timeout, func ):

@ -1,8 +1,24 @@
"""Helper functions."""
import os
import pathlib
import re
from asl_rulebook2.webapp import app, CONFIG_DIR
# ---------------------------------------------------------------------
def make_data_path( path ):
    """Generate a path relative to the data directory.

    Returns None if no data directory has been configured.
    """
    data_dir = app.config.get( "DATA_DIR" )
    return os.path.join( data_dir, path ) if data_dir else None
def make_config_path( path ):
    """Generate a path in the config directory."""
    config_path = os.path.join( CONFIG_DIR, path )
    return config_path
# ---------------------------------------------------------------------
def change_extn( fname, extn ):

@ -0,0 +1,66 @@
#!/usr/bin/env python3
""" Add named destinations to a PDF file. """
import subprocess
import json
import time
import datetime
import click
from asl_rulebook2.utils import TempFile
# ---------------------------------------------------------------------
def _escape_ps_string( val ):
    """Escape a value so that it can be safely embedded in a PostScript string literal, i.e. (...)."""
    # NOTE: Backslashes must be escaped first, so that we don't double-up the ones we add
    # when escaping the parentheses.
    return val.replace( "\\", "\\\\" ).replace( "(", "\\(" ).replace( ")", "\\)" )

@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--title", help="Document title." )
@click.option( "--targets","-t","targets_fname", required=True, type=click.Path(dir_okay=False),
    help="Target definition file."
)
@click.option( "--yoffset", default=5, help="Offset to add to y co-ordinates." )
@click.option( "--output","-o","output_fname", required=True, type=click.Path(dir_okay=False),
    help="Output PDF file."
)
@click.option( "--gs","gs_path", default="gs", help="Path to the Ghostscript executable." )
def main( pdf_file, title, targets_fname, yoffset, output_fname, gs_path ):
    """Add named destinations to a PDF file."""

    # load the targets
    # NOTE: The targets file is JSON, so we read it as UTF-8, rather than relying on the platform default.
    with open( targets_fname, "r", encoding="utf-8" ) as fp:
        targets = json.load( fp )

    with TempFile( mode="w" ) as temp_file:

        # generate the pdfmarks
        print( "Generating the pdfmarks..." )
        if title:
            # NOTE: The title is written as a PostScript string literal, so parentheses and backslashes
            # in it must be escaped, otherwise they would corrupt the pdfmark file.
            print( "[ /Title ({})".format( _escape_ps_string( title ) ), file=temp_file )
        else:
            print( "[", file=temp_file )
        print( " /DOCINFO pdfmark", file=temp_file )
        print( file=temp_file )
        # add a named destination for each target (nb: yoffset is added to the y co-ordinate)
        for ruleid, target in targets.items():
            xpos, ypos = target["pos"]
            print( "[ /Dest /{} /Page {} /View [/XYZ {} {}] /DEST pdfmark".format(
                ruleid, target["page_no"], xpos, ypos+yoffset
            ), file=temp_file )
        print( file=temp_file )
        # NOTE: We close the temp file, but keep it around, so that Ghostscript can read it.
        temp_file.close( delete=False )

        # generate the pdfmark'ed document
        print( "Generating the pdfmark'ed document..." )
        print( "- {} => {}".format( pdf_file, output_fname ) )
        args = [ gs_path, "-q", "-dBATCH", "-dNOPAUSE", "-sDEVICE=pdfwrite" ]
        args.extend( [ "-o", output_fname ] )
        args.extend( [ "-f", pdf_file ] )
        args.append( temp_file.name )
        start_time = time.time()
        # nb: check=True means a failed Ghostscript run raises CalledProcessError
        subprocess.run( args, check=True )
        elapsed_time = time.time() - start_time
        print( "- Elapsed time: {}".format( datetime.timedelta(seconds=int(elapsed_time)) ) )
# ---------------------------------------------------------------------
if __name__ == "__main__":
main() #pylint: disable=no-value-for-parameter

@ -10,7 +10,7 @@ from asl_rulebook2.utils import parse_page_numbers
@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--output","-o","output_fname", required=True, type=click.Path(dir_okay=False), help="Output PDF file" )
@click.option( "--output","-o","output_fname", required=True, type=click.Path(dir_okay=False), help="Output PDF file." )
@click.option( "--pages","-p", help="Page(s) to dump (e.g. 2,5,9-15)." )
def main( pdf_file, output_fname, pages ):
"""Extract pages from a PDF."""

Loading…
Cancel
Save