diff --git a/asl_rulebook2/webapp/content.py b/asl_rulebook2/webapp/content.py index afcc008..4eba083 100644 --- a/asl_rulebook2/webapp/content.py +++ b/asl_rulebook2/webapp/content.py @@ -8,18 +8,28 @@ import glob from flask import jsonify, send_file, url_for, abort from asl_rulebook2.webapp import app -from asl_rulebook2.webapp.utils import change_extn, slugify +from asl_rulebook2.webapp.utils import slugify -content_docs = None +content_sets = None +content_doc_index = None # --------------------------------------------------------------------- -def load_content_docs( startup_msgs, logger ): - """Load the content documents from the data directory.""" +def load_content_sets( startup_msgs, logger ): + """Load the content from the data directory.""" + + # NOTE: A "content set" is an index file, together with one or more "content docs". + # A "content doc" is a PDF file, with an associated targets and/or footnote file. + # This architecture allows us to have: + # - a single index file that references content spread over multiple PDF's (e.g. the MMP eASLRB, + # together with additional modules in separate PDF's (e.g. RB or KGP), until such time these + # get included in the main eASLRB). + # - rules for completely separate modules (e.g. third-party modules) that are not included + # in the MMP eASLRB index, and have their own index. 
# initialize - global content_docs - content_docs = {} + global content_sets + content_sets = {} dname = app.config.get( "DATA_DIR" ) if not dname: return @@ -27,7 +37,15 @@ def load_content_docs( startup_msgs, logger ): startup_msgs.error( "Invalid data directory.", dname ) return - def load_file( fname, content_doc, key, on_error, binary=False ): + def load_content_doc( fname_stem, title ): + # load the content doc files + content_doc = { "title": title } + load_file( fname_stem+".targets", content_doc, "targets", startup_msgs.warning ) + load_file( fname_stem+".footnotes", content_doc, "footnotes", startup_msgs.warning ) + load_file( fname_stem+".pdf", content_doc, "content", startup_msgs.warning, binary=True ) + return content_doc + + def load_file( fname, save_loc, key, on_error, binary=False ): fname = os.path.join( dname, fname ) if not os.path.isfile( fname ): return False @@ -45,30 +63,68 @@ def load_content_docs( startup_msgs, logger ): on_error( "Couldn't load \"{}\".".format( os.path.basename(fname) ), str(ex) ) return False # save the file data - content_doc[ key ] = data + save_loc[ key ] = data return True - # load each content doc - logger.info( "Loading content docs: %s", dname ) + def find_assoc_cdocs( fname_stem ): + # find other content docs associated with the content set (names have the form "Foo (...)") + matches = set() + for fname in os.listdir( dname ): + if not fname.startswith( fname_stem ): + continue + fname = os.path.splitext( fname )[0] + fname = fname[len(fname_stem):].strip() + if fname.startswith( "(" ) and fname.endswith( ")" ): + matches.add( fname[1:-1] ) + return matches + + # load each content set + logger.info( "Loading content sets: %s", dname ) fspec = os.path.join( dname, "*.index" ) for fname in glob.glob( fspec ): - # load the main index file fname2 = os.path.basename( fname ) logger.info( "- %s", fname2 ) + # load the index file title = os.path.splitext( fname2 )[0] - content_doc = { - "_fname": fname, - "doc_id": 
slugify( title ), + cset_id = slugify( title ) + content_set = { + "cset_id": cset_id, "title": title, + "content_docs": {}, + "index_fname": fname, } - if not load_file( fname2, content_doc, "index", startup_msgs.error ): + if not load_file( fname2, content_set, "index", startup_msgs.error ): continue # nb: we can't do anything without an index file - # load any associated files - load_file( change_extn(fname2,".targets"), content_doc, "targets", startup_msgs.warning ) - load_file( change_extn(fname2,".footnotes"), content_doc, "footnotes", startup_msgs.warning ) - load_file( change_extn(fname2,".pdf"), content_doc, "content", startup_msgs.warning, binary=True ) - # save the new content doc - content_docs[ content_doc["doc_id"] ] = content_doc + # load the main content doc + fname_stem = os.path.splitext( fname2 )[0] + content_doc = load_content_doc( fname_stem, fname_stem ) + cdoc_id = cset_id # nb: because this is the main content document + content_doc[ "cdoc_id" ] = cdoc_id + content_set[ "content_docs" ][ cdoc_id ] = content_doc + # load any associated content docs + for fname_stem2 in find_assoc_cdocs( fname_stem ): + # nb: we assume there's only one space between the two filename stems :-/ + content_doc = load_content_doc( + "{} ({})".format( fname_stem, fname_stem2 ), + fname_stem2 + ) + cdoc_id2 = "{}!{}".format( cdoc_id, slugify(fname_stem2) ) + content_doc[ "cdoc_id" ] = cdoc_id2 + content_set[ "content_docs" ][ cdoc_id2 ] = content_doc + # save the new content set + content_sets[ content_set["cset_id"] ] = content_set + +# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +def _dump_content_sets(): + """Dump the available content sets.""" + for cset_id, cset in content_sets.items(): + print( "=== {} ({}) ===".format( cset["title"], cset_id ) ) + for cdoc_id, cdoc in cset["content_docs"].items(): + print( "Content doc: {} ({})".format( cdoc["title"], cdoc_id ) ) + for key in [ "targets", "footnotes", "content" ]: + if key in cdoc: + 
print( "- {}: {}".format( key, len(cdoc[key]) )) # --------------------------------------------------------------------- @@ -76,25 +132,29 @@ def load_content_docs( startup_msgs, logger ): def get_content_docs(): """Return the available content docs.""" resp = {} - for cdoc in content_docs.values(): - cdoc2 = { - "doc_id": cdoc["doc_id"], - "title": cdoc["title"], - } - if "content" in cdoc: - cdoc2["url"] = url_for( "get_content", doc_id=cdoc["doc_id"] ) - if "targets" in cdoc: - cdoc2["targets"] = cdoc["targets"] - resp[ cdoc["doc_id"] ] = cdoc2 + for cset in content_sets.values(): + for cdoc in cset["content_docs"].values(): + cdoc2 = { + "cdoc_id": cdoc["cdoc_id"], + "parent_cset_id": cset["cset_id"], + "title": cdoc["title"], + } + if "content" in cdoc: + cdoc2["url"] = url_for( "get_content", cdoc_id=cdoc["cdoc_id"] ) + if "targets" in cdoc: + cdoc2["targets"] = cdoc["targets"] + resp[ cdoc["cdoc_id"] ] = cdoc2 return jsonify( resp ) # --------------------------------------------------------------------- -@app.route( "/content/" ) -def get_content( doc_id ): +@app.route( "/content/" ) +def get_content( cdoc_id ): """Return the content for the specified document.""" - cdoc = content_docs.get( doc_id ) - if not cdoc or "content" not in cdoc: - abort( 404 ) - buf = io.BytesIO( cdoc["content"] ) - return send_file( buf, mimetype="application/pdf" ) + for cset in content_sets.values(): + for cdoc in cset["content_docs"].values(): + if cdoc["cdoc_id"] == cdoc_id and "content" in cdoc: + buf = io.BytesIO( cdoc["content"] ) + return send_file( buf, mimetype="application/pdf" ) + abort( 404 ) + return None # stupid pylint :-/ diff --git a/asl_rulebook2/webapp/search.py b/asl_rulebook2/webapp/search.py index 2a4566e..c2348b3 100644 --- a/asl_rulebook2/webapp/search.py +++ b/asl_rulebook2/webapp/search.py @@ -85,7 +85,7 @@ def _do_search( args ): def highlight( n ): # NOTE: highlight() is an FTS extension function, and takes column numbers :-/ return 
"highlight(searchable,{},'{}','{}')".format( n, _BEGIN_HIGHLIGHT, _END_HIGHLIGHT ) - sql = "SELECT rowid,doc_id,sr_type,rank,{},{},{},{} FROM searchable".format( + sql = "SELECT rowid,cset_id,sr_type,rank,{},{},{},{} FROM searchable".format( highlight(2), highlight(3), highlight(4), highlight(5) ) sql += " WHERE searchable MATCH ?" @@ -106,7 +106,7 @@ def _do_search( args ): continue index_entry = _fts_index_entries[ row[0] ] result = { - "doc_id": row[1], + "cset_id": row[1], "sr_type": row[2], "_key": "{}:{}:{}".format( row[1], row[2], row[0] ), "_score": - row[3], @@ -347,17 +347,17 @@ def init_search( startup_msgs, logger ): # the overall content, and also lets us do AND/OR queries across all searchable content. conn.execute( "CREATE VIRTUAL TABLE searchable USING fts5" - " ( doc_id, sr_type, title, subtitle, content, rulerefs, tokenize='porter unicode61' )" + " ( cset_id, sr_type, title, subtitle, content, rulerefs, tokenize='porter unicode61' )" ) # load the searchable content logger.info( "Loading the search index..." ) conn.execute( "DELETE FROM searchable" ) curs = conn.cursor() - for cdoc in webapp_content.content_docs.values(): - logger.info( "- Loading index file: %s", cdoc["_fname"] ) + for cset in webapp_content.content_sets.values(): + logger.info( "- Loading index file: %s", cset["index_fname"] ) nrows = 0 - for index_entry in cdoc["index"]: + for index_entry in cset["index"]: rulerefs = _RULEREF_SEPARATOR.join( r.get("caption","") for r in index_entry.get("rulerefs",[]) ) # NOTE: We should really strip content before adding it to the search index, otherwise any HTML tags # will need to be included in search terms. However, this means that the content returned by a query @@ -365,8 +365,8 @@ def init_search( startup_msgs, logger ): # but that means we would lose the highlighting of search terms that SQLite gives us. We opt to insert # the original content, since none of it should contain HTML, anyway. 
curs.execute( - "INSERT INTO searchable (doc_id,sr_type,title,subtitle,content,rulerefs) VALUES (?,?,?,?,?,?)", ( - cdoc["doc_id"], "index", + "INSERT INTO searchable (cset_id,sr_type,title,subtitle,content,rulerefs) VALUES (?,?,?,?,?,?)", ( + cset["cset_id"], "index", index_entry.get("title"), index_entry.get("subtitle"), index_entry.get("content"), rulerefs ) ) _fts_index_entries[ curs.lastrowid ] = index_entry diff --git a/asl_rulebook2/webapp/startup.py b/asl_rulebook2/webapp/startup.py index cf73ae6..4946271 100644 --- a/asl_rulebook2/webapp/startup.py +++ b/asl_rulebook2/webapp/startup.py @@ -6,7 +6,7 @@ from collections import defaultdict from flask import jsonify from asl_rulebook2.webapp import app -from asl_rulebook2.webapp.content import load_content_docs +from asl_rulebook2.webapp.content import load_content_sets from asl_rulebook2.webapp.search import init_search _logger = logging.getLogger( "startup" ) @@ -26,7 +26,7 @@ def init_webapp(): _startup_msgs = StartupMsgs() # initialize the webapp - load_content_docs( _startup_msgs, _logger ) + load_content_sets( _startup_msgs, _logger ) init_search( _startup_msgs, _logger ) # --------------------------------------------------------------------- diff --git a/asl_rulebook2/webapp/static/ContentPane.js b/asl_rulebook2/webapp/static/ContentPane.js index 53fa05e..8416578 100644 --- a/asl_rulebook2/webapp/static/ContentPane.js +++ b/asl_rulebook2/webapp/static/ContentPane.js @@ -8,14 +8,14 @@ gMainApp.component( "content-pane", { template: ` - - + + `, mounted() { - gEventBus.on( "show-target", (docId, target) => { //eslint-disable-line no-unused-vars - this.$refs.tabbedPages.activateTab( docId ) ; // nb: tabId == docId + gEventBus.on( "show-target", (cdocId, target) => { //eslint-disable-line no-unused-vars + this.$refs.tabbedPages.activateTab( cdocId ) ; // nb: tabId == cdocId } ) ; }, @@ -25,7 +25,7 @@ gMainApp.component( "content-pane", { gMainApp.component( "content-doc", { - props: [ "doc" ], + props: [ 
"cdoc" ], data() { return { target: null, noContent: gUrlParams.get( "no-content" ), @@ -33,14 +33,17 @@ gMainApp.component( "content-doc", { template: `
-
Content disabled.
target = {{target}}
-