You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
150 lines
6.2 KiB
150 lines
6.2 KiB
#!/usr/bin/env python3
|
|
""" Extract everything we need from the MMP eASLRB. """
|
|
|
|
import os
|
|
import json
|
|
import re
|
|
import importlib
|
|
|
|
import click
|
|
|
|
from asl_rulebook2.pdf import PdfDoc
|
|
from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr
|
|
from asl_rulebook2.extract.index import ExtractIndex
|
|
from asl_rulebook2.extract.content import ExtractContent
|
|
|
|
# ---------------------------------------------------------------------
|
|
|
|
class ExtractAll( ExtractBase ):
    """Run the index and content extractors over the eASLRB, then cross-check their results.

    Once extract_all() has been called, the two extractor objects are available
    via the "extract_index" and "extract_content" attributes.
    """

    def __init__( self, args, log=None ):
        """Remember the raw arguments; the extractors themselves are created by extract_all()."""
        super().__init__( None, None, log )
        self._args = args
        # NOTE: These are populated by extract_all().
        self.extract_content = None
        self.extract_index = None

    def extract_all( self, pdf ):
        """Extract the index and the content from the PDF, then check the index against the content."""

        # merge the default arguments declared by each extractor module
        default_args = {}
        for mod_name in ( "index", "content" ):
            module = importlib.import_module( "asl_rulebook2.extract." + mod_name )
            default_args.update( getattr( module, "_DEFAULT_ARGS" ) )

        # extract the index
        self.log_msg( "progress", "\nExtracting the index..." )
        index_args = ExtractBase.parse_args( self._args, default_args )
        self.extract_index = ExtractIndex( index_args, self._log )
        self.extract_index.extract_index( pdf )

        # extract the content
        self.log_msg( "progress", "\nExtracting the content..." )
        content_args = ExtractBase.parse_args( self._args, default_args )
        self.extract_content = ExtractContent( content_args, self._log )
        self.extract_content.extract_content( pdf )

        # verify the index targets
        self._check_targets()

    def _check_targets( self ):
        """Log a warning for every ruleid/ruleref in the index that has no known target.

        A ruleid is considered known if it is a target in the extracted content,
        or if it is covered by the "known-missing-ruleids.json" data file.
        """

        # map each target in the main content to its caption
        targets = {}
        for key, val in self.extract_content.targets.items():
            assert key not in targets
            targets[ key ] = val["caption"]

        # load the ruleid's that are known to be missing
        fname = os.path.join( os.path.dirname(__file__), "data/known-missing-ruleids.json" )
        with open( fname, "r", encoding="utf-8" ) as fp:
            data = json.load( fp )
        # each listed chapter gets a regex matching the chapter followed by
        # digits/dots and an optional trailing letter (A-E, either case)
        known_regexes = {
            re.compile( "^{}[0-9.]+[A-Ea-e]?$".format( chapter ) )
            for chapter in data["chapters"]
        }
        known_strings = set( data["strings"] )
        known_regexes |= { re.compile( regex ) for regex in data["regexes"] }

        def is_known( ruleid ):
            """Check if a ruleid is a content target, or is on the known-missing list."""
            # strip any trailing range, e.g. "A1.23-.45" -> "A1.23"
            base = re.sub( r"-[A-Z]?\.?\d+$", "", ruleid )
            # strip a trailing " EX"
            if base.endswith( " EX" ):
                base = base[:-3]
            return (
                base in targets
                or base in known_strings
                or any( regex.search( base ) for regex in known_regexes )
            )

        # check each index entry
        header_logged = False
        for index_entry in self.extract_index.index_entries:

            # check the index entry's ruleid's
            errors = [
                "Unknown ruleid: {}".format( ruleid )
                for ruleid in index_entry.get( "ruleids", [] )
                if not is_known( ruleid )
            ]

            # check the index entry's ruleref's
            for ruleref in index_entry.get( "rulerefs", [] ):
                ruleids = ruleref["ruleids"]
                # NOTE: If the re-joined ruleid's are a known string, this is some free-form text
                # that has been split up because it contains commas.
                if not ruleids or ", ".join( ruleids ) in known_strings:
                    continue
                errors.extend(
                    "Unknown ruleref target: {} => [{}]".format( ruleref["caption"], ruleid )
                    for ruleid in ruleids
                    if not is_known( ruleid )
                )

            # log any errors (the section header is only logged once, before the first report)
            if not errors:
                continue
            if not header_logged:
                self.log_msg( "warning", "\n=== Unknown targets ===\n" )
                header_logged = True
            bullets = "\n".join( "- {}".format( e ) for e in errors )
            self.log_msg( "warning", "{}:\n{}", index_entry["caption"], bullets )
|
|
|
|
# ---------------------------------------------------------------------
|
|
|
|
@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]),
    help="Output format."
)
@click.option( "--save-index","save_index_fname", required=True, help="Where to save the extracted index." )
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, output_fmt, save_index_fname, save_targets_fname, save_footnotes_fname ):
    """Extract everything we need from the MMP eASLRB."""
    # NOTE: click shows the docstring above as the command's help text.

    def on_log_msg( msg_type, msg ):
        """Pass a message on to log_msg_stderr(), dropping progress messages unless --progress was given."""
        if progress or msg_type != "progress":
            log_msg_stderr( msg_type, msg )

    # extract everything
    extract = ExtractAll( args, on_log_msg )
    extract.log_msg( "progress", "Loading PDF: {}", pdf_file )
    with PdfDoc( pdf_file ) as pdf:
        extract.extract_all( pdf )

    # save the results, using the extractors' save_as_raw/save_as_text/save_as_json methods
    save_method = "save_as_" + output_fmt
    with open( save_index_fname, "w", encoding="utf-8" ) as index_out:
        with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out:
            with open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
                # the index goes to one file, the content is split across targets and footnotes
                getattr( extract.extract_index, save_method )( index_out )
                getattr( extract.extract_content, save_method )( targets_out, footnotes_out )
|
|
|
|
# Run the command-line interface when this file is executed as a script.
# NOTE: main() is a click command, so it is called here without explicit arguments
# (hence the pylint suppression).
if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter
|
|
|