# asl-rulebook2/asl_rulebook2/extract/all.py

#!/usr/bin/env python3
""" Extract everything we need from the MMP eASLRB. """

import os
import json
import re
import importlib

import click

from asl_rulebook2.pdf import PdfDoc
from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr
from asl_rulebook2.extract.index import ExtractIndex
from asl_rulebook2.extract.content import ExtractContent

# ---------------------------------------------------------------------

class ExtractAll( ExtractBase ):
    """Run the index and content extractors over the eASLRB, then cross-check their results.

    Once extract_all() has run, the two extractor objects are available
    as the "extract_index" and "extract_content" attributes.
    """

    def __init__( self, args, log=None ):
        """Remember the configuration parameters (nothing is extracted until extract_all() is called).

        args: the raw configuration parameters (these are passed to ExtractBase.parse_args() later).
        log: optional logging callback (forwarded to the base class constructor).
        """
        super().__init__( None, None, log )
        self._args = args
        # NOTE: These get set by extract_all().
        self.extract_index = None
        self.extract_content = None

    def extract_all( self, pdf ):
        """Extract the index and the content from the PDF, then verify the index's targets.

        pdf: the PDF document (passed as-is to both extractors).
        """

        # collect the default arguments declared by each extractor module
        default_args = {}
        for mod_name in ( "index", "content" ):
            module = importlib.import_module( "asl_rulebook2.extract." + mod_name )
            default_args.update( getattr( module, "_DEFAULT_ARGS" ) )

        # run the index extractor
        self.log_msg( "progress",  "\nExtracting the index..." )
        index_args = ExtractBase.parse_args( self._args, default_args )
        self.extract_index = ExtractIndex( index_args, self._log )
        self.extract_index.extract_index( pdf )

        # run the content extractor
        self.log_msg( "progress",  "\nExtracting the content..." )
        content_args = ExtractBase.parse_args( self._args, default_args )
        self.extract_content = ExtractContent( content_args, self._log )
        self.extract_content.extract_content( pdf )

        # cross-check the two sets of results
        self._check_targets()

    @staticmethod
    def _load_known_missing_ruleids():
        """Load the ruleid's that are known to have no target in the main content.

        Returns a ( strings, regexes ) tuple: a set of literal strings, and a set of compiled regexes.
        """
        fname = os.path.join( os.path.dirname(__file__), "data/known-missing-ruleids.json" )
        with open( fname, "r", encoding="utf-8" ) as fp:
            data = json.load( fp )
        # NOTE: Each listed chapter gets a regex that accepts that prefix, followed by
        # digits/dots and an optional trailing letter A-E (either case).
        regexes = {
            re.compile( "^{}[0-9.]+[A-Ea-e]?$".format( chapter ) )
            for chapter in data["chapters"]
        }
        strings = set( data["strings"] )
        regexes.update( re.compile( regex ) for regex in data["regexes"] )
        return strings, regexes

    def _check_targets( self ):
        """Log a warning for each ruleid/ruleref in the index that has no matching target in the main content."""

        # collect the targets found in the main content
        targets = {}
        for target_id, target in self.extract_content.targets.items():
            assert target_id not in targets
            targets[ target_id ] = target["caption"]

        # get the ruleid's we already know are missing
        known_strings, known_regexes = self._load_known_missing_ruleids()

        def is_known_ruleid( ruleid ):
            # normalize the ruleid, then look for it in each place it could be known
            base = re.sub( r"-[A-Z]?\.?\d+$", "", ruleid ) # e.g. "A1.23-.45" -> "A1.23"
            if base.endswith( " EX" ):
                base = base[:-3]
            return (
                base in targets
                or base in known_strings
                or any( regex.search( base ) for regex in known_regexes )
            )

        # check each index entry
        header_logged = False
        for index_entry in self.extract_index.index_entries:

            # check the index entry's ruleid's
            errors = [
                "Unknown ruleid: {}".format( ruleid )
                for ruleid in index_entry.get( "ruleids", [] )
                if not is_known_ruleid( ruleid )
            ]

            # check the index entry's ruleref's
            for ruleref in index_entry.get( "rulerefs", [] ):
                ruleids = ruleref["ruleids"]
                if not ruleids:
                    continue
                if ", ".join( ruleids ) in known_strings:
                    # NOTE: This is some free-form text that has been split up because it contains commas.
                    continue
                errors.extend(
                    "Unknown ruleref target: {} => [{}]".format( ruleref["caption"], ruleid )
                    for ruleid in ruleids
                    if not is_known_ruleid( ruleid )
                )

            # nothing to report for this index entry
            if not errors:
                continue

            # log the errors (the section header is only logged once)
            if not header_logged:
                self.log_msg( "warning", "\n=== Unknown targets ===\n" )
                header_logged = True
            self.log_msg( "warning", "{}:\n{}",
                index_entry["caption"], "\n".join( "- {}".format( err ) for err in errors )
            )

# ---------------------------------------------------------------------

@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]),
    help="Output format."
)
@click.option( "--save-index","save_index_fname", required=True, help="Where to save the extracted index." )
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, output_fmt, save_index_fname, save_targets_fname, save_footnotes_fname ):
    """Extract everything we need from the MMP eASLRB."""
    # NOTE: click uses the docstring above as the command's help text.

    def on_log_msg( msg_type, msg ):
        # progress messages are only shown if they were asked for
        if msg_type != "progress" or progress:
            log_msg_stderr( msg_type, msg )

    # run the extraction
    extract = ExtractAll( args, on_log_msg )
    extract.log_msg( "progress",  "Loading PDF: {}", pdf_file )
    with PdfDoc( pdf_file ) as pdf:
        extract.extract_all( pdf )

    # write out the results, using the save method that matches the requested output format
    save_method = "save_as_{}".format( output_fmt )
    with open( save_index_fname, "w", encoding="utf-8" ) as index_fp, \
         open( save_targets_fname, "w", encoding="utf-8" ) as targets_fp, \
         open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_fp:
        save_index = getattr( extract.extract_index, save_method )
        save_index( index_fp )
        save_content = getattr( extract.extract_content, save_method )
        save_content( targets_fp, footnotes_fp )

# Run the command-line tool when this file is executed as a script.
# NOTE: click parses the command line and passes in main()'s parameters, hence the pylint suppression.
if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter