# asl-rulebook2/asl_rulebook2/extract/content.py

#!/usr/bin/env python3
""" Extract content from the MMP eASLRB. """

import os
import json
import re
import math
from collections import defaultdict

import click
from pdfminer.layout import LTChar

from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr
from asl_rulebook2.pdf import PdfDoc, PageIterator, PageElemIterator
from asl_rulebook2.utils import parse_page_numbers, fixup_text, append_text, remove_trailing, jsonval

# NOTE: Characters are laid out individually on the page, and we generally want to process them top-to-bottom,
# left-to-right, but in some cases, alignment is messed up (e.g. the bounding boxes don't line up properly
# and e.g. the first part of a sentence is infinitesimally lower down than the rest of the sentence, and so
# appears later in the sort order), and we get better results if we process characters in the order in which
# they appear in the PDF document.
# The ASL page IDs (chapter letter + page number within the chapter, e.g. "B40") whose elements
# are processed in document order, instead of being sorted.
_DISABLE_SORT_ITEMS = [
    "B40", # nb: to detect B31.1 NARROW STREET
    "A58","A59","A60", # Chapter A footnotes (nb: page A61 is a mess wrt element order :-/)
    "B45", "B46", # Chapter B footnotes
    "C25", "C26", # Chapter C footnotes
    "D27", # Chapter D footnotes
    "E28", "E29", "E30", # Chapter E footnotes
    "F20", "F21", # Chapter F footnotes
    "G48", "G49", "G50", # Chapter G footnotes
]

# The default configuration parameters.
# - "chapter-X": the pages that make up chapter X (nb: these are matched against the page numbers reported
#   when iterating over the PDF, not the ASL page numbers).
# - "content_vp_*": the viewport for content (presumably elements outside this are ignored - the check is
#   done by _in_viewport(), which is not defined in this file).
# - "disable-sort-items": the comma-separated list of page ID's for which element sorting is disabled.
_DEFAULT_ARGS = {
    "chapter-a": "42-102", "chapter-b": "109-154", "chapter-c": "158-183", "chapter-d": "187-213",
    "chapter-e": "216-245", "chapter-f": "247-267", "chapter-g": "270-319",
    "chapter-j": "593",
    "chapter-w": "647-664",
    "content_vp_left": 0, "content_vp_right": 565, "content_vp_top": 715, "content_vp_bottom": 28, # viewport
    "disable-sort-items": ",".join( _DISABLE_SORT_ITEMS )
}

# ---------------------------------------------------------------------

class ExtractContent( ExtractBase ):
    """Extract content from the MMP eASLRB."""

    def __init__( self, args, log=None ):
        """Initialize the extractor, and load the data files used to fixup the extracted content."""
        super().__init__( args, _DEFAULT_ARGS, log )

        # initialize the results and the parsing state
        self.targets = {}
        self._footnotes = {}
        self._curr_chapter = None
        self._curr_footnote = None
        self._curr_pageid = None
        self._prev_elem = None
        self._top_left_elem = None

        # prepare to fixup problems in the content
        def load_fixups( rel_path ):
            """Load a JSON data file that lives alongside this module."""
            fname = os.path.join( os.path.dirname(__file__), rel_path )
            with open( fname, "r", encoding="utf-8" ) as fp:
                return json.load( fp )
        self._target_fixups = load_fixups( "data/target-fixups.json" )
        self._footnote_fixups = load_fixups( "data/footnote-fixups.json" )

    def extract_content( self, pdf ):
        """Extract content from the MMP eASLRB.

        Processes each configured page of the PDF, saving the rule targets found in self.targets,
        and the footnotes found in self._footnotes. A warning is logged for any fixups that
        were not used.

        pdf: the PDF document to process (it is passed through to PageIterator).
        """

        # figure out which pages to process
        # NOTE: The chapters are identified by the keys in the default args, but the page ranges are taken
        # from self._args, so that any configured overrides are honored.
        page_index = {} # maps page numbers to chapter
        for key in _DEFAULT_ARGS:
            if not key.startswith( "chapter-" ):
                continue
            assert len(key) == 9
            chapter = key[8].upper()
            for chapter_page_no in parse_page_numbers( self._args[key] ):
                page_index[ chapter_page_no ] = chapter
        last_page_no = max( page_index, default=0 ) # nb: there's nothing to do after this page
        disable_sort_items = set( self._args["disable-sort-items"].split( "," ) )

        # initialize
        self._curr_chapter = None
        curr_chapter_pageno = None
        self._curr_footnote = None
        curr_caption = None # nb: [ text, (x,y) ] for the caption currently being collected
        curr_page_no = None # nb: the PDF page# of the last page we processed
        def is_char( elem ):
            """Only individual characters are of interest."""
            return isinstance( elem, LTChar )

        # NOTE: The parsing code works in two modes.
        # - We start off extracting content, and detect the start of a new rule by bold text near the start of the line.
        # - When we see the footnotes header (e.g. "CHAPTER A FOOTNOTES"), we switch into footnotes mode, and detect
        #   the start of a footnote by a bold number near the start of the line.

        # process each page
        for page_no, _, lt_page in PageIterator( pdf ):

            # prepare to process the next page
            if page_no > last_page_no:
                break
            if page_no not in page_index:
                self.log_msg( "progress", "- Skipping page {}.", page_no )
                continue
            if not self._curr_chapter or self._curr_chapter != page_index[page_no]:
                # we've found the start of a new chapter
                self._save_footnote() # nb: save the last footnote of the previous chapter
                self._curr_chapter = page_index[ page_no ]
                curr_chapter_pageno = 1
            else:
                curr_chapter_pageno += 1
            self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page#
                self._curr_chapter, curr_chapter_pageno
            )
            self.log_msg( "progress", "- Processing page {} ({})...", page_no, self._curr_pageid )
            curr_page_no = page_no

            # process each element on the page
            # NOTE: A caption that was still being collected at the end of the previous page is discarded.
            curr_caption = None
            self._top_left_elem = self._prev_elem = None
            sort_elems = self._curr_pageid not in disable_sort_items
            for _, elem in PageElemIterator( lt_page, elem_filter=is_char, sort_elems=sort_elems ):

                # keep track of the top-left-most bold element
                if self._is_bold( elem ):
                    if self._top_left_elem is None \
                       or elem.x0 < self._top_left_elem.x0 and elem.y1 > self._top_left_elem.y1:
                        self._top_left_elem = elem

                # check if we should ignore this element
                if not self._in_viewport( elem, "content" ):
                    continue

                # check if we're currently extracting footnotes
                if self._curr_footnote is not None:
                    self._on_footnote_elem( elem, lt_page )
                    self._prev_elem = elem
                    continue

                # figure out what we've got
                is_bold = self._is_bold( elem )
                ch = curr_caption[0] if curr_caption else None #pylint: disable=unsubscriptable-object
                if is_bold and ch and ch.isdigit() and 1 < elem.y1 - self._prev_elem.y0 < elem.height/2:
                    # the previous bold character looks like a footnote superscript - ignore it
                    curr_caption = None
                if curr_caption and elem.get_text() == " ":
                    # FUDGE! Some captions are in a bold font, but the spaces are not :-/
                    is_bold = True
                if is_bold:
                    if curr_caption:
                        # NOTE: We stop collecting bold characters at the end of the line, even if they continue on
                        # to the next line. This is to handle the case of a major heading (e.g. "1. PERSONNEL COUNTERS")
                        # being followed by a lesser heading ("1.1"). However, we want to handle captions that span
                        # multiple lines, so we check the vertical distance between the lines to see if it looks like
                        # two separate headings, or a single caption that has spread over multiple lines.
                        if self._prev_elem.y0 - elem.y1 > 0.25*elem.height:
                            # we've found the start of a new rule - save the old one, start collecting the new caption
                            self._save_target( curr_caption, page_no, lt_page, elem )
                            curr_caption = [ elem.get_text(), ( elem.x0, elem.y1 ) ]
                        else:
                            # continue collecting the caption
                            if self._prev_elem.y0 - elem.y0 > 1:
                                # nb: we just started a new line
                                curr_caption[0] = append_text( #pylint: disable=unsupported-assignment-operation
                                    curr_caption[0], elem.get_text() #pylint: disable=unsubscriptable-object
                                )
                            else:
                                curr_caption[0] += elem.get_text() #pylint: disable=unsupported-assignment-operation
                    else:
                        # check if this is the first character of the line
                        if self._is_start_of_line( elem, lt_page ):
                            # yup - start collecting the caption
                            curr_caption = [ elem.get_text(), ( elem.x0, elem.y1 ) ]
                else:
                    # check if we're currently collecting a caption
                    if curr_caption:
                        # yup - we've just found the end of it, save it
                        self._save_target( curr_caption, page_no, lt_page, elem )
                        curr_caption = None

                # loop back to process the next element
                self._prev_elem = elem

        # add the last caption/footnote (if they haven't already been done)
        # NOTE: We use the page# of the last page we actually processed, since the loop variable may have
        # moved on to a page that was skipped, or that is past the end of the last chapter.
        self._save_footnote()
        if curr_caption:
            self._save_target( curr_caption, curr_page_no, None, None )

        # check for unused fixups
        if self._target_fixups:
            self.log_msg( "warning", "Unused fixups: {}", self._target_fixups )
        if self._footnote_fixups:
            self.log_msg( "warning", "Unused fixups: {}", self._footnote_fixups )

    def _save_target( self, caption, page_no, lt_page, elem ):
        """Record a caption that has just been collected.

        caption: [ text, (x,y) ] for the caption.
        page_no: the page number to record for the target.
        lt_page, elem: the current page and element; if the caption is a footnotes header, the element
          (if any) is handed over to the footnote parser.
        """

        # normalize the caption
        raw_caption = caption[0]
        text = re.sub( r"\s+", " ", raw_caption ).strip()
        if len(text) <= 1:
            # NOTE: We're finding text that is part of an image (e.g. the "E" for an Elite MMC),
            # perhaps because the pages were OCR'ed, so we ignore these.
            return

        # check if this is the header for the chapter's footnotes
        if "FOOTNOTES" in text:
            # yup - switch the main loop into footnotes mode
            self._curr_footnote = []
            if elem:
                self._on_footnote_elem( elem, lt_page )
            return

        # figure out the ruleid and caption
        page_fixups = self._target_fixups.get( self._curr_pageid, {} )
        fixup = page_fixups.get( text )
        if fixup:
            # the entry needs to be fixed up - count off this use, and drop the fixup once it's used up
            remaining = fixup.get( "instances", 1 ) - 1
            fixup[ "instances" ] = remaining
            if remaining <= 0:
                page_fixups.pop( text )
                if not page_fixups:
                    del self._target_fixups[ self._curr_pageid ]
            ruleid = fixup.get( "new_ruleid" )
            if not ruleid:
                # nb: a fixup without a new ruleid means the caption is dropped
                return
            text = fixup.get( "new_caption" )
        else:
            # use what was parsed
            # FUDGE! There are a lot of layout problems with things like "12.CONCEALMENT" (i.e. missing space),
            # and it's tricky to detect these and not get tripped up by things like "12.C blah", so we handle it
            # as a separate case.
            mo = re.search( r"^(\d+\.\d*)([^ 0-9].+)", text )
            if mo:
                ruleid, text = mo.group(1), mo.group(2).strip()
            else:
                # look for something at the start of the caption that looks like a ruleid
                # NOTE: A leading "*" indicates an optional rule.
                mo = re.search( r"^\*?([A-Z]\.?)?[1-9][0-9.-]*[A-F]?", text )
                if not mo:
                    return
                ruleid, text = mo.group(), text[mo.end():].strip()
                if ruleid.startswith( "*" ):
                    ruleid = ruleid[1:]
            ruleid = remove_trailing( ruleid, "." )
            text = remove_trailing( text, ":" )

        # save the new target (nb: the first one found for each ruleid wins)
        if not ruleid.startswith( self._curr_chapter ):
            ruleid = self._curr_chapter + ruleid
        if ruleid in self.targets:
            self.log_msg( "warning", "Ignoring duplicate ruleid: {} (from \"{}\").",
                ruleid, caption[0]
            )
            return
        if text == "\u2014":
            text = "-" # nb: for A7.306 :-/
        self.targets[ ruleid ] = {
            "caption": fixup_text(text), "page_no": page_no, "pos": caption[1],
            "raw_caption": raw_caption
        }

    def _on_footnote_elem( self, elem, lt_page ):
        """Process an element while we're parsing footnotes."""
        # check if we've found the start of a new footnote
        if self._is_bold( elem ):
            if elem.get_text().isdigit() and self._is_start_of_line( elem, lt_page ):
                # yup - save the current footnote, start collecting the new one
                self._save_footnote()
                self._curr_footnote = [ elem.get_text(), "" ]
            else:
                if self._curr_footnote[1]:
                    # FUDGE! Some footnote content has bold text hard-up at the left margin,
                    # so we collect that as normal content.
                    self._curr_footnote[1] += elem.get_text()
                else:
                    # we're still collecting the footnote's ID
                    # NOTE: Older chapters have only the footnote ID in bold text, while newer chapters have
                    # both the ID and caption in bold. We figure out what's going on later, in _save_footnote().
                    self._curr_footnote[0] += elem.get_text()
        else:
            # nope - we're still collecting the footnote's content
            if not self._prev_elem or elem.x0 < self._prev_elem.x0 or elem.y0 - self._prev_elem.y0 > lt_page.height/2:
                # nb: we just started a new line
                self._curr_footnote[1] = append_text( self._curr_footnote[1], elem.get_text() )
            else:
                self._curr_footnote[1] += elem.get_text()

    def _save_footnote( self ):
        """Save a parsed footnote."""

        if not self._curr_footnote:
            return

        # initialize
        if self._curr_chapter not in self._footnotes:
            # start saving footnotes for the chapter
            self._footnotes[ self._curr_chapter ] = []
        orig_content = self._curr_footnote[1]

        # separate the footnote ID, referenced rule, and content
        if self._curr_chapter in ( "F", "G", "W" ):
            # NOTE: Chapter F/G footnote captions are also bold.
            mo = re.search( r"^\d{1,2}\.", self._curr_footnote[0] )
            if mo:
                parts = mo.group(), self._curr_footnote[0][mo.end():]
                self._curr_footnote[0] = parts[0]
                self._curr_footnote[1] = parts[1].strip() + " " + self._curr_footnote[1].strip()
            else:
                self.log_msg( "warning", "Couldn't split Chapter F footnote caption: {}", self._curr_footnote[0] )
        footnote_id = remove_trailing( self._curr_footnote[0].strip(), "." )
        content = self._curr_footnote[1].strip()
        mo = re.search( r"^(F\.1B|W\.\d+[AB]|[A-Z]?[0-9.]+)", content )
        if mo:
            ruleid, content = mo.group(), content[mo.end():]
            if not ruleid.startswith( self._curr_chapter ):
                ruleid = self._curr_chapter + ruleid
            ruleid = remove_trailing( ruleid, "." )
        else:
            ruleid = None
        if self._curr_chapter == "C":
            # FUDGE! The "29." for Chapter C's footnote #29 is misaligned, and is extracted as two separate
            # footnotes "2" and "9". There isn't really any way to fix this via the normal data-driven mechanism,
            # so we do it in the code here :-/
            footnote_ids = [ f["footnote_id"] for f in self._footnotes[self._curr_chapter] ]
            if footnote_id == "2" and "2" in footnote_ids:
                return
            if footnote_id == "9" and "9" in footnote_ids:
                footnote_id = "29"

        # clean up the content
        content = re.sub( r"\s+", " ", content ).strip()
        content = fixup_text( content )
        mo = re.search( r"^[A-Z ]+:\S", content )
        if mo:
            content = content[:mo.end()-1] + " " + content[mo.end()-1:]

        # check for any fixups
        captions = []
        fixups = self._footnote_fixups.get( self._curr_chapter, {} ).get( footnote_id )
        if fixups:
            if isinstance( fixups, list ):
                # NOTE: A simple search-and-replace is, by far, the most common fixup, so we provide
                # a simplified way of specifying these in the fixup file
                fixups = { "replace": [ ( sr[0], sr[1] ) for sr in fixups ] }
            errors = defaultdict( list )
            # do any search-replace's
            if "replace" in fixups:
                for sr in fixups["replace"]:
                    prev_content = content
                    content = content.replace( sr[0], sr[1] )
                    if content == prev_content:
                        self.log_msg( "warning", "Footnote fixup for \"{}:{}\" had no effect: {}",
                            self._curr_chapter, footnote_id, sr[0]
                        )
                        errors["replace"].append( sr )
                del fixups["replace"]
            # replace the captions
            if "captions" in fixups:
                captions = fixups.pop( "captions" )
            # check that all fixups were successfully applied
            if fixups:
                errors.append( fixups )
            if errors:
                self._footnote_fixups[ self._curr_chapter ][ footnote_id ] = errors
            else:
                del self._footnote_fixups[ self._curr_chapter ][ footnote_id ]
                if not self._footnote_fixups[ self._curr_chapter ]:
                    del self._footnote_fixups[ self._curr_chapter ]
            content = content.strip()

        # extract the footnote's caption
        if not captions:
            pos = content.find( ":" )
            if pos >= 0:
                captions.append( ( ruleid, content[:pos] ) )
                content = content[pos+1:].strip()
            else:
                self.log_msg( "warning", "Can't extract footnote caption: {}:{} - {}",
                    self._curr_chapter, footnote_id, content
                )

        # check for the credits at the end of the Chapter F footnotes
        pos = content.find( "WEST OF ALAMEIN CREDITS" )
        if pos > 0:
            content = content[:pos]

        # save the footnote
        self._footnotes[ self._curr_chapter ].append( {
            "footnote_id": footnote_id,
            "captions": captions,
            "content": content,
            "raw_content": orig_content
        } )
        self._curr_footnote = None

    def _is_start_of_line( self, elem, lt_page ):
        """Check if the element is at the start of its line."""
        # NOTE: We can't just check the element's x co-ordinate, since there is sometimes a floating image
        # that pushes the text right (e.g. A.12).
        if self._prev_elem is None:
            return True
        if elem.y0 < self._prev_elem.y0:
            return True
        if self._prev_elem.x0 < lt_page.width/2 and elem.x0 > lt_page.width/2:
            return True # the element is at the top of the right column
        return False

    def save_as_raw( self, targets_out, footnotes_out ):
        """Save the raw results."""
        self._save_as_raw_or_text( targets_out, footnotes_out, True )

    def save_as_text( self, targets_out, footnotes_out ):
        """Save the results as plain-text."""
        self._save_as_raw_or_text( targets_out, footnotes_out, False )

    def _save_as_raw_or_text( self, targets_out, footnotes_out, raw ):
        """Save the results as raw or plain-text."""

        # save the targets
        curr_page_no = None
        for ruleid, target in self.targets.items():
            if target["page_no"] != curr_page_no:
                if curr_page_no:
                    print( file=targets_out )
                print( "=== p{} ===".format( target["page_no"] ), file=targets_out )
                curr_page_no = target["page_no"]
            xpos, ypos = self._get_target_pos( target )
            if raw:
                print( "[{},{}] = {}".format(
                    xpos, ypos, target["raw_caption"]
                ), file=targets_out )
            else:
                print( "{} => {} @ p{}:[{},{}]".format(
                    ruleid, target["caption"], target["page_no"], xpos, ypos
                ), file=targets_out )

        # save the footnotes
        def make_caption( caption ):
            buf = []
            if caption[1]:
                buf.append( caption[1] )
                if caption[0]:
                    buf.append( "[{}]".format( caption[0] ) )
            elif caption[0]:
                buf.append( caption[0] )
            return " ".join( buf )
        for chapter, footnotes in self._footnotes.items():
            if chapter != "A":
                print( file=footnotes_out )
            print( "=== CHAPTER {} FOOTNOTES {}".format( chapter, 80*"=" )[:80], file=footnotes_out )
            for footnote in footnotes:
                print( file=footnotes_out )
                print( "--- Footnote {} ---".format( footnote["footnote_id"] ), file=footnotes_out )
                if raw:
                    print( footnote["raw_content"], file=footnotes_out )
                else:
                    print( " ; ".join( make_caption(c) for c in footnote["captions"] ), file=footnotes_out )
                    print( footnote["content"], file=footnotes_out )

    def save_as_json( self, targets_out, footnotes_out ):
        """Write the results as JSON.

        NOTE: The JSON is assembled by hand, so that we can control how it's laid out.
        """

        # save the targets (one per line, with a blank line before the start of each chapter)
        target_entries = []
        prev_chapter = None
        for ruleid, target in self.targets.items():
            xpos, ypos = self._get_target_pos( target )
            entry = "{}: {{ \"caption\": {}, \"page_no\": {}, \"pos\": [{},{}] }}".format(
                jsonval( ruleid ),
                jsonval(target["caption"]), target["page_no"], xpos, ypos
            )
            chapter = ruleid[0]
            if chapter != prev_chapter:
                entry = "\n" + entry
                prev_chapter = chapter
            target_entries.append( entry )
        print( "{{\n{}\n\n}}".format( ",\n".join( target_entries ) ), file=targets_out )

        # save the footnotes (grouped by chapter)
        def caption_to_json( caption ):
            """Convert a (ruleid, caption) pair to JSON."""
            return "{{ \"caption\": {}, \"ruleid\": {} }}".format(
                jsonval(caption[1]), jsonval(caption[0])
            )
        def footnote_to_json( footnote ):
            """Convert a footnote to a JSON key/value entry."""
            return "{}: {{\n  \"captions\": {},\n  \"content\": {}\n}}".format(
                jsonval( footnote["footnote_id"] ),
                "[ {} ]".format( ", ".join( caption_to_json(c) for c in footnote["captions"] ) ),
                jsonval( footnote["content"] )
            )
        chapter_entries = []
        for chapter, footnotes in self._footnotes.items():
            body = ",\n".join( footnote_to_json(f) for f in footnotes )
            chapter_entries.append( "{}: {{\n\n{}\n\n}}".format( jsonval( chapter ), body ) )
        print( "{{\n\n{}\n\n}}".format( ",\n\n".join( chapter_entries ) ), file=footnotes_out )

    @staticmethod
    def _get_target_pos( target ):
        """Return a target's X/Y position on the page."""
        xpos = math.floor( target["pos"][0] )
        ypos = math.ceil( target["pos"][1] )
        return xpos, ypos

# ---------------------------------------------------------------------

@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]),
    help="Output format."
)
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, output_fmt, save_targets_fname, save_footnotes_fname ):
    """Extract content from the MMP eASLRB."""

    # parse the configuration parameters
    args = ExtractBase.parse_args( args, _DEFAULT_ARGS )

    def log_msg( msg_type, msg ):
        """Log a message to stderr (progress messages are only shown if they were requested)."""
        if progress or msg_type != "progress":
            log_msg_stderr( msg_type, msg )

    # extract the content
    extract = ExtractContent( args, log_msg )
    extract.log_msg( "progress",  "Loading PDF: {}", pdf_file )
    with PdfDoc( pdf_file ) as pdf:
        extract.extract_content( pdf )

    # save the results, using the save_as_xxx() method for the requested format
    with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
         open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
        save_results = getattr( extract, "save_as_" + output_fmt )
        save_results( targets_out, footnotes_out )

# run the command-line interface, if this file is being run as a script
if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter