Extracted the index, rule targets and footnotes from the eASLRB.

branch: master
author: Pacman Ghost (3 years ago)
parent: c2265404bc
commit: e3ebbcd0f7
16 files changed:
  asl_rulebook2/extract/all.py (+147)
  asl_rulebook2/extract/base.py (+59)
  asl_rulebook2/extract/content.py (+524)
  asl_rulebook2/extract/data/footnote-fixups.json (+209)
  asl_rulebook2/extract/data/index-fixups.json (+288)
  asl_rulebook2/extract/data/known-missing-ruleids.json (+40)
  asl_rulebook2/extract/data/target-fixups.json (+400)
  asl_rulebook2/extract/index.py (+383)
  asl_rulebook2/pdf.py (+68)
  asl_rulebook2/tests/__init__.py (+3)
  asl_rulebook2/tests/test_extract.py (+116)
  asl_rulebook2/utils.py (+99)
  bin/dump_pdf.py (+11)
  bin/extract_pages.py (+6)
  conftest.py (+33)
  setup.py (+5)

--- /dev/null
+++ b/asl_rulebook2/extract/all.py
@@ -0,0 +1,147 @@

#!/usr/bin/env python3
""" Extract everything we need from the MMP eASLRB. """

import sys
import os
import json
import re
import importlib

import click

from asl_rulebook2.pdf import PdfDoc
from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr
from asl_rulebook2.extract.index import ExtractIndex
from asl_rulebook2.extract.content import ExtractContent

# ---------------------------------------------------------------------

class ExtractAll( ExtractBase ):
    """Extract everything from the eASLRB."""

    def __init__( self, args, log=None ):
        super().__init__( None, None, log )
        self._args = args

    def extract_all( self, pdf ):
        """Extract everything from the eASLRB."""

        # initialize
        default_args = {}
        for mod in ( "index", "content" ):
            mod = importlib.import_module( "asl_rulebook2.extract." + mod )
            default_args.update( getattr( mod, "_DEFAULT_ARGS" ) )

        # extract the index
        self._log_msg( "progress", "\nExtracting the index..." )
        args = ExtractBase.parse_args( self._args, default_args )
        self.extract_index = ExtractIndex( args, self._log )
        self.extract_index.extract_index( pdf )

        # extract the content
        self._log_msg( "progress", "\nExtracting the content..." )
        args = ExtractBase.parse_args( self._args, default_args )
        self.extract_content = ExtractContent( args, self._log )
        self.extract_content.extract_content( pdf )

        # verify the index targets
        self._check_targets()

    def _check_targets( self ):
        """Cross-check ruleid's and ruleref's in the index against targets in the main content."""

        # build an index of known targets
        targets = {}
        for ruleid, target in self.extract_content._targets.items():
            assert ruleid not in targets
            targets[ ruleid ] = target["caption"]

        # load the list of known missing targets
        known_strings, known_regexes = set(), set()
        fname = os.path.join( os.path.dirname(__file__), "data/known-missing-ruleids.json" )
        with open( fname, "r", encoding="utf-8" ) as fp:
            data = json.load( fp )
        for chapter in data["chapters"]:
            known_regexes.add( re.compile( "^{}[0-9.]+[A-Ea-e]?$".format( chapter ) ) )
        known_strings.update( data["strings"] )
        known_regexes.update(
            re.compile( regex ) for regex in data["regexes"]
        )

        def is_known_ruleid( ruleid ):
            ruleid = re.sub( r"-[A-Z]?\.?\d+$", "", ruleid ) # e.g. "A1.23-.45" -> "A1.23"
            if ruleid.endswith( " EX" ):
                ruleid = ruleid[:-3]
            if ruleid in targets:
                return True
            if ruleid in known_strings:
                return True
            if any( regex.search( ruleid ) for regex in known_regexes ):
                return True
            return False

        # check each index entry
        first = True
        for index_entry in self.extract_index._index_entries:
            errors = []
            # check the index entry's ruleid's
            for ruleid in index_entry.get( "ruleids", [] ):
                if not is_known_ruleid( ruleid ):
                    errors.append( "Unknown ruleid: {}".format( ruleid ) )
            # check the index entry's ruleref's
            for ruleref in index_entry.get( "rulerefs", [] ):
                if not ruleref["ruleids"]:
                    continue
                # check each ruleref
                if ", ".join( r for r in ruleref["ruleids"] ) in known_strings:
                    # NOTE: This is some free-form text that has been split up because it contains commas.
                    continue
                for ruleid in ruleref["ruleids"]:
                    if not is_known_ruleid( ruleid ):
                        errors.append( "Unknown ruleref target: {} => [{}]".format( ruleref["caption"], ruleid ) )
            # log any errors
            if errors:
                if first:
                    self._log_msg( "warning", "\n=== Unknown targets ===\n" )
                    first = False
                errors = [ "- {}".format( e ) for e in errors ]
                self._log_msg( "warning", "{}:\n{}",
                    index_entry["caption"], "\n".join( errors )
                )

# ---------------------------------------------------------------------

@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--save-index","save_index_fname", required=True, help="Where to save the extracted index." )
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname, save_footnotes_fname ):
    """Extract everything we need from the MMP eASLRB."""

    # extract everything
    def log_msg( msg_type, msg ):
        if msg_type == "progress" and not progress:
            return
        log_msg_stderr( msg_type, msg )
    extract = ExtractAll( args, log_msg )
    extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
    with PdfDoc( pdf_file ) as pdf:
        extract.extract_all( pdf )

    # save the results
    with open( save_index_fname, "w", encoding="utf-8" ) as index_out, \
         open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
         open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
        getattr( extract.extract_index, "save_as_"+format )( index_out )
        getattr( extract.extract_content, "save_as_"+format )( targets_out, footnotes_out )

if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter

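nb: The extraction can also be driven programmatically, without the CLI. A minimal sketch (the PDF
filename is hypothetical; an empty args collection means each module's _DEFAULT_ARGS are used unchanged):

    from asl_rulebook2.pdf import PdfDoc
    from asl_rulebook2.extract.all import ExtractAll

    extract = ExtractAll( (), log=None ) # nb: no "--arg" overrides, no log output
    with PdfDoc( "eASLRB.pdf" ) as pdf: # hypothetical filename
        extract.extract_all( pdf )
    print( len( extract.extract_index._index_entries ), "index entries" )
    print( len( extract.extract_content._targets ), "targets" )
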
--- /dev/null
+++ b/asl_rulebook2/extract/base.py
@@ -0,0 +1,59 @@

""" Base class for the extraction tools. """

import sys

import click

# ---------------------------------------------------------------------

class ExtractBase:
    """Base functionality shared by the extraction tools."""

    def __init__( self, args, default_args, log ):
        self._args = args
        if default_args:
            for key in default_args:
                if key not in self._args:
                    self._args[ key ] = default_args[ key ]
        self._log = log

    @staticmethod
    def parse_args( args, default_args ):
        """Helper method to parse command-line arguments."""
        args2 = {}
        for arg in args:
            pos = arg.find( "=" )
            if pos < 0:
                raise RuntimeError( "Invalid configuration parameter: {}".format( arg ) )
            key, val = arg[:pos], arg[pos+1:]
            if key not in default_args:
                raise RuntimeError( "Unknown configuration parameter: {}".format( key ) )
            args2[ key ] = int(val) if val.isdigit() else val
        return args2

    def _in_viewport( self, elem, vp_type ):
        """Check if an element is in the viewport."""
        if elem.x0 <= self._args[vp_type+"_vp_left"] or elem.x1 >= self._args[vp_type+"_vp_right"]:
            return False
        if elem.y0 <= self._args[vp_type+"_vp_bottom"] or elem.y1 >= self._args[vp_type+"_vp_top"]:
            return False
        return True

    @staticmethod
    def _is_bold( elem ):
        """Check if an element is using a bold font."""
        return elem.fontname.endswith( ( "-Bold", ",Bold", "-BoldMT" ) )

    def _log_msg( self, msg_type, msg, *args, **kwargs ):
        """Log a message."""
        if not self._log:
            return
        msg = msg.format( *args, **kwargs )
        self._log( msg_type, msg )

# ---------------------------------------------------------------------

def log_msg_stderr( msg_type, msg ):
    """Log a message to stderr."""
    if msg_type == "warning":
        msg = click.style( "WARNING: {}".format( msg ), fg="yellow" )
    click.echo( msg, file=sys.stderr )

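nb: parse_args() validates each "key=val" override against the module's defaults, and converts values
that look like integers. For example (a sketch):

    defaults = { "pages": "10-41", "index_vp_top": 715 }
    args = ExtractBase.parse_args( [ "pages=10-41", "index_vp_top=700" ], defaults )
    assert args == { "pages": "10-41", "index_vp_top": 700 } # nb: "700" was converted to an int
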
--- /dev/null
+++ b/asl_rulebook2/extract/content.py
@@ -0,0 +1,524 @@

#!/usr/bin/env python3
""" Extract content from the MMP eASLRB. """

import os
import json
import re
import math
from collections import defaultdict

import click
from pdfminer.layout import LTChar

from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr
from asl_rulebook2.pdf import PdfDoc, PageIterator, PageElemIterator
from asl_rulebook2.utils import parse_page_numbers, fixup_text, append_text, remove_trailing, jsonval

# NOTE: Characters are laid out individually on the page, and we generally want to process them
# top-to-bottom, left-to-right, but in some cases the alignment is messed up (e.g. the bounding boxes
# don't line up properly, and the first part of a sentence sits infinitesimally lower than the rest,
# and so appears later in the sort order), and we get better results if we process characters in the
# order in which they appear in the PDF document.
_DISABLE_SORT_ITEMS = [
    "B40", # nb: to detect B31.1 NARROW STREET
    "A58", "A59", "A60", # Chapter A footnotes (nb: page A61 is a mess wrt element order :-/)
    "B45", "B46", # Chapter B footnotes
    "C25", "C26", # Chapter C footnotes
    "D27", # Chapter D footnotes
    "E28", "E29", "E30", # Chapter E footnotes
    "F20", "F21", # Chapter F footnotes
    "G48", "G49", "G50", # Chapter G footnotes
]

_DEFAULT_ARGS = {
    "chapter-a": "42-102", "chapter-b": "109-154", "chapter-c": "158-183", "chapter-d": "187-213",
    "chapter-e": "216-245", "chapter-f": "247-267", "chapter-g": "270-319",
    "chapter-j": "593",
    "chapter-w": "647-664",
    "content_vp_left": 0, "content_vp_right": 565, "content_vp_top": 715, "content_vp_bottom": 28, # viewport
    "disable-sort-items": ",".join( _DISABLE_SORT_ITEMS )
}

# ---------------------------------------------------------------------

class ExtractContent( ExtractBase ):
    """Extract content from the MMP eASLRB."""

    def __init__( self, args, log=None ):
        super().__init__( args, _DEFAULT_ARGS, log )
        self._targets = {}
        self._footnotes = {}
        # prepare to fixup problems in the content
        fname2 = os.path.join( os.path.dirname(__file__), "data/target-fixups.json" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            self._target_fixups = json.load( fp )
        fname2 = os.path.join( os.path.dirname(__file__), "data/footnote-fixups.json" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            self._footnote_fixups = json.load( fp )

    def extract_content( self, pdf ):
        """Extract content from the MMP eASLRB."""

        # figure out which pages to process
        chapter_pages = {} # maps chapters to page numbers
        page_index = {} # maps page numbers to chapters
        for key, val in _DEFAULT_ARGS.items():
            if key.startswith( "chapter-" ):
                page_nos = parse_page_numbers( val )
                assert len(key) == 9
                chapter = key[8].upper()
                chapter_pages[ chapter ] = page_nos
                for page_no in page_nos:
                    page_index[ page_no ] = chapter
        disable_sort_items = set( self._args["disable-sort-items"].split( "," ) )

        # initialize
        self._curr_chapter = None
        curr_chapter_pageno = None
        self._curr_footnote = None

        # NOTE: The parsing code works in two modes.
        # - We start off extracting content, and detect the start of a new rule by bold text near the start of the line.
        # - When we see the footnotes header (e.g. "CHAPTER A FOOTNOTES"), we switch into footnotes mode, and detect
        #   the start of a footnote by a bold number near the start of the line.

        # process each page
        for page_no, page, lt_page in PageIterator( pdf ):

            # prepare to process the next page
            if page_no > max( page_index.keys() ):
                break
            if page_no not in page_index:
                self._log_msg( "progress", "- Skipping page {}.", page_no )
                continue
            if not self._curr_chapter or self._curr_chapter != page_index[page_no]:
                # we've found the start of a new chapter
                self._save_footnote() # nb: save the last footnote of the previous chapter
                self._curr_chapter = page_index[ page_no ]
                curr_chapter_pageno = 1
            else:
                curr_chapter_pageno += 1
            self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page#
                self._curr_chapter, curr_chapter_pageno
            )
            self._log_msg( "progress", "- Processing page {} ({})...", page_no, self._curr_pageid )

            # process each element on the page
            curr_caption = None
            self._top_left_elem = self._prev_elem = None
            elem_filter = lambda e: isinstance( e, LTChar )
            sort_elems = self._curr_pageid not in disable_sort_items
            for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):

                # keep track of the top-left-most bold element
                if self._is_bold( elem ):
                    if self._top_left_elem is None \
                       or ( elem.x0 < self._top_left_elem.x0 and elem.y1 > self._top_left_elem.y1 ):
                        self._top_left_elem = elem

                # check if we should ignore this element
                if not self._in_viewport( elem, "content" ):
                    continue

                # check if we're currently extracting footnotes
                if self._curr_footnote is not None:
                    self._on_footnote_elem( elem, lt_page )
                    self._prev_elem = elem
                    continue

                # figure out what we've got
                is_bold = self._is_bold( elem )
                if is_bold and curr_caption and curr_caption[0].isdigit() and 1 < elem.y1 - self._prev_elem.y0 < elem.height/2:
                    # the previous bold character looks like a footnote superscript - ignore it
                    curr_caption = None
                if curr_caption and elem.get_text() == " ":
                    # FUDGE! Some captions are in a bold font, but the spaces are not :-/
                    is_bold = True
                if is_bold:
                    if curr_caption:
                        # NOTE: We stop collecting bold characters at the end of the line, even if they continue on
                        # to the next line. This is to handle the case of a major heading (e.g. "1. PERSONNEL COUNTERS")
                        # being followed by a lesser heading ("1.1"). However, we want to handle captions that span
                        # multiple lines, so we check the vertical distance between the lines to see if it looks like
                        # two separate headings, or a single caption that has spread over multiple lines.
                        if self._prev_elem.y0 - elem.y1 > 0.25*elem.height:
                            # we've found the start of a new rule - save the old one, start collecting the new caption
                            self._save_target( curr_caption, page_no, lt_page, elem )
                            curr_caption = [ elem.get_text(), ( elem.x0, elem.y1 ) ]
                        else:
                            # continue collecting the caption
                            if self._prev_elem.y0 - elem.y0 > 1:
                                # nb: we just started a new line
                                curr_caption[0] = append_text( curr_caption[0], elem.get_text() )
                            else:
                                curr_caption[0] += elem.get_text()
                    else:
                        # check if this is the first character of the line
                        if self._is_start_of_line( elem, lt_page ):
                            # yup - start collecting the caption
                            curr_caption = [ elem.get_text(), ( elem.x0, elem.y1 ) ]
                else:
                    # check if we're currently collecting a caption
                    if curr_caption:
                        # yup - we've just found the end of it, save it
                        self._save_target( curr_caption, page_no, lt_page, elem )
                        curr_caption = None

                # loop back to process the next element
                self._prev_elem = elem

        # add the last caption/footnote (if they haven't already been done)
        self._save_footnote()
        if curr_caption:
            self._save_target( curr_caption, page_no, None, None )

        # check for unused fixups
        if self._target_fixups:
            self._log_msg( "warning", "Unused target fixups: {}", self._target_fixups )
        if self._footnote_fixups:
            self._log_msg( "warning", "Unused footnote fixups: {}", self._footnote_fixups )

    def _save_target( self, caption, page_no, lt_page, elem ):
        """Save a parsed target."""

        # initialize
        orig_caption = caption[0]
        caption_text = re.sub( r"\s+", " ", caption[0] ).strip()
        if len(caption_text) <= 1:
            # NOTE: We're finding text that is part of an image (e.g. the "E" for an Elite MMC),
            # perhaps because the pages were OCR'ed, so we ignore these.
            return

        # check if we've found the start of the chapter's footnotes
        if "FOOTNOTES" in caption_text:
            # yup - notify the main loop
            self._curr_footnote = []
            if elem:
                self._on_footnote_elem( elem, lt_page )
            return

        # check if the entry needs to be fixed up
        fixup = self._target_fixups.get( self._curr_pageid, {} ).get( caption_text )
        if fixup:
            # yup - make it so
            fixup[ "instances" ] = fixup.get( "instances", 1 ) - 1
            if fixup["instances"] <= 0:
                self._target_fixups[ self._curr_pageid ].pop( caption_text )
                if not self._target_fixups[ self._curr_pageid ]:
                    del self._target_fixups[ self._curr_pageid ]
            ruleid = fixup.get( "new_ruleid" )
            if not ruleid:
                return
            caption_text = fixup.get( "new_caption" )
        else:
            # nope - use what was parsed
            # FUDGE! There are a lot of layout problems with things like "12.CONCEALMENT" (i.e. missing space),
            # and it's tricky to detect these and not get tripped up by things like "12.C blah", so we handle it
            # as a separate case.
            mo = re.search( r"^(\d+\.\d*)([^ 0-9].+)", caption_text )
            if mo:
                ruleid, caption_text = mo.group(1), mo.group(2).strip()
            else:
                # check if the caption text starts with something that looks like a ruleid
                # NOTE: A leading "*" indicates an optional rule.
                mo = re.search( r"^\*?([A-Z]\.?)?[1-9][0-9.-]*[A-F]?", caption_text )
                if not mo:
                    return
                ruleid, caption_text = mo.group(), caption_text[mo.end():].strip()
                if ruleid.startswith( "*" ):
                    ruleid = ruleid[1:]
            ruleid = remove_trailing( ruleid, "." )
            caption_text = remove_trailing( caption_text, ":" )

        # save the new target
        if not ruleid.startswith( self._curr_chapter ):
            ruleid = self._curr_chapter + ruleid
        if ruleid in self._targets:
            self._log_msg( "warning", "Ignoring duplicate ruleid: {} (from \"{}\").",
                ruleid, caption[0]
            )
            return
        if caption_text == "\u2014":
            caption_text = "-" # nb: for A7.306 :-/
        self._targets[ ruleid ] = {
            "caption": fixup_text( caption_text ), "page_no": page_no, "pos": caption[1],
            "raw_caption": orig_caption
        }

    def _on_footnote_elem( self, elem, lt_page ):
        """Process an element while we're parsing footnotes."""
        # check if we've found the start of a new footnote
        if self._is_bold( elem ):
            if elem.get_text().isdigit() and self._is_start_of_line( elem, lt_page ):
                # yup - save the current footnote, start collecting the new one
                self._save_footnote()
                self._curr_footnote = [ elem.get_text(), "" ]
            else:
                if self._curr_footnote[1]:
                    # FUDGE! Some footnote content has bold text hard-up at the left margin,
                    # so we collect that as normal content.
                    self._curr_footnote[1] += elem.get_text()
                else:
                    # we're still collecting the footnote's ID
                    # NOTE: Older chapters have only the footnote ID in bold text, while newer chapters have
                    # both the ID and caption in bold. We figure out what's going on later, in _save_footnote().
                    self._curr_footnote[0] += elem.get_text()
        else:
            # nope - we're still collecting the footnote's content
            if not self._prev_elem or elem.x0 < self._prev_elem.x0 or elem.y0 - self._prev_elem.y0 > lt_page.height/2:
                # nb: we just started a new line
                self._curr_footnote[1] = append_text( self._curr_footnote[1], elem.get_text() )
            else:
                self._curr_footnote[1] += elem.get_text()

    def _save_footnote( self ):
        """Save a parsed footnote."""
        if not self._curr_footnote:
            return

        # initialize
        if self._curr_chapter not in self._footnotes:
            # start saving footnotes for the chapter
            self._footnotes[ self._curr_chapter ] = []
        orig_content = self._curr_footnote[1]

        # separate the footnote ID, referenced rule, and content
        if self._curr_chapter in ( "F", "G", "W" ):
            # NOTE: Chapter F/G/W footnote captions are also bold.
            mo = re.search( r"^\d{1,2}\.", self._curr_footnote[0] )
            if mo:
                parts = mo.group(), self._curr_footnote[0][mo.end():]
                self._curr_footnote[0] = parts[0]
                self._curr_footnote[1] = parts[1].strip() + " " + self._curr_footnote[1].strip()
            else:
                self._log_msg( "warning", "Couldn't split footnote caption: {}", self._curr_footnote[0] )
        footnote_id = remove_trailing( self._curr_footnote[0].strip(), "." )
        content = self._curr_footnote[1].strip()
        mo = re.search( r"^(F\.1B|W\.\d+[AB]|[A-Z]?[0-9.]+)", content )
        if mo:
            ruleid, content = mo.group(), content[mo.end():]
            if not ruleid.startswith( self._curr_chapter ):
                ruleid = self._curr_chapter + ruleid
            ruleid = remove_trailing( ruleid, "." )
        else:
            ruleid = None
        if self._curr_chapter == "C":
            # FUDGE! The "29." for Chapter C's footnote #29 is misaligned, and is extracted as two separate
            # footnotes "2" and "9". There isn't really any way to fix this via the normal data-driven mechanism,
            # so we do it in the code here :-/
            footnote_ids = [ f["footnote_id"] for f in self._footnotes[ self._curr_chapter ] ]
            if footnote_id == "2" and "2" in footnote_ids:
                return
            if footnote_id == "9" and "9" in footnote_ids:
                footnote_id = "29"

        # clean up the content
        content = re.sub( r"\s+", " ", content ).strip()
        content = fixup_text( content )
        mo = re.search( r"^[A-Z ]+:\S", content )
        if mo:
            # nb: insert a space after the caption's colon
            content = content[:mo.end()-1] + " " + content[mo.end()-1:]

        # check for any fixups
        captions = []
        fixups = self._footnote_fixups.get( self._curr_chapter, {} ).get( footnote_id )
        if fixups:
            if isinstance( fixups, list ):
                # NOTE: A simple search-and-replace is, by far, the most common fixup, so we provide
                # a simplified way of specifying these in the fixup file.
                fixups = { "replace": [ ( sr[0], sr[1] ) for sr in fixups ] }
            errors = defaultdict( list )
            # do any search-replace's
            if "replace" in fixups:
                for sr in fixups["replace"]:
                    prev_content = content
                    content = content.replace( sr[0], sr[1] )
                    if content == prev_content:
                        self._log_msg( "warning", "Footnote fixup for \"{}:{}\" had no effect: {}",
                            self._curr_chapter, footnote_id, sr[0]
                        )
                        errors["replace"].append( sr )
                del fixups["replace"]
            # replace the captions
            if "captions" in fixups:
                captions = fixups.pop( "captions" )
            # check that all fixups were successfully applied
            if fixups:
                errors.update( fixups ) # nb: anything left over is an unrecognized fixup type
            if errors:
                self._footnote_fixups[ self._curr_chapter ][ footnote_id ] = errors
            else:
                del self._footnote_fixups[ self._curr_chapter ][ footnote_id ]
                if not self._footnote_fixups[ self._curr_chapter ]:
                    del self._footnote_fixups[ self._curr_chapter ]
        content = content.strip()

        # extract the footnote's caption
        if not captions:
            pos = content.find( ":" )
            if pos >= 0:
                captions.append( ( ruleid, content[:pos] ) )
                content = content[pos+1:].strip()
            else:
                self._log_msg( "warning", "Can't extract footnote caption: {}:{} - {}",
                    self._curr_chapter, footnote_id, content
                )

        # check for the credits at the end of the Chapter F footnotes
        pos = content.find( "WEST OF ALAMEIN CREDITS" )
        if pos > 0:
            content = content[:pos]

        # save the footnote
        self._footnotes[ self._curr_chapter ].append( {
            "footnote_id": footnote_id,
            "captions": captions,
            "content": content,
            "raw_content": orig_content
        } )
        self._curr_footnote = None

    def _is_start_of_line( self, elem, lt_page ):
        """Check if the element is at the start of its line."""
        # NOTE: We can't just check the element's x co-ordinate, since there is sometimes a floating image
        # that pushes the text right (e.g. A.12).
        if self._prev_elem is None:
            return True
        if elem.y0 < self._prev_elem.y0:
            return True
        if self._prev_elem.x0 < lt_page.width/2 and elem.x0 > lt_page.width/2:
            return True # nb: the element is at the top of the right column
        return False

    def save_as_raw( self, targets_out, footnotes_out ):
        """Save the raw results."""
        self._save_as_raw_or_text( targets_out, footnotes_out, True )

    def save_as_text( self, targets_out, footnotes_out ):
        """Save the results as plain-text."""
        self._save_as_raw_or_text( targets_out, footnotes_out, False )

    def _save_as_raw_or_text( self, targets_out, footnotes_out, raw ):
        """Save the results as raw or plain-text."""

        # save the targets
        curr_page_no = None
        for ruleid, target in self._targets.items():
            if target["page_no"] != curr_page_no:
                if curr_page_no:
                    print( file=targets_out )
                print( "=== p{} ===".format( target["page_no"] ), file=targets_out )
                curr_page_no = target["page_no"]
            xpos, ypos = self._get_target_pos( target )
            if raw:
                print( "[{},{}] = {}".format(
                    xpos, ypos, target["raw_caption"]
                ), file=targets_out )
            else:
                print( "{} => {} @ p{}:[{},{}]".format(
                    ruleid, target["caption"], target["page_no"], xpos, ypos
                ), file=targets_out )

        # save the footnotes
        def make_caption( caption ):
            buf = []
            if caption[1]:
                buf.append( caption[1] )
                if caption[0]:
                    buf.append( "[{}]".format( caption[0] ) )
            elif caption[0]:
                buf.append( caption[0] )
            return " ".join( buf )
        for chapter, footnotes in self._footnotes.items():
            if chapter != "A":
                print( file=footnotes_out )
            print( "=== CHAPTER {} FOOTNOTES {}".format( chapter, 80*"=" )[:80], file=footnotes_out )
            for footnote in footnotes:
                print( file=footnotes_out )
                print( "--- Footnote {} ---".format( footnote["footnote_id"] ), file=footnotes_out )
                if raw:
                    print( footnote["raw_content"], file=footnotes_out )
                else:
                    print( " ; ".join( make_caption(c) for c in footnote["captions"] ), file=footnotes_out )
                    print( footnote["content"], file=footnotes_out )

    def save_as_json( self, targets_out, footnotes_out ):
        """Save the results as JSON."""

        # save the targets
        targets, curr_chapter = [], None
        for ruleid, target in self._targets.items():
            xpos, ypos = self._get_target_pos( target )
            targets.append( "{}: {{ \"caption\": {}, \"page_no\": {}, \"pos\": [{},{}] }}".format(
                jsonval( ruleid ),
                jsonval( target["caption"] ), target["page_no"], xpos, ypos
            ) )
            if ruleid[0] != curr_chapter:
                targets[-1] = "\n" + targets[-1]
                curr_chapter = ruleid[0]
        print( "{{\n{}\n\n}}".format(
            ",\n".join( targets )
        ), file=targets_out )

        # save the footnotes
        def make_caption( caption ):
            return "{{ \"caption\": {}, \"ruleid\": {} }}".format(
                jsonval( caption[1] ), jsonval( caption[0] )
            )
        chapters = []
        for chapter in self._footnotes:
            footnotes = []
            for footnote in self._footnotes[ chapter ]:
                footnotes.append( "{}: {{\n \"captions\": {},\n \"content\": {}\n}}".format(
                    jsonval( footnote["footnote_id"] ),
                    "[ {} ]".format( ", ".join( make_caption(c) for c in footnote["captions"] ) ),
                    jsonval( footnote["content"] )
                ) )
            chapters.append( "{}: {{\n\n{}\n\n}}".format(
                jsonval( chapter ),
                ",\n".join( footnotes )
            ) )
        print( "{{\n\n{}\n\n}}".format(
            ",\n\n".join( chapters )
        ), file=footnotes_out )

    @staticmethod
    def _get_target_pos( target ):
        """Return a target's X/Y position on the page."""
        xpos = math.floor( target["pos"][0] )
        ypos = math.ceil( target["pos"][1] )
        return xpos, ypos

# ---------------------------------------------------------------------

@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, format, save_targets_fname, save_footnotes_fname ):
    """Extract content from the MMP eASLRB."""

    # initialize
    args = ExtractBase.parse_args( args, _DEFAULT_ARGS )

    # extract the content
    def log_msg( msg_type, msg ):
        if msg_type == "progress" and not progress:
            return
        log_msg_stderr( msg_type, msg )
    extract = ExtractContent( args, log_msg )
    extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
    with PdfDoc( pdf_file ) as pdf:
        extract.extract_content( pdf )

    # save the results
    with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
         open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
        getattr( extract, "save_as_"+format )( targets_out, footnotes_out )

if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter

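nb: For reference, save_as_json() writes the targets file as a single JSON object keyed by ruleid.
The page number and position below are illustrative values, not real extraction output:

    {
        "A9.223": { "caption": "CANCELLATION", "page_no": 21, "pos": [26,715] },
        ...
    }
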
--- /dev/null
+++ b/asl_rulebook2/extract/data/footnote-fixups.json
@@ -0,0 +1,209 @@
{
"A": {
"10A": [
[ "OneHalfFP", "One-Half FP" ],
[ "firstappearedintheASLAnnual'89.(In1998,bothwerereprintedin Classic ASL.)", "first appeared in the ASL Annual '89. (In 1998, both were reprinted in Classic ASL.)" ],
[ "One of the several criticisms", "<p> One of the several criticisms" ]
],
"12": [ [ "TEMto", "TEM to" ] ],
"14": [
[ "bipodmounted", "bipod-mounted" ],
[ "volume o f fire", "volume of fire" ]
],
"17": [ [ "adistinct", "a distinct" ] ],
"19" : [ [ "wellsited", "well-sited" ] ],
"32": [ [ "HWunits", "HW units" ] ],
"33": [ [ "multiLocation", "multi-Location" ] ],
"35": [ [ "The original printing", "<p> The original printing" ] ],
"37": [
[ "- Winter War (vs Soviet Union) 30 November 1939 - 13 March 1940- Continuation War (vs Soviet Union) 25 June 1941 - 4 September 1944- Lapland War (vs Germany) 15 September 1944 - 27 April 1945", " <ul> <li> <b>Winter War</b> (vs Soviet Union) 30 November 1939 - 13 March 1940 <li> <b>Continuation War</b> (vs Soviet Union) 25 June 1941 - 4 September 1944 <li> <b>Lapland War</b> (vs Germany) 15 Se ptember 1944 - 27 April 1945 </ul>" ]
],
"38": [
[ "Romania: Romania,", " <p> <b>Romania</b>: Romania," ],
[ "Hungary: A traditional", " <p> <b>Hungary</b>: A traditional" ],
[ "Slovakia: Urged on", "<p> <b>Slovakia</b>: Urged on" ],
[ "German-Croatian units in Russia:", " <p> <b>German-Croatian units in Russia</b>:" ],
[ "Italian-Croatian units in Russia:", " <p> <b>Italian-Croatian units in Russia</b>:" ],
[ "Croatian units in Yugoslavia:", " <p> <b>Croatian units in Yugoslavia</b>:" ],
[ "CroatianArmyunitswereengagedprimarilyinanti-partisanactivities,fightingmostly", "Croatian Army units were engaged primarily in anti-partisan activities, fighting mostly" ],
[ "Bulgaria: Bulgaria", "<p> <b>Bulgaria</b>: Bulgaria" ],
[ "WhiletheriflecompanydidnothaveaninherentHeavyWeapons(HW)platoon,it", "While the rifle company did not have an inherent Heavy Weapons (HW) platoon, it"]
],
"39": [ [ "generallyapply", "generally apply" ] ],
"41": [ [ "ViceAdmiral", "Vice-Admiral" ] ],
"43": [
[ "ALLIEDMINORS", "ALLIED MINORS" ],
[ "BARrather", "BAR rather" ]
]
},
"B": {
"3B": [ [ "LOWERLEVELLOCATIONS", "LOWER LEVEL LOCATIONS" ] ],
"6": [ [ "The Village Terrain rules", "<p> The Village Terrain rules" ] ],
"13": [ [ "U6U7 U8 U9W6W7W8W9V6 V7 V8Y6 Y7Y8Y9 X6X7X8", "" ] ]
},
"C": {
"1": [ [ "ac tually", "actually" ] ],
"8": [ [ "rep resents", "represents" ] ],
"13": [ [ "0o", "0&deg;" ] ],
"20": [ [ "predetermined", "pre-determined" ] ],
"21": [ [ "p roneto", "prone to" ] ],
"26": [
[ "Mortarsof76-107mm", "Mortars of 76-107mm" ],
[ "areexempted", "are exempted" ],
[ "frommanyof", "from many of" ],
[ "normalGun", "normal Gun" ],
[ "thantowed", "than towed" ]
],
"32": [ [ "PF counters were removed", "<p> PF counters were removed" ] ],
"38": [ [ "U. S. Army", "U.S. Army" ] ]
},
"D": {
"8": [ [ "massproduced", "mass-produced" ] ]
},
"E": {
"3": [
[ "flexibilityis", "flexibility is" ],
[ "for gottenby", "forgotten by" ]
],
"4": [ [ "Th e most", "The most" ] ],
"11": [
[ "Another problem", "<p> Another problem" ],
[ "A Fire cast", "<p> A Fire cast" ],
[ "Finally, to add", "<p> Finally, to add" ]
],
"14B": [ [ "infantrypulled", "infantry-pulled" ] ],
"15": [ [ "shallowdraught", "shallow draught" ] ],
"18": [
[ "reallife", "real-life" ],
[ "the g eneric", "the generic" ]
],
"20": {
"captions": [ [ "E7.51", "LIGHT AA" ], [ "E7.52", "HEAVY AA" ] ],
"replace": [
[ "& 7.52 AA FIRE:", "" ]
]
},
"24": {
"captions": [ [ "E9.2", "DRIFT" ], [ "E9.4", "LANDING" ] ],
"replace": [
[ "DRIFT & 9.4 LANDING:", "" ]
]
}
},
"F": {
"12": [ [ "non- entrenched", "non-entrenched" ] ],
"19": [
[ "Inthewinternight,thenear-freezingtemperaturecauseddewtoform.", "In the winter night, the near-freezing temperature caused dew to form. " ],
[ "Thenextmorningathickmistoftenformedasthesun evaporateditagain.", "The next morning a thick mist often formed as the sun evaporated it again. " ],
[ "Thiscouldhappeneveninthesummertimeundertheproperenvironmentalconditions,", "This could happen even in the summertime under the proper environmental conditions, " ],
[ "butsincethiswasamuchlessfrequentoccurrenceithasbeen ignored.", "but since this was a much less frequent occurrence it has been ignored." ]
],
"21": [
[ "Playerswillprobablyfinditmoreconvenienttoinstead", "Players will probably find it more convenient to instead" ],
[ "addathird,different-coloreddietothisTH/IFTDR,", "add a third, different-colored die to this TH/IFT DR, " ],
[ "usingittodeterminetheDust DRM.", "using it to determine the Dust DRM." ],
[ "Thefamiliarterm\"subsequentdr\"wasusedintherulebecauseitobviates theneed", "The familiar term \"subsequent dr\" was used in the rule because it obviates the need" ],
[ "a\"new\"concept", "a \"new\" concept" ],
[ "thatof rolling athird diesimultaneously", "that of rolling a third die simultaneously" ]
],
"22": [
[ "theDustcounter\"follows\"thevehicleasit movesfromhex to hex", "the Dust counter \"follows\" the vehicle as it moves from hex to hex" ],
[ "itexpends", "it expends " ],
[ "two MPeach timeitdoesso", " two MP each time it does so" ]
],
"23": [
[ "Anotherwind-relatedaspectoftheNorthAfricanenvironmentisthedesertsandstorm,", "Another wind-related aspect of the North African environment is the desert sandstorm, " ],
[ "orkhamsininArabic.", "or khamsin in Arabic. " ],
[ "ChapterFincludesnospecial rulesforitbecause,", "Chapter F includes no special rules for it because, " ],
[ "withvisibilitycutbythestormtoaslittleasthreeyards,", "with visibility cut by the storm to as little as three yards, " ],
[ "allactivitiesgenerallywerereducedtoseekingcoverfromthesandblastingwindandchoking dust.", "all activities generally were reduced to seeking cover from the sandblasting wind and choking dust. " ],
[ "However,thegamedoesnotignorethepossibilityofakhamsin'soccurrence.", "However, the game does not ignore the possibility of a khamsin's occurrence. " ],
[ "The propercombinationofWeather,EC,WindandGustsinaDYOscenariocancreateits effects,", "The proper combination of Weather, EC, Wind and Gusts in a DYO scenario can create its effects, " ],
[ "andtheprobabilityofitsoccurrenceisgreatestinascenariosetinspringor summer", "and the probability of its occurrence is greatest in a scenario set in spring or summer" ],
[ "thetimewhen khamsinsoccurred mostfrequently.", "the time when khamsins occurred most frequently." ]
],
"24": [
[ "Thisoverlay isused in aHOLLOW LEGIONS scenario.", "This overlay is used in a HOLLOW LEGIONS scenario." ]
],
"25": [
[ "ThefamousNorthAfricanescarpmentsaresimilarto cliffs,", "The famous North African escarpments are similar to cliffs, " ],
[ "butwithlesssteep(andveryeroded)slopes.", "but with less steep (and very eroded) slopes. " ],
[ "Somearesixhundredfeethigh", "Some are six hundred feet high" ],
[ "thoughgenerallytheirheightsrangefromonehundredtotwohundredfeet.", "though generally their heights range from one hundred to two hundred feet. " ],
[ "Theirsignificanceinthedesertwarlaymainlyinthattheywerecommandingheights,", "Their significance in the desert war lay mainly in that they were commanding heights, " ] ,
[ "defensivepositionsforinfantry,", "defensive positions for infantry, " ],
[ "andgreatlyrestrictedvehicularmovementacrossthem", "and greatly restricted vehicular movement across them" ],
[ "Hencetheywereoftenthesceneofheavyfighting,", "Hence they were often the scene of heavy fighting, " ],
[ "especiallywherecrossedbya road", "especially where crossed by a road" ]
]
},
"G": {
"4": [ [ "It's also interesting", "<p> It's also interesting" ] ],
"8": [ [ "miniDC,", "mini-DC," ] ],
"33": [ [ "closein", "close-in" ] ],
"45": [
[ "Guomindang(akaKuomintang", "Guomindang (aka Kuomintang" ],
[ "XForce", "X-Force" ],
[ "The two-tone color", "<p> The two-tone color" ]
],
"47" : [ [ "against-allodds", "against-all-odds" ] ],
"48": [ [ "trained-andequipped", "trained-and-equipped" ] ]
},
"W": {
"2": [
[ "Korean National Defense Constabulary:", "<ul> <li> <em>Korean National Defense Constabulary</em>: " ],
[ "ROK Army:", "<li> <em>ROK Army</em>: " ],
[ "Korean Marine Corps:", "<li> <em>Korean Marine Corps</em>: " ],
[ "United States - Army:", "<li> <em>United States</em> <ul> <li> Army:" ],
[ "- Army Airborne:", "<li> Army Airborne:" ],
[ "- Army Rangers:", "<li> Army Rangers:" ],
[ "- KATUSA:", "<li> KATUSA:" ],
[ "- Marine Corps:", "<li> Marine Corps:" ],
[ "British Commonwealth:", "</ul> <li> <em>British Commonwealth</em>: " ],
[ "- 41 Independent Commando, Royal Marines: 9/50-12/51", "<ul> <li> 41 Independent Commando, Royal Marines: 9/50-12/51 </ul>" ],
[ "Other United Nations Command:", "<li> <em>Other United Nations Command</em>: " ],
[ "10/50-7/53", "10/50-7/53 </ul>" ]
],
"3": [
[ "Korean People's Army:", "<ul> <li> <em>Korean People's Army</em>: " ],
[ "Communist Guerillas:", "<li> <em>Communist Guerillas</em>: " ],
[ "Chinese People's Volunteer Army:10/50-7/53", "<li> <em>Chinese People's Volunteer Army</em>: 10/50-7/53 </ul>" ]
],
"9": [ [ "T34/85", "T-34/85" ] ],
"16": [
[ "3 1/3 PP", "3&frac13; PP" ],
[ "24-8 HS", "2-4-8 HS" ]
],
"18": [ [ "The first unit", "<p> The first unit" ] ],
"29" : [ [ "RAT KILLERin which", "RAT KILLER in which" ] ],
"30": [
[ "G.M.D in", "G.M.D. in" ],
[ "as. sumed", "assumed" ]
],
"49": [ [ "SUP-PORT", "SUPPORT" ] ],
"50": [ [ "Speciallytrained", "Specially-trained" ] ]
}
}

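nb: _save_footnote() (in content.py above) accepts two shapes for the entries in this file: a bare
list is shorthand for search-and-replace pairs, while a dict can also supply "captions" to override
the parsed ones. Both forms appear above, e.g.:

    "12": [ [ "TEMto", "TEM to" ] ]
    "20": { "captions": [ [ "E7.51", "LIGHT AA" ], [ "E7.52", "HEAVY AA" ] ],
            "replace": [ [ "& 7.52 AA FIRE:", "" ] ] }

Any fixup that has no effect is logged as a warning, and anything left unapplied is reported as
unused at the end of the run.
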
--- /dev/null
+++ b/asl_rulebook2/extract/data/index-fixups.json
@@ -0,0 +1,288 @@
{
"AirSupport": {
"new_title": "Air Support",
"old_content": "E7,[BRT:TCG6][ChineseDYO:G18.83][ENEMY: S8.9][cannotbeusedvsanyLocationinFog:E3.313][inRB,German AirSupportisalwaysaStukaM42:SSRRB9][JapaneseDYO: G1.6621][inKGP,NAifMistDensity>Light,Night,orOvercast:SSR KGP3][Napalm:G17.4][NightNA:E7.2][OvercastNA:E3.55][during SeaborneAssault/Evacuation:G14.34][SeaborneAssaultDYO: G14.262][TarawaNavalGunfire: TCG3.3]",
"new_content": "E7, [BRT: TCG6] [Chinese DYO: G18.83] [ENEMY: S8.9] [cannot be used vs any Location in Fog: E3.313] [in RB, German Air Support is always a Stuka M42: SSR RB9] [Japanese DYO: G1.6621] [in KGP, NA if Mist Density > Light, Night, or Overcast: SSR KGP3] [Napalm: G17.4] [Night NA: E7.2] [Overcast NA: E3.55] [during Seaborne Assault/Evacuation: G14.34] [Seaborne Assault DYO: G14.262] [Tarawa Naval Gunfire: TCG3.3]"
},
"Ambush": {
"old_content": "A11.4[attacksfirstinCC:A11.32][ATTACKERadds+1 drm to Ambush dr in jungle, kunai, or bamboo Location: G.6] [Banks: G8.212][keeping \"?\" during CC: A12.14] [Dummies are eliminated BEFORE the Ambush dr, and do not qualify for the -2 drm: ASOP 8.11B] [Hand-to-Hand CC: J2.31] [Night: E1.77] [Panic Action: S6.213] [Panjis: G9.21] [in Rubble: SSR RB8] [Street Fighting: A11.8] [T-H Heroes are created after Ambush determination: G1.421]",
"new_content": "A11.4 [attacks first in CC: A11.32] [ATTACKER adds +1 drm to Ambush dr in jungle, kunai, or bamboo Location: G.6] [Banks: G8.212] [keeping \"?\" during CC: A12.14] [Dummies are eliminated BEFORE the Ambush dr, and do not qualify for the -2 drm: ASOP 8.11B] [Hand-to-Hand CC: J2.31] [Night: E1.77] [Panic Action: S6.213] [Panjis: G9.21] [in Rubble: SSR RB8] [Street Fighting: A11.8] [T-H Heroes are created after Ambush determination: G1.421]"
},
"American": {
"old_content": "A25.3[EarlyArmy:G17.2][OBAAccuracy:C1.3] [Paramarine:G17.111][Raider:G17.111][RifleCompany:S18.5][U.S. Marine Corps: G17.1]",
"new_content": "A25.3 [Early Army: G17.2] [OBA Accuracy: C1.3] [Paramarine: G17.111] [Raider: G17.111] [Rifle Company: S18.5] [U.S. Marine Corps: G17.1]"
},
"AmmoPP Reduction": {
"new_title": "Ammo PP Reduction"
},
"APCR": {
"new_title": "APCR/APDS",
"old_content": "(Armor Piercing Composite Rigid)/",
"new_content": "(Armor Piercing Composite Rigid/Discarding Sabot): C8.1-.2 [EXC German 28LL, 40LL: C4.3, C7.32] [vs Guns: C11.52] [HE Equivalency: C8.31] [TH# Modification: C4.3] [To Kill Table: C7.32] [Residual FP NA: C8.31]"
},
"APDS": {
"old_content": "(Armor Piercing Discarding Sabot): C8.1-.2 [EXC German 28LL, 40LL: C4.3, C7.32] [vs Guns: C11.52] [HE Equivalency: C8.31] [TH# Modification: C4.3] [To Kill Table: C7.32] [Residual FP NA: C8.31]",
"new_content": null
},
"Broken Units": {
"replace": [
[ "[Pin NA: A7.8(EXCInterdiction and Huts)]", "[Pin NA (EXC Interdiction and Huts): A7.8]" ]
]
},
"Cellars": {
"replace": [
[ "RBCellars", "RB Cellars" ]
]
},
"DC": {
"replace": [
[ "[Thrown from: (Halftrack: D6.63) (Sidecar: D15.6)]", "[Thrown from Halftrack: D6.63] [Thrown from Sidecar: D15.6]" ]
]
},
"Direct Fire": {
"old_content": "(Any fireattackrequiringaLOSfromthe firerwhichdoesnotuseIndirectFire):C.1,C9.1[InterveningUnits:A6.6][LC: G12.61-.62, G12.671]",
"new_content": "(Any fire attack requiring a LOS from the firer which does not use Indirect Fire): C.1, C9.1 [Intervening Units: A6.6] [LC: G12.61-.62, G12.671]"
},
"Dogfight": {
"old_content": "(AerialCombat):E7.22",
"new_content": "(Aerial Combat): E7.22"
},
"Elite": {
"replace": [
[ "[German (Africa, 1942-43: F.6) (prior to 1944: A25.1) (SS: A25.11)]", "[German (Africa, 1942-43): F.6] [German (prior to 1944): A25.1] [German (SS): A25.11]" ]
]
},
"End of Scenario": {
"replace": [
[ "[in ABtF: R9.4 CG4]", "[in ABtF: R9.4, CG4]" ],
[ "[in KGP: P8.4 CG23]", "[in KGP: P8.4, CG23]" ],
[ "[in PB: Q9.4 CG19 (-1 drm for any Night Scenario and +1 drm for Day II scenario: Turn Record Track)]", "[in PB (-1 drm for any Night Scenario and +1 drm for Day II scenario: Turn Record Track): Q9.4, CG19]" ],
[ "[in RB: O11.4 CG4]", "[in RB: O11.4, CG4]" ]
]
},
"EX": {
"old_content": "ExampleEXC: Exception",
"new_content": "Example",
"_comment_": "The code manually inserts an entry for EXC: Exception"
},
"Fortification": {
"replace": [
[ "[in BRT: SSR1 (BRT Sand: T3.2) (NA in Betio Piers: T9.2)]", "[in BRT: SSR1 (BRT Sand): T3.2] [in BRT: SSR1 (NA in Betio Piers): T9.2]" ]
]
},
"Hazardous Movement": {
"replace": [
[ "[Clearance of: (Debris: O1.5) (Fire: B24.72) (Jungle Path: G2.7) (Roadblock: B24.76) (Rubble: B24.71)]", "[Clearance of Debris: O1.5] [Clearance of Fire: B24.72] [Clearance of Jungle Path: G2.7] [Clearance of Roadblock: B24.76] [Clearance of Rubble: B24.71]" ]
]
},
"Hedges": {
"replace": [
[ "[TEM NA for Ground Support: E7.4; for PRC: B9.3]", "[TEM NA for Ground Support: E7.4] [TEM NA for PRC: B9.3]" ]
]
},
"Immobilization": {
"replace": [
[ "[LC: G12.602; LC Passengers NA: G12.13]", "[LC: G12.602] [LC Passengers NA: G12.13]" ],
[ "[TC: D5.5; TC in BRT: SSR12]", "[TC: D5.5] [TC in BRT: SSR12]" ]
]
},
"Jungle": {
"replace": [
[ "G.2G.6", "G.2-G.6" ]
]
},
"Kunai": {
"replace": [
[ "G.2G.6", "G.2-G.6" ]
]
},
"Leadership": {
"replace": [
[ "[Battle Hardening: A15.3, Finns: A25.71, Japanese: G1.41]", "[Battle Hardening: A15.3] [Battle Hardening (Finns): A25.71] [Battle Hardening (Japanese): G1.41]" ]
]
},
"MG": {
"replace": [
[ "[Vehicular: (see Vehicular MG: D3.5-.54)]", "[Vehicular MG: D3.5-.54]" ],
[ "[Aerial: E7.41, vs AFV: C7.22]", "[Aerial: E7.41] [Aerial (vs AFV): C7.22]" ]
]
},
"Minefield": {
"replace": [
[ "[fully-tracked A FV T B: B 8.61]", "[fully-tracked AFV TB: B8.61]" ]
]
},
"Morale": {
"replace": [
[ "[Gain:", "Gain:" ],
[ "FFE]", "FFE" ]
]
},
"Movement, Vehicle": {
"replace": [
[ "(see Amphibians: D16)", "(see Amphibians)" ]
]
},
"OBA": {
"replace": [
[ "USOrdnance", "US Ordnance" ]
]
},
"Optional Rules": {
"replace": [
[ "A12.16 (see footnote A18)", "A12.16, footnote A18" ]
]
},
"PAATC": {
"old_content": "(Pre-AFVAdvance/AttackTaskCheck;NAtoberserk/Fanatic/Japanese/SMC): A11.6, G1.62 [vs Armored Cupola: O.7] [DC Placement: A23.3] [ENEMY Advance into CC/Melee: S11.4] [1PAATC: Chinese, NonElite Italians, Inexperienced, Allied/Axis Minors] [OVR vs \"?\": A12.41] [CC Reaction Fire: D7.21]",
"new_content": "(Pre-AFV Advance/Attack Task Check; NA to berserk/Fanatic/Japanese/SMC): A11.6, G1.62 [vs Armored Cupola: O.7] [DC Placement: A23.3] [ENEMY Advance into CC/Melee: S11.4] [1PAATC: Chinese, Non-Elite Italians, Inexperienced, Allied/Axis Minors] [OVR vs \"?\": A12.41] [CC Reaction Fire: D7.21]"
},
"PBF": {
"replace": [
[ "A11.l", "A11.1" ]
]
},
"Pillbox": {
"replace": [
[ "[Control: B30.91; in BRT: TCG15]", "[Control: B30.91] [Control (in BRT): TCG15]" ]
]
},
"Pin": {
"replace": [
[ "D6.23.24", "D6.23-.24" ],
[ "[Fire Lanes: A9.22; Cancellation: A9.223]", "[Fire Lanes: A9.22] [Fire Lanes (Cancellation): A9.223]" ]
]
},
"PRC": {
"replace": [
[ "[disembarking in Panji: G9.423; embarking: G9.51]", "[disembarking in Panji: G9.423] [embarking in Panji: G9.51]" ]
]
},
"RMG": {
"replace": [
[ "D1.81 (hull) & D1.82 (turret)", "Hull: D1.81; Turret: D1.82" ]
]
},
"Radio": {
"replace": [
[ "[in KGP: P8.4 CG15]", "[in KGP: P8.4, CG15]" ],
[ "[in RB: O11.4 CG6]", "[in RB: O11.4, CG6]" ]
]
},
"Range": {
"replace": [
[ "see Firing Within Hex", "A7.21" ]
]
},
"Roadblock": {
"replace": [
[ "[TEM NA for Ground Support: E7.4; for PRC: B9.3]", "[TEM NA for Ground Support: E7.4] [TEM NA for PRC: B9.3]" ]
]
},
"Rubble": {
"replace": [
[ "; Stone Blaze:", "] [RePh, Stone Blaze:" ]
]
},
"Scrounging": {
"replace": [
[ "RBCG7", "RB CG7" ]
]
},
"Stacking Limits": {
"replace": [
[ "[Inspecting: see Right of Inspection: (Before Play: A2.9) (During Play: A12.16) (Pillboxes: B30.7)]", "[Inspecting: see Right of Inspection (Before Play): A2.9] [Inspecting: see Right of Inspection (During Play): A12.16] [Inspecting: see Right of Inspection (Pillboxes): B30.7]" ]
]
},
"Stall": {
"old_content": "(Rules are givenin a ChapterH Vehicle Note ifa nationality's AFV are subject to Stall; for example, German Multi-Applicable Vehicle Note H): [Platoon Movement: D14.22]",
"new_content": "(Rules are given in a Chapter H Vehicle Note if a nationality's AFV are subject to Stall; for example, German Multi-Applicable Vehicle Note H): [Platoon Movement: D14.22]"
},
"Target Size": {
"replace": [
[ "[Vehicular: D1.7, Concealment: D1.76]", "[Vehicular: D1.7] [Vehicular (Concealment): D1.76]" ]
]
},
"Uncon irmed Kill": {
"new_title": "Unconfirmed Kill"
},
"Unarmored Vehicles": {
"replace": [
[ "[AFV (vs A-P mines: B28.42) (vs A-T mines: B28.52)]", "[AFV (vs A-P mines): B28.42] [AFV (vs A-T mines): B28.52]" ]
]
},
"Unit": {
"replace": [
[ "[but not horses],", "(but not horses)," ]
]
},
"Voluntary Break": {
"replace": [
[ "[Japanese: G1.13, SMC NA: G1.4]", "[Japanese: G1.13] [Japanese (SMC NA): G1.4]" ]
]
},
"Walls": {
"replace": [
[ "[Bypass LOS across: (Infantry: A4.34) (Vehicle: D2.37)]", "[Bypass LOS across Infantry: A4.34] [Bypass LOS across Vehicle: D2.37]" ],
[ "; for PRC", "] [TEM NA for PRC" ]
]
},
"Winter Camouflage": {
"replace": [
[ "OBA Observer: C 1.6", "OBA Observer: C1.6" ]
]
},
"Wreck Blaze": {
"replace": [
[ "[Creation: (AFV C7.6) (Unarmored: A7.308)]", "[Creation (AFV): C7.6] [Creation (Unarmored): A7.308]" ]
]
}
}

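nb: These entries are consumed by _make_index_entry() (in index.py below): "new_title" renames the
parsed title, "replace" applies search-and-replace pairs, and "old_content"/"new_content" swap the
entire content (skipped with a warning if the parsed content doesn't match "old_content"). A null
"new_content" drops the entry entirely - e.g. "APDS" above is removed after being merged into the
"APCR/APDS" entry. For example:

    # parsed:      title = "Uncon irmed Kill"
    # after fixup: title = "Unconfirmed Kill", content unchanged
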
--- /dev/null
+++ b/asl_rulebook2/extract/data/known-missing-ruleids.json
@@ -0,0 +1,40 @@
{
"chapters": [ "H", "O", "P", "Q", "R", "S", "T" ],
"strings": [
"see appropriate Vehicle Notes",
"Chapter H",
"derived by cross-indexing Target Type & Range on To Hit Table",
"Basic TH# plus any modifications for Gun and Ammo Types",
"Number beneath Gun Caliber & Length on applicable To Kill Table",
"Basic TK# plus applicable To Kill Modifications (Cases A-D)",
"The Modified TK# minus the AF of the Target Facing hit",
"FP-Range",
"Morale-Leadership",
"FP-Range-Morale",
"abbr. for Bow Flamethrower",
"HE NA", "AP NA",
"Chinese, Non-Elite Italians, Inexperienced, Allied/Axis Minors"
],
"regexes": [
"^ASOP .+$",
"^(RB )?OCG[0-9.]+$",
"^PCG[0-9.]+[a-e]?$",
"^(PB )?QCG[0-9.]+$",
"^RCG[0-9.]+$",
"^TCG[0-9.]+[a-e]?$",
"^SSR[0-9.]+$",
"^(RB CG )?SSR .+$",
"^(RB )?CG[0-9.]+$",
"^(SSR |SSRs )?(ABtF|KGP|PB|RB|BRT)[0-9.]+$",
"Chapter [A-Z] [Ii]ntroduction",
"Chapter [A-Z] [Dd]ivider",
"^footnote [A-Z]\\d+",
"^.+ [Oo]verlay$",
" Multi-Applicable Note ",
" (Vehicle|Ordnance) Note "
]
}

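nb: _check_targets() (in all.py above) expands each chapter letter into a ruleid pattern, and compiles
the "regexes" entries as-is. A sketch of the chapter expansion:

    import re
    # "H" becomes ^H[0-9.]+[A-Ea-e]?$, so e.g. "H1.23" and "H9.5a" count as known ruleids
    assert re.search( r"^H[0-9.]+[A-Ea-e]?$", "H1.23" )
    assert not re.search( r"^H[0-9.]+[A-Ea-e]?$", "A1.23" )
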
--- /dev/null
+++ b/asl_rulebook2/extract/data/target-fixups.json
@@ -0,0 +1,400 @@
{
"A1": {
"A.10LEADERSHIP DRM ():": {
"new_ruleid": "A.10",
"new_caption": "LEADERSHIP DRM (&#9651;)"
}
},
"A3": {
"23": { "new_ruleid": null }
},
"A5": {
"3.BASIC SEQUENCE OF PLAY": {
"new_ruleid": "A3",
"new_caption": "BASIC SEQUENCE OF PLAY"
},
"TURN RECORD CHART": { "new_ruleid": null }
},
"A21": {
"9.223CANCELLATION:": {
"new_ruleid": "A9.223",
"new_caption": "CANCELLATION"
}
},
"A28": {
"1 -": { "new_ruleid": null }
},
"A29": {
"1 -": { "new_ruleid": null }
},
"A30": {
"11.CLOSE COMBAT (CC)": {
"new_ruleid": "A11",
"new_caption": "CLOSE COMBAT (CC)"
}
},
"A31": {
"11.2WITHDRAWALFROMMELEE:": {
"new_ruleid": "A11.2",
"new_caption": "WITHDRAWAL FROM MELEE"
}
},
"A34": {
"12.CONCEALMENT": {
"new_ruleid": "A12",
"new_caption": "CONCEALMENT"
}
},
"A37": {
"5 12.2 CONCEALED / COUNTERS:": {
"new_ruleid": "A12.2",
"new_caption": "CONCEALED &frac58;\" COUNTERS"
}
},
"A38": {
"13.CAVALRY": {
"new_ruleid": "A13",
"new_caption": "CAVALRY"
}
},
"A39": {
"6MF 1MF2MF6FP": { "new_ruleid": null },
"4FP6FP4FP": { "new_ruleid": null }
},
"A43": {
"A18.2 LEADER CREATION TABLE*LEADER CREATION drm": { "new_ruleid": null }
},
"A46": {
"21.CAPTURED EQUIPMENT": {
"new_ruleid": "A21",
"new_caption": "CAPTURED EQUIPMENT"
}
},
"A50": {
"C24.5 STRENGTH:": {
"new_ruleid": "A24.5",
"new_caption": "STRENGTH"
}
},
"A51": {
"30 25.NATIONALITY DISTINCTIONS": {
"new_ruleid": "A25",
"new_caption": "NATIONALITY DISTINCTIONS"
}
},
"A54": {
"25.53 FREEFRENCH:": {
"new_ruleid": "A25.53",
"new_caption": "FREE FRENCH"
}
},
"A55": {
"26.VICTORYCONDITIONS": {
"new_ruleid": "A26",
"new_caption": "VICTORY CONDITIONS"
}
},
"B4": {
"6.BRIDGES": {
"new_ruleid": "B6",
"new_caption": "BRIDGES"
}
},
"B6": {
"8.45BROKEN & BERSERK:": {
"new_ruleid": "B8.45",
"new_caption": "BROKEN & BERSERK"
}
},
"B15": {
"11.CLIFFS": {
"new_ruleid": "B11",
"new_caption": "CLIFFS"
}
},
"B17": {
"13.8 PINEWOODS:": {
"new_ruleid": "B13.8",
"new_caption": "PINE WOODS"
},
"13.81 OBSTACLEHEIGHT:": {
"new_ruleid": "B13.81",
"new_caption": "OBSTACLE HEIGHT"
},
"13.82 MFCOST:": {
"new_ruleid": "B13.82",
"new_caption": "MF COST"
}
},
"B19": {
"17.CRAG": {
"new_ruleid": "17",
"new_caption": "CRAG"
}
},
"B22": {
"2 2": { "new_ruleid": null }
},
"B24": {
"23.BUILDINGS": {
"new_ruleid": "B23",
"new_caption": "BUILDINGS"
}
},
"B33": {
"0 25.64 WIND DIRECTION:": {
"new_ruleid": "B25.64",
"new_caption": "WIND DIRECTION"
}
},
"B35": {
"53": { "new_ruleid": null },
"1 2": { "new_ruleid": null }
},
"C7": {
"2.3 360 MOUNT:": {
"new_ruleid": "C2.3",
"new_caption": "360&deg; MOUNT:"
}
},
"C11": {
"5.31 CASE C; BOUNDING FIRST FIRER, RESTRICTED AIM: 1": {
"new_ruleid": "C5.31",
"new_caption": "CASE C<sup>1</sup>; BOUNDING FIRST FIRER, RESTRICTED AIM"
},
"5.32 CASE C; BOUNDING FIRST FIRER, LIMITED AIM:": {
"new_ruleid": "C5.32",
"new_caption": "CASE C<sup>2</sup>; BOUNDING FIRST FIRER, LIMITED AIM"
},
"5.34 CASE C; LATW:": {
"new_ruleid": "C5.34",
"new_caption": "CASE C<sup>3</sup>; LATW"
},
"5.35 CASE C; MOTION FIRER:": {
"new_ruleid": "C5.35",
"new_caption": "CASE C<sup>4</sup>; MOTION FIRER"
}
},
"C12": {
"6.11 CASE J; RESTRICTED AIM:": {
"new_ruleid": "C6.11",
"new_caption": "CASE J<sup>1</sup>; RESTRICTED AIM"
},
"6.12 CASE J; LIMITED AIM:": {
"new_ruleid": "C6.12",
"new_caption": "CASE J<sup>2</sup>; LIMITED AIM"
},
"6.13 CASE J; FFNAM:": {
"new_ruleid": "C6.13",
"new_caption": "CASE J<sup>3</sup>; FFNAM"
},
"6.14 CASE J; FFMO:": {
"new_ruleid": "C6.14",
"new_caption": "CASE J<sup>4</sup>; FFMO"
}
},
"C13": {
"21--": { "new_ruleid": null },
"12": { "new_ruleid": null }
},
"C15": {
"7.7 AFV DESTRU": { "new_ruleid": null },
"1KIA": { "new_ruleid": null }
},
"C16": {
"1819 8.11APCR(A)/APDS (D):": {
"new_ruleid": "C8.11",
"new_caption": "APCR (A)/APDS (D)"
}
},
"C20": {
"10.3 MANHANDLING DRM:": { "new_ruleid": null }
},
"C21": {
"1 GUN DESTRUCTION TABLE": { "new_ruleid": null }
},
"D4": {
"46": { "new_ruleid": null }
},
"D5": {
"13": { "new_ruleid": null }
},
"D6": {
"10 MP": { "new_ruleid": null },
"2 6": { "new_ruleid": null },
"1 21": { "new_ruleid": null },
"1 /": { "new_ruleid": null }
},
"D9": {
"56,": { "new_ruleid": null },
"1 2": { "new_ruleid": null }
},
"D10": {
"3.71 LOW AMMO B# (B # ):": {
"new_ruleid": "D3.71",
"new_caption": "LOW AMMO B#"
}
},
"D11": {
"3 1": { "new_ruleid": null },
"1 1 3": { "new_ruleid": null }
},
"D17": {
"1 9 12.5 2": { "new_ruleid": null }
},
"D22": {
"5 1 1(4)1 1 1(4)(1)(1) 1": { "new_ruleid": null },
"1 1": { "new_ruleid": null },
"1(4)": { "new_ruleid": null, "instances": 4 },
"1(4) 5": { "new_ruleid": null },
"1 1 1 (1) 1": { "new_ruleid": null },
"1 1(4)": { "new_ruleid": null }
},
"E5": {
"1.": { "new_ruleid": null }
},
"E6": {
"2.": { "new_ruleid": null },
"3.": { "new_ruleid": null }
},
"E7": {
"E3. DYO TEMPERATE WEATHER CHART": { "new_ruleid": null }
},
"E14": {
"18 7.AIR SUPPORT Fighter-Bomber/Stuka Counter example": {
"new_ruleid": "E7",
"new_caption": "AIR SUPPORT"
}
},
"E24": {
"1)": { "new_ruleid": null },
"2)": { "new_ruleid": null },
"3)": { "new_ruleid": null }
},
"E27": {
"1)": { "new_ruleid": null, "instances": 2 },
"2)": { "new_ruleid": null, "instances": 2 },
"3)": { "new_ruleid": null }
},
"F18": {
"D3": { "new_ruleid": null },
"W1": { "new_ruleid": null },
"H4": { "new_ruleid": null }
},
"G30": {
"1 ION TABLE": { "new_ruleid": null },
"2 :": { "new_ruleid": null },
"3 :": { "new_ruleid": null },
"4 :": { "new_ruleid": null },
"5 :": { "new_ruleid": null },
"6 :": { "new_ruleid": null },
"8 :": { "new_ruleid": null },
"9 :": { "new_ruleid": null },
"10 :": { "new_ruleid": null },
"10 Armored": { "new_ruleid": null },
"11 :": { "new_ruleid": null },
"11": { "new_ruleid": null, "instances": 4 }
},
"G34": {
"13.2BEACHELEVATION&SLOPE:": {
"new_ruleid": "G13.2",
"new_caption": "BEACH ELEVATION & SLOPE"
}
},
"G42": {
"1133": { "new_ruleid": null, "instances": 10 },
"11233": { "new_ruleid": null },
"10": { "new_ruleid": null },
"11": { "new_ruleid": null },
"12": { "new_ruleid": null }
},
"G45": {
"1 U.S.M.C. DEFENSEBN.SWALLOTMENTCHART": { "new_ruleid": null },
"1 U.S.M.C. PARA/RAIDERSQUADSWALLOTMENTCHART": { "new_ruleid": null },
"1 U.S.M.C. RIFLE/BARSQUADSWALLOTMENTCHART": { "new_ruleid": null },
"1 U.S.M.C. OBA AVAILABILITY CHART": { "new_ruleid": null },
"11/42-11/43YEAR8-10/4210/436/447-12/441945 DR: 2": { "new_ruleid": null },
"234356": { "new_ruleid": null },
"10": { "new_ruleid": null },
"11": { "new_ruleid": null },
"12": { "new_ruleid": null },
"55": { "new_ruleid": null }
},
"J1": {
"1.MINIATURES:": { "new_ruleid": null }
},
"W4": {
"!, 1.37 FORTIFICATIONS:": {
"new_ruleid": "W1.37",
"new_caption": "FORTIFICATIONS:"
}
},
"W5": {
"17": { "new_ruleid": null },
"18 3.2 REPUBLIC OF KOREA ARMY (ROKA):": {
"new_ruleid": "W3.2",
"new_caption": "REPUBLIC OF KOREA ARMY (ROKA)"
}
},
"W6": {
"27": { "new_ruleid": null }
}
}

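nb: These fixups are applied by _save_target() (in content.py above). Entries are keyed by ASL page ID
(e.g. "A34"), then by the caption text exactly as it was parsed. A null "new_ruleid" drops the parsed
caption entirely (used for table/chart debris), "new_caption" replaces the caption, and "instances"
(default 1) says how many occurrences to consume, e.g.:

    "A34": { "12.CONCEALMENT": { "new_ruleid": "A12", "new_caption": "CONCEALMENT" } }

Anything left in this table at the end of the run is reported as an unused fixup.
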
--- /dev/null
+++ b/asl_rulebook2/extract/index.py
@@ -0,0 +1,383 @@

#!/usr/bin/env python3
""" Extract the index from the MMP eASLRB. """

import os
import json
import re

import click
from pdfminer.layout import LTChar

from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr
from asl_rulebook2.pdf import PdfDoc, PageIterator, PageElemIterator
from asl_rulebook2.utils import parse_page_numbers, fixup_text, extract_parens_content, jsonval

# ---------------------------------------------------------------------

_DEFAULT_ARGS = {
    "pages": "10-41",
    "index_vp_left": 0, "index_vp_right": 565, "index_vp_top": 715, "index_vp_bottom": 20, # viewport
    "first_title": "a", "last_title": "X#", # first/last index entries
}

# ---------------------------------------------------------------------

class ExtractIndex( ExtractBase ):
    """Extract the index from the MMP eASLRB."""

    def __init__( self, args, log=None ):
        super().__init__( args, _DEFAULT_ARGS, log )
        self._index_entries = None
        # prepare to fixup problems in the index content
        fname2 = os.path.join( os.path.dirname(__file__), "data/index-fixups.json" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            self._fixups = json.load( fp )

    def extract_index( self, pdf ):
        """Extract the index from the MMP eASLRB."""

        # initialize
        page_nos = parse_page_numbers( self._args["pages"] )
        curr_title = curr_content = None

        # process each page in the index
        for page_no, page, lt_page in PageIterator( pdf ):

            if page_no > max( page_nos ):
                break
            if page_no not in page_nos:
                self._log_msg( "progress", "- Skipping page {}.", page_no )
                continue
            self._log_msg( "progress", "- Processing page {}...", page_no )

            # process each element on the page
            self._prev_y0 = 99999
            elem_filter = lambda e: isinstance( e, LTChar )
            for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter ):

                # check if we should ignore this element
                if not self._in_viewport( elem, "index" ):
                    continue
                if self._is_ignore( elem ):
                    continue

                # NOTE: We identify the start of a new index entry by bold text at the start of a line.
                # We then collect the remaining bold text as the index entry's title, until we see some
                # non-bold text. This is collected as the index entry's content, until we see the start
                # of the next index entry.

                # figure out what we've got
                if self._is_bold( elem ):
                    if curr_content is not None:
                        # we've found the start of a new index entry
                        if curr_title:
                            # save the index entry we've just finished collecting
                            self._save_index_entry( curr_title, curr_content )
                            if curr_title == self._args["last_title"]:
                                curr_title = curr_content = None
                                break # nb: that was the last one - we're all done
                        curr_title = curr_content = None
                    if curr_title is None:
                        # start collecting the title
                        curr_title = elem.get_text()
                    else:
                        # continue collecting the title
                        curr_title += elem.get_text()
                else:
                    if curr_content is None:
                        # start collecting the content text
                        curr_content = elem.get_text()
                    else:
                        # continue collecting the content text
                        if elem.y0 - self._prev_y0 < -1 and curr_content.endswith( "-" ):
                            # join up hyphenated words
                            curr_content = curr_content[:-1]
                        curr_content += elem.get_text()

                # loop back to process the next element
                self._prev_y0 = elem.y0

        # add the last index entry (if it hasn't already been done)
        if curr_title:
            self._save_index_entry( curr_title, curr_content )

        # check for unused fixups
        if self._fixups:
            self._log_msg( "warning", "Unused fixups: {}", self._fixups )

        # process the content for each index entry
        if not self._index_entries:
            raise RuntimeError( "Didn't find the first title (\"{}\").".format( self._args["first_title"] ) )
        self._process_content()

    def _save_index_entry( self, title, content ):
        """Save a parsed index entry."""

        # check if we've started parsing index entries
        # NOTE: There is some bold text at the start of the index, which we parse as an index title,
        # so we don't save anything until we've actually seen the first index entry.
        if self._index_entries is None:
            if title != self._args["first_title"]:
                return
            self._index_entries = []

        # initialize
        title, content = title.strip(), content.strip()
        if content.startswith( ":" ):
            content = content[1:].strip() # nb: this comes after the title, but we don't need it

        # save the new index entry
        if title == "bold":
            # FUDGE! Some entries have "bold" in their content, using a bold font :-/, which we detect
            # as the start of a new entry. We fix that up here.
            self._index_entries[-1]["content"] = "{} bold {}".format(
                self._index_entries[-1]["content"], fixup_text( content )
            )
        elif title == "C" and self._index_entries[-1]["title"] == "FFE":
            # FUDGE! The colon in the title for "FFE:C" is non-bold, so we parse this as two separate
            # index titles ("FFE" and "C") :-/ We can't fix this up in the normal way, since there is
            # also a real "FFE" entry, so we do it in the code here.
            self._index_entries[-1].update( {
                "title": "FFE:C", "content": fixup_text( content )
            } )
        else:
            # save the new index entry
            index_entry = self._make_index_entry( title, content )
            if index_entry:
                self._index_entries.append( index_entry )
            # FUDGE! EX/EXC are mis-parsed as a single index entry - we correct that in the fixups, and here.
            if title == "EX":
                self._index_entries.append( self._make_index_entry( "EXC", "Exception" ) )

    def _make_index_entry( self, title, content ):
        """Create a new index entry."""

        # initialize
        orig_content = content
        title = fixup_text( title )
        if title.endswith( ":" ):
            title = title[:-1]

        # check for any fixups
        fixup = self._fixups.pop( title, None )
        if fixup:
            # replace the title
            title = fixup.get( "new_title", title )
            # do any search-replace's
            for sr in fixup.get( "replace", [] ):
                new_content = content.replace( sr[0], sr[1] )
                if new_content == content:
                    self._log_msg( "warning", "Content fixup had no effect for \"{}\": {}", title, sr[0] )
                else:
                    content = new_content
            # replace the content
            old_content = fixup.get( "old_content" )
            if old_content:
                if fixup_text( content ) != old_content:
                    self._log_msg( "warning", "Unexpected content for \"{}\" - skipping fixup.", title )
                else:
                    new_content = fixup.get( "new_content" )
                    if not new_content:
                        return None
                    content = new_content

        # FUDGE! There are two "Entry" index entries, but one of them should be "Entry (Offboard)" (the parsing code
        # is actually correct, since the "(Offboard)" is not bold). We can't really fix this via the usual data-driven
        # fixups, so we fix it in the code here.
        if title == "Entry" and content.startswith( "(Offboard): " ):
            title += " (Offboard)"
            content = content[12:]

        return {
            "title": title,
            "content": fixup_text( content ),
            "raw_content": orig_content
        }

    def _process_content( self ):
        """Extract information out of the index entries into a structured form."""
        for index_entry in self._index_entries:

            # initialize
            content = index_entry[ "content" ]

            # extract any "see also"
            mo = re.search( r"\(see (also )?(.+?)\):?", content )
            if mo:
                see_also = [ sa.strip() for sa in mo.group(2).split( "," ) ]
                if "SW" in see_also or "Class" in see_also:
                    # FUDGE! See-also's are normally comma-separated, but we don't want to
                    # split things like "Recovery, SW" or "Class, Personnel Types".
see_also = [ mo.group(2) ]
index_entry[ "see_also" ] = see_also
content = content[:mo.start()] + content[mo.end():]
content = content.strip()
# extract any sub-title
if content.startswith( "(" ):
pos = content.find( ")" )
if pos < 0:
# FUDGE! Some index entries have the closing ) missing :-/
pos = content.find( ":" )
subtitle, content = content[1:pos], content[pos+1:]
else:
subtitle, content = extract_parens_content( content )
index_entry[ "subtitle" ] = subtitle
if content.startswith( ":" ):
content = content[1:]
content = content.strip()
# extract any ruleid's
ruleids = []
while True:
if content == "A./G.":
break # nb: special handling for "NCC" (National Capabilities Chart)
mo = re.search( r"^(SSR )?[A-Z]{1,3}[0-9.-]+[A-Fa-f]?", content )
if not mo:
break
ruleids.append( mo.group() )
content = content[mo.end():].strip()
if content.startswith( "," ):
content = content[1:].strip()
else:
break
if ruleids:
index_entry[ "ruleids" ] = ruleids
# extract any ruleref's
rulerefs = []
matches = list( re.finditer( r"\[(.+?)\]", content ) )
if matches:
for mo in reversed(matches):
val = mo.group(1)
# NOTE: We search for the ":" from the right, to avoid picking it up in the ruleref text.
pos = val.rfind( ":" )
if pos > 0:
vals = re.split( "[;,]", val[pos+1:] )
ruleids = [ v.strip() for v in vals ]
val = val[:pos]
else:
ruleids = None
rulerefs.append( { "caption": val, "ruleids": ruleids } )
content = content[:mo.start()] + content[mo.end():]
index_entry[ "rulerefs" ] = list( reversed( rulerefs ) )
# save the final content
content = re.sub( r"\s+", " ", content ).strip()
if content:
index_entry[ "content" ] = content
else:
del index_entry["content"]
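
The bracketed ruleref syntax handled above can be summarized in isolation. A standalone sketch (parse_ruleref is a made-up helper, and the sample strings are illustrative, not taken from the real index):

import re
def parse_ruleref( val ):
    """Split "[...]" bracket content into a caption and its ruleid's."""
    # nb: we search for the ":" from the right, since the caption may itself contain one
    pos = val.rfind( ":" )
    if pos > 0:
        return val[:pos], [ v.strip() for v in re.split( "[;,]", val[pos+1:] ) ]
    return val, None
print( parse_ruleref( "Gun Duel: C2.2401, C5.2" ) ) # => ('Gun Duel', ['C2.2401', 'C5.2'])
print( parse_ruleref( "FFE:C: B1.1" ) )             # => ('FFE:C', ['B1.1'])
print( parse_ruleref( "errata" ) )                  # => ('errata', None)
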
def _is_ignore( self, elem ):
"""Check if we should ignore an element on the page."""
# check if we have a bold item as the first thing on a line
if self._is_bold( elem ) and elem.y0 - self._prev_y0 < -1:
# yup - check if it's near the start of the line
if self._is_near_start_of_line( elem ):
# yup - this is the title for an index entry
return False
# nope - this is a header that indicates a new section (the index is grouped by letter)
return True
return False
def _is_near_start_of_line( self, elem ):
"""Check if the element is near the start of its line."""
if self._args["index_vp_left"] <= elem.x0 <= self._args["index_vp_left"]+20:
# yup (left column)
return True
left = self._args["index_vp_left"] + (self._args["index_vp_right"]+1 - self._args["index_vp_left"]) / 2
if left <= elem.x0 <= left+20:
# yup (right column)
return True
return False
def save_as_raw( self, out ):
"""Save the raw results."""
for index_entry in self._index_entries:
print( "=== {} ===".format( index_entry["title"] ), file=out )
print( "{}".format( index_entry["raw_content"] ), file=out )
print( file=out )
def save_as_text( self, out ):
"""Save the results as plain-text."""
for index_entry in self._index_entries:
print( "=== {} ===".format( index_entry["title"] ), file=out )
if "subtitle" in index_entry:
print( index_entry["subtitle"], file=out )
if index_entry.get( "ruleids" ):
print( "RULEID'S: {}".format(
" ; ".join( index_entry["ruleids"] )
), file=out )
if index_entry.get( "see_also" ):
print( "SEE ALSO: {}".format(
" ; ".join( index_entry["see_also"] ),
), file=out )
if index_entry.get( "content" ):
print( "CONTENT:", index_entry["content"], file=out )
if index_entry.get( "rulerefs" ):
print( "RULEREF'S:", file=out )
for ruleref in index_entry["rulerefs"]:
if ruleref["ruleids"]:
ruleids = [ "[{}]".format(ri) for ri in ruleref["ruleids"] ]
print( "- {} {}".format( ruleref["caption"], " ".join(ruleids) ), file=out )
else:
print( "- {}".format( ruleref["caption"] ), file=out )
print( file=out )
def save_as_json( self, out ):
"""Save the results as JSON."""
entries = []
for index_entry in self._index_entries:
buf = []
buf.append( "{{ \"title\": {}".format( jsonval(index_entry["title"]) ) )
if "subtitle" in index_entry:
buf.append( " \"subtitle\": {}".format( jsonval(index_entry["subtitle"]) ) )
if index_entry.get( "ruleids" ):
buf.append( " \"ruleids\": {}".format( jsonval(index_entry["ruleids"]) ) )
if index_entry.get( "see_also" ):
buf.append( " \"see_also\": {}".format( jsonval(index_entry["see_also"]) ) )
if index_entry.get( "content" ):
buf.append( " \"content\": {}".format( jsonval(index_entry["content"]) ) )
if index_entry.get( "rulerefs" ):
buf2 = []
for ruleref in index_entry["rulerefs"]:
buf2.append( " {{ \"caption\": {}, \"ruleids\": {} }}".format(
jsonval( ruleref["caption"] ),
jsonval( ruleref["ruleids"] )
) )
buf.append( " \"rulerefs\": [\n{}\n ]".format( ",\n".join(buf2) ) )
entries.append( ",\n".join( buf ) + "\n}" )
print( "[\n\n{}\n\n]".format( ",\n\n".join(entries) ), file=out )
# ---------------------------------------------------------------------
@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--output","-o","output_fname", required=True, help="Where to save the extracted index." )
def main( pdf_file, args, progress, format, output_fname ):
"""Extract the index from the MMP eASLRB."""
# initialize
args = ExtractBase.parse_args( args, _DEFAULT_ARGS )
# extract the index
def log_msg( msg_type, msg ):
if msg_type == "progress" and not progress:
return
log_msg_stderr( msg_type, msg )
extract = ExtractIndex( args, log_msg )
extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
with PdfDoc( pdf_file ) as pdf:
extract.extract_index( pdf )
# save the results
with open( output_fname, "w", encoding="utf-8" ) as out:
getattr( extract, "save_as_"+format )( out )
if __name__ == "__main__":
main() #pylint: disable=no-value-for-parameter
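
The title/content state machine described in the NOTE in extract_index() reduces to something like the following standalone sketch (parse_runs and the flat list of (text, is_bold) runs are hypothetical stand-ins for the per-character LTChar processing, and the sample entries are made up):

def parse_runs( runs ):
    """Group (text, is_bold) runs into (title, content) index entries."""
    entries, title, content = [], None, None
    for text, is_bold in runs:
        if is_bold:
            if content is not None:
                # bold text after content => a new index entry is starting
                entries.append( ( title, content ) )
                title = content = None
            title = text if title is None else title + text
        else:
            content = text if content is None else content + text
    if title:
        entries.append( ( title, content ) )
    return entries
print( parse_runs( [
    ( "Abandonment", True ), ( " of vehicle: D5.41", False ),
    ( "Acquisition", True ), ( " C6.5", False ),
] ) )
# => [('Abandonment', ' of vehicle: D5.41'), ('Acquisition', ' C6.5')]
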

@ -1,7 +1,5 @@
""" Parse and process a PDF. """
import collections
import click
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
@ -10,6 +8,8 @@ from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer
from pdfminer.pdfpage import PDFPage
from asl_rulebook2.utils import remove_quotes, roundf
# ---------------------------------------------------------------------
class PdfDoc:
@ -33,7 +33,7 @@ class PdfDoc:
if self._fp:
self._fp.close()
def dump_pdf( self, dump_toc=True, pages=None, elem_filter=None, out=None ):
def dump_pdf( self, dump_toc=True, page_nos=None, sort_elems=False, elem_filter=None, out=None ):
"""Dump the PDF document."""
# dump the TOC
@ -41,15 +41,14 @@ class PdfDoc:
self._dump_toc( out=out )
# dump each page
max_page_no = max( pages ) if pages else None
first_page = not dump_toc
for page_no, page in PageIterator( self ):
for page_no, page, lt_page in PageIterator( self ): #pylint: disable=unused-variable
# parse the next page
self.interp.process_page( page )
if pages and page_no not in pages:
continue
lt_page = self.device.get_result()
if page_nos:
if page_no > max( page_nos ):
break
if page_no not in page_nos:
continue
# dump the page details
if first_page:
@ -61,15 +60,9 @@ class PdfDoc:
click.echo( file=out )
# dump each element on the page
for depth, elem in PageElemIterator( lt_page ):
if elem_filter and not elem_filter( elem ):
continue
for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):
click.echo( "{}- {}".format( depth*" ", elem ), file=out )
# check if we're done
if max_page_no and page_no >= max_page_no:
break
def _dump_toc( self, out=None ):
"""Dump a PDF document's TOC."""
@ -84,9 +77,7 @@ class PdfDoc:
if depth > 1:
bullet = "*" if depth == 2 else "-"
click.echo( "{}{} ".format( (depth-2)*" ", bullet ), nl=False, file=out )
title = repr( title ).strip()
if title[0] in ('"',"'") and title[-1] == title[0]:
title = title[1:-1]
title = remove_quotes( repr( title ).strip() )
col = "cyan" if depth <= 2 else "green"
click.echo( "{} => {}".format(
click.style( title, fg=col ),
@ -101,41 +92,60 @@ class PageIterator:
def __init__( self, pdf ):
self.pdf = pdf
self._pages = PDFPage.create_pages( pdf.doc )
self._page_no = 0
self._curr_page_no = 0
def __iter__( self ):
return self
def __next__( self ):
"""Return the next page."""
page = next( self._pages )
self._page_no += 1
return self._page_no, page
while True:
self._curr_page_no += 1
page = next( self._pages )
self.pdf.interp.process_page( page )
lt_page = self.pdf.device.get_result()
return self._curr_page_no, page, lt_page
# ---------------------------------------------------------------------
class PageElemIterator:
"""Iterate over each element in a page."""
def __init__( self, lt_page ):
def __init__( self, lt_page, elem_filter=None, sort_elems=False ):
self.lt_page = lt_page
# collect all the elements (so that they can be sorted)
self._elems = collections.deque()
self._elems = []
self._curr_elem_no = -1
def walk( elem, depth ):
for child in elem:
self._elems.append( ( depth, child ) )
# NOTE: If elements are to be sorted, we ignore anything that is not laid out.
if not sort_elems or hasattr( child, "x0" ):
if not elem_filter or elem_filter( child ):
self._elems.append( ( depth, child ) )
if isinstance( child, LTContainer ):
walk( child, depth+1 )
walk( lt_page, 0 )
if sort_elems:
def sort_key( elem ):
col_no = 0 if elem[1].x0 < lt_page.width/2 else 1
# NOTE: Some elements that should be aligned are actually misaligned by a minuscule amount (e.g. 10^-5),
# so to stop this from resulting in the wrong sort order, we truncate the decimal places.
# NOTE: Characters are often rendered in different fonts, with bounding boxes that don't align neatly.
# I tried sorting by the centre of the bounding boxes, but superscripts cause problems :-/
ypos = - roundf( elem[1].y1, 1 )
xpos = roundf( elem[1].x0, 1 )
return col_no, ypos, xpos
self._elems.sort( key=sort_key )
def __iter__( self ):
return self
def __next__( self ):
"""Return the next element on the page."""
if not self._elems:
self._curr_elem_no += 1
if self._curr_elem_no >= len(self._elems):
raise StopIteration()
return self._elems.popleft()
return self._elems[ self._curr_elem_no ]
# ---------------------------------------------------------------------
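
The effect of the sort key above can be demonstrated with stand-in elements (Elem is a hypothetical namedtuple standing in for pdfminer layout objects; roundf is the real helper from asl_rulebook2.utils):

import collections
from asl_rulebook2.utils import roundf
Elem = collections.namedtuple( "Elem", [ "x0", "y1" ] )
def sort_key( elem, page_width=600 ):
    col_no = 0 if elem.x0 < page_width/2 else 1 # left column sorts before right
    return col_no, -roundf( elem.y1, 1 ), roundf( elem.x0, 1 ) # then top-down, left-to-right
elems = [ Elem(x0=400,y1=700), Elem(x0=50,y1=699.99999), Elem(x0=10,y1=700) ]
print( sorted( elems, key=sort_key ) )
# => the two left-column elements (x0=10, then x0=50, their y1's rounding to the
#    same value), followed by the right-column one
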

@ -0,0 +1,3 @@
"""Module definitions."""
pytest_options = None #pylint: disable=invalid-name

@ -0,0 +1,116 @@
""" Test eASLRB extraction. """
import os
import io
import pytest
from asl_rulebook2.pdf import PdfDoc
from asl_rulebook2.extract.index import ExtractIndex
from asl_rulebook2.extract.content import ExtractContent
from asl_rulebook2.extract.all import ExtractAll
from asl_rulebook2.tests import pytest_options
# ---------------------------------------------------------------------
@pytest.mark.skipif( not pytest_options.easlrb_path, reason="eASLRB not available." )
@pytest.mark.skipif( pytest_options.short_tests, reason="--short-tests specified." )
def test_extract_index():
"""Test extracting the index."""
def do_test( dname ):
# extract the index
fname = os.path.join( dname, "eASLRB.pdf" )
with PdfDoc( fname ) as pdf:
extract = ExtractIndex( args={}, log=_check_log_msg )
extract.extract_index( pdf )
buf = io.StringIO()
extract.save_as_text( buf )
buf = buf.getvalue()
# check the results
fname = os.path.join( dname, "index.txt" )
assert open( fname, "r", encoding="utf-8" ).read() == buf
# run the test
_for_each_version( do_test )
# ---------------------------------------------------------------------
@pytest.mark.skipif( not pytest_options.easlrb_path, reason="eASLRB not available." )
@pytest.mark.skipif( pytest_options.short_tests, reason="--short-tests specified." )
def test_extract_content():
"""Test extracting content."""
def do_test( dname ):
# extract the content
fname = os.path.join( dname, "eASLRB.pdf" )
with PdfDoc( fname ) as pdf:
extract = ExtractContent( args={}, log=_check_log_msg )
extract.extract_content( pdf )
targets_buf, footnotes_buf = io.StringIO(), io.StringIO()
extract.save_as_text( targets_buf, footnotes_buf )
targets_buf = targets_buf.getvalue()
footnotes_buf = footnotes_buf.getvalue()
# check the results
fname2 = os.path.join( dname, "targets.txt" )
assert open( fname2, "r", encoding="utf-8" ).read() == targets_buf
fname2 = os.path.join( dname, "footnotes.txt" )
assert open( fname2, "r", encoding="utf-8" ).read() == footnotes_buf
# run the test
_for_each_version( do_test )
# ---------------------------------------------------------------------
@pytest.mark.skipif( not pytest_options.easlrb_path, reason="eASLRB not available." )
@pytest.mark.skipif( pytest_options.short_tests, reason="--short-tests specified." )
def test_extract_all():
"""Test extracting everything."""
def do_test( dname ):
# extract everything
fname = os.path.join( dname, "eASLRB.pdf" )
with PdfDoc( fname ) as pdf:
extract = ExtractAll( args={}, log=_check_log_msg )
extract.extract_all( pdf )
index_buf = io.StringIO()
extract.extract_index.save_as_json( index_buf )
index_buf = index_buf.getvalue()
targets_buf, footnotes_buf = io.StringIO(), io.StringIO()
extract.extract_content.save_as_json( targets_buf, footnotes_buf )
targets_buf = targets_buf.getvalue()
footnotes_buf = footnotes_buf.getvalue()
# check the results
fname2 = os.path.join( dname, "index.json" )
assert open( fname2, "r", encoding="utf-8" ).read() == index_buf
fname2 = os.path.join( dname, "targets.json" )
assert open( fname2, "r", encoding="utf-8" ).read() == targets_buf
fname2 = os.path.join( dname, "footnotes.json" )
assert open( fname2, "r", encoding="utf-8" ).read() == footnotes_buf
# run the test
_for_each_version( do_test )
# ---------------------------------------------------------------------
def _for_each_version( func ):
"""Run tests for each version of the eASLRB."""
base_dir = pytest_options.easlrb_path
ncalls = 0
for name in os.listdir( base_dir ):
dname = os.path.join( base_dir, name )
if os.path.isdir( dname ):
func( dname )
ncalls += 1
assert ncalls > 0
def _check_log_msg( msg_type, msg ):
"""Check a log message."""
assert msg_type not in ( "warning", "error" ), \
"Unexpected {}: {}".format( msg_type, msg )

@ -1,6 +1,51 @@
""" Miscellaneous utilities. """
import pathlib
import re
import math
# ---------------------------------------------------------------------
def fixup_text( val ):
"""Fixup special characters in a string."""
# fixup smart quotes, dashes and other non-ASCII characters
def replace_chars( val, ch, targets ):
for target in targets:
val = val.replace( target, ch )
return val
val = replace_chars( val, '"', [ "\u00ab", "\u00bb", "\u201c", "\u201d", "\u201e", "\u201f", "\u02dd" ] )
val = replace_chars( val, "'", [ "\u2018", "\u2019", "\u201a", "\u201b", "\u2039", "\u203a" ] )
val = replace_chars( val, " - ", [ "\u2013", "\u2014" ] )
val = replace_chars( val, "-", [ "\u2022" ] ) # nb: bullet
val = replace_chars( val, "&le;", [ "\u2264" ] )
val = replace_chars( val, "&ge;", [ "\u2265" ] )
val = replace_chars( val, "&#9651;", [ "\u2206" ] ) # nb: "no leadership DRM" triangle
val = replace_chars( val, "&reg;", [ "\u00ae" ] ) # nb: circled R
val = replace_chars( val, "&deg;", [ "\u00b0" ] ) # nb: degree sign
val = replace_chars( val, "&auml;", [ "\u00e4" ] )
# replace fractions with their corresponding HTML entity
for frac in [ (1,2), (1,3), (2,3), (3,8), (5,8) ]:
val = re.sub(
r"\b{}/{}(?=(\"| MF| MP))".format( frac[0], frac[1] ),
"&frac{}{};".format( frac[0], frac[1] ),
val
)
return val
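
A quick check of the replacements above, using fixup_text as defined:

from asl_rulebook2.utils import fixup_text
print( fixup_text( "\u201cFirefight\u201d\u20141/2\" MF" ) )
# => '"Firefight" - &frac12;" MF'
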
def extract_parens_content( val ):
"""Extract content in parenthesis (including nested parentheses)."""
assert val[0] == "("
nesting = 0
for pos, ch in enumerate(val):
if ch == "(":
nesting += 1
elif ch == ")":
nesting -= 1
if nesting <= 0:
return val[1:pos], val[pos+1:]
return val # nb: if we get here, we have unclosed parentheses :-/
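
For example, with nested parentheses:

from asl_rulebook2.utils import extract_parens_content
print( extract_parens_content( "(Leader (Armor)) A7.5" ) )
# => ('Leader (Armor)', ' A7.5')
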
# ---------------------------------------------------------------------
@ -11,10 +56,62 @@ def parse_page_numbers( val, offset=0 ):
"""
vals = set()
if val:
for v in val.split( "," ):
for v in str(val).split( "," ):
mo = re.search( r"^(\d+)-(\d+)$", v )
if mo:
vals.update( range( int(mo.group(1)), int(mo.group(2))+1 ) )
else:
vals.add( int(v) )
return [ v+offset for v in vals ]
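
For example (the result is an unsorted list, hence the sorted() calls):

from asl_rulebook2.utils import parse_page_numbers
print( sorted( parse_page_numbers( "2,5,9-11" ) ) )         # => [2, 5, 9, 10, 11]
print( sorted( parse_page_numbers( "10-12", offset=-1 ) ) ) # => [9, 10, 11]
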
# ---------------------------------------------------------------------
def jsonval( val ):
"""Return a value in a JSON-safe format."""
if val is None:
return "null"
if isinstance( val, int ):
return val
if isinstance( val, list ):
if not val:
return "[]"
vals = [ jsonval(v) for v in val ]
return "[ {} ]".format( ", ".join( vals ) )
if isinstance( val, str ):
val = "".join(
ch if 32 <= ord(ch) <= 127 else r"\u{:04x}".format(ord(ch))
for ch in val
)
return '"{}"'.format( val.replace('"',r'\"') )
assert False, "Unknown JSON data type: {}".format( type(val) )
return '"???"'
def change_extn( fname, extn ):
"""Change a filename's extension."""
return pathlib.Path( fname ).with_suffix( extn )
def append_text( buf, new ):
"""Append text to a buffer."""
if buf:
if buf[-1] == "-":
return buf[:-1] + new # nb: join hyphenated words
if buf[-1] != "/":
buf += " "
return buf + new
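
For example:

from asl_rulebook2.utils import append_text
print( append_text( "counter-", "battery" ) ) # => 'counterbattery' (hyphenated join)
print( append_text( "A/", "G" ) )             # => 'A/G' (no space after "/")
print( append_text( "", "fire" ) )            # => 'fire' (empty buffer)
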
def remove_quotes( val ):
"""Remove enclosing quotes from a string."""
if val[0] in ('"',"'") and val[-1] == val[0]:
val = val[1:-1]
return val
def remove_trailing( val, ch ):
"""Remove a trailing character from a string."""
if val.endswith( ch ):
val = val[:-1]
return val
def roundf( val, ndigits ):
"""Round a floating-point value."""
pow10 = math.pow( 10, ndigits )
return int( pow10 * val + 0.5 ) / pow10

@ -11,16 +11,15 @@ from asl_rulebook2.utils import parse_page_numbers
@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--toc","dump_toc", is_flag=True, default=False, help="Dump the TOC." )
@click.option( "--pages","-p", help="Page(s) to dump (e.g. 2,5,9-15)." )
def main( pdf_file, dump_toc, pages ):
@click.option( "--pages","-p","page_nos", help="Page(s) to dump (e.g. 2,5,9-15)." )
@click.option( "--sort","-s","sort_elems", is_flag=True, default=False, help="Sort elements within each page." )
def main( pdf_file, dump_toc, page_nos, sort_elems ):
"""Dump a PDF file."""
# process the command-line arguments
pages = parse_page_numbers( pages )
# dump the PDF file
page_nos = parse_page_numbers( page_nos )
with PdfDoc( pdf_file ) as pdf:
pdf.dump_pdf( dump_toc=dump_toc, pages=pages )
pdf.dump_pdf( dump_toc=dump_toc, page_nos=page_nos, sort_elems=sort_elems )
# ---------------------------------------------------------------------

@ -18,8 +18,8 @@ def main( pdf_file, output_fname, pages ):
# NOTE: This extracts pages from the eASLRB, so we can work on specific parts of it without having to load
# the entire document each time. In particular, it maintains the internal PDF structure of each page.
# The files as small as you might expect (e.g. extracting a single page results in a file only about half
# the size), but processing them are significantly faster.
# The files are not as small as you might expect (e.g. extracting a single page results in a file only
# about half the size), but processing them is significantly faster.
# process the command-line arguments
pages = parse_page_numbers( pages, offset=-1 )
@ -34,7 +34,7 @@ def main( pdf_file, output_fname, pages ):
del outline.root[-1]
# extract the specified pages
print( "Extracting pages:", ", ".join( str(p) for p in sorted(pages) ) )
print( "Extracting pages:", ", ".join( str(1+p) for p in sorted(pages) ) )
for page_no in range( len(pdf.pages)-1, -1, -1 ):
if page_no not in pages:
del pdf.pages[ page_no ]

@ -0,0 +1,33 @@
""" pytest support functions. """
import pytest
_pytest_options = None
# ---------------------------------------------------------------------
def pytest_addoption( parser ):
"""Configure pytest options."""
# NOTE: This file needs to be in the project root for this to work :-/
# add test options
parser.addoption(
"--easlrb", action="store", dest="easlrb_path", default=None,
help="Directory containing the MMP eASLRB PDF and extracted data file(s)."
)
# add test options
parser.addoption(
"--short-tests", action="store_true", dest="short_tests", default=False,
help="Skip running the longer tests."
)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def pytest_configure( config ):
"""Called after command-line options have been parsed."""
global _pytest_options
_pytest_options = config.option
import asl_rulebook2.tests
asl_rulebook2.tests.pytest_options = _pytest_options

@ -41,6 +41,9 @@ setup(
( "asl-rulebook2", ["LICENSE.txt"] ),
],
entry_points = {
"console_scripts": "dump-pdf = bin.dump_pdf:main",
"console_scripts": [
"dump-pdf = bin.dump_pdf:main",
"extract-all = asl_rulebook2.extract.all:main"
],
}
)
