parent
c2265404bc
commit
e3ebbcd0f7
@ -0,0 +1,147 @@ |
||||
#!/usr/bin/env python3 |
||||
""" Extract everything we need from the MMP eASLRB. """ |
||||
|
||||
import sys |
||||
import os |
||||
import json |
||||
import re |
||||
import importlib |
||||
|
||||
import click |
||||
|
||||
from asl_rulebook2.pdf import PdfDoc |
||||
from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr |
||||
from asl_rulebook2.extract.index import ExtractIndex |
||||
from asl_rulebook2.extract.content import ExtractContent |
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
class ExtractAll( ExtractBase ):
    """Extract everything from the eASLRB.

    This is a thin driver that runs ExtractIndex and ExtractContent over the same PDF,
    then cross-checks the extracted index against the extracted content targets.
    """

    def __init__( self, args, log=None ):
        # nb: we pass None for args/default_args since each sub-extractor parses its own args
        super().__init__( None, None, log )
        self._args = args

    def extract_all( self, pdf ):
        """Extract everything from the eASLRB."""

        # initialize: merge the default args from each extraction module
        # nb: use a separate loop variable for the module name, so we don't shadow the module object
        default_args = {}
        for mod_name in ( "index", "content" ):
            mod = importlib.import_module( "asl_rulebook2.extract." + mod_name )
            default_args.update( mod._DEFAULT_ARGS )

        # extract the index
        self._log_msg( "progress", "\nExtracting the index..." )
        args = ExtractBase.parse_args( self._args, default_args )
        self.extract_index = ExtractIndex( args, self._log )
        self.extract_index.extract_index( pdf )

        # extract the content
        self._log_msg( "progress", "\nExtracting the content..." )
        args = ExtractBase.parse_args( self._args, default_args )
        self.extract_content = ExtractContent( args, self._log )
        self.extract_content.extract_content( pdf )

        # verify the index targets
        self._check_targets()

    def _check_targets( self ):
        """Cross-check ruleid's and ruleref's in the index against targets in the main content."""

        # build an index of known targets (ruleid -> caption)
        targets = {}
        for ruleid, target in self.extract_content._targets.items():
            assert ruleid not in targets
            targets[ ruleid ] = target["caption"]

        # load the list of known missing targets
        known_strings, known_regexes = set(), set()
        fname = os.path.join( os.path.dirname(__file__), "data/known-missing-ruleids.json" )
        with open( fname, "r", encoding="utf-8" ) as fp:
            data = json.load( fp )
        for chapter in data["chapters"]:
            # nb: accept any ruleid belonging to these chapters
            known_regexes.add( re.compile( "^{}[0-9.]+[A-Ea-e]?$".format( chapter ) ) )
        known_strings.update( data["strings"] )
        known_regexes.update(
            re.compile( regex ) for regex in data["regexes"]
        )

        def is_known_ruleid( ruleid ):
            """Check if a ruleid corresponds to a known target."""
            ruleid = re.sub( r"-[A-Z]?\.?\d+$", "", ruleid ) # e.g. "A1.23-.45" -> "A1.23"
            if ruleid.endswith( " EX" ):
                ruleid = ruleid[:-3]
            if ruleid in targets:
                return True
            if ruleid in known_strings:
                return True
            if any( regex.search( ruleid ) for regex in known_regexes ):
                return True
            return False

        # check each index entry
        first = True
        for index_entry in self.extract_index._index_entries:

            errors = []

            # check the index entry's ruleid's
            for ruleid in index_entry.get( "ruleids", [] ):
                if not is_known_ruleid( ruleid ):
                    errors.append( "Unknown ruleid: {}".format( ruleid ) )

            # check the index entry's ruleref's
            for ruleref in index_entry.get( "rulerefs", [] ):
                if not ruleref["ruleids"]:
                    continue
                # check each ruleref
                if ", ".join( r for r in ruleref["ruleids"] ) in known_strings:
                    # NOTE: This is some free-form text that has been split up because it contains commas.
                    continue
                for ruleid in ruleref["ruleids"]:
                    if not is_known_ruleid( ruleid ):
                        errors.append( "Unknown ruleref target: {} => [{}]".format( ruleref["caption"], ruleid ) )

            # log any errors
            if errors:
                if first:
                    self._log_msg( "warning", "\n=== Unknown targets ===\n" )
                    first = False
                errors = [ "- {}".format( e ) for e in errors ]
                self._log_msg( "warning", "{}:\n{}",
                    index_entry["caption"], "\n".join(errors)
                )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--save-index","save_index_fname", required=True, help="Where to save the extracted index." )
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname, save_footnotes_fname ):
    """Extract everything we need from the MMP eASLRB."""

    # extract everything
    def log_msg( msg_type, msg ):
        # nb: progress messages are suppressed unless --progress was given
        if msg_type == "progress" and not progress:
            return
        log_msg_stderr( msg_type, msg )
    extract = ExtractAll( args, log_msg )
    extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
    with PdfDoc( pdf_file ) as pdf:
        extract.extract_all( pdf )

    # save the results
    # nb: the output format ("raw"/"text"/"json") selects which save_as_XXX method is called
    with open( save_index_fname, "w", encoding="utf-8" ) as index_out, \
        open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
        open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
        getattr( extract.extract_index, "save_as_"+format )( index_out )
        getattr( extract.extract_content, "save_as_"+format )( targets_out, footnotes_out )

if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter
@ -0,0 +1,59 @@ |
||||
""" Base class for the extraction tools. """ |
||||
|
||||
import sys |
||||
|
||||
import click |
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
class ExtractBase:
    """Base functionality shared by the extraction tools."""

    def __init__( self, args, default_args, log ):
        # NOTE: The caller's args dict is used directly; any missing keys
        # are filled in from the supplied defaults.
        self._args = args
        if default_args:
            for key, val in default_args.items():
                self._args.setdefault( key, val )
        self._log = log

    @staticmethod
    def parse_args( args, default_args ):
        """Parse command-line "key=val" configuration parameters."""
        parsed = {}
        for arg in args:
            key, sep, val = arg.partition( "=" )
            if not sep:
                raise RuntimeError( "Invalid configuration parameter: {}".format( arg ) )
            if key not in default_args:
                raise RuntimeError( "Unknown configuration parameter: {}".format( key ) )
            # nb: purely-numeric values are converted to int's
            parsed[ key ] = int(val) if val.isdigit() else val
        return parsed

    def _in_viewport( self, elem, vp_type ):
        """Check if an element lies strictly inside the configured viewport."""
        return self._args[vp_type+"_vp_left"] < elem.x0 \
            and elem.x1 < self._args[vp_type+"_vp_right"] \
            and self._args[vp_type+"_vp_bottom"] < elem.y0 \
            and elem.y1 < self._args[vp_type+"_vp_top"]

    @staticmethod
    def _is_bold( elem ):
        """Check if an element is using a bold font."""
        bold_suffixes = ( "-Bold", ",Bold", "-BoldMT" )
        return elem.fontname.endswith( bold_suffixes )

    def _log_msg( self, msg_type, msg, *args, **kwargs ):
        """Format and log a message (no-op if no logger was configured)."""
        if self._log:
            self._log( msg_type, msg.format( *args, **kwargs ) )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
def log_msg_stderr( msg_type, msg ):
    """Log a message to stderr (warnings are highlighted in yellow)."""
    is_warning = ( msg_type == "warning" )
    output = click.style( "WARNING: {}".format( msg ), fg="yellow" ) if is_warning else msg
    click.echo( output, file=sys.stderr )
@ -0,0 +1,524 @@ |
||||
#!/usr/bin/env python3 |
||||
""" Extract content from the MMP eASLRB. """ |
||||
|
||||
import os |
||||
import json |
||||
import re |
||||
import math |
||||
from collections import defaultdict |
||||
|
||||
import click |
||||
from pdfminer.layout import LTChar |
||||
|
||||
from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr |
||||
from asl_rulebook2.pdf import PdfDoc, PageIterator, PageElemIterator |
||||
from asl_rulebook2.utils import parse_page_numbers, fixup_text, append_text, remove_trailing, jsonval |
||||
|
||||
# NOTE: Characters are laid out individually on the page, and we generally want to process them top-to-bottom, |
||||
# left-to-right, but in some cases, alignment is messed up (e.g. the bounding boxes don't line up properly |
||||
# and e.g. the first part of a sentence is infintesimally lower down than the rest of the sentence, and so |
||||
# appears later in the sort order), and we get better results if we process characters in the order in which |
||||
# they appear in the PDF document. |
||||
# Pages (identified by their ASL page ID e.g. "B40") whose elements should be processed
# in PDF document order, rather than sorted top-to-bottom/left-to-right (see the NOTE above).
_DISABLE_SORT_ITEMS = [
    "B40", # nb: to detect B31.1 NARROW STREET
    "A58","A59","A60", # Chapter A footnotes (nb: page A61 is a mess wrt element order :-/)
    "B45", "B46", # Chapter B footnotes
    "C25", "C26", # Chapter C footnotes
    "D27", # Chapter D footnotes
    "E28", "E29", "E30", # Chapter E footnotes
    "F20", "F21", # Chapter F footnotes
    "G48", "G49", "G50", # Chapter G footnotes
]

# Default configuration parameters (can be overridden via the --arg command-line option).
# The "chapter-X" values are PDF page ranges, parsed by parse_page_numbers().
_DEFAULT_ARGS = {
    "chapter-a": "42-102", "chapter-b": "109-154", "chapter-c": "158-183", "chapter-d": "187-213",
    "chapter-e": "216-245", "chapter-f": "247-267", "chapter-g": "270-319",
    "chapter-j": "593",
    "chapter-w": "647-664",
    "content_vp_left": 0, "content_vp_right": 565, "content_vp_top": 715, "content_vp_bottom": 28, # viewport
    "disable-sort-items": ",".join( _DISABLE_SORT_ITEMS )
}
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
class ExtractContent( ExtractBase ): |
||||
"""Extract content from the MMP eASLRB.""" |
||||
|
||||
def __init__( self, args, log=None ):
    """Initialize the extractor and load the data-driven fixup files."""
    super().__init__( args, _DEFAULT_ARGS, log )
    self._targets = {}
    self._footnotes = {}
    # prepare to fixup problems in the content
    # nb: the two fixup files are loaded the same way, so we factor out the common code
    def load_fixups( fname ):
        """Load a JSON fixup file from our data/ directory."""
        fname = os.path.join( os.path.dirname(__file__), "data", fname )
        with open( fname, "r", encoding="utf-8" ) as fp:
            return json.load( fp )
    self._target_fixups = load_fixups( "target-fixups.json" )
    self._footnote_fixups = load_fixups( "footnote-fixups.json" )
||||
|
||||
def extract_content( self, pdf ):
    """Extract content from the MMP eASLRB.

    Walks the configured chapter pages and extracts rule targets (ruleid + caption)
    into self._targets, and chapter footnotes into self._footnotes.
    """

    # figure out which pages to process
    chapter_pages = {} # maps chapters to page numbers
    page_index = {} # maps page numbers to chapter
    for key, val in _DEFAULT_ARGS.items():
        if key.startswith( "chapter-" ):
            page_nos = parse_page_numbers( val )
            assert len(key) == 9 # nb: keys are of the form "chapter-X"
            chapter = key[8].upper()
            chapter_pages[ chapter ] = page_nos
            for page_no in page_nos:
                page_index[ page_no ] = chapter
    disable_sort_items = set( self._args["disable-sort-items"].split( "," ) )

    # initialize
    self._curr_chapter = None
    curr_chapter_pageno = None
    self._curr_footnote = None

    # NOTE: The parsing code works in two modes.
    # - We start off extracting content, and detect the start of a new rule by bold text near the start of the line.
    # - When we see the footnotes header (e.g. "CHAPTER A FOOTNOTES"), we switch into footnotes mode, and detect
    #   the start of a footnote by a bold number near the start of the line.

    # process each page
    for page_no, page, lt_page in PageIterator( pdf ):

        # prepare to process the next page
        if page_no > max( page_index.keys() ):
            break # nb: no more pages of interest
        if page_no not in page_index:
            self._log_msg( "progress", "- Skipping page {}.", page_no )
            continue
        if not self._curr_chapter or self._curr_chapter != page_index[page_no]:
            # we've found the start of a new chapter
            self._save_footnote() # nb: save the last footnote of the previous chapter
            self._curr_chapter = page_index[ page_no ]
            curr_chapter_pageno = 1
        else:
            curr_chapter_pageno += 1
        self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page#
            self._curr_chapter, curr_chapter_pageno
        )
        self._log_msg( "progress", "- Processing page {} ({})...", page_no, self._curr_pageid )

        # process each element on the page
        curr_caption = None # nb: [text, (x,y)] of the caption currently being collected
        self._top_left_elem = self._prev_elem = None
        elem_filter = lambda e: isinstance( e, LTChar )
        sort_elems = self._curr_pageid not in disable_sort_items
        for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):

            # keep track of the top-left-most bold element
            if self._is_bold( elem ):
                if self._top_left_elem is None \
                  or elem.x0 < self._top_left_elem.x0 and elem.y1 > self._top_left_elem.y1:
                    self._top_left_elem = elem

            # check if we should ignore this element
            if not self._in_viewport( elem, "content" ):
                continue

            # check if we're currently extracting footnotes
            if self._curr_footnote is not None:
                self._on_footnote_elem( elem, lt_page )
                self._prev_elem = elem
                continue

            # figure out what we've got
            is_bold = self._is_bold( elem )
            if is_bold and curr_caption and curr_caption[0].isdigit() and 1 < elem.y1 - self._prev_elem.y0 < elem.height/2:
                # the previous bold character looks like a footnote superscript - ignore it
                curr_caption = None
            if curr_caption and elem.get_text() == " ":
                # FUDGE! Some captions are in a bold font, but the spaces are not :-/
                is_bold = True
            if is_bold:
                if curr_caption:
                    # NOTE: We stop collecting bold characters at the end of the line, even if they continue on
                    # to the next line. This is to handle the case of a major heading (e.g. "1. PERSONNEL COUNTERS")
                    # being followed by a lesser heading ("1.1"). However, we want to handle captions that span
                    # multiple lines, so we check the vertical distance between the lines to see if it looks like
                    # two separate headings, or a single caption that has spread over multiple lines.
                    if self._prev_elem.y0 - elem.y1 > 0.25*elem.height:
                        # we've found the start of a new rule - save the old one, start collecting the new caption
                        self._save_target( curr_caption, page_no, lt_page, elem )
                        curr_caption = [ elem.get_text(), ( elem.x0, elem.y1 ) ]
                    else:
                        # continue collecting the caption
                        if self._prev_elem.y0 - elem.y0 > 1:
                            # nb: we just started a new line
                            curr_caption[0] = append_text( curr_caption[0], elem.get_text() )
                        else:
                            curr_caption[0] += elem.get_text()
                else:
                    # check if this is the first character of the line
                    if self._is_start_of_line( elem, lt_page ):
                        # yup - start collecting the caption
                        curr_caption = [ elem.get_text(), ( elem.x0, elem.y1 ) ]
            else:
                # check if we're currently collecting a caption
                if curr_caption:
                    # yup - we've just found the end of it, save it
                    self._save_target( curr_caption, page_no, lt_page, elem )
                    curr_caption = None

            # loop back to process the next element
            self._prev_elem = elem

    # add the last caption/footnote (if they haven't already been done)
    # NOTE(review): curr_caption/page_no are only bound inside the page loop - this assumes
    # at least one chapter page was processed; confirm that's always the case.
    self._save_footnote()
    if curr_caption:
        self._save_target( curr_caption, page_no, None, None )

    # check for unused fixups (i.e. entries in the fixup files that never matched anything)
    if self._target_fixups:
        self._log_msg( "warning", "Unused fixups: {}", self._target_fixups )
    if self._footnote_fixups:
        self._log_msg( "warning", "Unused fixups: {}", self._footnote_fixups )
||||
|
||||
def _save_target( self, caption, page_no, lt_page, elem ):
    """Save a parsed target.

    caption is a [text, (x,y)] pair; page_no is the PDF page number. The ruleid is
    extracted from the caption (applying any data-driven fixups) and the target is
    recorded in self._targets, keyed by ruleid.
    """

    # initialize
    orig_caption = caption[0]
    caption_text = re.sub( r"\s+", " ", caption[0] ).strip()
    if len(caption_text) <= 1:
        # NOTE: We're finding text that is part of an image (e.g. the "E" for an Elite MMC),
        # perhaps because the pages were OCR'ed, so we ignore these.
        return

    # check if we've found the start of the chapter's footnotes
    if "FOOTNOTES" in caption_text :
        # yup - notify the main loop
        # NOTE(review): _on_footnote_elem() indexes _curr_footnote[0]/[1] - starting with
        # an empty list here looks fragile; confirm against how footnote headers are parsed.
        self._curr_footnote = []
        if elem:
            self._on_footnote_elem( elem, lt_page )
        return

    # check if the entry needs to be fixed up
    fixup = self._target_fixups.get( self._curr_pageid, {} ).get( caption_text )
    if fixup:
        # yup - make it so
        # nb: each fixup can be applied a limited number of times ("instances", default 1)
        fixup[ "instances" ] = fixup.get("instances",1) - 1
        if fixup["instances"] <= 0:
            # nb: this fixup has been used up - remove it, so leftovers can be reported later
            self._target_fixups[ self._curr_pageid ].pop( caption_text )
            if not self._target_fixups[ self._curr_pageid ]:
                del self._target_fixups[ self._curr_pageid ]
        ruleid = fixup.get( "new_ruleid" )
        if not ruleid:
            return # nb: no new ruleid means "drop this target"
        caption_text = fixup.get( "new_caption" )
    else:
        # nope - use what was parsed
        # FUDGE! There are a lot of layout problems with things like "12.CONCEALMENT" (i.e. missing space),
        # and it's tricky to detect these and not get tripped up by things like "12.C blah", so we handle it
        # as a separate case.
        mo = re.search( r"^(\d+\.\d*)([^ 0-9].+)", caption_text )
        if mo:
            ruleid, caption_text = mo.group(1), mo.group(2).strip()
        else:
            # check if the caption text starts with something that looks like a ruleid
            # NOTE: A leading "*" indicates an optional rule.
            mo = re.search( r"^\*?([A-Z]\.?)?[1-9][0-9.-]*[A-F]?", caption_text )
            if not mo:
                return # nb: doesn't look like a rule heading - ignore it
            ruleid, caption_text = mo.group(), caption_text[mo.end():].strip()
            if ruleid.startswith( "*" ):
                ruleid = ruleid[1:]
        ruleid = remove_trailing( ruleid, "." )
        caption_text = remove_trailing( caption_text, ":" )

    # save the new target
    if not ruleid.startswith( self._curr_chapter ):
        ruleid = self._curr_chapter + ruleid
    if ruleid in self._targets:
        self._log_msg( "warning", "Ignoring duplicate ruleid: {} (from \"{}\").",
            ruleid, caption[0]
        )
        return
    if caption_text == "\u2014":
        caption_text = "-" # nb: for A7.306 :-/
    self._targets[ ruleid ] = {
        "caption": fixup_text(caption_text), "page_no": page_no, "pos": caption[1],
        "raw_caption": orig_caption
    }
||||
|
||||
def _on_footnote_elem( self, elem, lt_page ):
    """Process an element while we're parsing footnotes.

    self._curr_footnote is a 2-element list: [ footnote ID, content ].
    """
    # check if we've found the start of a new footnote
    if self._is_bold( elem ):
        if elem.get_text().isdigit() and self._is_start_of_line( elem, lt_page ):
            # yup - save the current footnote, start collecting the new one
            self._save_footnote()
            self._curr_footnote = [ elem.get_text(), "" ]
        else:
            if self._curr_footnote[1]:
                # FUDGE! Some footnote content has bold text hard-up at the left margin,
                # so we collect that as normal content.
                self._curr_footnote[1] += elem.get_text()
            else:
                # we're still collecting the footnote's ID
                # NOTE: Older chapters have only the footnote ID in bold text, while newer chapters have
                # both the ID and caption in bold. We figure out what's going on later, in _save_footnote().
                self._curr_footnote[0] += elem.get_text()
    else:
        # nope - we're still collecting the footnote's content
        if not self._prev_elem or elem.x0 < self._prev_elem.x0 or elem.y0 - self._prev_elem.y0 > lt_page.height/2:
            # nb: we just started a new line
            self._curr_footnote[1] = append_text( self._curr_footnote[1], elem.get_text() )
        else:
            self._curr_footnote[1] += elem.get_text()
||||
|
||||
def _save_footnote( self ):
    """Save a parsed footnote.

    Splits self._curr_footnote into its ID, referenced rule and content, applies any
    data-driven fixups, and appends the result to self._footnotes for the current chapter.
    """

    if not self._curr_footnote:
        return

    # initialize
    if self._curr_chapter not in self._footnotes:
        # start saving footnotes for the chapter
        self._footnotes[ self._curr_chapter ] = []
    orig_content = self._curr_footnote[1]

    # separate the footnote ID, referenced rule, and content
    if self._curr_chapter in ( "F", "G", "W" ):
        # NOTE: Chapter F/G footnote captions are also bold, so the collected "ID" actually
        # contains "NN. CAPTION..." and needs to be split apart.
        mo = re.search( r"^\d{1,2}\.", self._curr_footnote[0] )
        if mo:
            parts = mo.group(), self._curr_footnote[0][mo.end():]
            self._curr_footnote[0] = parts[0]
            self._curr_footnote[1] = parts[1].strip() + " " + self._curr_footnote[1].strip()
        else:
            # nb: the old message hard-coded "Chapter F", but we get here for F/G/W
            self._log_msg( "warning", "Couldn't split Chapter {} footnote caption: {}",
                self._curr_chapter, self._curr_footnote[0]
            )
    footnote_id = remove_trailing( self._curr_footnote[0].strip(), "." )
    content = self._curr_footnote[1].strip()
    mo = re.search( r"^(F\.1B|W\.\d+[AB]|[A-Z]?[0-9.]+)", content )
    if mo:
        # the content starts with the ruleid the footnote refers to
        ruleid, content = mo.group(), content[mo.end():]
        if not ruleid.startswith( self._curr_chapter ):
            ruleid = self._curr_chapter + ruleid
        ruleid = remove_trailing( ruleid, "." )
    else:
        ruleid = None
    if self._curr_chapter == "C":
        # FUDGE! The "29." for Chapter C's footnote #29 is misaligned, and is extracted as two separate
        # footnotes "2" and "9". There isn't really any way to fix this via the normal data-driven mechanism,
        # so we do it in the code here :-/
        footnote_ids = [ f["footnote_id"] for f in self._footnotes[self._curr_chapter] ]
        if footnote_id == "2" and "2" in footnote_ids:
            return
        if footnote_id == "9" and "9" in footnote_ids:
            footnote_id = "29"

    # clean up the content
    content = re.sub( r"\s+", " ", content ).strip()
    content = fixup_text( content )
    mo = re.search( r"^[A-Z ]+:\S", content )
    if mo:
        # nb: insert a space after a leading "CAPTION:" that is hard up against the content
        content = content[:mo.end()-1] + " " + content[mo.end()-1:]

    # check for any fixups
    captions = []
    fixups = self._footnote_fixups.get( self._curr_chapter, {} ).get( footnote_id )
    if fixups:
        if isinstance( fixups, list ):
            # NOTE: A simple search-and-replace is, by far, the most common fixup, so we provide
            # a simplified way of specifying these in the fixup file
            fixups = { "replace": [ ( sr[0], sr[1] ) for sr in fixups ] }
        errors = defaultdict( list )
        # do any search-replace's
        if "replace" in fixups:
            for sr in fixups["replace"]:
                prev_content = content
                content = content.replace( sr[0], sr[1] )
                if content == prev_content:
                    self._log_msg( "warning", "Footnote fixup for \"{}:{}\" had no effect: {}",
                        self._curr_chapter, footnote_id, sr[0]
                    )
                    errors["replace"].append( sr )
            del fixups["replace"]
        # replace the captions
        if "captions" in fixups:
            captions = fixups.pop( "captions" )
        # check that all fixups were successfully applied
        if fixups:
            # "fixups" now contains only unrecognized directives - record them as errors
            # BUGFIX: this used to call errors.append(fixups), but defaultdict has no append()
            # method and this raised AttributeError whenever unknown directives were present.
            errors.update( fixups )
        if errors:
            # nb: leave the failed fixups behind, so they get reported as unused later
            self._footnote_fixups[ self._curr_chapter ][ footnote_id ] = errors
        else:
            del self._footnote_fixups[ self._curr_chapter ][ footnote_id ]
            if not self._footnote_fixups[ self._curr_chapter ]:
                del self._footnote_fixups[ self._curr_chapter ]
        content = content.strip()

    # extract the footnote's caption (the leading "CAPTION:" part of the content)
    if not captions:
        pos = content.find( ":" )
        if pos >= 0:
            captions.append( ( ruleid, content[:pos] ) )
            content = content[pos+1:].strip()
        else:
            self._log_msg( "warning", "Can't extract footnote caption: {}:{} - {}",
                self._curr_chapter, footnote_id, content
            )

    # check for the credits at the end of the Chapter F footnotes
    pos = content.find( "WEST OF ALAMEIN CREDITS" )
    if pos > 0:
        content = content[:pos]

    # save the footnote
    self._footnotes[ self._curr_chapter ].append( {
        "footnote_id": footnote_id,
        "captions": captions,
        "content": content,
        "raw_content": orig_content
    } )
    self._curr_footnote = None
||||
|
||||
def _is_start_of_line( self, elem, lt_page ):
    """Check if the element is at the start of its line.

    NOTE: We can't just check the element's x co-ordinate, since there is sometimes
    a floating image that pushes the text right (e.g. A.12).
    """
    prev = self._prev_elem
    if prev is None:
        return True # nb: this is the first element we've seen
    if elem.y0 < prev.y0:
        return True # nb: the element is lower down the page than the previous one
    # check if the element is at the top of the right column
    half_width = lt_page.width / 2
    return prev.x0 < half_width and elem.x0 > half_width
||||
|
||||
def save_as_raw( self, targets_out, footnotes_out ):
    """Save the raw (unprocessed) results."""
    self._save_as_raw_or_text( targets_out, footnotes_out, raw=True )
||||
|
||||
def save_as_text( self, targets_out, footnotes_out ):
    """Save the results as plain-text."""
    self._save_as_raw_or_text( targets_out, footnotes_out, raw=False )
||||
|
||||
def _save_as_raw_or_text( self, targets_out, footnotes_out, raw ):
    """Save the results as raw or plain-text.

    If raw is True, the unprocessed captions/content are written out; otherwise
    the cleaned-up versions are written.
    """

    # save the targets (grouped by page, with a header line for each page)
    curr_page_no = None
    for ruleid, target in self._targets.items():
        if target["page_no"] != curr_page_no:
            if curr_page_no:
                print( file=targets_out ) # nb: blank line between pages
            print( "=== p{} ===".format( target["page_no"] ), file=targets_out )
            curr_page_no = target["page_no"]
        xpos, ypos = self._get_target_pos( target )
        if raw:
            print( "[{},{}] = {}".format(
                xpos, ypos, target["raw_caption"]
            ), file=targets_out )
        else:
            print( "{} => {} @ p{}:[{},{}]".format(
                ruleid, target["caption"], target["page_no"], xpos, ypos
            ), file=targets_out )

    # save the footnotes
    def make_caption( caption ):
        # nb: caption is a (ruleid, caption-text) pair; either part may be empty
        buf = []
        if caption[1]:
            buf.append( caption[1] )
            if caption[0]:
                buf.append( "[{}]".format( caption[0] ) )
        elif caption[0]:
            buf.append( caption[0] )
        return " ".join( buf )
    for chapter, footnotes in self._footnotes.items():
        if chapter != "A":
            print( file=footnotes_out ) # nb: blank line between chapters
        # nb: the header is padded with "=" and truncated to exactly 80 characters
        print( "=== CHAPTER {} FOOTNOTES {}".format( chapter, 80*"=" )[:80], file=footnotes_out )
        for footnote in footnotes:
            print( file=footnotes_out )
            print( "--- Footnote {} ---".format( footnote["footnote_id"] ), file=footnotes_out )
            if raw:
                print( footnote["raw_content"], file=footnotes_out )
            else:
                print( " ; ".join( make_caption(c) for c in footnote["captions"] ), file=footnotes_out )
                print( footnote["content"], file=footnotes_out )
||||
|
||||
def save_as_json( self, targets_out, footnotes_out ):
    """Save the results as JSON.

    NOTE: The JSON output is built by hand (rather than via json.dumps) so that
    the layout of the generated files can be controlled exactly.
    """

    # save the targets
    targets, curr_chapter = [], None
    for ruleid, target in self._targets.items():
        xpos, ypos = self._get_target_pos( target )
        targets.append( "{}: {{ \"caption\": {}, \"page_no\": {}, \"pos\": [{},{}] }}".format(
            jsonval( ruleid ),
            jsonval(target["caption"]), target["page_no"], xpos, ypos
        ) )
        if ruleid[0] != curr_chapter:
            # nb: insert a blank line when we move on to a new chapter
            targets[-1] = "\n" + targets[-1]
            curr_chapter = ruleid[0]
    print( "{{\n{}\n\n}}".format(
        ",\n".join( targets )
    ), file=targets_out )

    # save the footnotes
    def make_caption( caption ):
        # nb: caption is a (ruleid, caption-text) pair
        return "{{ \"caption\": {}, \"ruleid\": {} }}".format(
            jsonval(caption[1]), jsonval(caption[0])
        )
    chapters = []
    for chapter in self._footnotes:
        footnotes = []
        for footnote in self._footnotes[chapter]:
            footnotes.append( "{}: {{\n \"captions\": {},\n \"content\": {}\n}}".format(
                jsonval( footnote["footnote_id"] ),
                "[ {} ]".format( ", ".join( make_caption(c) for c in footnote["captions"] ) ),
                jsonval( footnote["content"] )
            ) )
        chapters.append( "{}: {{\n\n{}\n\n}}".format(
            jsonval( chapter ),
            ",\n".join( footnotes )
        ) )
    print( "{{\n\n{}\n\n}}".format(
        ",\n\n".join( chapters )
    ), file=footnotes_out )
||||
|
||||
@staticmethod
def _get_target_pos( target ):
    """Return a target's X/Y position on the page (x rounded down, y rounded up)."""
    pos = target["pos"]
    return math.floor( pos[0] ), math.ceil( pos[1] )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, format, save_targets_fname, save_footnotes_fname ):
    """Extract content from the MMP eASLRB."""

    # initialize
    args = ExtractBase.parse_args( args, _DEFAULT_ARGS )

    # extract the content
    def log_msg( msg_type, msg ):
        # nb: progress messages are suppressed unless --progress was given
        if msg_type == "progress" and not progress:
            return
        log_msg_stderr( msg_type, msg )
    extract = ExtractContent( args, log_msg )
    extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
    with PdfDoc( pdf_file ) as pdf:
        extract.extract_content( pdf )

    # save the results
    # nb: the output format ("raw"/"text"/"json") selects which save_as_XXX method is called
    with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
        open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
        getattr( extract, "save_as_"+format )( targets_out, footnotes_out )

if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter
@ -0,0 +1,209 @@ |
||||
{ |
||||
|
||||
"A": { |
||||
|
||||
"10A": [ |
||||
[ "OneHalfFP", "One-Half FP" ], |
||||
[ "firstappearedintheASLAnnual'89.(In1998,bothwerereprintedin Classic ASL.)", "first appeared in the ASL Annual '89. (In 1998, both were reprinted in Classic ASL.)" ], |
||||
[ "One of the several criticisms", "<p> One of the several criticisms" ] |
||||
], |
||||
"12": [ [ "TEMto", "TEM to" ] ], |
||||
"14": [ |
||||
[ "bipodmounted", "bipod-mounted" ], |
||||
[ "volume o f fire", "volume of fire" ] |
||||
], |
||||
"17": [ [ "adistinct", "a distinct" ] ], |
||||
"19" : [ [ "wellsited", "well-sited" ] ], |
||||
"32": [ [ "HWunits", "HW units" ] ], |
||||
"33": [ [ "multiLocation", "multi-Location" ] ], |
||||
"35": [ [ "The original printing", "<p> The original printing" ] ], |
||||
"37": [ |
||||
[ "- Winter War (vs Soviet Union) 30 November 1939 - 13 March 1940- Continuation War (vs Soviet Union) 25 June 1941 - 4 September 1944- Lapland War (vs Germany) 15 September 1944 - 27 April 1945", " <ul> <li> <b>Winter War</b> (vs Soviet Union) 30 November 1939 - 13 March 1940 <li> <b>Continuation War</b> (vs Soviet Union) 25 June 1941 - 4 September 1944 <li> <b>Lapland War</b> (vs Germany) 15 September 1944 - 27 April 1945 </ul>" ]
||||
], |
||||
"38": [ |
||||
[ "Romania: Romania,", " <p> <b>Romania</b>: Romania," ], |
||||
[ "Hungary: A traditional", " <p> <b>Hungary</b>: A traditional" ], |
||||
[ "Slovakia: Urged on", "<p> <b>Slovakia</b>: Urged on" ], |
||||
[ "German-Croatian units in Russia:", " <p> <b>German-Croatian units in Russia</b>:" ], |
||||
[ "Italian-Croatian units in Russia:", " <p> <b>Italian-Croatian units in Russia</b>:" ], |
||||
[ "Croatian units in Yugoslavia:", " <p> <b>Croatian units in Yugoslavia</b>:" ], |
||||
[ "CroatianArmyunitswereengagedprimarilyinanti-partisanactivities,fightingmostly", "Croatian Army units were engaged primarily in anti-partisan activities, fighting mostly" ], |
||||
[ "Bulgaria: Bulgaria", "<p> <b>Bulgaria</b>: Bulgaria" ], |
||||
[ "WhiletheriflecompanydidnothaveaninherentHeavyWeapons(HW)platoon,it", "While the rifle company did not have an inherent Heavy Weapons (HW) platoon, it"] |
||||
], |
||||
"39": [ [ "generallyapply", "generally apply" ] ], |
||||
"41": [ [ "ViceAdmiral", "Vice-Admiral" ] ], |
||||
"43": [ |
||||
[ "ALLIEDMINORS", "ALLIED MINORS" ], |
||||
[ "BARrather", "BAR rather" ] |
||||
] |
||||
|
||||
}, |
||||
|
||||
"B": { |
||||
|
||||
"3B": [ [ "LOWERLEVELLOCATIONS", "LOWER LEVEL LOCATIONS" ] ], |
||||
"6": [ [ "The Village Terrain rules", "<p> The Village Terrain rules" ] ], |
||||
"13": [ [ "U6U7 U8 U9W6W7W8W9V6 V7 V8Y6 Y7Y8Y9 X6X7X8", "" ] ] |
||||
|
||||
}, |
||||
|
||||
"C": { |
||||
|
||||
"1": [ [ "ac tually", "actually" ] ], |
||||
"8": [ [ "rep resents", "represents" ] ], |
||||
"13": [ [ "0o", "0°" ] ], |
||||
"20": [ [ "predetermined", "pre-determined" ] ], |
||||
"21": [ [ "p roneto", "prone to" ] ], |
||||
"26": [ |
||||
[ "Mortarsof76-107mm", "Mortars of 76-107mm" ], |
||||
[ "areexempted", "are exempted" ], |
||||
[ "frommanyof", "from many of" ], |
||||
[ "normalGun", "normal Gun" ], |
||||
[ "thantowed", "than towed" ] |
||||
], |
||||
"32": [ [ "PF counters were removed", "<p> PF counters were removed" ] ], |
||||
"38": [ [ "U. S. Army", "U.S. Army" ] ] |
||||
|
||||
}, |
||||
|
||||
"D": { |
||||
|
||||
"8": [ [ "massproduced", "mass-produced" ] ] |
||||
|
||||
}, |
||||
|
||||
"E": { |
||||
|
||||
"3": [ |
||||
[ "flexibilityis", "flexibility is" ], |
||||
[ "for gottenby", "forgotten by" ] |
||||
], |
||||
"4": [ [ "Th e most", "The most" ] ], |
||||
"11": [ |
||||
[ "Another problem", "<p> Another problem" ], |
||||
[ "A Fire cast", "<p> A Fire cast" ], |
||||
[ "Finally, to add", "<p> Finally, to add" ] |
||||
], |
||||
"14B": [ [ "infantrypulled", "infantry-pulled" ] ], |
||||
"15": [ [ "shallowdraught", "shallow draught" ] ], |
||||
"18": [ |
||||
[ "reallife", "real-life" ], |
||||
[ "the g eneric", "the generic" ] |
||||
], |
||||
"20": { |
||||
"captions": [ [ "E7.51", "LIGHT AA" ], [ "E7.52", "HEAVY AA" ] ], |
||||
"replace": [ |
||||
[ "& 7.52 AA FIRE:", "" ] |
||||
] |
||||
}, |
||||
"24": { |
||||
"captions": [ [ "E9.2", "DRIFT" ], [ "E9.4", "LANDING" ] ], |
||||
"replace": [ |
||||
[ "DRIFT & 9.4 LANDING:", "" ] |
||||
] |
||||
} |
||||
}, |
||||
|
||||
"F": { |
||||
|
||||
"12": [ [ "non- entrenched", "non-entrenched" ] ], |
||||
"19": [ |
||||
[ "Inthewinternight,thenear-freezingtemperaturecauseddewtoform.", "In the winter night, the near-freezing temperature caused dew to form. " ], |
||||
[ "Thenextmorningathickmistoftenformedasthesun evaporateditagain.", "The next morning a thick mist often formed as the sun evaporated it again. " ], |
||||
[ "Thiscouldhappeneveninthesummertimeundertheproperenvironmentalconditions,", "This could happen even in the summertime under the proper environmental conditions, " ], |
||||
[ "butsincethiswasamuchlessfrequentoccurrenceithasbeen ignored.", "but since this was a much less frequent occurrence it has been ignored." ] |
||||
], |
||||
"21": [ |
||||
[ "Playerswillprobablyfinditmoreconvenienttoinstead", "Players will probably find it more convenient to instead" ], |
||||
[ "addathird,different-coloreddietothisTH/IFTDR,", "add a third, different-colored die to this TH/IFT DR, " ], |
||||
[ "usingittodeterminetheDust DRM.", "using it to determine the Dust DRM." ], |
||||
[ "Thefamiliarterm\"subsequentdr\"wasusedintherulebecauseitobviates theneed", "The familiar term \"subsequent dr\" was used in the rule because it obviates the need" ], |
||||
[ "a\"new\"concept", "a \"new\" concept" ], |
||||
[ "thatof rolling athird diesimultaneously", "that of rolling a third die simultaneously" ] |
||||
], |
||||
"22": [ |
||||
[ "theDustcounter\"follows\"thevehicleasit movesfromhex to hex", "the Dust counter \"follows\" the vehicle as it moves from hex to hex" ], |
||||
[ "itexpends", "it expends " ], |
||||
[ "two MPeach timeitdoesso", " two MP each time it does so" ] |
||||
], |
||||
"23": [ |
||||
[ "Anotherwind-relatedaspectoftheNorthAfricanenvironmentisthedesertsandstorm,", "Another wind-related aspect of the North African environment is the desert sandstorm, " ], |
||||
[ "orkhamsininArabic.", "or khamsin in Arabic. " ], |
||||
[ "ChapterFincludesnospecial rulesforitbecause,", "Chapter F includes no special rules for it because, " ], |
||||
[ "withvisibilitycutbythestormtoaslittleasthreeyards,", "with visibility cut by the storm to as little as three yards, " ], |
||||
[ "allactivitiesgenerallywerereducedtoseekingcoverfromthesandblastingwindandchoking dust.", "all activities generally were reduced to seeking cover from the sandblasting wind and choking dust. " ], |
||||
[ "However,thegamedoesnotignorethepossibilityofakhamsin'soccurrence.", "However, the game does not ignore the possibility of a khamsin's occurrence. " ], |
||||
[ "The propercombinationofWeather,EC,WindandGustsinaDYOscenariocancreateits effects,", "The proper combination of Weather, EC, Wind and Gusts in a DYO scenario can create its effects, " ], |
||||
[ "andtheprobabilityofitsoccurrenceisgreatestinascenariosetinspringor summer", "and the probability of its occurrence is greatest in a scenario set in spring or summer" ], |
||||
[ "thetimewhen khamsinsoccurred mostfrequently.", "the time when khamsins occurred most frequently." ] |
||||
], |
||||
"24": [ |
||||
[ "Thisoverlay isused in aHOLLOW LEGIONS scenario.", "This overlay is used in a HOLLOW LEGIONS scenario." ] |
||||
], |
||||
"25": [ |
||||
[ "ThefamousNorthAfricanescarpmentsaresimilarto cliffs,", "The famous North African escarpments are similar to cliffs, " ], |
||||
[ "butwithlesssteep(andveryeroded)slopes.", "but with less steep (and very eroded) slopes. " ], |
||||
[ "Somearesixhundredfeethigh", "Some are six hundred feet high" ], |
||||
[ "thoughgenerallytheirheightsrangefromonehundredtotwohundredfeet.", "though generally their heights range from one hundred to two hundred feet. " ], |
||||
[ "Theirsignificanceinthedesertwarlaymainlyinthattheywerecommandingheights,", "Their significance in the desert war lay mainly in that they were commanding heights, " ] , |
||||
[ "defensivepositionsforinfantry,", "defensive positions for infantry, " ], |
||||
[ "andgreatlyrestrictedvehicularmovementacrossthem", "and greatly restricted vehicular movement across them" ], |
||||
[ "Hencetheywereoftenthesceneofheavyfighting,", "Hence they were often the scene of heavy fighting, " ], |
||||
[ "especiallywherecrossedbya road", "especially where crossed by a road" ] |
||||
] |
||||
}, |
||||
|
||||
"G": { |
||||
|
||||
"4": [ [ "It's also interesting", "<p> It's also interesting" ] ], |
||||
"8": [ [ "miniDC,", "mini-DC," ] ], |
||||
"33": [ [ "closein", "close-in" ] ], |
||||
"45": [ |
||||
[ "Guomindang(akaKuomintang", "Guomindang (aka Kuomintang" ], |
||||
[ "XForce", "X-Force" ], |
||||
[ "The two-tone color", "<p> The two-tone color" ] |
||||
], |
||||
"47" : [ [ "against-allodds", "against-all-odds" ] ], |
||||
"48": [ [ "trained-andequipped", "trained-and-equipped" ] ] |
||||
|
||||
}, |
||||
|
||||
"W": { |
||||
|
||||
"2": [ |
||||
[ "Korean National Defense Constabulary:", "<ul> <li> <em>Korean National Defense Constabulary</em>: " ], |
||||
[ "ROK Army:", "<li> <em>ROK Army</em>: " ], |
||||
[ "Korean Marine Corps:", "<li> <em>Korean Marine Corps</em>: " ], |
||||
[ "United States - Army:", "<li> <em>United States</em> <ul> <li> Army:" ], |
||||
[ "- Army Airborne:", "<li> Army Airborne:" ], |
||||
[ "- Army Rangers:", "<li> Army Rangers:" ], |
||||
[ "- KATUSA:", "<li> KATUSA:" ], |
||||
[ "- Marine Corps:", "<li> Marine Corps:" ], |
||||
[ "British Commonwealth:", "</ul> <li> <em>British Commonwealth</em>: " ], |
||||
[ "- 41 Independent Commando, Royal Marines: 9/50-12/51", "<ul> <li> 41 Independent Commando, Royal Marines: 9/50-12/51 </ul>" ], |
||||
[ "Other United Nations Command:", "<li> <em>Other United Nations Command</em>: " ], |
||||
[ "10/50-7/53", "10/50-7/53 </ul>" ] |
||||
], |
||||
"3": [ |
||||
[ "Korean People's Army:", "<ul> <li> <em>Korean People's Army</em>: " ], |
||||
[ "Communist Guerillas:", "<li> <em>Communist Guerillas</em>: " ], |
||||
[ "Chinese People's Volunteer Army:10/50-7/53", "<li> <em>Chinese People's Volunteer Army</em>: 10/50-7/53 </ul>" ] |
||||
], |
||||
"9": [ [ "T34/85", "T-34/85" ] ], |
||||
"16": [ |
||||
[ "3 1/3 PP", "3⅓ PP" ], |
||||
[ "24-8 HS", "2-4-8 HS" ] |
||||
], |
||||
"18": [ [ "The first unit", "<p> The first unit" ] ], |
||||
"29" : [ [ "RAT KILLERin which", "RAT KILLER in which" ] ], |
||||
"30": [ |
||||
[ "G.M.D in", "G.M.D. in" ], |
||||
[ "as. sumed", "assumed" ] |
||||
], |
||||
"49": [ [ "SUP-PORT", "SUPPORT" ] ], |
||||
"50": [ [ "Speciallytrained", "Specially-trained" ] ] |
||||
} |
||||
|
||||
} |
||||
|
@ -0,0 +1,288 @@ |
||||
{ |
||||
|
||||
"AirSupport": { |
||||
"new_title": "Air Support", |
||||
"old_content": "E7,[BRT:TCG6][ChineseDYO:G18.83][ENEMY: S8.9][cannotbeusedvsanyLocationinFog:E3.313][inRB,German AirSupportisalwaysaStukaM42:SSRRB9][JapaneseDYO: G1.6621][inKGP,NAifMistDensity>Light,Night,orOvercast:SSR KGP3][Napalm:G17.4][NightNA:E7.2][OvercastNA:E3.55][during SeaborneAssault/Evacuation:G14.34][SeaborneAssaultDYO: G14.262][TarawaNavalGunfire: TCG3.3]", |
||||
"new_content": "E7, [BRT: TCG6] [Chinese DYO: G18.83] [ENEMY: S8.9] [cannot be used vs any Location in Fog: E3.313] [in RB, German Air Support is always a Stuka M42: SSR RB9] [Japanese DYO: G1.6621] [in KGP, NA if Mist Density > Light, Night, or Overcast: SSR KGP3] [Napalm: G17.4] [Night NA: E7.2] [Overcast NA: E3.55] [during Seaborne Assault/Evacuation: G14.34] [Seaborne Assault DYO: G14.262] [Tarawa Naval Gunfire: TCG3.3]" |
||||
}, |
||||
|
||||
"Ambush": { |
||||
"old_content": "A11.4[attacksfirstinCC:A11.32][ATTACKERadds+1 drm to Ambush dr in jungle, kunai, or bamboo Location: G.6] [Banks: G8.212][keeping \"?\" during CC: A12.14] [Dummies are eliminated BEFORE the Ambush dr, and do not qualify for the -2 drm: ASOP 8.11B] [Hand-to-Hand CC: J2.31] [Night: E1.77] [Panic Action: S6.213] [Panjis: G9.21] [in Rubble: SSR RB8] [Street Fighting: A11.8] [T-H Heroes are created after Ambush determination: G1.421]", |
||||
"new_content": "A11.4 [attacks first in CC: A11.32] [ATTACKER adds +1 drm to Ambush dr in jungle, kunai, or bamboo Location: G.6] [Banks: G8.212] [keeping \"?\" during CC: A12.14] [Dummies are eliminated BEFORE the Ambush dr, and do not qualify for the -2 drm: ASOP 8.11B] [Hand-to-Hand CC: J2.31] [Night: E1.77] [Panic Action: S6.213] [Panjis: G9.21] [in Rubble: SSR RB8] [Street Fighting: A11.8] [T-H Heroes are created after Ambush determination: G1.421]" |
||||
}, |
||||
|
||||
"American": { |
||||
"old_content": "A25.3[EarlyArmy:G17.2][OBAAccuracy:C1.3] [Paramarine:G17.111][Raider:G17.111][RifleCompany:S18.5][U.S. Marine Corps: G17.1]", |
||||
"new_content": "A25.3 [Early Army: G17.2] [OBA Accuracy: C1.3] [Paramarine: G17.111] [Raider: G17.111] [Rifle Company: S18.5] [U.S. Marine Corps: G17.1]" |
||||
}, |
||||
|
||||
"AmmoPP Reduction": { |
||||
"new_title": "Ammo PP Reduction" |
||||
}, |
||||
|
||||
"APCR": { |
||||
"new_title": "APCR/APDS", |
||||
"old_content": "(Armor Piercing Composite Rigid)/", |
||||
"new_content": "(Armor Piercing Composite Rigid/Discarding Sabot): C8.1-.2 [EXC German 28LL, 40LL: C4.3, C7.32] [vs Guns: C11.52] [HE Equivalency: C8.31] [TH# Modification: C4.3] [To Kill Table: C7.32] [Residual FP NA: C8.31]" |
||||
}, |
||||
"APDS": { |
||||
"old_content": "(Armor Piercing Discarding Sabot): C8.1-.2 [EXC German 28LL, 40LL: C4.3, C7.32] [vs Guns: C11.52] [HE Equivalency: C8.31] [TH# Modification: C4.3] [To Kill Table: C7.32] [Residual FP NA: C8.31]", |
||||
"new_content": null |
||||
}, |
||||
|
||||
"Broken Units": { |
||||
"replace": [ |
||||
[ "[Pin NA: A7.8(EXCInterdiction and Huts)]", "[Pin NA (EXC Interdiction and Huts): A7.8]" ] |
||||
] |
||||
}, |
||||
|
||||
"Cellars": { |
||||
"replace": [ |
||||
[ "RBCellars", "RB Cellars" ] |
||||
] |
||||
}, |
||||
|
||||
"DC": { |
||||
"replace": [ |
||||
[ "[Thrown from: (Halftrack: D6.63) (Sidecar: D15.6)]", "[Thrown from Halftrack: D6.63] [Thrown from Sidecar: D15.6]" ] |
||||
] |
||||
}, |
||||
|
||||
"Direct Fire": { |
||||
"old_content": "(Any fireattackrequiringaLOSfromthe firerwhichdoesnotuseIndirectFire):C.1,C9.1[InterveningUnits:A6.6][LC: G12.61-.62, G12.671]", |
||||
"new_content": "(Any fire attack requiring a LOS from the firer which does not use Indirect Fire): C.1, C9.1 [Intervening Units: A6.6] [LC: G12.61-.62, G12.671]" |
||||
}, |
||||
|
||||
"Dogfight": { |
||||
"old_content": "(AerialCombat):E7.22", |
||||
"new_content": "(Aerial Combat): E7.22" |
||||
}, |
||||
|
||||
"Elite": { |
||||
"replace": [ |
||||
[ "[German (Africa, 1942-43: F.6) (prior to 1944: A25.1) (SS: A25.11)]", "[German (Africa, 1942-43): F.6] [German (prior to 1944): A25.1] [German (SS): A25.11]" ] |
||||
] |
||||
}, |
||||
|
||||
"End of Scenario": { |
||||
"replace": [ |
||||
[ "[in ABtF: R9.4 CG4]", "[in ABtF: R9.4, CG4]" ], |
||||
[ "[in KGP: P8.4 CG23]", "[in KGP: P8.4, CG23]" ], |
||||
[ "[in PB: Q9.4 CG19 (-1 drm for any Night Scenario and +1 drm for Day II scenario: Turn Record Track)]", "[in PB (-1 drm for any Night Scenario and +1 drm for Day II scenario: Turn Record Track): Q9.4, CG19]" ], |
||||
[ "[in RB: O11.4 CG4]", "[in RB: O11.4, CG4]" ] |
||||
] |
||||
}, |
||||
|
||||
"EX": { |
||||
"old_content": "ExampleEXC: Exception", |
||||
"new_content": "Example", |
||||
"_comment_": "The code manually inserts an entry for EXC: Exception" |
||||
}, |
||||
|
||||
"Fortification": { |
||||
"replace": [ |
||||
[ "[in BRT: SSR1 (BRT Sand: T3.2) (NA in Betio Piers: T9.2)]", "[in BRT: SSR1 (BRT Sand): T3.2] [in BRT: SSR1 (NA in Betio Piers): T9.2]" ] |
||||
] |
||||
}, |
||||
|
||||
"Hazardous Movement": { |
||||
"replace": [ |
||||
[ "[Clearance of: (Debris: O1.5) (Fire: B24.72) (Jungle Path: G2.7) (Roadblock: B24.76) (Rubble: B24.71)]", "[Clearance of Debris: O1.5] [Clearance of Fire: B24.72] [Clearance of Jungle Path: G2.7] [Clearance of Roadblock: B24.76] [Clearance of Rubble: B24.71]" ] |
||||
] |
||||
}, |
||||
|
||||
"Hedges": { |
||||
"replace": [ |
||||
[ "[TEM NA for Ground Support: E7.4; for PRC: B9.3]", "[TEM NA for Ground Support: E7.4] [TEM NA for PRC: B9.3]" ] |
||||
] |
||||
}, |
||||
|
||||
"Immobilization": { |
||||
"replace": [ |
||||
[ "[LC: G12.602; LC Passengers NA: G12.13]", "[LC: G12.602] [LC Passengers NA: G12.13]" ], |
||||
[ "[TC: D5.5; TC in BRT: SSR12]", "[TC: D5.5] [TC in BRT: SSR12]" ] |
||||
] |
||||
}, |
||||
|
||||
"Jungle": { |
||||
"replace": [ |
||||
[ "G.2G.6", "G.2-G.6" ] |
||||
] |
||||
}, |
||||
|
||||
"Kunai": { |
||||
"replace": [ |
||||
[ "G.2G.6", "G.2-G.6" ] |
||||
] |
||||
}, |
||||
|
||||
"Leadership": { |
||||
"replace": [ |
||||
[ "[Battle Hardening: A15.3, Finns: A25.71, Japanese: G1.41]", "[Battle Hardening: A15.3] [Battle Hardening (Finns): A25.71] [Battle Hardening (Japanese): G1.41]" ] |
||||
] |
||||
}, |
||||
|
||||
"MG": { |
||||
"replace": [ |
||||
[ "[Vehicular: (see Vehicular MG: D3.5-.54)]", "[Vehicular MG: D3.5-.54]" ], |
||||
[ "[Aerial: E7.41, vs AFV: C7.22]", "[Aerial: E7.41] [Aerial (vs AFV): C7.22]" ] |
||||
] |
||||
}, |
||||
|
||||
"Minefield": { |
||||
"replace": [ |
||||
[ "[fully-tracked A FV T B: B 8.61]", "[fully-tracked AFV TB: B8.61]" ] |
||||
] |
||||
}, |
||||
|
||||
"Morale": { |
||||
"replace": [ |
||||
[ "[Gain:", "Gain:" ], |
||||
[ "FFE]", "FFE" ] |
||||
] |
||||
}, |
||||
|
||||
"Movement, Vehicle": { |
||||
"replace": [ |
||||
[ "(see Amphibians: D16)", "(see Amphibians)" ] |
||||
] |
||||
}, |
||||
|
||||
"OBA": { |
||||
"replace": [ |
||||
[ "USOrdnance", "US Ordnance" ] |
||||
] |
||||
}, |
||||
|
||||
"Optional Rules": { |
||||
"replace": [ |
||||
[ "A12.16 (see footnote A18)", "A12.16, footnote A18" ] |
||||
] |
||||
}, |
||||
|
||||
"PAATC": { |
||||
"old_content": "(Pre-AFVAdvance/AttackTaskCheck;NAtoberserk/Fanatic/Japanese/SMC): A11.6, G1.62 [vs Armored Cupola: O.7] [DC Placement: A23.3] [ENEMY Advance into CC/Melee: S11.4] [1PAATC: Chinese, NonElite Italians, Inexperienced, Allied/Axis Minors] [OVR vs \"?\": A12.41] [CC Reaction Fire: D7.21]", |
||||
"new_content": "(Pre-AFV Advance/Attack Task Check; NA to berserk/Fanatic/Japanese/SMC): A11.6, G1.62 [vs Armored Cupola: O.7] [DC Placement: A23.3] [ENEMY Advance into CC/Melee: S11.4] [1PAATC: Chinese, Non-Elite Italians, Inexperienced, Allied/Axis Minors] [OVR vs \"?\": A12.41] [CC Reaction Fire: D7.21]" |
||||
}, |
||||
|
||||
"PBF": { |
||||
"replace": [ |
||||
[ "A11.l", "A11.1" ] |
||||
] |
||||
}, |
||||
|
||||
"Pillbox": { |
||||
"replace": [ |
||||
[ "[Control: B30.91; in BRT: TCG15]", "[Control: B30.91] [Control (in BRT): TCG15]" ] |
||||
] |
||||
}, |
||||
|
||||
"Pin": { |
||||
"replace": [ |
||||
[ "D6.23.24", "D6.23-.24" ], |
||||
[ "[Fire Lanes: A9.22; Cancellation: A9.223]", "[Fire Lanes: A9.22] [Fire Lanes (Cancellation): A9.223]" ] |
||||
] |
||||
}, |
||||
|
||||
"PRC": { |
||||
"replace": [ |
||||
[ "[disembarking in Panji: G9.423; embarking: G9.51]", "[disembarking in Panji: G9.423] [embarking in Panji: G9.51]" ] |
||||
] |
||||
}, |
||||
|
||||
"RMG": { |
||||
"replace": [ |
||||
[ "D1.81 (hull) & D1.82 (turret)", "Hull: D1.81; Turret: D1.82" ] |
||||
] |
||||
}, |
||||
|
||||
"Radio": { |
||||
"replace": [ |
||||
[ "[in KGP: P8.4 CG15]", "[in KGP: P8.4, CG15]" ], |
||||
[ "[in RB: O11.4 CG6]", "[in RB: O11.4, CG6]" ] |
||||
] |
||||
}, |
||||
|
||||
"Range": { |
||||
"replace": [ |
||||
[ "see Firing Within Hex", "A7.21" ] |
||||
] |
||||
}, |
||||
|
||||
"Roadblock": { |
||||
"replace": [ |
||||
[ "[TEM NA for Ground Support: E7.4; for PRC: B9.3]", "[TEM NA for Ground Support: E7.4] [TEM NA for PRC: B9.3]" ] |
||||
] |
||||
}, |
||||
|
||||
"Rubble": { |
||||
"replace": [ |
||||
[ "; Stone Blaze:", "] [RePh, Stone Blaze:" ] |
||||
] |
||||
}, |
||||
|
||||
"Scrounging": { |
||||
"replace": [ |
||||
[ "RBCG7", "RB CG7" ] |
||||
] |
||||
}, |
||||
|
||||
"Stacking Limits": { |
||||
"replace": [ |
||||
[ "[Inspecting: see Right of Inspection: (Before Play: A2.9) (During Play: A12.16) (Pillboxes: B30.7)]", "[Inspecting: see Right of Inspection (Before Play): A2.9] [Inspecting: see Right of Inspection (During Play): A12.16] [Inspecting: see Right of Inspection (Pillboxes): B30.7]" ] |
||||
] |
||||
}, |
||||
|
||||
"Stall": { |
||||
"old_content": "(Rules are givenin a ChapterH Vehicle Note ifa nationality's AFV are subject to Stall; for example, German Multi-Applicable Vehicle Note H): [Platoon Movement: D14.22]", |
||||
"new_content": "(Rules are given in a Chapter H Vehicle Note if a nationality's AFV are subject to Stall; for example, German Multi-Applicable Vehicle Note H): [Platoon Movement: D14.22]" |
||||
}, |
||||
|
||||
"Target Size": { |
||||
"replace": [ |
||||
[ "[Vehicular: D1.7, Concealment: D1.76]", "[Vehicular: D1.7] [Vehicular (Concealment): D1.76]" ] |
||||
] |
||||
}, |
||||
|
||||
"Uncon irmed Kill": { |
||||
"new_title": "Unconfirmed Kill" |
||||
}, |
||||
|
||||
"Unarmored Vehicles": { |
||||
"replace": [ |
||||
[ "[AFV (vs A-P mines: B28.42) (vs A-T mines: B28.52)]", "[AFV (vs A-P mines): B28.42] [AFV (vs A-T mines): B28.52]" ] |
||||
] |
||||
}, |
||||
|
||||
"Unit": { |
||||
"replace": [ |
||||
[ "[but not horses],", "(but not horses)," ] |
||||
] |
||||
}, |
||||
|
||||
"Voluntary Break": { |
||||
"replace": [ |
||||
[ "[Japanese: G1.13, SMC NA: G1.4]", "[Japanese: G1.13] [Japanese (SMC NA): G1.4]" ] |
||||
] |
||||
}, |
||||
|
||||
"Walls": { |
||||
"replace": [ |
||||
[ "[Bypass LOS across: (Infantry: A4.34) (Vehicle: D2.37)]", "[Bypass LOS across Infantry: A4.34] [Bypass LOS across Vehicle: D2.37]" ], |
||||
[ "; for PRC", "] [TEM NA for PRC" ] |
||||
] |
||||
}, |
||||
|
||||
"Winter Camouflage": { |
||||
"replace": [ |
||||
[ "OBA Observer: C 1.6", "OBA Observer: C1.6" ] |
||||
] |
||||
}, |
||||
|
||||
"Wreck Blaze": { |
||||
"replace": [ |
||||
[ "[Creation: (AFV C7.6) (Unarmored: A7.308)]", "[Creation (AFV): C7.6] [Creation (Unarmored): A7.308]" ] |
||||
] |
||||
} |
||||
|
||||
} |
@ -0,0 +1,40 @@ |
||||
{ |
||||
|
||||
"chapters": [ "H", "O", "P", "Q", "R", "S", "T" ], |
||||
|
||||
"strings": [ |
||||
"see appropriate Vehicle Notes", |
||||
"Chapter H", |
||||
"derived by cross-indexing Target Type & Range on To Hit Table", |
||||
"Basic TH# plus any modifications for Gun and Ammo Types", |
||||
"Number beneath Gun Caliber & Length on applicable To Kill Table", |
||||
"Basic TK# plus applicable To Kill Modifications (Cases A-D)", |
||||
"The Modified TK# minus the AF of the Target Facing hit", |
||||
"FP-Range", |
||||
"Morale-Leadership", |
||||
"FP-Range-Morale", |
||||
"abbr. for Bow Flamethrower", |
||||
"HE NA", "AP NA", |
||||
"Chinese, Non-Elite Italians, Inexperienced, Allied/Axis Minors" |
||||
], |
||||
|
||||
"regexes": [ |
||||
"^ASOP .+$", |
||||
"^(RB )?OCG[0-9.]+$", |
||||
"^PCG[0-9.]+[a-e]?$", |
||||
"^(PB )?QCG[0-9.]+$", |
||||
"^RCG[0-9.]+$", |
||||
"^TCG[0-9.]+[a-e]?$", |
||||
"^SSR[0-9.]+$", |
||||
"^(RB CG )?SSR .+$", |
||||
"^(RB )?CG[0-9.]+$", |
||||
"^(SSR |SSRs )?(ABtF|KGP|PB|RB|BRT)[0-9.]+$", |
||||
"Chapter [A-Z] [Ii]ntroduction", |
||||
"Chapter [A-Z] [Dd]ivider", |
||||
"^footnote [A-Z]\\d+", |
||||
"^.+ [Oo]verlay$", |
||||
" Multi-Applicable Note ", |
||||
" (Vehicle|Ordnance) Note " |
||||
] |
||||
|
||||
} |
@ -0,0 +1,400 @@ |
||||
{ |
||||
|
||||
"A1": { |
||||
"A.10LEADERSHIP DRM ():": { |
||||
"new_ruleid": "A.10", |
||||
"new_caption": "LEADERSHIP DRM (△)" |
||||
} |
||||
}, |
||||
|
||||
"A3": { |
||||
"23": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"A5": { |
||||
"3.BASIC SEQUENCE OF PLAY": { |
||||
"new_ruleid": "A3", |
||||
"new_caption": "BASIC SEQUENCE OF PLAY" |
||||
}, |
||||
"TURN RECORD CHART": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"A21": { |
||||
"9.223CANCELLATION:": { |
||||
"new_ruleid": "A9.223", |
||||
"new_caption": "CANCELLATION" |
||||
} |
||||
}, |
||||
|
||||
"A28": { |
||||
"1 -": { "new_ruleid": null } |
||||
}, |
||||
"A29": { |
||||
"1 -": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"A30": { |
||||
"11.CLOSE COMBAT (CC)": { |
||||
"new_ruleid": "A11", |
||||
"new_caption": "CLOSE COMBAT (CC)" |
||||
} |
||||
}, |
||||
|
||||
"A31": { |
||||
"11.2WITHDRAWALFROMMELEE:": { |
||||
"new_ruleid": "A11.2", |
||||
"new_caption": "WITHDRAWAL FROM MELEE" |
||||
} |
||||
}, |
||||
|
||||
"A34": { |
||||
"12.CONCEALMENT": { |
||||
"new_ruleid": "A12", |
||||
"new_caption": "CONCEALMENT" |
||||
} |
||||
}, |
||||
|
||||
"A37": { |
||||
"5 12.2 CONCEALED / COUNTERS:": { |
||||
"new_ruleid": "A12.2", |
||||
"new_caption": "CONCEALED ⅝\" COUNTERS" |
||||
} |
||||
}, |
||||
|
||||
"A38": { |
||||
"13.CAVALRY": { |
||||
"new_ruleid": "A13", |
||||
"new_caption": "CAVALRY" |
||||
} |
||||
}, |
||||
|
||||
"A39": { |
||||
"6MF 1MF2MF6FP": { "new_ruleid": null }, |
||||
"4FP6FP4FP": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"A43": { |
||||
"A18.2 LEADER CREATION TABLE*LEADER CREATION drm": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"A46": { |
||||
"21.CAPTURED EQUIPMENT": { |
||||
"new_ruleid": "A21", |
||||
"new_caption": "CAPTURED EQUIPMENT" |
||||
} |
||||
}, |
||||
|
||||
"A50": { |
||||
"C24.5 STRENGTH:": { |
||||
"new_ruleid": "A24.5", |
||||
"new_caption": "STRENGTH" |
||||
} |
||||
}, |
||||
|
||||
"A51": { |
||||
"30 25.NATIONALITY DISTINCTIONS": { |
||||
"new_ruleid": "A25", |
||||
"new_caption": "NATIONALITY DISTINCTIONS" |
||||
} |
||||
}, |
||||
|
||||
"A54": { |
||||
"25.53 FREEFRENCH:": { |
||||
"new_ruleid": "A25.53", |
||||
"new_caption": "FREE FRENCH" |
||||
} |
||||
}, |
||||
|
||||
"A55": { |
||||
"26.VICTORYCONDITIONS": { |
||||
"new_ruleid": "A26", |
||||
"new_caption": "VICTORY CONDITIONS" |
||||
} |
||||
}, |
||||
|
||||
"B4": { |
||||
"6.BRIDGES": { |
||||
"new_ruleid": "B6", |
||||
"new_caption": "BRIDGES" |
||||
} |
||||
}, |
||||
|
||||
"B6": { |
||||
"8.45BROKEN & BERSERK:": { |
||||
"new_ruleid": "B8.45", |
||||
"new_caption": "BROKEN & BERSERK" |
||||
} |
||||
}, |
||||
|
||||
"B15": { |
||||
"11.CLIFFS": { |
||||
"new_ruleid": "B11", |
||||
"new_caption": "CLIFFS" |
||||
} |
||||
}, |
||||
|
||||
"B17": { |
||||
"13.8 PINEWOODS:": { |
||||
"new_ruleid": "B13.8", |
||||
"new_caption": "PINE WOODS" |
||||
}, |
||||
"13.81 OBSTACLEHEIGHT:": { |
||||
"new_ruleid": "B13.81", |
||||
"new_caption": "OBSTACLE HEIGHT" |
||||
}, |
||||
"13.82 MFCOST:": { |
||||
"new_ruleid": "B13.82", |
||||
"new_caption": "MF COST" |
||||
} |
||||
}, |
||||
|
||||
"B19": { |
||||
"17.CRAG": { |
||||
"new_ruleid": "17", |
||||
"new_caption": "CRAG" |
||||
} |
||||
}, |
||||
|
||||
"B22": { |
||||
"2 2": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"B24": { |
||||
"23.BUILDINGS": { |
||||
"new_ruleid": "B23", |
||||
"new_caption": "BUILDINGS" |
||||
} |
||||
}, |
||||
|
||||
"B33": { |
||||
"0 25.64 WIND DIRECTION:": { |
||||
"new_ruleid": "B25.64", |
||||
"new_caption": "WIND DIRECTION" |
||||
} |
||||
}, |
||||
|
||||
"B35": { |
||||
"53": { "new_ruleid": null }, |
||||
"1 2": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"C7": { |
||||
"2.3 360 MOUNT:": { |
||||
"new_ruleid": "C2.3", |
||||
"new_caption": "360° MOUNT:" |
||||
} |
||||
}, |
||||
|
||||
"C11": { |
||||
"5.31 CASE C; BOUNDING FIRST FIRER, RESTRICTED AIM: 1": { |
||||
"new_ruleid": "C5.31", |
||||
"new_caption": "CASE C<sup>1</sup>; BOUNDING FIRST FIRER, RESTRICTED AIM" |
||||
}, |
||||
"5.32 CASE C; BOUNDING FIRST FIRER, LIMITED AIM:": { |
||||
"new_ruleid": "C5.32", |
||||
"new_caption": "CASE C<sup>2</sup>; BOUNDING FIRST FIRER, LIMITED AIM" |
||||
}, |
||||
"5.34 CASE C; LATW:": { |
||||
"new_ruleid": "C5.34", |
||||
"new_caption": "CASE C<sup>3</sup>; LATW" |
||||
}, |
||||
"5.35 CASE C; MOTION FIRER:": { |
||||
"new_ruleid": "C5.35", |
||||
"new_caption": "CASE C<sup>4</sup>; MOTION FIRER" |
||||
} |
||||
}, |
||||
|
||||
"C12": { |
||||
"6.11 CASE J; RESTRICTED AIM:": { |
||||
"new_ruleid": "C6.11", |
||||
"new_caption": "CASE J<sup>1</sup>; RESTRICTED AIM" |
||||
}, |
||||
"6.12 CASE J; LIMITED AIM:": { |
||||
"new_ruleid": "C6.12", |
||||
"new_caption": "CASE J<sup>2</sup>; LIMITED AIM" |
||||
}, |
||||
"6.13 CASE J; FFNAM:": { |
||||
"new_ruleid": "C6.13", |
||||
"new_caption": "CASE J<sup>3</sup>; FFNAM" |
||||
}, |
||||
"6.14 CASE J; FFMO:": { |
||||
"new_ruleid": "C6.14", |
||||
"new_caption": "CASE J<sup>4</sup>; FFMO" |
||||
} |
||||
}, |
||||
|
||||
"C13": { |
||||
"21--": { "new_ruleid": null }, |
||||
"12": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"C15": { |
||||
"7.7 AFV DESTRU": { "new_ruleid": null }, |
||||
"1KIA": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"C16": { |
||||
"1819 8.11APCR(A)/APDS (D):": { |
||||
"new_ruleid": "C8.11", |
||||
"new_caption": "APCR (A)/APDS (D)" |
||||
} |
||||
}, |
||||
|
||||
"C20": { |
||||
"10.3 MANHANDLING DRM:": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"C21": { |
||||
"1 GUN DESTRUCTION TABLE": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"D4": { |
||||
"46": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"D5": { |
||||
"13": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"D6": { |
||||
"10 MP": { "new_ruleid": null }, |
||||
"2 6": { "new_ruleid": null }, |
||||
"1 21": { "new_ruleid": null }, |
||||
"1 /": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"D9": { |
||||
"56,": { "new_ruleid": null }, |
||||
"1 2": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"D10": { |
||||
"3.71 LOW AMMO B# (B # ):": { |
||||
"new_ruleid": "D3.71", |
||||
"new_caption": "LOW AMMO B#" |
||||
} |
||||
}, |
||||
|
||||
"D11": { |
||||
"3 1": { "new_ruleid": null }, |
||||
"1 1 3": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"D17": { |
||||
"1 9 12.5 2": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"D22": { |
||||
"5 1 1(4)1 1 1(4)(1)(1) 1": { "new_ruleid": null }, |
||||
"1 1": { "new_ruleid": null }, |
||||
"1(4)": { "new_ruleid": null, "instances": 4 }, |
||||
"1(4) 5": { "new_ruleid": null }, |
||||
"1 1 1 (1) 1": { "new_ruleid": null }, |
||||
"1 1(4)": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"E5": { |
||||
"1.": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"E6": { |
||||
"2.": { "new_ruleid": null }, |
||||
"3.": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"E7": { |
||||
"E3. DYO TEMPERATE WEATHER CHART": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"E14": { |
||||
"18 7.AIR SUPPORT Fighter-Bomber/Stuka Counter example": { |
||||
"new_ruleid": "E7", |
||||
"new_caption": "AIR SUPPORT" |
||||
} |
||||
}, |
||||
|
||||
"E24": { |
||||
"1)": { "new_ruleid": null }, |
||||
"2)": { "new_ruleid": null }, |
||||
"3)": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"E27": { |
||||
"1)": { "new_ruleid": null, "instances": 2 }, |
||||
"2)": { "new_ruleid": null, "instances": 2 }, |
||||
"3)": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"F18": { |
||||
"D3": { "new_ruleid": null }, |
||||
"W1": { "new_ruleid": null }, |
||||
"H4": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"G30": { |
||||
"1 ION TABLE": { "new_ruleid": null }, |
||||
"2 :": { "new_ruleid": null }, |
||||
"3 :": { "new_ruleid": null }, |
||||
"4 :": { "new_ruleid": null }, |
||||
"5 :": { "new_ruleid": null }, |
||||
"6 :": { "new_ruleid": null }, |
||||
"8 :": { "new_ruleid": null }, |
||||
"9 :": { "new_ruleid": null }, |
||||
"10 :": { "new_ruleid": null }, |
||||
"10 Armored": { "new_ruleid": null }, |
||||
"11 :": { "new_ruleid": null }, |
||||
"11": { "new_ruleid": null, "instances": 4 } |
||||
}, |
||||
|
||||
"G34": { |
||||
"13.2BEACHELEVATION&SLOPE:": { |
||||
"new_ruleid": "G13.2", |
||||
"new_caption": "BEACH ELEVATION & SLOPE" |
||||
} |
||||
}, |
||||
|
||||
"G42": { |
||||
"1133": { "new_ruleid": null, "instances": 10 }, |
||||
"11233": { "new_ruleid": null }, |
||||
"10": { "new_ruleid": null }, |
||||
"11": { "new_ruleid": null }, |
||||
"12": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"G45": { |
||||
"1 U.S.M.C. DEFENSEBN.SWALLOTMENTCHART": { "new_ruleid": null }, |
||||
"1 U.S.M.C. PARA/RAIDERSQUADSWALLOTMENTCHART": { "new_ruleid": null }, |
||||
"1 U.S.M.C. RIFLE/BARSQUADSWALLOTMENTCHART": { "new_ruleid": null }, |
||||
"1 U.S.M.C. OBA AVAILABILITY CHART": { "new_ruleid": null }, |
||||
"11/42-11/43YEAR8-10/4210/436/447-12/441945 DR: 2": { "new_ruleid": null }, |
||||
"234356": { "new_ruleid": null }, |
||||
"10": { "new_ruleid": null }, |
||||
"11": { "new_ruleid": null }, |
||||
"12": { "new_ruleid": null }, |
||||
"55": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"J1": { |
||||
"1.MINIATURES:": { "new_ruleid": null } |
||||
}, |
||||
|
||||
"W4": { |
||||
"!, 1.37 FORTIFICATIONS:": { |
||||
"new_ruleid": "W1.37", |
||||
"new_caption": "FORTIFICATIONS:" |
||||
} |
||||
}, |
||||
|
||||
"W5": { |
||||
"17": { "new_ruleid": null }, |
||||
"18 3.2 REPUBLIC OF KOREA ARMY (ROKA):": { |
||||
"new_ruleid": "W3.2", |
||||
"new_caption": "REPUBLIC OF KOREA ARMY (ROKA)" |
||||
} |
||||
}, |
||||
|
||||
"W6": { |
||||
"27": { "new_ruleid": null } |
||||
} |
||||
|
||||
} |
@ -0,0 +1,383 @@ |
||||
#!/usr/bin/env python3 |
||||
""" Extract the index from the MMP eASLRB. """ |
||||
|
||||
import os |
||||
import json |
||||
import re |
||||
|
||||
import click |
||||
from pdfminer.layout import LTChar |
||||
|
||||
from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr |
||||
from asl_rulebook2.pdf import PdfDoc, PageIterator, PageElemIterator |
||||
from asl_rulebook2.utils import parse_page_numbers, fixup_text, extract_parens_content, jsonval |
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
# Default extraction parameters (can be overridden via --arg key=val).
# - "pages": which pages of the PDF contain the index
# - "index_vp_*": viewport (in PDF coordinates) that clips out headers/footers
# - "first_title"/"last_title": the first and last index entries we expect to see
_DEFAULT_ARGS = {
    "pages": "10-41",
    "index_vp_left": 0, "index_vp_right": 565, "index_vp_top": 715, "index_vp_bottom": 20, # viewport
    "first_title": "a", "last_title": "X#", # first/last index entries
}
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
class ExtractIndex( ExtractBase ):
    """Extract the index from the MMP eASLRB.

    The extraction works by walking the characters on each index page: bold text at the
    start of a line marks the beginning of a new index entry (its title), and the following
    non-bold text is collected as that entry's content. The raw content is then parsed into
    a structured form (see-also's, sub-titles, ruleid's, ruleref's), applying data-driven
    fixups from data/index-fixups.json along the way.
    """

    def __init__( self, args, log=None ):
        """Initialize the extractor.

        args: dict of extraction parameters (merged over _DEFAULT_ARGS by the base class).
        log: optional callable( msg_type, msg ) for progress/warning messages.
        """
        super().__init__( args, _DEFAULT_ARGS, log )
        # nb: stays None until the first real index entry has been seen (see _save_index_entry())
        self._index_entries = None
        # prepare to fixup problems in the index content
        fname2 = os.path.join( os.path.dirname(__file__), "data/index-fixups.json" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            self._fixups = json.load( fp )

    def extract_index( self, pdf ):
        """Extract the index from the MMP eASLRB.

        pdf: an open PdfDoc. Populates self._index_entries with structured index entries.
        Raises RuntimeError if the configured first title was never found.
        """

        # initialize
        page_nos = parse_page_numbers( self._args["pages"] )
        curr_title = curr_content = None

        # process each page in the index
        for page_no, page, lt_page in PageIterator( pdf ):

            # nb: pages are delivered in order, so once we're past the last index page, we can stop
            if page_no > max( page_nos ):
                break
            if page_no not in page_nos:
                self._log_msg( "progress", "- Skipping page {}.", page_no )
                continue
            self._log_msg( "progress", "- Processing page {}...", page_no )

            # process each element on the page
            # nb: a large y0 sentinel guarantees the first element looks like a new line
            self._prev_y0 = 99999
            elem_filter = lambda e: isinstance( e, LTChar )
            for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter ):

                # check if we should ignore this element
                if not self._in_viewport( elem, "index" ):
                    continue
                if self._is_ignore( elem ):
                    continue

                # NOTE: We identify the start of a new index entry by bold text at the start of a line.
                # We then collect the remaining bold text as the index entry's title, until we see some
                # non-bold text. This is collected as the index entry's content, until we see the start
                # of the next index entry.

                # figure out what we've got
                if self._is_bold( elem ):
                    if curr_content is not None:
                        # we've found the start of a new index entry
                        if curr_title:
                            # save the index entry we've just finished collecting
                            self._save_index_entry( curr_title, curr_content )
                            if curr_title == self._args["last_title"]:
                                curr_title = curr_content = None
                                break # nb: that was the last one - we're all done
                        curr_title = curr_content = None
                    if curr_title is None:
                        # start collecting the title
                        curr_title = elem.get_text()
                    else:
                        # continue collecting the title
                        curr_title += elem.get_text()
                else:
                    if curr_content is None:
                        # start collecting the content text
                        curr_content = elem.get_text()
                    else:
                        # continue collecting the content text
                        # nb: a drop in y0 means we've wrapped onto a new line of text
                        if elem.y0 - self._prev_y0 < -1 and curr_content.endswith( "-" ):
                            # join up hyphenated words
                            curr_content = curr_content[:-1]
                        curr_content += elem.get_text()

                # loop back to process the next element
                self._prev_y0 = elem.y0

        # add the last index entry (if it hasn't already been done)
        if curr_title:
            self._save_index_entry( curr_title, curr_content )

        # check for unused fixups
        # nb: fixups are pop()'ed as they are consumed, so anything left over was never matched
        if self._fixups:
            self._log_msg( "warning", "Unused fixups: {}", self._fixups )

        # process the content for each index entry
        if not self._index_entries:
            raise RuntimeError( "Didn't find the first title (\"{}\").".format( self._args["first_title"] ) )
        self._process_content()

    def _save_index_entry( self, title, content ):
        """Save a parsed index entry.

        title/content: the raw bold/non-bold text collected for the entry.
        """

        # check if we've started parsing index entries
        # NOTE: There is some bold text at the start of the index, which we parse as an index title,
        # so we don't save anything until we've actually seen the first index entry.
        if self._index_entries is None:
            if title != self._args["first_title"]:
                return
            self._index_entries = []

        # initialize
        title, content = title.strip(), content.strip()
        if content.startswith( ":" ):
            content = content[1:].strip() # nb: this comes after the title, but we don't need it

        # save the new index entry
        if title == "bold":
            # FUDGE! Some entries have "bold" in their content, using a bold font :-/, which we detect
            # as the start of a new entry. We fix that up here.
            self._index_entries[-1]["content"] = "{} bold {}".format(
                self._index_entries[-1]["content"], fixup_text(content)
            )
        elif title == "C" and self._index_entries[-1]["title"] == "FFE":
            # FUDGE! The colon in the title for "FFE:C" is non-bold, so we parse this as two separate
            # index titles ("FFE" and "C") :-/ We can't fix this up in the normal way, since there is
            # also a real "FFE" entry, so we do it in the code here.
            self._index_entries[-1].update( {
                "title": "FFE:C", "content": fixup_text(content)
            } )
        else:
            # save the new index entry
            index_entry = self._make_index_entry( title, content )
            if index_entry:
                self._index_entries.append( index_entry )
            # FUDGE! EX/EXC are mis-parsed as a single index entry - we correct that in the fixups, and here.
            if title == "EX":
                self._index_entries.append( self._make_index_entry( "EXC", "Exception" ) )

    def _make_index_entry( self, title, content ):
        """Create a new index entry.

        Applies any fixups configured for the title, and returns a dict with keys
        "title", "content" and "raw_content", or None if the fixup says to drop the entry.
        """

        # initialize
        orig_content = content
        title = fixup_text( title )
        if title.endswith( ":" ):
            title = title[:-1]

        # check for any fixups
        # nb: pop() so that leftover (i.e. unmatched) fixups can be reported later
        fixup = self._fixups.pop( title, None )
        if fixup:
            # replace the title
            title = fixup.get( "new_title", title )
            # do any search-replace's
            for sr in fixup.get( "replace", [] ):
                new_content = content.replace( sr[0], sr[1] )
                if new_content == content:
                    self._log_msg( "warning", "Content fixup had no effect for \"{}\": {}", title, sr[0] )
                else:
                    content = new_content
            # replace the content
            # nb: "old_content" acts as a guard - the fixup is only applied if the content matches
            old_content = fixup.get( "old_content" )
            if old_content:
                if fixup_text( content ) != old_content:
                    self._log_msg( "warning", "Unexpected content for \"{}\" - skipping fixup.", title )
                else:
                    new_content = fixup.get( "new_content" )
                    if not new_content:
                        return None
                    content = new_content

        # FUDGE! There are two "Entry" index entries, but one of them should be "Entry (Offboard)" (the parsing code
        # is actually correct, since the "(Offboard)" is not bold). We can't really fix this via the usual data-driven
        # fixups, so we fix it in the code here.
        if title == "Entry" and content.startswith( "(Offboard): " ):
            title += " (Offboard)"
            content = content[12:] # nb: drops the "(Offboard): " prefix

        return {
            "title": title,
            "content": fixup_text( content ),
            "raw_content": orig_content
        }

    def _process_content( self ):
        """Extract information out of the index entries into a structured form.

        For each entry, pulls "see_also", "subtitle", "ruleids" and "rulerefs" out of the
        raw content string, leaving any remaining text in "content" (or removing it if empty).
        """

        for index_entry in self._index_entries:

            # initialize
            content = index_entry[ "content" ]

            # extract any "see also"
            mo = re.search( r"\(see (also )?(.+?)\):?", content )
            if mo:
                see_also = [ sa.strip() for sa in mo.group(2).split(",") ]
                if "SW" in see_also or "Class" in see_also:
                    # FUDGE! See-also's are normally comma-separated, but we don't want to
                    # split things like "Recovery, SW" or "Class, Personnel Types".
                    see_also = [ mo.group(2) ]
                index_entry[ "see_also" ] = see_also
                content = content[:mo.start()] + content[mo.end():]
                content = content.strip()

            # extract any sub-title
            if content.startswith( "(" ):
                pos = content.find( ")" )
                if pos < 0:
                    # FUDGE! Some index entries have the closing ) missing :-/
                    pos = content.find( ":" )
                    subtitle, content = content[1:pos], content[pos+1:]
                else:
                    subtitle, content = extract_parens_content( content )
                index_entry[ "subtitle" ] = subtitle
                if content.startswith( ":" ):
                    content = content[1:]
                content = content.strip()

            # extract any ruleid's
            # nb: ruleid's appear as a comma-separated run at the start of the content
            ruleids = []
            while True:
                if content == "A./G.":
                    break # nb: special handling for "NCC" (National Capabilities Chart)
                mo = re.search( r"^(SSR )?[A-Z]{1,3}[0-9.-]+[A-Fa-f]?", content )
                if not mo:
                    break
                ruleids.append( mo.group() )
                content = content[mo.end():].strip()
                if content.startswith( "," ):
                    content = content[1:].strip()
                else:
                    break
            if ruleids:
                index_entry[ "ruleids" ] = ruleids

            # extract any ruleref's (i.e. "[caption: ruleid, ruleid...]")
            rulerefs = []
            matches = list( re.finditer( r"\[(.+?)\]", content ) )
            if matches:
                # nb: we iterate in reverse so that removing a match doesn't shift later offsets
                for mo in reversed(matches):
                    val = mo.group(1)
                    # NOTE: We search for the ":" from the right, to avoid picking it up in the ruleref text.
                    pos = val.rfind( ":" )
                    if pos > 0:
                        vals = re.split( "[;,]", val[pos+1:] )
                        ruleids = [ v.strip() for v in vals ]
                        val = val[:pos]
                    else:
                        ruleids = None
                    rulerefs.append( { "caption": val, "ruleids": ruleids } )
                    content = content[:mo.start()] + content[mo.end():]
                index_entry[ "rulerefs" ] = list( reversed( rulerefs ) )

            # save the final content
            content = re.sub( r"\s+", " ", content ).strip()
            if content:
                index_entry[ "content" ] = content
            else:
                del index_entry["content"]

    def _is_ignore( self, elem ):
        """Check if we should ignore an element on the page.

        Returns True for bold section headers (the index is grouped by letter), which would
        otherwise be mistaken for the start of a new index entry.
        """
        # check if we have a bold item as the first thing on a line
        if self._is_bold( elem ) and elem.y0 - self._prev_y0 < -1:
            # yup - check if it's near the start of the line
            if self._is_near_start_of_line( elem ):
                # yup - this is the title for an index entry
                return False
            # nope - this is a header that indicates a new section (the index is grouped by letter)
            return True
        return False

    def _is_near_start_of_line( self, elem ):
        """Check if the element is near the start of its line.

        The index is laid out in two columns, so "start of line" means within 20 units of
        either the left edge of the viewport, or the mid-point (start of the right column).
        """
        if self._args["index_vp_left"] <= elem.x0 <= self._args["index_vp_left"]+20:
            # yup (left column)
            return True
        left = self._args["index_vp_left"] + (self._args["index_vp_right"]+1 - self._args["index_vp_left"]) / 2
        if left <= elem.x0 <= left+20:
            # yup (right column)
            return True
        return False

    def save_as_raw( self, out ):
        """Save the raw results.

        out: a writable text stream.
        """
        for index_entry in self._index_entries:
            print( "=== {} ===".format( index_entry["title"] ), file=out )
            print( "{}".format( index_entry["raw_content"] ), file=out )
            print( file=out )

    def save_as_text( self, out ):
        """Save the results as plain-text.

        out: a writable text stream.
        """
        for index_entry in self._index_entries:
            print( "=== {} ===".format( index_entry["title"] ), file=out )
            if "subtitle" in index_entry:
                print( index_entry["subtitle"], file=out )
            if index_entry.get( "ruleids" ):
                print( "RULEID'S: {}".format(
                    " ; ".join( index_entry["ruleids"] )
                ), file=out )
            if index_entry.get( "see_also" ):
                print( "SEE ALSO: {}".format(
                    " ; ".join( index_entry["see_also"] ),
                ), file=out )
            if index_entry.get( "content" ):
                print( "CONTENT:", index_entry["content"], file=out )
            if index_entry.get( "rulerefs" ):
                print( "RULEREF'S:", file=out )
                for ruleref in index_entry["rulerefs"]:
                    if ruleref["ruleids"]:
                        ruleids = [ "[{}]".format(ri) for ri in ruleref["ruleids"] ]
                        print( "- {} {}".format( ruleref["caption"], " ".join(ruleids) ), file=out )
                    else:
                        print( "- {}".format( ruleref["caption"] ), file=out )
            print( file=out )

    def save_as_json( self, out ):
        """Save the results as JSON.

        out: a writable text stream.
        NOTE: The JSON is assembled by hand (rather than via json.dumps) so that the
        output layout stays stable and diff-friendly.
        """
        entries = []
        for index_entry in self._index_entries:
            buf = []
            buf.append( "{{ \"title\": {}".format( jsonval(index_entry["title"]) ) )
            if "subtitle" in index_entry:
                buf.append( " \"subtitle\": {}".format( jsonval(index_entry["subtitle"]) ) )
            if index_entry.get( "ruleids" ):
                buf.append( " \"ruleids\": {}".format( jsonval(index_entry["ruleids"]) ) )
            if index_entry.get( "see_also" ):
                buf.append( " \"see_also\": {}".format( jsonval(index_entry["see_also"]) ) )
            if index_entry.get( "content" ):
                buf.append( " \"content\": {}".format( jsonval(index_entry["content"]) ) )
            if index_entry.get( "rulerefs" ):
                buf2 = []
                for ruleref in index_entry["rulerefs"]:
                    buf2.append( "    {{ \"caption\": {}, \"ruleids\": {} }}".format(
                        jsonval( ruleref["caption"] ),
                        jsonval( ruleref["ruleids"] )
                    ) )
                buf.append( " \"rulerefs\": [\n{}\n  ]".format( ",\n".join(buf2) ) )
            entries.append( ",\n".join( buf ) + "\n}" )
        print( "[\n\n{}\n\n]".format( ",\n\n".join(entries) ), file=out )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--output","-o","output_fname", required=True, help="Where to save the extracted index." )
def main( pdf_file, args, progress, format, output_fname ):
    """Extract the index from the MMP eASLRB."""

    # merge the command-line overrides into the default parameters
    args = ExtractBase.parse_args( args, _DEFAULT_ARGS )

    # set up a logger that respects the --progress flag
    def log_msg( msg_type, msg ):
        if msg_type != "progress" or progress:
            log_msg_stderr( msg_type, msg )

    # extract the index from the PDF
    extract = ExtractIndex( args, log_msg )
    extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
    with PdfDoc( pdf_file ) as pdf:
        extract.extract_index( pdf )

    # save the results in the requested format
    save = getattr( extract, "save_as_"+format )
    with open( output_fname, "w", encoding="utf-8" ) as fp:
        save( fp )

if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter
@ -0,0 +1,3 @@ |
||||
"""Module definitions.""" |
||||
|
||||
pytest_options = None #pylint: disable=invalid-name |
@ -0,0 +1,116 @@ |
||||
""" Test eASLRB extraction. """ |
||||
|
||||
import os |
||||
import io |
||||
|
||||
import pytest |
||||
|
||||
from asl_rulebook2.pdf import PdfDoc |
||||
from asl_rulebook2.extract.index import ExtractIndex |
||||
from asl_rulebook2.extract.content import ExtractContent |
||||
from asl_rulebook2.extract.all import ExtractAll |
||||
from asl_rulebook2.tests import pytest_options |
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
@pytest.mark.skipif( not pytest_options.easlrb_path, reason="eASLRB not available." )
@pytest.mark.skipif( pytest_options.short_tests, reason="--short-tests specified." )
def test_extract_index():
    """Test extracting the index.

    Extracts the index from each available version of the eASLRB, and compares
    the plain-text output against the checked-in reference file.
    """

    def do_test( dname ):

        # extract the index
        fname = os.path.join( dname, "eASLRB.pdf" )
        with PdfDoc( fname ) as pdf:
            extract = ExtractIndex( args={}, log=_check_log_msg )
            extract.extract_index( pdf )
            buf = io.StringIO()
            extract.save_as_text( buf )
            buf = buf.getvalue()

        # check the results
        # nb: use a context manager so the file handle is closed promptly (avoids ResourceWarning)
        fname = os.path.join( dname, "index.txt" )
        with open( fname, "r", encoding="utf-8" ) as fp:
            assert fp.read() == buf

    # run the test
    _for_each_version( do_test )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
@pytest.mark.skipif( not pytest_options.easlrb_path, reason="eASLRB not available." )
@pytest.mark.skipif( pytest_options.short_tests, reason="--short-tests specified." )
def test_extract_content():
    """Test extracting content.

    Extracts the targets and footnotes from each available version of the eASLRB,
    and compares the plain-text output against the checked-in reference files.
    """

    def do_test( dname ):

        # extract the content
        fname = os.path.join( dname, "eASLRB.pdf" )
        with PdfDoc( fname ) as pdf:
            extract = ExtractContent( args={}, log=_check_log_msg )
            extract.extract_content( pdf )
            targets_buf, footnotes_buf = io.StringIO(), io.StringIO()
            extract.save_as_text( targets_buf, footnotes_buf )
            targets_buf = targets_buf.getvalue()
            footnotes_buf = footnotes_buf.getvalue()

        # check the results
        # nb: use context managers so the file handles are closed promptly (avoids ResourceWarning)
        fname2 = os.path.join( dname, "targets.txt" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            assert fp.read() == targets_buf
        fname2 = os.path.join( dname, "footnotes.txt" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            assert fp.read() == footnotes_buf

    # run the test
    _for_each_version( do_test )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
@pytest.mark.skipif( not pytest_options.easlrb_path, reason="eASLRB not available." )
@pytest.mark.skipif( pytest_options.short_tests, reason="--short-tests specified." )
def test_extract_all():
    """Test extracting everything.

    Runs the combined extraction (index + content) for each available version of
    the eASLRB, and compares the JSON output against the checked-in reference files.
    """

    def do_test( dname ):

        # extract everything
        fname = os.path.join( dname, "eASLRB.pdf" )
        with PdfDoc( fname ) as pdf:
            extract = ExtractAll( args={}, log=_check_log_msg )
            extract.extract_all( pdf )
            index_buf = io.StringIO()
            extract.extract_index.save_as_json( index_buf )
            index_buf = index_buf.getvalue()
            targets_buf, footnotes_buf = io.StringIO(), io.StringIO()
            extract.extract_content.save_as_json( targets_buf, footnotes_buf )
            targets_buf = targets_buf.getvalue()
            footnotes_buf = footnotes_buf.getvalue()

        # check the results
        # nb: use context managers so the file handles are closed promptly (avoids ResourceWarning)
        fname2 = os.path.join( dname, "index.json" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            assert fp.read() == index_buf
        fname2 = os.path.join( dname, "targets.json" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            assert fp.read() == targets_buf
        fname2 = os.path.join( dname, "footnotes.json" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            assert fp.read() == footnotes_buf

    # run the test
    _for_each_version( do_test )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
def _for_each_version( func ):
    """Run tests for each version of the eASLRB.

    Invokes func(dname) once for every sub-directory of the configured eASLRB directory.
    """
    root = pytest_options.easlrb_path
    nruns = 0
    for entry in os.listdir( root ):
        path = os.path.join( root, entry )
        if not os.path.isdir( path ):
            continue
        func( path )
        nruns += 1
    # make sure we actually tested something
    assert nruns > 0
||||
|
||||
def _check_log_msg( msg_type, msg ): |
||||
"""Check a log message.""" |
||||
assert msg_type not in ( "warning", "error" ), \ |
||||
"Unexpected {}: {}".format( msg_type, msg ) |
@ -0,0 +1,33 @@ |
||||
""" pytest support functions. """ |
||||
|
||||
import pytest |
||||
|
||||
# pytest command-line options; set by pytest_configure() once options have been parsed
_pytest_options = None
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
def pytest_addoption( parser ):
    """Configure pytest options.

    parser: the pytest option parser to register our command-line options with.
    """

    # NOTE: This file needs to be in the project root for this to work :-/

    # where to find the eASLRB PDF and reference data
    parser.addoption(
        "--easlrb", action="store", dest="easlrb_path", default=None,
        help="Directory containing the MMP eASLRB PDF and extracted data file(s)."
    )

    # whether to skip the slow tests
    parser.addoption(
        "--short-tests", action="store_true", dest="short_tests", default=False,
        help="Skip running the longer tests."
    )
||||
|
||||
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
||||
|
||||
def pytest_configure( config ):
    """Called after command-line options have been parsed."""
    # stash the parsed options in this module...
    global _pytest_options
    _pytest_options = config.option
    # ...and publish them to the test package, so tests can use them in skipif decorators
    # nb: the import is done here (not at the top of the file) to avoid import-time cycles
    import asl_rulebook2.tests
    asl_rulebook2.tests.pytest_options = _pytest_options
Loading…
Reference in new issue