diff --git a/asl_rulebook2/bin/prepare_pdf.py b/asl_rulebook2/bin/prepare_pdf.py index 071767d..965f9ee 100755 --- a/asl_rulebook2/bin/prepare_pdf.py +++ b/asl_rulebook2/bin/prepare_pdf.py @@ -23,12 +23,17 @@ _COMPRESSION_CHOICES = [ # --------------------------------------------------------------------- -def prepare_pdf( pdf_file, title, targets_fname, yoffset, output_fname, compression, gs_path, log_msg ): +def prepare_pdf( pdf_file, title, targets_fname, vo_notes_fname, yoffset, output_fname, compression, gs_path, log_msg ): """Prepare the MMP eASLRB PDF.""" # load the targets with open( targets_fname, "r" ) as fp: targets = json.load( fp ) + if vo_notes_fname: + with open( vo_notes_fname, "r" ) as fp: + vo_notes_targets = json.load( fp ) + else: + vo_notes_targets = None with TempFile(mode="w") as compressed_file, TempFile(mode="w") as pdfmarks_file: @@ -49,6 +54,16 @@ def prepare_pdf( pdf_file, title, targets_fname, yoffset, output_fname, compress ) pdf_file = compressed_file.name + def add_vo_notes_dests( key, vo_entries, yoffset, out ): + for vo_note_id, vo_entry in vo_entries.items(): + dest = "{}:{}".format( key, vo_note_id ) + xpos, ypos = vo_entry.get( "pos", ["null","null"] ) + if isinstance( ypos, int ): + ypos += yoffset + print( "[ /Dest /{} /Page {} /View [/XYZ {} {}] /DEST pdfmark".format( + dest, vo_entry["page_no"], xpos, ypos + ), file=out ) + # generate the pdfmarks log_msg( "progress", "Generating the pdfmarks..." 
) if title: @@ -68,7 +83,15 @@ def prepare_pdf( pdf_file, title, targets_fname, yoffset, output_fname, compress print( "[ /Dest /{} /Page {} /View [/XYZ {} {}] /DEST pdfmark".format( ruleid, target["page_no"], xpos, ypos ), file=pdfmarks_file ) - print( file=pdfmarks_file ) + if vo_notes_targets: + print( file=pdfmarks_file ) + for nat in vo_notes_targets: + if nat == "landing-craft": + add_vo_notes_dests( nat, vo_notes_targets[nat], yoffset, pdfmarks_file ) + continue + for vo_type, vo_entries in vo_notes_targets[nat].items(): + key = "{}_{}".format( nat, vo_type ) + add_vo_notes_dests( key, vo_entries, yoffset, pdfmarks_file ) pdfmarks_file.close( delete=False ) # generate the pdfmark'ed document @@ -92,6 +115,9 @@ def prepare_pdf( pdf_file, title, targets_fname, yoffset, output_fname, compress @click.option( "--targets","-t","targets_fname", required=True, type=click.Path(dir_okay=False), help="Target definition file." ) +@click.option( "--vo-notes","vo_notes_fname", required=False, type=click.Path(dir_okay=False), + help="Vehicle/ordnance notes definition file." +) @click.option( "--yoffset", default=5, help="Offset to add to y co-ordinates." ) @click.option( "--output","-o","output_fname", required=True, type=click.Path(dir_okay=False), help="Output PDF file." @@ -101,7 +127,7 @@ def prepare_pdf( pdf_file, title, targets_fname, yoffset, output_fname, compress ) @click.option( "--gs","gs_path", default="gs", help="Path to the Ghostscript executable." ) @click.option( "--progress","-p", is_flag=True, default=False, help="Log progress." 
) -def main( pdf_file, title, targets_fname, yoffset, output_fname, compression, gs_path, progress ): +def main( pdf_file, title, targets_fname, vo_notes_fname, yoffset, output_fname, compression, gs_path, progress ): """Prepare the MMP eASLRB PDF.""" # initialize @@ -113,7 +139,7 @@ def main( pdf_file, title, targets_fname, yoffset, output_fname, compression, gs # prepare the PDF prepare_pdf( pdf_file, title, - targets_fname, yoffset, + targets_fname, vo_notes_fname, yoffset, output_fname, compression, gs_path, log_msg diff --git a/asl_rulebook2/extract/all.py b/asl_rulebook2/extract/all.py index 51018d3..fc8365f 100755 --- a/asl_rulebook2/extract/all.py +++ b/asl_rulebook2/extract/all.py @@ -128,8 +128,11 @@ class ExtractAll( ExtractBase ): @click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." ) @click.option( "--save-chapters","save_chapters_fname", required=True, help="Where to save the extracted chaopters." ) @click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." ) +@click.option( "--save-vo-notes","save_vo_notes_fname", required=True, + help="Where to save the extracted vehicle/ordnance notes targets." 
+) def main( pdf_file, args, progress, output_fmt, - save_index_fname, save_targets_fname, save_chapters_fname, save_footnotes_fname + save_index_fname, save_targets_fname, save_chapters_fname, save_footnotes_fname, save_vo_notes_fname ): """Extract everything we need from the MMP eASLRB.""" @@ -147,9 +150,12 @@ def main( pdf_file, args, progress, output_fmt, with open( save_index_fname, "w", encoding="utf-8" ) as index_out, \ open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \ open( save_chapters_fname, "w", encoding="utf-8" ) as chapters_out, \ - open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out: + open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out, \ + open( save_vo_notes_fname, "w", encoding="utf-8" ) as vo_notes_out: getattr( extract.extract_index, "save_as_"+output_fmt )( index_out ) - getattr( extract.extract_content, "save_as_"+output_fmt )( targets_out, chapters_out, footnotes_out ) + getattr( extract.extract_content, "save_as_"+output_fmt )( + targets_out, chapters_out, footnotes_out, vo_notes_out + ) if __name__ == "__main__": main() #pylint: disable=no-value-for-parameter diff --git a/asl_rulebook2/extract/base.py b/asl_rulebook2/extract/base.py index be5a19b..a4b594b 100644 --- a/asl_rulebook2/extract/base.py +++ b/asl_rulebook2/extract/base.py @@ -38,7 +38,7 @@ class ExtractBase: @staticmethod def _is_bold( elem ): """Check if an element is using a bold font.""" - return elem.fontname.endswith( ( "-Bold", ",Bold", "-BoldMT" ) ) + return elem.fontname.endswith( ( ",Bold", "-BoldMT" ) ) or "-Bold" in elem.fontname def log_msg( self, msg_type, msg, *args, **kwargs ): """Log a message.""" diff --git a/asl_rulebook2/extract/content.py b/asl_rulebook2/extract/content.py index be2ca8d..2769bb2 100755 --- a/asl_rulebook2/extract/content.py +++ b/asl_rulebook2/extract/content.py @@ -5,6 +5,7 @@ import os import json import re import math +from collections import defaultdict import click from pdfminer.layout 
import LTChar @@ -30,6 +31,8 @@ _DISABLE_SORT_ITEMS = [ "F20", "F21", # Chapter F footnotes "G48", "G49", "G50", # Chapter G footnotes "H9", # Chapter H footnotes + 429,431,432,433,434,435, # Italian vehicle notes + 436,437,438,439, # Italian ordnance notes ] _DEFAULT_ARGS = { @@ -38,9 +41,44 @@ _DEFAULT_ARGS = { "chapter-j": "593", "chapter-w": "647-664", "content_vp_left": 0, "content_vp_right": 565, "content_vp_top": 715, "content_vp_bottom": 28, # viewport - "disable-sort-items": ",".join( _DISABLE_SORT_ITEMS ) + "disable-sort-items": ",".join( str(si) for si in _DISABLE_SORT_ITEMS ) } +# NOTE: The exact mappings here are actually not that important. What's important is: +# - the order of the nationality + V/O types +# - the page numbers themselves (so that they get parsed) +_VO_NOTE_SECTIONS = [ + [ "german", "vehicles", "330,332,334-343", True ], + [ "german", "ordnance", "344-348", True ], + [ "russian", "vehicles", "348,350-355", True ], + [ "russian", "ordnance", "356-358", True ], + [ "russian", "vehicles", "362,364-368", False ], + [ "russian", "ordnance", "369", False ], + [ "american", "vehicles", "371,373-383", True ], + [ "american", "ordnance", "385-389", True ], + [ "british", "vehicles", "395,398-417", True ], + [ "british", "ordnance", "419-423", True ], + [ "italian", "vehicles", "429,431-435", True ], + [ "italian", "ordnance", "436-439", True ], + [ "japanese", "vehicles", "443-448", True ], + [ "japanese", "ordnance", "448-452", True ], + [ "chinese", "vehicles", "456-459", True ], + [ "chinese", "ordnance", "459-463", True ], + [ "landing-craft", "vehicles", "467-468", True ], + [ "french", "vehicles", "470,472-480", True ], + [ "french", "ordnance", "482-487", True ], + [ "allied-minor", "vehicles", "492-493,495-500", True ], + [ "allied-minor", "ordnance", "501-504", True ], + [ "axis-minor", "vehicles", "506,508-515", True ], + [ "axis-minor", "ordnance", "516,518-527", True ], + [ "finnish", "vehicles", "536,538-541", True ], + [ 
"finnish", "ordnance", "543,545-549", True ], + [ "un-forces", "vehicles", "554,556-565", True ], + [ "un-forces", "ordnance", "567-570", True ], + [ "communist-forces", "vehicles", "580", True ], + [ "communist-forces", "ordnance", "581-585", True ], +] + # --------------------------------------------------------------------- class ExtractContent( ExtractBase ): @@ -51,6 +89,8 @@ class ExtractContent( ExtractBase ): self.targets = {} self._chapters = [] self._footnotes = {} + self._vo_notes = self._prev_vo_note_id = None + self._curr_vo_note_section = 0 self._curr_chapter = self._curr_footnote = self._curr_pageid = None self._prev_elem = self._top_left_elem = None # prepare to fixup problems in the content @@ -61,6 +101,7 @@ class ExtractContent( ExtractBase ): self._target_fixups = load_fixup( "target-fixups.json" ) self._chapter_fixups = load_fixup( "chapter-fixups.json" ) self._footnote_fixups = load_fixup( "footnote-fixups.json" ) + self._vo_note_fixups = load_fixup( "vo-note-fixups.json" ) def extract_content( self, pdf ): """Extract content from the MMP eASLRB.""" @@ -78,6 +119,12 @@ class ExtractContent( ExtractBase ): page_index[ page_no ] = chapter disable_sort_items = set( self._args["disable-sort-items"].split( "," ) ) + # include the pages for the Chapter H vehicle/ordnance notes + for _, _, page_nos, _ in _VO_NOTE_SECTIONS: + page_nos = parse_page_numbers( page_nos ) + for page_no in page_nos: + page_index[ page_no ] = "H" + # initialize self._curr_chapter = None curr_chapter_pageno = None @@ -109,13 +156,18 @@ class ExtractContent( ExtractBase ): self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page# self._curr_chapter, curr_chapter_pageno ) - self.log_msg( "progress", "- Analyzing page {} ({}).", page_no, self._curr_pageid ) + # NOTE: There have been so many extra pages added to Chapter H, there's no easy way to calculate + # the page ID. 
We could set up a table mapping physical page numbers to page ID's, but that's + # far more trouble than it's worth. + self.log_msg( "progress", "- Analyzing page {}{}.", + page_no, " ({})".format( self._curr_pageid ) if not self._curr_pageid.startswith("H") else "" + ) # process each element on the page curr_caption = None self._top_left_elem = self._prev_elem = None elem_filter = lambda e: isinstance( e, LTChar ) - sort_elems = self._curr_pageid not in disable_sort_items + sort_elems = self._curr_pageid not in disable_sort_items and str(page_no) not in disable_sort_items for _, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ): # skip problematic elements @@ -137,7 +189,7 @@ class ExtractContent( ExtractBase ): # check if we're currently extracting footnotes if self._curr_footnote is not None: - self._on_footnote_elem( elem, lt_page ) + self._on_footnote_elem( elem, lt_page, page_no ) self._prev_elem = elem continue @@ -195,6 +247,8 @@ class ExtractContent( ExtractBase ): self.log_msg( "warning", "Unused fixups: {}", self._target_fixups ) if self._footnote_fixups: self.log_msg( "warning", "Unused fixups: {}", self._footnote_fixups ) + if self._vo_note_fixups: + self.log_msg( "warning", "Unused fixups: {}", self._vo_note_fixups ) # extract the chapters self._extract_chapters() @@ -215,7 +269,7 @@ class ExtractContent( ExtractBase ): # yup - notify the main loop self._curr_footnote = [] if elem: - self._on_footnote_elem( elem, lt_page ) + self._on_footnote_elem( elem, lt_page, page_no ) return # check if the entry needs to be fixed up @@ -266,14 +320,15 @@ class ExtractContent( ExtractBase ): "raw_caption": orig_caption } - def _on_footnote_elem( self, elem, lt_page ): + def _on_footnote_elem( self, elem, lt_page, page_no ): """Process an element while we're parsing footnotes.""" # check if we've found the start of a new footnote if self._is_bold( elem ): if elem.get_text().isdigit() and self._is_start_of_line( elem, lt_page ): # 
yup - save the current footnote, start collecting the new one self._save_footnote() - self._curr_footnote = [ elem.get_text(), "" ] + elem_pos = ( elem.x0, elem.y1 ) + self._curr_footnote = [ elem.get_text(), "", page_no, elem_pos ] else: if self._curr_footnote[1]: # FUDGE! Some footnote content has bold text hard-up at the left margin, @@ -334,10 +389,17 @@ class ExtractContent( ExtractBase ): if footnote_id == "9" and "9" in footnote_ids: footnote_id = "29" - # check if we've gone past the end of the Chapter H footnotes - if self._curr_chapter == "H" and len(footnote_id) > 1: - self._curr_footnote = None - return + if self._curr_chapter == "H": + # check if we've gone past the end of the Chapter H footnotes + if self._vo_notes is None and len(footnote_id) > 1: + # yup - start collecting vehicle/ordnance notes + self._vo_notes = defaultdict( lambda: defaultdict( list ) ) + # check if we're collecting Chapter H vehicle/ordnance notes + if self._vo_notes is not None: + # yup - save the next entry (the "footnote" is actually a vehicle/ordnance note) + self._save_vo_note( footnote_id, self._curr_footnote[2], self._curr_footnote[3] ) + self._curr_footnote = None + return # clean up the content content = re.sub( r"\s+", " ", content ).strip() @@ -415,6 +477,108 @@ class ExtractContent( ExtractBase ): } ) self._curr_footnote = None + def _save_vo_note( self, caption, page_no, page_pos ): + """Save an extracted vehicle/ordnance note.""" + + # NOTE: Some pieces of text cause the parsing code to go wonky (typically because it's seen + # a "1" and so thinks it's found the start of a new section), so we manually skip over these. 
+ skips = self._vo_note_fixups.get( "skips", {} ).get( str(page_no) ) + if skips: + for i, target in enumerate(skips): + if self._check_string( caption, target ): + # we've got a caption that should be skipped - remove it from the list, and return + del skips[i] + if not skips: + del self._vo_note_fixups["skips"][ str(page_no) ] + if not self._vo_note_fixups["skips"]: + del self._vo_note_fixups["skips"] + return + if caption.isdigit() and page_no not in (354, 417): + return + + def apply_fixups( vo_note_id, caption ): + nat, vo_type, _, _ = _VO_NOTE_SECTIONS[ self._curr_vo_note_section ] + fixup = self._vo_note_fixups.get( nat, {} ).get( vo_type, {} ).get( vo_note_id ) + if fixup: + if self._check_string( caption, fixup["old_caption"] ): + # remove the fixup + del self._vo_note_fixups[ nat ][ vo_type ][ vo_note_id ] + cleanup_fixups( nat, vo_type ) + # apply the fixup + if "new_vo_note_id" in fixup: + vo_note_id = fixup["new_vo_note_id"] + if "new_caption" in fixup: + caption = fixup["new_caption"] + return vo_note_id, caption + + def cleanup_fixups( nat, vo_type ): + if nat not in self._vo_note_fixups: + return + if vo_type in self._vo_note_fixups[nat] and not self._vo_note_fixups[ nat ][ vo_type ]: + del self._vo_note_fixups[ nat ][ vo_type ] + if nat in self._vo_note_fixups and not self._vo_note_fixups[ nat ]: + del self._vo_note_fixups[ nat ] + + # extract the note number and caption + mo = re.search( r"^[1-9][0-9.]*", caption ) + if not mo: + return + vo_note_id = mo.group() + caption = caption[ mo.end() : ].strip() + if vo_note_id.endswith( "." ): + vo_note_id = vo_note_id[:-1] + if caption.endswith( ":" ): + caption = caption[:-1].strip() + if caption.startswith( ( "cm ", "mm ", "pdr", "-cwt" ) ): + # FUDGE! Things like "5.1 2.2cm Big Gun" are getting parsed as "5.12.2: + "cm Big Gun" :-/ + pos = vo_note_id.find( "." 
) + if pos >= 0: + caption = vo_note_id[pos+1:] + caption + vo_note_id = vo_note_id[:pos] + + # check for any fixups + vo_note_id, caption = apply_fixups( vo_note_id, caption ) + + # compare the note ID with the previous one + nat, vo_type, _, check_seq = _VO_NOTE_SECTIONS[ self._curr_vo_note_section ] + def get_base_note_id( val ): + pos = val.find( "." ) + return int( val[:pos] if pos >= 0 else val ) + base_note_id = get_base_note_id( vo_note_id ) + if self._prev_vo_note_id: + # check if we've found the start of the next section + if base_note_id == 1: + # yup - add any extra entries to the current section + add_vo_entries = self._vo_note_fixups.get( nat, {} ).get( vo_type, {} ).pop( "add", [] ) + for vo_entry in add_vo_entries: + self._vo_notes[ nat ][ vo_type ].append( vo_entry ) + cleanup_fixups( nat, vo_type ) + # get the next nationality + V/O type + self._curr_vo_note_section += 1 + nat, vo_type, _, _ = _VO_NOTE_SECTIONS[ self._curr_vo_note_section ] + # check for any fixups + vo_note_id, caption = apply_fixups( vo_note_id, caption ) + elif check_seq: + # compare the note ID with the previous one + prev_base_note_id = get_base_note_id( self._prev_vo_note_id ) + if base_note_id == prev_base_note_id + 1: + pass # nb: this is the normal case, we've found the next V/O note + elif base_note_id == prev_base_note_id and "." in vo_note_id: + pass # nb: this is to allow things like "9.1" following "9" + else: + return # nb: we got some junk that can be ignored + + # save the V/O note + self._vo_notes[ nat ][ vo_type ].append( { + "vo_note_id": vo_note_id, "caption": caption, + "page_no": page_no, "page_pos": page_pos + } ) + if nat == "allied-minor" and vo_type == "ordnance" and vo_note_id == "19": + # FUDGE! 
Because we're not seing Allied Minor Ordnance Note 20 :-/ + self._prev_vo_note_id = "20" + else: + self._prev_vo_note_id = vo_note_id + def _extract_chapters( self ): """Extract the chapters and their sections.""" @@ -492,15 +656,15 @@ class ExtractContent( ExtractBase ): return True # the element is at the top of the right column return False - def save_as_raw( self, targets_out, chapters_out, footnotes_out ): + def save_as_raw( self, targets_out, chapters_out, footnotes_out, vo_notes_out ): """Save the raw results.""" - self._save_as_raw_or_text( targets_out, chapters_out, footnotes_out, True ) + self._save_as_raw_or_text( targets_out, chapters_out, footnotes_out, vo_notes_out, True ) - def save_as_text( self, targets_out, chapters_out, footnotes_out ): + def save_as_text( self, targets_out, chapters_out, footnotes_out, vo_notes_out ): """Save the results as plain-text.""" - self._save_as_raw_or_text( targets_out, chapters_out, footnotes_out, False ) + self._save_as_raw_or_text( targets_out, chapters_out, footnotes_out, vo_notes_out, False ) - def _save_as_raw_or_text( self, targets_out, chapters_out, footnotes_out, raw ): + def _save_as_raw_or_text( self, targets_out, chapters_out, footnotes_out, vo_notes_out, raw ): """Save the results as raw or plain-text.""" # save the targets @@ -511,7 +675,7 @@ class ExtractContent( ExtractBase ): print( file=targets_out ) print( "=== p{} ===".format( target["page_no"] ), file=targets_out ) curr_page_no = target["page_no"] - xpos, ypos = self._get_target_pos( target ) + xpos, ypos = self._get_page_pos( target["pos"] ) if raw: print( "[{},{}] = {}".format( xpos, ypos, target["raw_caption"] @@ -556,13 +720,35 @@ class ExtractContent( ExtractBase ): print( " ; ".join( make_caption(c) for c in footnote["captions"] ), file=footnotes_out ) print( footnote["content"], file=footnotes_out ) - def save_as_json( self, targets_out, chapters_out, footnotes_out ): + # save the vehicle/ordnance notes + first = True + for nat, vo_types in 
self._vo_notes.items(): + for vo_type, vo_entries in vo_types.items(): + if first: + first = False + else: + print( file=vo_notes_out ) + print( "=== {} ===".format( + nat if nat == "landing-craft" else "{} {}".format( nat, vo_type ) + ), file=vo_notes_out ) + for vo_entry in vo_entries: + if "page_pos" in vo_entry: + xpos, ypos = ExtractContent._get_page_pos( vo_entry["page_pos"] ) + page_pos = "[{},{}]".format( xpos, ypos ) + else: + page_pos = None + print( "{:<5} {} @p{}{}".format( + vo_entry["vo_note_id"]+":", vo_entry["caption"], vo_entry["page_no"], + ":"+page_pos if page_pos else "" + ), file=vo_notes_out ) + + def save_as_json( self, targets_out, chapters_out, footnotes_out, vo_notes_out ): """Save the results as JSON.""" # save the targets targets, curr_chapter = [], None for ruleid, target in self.targets.items(): - xpos, ypos = self._get_target_pos( target ) + xpos, ypos = self._get_page_pos( target["pos"] ) targets.append( "{}: {{ \"caption\": {}, \"page_no\": {}, \"pos\": [{},{}] }}".format( jsonval( ruleid ), jsonval(target["caption"]), target["page_no"], xpos, ypos @@ -613,12 +799,46 @@ class ExtractContent( ExtractBase ): ",\n\n".join( chapters ) ), file=footnotes_out ) + # save the vehicle/ordnance notes + vo_notes = [] + for nat in self._vo_notes: + vo_types = [] + for vo_type, vo_entries in self._vo_notes[nat].items(): + entries = [] + for vo_entry in vo_entries: + val = "{}: {{ \"caption\": {}, \"page_no\": {}".format( + jsonval(vo_entry["vo_note_id"]), jsonval(vo_entry["caption"]), jsonval(vo_entry["page_no"]) + ) + if "page_pos" in vo_entry: + xpos, ypos = self._get_page_pos( vo_entry["page_pos"] ) + val += ", \"pos\": [{},{}]".format( xpos, ypos ) + val += " }" + entries.append( " {}".format( val ) ) + if nat == "landing-craft": + vo_types.append( ",\n".join( entries ) ) + else: + vo_types.append( "{}: {{\n{}\n}}".format( + jsonval(vo_type), ",\n".join( entries ) + ) ) + vo_notes.append( "{}: {{\n{}\n}}".format( + jsonval(nat), ",\n".join( 
vo_types ) + ) ) + print( "{{\n\n{}\n\n}}".format( + ",\n\n".join( vo_notes ) + ), file=vo_notes_out ) + @staticmethod - def _get_target_pos( target ): - """Return a target's X/Y position on the page.""" - xpos = math.floor( target["pos"][0] ) - ypos = math.ceil( target["pos"][1] ) - return xpos, ypos + def _check_string( val, target ): + """Check if a string matches a target.""" + if target.startswith( "^" ): + return val.startswith( target[1:] ) + else: + return val == target + + @staticmethod + def _get_page_pos( pos ): + """Return a X/Y position on the page.""" + return math.floor( pos[0] ), math.ceil( pos[1] ) # --------------------------------------------------------------------- @@ -632,7 +852,12 @@ class ExtractContent( ExtractBase ): @click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." ) @click.option( "--save-chapters","save_chapters_fname", required=True, help="Where to save the extracted chaopters." ) @click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." ) -def main( pdf_file, args, progress, output_fmt, save_targets_fname, save_chapters_fname, save_footnotes_fname ): +@click.option( "--save-vo-notes","save_vo_notes_fname", required=True, + help="Where to save the extracted vehicle/ordnance notes." 
+) +def main( pdf_file, args, progress, output_fmt, + save_targets_fname, save_chapters_fname, save_footnotes_fname, save_vo_notes_fname +): """Extract content from the MMP eASLRB.""" # initialize @@ -651,8 +876,9 @@ def main( pdf_file, args, progress, output_fmt, save_targets_fname, save_chapter # save the results with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \ open( save_chapters_fname, "w", encoding="utf-8" ) as chapters_out, \ - open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out: - getattr( extract, "save_as_"+output_fmt, )( targets_out, chapters_out, footnotes_out ) + open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out, \ + open( save_vo_notes_fname, "w", encoding="utf-8" ) as vo_notes_out: + getattr( extract, "save_as_"+output_fmt, )( targets_out, chapters_out, footnotes_out, vo_notes_out ) if __name__ == "__main__": main() #pylint: disable=no-value-for-parameter diff --git a/asl_rulebook2/extract/data/index-fixups.json b/asl_rulebook2/extract/data/index-fixups.json index 4ff4728..73017d7 100644 --- a/asl_rulebook2/extract/data/index-fixups.json +++ b/asl_rulebook2/extract/data/index-fixups.json @@ -83,12 +83,6 @@ ] }, -"EX": { - "old_content": "ExampleEXC: Exception", - "new_content": "Example", - "_comment_": "The code manually inserts an entry for EXC: Exception" -}, - "Fortification": { "replace": [ [ "[in BRT: SSR1 (BRT Sand: T3.2) (NA in Betio Piers: T9.2)]", "[in BRT: SSR1 (BRT Sand): T3.2] [in BRT: SSR1 (NA in Betio Piers): T9.2]" ] diff --git a/asl_rulebook2/extract/data/target-fixups.json b/asl_rulebook2/extract/data/target-fixups.json index 7515bcc..70d2bce 100644 --- a/asl_rulebook2/extract/data/target-fixups.json +++ b/asl_rulebook2/extract/data/target-fixups.json @@ -394,7 +394,8 @@ "10": { "new_ruleid": null }, "11": { "new_ruleid": null }, "12": { "new_ruleid": null }, - "55": { "new_ruleid": null } + "55": { "new_ruleid": null }, + "678876987": { "new_ruleid": null } }, "H2": { diff 
--git a/asl_rulebook2/extract/data/vo-note-fixups.json b/asl_rulebook2/extract/data/vo-note-fixups.json new file mode 100644 index 0000000..b8c1d01 --- /dev/null +++ b/asl_rulebook2/extract/data/vo-note-fixups.json @@ -0,0 +1,734 @@ +{ + +"skips": { + "382": [ "1, 3" ], + "429": [ "^1,660,", "^1and Fiat 3000", "^9/43 armistice", "^4/41 (.9)" ], + "431": [ "^1, for East Africa", "^9/42 (1.4)," ], + "432": [ "^1 (l.2),", "^1. Sources vary" ], + "434": [ "1-", "^1.5 for 11/41-6/42," ], + "438": [ "^1/41-5/43" ], + "439": [ "1 (1", "^1/43 ( 1.2),", "^1/43 (1.3),", "^1/42-5/43." ], + "492": [ "1B11CE/FPNA", "1B11CE/FPNA" ], + "493": [ "1T", "1B" ], + "496": [ "1B" ], + "501": [ "1h-d" ], + "502": [ "1s5", "1s5" ], + "503": [ "1AP5", "1s6" ], + "504": [ "^1.3)" ], + "514": [ "1.4 for 45" ], + "556": [ "1#" ], + "560": [ "1, 3" ] +}, + +"german": { +"vehicles": { + "9.1": { + "old_caption": "FT-17 730(f) &", + "new_caption": "FT-17 730(f) & FT-17 730m(f)" + }, + "22": { + "old_caption": "PzKpfw IVF: 1", + "new_caption": "PzKpfw IVF(1)" + }, + "23": { + "old_caption": "PzKpfw IVF:2", + "new_caption": "PzKpfw IVF(2)" + }, + "35": { + "old_caption": "StuG IIIG (L) &", + "new_caption": "StuG IIIG (L) & StuH 42 (L)" + }, + "52": { + "old_caption": "JgdPz IV & JgdPz", + "new_caption": "JgdPz IV & JgdPz IV(L)" + }, + "58": { + "old_caption": "SPW 250/sMG", + "new_caption": "SPW 250/sMG & 251/sMG" + }, + "59": { + "old_caption": "SPW 250/7 &", + "new_caption": "SPW 250/7 & 251/2" + }, + "72": { + "old_caption": "PSW 231(8 rad)", + "new_caption": "PSW 231(8 rad) & 232" + }, + "85.37": { + "old_caption": "FlaK/Pz IV", + "new_vo_note_id": "85", "new_caption": "37 FlaK/Pz IV" + }, + "92": { + "old_caption": "2cm & 3.7cm", + "new_caption": "2cm & 3.7cm FlaK LKW" + }, + "96": { + "old_caption": "Opel 6700 &Buessing-NAG", + "new_caption": "Opel 6700 & Buessing-NAG 4500" + }, + "add": [ + { "_comment_": "This gets parsed as '4' and '5.1 GSW 39H(f) PaK' :-/", + "vo_note_id": "45.1", 
"caption": "GSW 39H(f) PaK", "page_no": 337, "page_pos": [380,561] + }, + { "vo_note_id": "37.1", "caption": "Sturmtiger", "page_no": 532, "page_pos": [118,640] }, + { "vo_note_id": "88.1", "caption": "SdKfz 10/5", "page_no": 532, "page_pos": [399,713] } + ] +} +}, + +"russian": { +"vehicles": { + "4": { + "old_caption": "T-60 M40 &", + "new_caption": "T-60 M40 & M42" + }, + "6": { + "old_caption": "T-26 M33 & T-", + "new_caption": "T-26 M33 & T-26S M37/39" + }, + "1": { + "old_caption": "T-28 M34", + "new_vo_note_id": "11" + }, + "11.1": { + "old_caption": "T-28 M34(L) & 12.1. T-28E M40(L)", + "new_caption": "T-28 M34(L)" + }, + "23": { + "old_caption": "KV-lE, KV-1 M41,", + "new_caption": "KV-lE, KV-1 M41, & KV-1 M-42" + }, + "34": { + "old_caption": "ISU-122 & ISU-", + "new_caption": "ISU-122 & ISU-152" + }, + "add": [ + { "vo_note_id": "12.1", "caption": "T-28E M40(L)", "page_no": 364, "page_pos": [394,289] } + ] +} +}, + +"american": { +"vehicles": { + "14": { + "old_caption": "M4A3E2 & M4A3E2 (L) MediumTanks", + "new_caption": "M4A3E2 & M4A3E2 (L) Medium Tanks" + }, + "17": { + "old_caption": "M4(105) & M4A3(105) MediumTanks", + "new_caption": "M4(105) & M4A3(105) Medium Tanks" + } +} +}, + +"british": { +"vehicles": { + "2": { + "old_caption": "(A17) Tetrarch & Tetrarch CS[Light Tanks Mk VII & Mk VII CS]", + "new_caption": "(A17) Tetrarch & Tetrarch CS [Light Tanks Mk VII & Mk VII CS]" + }, + "6": { + "old_caption": "A9 & A9 CS [Cruiser Tanks Mk I& Mk I CS]", + "new_caption": "A9 & A9 CS [Cruiser Tanks Mk I & Mk I CS]" + }, + "26": { + "old_caption": "(A12) Matilda II & II CS [In-fantry Tank Mk II]", + "new_caption": "(A12) Matilda II & II CS [Infantry Tank Mk II]" + }, + "36": { + "old_caption": "Valentine & Churchill Bridgelay-ers", + "new_caption": "Valentine & Churchill Bridgelayers" + }, + "45": { + "old_caption": "Humber III & Otter Light Re-connaissance Cars", + "new_caption": "Humber III & Otter Light Reconnaissance Cars" + }, + "82": { + 
"old_caption": "", + "new_caption": "30-cwt Lorry" + }, + "83": { + "old_caption": "", + "new_caption": "3-Ton Lorry" + } +} +}, + +"italian": { +"vehicles": { + "1": { + "old_caption": "LS/21 & LS/3", + "new_caption": "L5/21 & L5/30" + }, + "2": { + "old_caption": "^L3/35: Derived from", + "new_caption": "L3/35" + }, + "3": { + "old_caption": "^L3 aa: Some L3", + "new_caption": "L3 aa" + }, + "4": { + "old_caption": "^L3 cc: During the early months", + "new_caption": "L3 cc" + }, + "5": { + "old_caption": "^L3 Lf: Development of", + "new_caption": "L3 Lf" + }, + "6": { + "old_caption": "^L6/40: Designed to replace", + "new_caption": "L6/40" + }, + "7": { + "old_caption": "^Mll/39: This tank carried", + "new_caption": "M11/39" + }, + "8": { + "old_caption": "^Ml3/40: Replacing the", + "new_caption": "M13/40" + }, + "9": { + "old_caption": "^M14/41: This tank,", + "new_caption": "M14/41" + }, + "10": { + "old_caption": "^M15/42: This, the last version", + "new_caption": "M15/42" + }, + "11": { + "old_caption": "^MR/35(f): The Germans provided", + "new_caption": "MR/35(f)" + }, + "12": { + "old_caption": "Semovente M40 & M41 da", + "new_caption": "Semovente M40 & M41 da 75/18" + }, + "13": { + "old_caption": "^Semovente M42 da 75/1&75/32: The last model", + "new_caption": "Semovente M42 da 75/18 & 75/32" + }, + "14": { + "old_caption": "^Semovente M43 da 105/25: Nicknathe", + "new_caption": "Semovente M43 da 105/25" + }, + "15": { + "old_caption": "Semovente L40 da 47/32: The SMV", + "new_caption": "Semovente L40 da 47/32" + }, + "16": { + "old_caption": "^Semovente M41M da 90/53: This AFV", + "new_caption": "Semovente M41M da 90/53" + }, + "18": { + "old_caption": "^Lince: The Lince (Lynx)", + "new_caption": "Lince" + }, + "19": { + "old_caption": "^Lancia lZM: In late 1912", + "new_caption": "Lancia 1ZM" + }, + "20": { + "old_caption": "^Fiat 611A & 611BThese armoredcars", + "new_caption": "Fiat 611A & 611B" + }, + "21": { + "old_caption": "^AB 40 & AB41These two 
auto", + "new_caption": "AB 40 & AB 41" + }, + "22": { + "old_caption": "^Autoprotetto S37: This APC", + "new_caption": "Autoprotetto S37" + }, + "23": { + "old_caption": "Autocannoni da", + "new_caption": "Autocannoni da 20/65(b) & 65/17(b)" + }, + "24": { + "old_caption": "Autocannoni da", + "new_caption": "Autocannoni da 75/27 CK & 90/53" + }, + "25": { + "old_caption": "^TL 37, TM 40 &TP 32", + "new_caption": "TL 37, TM 40 & TP 32" + }, + "26": { + "old_caption": "^Autocarretta: As the portee", + "new_caption": "Autocarretta" + }, + "27": { + "old_caption": "^Fiat 508 MC: Derived from", + "new_caption": "Fiat 508 MC" + }, + "28": { + "old_caption": "^Autocarri L, M & P: The ItalianArmy", + "new_caption": "Autocarri L, M & P" + } +}, +"ordnance": { + "1": { + "old_caption": "^Mortaio da 45 \"Brixia\": This weapon,", + "new_caption": "Mortaio da 45 \"Brixia\"" + }, + "2": { + "old_caption": "^Mortaio da 81/14: First usedi", + "new_caption": "Mortaio da 81/14" + }, + "3": { + "old_caption": "^Fucile-cc S: Like several other", + "new_caption": "Fucile-cc S" + }, + "4": { + "old_caption": "^Cannone-cc da 37/45: This was", + "new_caption": "Cannone-cc da 37/45" + }, + "5": { + "old_caption": "^Cannone da 47/32: This was", + "new_caption": "Cannone da 47/32" + }, + "6": { + "old_caption": "^Cannone da 65/17: This was", + "new_caption": "Cannone da 65/17" + }, + "7": { + "old_caption": "^Cannone da 70/15: This", + "new_caption": "Cannone da 70/15" + }, + "8": { + "old_caption": "^Obice da 75/13: The Skoda", + "new_caption": "Obice da 75/13" + }, + "9": { + "old_caption": "^Cannone da 75/27: This was", + "new_caption": "Cannone da 75/27" + }, + "10": { + "old_caption": "^Obice da 75/18: This game piece", + "new_caption": "Obice da 75/18" + }, + "11": { + "old_caption": "^Cannone da 75/32: The 75/32", + "new_caption": "Cannone da 75/32" + }, + "12": { + "old_caption": "^Obice da 100/17: Another old", + "new_caption": "Obice da 100/17" + }, + "13": { + "old_caption": 
"^Cannone da 105/28: This was", + "new_caption": "Cannone da 105/28" + }, + "14": { + "old_caption": "^Obice da 149/13: This piece", + "new_caption": "Obice da 149/13" + }, + "15": { + "old_caption": "^Cannone da 149/35: Another", + "new_caption": "Cannone da 149/35" + }, + "16": { + "old_caption": "^Cannone da 149/40: To replace", + "new_caption": "Cannone da 149/40" + }, + "17": { + "old_caption": "^Cannone-mitragliera da 20/65: Thiswas", + "new_caption": "Cannone-mitragliera da 20/65" + }, + "18": { + "old_caption": "^Cannone-aa da 75/39: This was", + "new_caption": "Cannone-aa da 75/39" + }, + "add": [ + { "vo_note_id": "19", "caption": "Cannone-aa da 75/46", "page_no": 439, "page_pos": [283,42] }, + { "vo_note_id": "20", "caption": "Cannone-aa da 90/53", "page_no": 439, "page_pos": [384,541] } + ] +} +}, + +"japanese": { +"vehicles": { + "3": { + "old_caption": "Type 95 SO-KI Armored Railway Ve-hicle", + "new_caption": "Type 95 SO-KI Armored Railway Vehicle" + }, + "4": { + "old_caption": "Types 97A & 97B TE-KE Light Ar-mored Vehicles", + "new_caption": "Types 97A & 97B TE-KE Light Armored Vehicles" + }, + "7": { + "old_caption": "Types 89A & 89B CHI-RO MediumTanks", + "new_caption": "Types 89A & 89B CHI-RO Medium Tanks" + }, + "8": { + "old_caption": "Types 97A & 97B CHI-HA MediumTanks", + "new_caption": "Types 97A & 97B CHI-HA Medium Tanks" + }, + "13": { + "old_caption": "Type 4 HO-RO Self-Propelled How-itzer", + "new_caption": "Type 4 HO-RO Self-Propelled Howitzer" + }, + "14": { + "old_caption": "Type 1 HO-KI Armored Troop-Vehi-cle", + "new_caption": "Type 1 HO-KI Armored Troop-Vehicle" + } +}, +"ordnance": { + "2": { + "old_caption": "Year-11 Type Curved-Fire InfantryGun", + "new_caption": "Year-11 Type Curved-Fire Infantry Gun" + }, + "1": { + "old_caption": "Type 98 High-Angle Machine Can-non", + "new_vo_note_id": "21", + "new_caption": "Type 98 High-Angle Machine Cannon" + }, + "17": { + "old_caption": "Year-3 Type 14cm Naval SeacoastGun", + 
"new_caption": "Year-3 Type 14cm Naval Seacoast Gun" + }, + "20": { + "old_caption": "Type 93 Twin-Mount High-Angle Ma-chine Gun", + "new_caption": "Type 93 Twin-Mount High-Angle Machine Gun" + }, + "22": { + "old_caption": "Type 96 Single-, Twin-, & Triple-Mount Naval High-Angle Machine Can-", + "new_caption": "Type 96 Single-, Twin-, & Triple-Mount Naval High-Angle Machine Cannons" + }, + "24": { + "old_caption": "Year-10 Type 12cm Naval High-AngleGun", + "new_caption": "Year-10 Type 12cm Naval High-Angle Gun" + } +} +}, + +"chinese": { +"ordnance": { + "2": { + "old_caption": "Mortaio da 45 “Brixia”, 5cm leGrW 36,", + "new_caption": "Mortaio da 45 “Brixia”, 5cm leGrW 36, 50mm RM obr.38, & Type 89 Heavy Grenade Launcher" + }, + "4": { + "old_caption": "Stokes 3-in., 8cm GrW 34, & 82mmBM obr. 37", + "new_caption": "Stokes 3-in., 8cm GrW 34, & 82mm BM obr. 37" + }, + "7": { + "old_caption": "37mm PP obr. 15R & Cannone da", + "new_caption": "37mm PP obr. 15R & Cannone da 70/15" + }, + "11": { + "old_caption": "7.7cm FK 16, 76.2mm P obr. 02/30, &OQF 18-Pounder", + "new_caption": "7.7cm FK 16, 76.2mm P obr. 02/30, & OQF 18-Pounder" + }, + "12": { + "old_caption": "10.5cm leFH 16, Cannone da 105/28,& M2A1 105mm Howitzer", + "new_caption": "10.5cm leFH 16, Cannone da 105/28, & M2A1 105mm Howitzer" + }, + "15": { + "old_caption": "Oerlikon FF, Cannone-mitragliera da", + "new_caption": "Oerlikon FF, Cannone-mitragliera da 20/65, & 2cm FlaK 30" + }, + "16": { + "old_caption": "3.7cm FlaK 36 o. 37 & Bofors 40mmL/60", + "new_caption": "3.7cm FlaK 36 o. 
37 & Bofors 40mm L/60" + } +} +}, + +"french": { +"vehicles": { + "20": { + "old_caption": "Autocanon de 75 mle 97 & Autocanonde 75 Conus(b)", + "new_caption": "Autocanon de 75 mle 97 & Autocanon de 75 Conus(b)" + }, + "21": { + "old_caption": "Camion de Mitrailleuse Contre-Avions, Camion de 13.2 CAJ, Camion de", + "new_caption": "Camion de Mitrailleuse Contre-Avions, Camion de 13.2 CAJ, Camion de 20 CA, & Autocanon de 25 CA" + }, + "36": { + "old_caption": "Peugeot 202, Citroën 23, & RenaultAGR2", + "new_caption": "Peugeot 202, Citroën 23, & Renault AGR2" + }, + "40": { + "old_caption": "M4A3(75)W, M4A3(76)W, & M4A3(105) Medium Tanks, & M4Tankdozer", + "new_caption": "M4A3(75)W, M4A3(76)W, & M4A3(105) Medium Tanks, & M4 Tankdozer" + } +}, +"ordnance": { + "6": { + "old_caption": "Canon Antichar de 47SA mle 37 APX", + "new_caption": "Canon Antichar de 47 SA mle 37 APX" + }, + "18": { + "old_caption": "Mitrailleuse de 13.2 CAJmle 30", + "new_caption": "Mitrailleuse de 13.2 CAJ mle 30" + } +} +}, + +"allied-minor": { +"vehicles": { + "1": { + "old_caption": "TKS&TKS(L)", + "new_caption": "TKS & TKS(L)" + }, + "2": { + "old_caption": "VickersEdw(b)&Ejw(b)", + "new_caption": "Vickers Edw(b) & Ejw(b)" + }, + "3.7": { + "old_caption": "TPdw & 7TPjw", + "new_vo_note_id": "3", + "new_caption": "7TPdw & 7TPjw" + }, + "6": { + "old_caption": "wz.29", + "new_caption": "wz.29 \"Ursus\"" + }, + "10.302": { + "old_caption": "T", + "new_vo_note_id": "10", + "new_caption": "302T" + }, + "13": { + "old_caption": "Horse-Drawn", + "new_caption": "Horse-Drawn \"Taczanka\"" + }, + "22": { + "old_caption": "M3A3(a) FlaK38", + "new_caption": "M3A3(a) FlaK 38" + }, + "29": { + "old_caption": "Marmon-Herrington III(b) Armored", + "new_caption": "Marmon-Herrington III(b) Armored Cars" + }, + "31": { + "old_caption": "L5/30(i) & L3/35(i) &", + "new_caption": "L5/30(i) & L3/35(i) & L6/40(i) & M13/40(i)" + }, + "37": { + "old_caption": "Light Truck & Medium Truck &", + "new_caption": "Light 
Truck & Medium Truck & Heavy Truck" + } +}, +"ordnance": { + "30.75": { + "old_caption": "M 19S", + "new_vo_note_id": "30", + "new_caption": "75M 19S" + }, + "add": [ + { "vo_note_id": "20", "caption": "3.7cm Infantry Gun", "page_no": 502, "page_pos": [393,616] } + ] +} +}, + +"axis-minor": { +"vehicles": { + "7.38": { + "old_caption": "M Toldi I", + "new_vo_note_id": "7", + "new_caption": "38M Toldi I" + }, + "8.38": { + "old_caption": "M Toldi IIA", + "new_vo_note_id": "8", + "new_caption": "38M Toldi IIA" + }, + "9.40": { + "old_caption": "M Turan I(r)", + "new_vo_note_id": "9", + "new_caption": "40M Turan I(r)" + }, + "10.41": { + "old_caption": "M Turan II(r)", + "new_vo_note_id": "10", + "new_caption": "41M Turan II(r)" + }, + "13.43": { + "old_caption": "M Zrinyi II", + "new_vo_note_id": "13", + "new_caption": "43M Zrinyi II" + }, + "14.39": { + "old_caption": "M Csaba & 40M Csaba", + "new_vo_note_id": "14", + "new_caption": "39M Csaba & 40M Csaba" + }, + "16.40": { + "old_caption": "M Nimrod", + "new_vo_note_id": "16", + "new_caption": "40M Nimrod" + }, + "50": { + "old_caption": "Light Truck, Medium Truck, &Heavy Truck", + "new_caption": "Light Truck, Medium Truck, & Heavy Truck" + } +}, +"ordnance": { + "20": { + "old_caption": "Canon Automatique de 25 CAmle 38", + "new_caption": "Canon Automatique de 25 CA mle 38" + }, + "29": { + "old_caption": "47mm KanonPUV vz. 36(t)", + "new_caption": "47mm Kanon PUV vz. 
36(t)" + } +} +}, + +"finnish": { +"vehicles": { + "27": { + "old_caption": "Light Truck, Medium Truck, &Heavy Truck", + "new_caption": "Light Truck, Medium Truck, & Heavy Truck" + } +}, +"ordnance": { + "1.47": { + "old_caption": "Krh/41", + "new_vo_note_id": "1", + "new_caption": "47 Krh/41" + }, + "2.50": { + "old_caption": "Krh/39(r)", + "new_vo_note_id": "2", + "new_caption": "50 Krh/39(r)" + }, + "3.81": { + "old_caption": "Krh/32", + "new_vo_note_id": "3", + "new_caption": "81 Krh/32" + }, + "4.81": { + "old_caption": "Savunheitin M/42", + "new_vo_note_id": "4", + "new_caption": "81 Savunheitin M/42" + }, + "5.120": { + "old_caption": "Krh/40", + "new_vo_note_id": "5", + "new_caption": "120 Krh/40" + }, + "8.20": { + "old_caption": "PstK/40", + "new_vo_note_id": "8", + "new_caption": "20 PstK/40" + }, + "11.37": { + "old_caption": "PstK/37(g)", + "new_vo_note_id": "11", + "new_caption": "37 PstK/37(g)" + }, + "12.45": { + "old_caption": "PstK/32(r)", + "new_vo_note_id": "12", + "new_caption": "45 PstK/32(r)" + }, + "13.50": { + "old_caption": "PstK/38(g)", + "new_vo_note_id": "13", + "new_caption": "50 PstK/38(g)" + }, + "14.75": { + "old_caption": "PstK/97-38(g)", + "new_vo_note_id": "14", + "new_caption": "75 PstK/97-38(g)" + }, + "15.75": { + "old_caption": "PstK/40(g)", + "new_vo_note_id": "15", + "new_caption": "75 PstK/40(g)" + }, + "16.76": { + "old_caption": "RK/27(r)", + "new_vo_note_id": "16", + "new_caption": "76 RK/27(r)" + }, + "17.75": { + "old_caption": "K/02", + "new_vo_note_id": "17", + "new_caption": "75 K/02" + }, + "18.76": { + "old_caption": "LK/13", + "new_vo_note_id": "18", + "new_caption": "76 LK/13" + }, + "30.150": { + "old_caption": "H/40(g)", + "new_vo_note_id": "30", + "new_caption": "150 H/40(g)" + }, + "31.155": { + "old_caption": "H/17(f)", + "new_vo_note_id": "31", + "new_caption": "155 H/17(f)" + }, + "33.20": { + "old_caption": "ItK/30 BSW(g)", + "new_vo_note_id": "33", + "new_caption": "20 ItK/30 BSW(g)" + }, + "34.20": { 
+ "old_caption": "ItK/38 BSW(g)", + "new_vo_note_id": "34", + "new_caption": "20 ItK/38 BSW(g)" + }, + "35.20": { + "old_caption": "ItK/35 Br", + "new_vo_note_id": "35", + "new_caption": "20 ItK/35 Br" + }, + "36.20": { + "old_caption": "ItK/40 VKT", + "new_vo_note_id": "36", + "new_caption": "20 ItK/40 VKT" + }, + "38.76": { + "old_caption": "ItK/28 B(s)", + "new_vo_note_id": "38", + "new_caption": "76 ItK/28 B(s)" + }, + "39.76": { + "old_caption": "ItK/31(r)", + "new_vo_note_id": "39", + "new_caption": "76 ItK/31(r)" + } +} +}, + +"un-forces": { +"vehicles": { + "19": { + "old_caption": "M39 Armored Utility Vehicle & M39Mortar Carrier", + "new_caption": "M39 Armored Utility Vehicle & M39 Mortar Carrier" + }, + "7": { + "old_caption": "Churchill VII", + "new_vo_note_id": "37" + }, + "34": { + "old_caption": "M4A3E8(a) Medium Tank & M4A3E8Dozer(a)", + "new_caption": "M4A3E8(a) Medium Tank & M4A3E8 Dozer(a)" + }, + "47": { + "old_caption": "Oxford Carrier, MMG & Oxford Car-rier, HMG", + "new_caption": "Oxford Carrier, MMG & Oxford Carrier, HMG" + }, + "57": { + "old_caption": "Jeep, ⁄-Ton Truck, & 2½-Ton Truck:4", + "new_caption": "Jeep, ¾-Ton Truck, & 2½-Ton Truck" + } +} +}, + +"communist-forces": { +"ordnance": { + "32": { + "old_caption": "Type 93 Twin-Mount High-Angle Ma-chine Gun", + "new_caption": "Type 93 Twin-Mount High-Angle Machine Gun" + } +} +} + +} diff --git a/asl_rulebook2/extract/index.py b/asl_rulebook2/extract/index.py index e605fda..81dab12 100755 --- a/asl_rulebook2/extract/index.py +++ b/asl_rulebook2/extract/index.py @@ -146,9 +146,6 @@ class ExtractIndex( ExtractBase ): index_entry = self._make_index_entry( title, content ) if index_entry: self.index_entries.append( index_entry ) - # FUDGE! EX/EXC are mis-parsed as a single index entry - we correct that in the fixups, and here. 
- if title == "EX": - self.index_entries.append( self._make_index_entry( "EXC", "Exception" ) ) def _make_index_entry( self, title, content ): """Create a new index entry.""" diff --git a/asl_rulebook2/tests/test_extract.py b/asl_rulebook2/tests/test_extract.py index 808a1c0..e517152 100644 --- a/asl_rulebook2/tests/test_extract.py +++ b/asl_rulebook2/tests/test_extract.py @@ -52,10 +52,12 @@ def test_extract_content(): extract = ExtractContent( args={}, log=_check_log_msg ) extract.extract_content( pdf ) targets_buf, chapters_buf, footnotes_buf = io.StringIO(), io.StringIO(), io.StringIO() - extract.save_as_text( targets_buf, chapters_buf, footnotes_buf ) + vo_notes_buf = io.StringIO() + extract.save_as_text( targets_buf, chapters_buf, footnotes_buf, vo_notes_buf ) targets_buf = targets_buf.getvalue() chapters_buf = chapters_buf.getvalue() footnotes_buf = footnotes_buf.getvalue() + vo_notes_buf = vo_notes_buf.getvalue() # check the results fname2 = os.path.join( dname, "targets.txt" ) @@ -64,6 +66,8 @@ def test_extract_content(): assert open( fname2, "r", encoding="utf-8" ).read() == chapters_buf fname2 = os.path.join( dname, "footnotes.txt" ) assert open( fname2, "r", encoding="utf-8" ).read() == footnotes_buf + fname2 = os.path.join( dname, "vo-notes.txt" ) + assert open( fname2, "r", encoding="utf-8" ).read() == vo_notes_buf # run the test for_each_easlrb_version( do_test ) @@ -86,10 +90,12 @@ def test_extract_all(): extract.extract_index.save_as_json( index_buf ) index_buf = index_buf.getvalue() targets_buf, chapters_buf, footnotes_buf = io.StringIO(), io.StringIO(), io.StringIO() - extract.extract_content.save_as_json( targets_buf, chapters_buf, footnotes_buf ) + vo_notes_buf = io.StringIO() + extract.extract_content.save_as_json( targets_buf, chapters_buf, footnotes_buf, vo_notes_buf ) targets_buf = targets_buf.getvalue() chapters_buf = chapters_buf.getvalue() footnotes_buf = footnotes_buf.getvalue() + vo_notes_buf = vo_notes_buf.getvalue() # check the 
results fname2 = os.path.join( dname, "index.json" ) @@ -100,6 +106,8 @@ def test_extract_all(): assert open( fname2, "r", encoding="utf-8" ).read() == chapters_buf fname2 = os.path.join( dname, "footnotes.json" ) assert open( fname2, "r", encoding="utf-8" ).read() == footnotes_buf + fname2 = os.path.join( dname, "vo-notes.json" ) + assert open( fname2, "r", encoding="utf-8" ).read() == vo_notes_buf # run the test for_each_easlrb_version( do_test ) diff --git a/asl_rulebook2/webapp/prepare.py b/asl_rulebook2/webapp/prepare.py index 79bd8d4..b8a8d2c 100644 --- a/asl_rulebook2/webapp/prepare.py +++ b/asl_rulebook2/webapp/prepare.py @@ -114,28 +114,33 @@ def _do_prepare_data_files( args, download_url ): index_buf = io.StringIO() extract.extract_index.save_as_json( index_buf ) targets_buf, chapters_buf, footnotes_buf = io.StringIO(), io.StringIO(), io.StringIO() - extract.extract_content.save_as_json( targets_buf, chapters_buf, footnotes_buf ) + vo_notes_buf = io.StringIO() + extract.extract_content.save_as_json( targets_buf, chapters_buf, footnotes_buf, vo_notes_buf ) file_data = { "index": index_buf.getvalue(), "targets": targets_buf.getvalue(), "chapters": chapters_buf.getvalue(), "footnotes": footnotes_buf.getvalue(), + "vo-notes": vo_notes_buf.getvalue(), } # prepare the PDF gs_path = get_gs_path() if not gs_path: raise RuntimeError( "Ghostscript is not available." ) - with TempFile( mode="w", encoding="utf-8" ) as targets_file: + with TempFile( mode="w", encoding="utf-8" ) as targets_file, \ + TempFile( mode="w", encoding="utf-8" ) as vo_notes_file: log_msg( "status", "Preparing the final PDF..." 
) # save the extracted targets targets_file.temp_file.write( file_data["targets"] ) targets_file.close( delete=False ) + vo_notes_file.temp_file.write( file_data["vo-notes"] ) + vo_notes_file.close( delete=False ) # prepare the PDF prepared_file.close( delete=False ) prepare_pdf( input_file.name, "ASL Rulebook", - targets_file.name, 5, + targets_file.name, vo_notes_file.name, 5, prepared_file.name, "ebook", gs_path, log_msg diff --git a/asl_rulebook2/webapp/tests/test_prepare.py b/asl_rulebook2/webapp/tests/test_prepare.py index 92f9807..7d685f0 100644 --- a/asl_rulebook2/webapp/tests/test_prepare.py +++ b/asl_rulebook2/webapp/tests/test_prepare.py @@ -83,7 +83,7 @@ def test_full_prepare( webapp, webdriver ): with zipfile.ZipFile( io.BytesIO( zip_data ) ) as zip_file: assert set( zip_file.namelist() ) == set( [ "ASL Rulebook.pdf", "ASL Rulebook.index", - "ASL Rulebook.targets", "ASL Rulebook.chapters", "ASL Rulebook.footnotes" + "ASL Rulebook.targets", "ASL Rulebook.chapters", "ASL Rulebook.footnotes", "ASL Rulebook.vo-notes" ] ) assert zip_file.getinfo( "ASL Rulebook.pdf" ).file_size > 40*1000 for ftype in [ "index", "targets", "chapters", "footnotes" ]: diff --git a/doc/prepare.md b/doc/prepare.md index b79e600..029d087 100644 --- a/doc/prepare.md +++ b/doc/prepare.md @@ -18,6 +18,7 @@ The first step is to extract the information we need from the eASLRB PDF. --save-targets /tmp/prepared/ASL\ Rulebook.targets \ --save-chapters /tmp/prepared/ASL\ Rulebook.chapters \ --save-footnotes /tmp/prepared/ASL\ Rulebook.footnotes \ + --save-vo-notes /tmp/prepared/ASL\ Rulebook.vo-notes \ --progress ``` This extracts the information we need, and saves it in the 4 data files. @@ -29,6 +30,7 @@ Next, we need to prepare the eASLRB PDF, namely create bookmarks for each rule, asl_rulebook2/bin/prepare_pdf.py \ $EASLRB \ --targets /tmp/prepared/ASL\ Rulebook.targets \ + --vo-notes /tmp/prepared/ASL\ Rulebook.vo-notes \ --yoffset 5 \ --output /tmp/prepared.pdf \ --compression ebook \