Located each Chapter H vehicle/ordnance note.

master
Pacman Ghost 3 years ago
parent 463df8bb6c
commit cabab224e7
  1. 34
      asl_rulebook2/bin/prepare_pdf.py
  2. 12
      asl_rulebook2/extract/all.py
  3. 2
      asl_rulebook2/extract/base.py
  4. 280
      asl_rulebook2/extract/content.py
  5. 6
      asl_rulebook2/extract/data/index-fixups.json
  6. 3
      asl_rulebook2/extract/data/target-fixups.json
  7. 734
      asl_rulebook2/extract/data/vo-note-fixups.json
  8. 3
      asl_rulebook2/extract/index.py
  9. 12
      asl_rulebook2/tests/test_extract.py
  10. 11
      asl_rulebook2/webapp/prepare.py
  11. 2
      asl_rulebook2/webapp/tests/test_prepare.py
  12. 2
      doc/prepare.md

@ -23,12 +23,17 @@ _COMPRESSION_CHOICES = [
# ---------------------------------------------------------------------
def prepare_pdf( pdf_file, title, targets_fname, yoffset, output_fname, compression, gs_path, log_msg ):
def prepare_pdf( pdf_file, title, targets_fname, vo_notes_fname, yoffset, output_fname, compression, gs_path, log_msg ):
"""Prepare the MMP eASLRB PDF."""
# load the targets
with open( targets_fname, "r" ) as fp:
targets = json.load( fp )
if vo_notes_fname:
with open( vo_notes_fname, "r" ) as fp:
vo_notes_targets = json.load( fp )
else:
vo_notes_targets = None
with TempFile(mode="w") as compressed_file, TempFile(mode="w") as pdfmarks_file:
@ -49,6 +54,16 @@ def prepare_pdf( pdf_file, title, targets_fname, yoffset, output_fname, compress
)
pdf_file = compressed_file.name
def add_vo_notes_dests( key, vo_entries, yoffset, out ):
    """Write a named-destination pdfmark for each vehicle/ordnance note.

    Each destination is named "<key>:<note ID>". Entries that have no "pos"
    are written with "null" co-ordinates.
    """
    pdfmark_fmt = "[ /Dest /{} /Page {} /View [/XYZ {} {}] /DEST pdfmark"
    for note_id in vo_entries:
        entry = vo_entries[ note_id ]
        dest_name = "{}:{}".format( key, note_id )
        xpos, ypos = entry.get( "pos", ["null","null"] )
        # nb: only integer Y co-ordinates are shifted (a missing position stays as "null")
        ypos = ypos + yoffset if isinstance( ypos, int ) else ypos
        line = pdfmark_fmt.format( dest_name, entry["page_no"], xpos, ypos )
        print( line, file=out )
# generate the pdfmarks
log_msg( "progress", "Generating the pdfmarks..." )
if title:
@ -68,7 +83,15 @@ def prepare_pdf( pdf_file, title, targets_fname, yoffset, output_fname, compress
print( "[ /Dest /{} /Page {} /View [/XYZ {} {}] /DEST pdfmark".format(
ruleid, target["page_no"], xpos, ypos
), file=pdfmarks_file )
print( file=pdfmarks_file )
if vo_notes_targets:
print( file=pdfmarks_file )
for nat in vo_notes_targets:
if nat == "landing-craft":
add_vo_notes_dests( nat, vo_notes_targets[nat], yoffset, pdfmarks_file )
continue
for vo_type, vo_entries in vo_notes_targets[nat].items():
key = "{}_{}".format( nat, vo_type )
add_vo_notes_dests( key, vo_entries, yoffset, pdfmarks_file )
pdfmarks_file.close( delete=False )
# generate the pdfmark'ed document
@ -92,6 +115,9 @@ def prepare_pdf( pdf_file, title, targets_fname, yoffset, output_fname, compress
@click.option( "--targets","-t","targets_fname", required=True, type=click.Path(dir_okay=False),
help="Target definition file."
)
@click.option( "--vo-notes","vo_notes_fname", required=False, type=click.Path(dir_okay=False),
help="Vehicle/ordnance notes definition file."
)
@click.option( "--yoffset", default=5, help="Offset to add to y co-ordinates." )
@click.option( "--output","-o","output_fname", required=True, type=click.Path(dir_okay=False),
help="Output PDF file."
@ -101,7 +127,7 @@ def prepare_pdf( pdf_file, title, targets_fname, yoffset, output_fname, compress
)
@click.option( "--gs","gs_path", default="gs", help="Path to the Ghostscript executable." )
@click.option( "--progress","-p", is_flag=True, default=False, help="Log progress." )
def main( pdf_file, title, targets_fname, yoffset, output_fname, compression, gs_path, progress ):
def main( pdf_file, title, targets_fname, vo_notes_fname, yoffset, output_fname, compression, gs_path, progress ):
"""Prepare the MMP eASLRB PDF."""
# initialize
@ -113,7 +139,7 @@ def main( pdf_file, title, targets_fname, yoffset, output_fname, compression, gs
# prepare the PDF
prepare_pdf(
pdf_file, title,
targets_fname, yoffset,
targets_fname, vo_notes_fname, yoffset,
output_fname, compression,
gs_path,
log_msg

@ -128,8 +128,11 @@ class ExtractAll( ExtractBase ):
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-chapters","save_chapters_fname", required=True, help="Where to save the extracted chaopters." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
@click.option( "--save-vo-notes","save_vo_notes_fname", required=True,
help="Where to save the extracted vehicle/ordnance notes targets."
)
def main( pdf_file, args, progress, output_fmt,
save_index_fname, save_targets_fname, save_chapters_fname, save_footnotes_fname
save_index_fname, save_targets_fname, save_chapters_fname, save_footnotes_fname, save_vo_notes_fname
):
"""Extract everything we need from the MMP eASLRB."""
@ -147,9 +150,12 @@ def main( pdf_file, args, progress, output_fmt,
with open( save_index_fname, "w", encoding="utf-8" ) as index_out, \
open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
open( save_chapters_fname, "w", encoding="utf-8" ) as chapters_out, \
open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out, \
open( save_vo_notes_fname, "w", encoding="utf-8" ) as vo_notes_out:
getattr( extract.extract_index, "save_as_"+output_fmt )( index_out )
getattr( extract.extract_content, "save_as_"+output_fmt )( targets_out, chapters_out, footnotes_out )
getattr( extract.extract_content, "save_as_"+output_fmt )(
targets_out, chapters_out, footnotes_out, vo_notes_out
)
if __name__ == "__main__":
main() #pylint: disable=no-value-for-parameter

@ -38,7 +38,7 @@ class ExtractBase:
@staticmethod
def _is_bold( elem ):
"""Check if an element is using a bold font."""
return elem.fontname.endswith( ( "-Bold", ",Bold", "-BoldMT" ) )
return elem.fontname.endswith( ( ",Bold", "-BoldMT" ) ) or "-Bold" in elem.fontname
def log_msg( self, msg_type, msg, *args, **kwargs ):
"""Log a message."""

@ -5,6 +5,7 @@ import os
import json
import re
import math
from collections import defaultdict
import click
from pdfminer.layout import LTChar
@ -30,6 +31,8 @@ _DISABLE_SORT_ITEMS = [
"F20", "F21", # Chapter F footnotes
"G48", "G49", "G50", # Chapter G footnotes
"H9", # Chapter H footnotes
429,431,432,433,434,435, # Italian vehicle notes
436,437,438,439, # Italian ordnance notes
]
_DEFAULT_ARGS = {
@ -38,9 +41,44 @@ _DEFAULT_ARGS = {
"chapter-j": "593",
"chapter-w": "647-664",
"content_vp_left": 0, "content_vp_right": 565, "content_vp_top": 715, "content_vp_bottom": 28, # viewport
"disable-sort-items": ",".join( _DISABLE_SORT_ITEMS )
"disable-sort-items": ",".join( str(si) for si in _DISABLE_SORT_ITEMS )
}
# NOTE: The exact mappings here are actually not that important. What's important is:
# - the order of the nationality + V/O types
# - the page numbers themselves (so that they get parsed)
# Each entry is: [ nationality, V/O type, page numbers, check-sequence flag ]
# - the page numbers are passed to parse_page_numbers(), and each page is indexed as Chapter H
# - the sections are walked in order: _save_vo_note() moves on to the next entry each time it
#   sees a note whose base ID is 1 (after the very first note)
# - the flag is read by _save_vo_note() as "check_seq"; when it's set, a note is only kept if its ID
#   follows on from the previous one (e.g. 9 -> 10, or 9 -> 9.1), otherwise it's treated as junk
_VO_NOTE_SECTIONS = [
[ "german", "vehicles", "330,332,334-343", True ],
[ "german", "ordnance", "344-348", True ],
[ "russian", "vehicles", "348,350-355", True ],
[ "russian", "ordnance", "356-358", True ],
[ "russian", "vehicles", "362,364-368", False ],
[ "russian", "ordnance", "369", False ],
[ "american", "vehicles", "371,373-383", True ],
[ "american", "ordnance", "385-389", True ],
[ "british", "vehicles", "395,398-417", True ],
[ "british", "ordnance", "419-423", True ],
[ "italian", "vehicles", "429,431-435", True ],
[ "italian", "ordnance", "436-439", True ],
[ "japanese", "vehicles", "443-448", True ],
[ "japanese", "ordnance", "448-452", True ],
[ "chinese", "vehicles", "456-459", True ],
[ "chinese", "ordnance", "459-463", True ],
[ "landing-craft", "vehicles", "467-468", True ],
[ "french", "vehicles", "470,472-480", True ],
[ "french", "ordnance", "482-487", True ],
[ "allied-minor", "vehicles", "492-493,495-500", True ],
[ "allied-minor", "ordnance", "501-504", True ],
[ "axis-minor", "vehicles", "506,508-515", True ],
[ "axis-minor", "ordnance", "516,518-527", True ],
[ "finnish", "vehicles", "536,538-541", True ],
[ "finnish", "ordnance", "543,545-549", True ],
[ "un-forces", "vehicles", "554,556-565", True ],
[ "un-forces", "ordnance", "567-570", True ],
[ "communist-forces", "vehicles", "580", True ],
[ "communist-forces", "ordnance", "581-585", True ],
]
# ---------------------------------------------------------------------
class ExtractContent( ExtractBase ):
@ -51,6 +89,8 @@ class ExtractContent( ExtractBase ):
self.targets = {}
self._chapters = []
self._footnotes = {}
self._vo_notes = self._prev_vo_note_id = None
self._curr_vo_note_section = 0
self._curr_chapter = self._curr_footnote = self._curr_pageid = None
self._prev_elem = self._top_left_elem = None
# prepare to fixup problems in the content
@ -61,6 +101,7 @@ class ExtractContent( ExtractBase ):
self._target_fixups = load_fixup( "target-fixups.json" )
self._chapter_fixups = load_fixup( "chapter-fixups.json" )
self._footnote_fixups = load_fixup( "footnote-fixups.json" )
self._vo_note_fixups = load_fixup( "vo-note-fixups.json" )
def extract_content( self, pdf ):
"""Extract content from the MMP eASLRB."""
@ -78,6 +119,12 @@ class ExtractContent( ExtractBase ):
page_index[ page_no ] = chapter
disable_sort_items = set( self._args["disable-sort-items"].split( "," ) )
# include the pages for the Chapter H vehicle/ordnance notes
for _, _, page_nos, _ in _VO_NOTE_SECTIONS:
page_nos = parse_page_numbers( page_nos )
for page_no in page_nos:
page_index[ page_no ] = "H"
# initialize
self._curr_chapter = None
curr_chapter_pageno = None
@ -109,13 +156,18 @@ class ExtractContent( ExtractBase ):
self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page#
self._curr_chapter, curr_chapter_pageno
)
self.log_msg( "progress", "- Analyzing page {} ({}).", page_no, self._curr_pageid )
# NOTE: There have been so many extra pages added to Chapter H, there's no easy way to calculate
# the page ID. We could set up a table mapping physical page numbers to page ID's, but that's
# far more trouble than it's worth.
self.log_msg( "progress", "- Analyzing page {}{}.",
page_no, " ({})".format( self._curr_pageid ) if not self._curr_pageid.startswith("H") else ""
)
# process each element on the page
curr_caption = None
self._top_left_elem = self._prev_elem = None
elem_filter = lambda e: isinstance( e, LTChar )
sort_elems = self._curr_pageid not in disable_sort_items
sort_elems = self._curr_pageid not in disable_sort_items and str(page_no) not in disable_sort_items
for _, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):
# skip problematic elements
@ -137,7 +189,7 @@ class ExtractContent( ExtractBase ):
# check if we're currently extracting footnotes
if self._curr_footnote is not None:
self._on_footnote_elem( elem, lt_page )
self._on_footnote_elem( elem, lt_page, page_no )
self._prev_elem = elem
continue
@ -195,6 +247,8 @@ class ExtractContent( ExtractBase ):
self.log_msg( "warning", "Unused fixups: {}", self._target_fixups )
if self._footnote_fixups:
self.log_msg( "warning", "Unused fixups: {}", self._footnote_fixups )
if self._vo_note_fixups:
self.log_msg( "warning", "Unused fixups: {}", self._vo_note_fixups )
# extract the chapters
self._extract_chapters()
@ -215,7 +269,7 @@ class ExtractContent( ExtractBase ):
# yup - notify the main loop
self._curr_footnote = []
if elem:
self._on_footnote_elem( elem, lt_page )
self._on_footnote_elem( elem, lt_page, page_no )
return
# check if the entry needs to be fixed up
@ -266,14 +320,15 @@ class ExtractContent( ExtractBase ):
"raw_caption": orig_caption
}
def _on_footnote_elem( self, elem, lt_page ):
def _on_footnote_elem( self, elem, lt_page, page_no ):
"""Process an element while we're parsing footnotes."""
# check if we've found the start of a new footnote
if self._is_bold( elem ):
if elem.get_text().isdigit() and self._is_start_of_line( elem, lt_page ):
# yup - save the current footnote, start collecting the new one
self._save_footnote()
self._curr_footnote = [ elem.get_text(), "" ]
elem_pos = ( elem.x0, elem.y1 )
self._curr_footnote = [ elem.get_text(), "", page_no, elem_pos ]
else:
if self._curr_footnote[1]:
# FUDGE! Some footnote content has bold text hard-up at the left margin,
@ -334,10 +389,17 @@ class ExtractContent( ExtractBase ):
if footnote_id == "9" and "9" in footnote_ids:
footnote_id = "29"
# check if we've gone past the end of the Chapter H footnotes
if self._curr_chapter == "H" and len(footnote_id) > 1:
self._curr_footnote = None
return
if self._curr_chapter == "H":
# check if we've gone past the end of the Chapter H footnotes
if self._vo_notes is None and len(footnote_id) > 1:
# yup - start collecting vehicle/ordnance notes
self._vo_notes = defaultdict( lambda: defaultdict( list ) )
# check if we're collecting Chapter H vehicle/ordnance notes
if self._vo_notes is not None:
# yup - save the next entry (the "footnote" is actually a vehicle/ordnance note)
self._save_vo_note( footnote_id, self._curr_footnote[2], self._curr_footnote[3] )
self._curr_footnote = None
return
# clean up the content
content = re.sub( r"\s+", " ", content ).strip()
@ -415,6 +477,108 @@ class ExtractContent( ExtractBase ):
} )
self._curr_footnote = None
def _save_vo_note( self, caption, page_no, page_pos ):
"""Save an extracted vehicle/ordnance note."""
# Parameters:
# - caption: the raw text for the note; it must start with the note number (anything else is ignored)
# - page_no, page_pos: stored with the note, as its "page_no" and "page_pos"
# The note is appended to self._vo_notes, under the nationality + V/O type of the current
# entry in _VO_NOTE_SECTIONS. Fixups are removed from self._vo_note_fixups as they are used.
# NOTE: Some pieces of text cause the parsing code to go wonky (typically because it's seen
# a "1" and so thinks it's found the start of a new section), so we manually skip over these.
skips = self._vo_note_fixups.get( "skips", {} ).get( str(page_no) )
if skips:
for i, target in enumerate(skips):
if self._check_string( caption, target ):
# we've got a caption that should be skipped - remove it from the list, and return
# nb: deleting while enumerating is only OK because we stop iterating straight away
del skips[i]
if not skips:
del self._vo_note_fixups["skips"][ str(page_no) ]
if not self._vo_note_fixups["skips"]:
del self._vo_note_fixups["skips"]
return
# ignore captions that are nothing but a number (except on pages 354 and 417)
# NOTE(review): presumably those 2 pages have notes that get extracted with an empty caption - confirm
if caption.isdigit() and page_no not in (354, 417):
return
# nb: a fixup is looked up by note ID within the current section, and is only used (and then
# discarded) if the caption matches its "old_caption"
def apply_fixups( vo_note_id, caption ):
nat, vo_type, _, _ = _VO_NOTE_SECTIONS[ self._curr_vo_note_section ]
fixup = self._vo_note_fixups.get( nat, {} ).get( vo_type, {} ).get( vo_note_id )
if fixup:
if self._check_string( caption, fixup["old_caption"] ):
# remove the fixup
del self._vo_note_fixups[ nat ][ vo_type ][ vo_note_id ]
cleanup_fixups( nat, vo_type )
# apply the fixup
if "new_vo_note_id" in fixup:
vo_note_id = fixup["new_vo_note_id"]
if "new_caption" in fixup:
caption = fixup["new_caption"]
return vo_note_id, caption
# nb: empty containers are pruned, so that whatever is left in self._vo_note_fixups is an unused fixup
def cleanup_fixups( nat, vo_type ):
if nat not in self._vo_note_fixups:
return
if vo_type in self._vo_note_fixups[nat] and not self._vo_note_fixups[ nat ][ vo_type ]:
del self._vo_note_fixups[ nat ][ vo_type ]
if nat in self._vo_note_fixups and not self._vo_note_fixups[ nat ]:
del self._vo_note_fixups[ nat ]
# extract the note number and caption
# nb: the note ID is the leading run of digits and dots (it can't start with "0" or ".")
mo = re.search( r"^[1-9][0-9.]*", caption )
if not mo:
return
vo_note_id = mo.group()
caption = caption[ mo.end() : ].strip()
if vo_note_id.endswith( "." ):
vo_note_id = vo_note_id[:-1]
if caption.endswith( ":" ):
caption = caption[:-1].strip()
if caption.startswith( ( "cm ", "mm ", "pdr", "-cwt" ) ):
# FUDGE! Things like "5.1 2.2cm Big Gun" are getting parsed as "5.12.2" + "cm Big Gun" :-/
# nb: everything after the first dot is moved back to the front of the caption
pos = vo_note_id.find( "." )
if pos >= 0:
caption = vo_note_id[pos+1:] + caption
vo_note_id = vo_note_id[:pos]
# check for any fixups
vo_note_id, caption = apply_fixups( vo_note_id, caption )
# compare the note ID with the previous one
nat, vo_type, _, check_seq = _VO_NOTE_SECTIONS[ self._curr_vo_note_section ]
# nb: the base note ID is the integer part before the first dot (e.g. "9.1" -> 9)
def get_base_note_id( val ):
pos = val.find( "." )
return int( val[:pos] if pos >= 0 else val )
base_note_id = get_base_note_id( vo_note_id )
# nb: the very first note we see has no previous note ID, and so skips these checks
if self._prev_vo_note_id:
# check if we've found the start of the next section
if base_note_id == 1:
# yup - add any extra entries to the current section
# NOTE(review): "add" entries are only injected here (i.e. when the following section starts),
# so any that are configured for the last section would not get added by this method
add_vo_entries = self._vo_note_fixups.get( nat, {} ).get( vo_type, {} ).pop( "add", [] )
for vo_entry in add_vo_entries:
self._vo_notes[ nat ][ vo_type ].append( vo_entry )
cleanup_fixups( nat, vo_type )
# get the next nationality + V/O type
# NOTE(review): this raises an IndexError if we see more sections than _VO_NOTE_SECTIONS lists
self._curr_vo_note_section += 1
nat, vo_type, _, _ = _VO_NOTE_SECTIONS[ self._curr_vo_note_section ]
# check for any fixups
# nb: fixups are checked again, since the note now belongs to a different section
vo_note_id, caption = apply_fixups( vo_note_id, caption )
elif check_seq:
# compare the note ID with the previous one
prev_base_note_id = get_base_note_id( self._prev_vo_note_id )
if base_note_id == prev_base_note_id + 1:
pass # nb: this is the normal case, we've found the next V/O note
elif base_note_id == prev_base_note_id and "." in vo_note_id:
pass # nb: this is to allow things like "9.1" following "9"
else:
return # nb: we got some junk that can be ignored
# save the V/O note
self._vo_notes[ nat ][ vo_type ].append( {
"vo_note_id": vo_note_id, "caption": caption,
"page_no": page_no, "page_pos": page_pos
} )
if nat == "allied-minor" and vo_type == "ordnance" and vo_note_id == "19":
# FUDGE! Because we're not seeing Allied Minor Ordnance Note 20 :-/
# nb: pretending we saw it lets the next note (21) pass the sequence check
self._prev_vo_note_id = "20"
else:
self._prev_vo_note_id = vo_note_id
def _extract_chapters( self ):
"""Extract the chapters and their sections."""
@ -492,15 +656,15 @@ class ExtractContent( ExtractBase ):
return True # the element is at the top of the right column
return False
def save_as_raw( self, targets_out, chapters_out, footnotes_out ):
def save_as_raw( self, targets_out, chapters_out, footnotes_out, vo_notes_out ):
"""Save the raw results."""
self._save_as_raw_or_text( targets_out, chapters_out, footnotes_out, True )
self._save_as_raw_or_text( targets_out, chapters_out, footnotes_out, vo_notes_out, True )
def save_as_text( self, targets_out, chapters_out, footnotes_out ):
def save_as_text( self, targets_out, chapters_out, footnotes_out, vo_notes_out ):
"""Save the results as plain-text."""
self._save_as_raw_or_text( targets_out, chapters_out, footnotes_out, False )
self._save_as_raw_or_text( targets_out, chapters_out, footnotes_out, vo_notes_out, False )
def _save_as_raw_or_text( self, targets_out, chapters_out, footnotes_out, raw ):
def _save_as_raw_or_text( self, targets_out, chapters_out, footnotes_out, vo_notes_out, raw ):
"""Save the results as raw or plain-text."""
# save the targets
@ -511,7 +675,7 @@ class ExtractContent( ExtractBase ):
print( file=targets_out )
print( "=== p{} ===".format( target["page_no"] ), file=targets_out )
curr_page_no = target["page_no"]
xpos, ypos = self._get_target_pos( target )
xpos, ypos = self._get_page_pos( target["pos"] )
if raw:
print( "[{},{}] = {}".format(
xpos, ypos, target["raw_caption"]
@ -556,13 +720,35 @@ class ExtractContent( ExtractBase ):
print( " ; ".join( make_caption(c) for c in footnote["captions"] ), file=footnotes_out )
print( footnote["content"], file=footnotes_out )
def save_as_json( self, targets_out, chapters_out, footnotes_out ):
# save the vehicle/ordnance notes
first = True
for nat, vo_types in self._vo_notes.items():
for vo_type, vo_entries in vo_types.items():
if first:
first = False
else:
print( file=vo_notes_out )
print( "=== {} ===".format(
nat if nat == "landing-craft" else "{} {}".format( nat, vo_type )
), file=vo_notes_out )
for vo_entry in vo_entries:
if "page_pos" in vo_entry:
xpos, ypos = ExtractContent._get_page_pos( vo_entry["page_pos"] )
page_pos = "[{},{}]".format( xpos, ypos )
else:
page_pos = None
print( "{:<5} {} @p{}{}".format(
vo_entry["vo_note_id"]+":", vo_entry["caption"], vo_entry["page_no"],
":"+page_pos if page_pos else ""
), file=vo_notes_out )
def save_as_json( self, targets_out, chapters_out, footnotes_out, vo_notes_out ):
"""Save the results as JSON."""
# save the targets
targets, curr_chapter = [], None
for ruleid, target in self.targets.items():
xpos, ypos = self._get_target_pos( target )
xpos, ypos = self._get_page_pos( target["pos"] )
targets.append( "{}: {{ \"caption\": {}, \"page_no\": {}, \"pos\": [{},{}] }}".format(
jsonval( ruleid ),
jsonval(target["caption"]), target["page_no"], xpos, ypos
@ -613,12 +799,46 @@ class ExtractContent( ExtractBase ):
",\n\n".join( chapters )
), file=footnotes_out )
# save the vehicle/ordnance notes
vo_notes = []
for nat in self._vo_notes:
vo_types = []
for vo_type, vo_entries in self._vo_notes[nat].items():
entries = []
for vo_entry in vo_entries:
val = "{}: {{ \"caption\": {}, \"page_no\": {}".format(
jsonval(vo_entry["vo_note_id"]), jsonval(vo_entry["caption"]), jsonval(vo_entry["page_no"])
)
if "page_pos" in vo_entry:
xpos, ypos = self._get_page_pos( vo_entry["page_pos"] )
val += ", \"pos\": [{},{}]".format( xpos, ypos )
val += " }"
entries.append( " {}".format( val ) )
if nat == "landing-craft":
vo_types.append( ",\n".join( entries ) )
else:
vo_types.append( "{}: {{\n{}\n}}".format(
jsonval(vo_type), ",\n".join( entries )
) )
vo_notes.append( "{}: {{\n{}\n}}".format(
jsonval(nat), ",\n".join( vo_types )
) )
print( "{{\n\n{}\n\n}}".format(
",\n\n".join( vo_notes )
), file=vo_notes_out )
@staticmethod
def _get_target_pos( target ):
    """Return a target's page position as integers (X rounded down, Y rounded up)."""
    pos = target["pos"]
    return math.floor( pos[0] ), math.ceil( pos[1] )
def _check_string( val, target ):
    """Test a string against a target.

    A target that starts with "^" is a prefix match (using the rest of the target),
    any other target has to match the string exactly.
    """
    if not target.startswith( "^" ):
        return val == target
    return val.startswith( target[1:] )
@staticmethod
def _get_page_pos( pos ):
    """Convert an X/Y position to integer page co-ordinates (X rounded down, Y rounded up)."""
    xpos = math.floor( pos[0] )
    ypos = math.ceil( pos[1] )
    return xpos, ypos
# ---------------------------------------------------------------------
@ -632,7 +852,12 @@ class ExtractContent( ExtractBase ):
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-chapters","save_chapters_fname", required=True, help="Where to save the extracted chaopters." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, output_fmt, save_targets_fname, save_chapters_fname, save_footnotes_fname ):
@click.option( "--save-vo-notes","save_vo_notes_fname", required=True,
help="Where to save the extracted vehicle/ordnance notes."
)
def main( pdf_file, args, progress, output_fmt,
save_targets_fname, save_chapters_fname, save_footnotes_fname, save_vo_notes_fname
):
"""Extract content from the MMP eASLRB."""
# initialize
@ -651,8 +876,9 @@ def main( pdf_file, args, progress, output_fmt, save_targets_fname, save_chapter
# save the results
with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
open( save_chapters_fname, "w", encoding="utf-8" ) as chapters_out, \
open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
getattr( extract, "save_as_"+output_fmt, )( targets_out, chapters_out, footnotes_out )
open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out, \
open( save_vo_notes_fname, "w", encoding="utf-8" ) as vo_notes_out:
getattr( extract, "save_as_"+output_fmt, )( targets_out, chapters_out, footnotes_out, vo_notes_out )
if __name__ == "__main__":
main() #pylint: disable=no-value-for-parameter

@ -83,12 +83,6 @@
]
},
"EX": {
"old_content": "ExampleEXC: Exception",
"new_content": "Example",
"_comment_": "The code manually inserts an entry for EXC: Exception"
},
"Fortification": {
"replace": [
[ "[in BRT: SSR1 (BRT Sand: T3.2) (NA in Betio Piers: T9.2)]", "[in BRT: SSR1 (BRT Sand): T3.2] [in BRT: SSR1 (NA in Betio Piers): T9.2]" ]

@ -394,7 +394,8 @@
"10": { "new_ruleid": null },
"11": { "new_ruleid": null },
"12": { "new_ruleid": null },
"55": { "new_ruleid": null }
"55": { "new_ruleid": null },
"678876987": { "new_ruleid": null }
},
"H2": {

@ -0,0 +1,734 @@
{
"skips": {
"382": [ "1, 3" ],
"429": [ "^1,660,", "^1and Fiat 3000", "^9/43 armistice", "^4/41 (.9)" ],
"431": [ "^1, for East Africa", "^9/42 (1.4)," ],
"432": [ "^1 (l.2),", "^1. Sources vary" ],
"434": [ "1-", "^1.5 for 11/41-6/42," ],
"438": [ "^1/41-5/43" ],
"439": [ "1 (1", "^1/43 ( 1.2),", "^1/43 (1.3),", "^1/42-5/43." ],
"492": [ "1B11CE/FPNA", "1B11CE/FPNA" ],
"493": [ "1T", "1B" ],
"496": [ "1B" ],
"501": [ "1h-d" ],
"502": [ "1s5", "1s5" ],
"503": [ "1AP5", "1s6" ],
"504": [ "^1.3)" ],
"514": [ "1.4 for 45" ],
"556": [ "1#" ],
"560": [ "1, 3" ]
},
"german": {
"vehicles": {
"9.1": {
"old_caption": "FT-17 730(f) &",
"new_caption": "FT-17 730(f) & FT-17 730m(f)"
},
"22": {
"old_caption": "PzKpfw IVF: 1",
"new_caption": "PzKpfw IVF(1)"
},
"23": {
"old_caption": "PzKpfw IVF:2",
"new_caption": "PzKpfw IVF(2)"
},
"35": {
"old_caption": "StuG IIIG (L) &",
"new_caption": "StuG IIIG (L) & StuH 42 (L)"
},
"52": {
"old_caption": "JgdPz IV & JgdPz",
"new_caption": "JgdPz IV & JgdPz IV(L)"
},
"58": {
"old_caption": "SPW 250/sMG",
"new_caption": "SPW 250/sMG & 251/sMG"
},
"59": {
"old_caption": "SPW 250/7 &",
"new_caption": "SPW 250/7 & 251/2"
},
"72": {
"old_caption": "PSW 231(8 rad)",
"new_caption": "PSW 231(8 rad) & 232"
},
"85.37": {
"old_caption": "FlaK/Pz IV",
"new_vo_note_id": "85", "new_caption": "37 FlaK/Pz IV"
},
"92": {
"old_caption": "2cm & 3.7cm",
"new_caption": "2cm & 3.7cm FlaK LKW"
},
"96": {
"old_caption": "Opel 6700 &Buessing-NAG",
"new_caption": "Opel 6700 & Buessing-NAG 4500"
},
"add": [
{ "_comment_": "This gets parsed as '4' and '5.1 GSW 39H(f) PaK' :-/",
"vo_note_id": "45.1", "caption": "GSW 39H(f) PaK", "page_no": 337, "page_pos": [380,561]
},
{ "vo_note_id": "37.1", "caption": "Sturmtiger", "page_no": 532, "page_pos": [118,640] },
{ "vo_note_id": "88.1", "caption": "SdKfz 10/5", "page_no": 532, "page_pos": [399,713] }
]
}
},
"russian": {
"vehicles": {
"4": {
"old_caption": "T-60 M40 &",
"new_caption": "T-60 M40 & M42"
},
"6": {
"old_caption": "T-26 M33 & T-",
"new_caption": "T-26 M33 & T-26S M37/39"
},
"1": {
"old_caption": "T-28 M34",
"new_vo_note_id": "11"
},
"11.1": {
"old_caption": "T-28 M34(L) & 12.1. T-28E M40(L)",
"new_caption": "T-28 M34(L)"
},
"23": {
"old_caption": "KV-lE, KV-1 M41,",
"new_caption": "KV-1E, KV-1 M41, & KV-1 M-42"
},
"34": {
"old_caption": "ISU-122 & ISU-",
"new_caption": "ISU-122 & ISU-152"
},
"add": [
{ "vo_note_id": "12.1", "caption": "T-28E M40(L)", "page_no": 364, "page_pos": [394,289] }
]
}
},
"american": {
"vehicles": {
"14": {
"old_caption": "M4A3E2 & M4A3E2 (L) MediumTanks",
"new_caption": "M4A3E2 & M4A3E2 (L) Medium Tanks"
},
"17": {
"old_caption": "M4(105) & M4A3(105) MediumTanks",
"new_caption": "M4(105) & M4A3(105) Medium Tanks"
}
}
},
"british": {
"vehicles": {
"2": {
"old_caption": "(A17) Tetrarch & Tetrarch CS[Light Tanks Mk VII & Mk VII CS]",
"new_caption": "(A17) Tetrarch & Tetrarch CS [Light Tanks Mk VII & Mk VII CS]"
},
"6": {
"old_caption": "A9 & A9 CS [Cruiser Tanks Mk I& Mk I CS]",
"new_caption": "A9 & A9 CS [Cruiser Tanks Mk I & Mk I CS]"
},
"26": {
"old_caption": "(A12) Matilda II & II CS [In-fantry Tank Mk II]",
"new_caption": "(A12) Matilda II & II CS [Infantry Tank Mk II]"
},
"36": {
"old_caption": "Valentine & Churchill Bridgelay-ers",
"new_caption": "Valentine & Churchill Bridgelayers"
},
"45": {
"old_caption": "Humber III & Otter Light Re-connaissance Cars",
"new_caption": "Humber III & Otter Light Reconnaissance Cars"
},
"82": {
"old_caption": "",
"new_caption": "30-cwt Lorry"
},
"83": {
"old_caption": "",
"new_caption": "3-Ton Lorry"
}
}
},
"italian": {
"vehicles": {
"1": {
"old_caption": "LS/21 & LS/3",
"new_caption": "L5/21 & L5/30"
},
"2": {
"old_caption": "^L3/35: Derived from",
"new_caption": "L3/35"
},
"3": {
"old_caption": "^L3 aa: Some L3",
"new_caption": "L3 aa"
},
"4": {
"old_caption": "^L3 cc: During the early months",
"new_caption": "L3 cc"
},
"5": {
"old_caption": "^L3 Lf: Development of",
"new_caption": "L3 Lf"
},
"6": {
"old_caption": "^L6/40: Designed to replace",
"new_caption": "L6/40"
},
"7": {
"old_caption": "^Mll/39: This tank carried",
"new_caption": "M11/39"
},
"8": {
"old_caption": "^Ml3/40: Replacing the",
"new_caption": "M13/40"
},
"9": {
"old_caption": "^M14/41: This tank,",
"new_caption": "M14/41"
},
"10": {
"old_caption": "^M15/42: This, the last version",
"new_caption": "M15/42"
},
"11": {
"old_caption": "^MR/35(f): The Germans provided",
"new_caption": "MR/35(f)"
},
"12": {
"old_caption": "Semovente M40 & M41 da",
"new_caption": "Semovente M40 & M41 da 75/18"
},
"13": {
"old_caption": "^Semovente M42 da 75/1&75/32: The last model",
"new_caption": "Semovente M42 da 75/18 & 75/32"
},
"14": {
"old_caption": "^Semovente M43 da 105/25: Nicknathe",
"new_caption": "Semovente M43 da 105/25"
},
"15": {
"old_caption": "Semovente L40 da 47/32: The SMV",
"new_caption": "Semovente L40 da 47/32"
},
"16": {
"old_caption": "^Semovente M41M da 90/53: This AFV",
"new_caption": "Semovente M41M da 90/53"
},
"18": {
"old_caption": "^Lince: The Lince (Lynx)",
"new_caption": "Lince"
},
"19": {
"old_caption": "^Lancia lZM: In late 1912",
"new_caption": "Lancia 1ZM"
},
"20": {
"old_caption": "^Fiat 611A & 611BThese armoredcars",
"new_caption": "Fiat 611A & 611B"
},
"21": {
"old_caption": "^AB 40 & AB41These two auto",
"new_caption": "AB 40 & AB 41"
},
"22": {
"old_caption": "^Autoprotetto S37: This APC",
"new_caption": "Autoprotetto S37"
},
"23": {
"old_caption": "Autocannoni da",
"new_caption": "Autocannoni da 20/65(b) & 65/17(b)"
},
"24": {
"old_caption": "Autocannoni da",
"new_caption": "Autocannoni da 75/27 CK & 90/53"
},
"25": {
"old_caption": "^TL 37, TM 40 &TP 32",
"new_caption": "TL 37, TM 40 & TP 32"
},
"26": {
"old_caption": "^Autocarretta: As the portee",
"new_caption": "Autocarretta"
},
"27": {
"old_caption": "^Fiat 508 MC: Derived from",
"new_caption": "Fiat 508 MC"
},
"28": {
"old_caption": "^Autocarri L, M & P: The ItalianArmy",
"new_caption": "Autocarri L, M & P"
}
},
"ordnance": {
"1": {
"old_caption": "^Mortaio da 45 \"Brixia\": This weapon,",
"new_caption": "Mortaio da 45 \"Brixia\""
},
"2": {
"old_caption": "^Mortaio da 81/14: First usedi",
"new_caption": "Mortaio da 81/14"
},
"3": {
"old_caption": "^Fucile-cc S: Like several other",
"new_caption": "Fucile-cc S"
},
"4": {
"old_caption": "^Cannone-cc da 37/45: This was",
"new_caption": "Cannone-cc da 37/45"
},
"5": {
"old_caption": "^Cannone da 47/32: This was",
"new_caption": "Cannone da 47/32"
},
"6": {
"old_caption": "^Cannone da 65/17: This was",
"new_caption": "Cannone da 65/17"
},
"7": {
"old_caption": "^Cannone da 70/15: This",
"new_caption": "Cannone da 70/15"
},
"8": {
"old_caption": "^Obice da 75/13: The Skoda",
"new_caption": "Obice da 75/13"
},
"9": {
"old_caption": "^Cannone da 75/27: This was",
"new_caption": "Cannone da 75/27"
},
"10": {
"old_caption": "^Obice da 75/18: This game piece",
"new_caption": "Obice da 75/18"
},
"11": {
"old_caption": "^Cannone da 75/32: The 75/32",
"new_caption": "Cannone da 75/32"
},
"12": {
"old_caption": "^Obice da 100/17: Another old",
"new_caption": "Obice da 100/17"
},
"13": {
"old_caption": "^Cannone da 105/28: This was",
"new_caption": "Cannone da 105/28"
},
"14": {
"old_caption": "^Obice da 149/13: This piece",
"new_caption": "Obice da 149/13"
},
"15": {
"old_caption": "^Cannone da 149/35: Another",
"new_caption": "Cannone da 149/35"
},
"16": {
"old_caption": "^Cannone da 149/40: To replace",
"new_caption": "Cannone da 149/40"
},
"17": {
"old_caption": "^Cannone-mitragliera da 20/65: Thiswas",
"new_caption": "Cannone-mitragliera da 20/65"
},
"18": {
"old_caption": "^Cannone-aa da 75/39: This was",
"new_caption": "Cannone-aa da 75/39"
},
"add": [
{ "vo_note_id": "19", "caption": "Cannone-aa da 75/46", "page_no": 439, "page_pos": [283,42] },
{ "vo_note_id": "20", "caption": "Cannone-aa da 90/53", "page_no": 439, "page_pos": [384,541] }
]
}
},
"japanese": {
"vehicles": {
"3": {
"old_caption": "Type 95 SO-KI Armored Railway Ve-hicle",
"new_caption": "Type 95 SO-KI Armored Railway Vehicle"
},
"4": {
"old_caption": "Types 97A & 97B TE-KE Light Ar-mored Vehicles",
"new_caption": "Types 97A & 97B TE-KE Light Armored Vehicles"
},
"7": {
"old_caption": "Types 89A & 89B CHI-RO MediumTanks",
"new_caption": "Types 89A & 89B CHI-RO Medium Tanks"
},
"8": {
"old_caption": "Types 97A & 97B CHI-HA MediumTanks",
"new_caption": "Types 97A & 97B CHI-HA Medium Tanks"
},
"13": {
"old_caption": "Type 4 HO-RO Self-Propelled How-itzer",
"new_caption": "Type 4 HO-RO Self-Propelled Howitzer"
},
"14": {
"old_caption": "Type 1 HO-KI Armored Troop-Vehi-cle",
"new_caption": "Type 1 HO-KI Armored Troop-Vehicle"
}
},
"ordnance": {
"2": {
"old_caption": "Year-11 Type Curved-Fire InfantryGun",
"new_caption": "Year-11 Type Curved-Fire Infantry Gun"
},
"1": {
"old_caption": "Type 98 High-Angle Machine Can-non",
"new_vo_note_id": "21",
"new_caption": "Type 98 High-Angle Machine Cannon"
},
"17": {
"old_caption": "Year-3 Type 14cm Naval SeacoastGun",
"new_caption": "Year-3 Type 14cm Naval Seacoast Gun"
},
"20": {
"old_caption": "Type 93 Twin-Mount High-Angle Ma-chine Gun",
"new_caption": "Type 93 Twin-Mount High-Angle Machine Gun"
},
"22": {
"old_caption": "Type 96 Single-, Twin-, & Triple-Mount Naval High-Angle Machine Can-",
"new_caption": "Type 96 Single-, Twin-, & Triple-Mount Naval High-Angle Machine Cannons"
},
"24": {
"old_caption": "Year-10 Type 12cm Naval High-AngleGun",
"new_caption": "Year-10 Type 12cm Naval High-Angle Gun"
}
}
},
"chinese": {
"ordnance": {
"2": {
"old_caption": "Mortaio da 45 “Brixia”, 5cm leGrW 36,",
"new_caption": "Mortaio da 45 “Brixia”, 5cm leGrW 36, 50mm RM obr.38, & Type 89 Heavy Grenade Launcher"
},
"4": {
"old_caption": "Stokes 3-in., 8cm GrW 34, & 82mmBM obr. 37",
"new_caption": "Stokes 3-in., 8cm GrW 34, & 82mm BM obr. 37"
},
"7": {
"old_caption": "37mm PP obr. 15R & Cannone da",
"new_caption": "37mm PP obr. 15R & Cannone da 70/15"
},
"11": {
"old_caption": "7.7cm FK 16, 76.2mm P obr. 02/30, &OQF 18-Pounder",
"new_caption": "7.7cm FK 16, 76.2mm P obr. 02/30, & OQF 18-Pounder"
},
"12": {
"old_caption": "10.5cm leFH 16, Cannone da 105/28,& M2A1 105mm Howitzer",
"new_captipn": "10.5cm leFH 16, Cannone da 105/28, & M2A1 105mm Howitzer"
},
"15": {
"old_caption": "Oerlikon FF, Cannone-mitragliera da",
"new_caption": "Oerlikon FF, Cannone-mitragliera da 20/65, & 2cm FlaK 30"
},
"16": {
"old_caption": "3.7cm FlaK 36 o. 37 & Bofors 40mmL/60",
"new_caption": "3.7cm FlaK 36 o. 37 & Bofors 40mm L/60"
}
}
},
"french": {
"vehicles": {
"20": {
"old_caption": "Autocanon de 75 mle 97 & Autocanonde 75 Conus(b)",
"new_caption": "Autocanon de 75 mle 97 & Autocanon de 75 Conus(b)"
},
"21": {
"old_caption": "Camion de Mitrailleuse Contre-Avions, Camion de 13.2 CAJ, Camion de",
"new_caption": "Camion de Mitrailleuse Contre-Avions, Camion de 13.2 CAJ, Camion de 20 CA, & Autocanon de 25 CA"
},
"36": {
"old_caption": "Peugeot 202, Citroën 23, & RenaultAGR2",
"new_caption": "Peugeot 202, Citroën 23, & Renault AGR2"
},
"40": {
"old_caption": "M4A3(75)W, M4A3(76)W, & M4A3(105) Medium Tanks, & M4Tankdozer",
"new_caption": "M4A3(75)W, M4A3(76)W, & M4A3(105) Medium Tanks, & M4 Tankdozer"
}
},
"ordnance": {
"6": {
"old_caption": "Canon Antichar de 47SA mle 37 APX",
"new_caption": "Canon Antichar de 47 SA mle 37 APX"
},
"18": {
"old_caption": "Mitrailleuse de 13.2 CAJmle 30",
"new_caption": "Mitrailleuse de 13.2 CAJ mle 30"
}
}
},
"allied-minor": {
"vehicles": {
"1": {
"old_caption": "TKS&TKS(L)",
"new_caption": "TKS & TKS(L)"
},
"2": {
"old_caption": "VickersEdw(b)&Ejw(b)",
"new_caption": "Vickers Edw(b) & Ejw(b)"
},
"3.7": {
"old_caption": "TPdw & 7TPjw",
"new_vo_note_id": "3",
"new_caption": "7TPdw & 7TPjw"
},
"6": {
"old_caption": "wz.29",
"new_caption": "wz.29 \"Ursus\""
},
"10.302": {
"old_caption": "T",
"new_vo_note_id": "10",
"new_caption": "302T"
},
"13": {
"old_caption": "Horse-Drawn",
"new_caption": "Horse-Drawn \"Taczanka\""
},
"22": {
"old_caption": "M3A3(a) FlaK38",
"new_caption": "M3A3(a) FlaK 38"
},
"29": {
"old_caption": "Marmon-Herrington III(b) Armored",
"new_caption": "Marmon-Herrington III(b) Armored Cars"
},
"31": {
"old_caption": "L5/30(i) & L3/35(i) &",
"new_caption": "L5/30(i) & L3/35(i) & L6/40(i) & M13/40(i)"
},
"37": {
"old_caption": "Light Truck & Medium Truck &",
"new_caption": "Light Truck & Medium Truck & Heavy Truck"
}
},
"ordnance": {
"30.75": {
"old_caption": "M 19S",
"new_vo_note_id": "30",
"new_caption": "75M 19S"
},
"add": [
{ "vo_note_id": "20", "caption": "3.7cm Infantry Gun", "page_no": 502, "page_pos": [393,616] }
]
}
},
"axis-minor": {
"vehicles": {
"7.38": {
"old_caption": "M Toldi I",
"new_vo_note_id": "7",
"new_caption": "38M Toldi I"
},
"8.38": {
"old_caption": "M Toldi IIA",
"new_vo_note_id": "8",
"new_caption": "38M Toldi IIA"
},
"9.40": {
"old_caption": "M Turan I(r)",
"new_vo_note_id": "9",
"new_caption": "40M Turan I(r)"
},
"10.41": {
"old_caption": "M Turan II(r)",
"new_vo_note_id": "10",
"new_caption": "41M Turan II(r)"
},
"13.43": {
"old_caption": "M Zrinyi II",
"new_vo_note_id": "13",
"new_caption": "43M Zrinyi II"
},
"14.39": {
"old_caption": "M Csaba & 40M Csaba",
"new_vo_note_id": "14",
"new_caption": "39M Csaba & 40M Csaba"
},
"16.40": {
"old_caption": "M Nimrod",
"new_vo_note_id": "16",
"new_caption": "40M Nimrod"
},
"50": {
"old_caption": "Light Truck, Medium Truck, &Heavy Truck",
"new_caption": "Light Truck, Medium Truck, & Heavy Truck"
}
},
"ordnance": {
"20": {
"old_caption": "Canon Automatique de 25 CAmle 38",
"new_caption": "Canon Automatique de 25 CA mle 38"
},
"29": {
"old_caption": "47mm KanonPUV vz. 36(t)",
"new_caption": "47mm Kanon PUV vz. 36(t)"
}
}
},
"finnish": {
"vehicles": {
"27": {
"old_caption": "Light Truck, Medium Truck, &Heavy Truck",
"new_caption": "Light Truck, Medium Truck, & Heavy Truck"
}
},
"ordnance": {
"1.47": {
"old_caption": "Krh/41",
"new_vo_note_id": "1",
"new_caption": "47 Krh/41"
},
"2.50": {
"old_caption": "Krh/39(r)",
"new_vo_note_id": "2",
"new_caption": "50 Krh/39(r)"
},
"3.81": {
"old_caption": "Krh/32",
"new_vo_note_id": "3",
"new_caption": "81 Krh/32"
},
"4.81": {
"old_caption": "Savunheitin M/42",
"new_vo_note_id": "4",
"new_caption": "81 Savunheitin M/42"
},
"5.120": {
"old_caption": "Krh/40",
"new_vo_note_id": "5",
"new_caption": "120 Krh/40"
},
"8.20": {
"old_caption": "PstK/40",
"new_vo_note_id": "8",
"new_caption": "20 PstK/40"
},
"11.37": {
"old_caption": "PstK/37(g)",
"new_vo_note_id": "11",
"new_caption": "37 PstK/37(g)"
},
"12.45": {
"old_caption": "PstK/32(r)",
"new_vo_note_id": "12",
"new_caption": "45 PstK/32(r)"
},
"13.50": {
"old_caption": "PstK/38(g)",
"new_vo_note_id": "13",
"new_caption": "50 PstK/38(g)"
},
"14.75": {
"old_caption": "PstK/97-38(g)",
"new_vo_note_id": "14",
"new_caption": "75 PstK/97-38(g)"
},
"15.75": {
"old_caption": "PstK/40(g)",
"new_vo_note_id": "15",
"new_caption": "75 PstK/40(g)"
},
"16.76": {
"old_caption": "RK/27(r)",
"new_vo_note_id": "16",
"new_caption": "76 RK/27(r)"
},
"17.75": {
"old_caption": "K/02",
"new_vo_note_id": "17",
"new_caption": "75 K/02"
},
"18.76": {
"old_caption": "LK/13",
"new_vo_note_id": "18",
"new_caption": "76 LK/13"
},
"30.150": {
"old_caption": "H/40(g)",
"new_vo_note_id": "30",
"new_caption": "150 H/40(g)"
},
"31.155": {
"old_caption": "H/17(f)",
"new_vo_note_id": "31",
"new_caption": "155 H/17(f)"
},
"33.20": {
"old_caption": "ItK/30 BSW(g)",
"new_vo_note_id": "33",
"new_caption": "20 ItK/30 BSW(g)"
},
"34.20": {
"old_caption": "ItK/38 BSW(g)",
"new_vo_note_id": "34",
"new_caption": "20 ItK/38 BSW(g)"
},
"35.20": {
"old_caption": "ItK/35 Br",
"new_vo_note_id": "35",
"new_caption": "20 ItK/35 Br"
},
"36.20": {
"old_caption": "ItK/40 VKT",
"new_vo_note_id": "36",
"new_caption": "20 ItK/40 VKT"
},
"38.76": {
"old_caption": "ItK/28 B(s)",
"new_vo_note_id": "38",
"new_caption": "76 ItK/28 B(s)"
},
"39.76": {
"old_caption": "ItK/31(r)",
"new_vo_note_id": "39",
"new_caption": "76 ItK/31(r)"
}
}
},
"un-forces": {
"vehicles": {
"19": {
"old_caption": "M39 Armored Utility Vehicle & M39Mortar Carrier",
"new_caption": "M39 Armored Utility Vehicle & M39 Mortar Carrier"
},
"7": {
"old_caption": "Churchill VII",
"new_vo_note_id": "37"
},
"34": {
"old_caption": "M4A3E8(a) Medium Tank & M4A3E8Dozer(a)",
"new_caption": "M4A3E8(a) Medium Tank & M4A3E8 Dozer(a)"
},
"47": {
"old_caption": "Oxford Carrier, MMG & Oxford Car-rier, HMG",
"new_caption": "Oxford Carrier, MMG & Oxford Carrier, HMG"
},
"57": {
"old_caption": "Jeep, ⁄-Ton Truck, & 2½-Ton Truck:4",
"new_caption": "Jeep, ¾-Ton Truck, & 2½-Ton Truck"
}
}
},
"communist-forces": {
"ordnance": {
"32": {
"old_caption": "Type 93 Twin-Mount High-Angle Ma-chine Gun",
"new_caption": "Type 93 Twin-Mount High-Angle Machine Gun"
}
}
}
}

@ -146,9 +146,6 @@ class ExtractIndex( ExtractBase ):
index_entry = self._make_index_entry( title, content )
if index_entry:
self.index_entries.append( index_entry )
# FUDGE! EX/EXC are mis-parsed as a single index entry - we correct that in the fixups, and here.
if title == "EX":
self.index_entries.append( self._make_index_entry( "EXC", "Exception" ) )
def _make_index_entry( self, title, content ):
"""Create a new index entry."""

@ -52,10 +52,12 @@ def test_extract_content():
extract = ExtractContent( args={}, log=_check_log_msg )
extract.extract_content( pdf )
targets_buf, chapters_buf, footnotes_buf = io.StringIO(), io.StringIO(), io.StringIO()
extract.save_as_text( targets_buf, chapters_buf, footnotes_buf )
vo_notes_buf = io.StringIO()
extract.save_as_text( targets_buf, chapters_buf, footnotes_buf, vo_notes_buf )
targets_buf = targets_buf.getvalue()
chapters_buf = chapters_buf.getvalue()
footnotes_buf = footnotes_buf.getvalue()
vo_notes_buf = vo_notes_buf.getvalue()
# check the results
fname2 = os.path.join( dname, "targets.txt" )
@ -64,6 +66,8 @@ def test_extract_content():
assert open( fname2, "r", encoding="utf-8" ).read() == chapters_buf
fname2 = os.path.join( dname, "footnotes.txt" )
assert open( fname2, "r", encoding="utf-8" ).read() == footnotes_buf
fname2 = os.path.join( dname, "vo-notes.txt" )
assert open( fname2, "r", encoding="utf-8" ).read() == vo_notes_buf
# run the test
for_each_easlrb_version( do_test )
@ -86,10 +90,12 @@ def test_extract_all():
extract.extract_index.save_as_json( index_buf )
index_buf = index_buf.getvalue()
targets_buf, chapters_buf, footnotes_buf = io.StringIO(), io.StringIO(), io.StringIO()
extract.extract_content.save_as_json( targets_buf, chapters_buf, footnotes_buf )
vo_notes_buf = io.StringIO()
extract.extract_content.save_as_json( targets_buf, chapters_buf, footnotes_buf, vo_notes_buf )
targets_buf = targets_buf.getvalue()
chapters_buf = chapters_buf.getvalue()
footnotes_buf = footnotes_buf.getvalue()
vo_notes_buf = vo_notes_buf.getvalue()
# check the results
fname2 = os.path.join( dname, "index.json" )
@ -100,6 +106,8 @@ def test_extract_all():
assert open( fname2, "r", encoding="utf-8" ).read() == chapters_buf
fname2 = os.path.join( dname, "footnotes.json" )
assert open( fname2, "r", encoding="utf-8" ).read() == footnotes_buf
fname2 = os.path.join( dname, "vo-notes.json" )
assert open( fname2, "r", encoding="utf-8" ).read() == vo_notes_buf
# run the test
for_each_easlrb_version( do_test )

@ -114,28 +114,33 @@ def _do_prepare_data_files( args, download_url ):
index_buf = io.StringIO()
extract.extract_index.save_as_json( index_buf )
targets_buf, chapters_buf, footnotes_buf = io.StringIO(), io.StringIO(), io.StringIO()
extract.extract_content.save_as_json( targets_buf, chapters_buf, footnotes_buf )
vo_notes_buf = io.StringIO()
extract.extract_content.save_as_json( targets_buf, chapters_buf, footnotes_buf, vo_notes_buf )
file_data = {
"index": index_buf.getvalue(),
"targets": targets_buf.getvalue(),
"chapters": chapters_buf.getvalue(),
"footnotes": footnotes_buf.getvalue(),
"vo-notes": vo_notes_buf.getvalue(),
}
# prepare the PDF
gs_path = get_gs_path()
if not gs_path:
raise RuntimeError( "Ghostscript is not available." )
with TempFile( mode="w", encoding="utf-8" ) as targets_file:
with TempFile( mode="w", encoding="utf-8" ) as targets_file, \
TempFile( mode="w", encoding="utf-8" ) as vo_notes_file:
log_msg( "status", "Preparing the final PDF..." )
# save the extracted targets
targets_file.temp_file.write( file_data["targets"] )
targets_file.close( delete=False )
vo_notes_file.temp_file.write( file_data["vo-notes"] )
vo_notes_file.close( delete=False )
# prepare the PDF
prepared_file.close( delete=False )
prepare_pdf( input_file.name,
"ASL Rulebook",
targets_file.name, 5,
targets_file.name, vo_notes_file.name, 5,
prepared_file.name, "ebook",
gs_path,
log_msg

@ -83,7 +83,7 @@ def test_full_prepare( webapp, webdriver ):
with zipfile.ZipFile( io.BytesIO( zip_data ) ) as zip_file:
assert set( zip_file.namelist() ) == set( [
"ASL Rulebook.pdf", "ASL Rulebook.index",
"ASL Rulebook.targets", "ASL Rulebook.chapters", "ASL Rulebook.footnotes"
"ASL Rulebook.targets", "ASL Rulebook.chapters", "ASL Rulebook.footnotes", "ASL Rulebook.vo-notes"
] )
assert zip_file.getinfo( "ASL Rulebook.pdf" ).file_size > 40*1000
for ftype in [ "index", "targets", "chapters", "footnotes" ]:

@ -18,6 +18,7 @@ The first step is to extract the information we need from the eASLRB PDF.
--save-targets /tmp/prepared/ASL\ Rulebook.targets \
--save-chapters /tmp/prepared/ASL\ Rulebook.chapters \
--save-footnotes /tmp/prepared/ASL\ Rulebook.footnotes \
--save-vo-notes /tmp/prepared/ASL\ Rulebook.vo-notes \
--progress
```
This extracts the information we need, and saves it in the 5 data files.
@ -29,6 +30,7 @@ Next, we need to prepare the eASLRB PDF, namely create bookmarks for each rule,
asl_rulebook2/bin/prepare_pdf.py \
$EASLRB \
--targets /tmp/prepared/ASL\ Rulebook.targets \
--vo-notes /tmp/prepared/ASL\ Rulebook.vo-notes \
--yoffset 5 \
--output /tmp/prepared.pdf \
--compression ebook \

Loading…
Cancel
Save