#!/usr/bin/env python3
""" Extract content from the MMP eASLRB. """

import os
import json
import re
import math
from collections import defaultdict

import click
from pdfminer.layout import LTChar

from asl_rulebook2.extract.base import ExtractBase
from asl_rulebook2.pdf import PdfDoc, PageIterator, PageElemIterator
from asl_rulebook2.utils import parse_page_numbers, fixup_text, append_text, remove_trailing, jsonval, \
    log_msg_stderr

# NOTE: Characters are laid out individually on the page, and we generally want to process them top-to-bottom,
# left-to-right, but in some cases, alignment is messed up (e.g. the bounding boxes don't line up properly
# and e.g. the first part of a sentence is infinitesimally lower down than the rest of the sentence, and so
# appears later in the sort order), and we get better results if we process characters in the order in which
# they appear in the PDF document.
_DISABLE_SORT_ITEMS = [
    "B40", # nb: to detect B31.1 NARROW STREET
    "A16",
    "A58","A59","A60", # Chapter A footnotes (nb: page A61 is a mess wrt element order :-/)
    "B1",
    "B45", "B46", # Chapter B footnotes
    "C25", "C26", # Chapter C footnotes
    "D27", # Chapter D footnotes
    "E28", "E29", "E30", # Chapter E footnotes
    "F20", "F21", # Chapter F footnotes
    "G48", "G49", "G50", # Chapter G footnotes
    "H9", # Chapter H footnotes
    429,431,432,433,434,435, # Italian vehicle notes
    436,437,438,439, # Italian ordnance notes
]

_DEFAULT_ARGS = {
    "chapter-a": "42-102", "chapter-b": "109-154", "chapter-c": "158-183", "chapter-d": "187-213",
    "chapter-e": "216-245", "chapter-f": "247-267", "chapter-g": "270-319", "chapter-h": "322-324,326-330",
    "chapter-j": "593",
    "chapter-w": "647-664",
    "content_vp_left": 0, "content_vp_right": 565, "content_vp_top": 715, "content_vp_bottom": 28, # viewport
    "disable-sort-items": ",".join( str(si) for si in _DISABLE_SORT_ITEMS )
}

# NOTE: The exact mappings here are actually not that important. What's important is:
# - the order of the nationality + V/O types
# - the page numbers themselves (so that they get parsed)
# Each entry is: [ nationality, V/O type, page numbers, check-sequence flag ]
_VO_NOTE_SECTIONS = [
    [ "german", "vehicles", "330,332,334-343", True ],
    [ "german", "ordnance", "344-348", True ],
    [ "russian", "vehicles", "348,350-355", True ],
    [ "russian", "ordnance", "356-358", True ],
    [ "russian", "vehicles", "362,364-368", False ],
    [ "russian", "ordnance", "369", False ],
    [ "american", "vehicles", "371,373-383", True ],
    [ "american", "ordnance", "385-389", True ],
    [ "british", "vehicles", "395,398-417", True ],
    [ "british", "ordnance", "419-423", True ],
    [ "italian", "vehicles", "429,431-435", True ],
    [ "italian", "ordnance", "436-439", True ],
    [ "japanese", "vehicles", "443-448", True ],
    [ "japanese", "ordnance", "448-452", True ],
    [ "chinese", "vehicles", "456-459", True ],
    [ "chinese", "ordnance", "459-463", True ],
    [ "landing-craft", "vehicles", "467-468", True ],
    [ "french", "vehicles", "470,472-480", True ],
    [ "french", "ordnance", "482-487", True ],
    [ "allied-minor", "vehicles", "492-493,495-500", True ],
    [ "allied-minor", "ordnance", "501-504", True ],
    [ "axis-minor", "vehicles", "506,508-515", True ],
    [ "axis-minor", "ordnance", "516,518-527", True ],
    [ "finnish", "vehicles", "536,538-541", True ],
    [ "finnish", "ordnance", "543,545-549", True ],
    [ "un-forces", "vehicles", "554,556-565", True ],
    [ "un-forces", "ordnance", "567-570", True ],
    [ "communist-forces", "vehicles", "580", True ],
    [ "communist-forces", "ordnance", "581-585", True ],
]

# ---------------------------------------------------------------------

class ExtractContent( ExtractBase ):
    """Extract content from the MMP eASLRB.

    After extract_content() has run, the results are held in:
    - self.targets: maps ruleid's to their caption, page number and position
    - self._chapters: list of chapters and their sections
    - self._footnotes: maps chapter ID's to their list of footnotes
    - self._vo_notes: nationality -> V/O type -> list of notes (None if no V/O notes were collected)
    """

    def __init__( self, args, log=None ):
        """Initialize the extractor, and load the fixup data files (from the "data/" sub-directory)."""
        super().__init__( args, _DEFAULT_ARGS, log )
        self.targets = {}
        self._chapters = []
        self._footnotes = {}
        self._vo_notes = self._prev_vo_note_id = None
        self._curr_vo_note_section = 0
        self._curr_chapter = self._curr_footnote = self._curr_pageid = None
        self._prev_elem = self._top_left_elem = None
        # prepare to fixup problems in the content
        def load_fixup( fname ):
            fname = os.path.join( os.path.dirname(__file__), "data/", fname )
            with open( fname, "r", encoding="utf-8" ) as fp:
                return json.load( fp )
        self._target_fixups = load_fixup( "target-fixups.json" )
        self._chapter_fixups = load_fixup( "chapter-fixups.json" )
        self._footnote_fixups = load_fixup( "footnote-fixups.json" )
        self._vo_note_fixups = load_fixup( "vo-note-fixups.json" )

    def extract_content( self, pdf ):
        """Extract content from the MMP eASLRB.

        Walks the pages of the PDF (as returned by PageIterator), collecting targets, footnotes
        and vehicle/ordnance notes, then builds the list of chapters. Fixups that were loaded
        but never applied are reported as warnings.
        """

        # figure out which pages to process
        chapter_pages = {} # maps chapters to page numbers
        page_index = {} # maps page numbers to chapter
        for key, val in self._args.items():
            if key.startswith( "chapter-" ):
                page_nos = parse_page_numbers( val )
                assert len(key) == 9
                chapter = key[8].upper()
                chapter_pages[ chapter ] = page_nos
                for page_no in page_nos:
                    page_index[ page_no ] = chapter
        disable_sort_items = set( self._args["disable-sort-items"].split( "," ) )

        # include the pages for the Chapter H vehicle/ordnance notes
        for _, _, page_nos, _ in _VO_NOTE_SECTIONS:
            page_nos = parse_page_numbers( page_nos )
            for page_no in page_nos:
                page_index[ page_no ] = "H"
        # nb: the page index doesn't change from here on, so we only need to calculate this once
        max_page_no = max( page_index )

        # initialize
        self._curr_chapter = None
        curr_chapter_pageno = None
        self._curr_footnote = None
        # nb: this must be set before we start, in case we never get to process a page
        curr_caption = None

        # NOTE: The parsing code works in two modes.
        # - We start off extracting content, and detect the start of a new rule by bold text near the start of the line.
        # - When we see the footnotes header (e.g. "CHAPTER A FOOTNOTES"), we switch into footnotes mode, and detect
        #   the start of a footnote by a bold number near the start of the line.

        # process each page
        for page_no, _, lt_page in PageIterator( pdf ):

            # prepare to process the next page
            if page_no > max_page_no:
                break
            if page_no not in page_index:
                self.log_msg( "progress", "- Skipping page {}.", page_no )
                if curr_chapter_pageno is not None:
                    curr_chapter_pageno += 1
                continue
            if not self._curr_chapter or self._curr_chapter != page_index[page_no]:
                # we've found the start of a new chapter
                self._save_footnote() # nb: save the last footnote of the previous chapter
                self._curr_chapter = page_index[ page_no ]
                curr_chapter_pageno = 1
            else:
                curr_chapter_pageno += 1
            self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page#
                self._curr_chapter, curr_chapter_pageno
            )
            # NOTE: There have been so many extra pages added to Chapter H, there's no easy way to calculate
            # the page ID. We could set up a table mapping physical page numbers to page ID's, but that's
            # far more trouble than it's worth.
            self.log_msg( "progress", "- Analyzing page {}{}.",
                page_no,
                " ({})".format( self._curr_pageid ) if not self._curr_pageid.startswith("H") else ""
            )

            # process each element on the page
            curr_caption = None
            self._top_left_elem = self._prev_elem = None
            def elem_filter( elem ):
                return isinstance( elem, LTChar )
            sort_elems = self._curr_pageid not in disable_sort_items and str(page_no) not in disable_sort_items
            for _, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):

                # skip problematic elements
                if elem.fontname == "OYULKV+MyriadPro-Regular":
                    # FUDGE! Some symbols are represented as characters, which can sometimes cause problems
                    # (e.g. in v1.05, the diamond for A7.8 PIN broke caption parsing), and the easiest way
                    # to work-around this is to just ignore those characters.
                    continue

                # keep track of the top-left-most bold element
                if self._is_bold( elem ):
                    if self._top_left_elem is None \
                      or elem.x0 < self._top_left_elem.x0 and elem.y1 > self._top_left_elem.y1:
                        self._top_left_elem = elem

                # check if we should ignore this element
                if not self._in_viewport( elem, "content" ):
                    continue

                # check if we're currently extracting footnotes
                if self._curr_footnote is not None:
                    self._on_footnote_elem( elem, lt_page, page_no )
                    self._prev_elem = elem
                    continue

                # figure out what we've got
                is_bold = self._is_bold( elem )
                ch = curr_caption[0] if curr_caption else None #pylint: disable=unsubscriptable-object
                if is_bold and ch and ch.isdigit() and 1 < elem.y1 - self._prev_elem.y0 < elem.height/2:
                    # the previous bold character looks like a footnote superscript - ignore it
                    curr_caption = None
                if curr_caption and elem.get_text() == " ":
                    # FUDGE! Some captions are in a bold font, but the spaces are not :-/
                    is_bold = True
                if is_bold:
                    if curr_caption:
                        # NOTE: We stop collecting bold characters at the end of the line, even if they continue on
                        # to the next line. This is to handle the case of a major heading (e.g. "1. PERSONNEL COUNTERS")
                        # being followed by a lesser heading ("1.1"). However, we want to handle captions that span
                        # multiple lines, so we check the vertical distance between the lines to see if it looks like
                        # two separate headings, or a single caption that has spread over multiple lines.
                        if self._prev_elem.y0 - elem.y1 > 0.25*elem.height:
                            # we've found the start of a new rule - save the old one, start collecting the new caption
                            self._save_target( curr_caption, page_no, lt_page, elem )
                            curr_caption = [ elem.get_text(), ( elem.x0, elem.y1 ) ]
                        else:
                            # continue collecting the caption
                            if self._prev_elem.y0 - elem.y0 > 1:
                                # nb: we just started a new line
                                curr_caption[0] = append_text( #pylint: disable=unsupported-assignment-operation
                                    curr_caption[0], elem.get_text() #pylint: disable=unsubscriptable-object
                                )
                            else:
                                curr_caption[0] += elem.get_text() #pylint: disable=unsupported-assignment-operation
                    else:
                        # check if this is the first character of the line
                        if self._is_start_of_line( elem, lt_page ):
                            # yup - start collecting the caption
                            curr_caption = [ elem.get_text(), ( elem.x0, elem.y1 ) ]
                else:
                    # check if we're currently collecting a caption
                    if curr_caption:
                        # yup - we've just found the end of it, save it
                        self._save_target( curr_caption, page_no, lt_page, elem )
                        curr_caption = None

                # loop back to process the next element
                self._prev_elem = elem

        # add the last caption/footnote (if they haven't already been done)
        self._save_footnote()
        if curr_caption:
            self._save_target( curr_caption, page_no, None, None )

        # check for unused fixups
        if self._target_fixups:
            self.log_msg( "warning", "Unused fixups: {}", self._target_fixups )
        if self._footnote_fixups:
            self.log_msg( "warning", "Unused fixups: {}", self._footnote_fixups )
        if self._vo_note_fixups:
            self.log_msg( "warning", "Unused fixups: {}", self._vo_note_fixups )

        # extract the chapters
        self._extract_chapters()

    def _save_target( self, caption, page_no, lt_page, elem ):
        """Save a parsed target.

        caption is a [ text, (x,y) ] pair. elem is the element that terminated the caption (or None);
        if the caption turns out to be the footnotes header, it is passed on to the footnote parser.
        """

        # initialize
        orig_caption = caption[0]
        caption_text = re.sub( r"\s+", " ", caption[0] ).strip()
        if len(caption_text) <= 1:
            # NOTE: We're finding text that is part of an image (e.g. the "E" for an Elite MMC),
            # perhaps because the pages were OCR'ed, so we ignore these.
            return

        # check if we've found the start of the chapter's footnotes
        if "FOOTNOTES" in caption_text :
            # yup - notify the main loop
            self._curr_footnote = []
            if elem:
                self._on_footnote_elem( elem, lt_page, page_no )
            return

        # check if the entry needs to be fixed up
        fixup = self._target_fixups.get( self._curr_pageid, {} ).get( caption_text )
        if fixup:
            # yup - make it so
            fixup[ "instances" ] = fixup.get("instances",1) - 1
            if fixup["instances"] <= 0:
                self._target_fixups[ self._curr_pageid ].pop( caption_text )
                if not self._target_fixups[ self._curr_pageid ]:
                    del self._target_fixups[ self._curr_pageid ]
            ruleid = fixup.get( "new_ruleid" )
            if not ruleid:
                return
            caption_text = fixup.get( "new_caption" )
        else:
            # nope - use what was parsed
            # FUDGE! There are a lot of layout problems with things like "12.CONCEALMENT" (i.e. missing space),
            # and it's tricky to detect these and not get tripped up by things like "12.C blah", so we handle it
            # as a separate case.
            mo = re.search( r"^(\d+\.\d*)([^ 0-9].+)", caption_text )
            if mo:
                ruleid, caption_text = mo.group(1), mo.group(2).strip()
            else:
                # check if the caption text starts with something that looks like a ruleid
                # NOTE: A leading "*" indicates an optional rule.
                mo = re.search( r"^\*?([A-Z]\.?)?[1-9][0-9.-]*[A-F]?", caption_text )
                if not mo:
                    return
                ruleid, caption_text = mo.group(), caption_text[mo.end():].strip()
                if ruleid.startswith( "*" ):
                    ruleid = ruleid[1:]
            ruleid = remove_trailing( ruleid, "." )
            caption_text = remove_trailing( caption_text, ":" )

        # save the new target
        if not ruleid.startswith( self._curr_chapter ):
            ruleid = self._curr_chapter + ruleid
        if ruleid in self.targets:
            self.log_msg( "warning", "Ignoring duplicate ruleid: {} (from \"{}\").",
                ruleid, caption[0]
            )
            return
        if caption_text == "\u2014":
            caption_text = "-" # nb: for A7.306 :-/
        self.targets[ ruleid ] = {
            "caption": fixup_text(caption_text), "page_no": page_no, "pos": caption[1],
            "raw_caption": orig_caption
        }

    def _on_footnote_elem( self, elem, lt_page, page_no ):
        """Process an element while we're parsing footnotes.

        self._curr_footnote is either an empty list (we've seen the footnotes header, but not the first
        footnote yet), or [ footnote ID, content, page number, position ].
        """
        # check if we've found the start of a new footnote
        is_bold = self._is_bold( elem )
        if is_bold and elem.get_text().isdigit() and self._is_start_of_line( elem, lt_page ):
            # yup - save the current footnote, start collecting the new one
            self._save_footnote()
            elem_pos = ( elem.x0, elem.y1 )
            self._curr_footnote = [ elem.get_text(), "", page_no, elem_pos ]
            return
        if not self._curr_footnote:
            # we've seen the footnotes header, but haven't found the first footnote ID yet, so there's
            # nothing to add this element to - ignore it (instead of failing with an IndexError)
            return
        if is_bold:
            if self._curr_footnote[1]:
                # FUDGE! Some footnote content has bold text hard-up at the left margin,
                # so we collect that as normal content.
                self._curr_footnote[1] += elem.get_text()
            else:
                # we're still collecting the footnote's ID
                # NOTE: Older chapters have only the footnote ID in bold text, while newer chapters have
                # both the ID and caption in bold. We figure out what's going on later, in _save_footnote().
                self._curr_footnote[0] += elem.get_text()
        else:
            # nope - we're still collecting the footnote's content
            if not self._prev_elem or elem.x0 < self._prev_elem.x0 \
              or elem.y0 - self._prev_elem.y0 > lt_page.height/2:
                # nb: we just started a new line
                self._curr_footnote[1] = append_text( self._curr_footnote[1], elem.get_text() )
            else:
                self._curr_footnote[1] += elem.get_text()

    def _save_footnote( self ): #pylint: disable=too-many-branches
        """Save a parsed footnote.

        Does nothing if there is no footnote currently being collected. Once we've gone past the end
        of the Chapter H footnotes, the "footnotes" are passed on to _save_vo_note() instead.
        """

        if not self._curr_footnote:
            return

        # initialize
        if self._curr_chapter not in self._footnotes:
            # start saving footnotes for the chapter
            self._footnotes[ self._curr_chapter ] = []
        orig_content = self._curr_footnote[1]

        # separate the footnote ID, referenced rule, and content
        if self._curr_chapter in ( "F", "G", "W" ):
            # NOTE: Chapter F/G/W footnote captions are also bold.
            mo = re.search( r"^\d{1,2}\.", self._curr_footnote[0] )
            if mo:
                parts = mo.group(), self._curr_footnote[0][mo.end():]
                self._curr_footnote[0] = parts[0]
                self._curr_footnote[1] = parts[1].strip() + " " + self._curr_footnote[1].strip()
            else:
                # nb: report the actual chapter (this branch is not just for Chapter F)
                self.log_msg( "warning", "Couldn't split Chapter {} footnote caption: {}",
                    self._curr_chapter, self._curr_footnote[0]
                )
        footnote_id = remove_trailing( self._curr_footnote[0].strip(), "." )
        content = self._curr_footnote[1].strip()
        mo = re.search( r"^(F\.1B|W\.\d+[AB]|[A-Z]?[0-9.]+)", content )
        if mo:
            ruleid, content = mo.group(), content[mo.end():]
            if not ruleid.startswith( self._curr_chapter ):
                ruleid = self._curr_chapter + ruleid
            ruleid = remove_trailing( ruleid, "." )
        else:
            ruleid = None
        if self._curr_chapter == "C":
            # FUDGE! The "29." for Chapter C's footnote #29 is misaligned, and is extracted as two separate
            # footnotes "2" and "9". There isn't really any way to fix this via the normal data-driven mechanism,
            # so we do it in the code here :-/
            footnote_ids = [ f["footnote_id"] for f in self._footnotes[self._curr_chapter] ]
            if footnote_id == "2" and "2" in footnote_ids:
                return
            if footnote_id == "9" and "9" in footnote_ids:
                footnote_id = "29"
        if self._curr_chapter == "H":
            # check if we've gone past the end of the Chapter H footnotes
            if self._vo_notes is None and len(footnote_id) > 1:
                # yup - start collecting vehicle/ordnance notes
                self._vo_notes = defaultdict( lambda: defaultdict( list ) )
            # check if we're collecting Chapter H vehicle/ordnance notes
            if self._vo_notes is not None:
                # yup - save the next entry (the "footnote" is actually a vehicle/ordnance note)
                self._save_vo_note( footnote_id, self._curr_footnote[2], self._curr_footnote[3] )
                self._curr_footnote = None
                return

        # clean up the content
        content = re.sub( r"\s+", " ", content ).strip()
        content = fixup_text( content )
        mo = re.search( r"^[A-Z ]+:\S", content )
        if mo:
            # nb: insert a space after the colon that ends the caption
            content = content[:mo.end()-1] + " " + content[mo.end()-1:]

        # check for any fixups
        captions = []
        fixups = self._footnote_fixups.get( self._curr_chapter, {} ).get( footnote_id )
        if fixups:
            if isinstance( fixups, list ):
                # NOTE: A simple search-and-replace is, by far, the most common fixup, so we provide
                # a simplified way of specifying these in the fixup file
                fixups = { "replace": [ ( sr[0], sr[1] ) for sr in fixups ] }
            errors = []
            # do any search-replace's
            if "replace" in fixups:
                failed_sr = []
                for sr in fixups["replace"]:
                    prev_content = content
                    content = content.replace( sr[0], sr[1] )
                    if content == prev_content:
                        self.log_msg( "warning", "Footnote fixup for \"{}:{}\" had no effect: {}",
                            self._curr_chapter, footnote_id, sr[0]
                        )
                        failed_sr.append( sr )
                if failed_sr:
                    fixups["replace"] = failed_sr
                else:
                    del fixups["replace"]
            # replace the captions
            if "captions" in fixups:
                captions = fixups.pop( "captions" )
            # check that all fixups were successfully applied
            if fixups:
                errors.append( fixups )
            if errors:
                self._footnote_fixups[ self._curr_chapter ][ footnote_id ] = errors
            else:
                del self._footnote_fixups[ self._curr_chapter ][ footnote_id ]
                if not self._footnote_fixups[ self._curr_chapter ]:
                    del self._footnote_fixups[ self._curr_chapter ]
        content = content.strip()

        # extract the footnote's caption
        if not captions:
            pos = content.find( ":" )
            if pos >= 0:
                captions.append( ( ruleid, content[:pos] ) )
                content = content[pos+1:].strip()
            else:
                self.log_msg( "warning", "Can't extract footnote caption: {}:{} - {}",
                    self._curr_chapter, footnote_id, content
                )

        # check for the credits at the end of the Chapter F footnotes
        if self._curr_chapter == "F":
            pos = content.find( "WEST OF ALAMEIN CREDITS" )
            if pos > 0:
                content = content[:pos]

        # check for the start of the vehicle notes at the end of the Chapter H footnotes
        if self._curr_chapter == "H":
            pos = content.find( "GERMAN VEHICLE NOTES" )
            if pos > 0:
                content = content[:pos]

        # save the footnote
        self._footnotes[ self._curr_chapter ].append( {
            "footnote_id": footnote_id, "captions": captions, "content": content, "raw_content": orig_content
        } )
        self._curr_footnote = None

    def _save_vo_note( self, caption, page_no, page_pos ):
        """Save an extracted vehicle/ordnance note.

        caption is the raw text (note number + caption). The note is added to the current entry of
        _VO_NOTE_SECTIONS; seeing note #1 again moves us on to the next section.
        """

        # NOTE: Some pieces of text cause the parsing code to go wonky (typically because it's seen
        # a "1" and so thinks it's found the start of a new section), so we manually skip over these.
        skips = self._vo_note_fixups.get( "skips", {} ).get( str(page_no) )
        if skips:
            for i, target in enumerate(skips):
                if self._check_string( caption, target ):
                    # we've got a caption that should be skipped - remove it from the list, and return
                    del skips[i]
                    if not skips:
                        del self._vo_note_fixups["skips"][ str(page_no) ]
                        if not self._vo_note_fixups["skips"]:
                            del self._vo_note_fixups["skips"]
                    return
        if caption.isdigit() and page_no not in (354, 417):
            return

        def apply_fixups( vo_note_id, caption ):
            nat, vo_type, _, _ = _VO_NOTE_SECTIONS[ self._curr_vo_note_section ]
            fixup = self._vo_note_fixups.get( nat, {} ).get( vo_type, {} ).get( vo_note_id )
            if fixup:
                if self._check_string( caption, fixup["old_caption"] ):
                    # remove the fixup
                    del self._vo_note_fixups[ nat ][ vo_type ][ vo_note_id ]
                    cleanup_fixups( nat, vo_type )
                    # apply the fixup
                    if "new_vo_note_id" in fixup:
                        vo_note_id = fixup["new_vo_note_id"]
                    if "new_caption" in fixup:
                        caption = fixup["new_caption"]
            return vo_note_id, caption

        def cleanup_fixups( nat, vo_type ):
            # remove any fixup tables that are now empty
            if nat not in self._vo_note_fixups:
                return
            if vo_type in self._vo_note_fixups[nat] and not self._vo_note_fixups[ nat ][ vo_type ]:
                del self._vo_note_fixups[ nat ][ vo_type ]
            if nat in self._vo_note_fixups and not self._vo_note_fixups[ nat ]:
                del self._vo_note_fixups[ nat ]

        # extract the note number and caption
        mo = re.search( r"^[1-9][0-9.]*", caption )
        if not mo:
            return
        vo_note_id = mo.group()
        caption = caption[ mo.end() : ].strip()
        if vo_note_id.endswith( "." ):
            vo_note_id = vo_note_id[:-1]
        if caption.endswith( ":" ):
            caption = caption[:-1].strip()
        if caption.startswith( ( "cm ", "mm ", "pdr", "-cwt" ) ):
            # FUDGE! Things like "5.1 2.2cm Big Gun" are getting parsed as "5.12.2: + "cm Big Gun" :-/
            pos = vo_note_id.find( "." )
            if pos >= 0:
                caption = vo_note_id[pos+1:] + caption
                vo_note_id = vo_note_id[:pos]

        # check for any fixups
        vo_note_id, caption = apply_fixups( vo_note_id, caption )

        # compare the note ID with the previous one
        nat, vo_type, _, check_seq = _VO_NOTE_SECTIONS[ self._curr_vo_note_section ]
        def get_base_note_id( val ):
            pos = val.find( "." )
            return int( val[:pos] if pos >= 0 else val )
        base_note_id = get_base_note_id( vo_note_id )
        if self._prev_vo_note_id:
            # check if we've found the start of the next section
            if base_note_id == 1:
                # yup - add any extra entries to the current section
                add_vo_entries = self._vo_note_fixups.get( nat, {} ).get( vo_type, {} ).pop( "add", [] )
                for vo_entry in add_vo_entries:
                    self._vo_notes[ nat ][ vo_type ].append( vo_entry )
                cleanup_fixups( nat, vo_type )
                # get the next nationality + V/O type
                self._curr_vo_note_section += 1
                nat, vo_type, _, _ = _VO_NOTE_SECTIONS[ self._curr_vo_note_section ]
                # check for any fixups
                vo_note_id, caption = apply_fixups( vo_note_id, caption )
            elif check_seq:
                # compare the note ID with the previous one
                prev_base_note_id = get_base_note_id( self._prev_vo_note_id )
                if base_note_id == prev_base_note_id + 1:
                    pass # nb: this is the normal case, we've found the next V/O note
                elif base_note_id == prev_base_note_id and "." in vo_note_id:
                    pass # nb: this is to allow things like "9.1" following "9"
                else:
                    return # nb: we got some junk that can be ignored

        # save the V/O note
        self._vo_notes[ nat ][ vo_type ].append( {
            "vo_note_id": vo_note_id, "caption": caption, "page_no": page_no, "page_pos": page_pos
        } )
        if nat == "allied-minor" and vo_type == "ordnance" and vo_note_id == "19":
            # FUDGE! Because we're not seeing Allied Minor Ordnance Note 20 :-/
            self._prev_vo_note_id = "20"
        else:
            self._prev_vo_note_id = vo_note_id

    def _extract_chapters( self ):
        """Extract the chapters and their sections."""

        # FUDGE! Extracting the index at the start of each chapter from the PDF would be horribly complicated,
        # since they are laid out as a 2-column table, but the PDF elements run left-to-right across the entire table,
        # and things get very messy when the section title spans multiple lines :-/
        # We fudge around this by doing things as a post-processing step, looking for targets that have a ruleid
        # that match a particular form.

        # initialize
        self._chapters = []
        fixup_regexes = [
            ( re.compile( r"\b{}\b".format( word.title() ) ), word )
            for word in self._chapter_fixups["capitalize_words"]
        ]

        # process each chapter
        for arg, val in self._args.items():
            mo = re.search( r"^chapter-([a-z])$", arg )
            if not mo:
                continue
            chapter_id = mo.group(1).upper()
            ruleid_regex = re.compile( r"^{}(\d+)$".format( chapter_id ) )
            page_nos = parse_page_numbers( val )

            # look for ruleid's that mark the start of a section
            sections = []
            for ruleid, target in self.targets.items():
                mo = ruleid_regex.search( ruleid )
                if not mo:
                    continue
                # found one - add it to the list
                fixup = self._chapter_fixups.get( "replace", {} ).pop( ruleid, None )
                if fixup:
                    caption = target["caption"]
                    if caption != fixup[0]:
                        self.log_msg( "warning", "Unexpected chapter fixup caption ({}): {}",
                            ruleid, caption
                        )
                    caption = fixup[1]
                else:
                    caption = target["caption"].title()
                    for fixup in fixup_regexes:
                        caption = fixup[0].sub( fixup[1], caption )
                sections.append( {
                    "caption": "{}. {}".format( mo.group(1), caption ),
                    "ruleid": ruleid,
                } )

            # FUDGE! Chapter titles often span across the width of the page i.e. across both columns
            # of the 2-column layout, which makes them messy to parse. There are not too many of them,
            # so it's easier just to specify them manually :-/
            title = self._chapter_fixups.get( "titles", {} ).get( chapter_id, "Chapter "+chapter_id )
            self._chapters.append( {
                "chapter_title": title, "chapter_id": chapter_id, "page_no": min( page_nos ), "sections": sections
            } )

        # check for unused search-and-replace fixups
        fixups = self._chapter_fixups.get( "replace" )
        if fixups:
            self.log_msg( "warning", "Unused chapter fixup: {}", fixups )

    def _is_start_of_line( self, elem, lt_page ):
        """Check if the element is at the start of its line."""
        # NOTE: We can't just check the element's x co-ordinate, since there is sometimes a floating image
        # that pushes the text right (e.g. A.12).
        if self._prev_elem is None:
            return True
        if elem.y0 < self._prev_elem.y0:
            return True
        if self._prev_elem.x0 < lt_page.width/2 and elem.x0 > lt_page.width/2:
            return True # the element is at the top of the right column
        return False

    def save_as_raw( self, targets_out, chapters_out, footnotes_out, vo_notes_out ):
        """Save the raw results."""
        self._save_as_raw_or_text( targets_out, chapters_out, footnotes_out, vo_notes_out, True )

    def save_as_text( self, targets_out, chapters_out, footnotes_out, vo_notes_out ):
        """Save the results as plain-text."""
        self._save_as_raw_or_text( targets_out, chapters_out, footnotes_out, vo_notes_out, False )

    def _save_as_raw_or_text( self, targets_out, chapters_out, footnotes_out, vo_notes_out, raw ):
        """Save the results as raw or plain-text.

        The *_out parameters are file objects to print to. If raw is set, the unprocessed captions
        and footnote content are written, otherwise the cleaned-up versions.
        """

        # save the targets
        curr_page_no = None
        for ruleid, target in self.targets.items():
            if target["page_no"] != curr_page_no:
                if curr_page_no:
                    print( file=targets_out )
                print( "=== p{} ===".format( target["page_no"] ), file=targets_out )
                curr_page_no = target["page_no"]
            xpos, ypos = self._get_page_pos( target["pos"] )
            if raw:
                print( "[{},{}] = {}".format(
                    xpos, ypos, target["raw_caption"]
                ), file=targets_out )
            else:
                print( "{} => {} @ p{}:[{},{}]".format(
                    ruleid, target["caption"], target["page_no"], xpos, ypos
                ), file=targets_out )

        # save the chapters
        for chapter_no, chapter in enumerate(self._chapters):
            if chapter_no > 0:
                print( file=chapters_out )
            print( "=== {}: {} (p{}) ===\n".format(
                chapter["chapter_id"], chapter["chapter_title"], chapter["page_no"]
            ), file=chapters_out )
            for section in chapter["sections"]:
                print( "{} => {}".format(
                    section["caption"], section["ruleid"],
                ), file=chapters_out )

        # save the footnotes
        def make_caption( caption ):
            # nb: each caption is a ( ruleid, caption text ) pair
            buf = []
            if caption[1]:
                buf.append( caption[1] )
                if caption[0]:
                    buf.append( "[{}]".format( caption[0] ) )
            elif caption[0]:
                buf.append( caption[0] )
            return " ".join( buf )
        for chapter, footnotes in self._footnotes.items():
            if chapter != "A":
                print( file=footnotes_out )
            print( "=== CHAPTER {} FOOTNOTES {}".format( chapter, 80*"=" )[:80], file=footnotes_out )
            for footnote in footnotes:
                print( file=footnotes_out )
                print( "--- Footnote {} ---".format( footnote["footnote_id"] ), file=footnotes_out )
                if raw:
                    print( footnote["raw_content"], file=footnotes_out )
                else:
                    print( " ; ".join( make_caption(c) for c in footnote["captions"] ), file=footnotes_out )
                    print( footnote["content"], file=footnotes_out )

        # save the vehicle/ordnance notes
        # nb: self._vo_notes is still None if we never collected any V/O notes
        first = True
        for nat, vo_types in ( self._vo_notes or {} ).items():
            for vo_type, vo_entries in vo_types.items():
                if first:
                    first = False
                else:
                    print( file=vo_notes_out )
                print( "=== {} ===".format(
                    nat if nat == "landing-craft" else "{} {}".format( nat, vo_type )
                ), file=vo_notes_out )
                for vo_entry in vo_entries:
                    if "page_pos" in vo_entry:
                        xpos, ypos = self._get_page_pos( vo_entry["page_pos"] )
                        page_pos = "[{},{}]".format( xpos, ypos )
                    else:
                        page_pos = None
                    print( "{:<5} {} @p{}{}".format(
                        vo_entry["vo_note_id"]+":", vo_entry["caption"],
                        vo_entry["page_no"], ":"+page_pos if page_pos else ""
                    ), file=vo_notes_out )

    def save_as_json( self, targets_out, chapters_out, footnotes_out, vo_notes_out ):
        """Save the results as JSON.

        The *_out parameters are file objects to print to. The JSON is assembled by hand,
        so that we can control how it is laid out.
        """

        # save the targets
        targets, curr_chapter = [], None
        for ruleid, target in self.targets.items():
            xpos, ypos = self._get_page_pos( target["pos"] )
            targets.append( "{}: {{ \"caption\": {}, \"page_no\": {}, \"pos\": [{},{}] }}".format(
                jsonval( ruleid ), jsonval(target["caption"]), target["page_no"], xpos, ypos
            ) )
            if ruleid[0] != curr_chapter:
                # nb: insert a blank line between chapters
                targets[-1] = "\n" + targets[-1]
                curr_chapter = ruleid[0]
        print( "{{\n{}\n\n}}".format(
            ",\n".join( targets )
        ), file=targets_out )

        # save the chapters
        chapters = []
        for chapter in self._chapters:
            sections = []
            for section in chapter["sections"]:
                sections.append( " {{ \"caption\": {}, \"ruleid\": {} }}".format(
                    jsonval(section["caption"]), jsonval(section["ruleid"])
                ) )
            chapters.append(
                "{{ \"title\": {},\n \"chapter_id\": {},\n \"page_no\": {},\n \"sections\": [\n{}\n] }}".format(
                    jsonval(chapter["chapter_title"]), jsonval(chapter["chapter_id"]), jsonval(chapter["page_no"]),
                    ",\n".join( sections )
                )
            )
        print( "[\n\n{}\n\n]".format(
            ",\n\n".join( chapters )
        ), file=chapters_out )

        # save the footnotes
        def make_caption( caption ):
            # nb: each caption is a ( ruleid, caption text ) pair
            return "{{ \"caption\": {}, \"ruleid\": {} }}".format(
                jsonval(caption[1]), jsonval(caption[0])
            )
        chapters = []
        for chapter in self._footnotes: #pylint: disable=consider-using-dict-items
            footnotes = []
            for footnote in self._footnotes[chapter]:
                footnotes.append( "{}: {{\n \"captions\": {},\n \"content\": {}\n}}".format(
                    jsonval( footnote["footnote_id"] ),
                    "[ {} ]".format( ", ".join( make_caption(c) for c in footnote["captions"] ) ),
                    jsonval( footnote["content"] )
                ) )
            chapters.append( "{}: {{\n\n{}\n\n}}".format(
                jsonval( chapter ),
                ",\n".join( footnotes )
            ) )
        print( "{{\n\n{}\n\n}}".format(
            ",\n\n".join( chapters )
        ), file=footnotes_out )

        # save the vehicle/ordnance notes
        # nb: self._vo_notes is still None if we never collected any V/O notes
        all_vo_notes = self._vo_notes or {}
        vo_notes = []
        for nat in all_vo_notes:
            vo_types = []
            for vo_type, vo_entries in all_vo_notes[nat].items():
                entries = []
                for vo_entry in vo_entries:
                    val = "{}: {{ \"caption\": {}, \"page_no\": {}".format(
                        jsonval(vo_entry["vo_note_id"]), jsonval(vo_entry["caption"]), jsonval(vo_entry["page_no"])
                    )
                    if "page_pos" in vo_entry:
                        xpos, ypos = self._get_page_pos( vo_entry["page_pos"] )
                        val += ", \"pos\": [{},{}]".format( xpos, ypos )
                    val += " }"
                    entries.append( " {}".format( val ) )
                if nat == "landing-craft":
                    # nb: landing craft entries are not grouped by V/O type
                    vo_types.append( ",\n".join( entries ) )
                else:
                    vo_types.append( "{}: {{\n{}\n}}".format(
                        jsonval(vo_type), ",\n".join( entries )
                    ) )
            vo_notes.append( "{}: {{\n{}\n}}".format(
                jsonval(nat), ",\n".join( vo_types )
            ) )
        print( "{{\n\n{}\n\n}}".format(
            ",\n\n".join( vo_notes )
        ), file=vo_notes_out )

    @staticmethod
    def _check_string( val, target ):
        """Check if a string matches a target.

        A target that starts with "^" matches any value that starts with the rest of the target,
        otherwise the value must be exactly equal to the target.
        """
        if target.startswith( "^" ):
            return val.startswith( target[1:] )
        else:
            return val == target

    @staticmethod
    def _get_page_pos( pos ):
        """Return a X/Y position on the page (X is rounded down, Y is rounded up)."""
        return math.floor( pos[0] ), math.ceil( pos[1] )

# ---------------------------------------------------------------------

@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]),
    help="Output format."
)
@click.option( "--save-targets","save_targets_fname", required=True,
    help="Where to save the extracted targets."
)
@click.option( "--save-chapters","save_chapters_fname", required=True,
    help="Where to save the extracted chapters."
)
@click.option( "--save-footnotes","save_footnotes_fname", required=True,
    help="Where to save the extracted footnotes."
)
@click.option( "--save-vo-notes","save_vo_notes_fname", required=True,
    help="Where to save the extracted vehicle/ordnance notes."
)
def main( pdf_file, args, progress, output_fmt,
    save_targets_fname, save_chapters_fname, save_footnotes_fname, save_vo_notes_fname
):
    """Extract content from the MMP eASLRB."""

    # initialize
    args = ExtractBase.parse_args( args, _DEFAULT_ARGS )

    # extract the content
    def log_msg( msg_type, msg ):
        # nb: progress messages are only shown if they have been enabled
        if msg_type == "progress" and not progress:
            return
        log_msg_stderr( msg_type, msg )
    extract = ExtractContent( args, log_msg )
    extract.log_msg( "progress", "Loading PDF: {}", pdf_file )
    with PdfDoc( pdf_file ) as pdf:
        extract.extract_content( pdf )

    # save the results
    with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
         open( save_chapters_fname, "w", encoding="utf-8" ) as chapters_out, \
         open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out, \
         open( save_vo_notes_fname, "w", encoding="utf-8" ) as vo_notes_out:
        getattr( extract, "save_as_"+output_fmt, )( targets_out, chapters_out, footnotes_out, vo_notes_out )

if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter