Extracted the index, rule targets and footnotes from the eASLRB.

branch: master
author: Pacman Ghost (3 years ago)
parent: c2265404bc
commit: e3ebbcd0f7
16 files changed:
  asl_rulebook2/extract/all.py (+147)
  asl_rulebook2/extract/base.py (+59)
  asl_rulebook2/extract/content.py (+524)
  asl_rulebook2/extract/data/footnote-fixups.json (+209)
  asl_rulebook2/extract/data/index-fixups.json (+288)
  asl_rulebook2/extract/data/known-missing-ruleids.json (+40)
  asl_rulebook2/extract/data/target-fixups.json (+400)
  asl_rulebook2/extract/index.py (+383)
  asl_rulebook2/pdf.py (+68)
  asl_rulebook2/tests/__init__.py (+3)
  asl_rulebook2/tests/test_extract.py (+116)
  asl_rulebook2/utils.py (+99)
  bin/dump_pdf.py (+11)
  bin/extract_pages.py (+6)
  conftest.py (+33)
  setup.py (+5)

--- /dev/null
+++ b/asl_rulebook2/extract/all.py
@@ -0,0 +1,147 @@

#!/usr/bin/env python3
""" Extract everything we need from the MMP eASLRB. """

import sys
import os
import json
import re
import importlib

import click

from asl_rulebook2.pdf import PdfDoc
from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr
from asl_rulebook2.extract.index import ExtractIndex
from asl_rulebook2.extract.content import ExtractContent

# ---------------------------------------------------------------------

class ExtractAll( ExtractBase ):
    """Extract everything from the eASLRB."""

    def __init__( self, args, log=None ):
        super().__init__( None, None, log )
        self._args = args

    def extract_all( self, pdf ):
        """Extract everything from the eASLRB."""

        # initialize
        default_args = {}
        for mod in ( "index", "content" ):
            mod = importlib.import_module( "asl_rulebook2.extract." + mod )
            default_args.update( getattr( mod, "_DEFAULT_ARGS" ) )

        # extract the index
        self._log_msg( "progress", "\nExtracting the index..." )
        args = ExtractBase.parse_args( self._args, default_args )
        self.extract_index = ExtractIndex( args, self._log )
        self.extract_index.extract_index( pdf )

        # extract the content
        self._log_msg( "progress", "\nExtracting the content..." )
        args = ExtractBase.parse_args( self._args, default_args )
        self.extract_content = ExtractContent( args, self._log )
        self.extract_content.extract_content( pdf )

        # verify the index targets
        self._check_targets()

    def _check_targets( self ):
        """Cross-check ruleid's and ruleref's in the index against targets in the main content."""

        # build an index of known targets
        targets = {}
        for ruleid, target in self.extract_content._targets.items():
            assert ruleid not in targets
            targets[ ruleid ] = target["caption"]

        # load the list of known missing targets
        known_strings, known_regexes = set(), set()
        fname = os.path.join( os.path.dirname(__file__), "data/known-missing-ruleids.json" )
        with open( fname, "r", encoding="utf-8" ) as fp:
            data = json.load( fp )
        for chapter in data["chapters"]:
            known_regexes.add( re.compile( "^{}[0-9.]+[A-Ea-e]?$".format( chapter ) ) )
        known_strings.update( data["strings"] )
        known_regexes.update(
            re.compile( regex ) for regex in data["regexes"]
        )

        def is_known_ruleid( ruleid ):
            ruleid = re.sub( r"-[A-Z]?\.?\d+$", "", ruleid ) # e.g. "A1.23-.45" -> "A1.23"
            if ruleid.endswith( " EX" ):
                ruleid = ruleid[:-3]
            if ruleid in targets:
                return True
            if ruleid in known_strings:
                return True
            if any( regex.search( ruleid ) for regex in known_regexes ):
                return True
            return False

        # check each index entry
        first = True
        for index_entry in self.extract_index._index_entries:
            errors = []
            # check the index entry's ruleid's
            for ruleid in index_entry.get( "ruleids", [] ):
                if not is_known_ruleid( ruleid ):
                    errors.append( "Unknown ruleid: {}".format( ruleid ) )
            # check the index entry's ruleref's
            for ruleref in index_entry.get( "rulerefs", [] ):
                if not ruleref["ruleids"]:
                    continue
                # check each ruleref
                if ", ".join( r for r in ruleref["ruleids"] ) in known_strings:
                    # NOTE: This is some free-form text that has been split up because it contains commas.
                    continue
                for ruleid in ruleref["ruleids"]:
                    if not is_known_ruleid( ruleid ):
                        errors.append( "Unknown ruleref target: {} => [{}]".format( ruleref["caption"], ruleid ) )
            # log any errors
            if errors:
                if first:
                    self._log_msg( "warning", "\n=== Unknown targets ===\n" )
                    first = False
                errors = [ "- {}".format( e ) for e in errors ]
                self._log_msg( "warning", "{}:\n{}",
                    index_entry["caption"], "\n".join( errors )
                )

# ---------------------------------------------------------------------

@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--save-index","save_index_fname", required=True, help="Where to save the extracted index." )
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname, save_footnotes_fname ):
    """Extract everything we need from the MMP eASLRB."""

    # extract everything
    def log_msg( msg_type, msg ):
        if msg_type == "progress" and not progress:
            return
        log_msg_stderr( msg_type, msg )
    extract = ExtractAll( args, log_msg )
    extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
    with PdfDoc( pdf_file ) as pdf:
        extract.extract_all( pdf )

    # save the results
    with open( save_index_fname, "w", encoding="utf-8" ) as index_out, \
         open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
         open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
        getattr( extract.extract_index, "save_as_"+format )( index_out )
        getattr( extract.extract_content, "save_as_"+format )( targets_out, footnotes_out )

if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter

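nb: The extraction can also be driven programmatically, without the CLI. A minimal sketch (the PDF
filename is hypothetical; an empty args collection means each module's _DEFAULT_ARGS are used unchanged):

    from asl_rulebook2.pdf import PdfDoc
    from asl_rulebook2.extract.all import ExtractAll

    extract = ExtractAll( (), log=None ) # nb: no "--arg" overrides, no log output
    with PdfDoc( "eASLRB.pdf" ) as pdf: # hypothetical filename
        extract.extract_all( pdf )
    print( len( extract.extract_index._index_entries ), "index entries" )
    print( len( extract.extract_content._targets ), "targets" )
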
--- /dev/null
+++ b/asl_rulebook2/extract/base.py
@@ -0,0 +1,59 @@

""" Base class for the extraction tools. """

import sys

import click

# ---------------------------------------------------------------------

class ExtractBase:
    """Base functionality shared by the extraction tools."""

    def __init__( self, args, default_args, log ):
        self._args = args
        if default_args:
            for key in default_args:
                if key not in self._args:
                    self._args[ key ] = default_args[ key ]
        self._log = log

    @staticmethod
    def parse_args( args, default_args ):
        """Helper method to parse command-line arguments."""
        args2 = {}
        for arg in args:
            pos = arg.find( "=" )
            if pos < 0:
                raise RuntimeError( "Invalid configuration parameter: {}".format( arg ) )
            key, val = arg[:pos], arg[pos+1:]
            if key not in default_args:
                raise RuntimeError( "Unknown configuration parameter: {}".format( key ) )
            args2[ key ] = int(val) if val.isdigit() else val
        return args2

    def _in_viewport( self, elem, vp_type ):
        """Check if an element is in the viewport."""
        if elem.x0 <= self._args[vp_type+"_vp_left"] or elem.x1 >= self._args[vp_type+"_vp_right"]:
            return False
        if elem.y0 <= self._args[vp_type+"_vp_bottom"] or elem.y1 >= self._args[vp_type+"_vp_top"]:
            return False
        return True

    @staticmethod
    def _is_bold( elem ):
        """Check if an element is using a bold font."""
        return elem.fontname.endswith( ( "-Bold", ",Bold", "-BoldMT" ) )

    def _log_msg( self, msg_type, msg, *args, **kwargs ):
        """Log a message."""
        if not self._log:
            return
        msg = msg.format( *args, **kwargs )
        self._log( msg_type, msg )

# ---------------------------------------------------------------------

def log_msg_stderr( msg_type, msg ):
    """Log a message to stderr."""
    if msg_type == "warning":
        msg = click.style( "WARNING: {}".format( msg ), fg="yellow" )
    click.echo( msg, file=sys.stderr )

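nb: parse_args() validates each "key=val" override against the module's defaults, and converts values
that look like integers. For example (a sketch):

    defaults = { "pages": "10-41", "index_vp_top": 715 }
    args = ExtractBase.parse_args( [ "pages=10-41", "index_vp_top=700" ], defaults )
    assert args == { "pages": "10-41", "index_vp_top": 700 } # nb: "700" was converted to an int
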
--- /dev/null
+++ b/asl_rulebook2/extract/content.py
@@ -0,0 +1,524 @@

#!/usr/bin/env python3
""" Extract content from the MMP eASLRB. """

import os
import json
import re
import math
from collections import defaultdict

import click
from pdfminer.layout import LTChar

from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr
from asl_rulebook2.pdf import PdfDoc, PageIterator, PageElemIterator
from asl_rulebook2.utils import parse_page_numbers, fixup_text, append_text, remove_trailing, jsonval

# NOTE: Characters are laid out individually on the page, and we generally want to process them
# top-to-bottom, left-to-right, but in some cases the alignment is messed up (e.g. the bounding boxes
# don't line up properly, and the first part of a sentence sits infinitesimally lower than the rest,
# and so appears later in the sort order), and we get better results if we process characters in the
# order in which they appear in the PDF document.
_DISABLE_SORT_ITEMS = [
    "B40", # nb: to detect B31.1 NARROW STREET
    "A58", "A59", "A60", # Chapter A footnotes (nb: page A61 is a mess wrt element order :-/)
    "B45", "B46", # Chapter B footnotes
    "C25", "C26", # Chapter C footnotes
    "D27", # Chapter D footnotes
    "E28", "E29", "E30", # Chapter E footnotes
    "F20", "F21", # Chapter F footnotes
    "G48", "G49", "G50", # Chapter G footnotes
]

_DEFAULT_ARGS = {
    "chapter-a": "42-102", "chapter-b": "109-154", "chapter-c": "158-183", "chapter-d": "187-213",
    "chapter-e": "216-245", "chapter-f": "247-267", "chapter-g": "270-319",
    "chapter-j": "593",
    "chapter-w": "647-664",
    "content_vp_left": 0, "content_vp_right": 565, "content_vp_top": 715, "content_vp_bottom": 28, # viewport
    "disable-sort-items": ",".join( _DISABLE_SORT_ITEMS )
}

# ---------------------------------------------------------------------

class ExtractContent( ExtractBase ):
    """Extract content from the MMP eASLRB."""

    def __init__( self, args, log=None ):
        super().__init__( args, _DEFAULT_ARGS, log )
        self._targets = {}
        self._footnotes = {}
        # prepare to fixup problems in the content
        fname2 = os.path.join( os.path.dirname(__file__), "data/target-fixups.json" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            self._target_fixups = json.load( fp )
        fname2 = os.path.join( os.path.dirname(__file__), "data/footnote-fixups.json" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            self._footnote_fixups = json.load( fp )

    def extract_content( self, pdf ):
        """Extract content from the MMP eASLRB."""

        # figure out which pages to process
        chapter_pages = {} # maps chapters to page numbers
        page_index = {} # maps page numbers to chapters
        for key, val in _DEFAULT_ARGS.items():
            if key.startswith( "chapter-" ):
                page_nos = parse_page_numbers( val )
                assert len(key) == 9
                chapter = key[8].upper()
                chapter_pages[ chapter ] = page_nos
                for page_no in page_nos:
                    page_index[ page_no ] = chapter
        disable_sort_items = set( self._args["disable-sort-items"].split( "," ) )

        # initialize
        self._curr_chapter = None
        curr_chapter_pageno = None
        self._curr_footnote = None

        # NOTE: The parsing code works in two modes.
        # - We start off extracting content, and detect the start of a new rule by bold text near the start of the line.
        # - When we see the footnotes header (e.g. "CHAPTER A FOOTNOTES"), we switch into footnotes mode, and detect
        #   the start of a footnote by a bold number near the start of the line.

        # process each page
        for page_no, page, lt_page in PageIterator( pdf ):

            # prepare to process the next page
            if page_no > max( page_index.keys() ):
                break
            if page_no not in page_index:
                self._log_msg( "progress", "- Skipping page {}.", page_no )
                continue
            if not self._curr_chapter or self._curr_chapter != page_index[page_no]:
                # we've found the start of a new chapter
                self._save_footnote() # nb: save the last footnote of the previous chapter
                self._curr_chapter = page_index[ page_no ]
                curr_chapter_pageno = 1
            else:
                curr_chapter_pageno += 1
            self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page#
                self._curr_chapter, curr_chapter_pageno
            )
            self._log_msg( "progress", "- Processing page {} ({})...", page_no, self._curr_pageid )

            # process each element on the page
            curr_caption = None
            self._top_left_elem = self._prev_elem = None
            elem_filter = lambda e: isinstance( e, LTChar )
            sort_elems = self._curr_pageid not in disable_sort_items
            for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):

                # keep track of the top-left-most bold element
                if self._is_bold( elem ):
                    if self._top_left_elem is None \
                       or ( elem.x0 < self._top_left_elem.x0 and elem.y1 > self._top_left_elem.y1 ):
                        self._top_left_elem = elem

                # check if we should ignore this element
                if not self._in_viewport( elem, "content" ):
                    continue

                # check if we're currently extracting footnotes
                if self._curr_footnote is not None:
                    self._on_footnote_elem( elem, lt_page )
                    self._prev_elem = elem
                    continue

                # figure out what we've got
                is_bold = self._is_bold( elem )
                if is_bold and curr_caption and curr_caption[0].isdigit() and 1 < elem.y1 - self._prev_elem.y0 < elem.height/2:
                    # the previous bold character looks like a footnote superscript - ignore it
                    curr_caption = None
                if curr_caption and elem.get_text() == " ":
                    # FUDGE! Some captions are in a bold font, but the spaces are not :-/
                    is_bold = True
                if is_bold:
                    if curr_caption:
                        # NOTE: We stop collecting bold characters at the end of the line, even if they continue on
                        # to the next line. This is to handle the case of a major heading (e.g. "1. PERSONNEL COUNTERS")
                        # being followed by a lesser heading ("1.1"). However, we want to handle captions that span
                        # multiple lines, so we check the vertical distance between the lines to see if it looks like
                        # two separate headings, or a single caption that has spread over multiple lines.
                        if self._prev_elem.y0 - elem.y1 > 0.25*elem.height:
                            # we've found the start of a new rule - save the old one, start collecting the new caption
                            self._save_target( curr_caption, page_no, lt_page, elem )
                            curr_caption = [ elem.get_text(), ( elem.x0, elem.y1 ) ]
                        else:
                            # continue collecting the caption
                            if self._prev_elem.y0 - elem.y0 > 1:
                                # nb: we just started a new line
                                curr_caption[0] = append_text( curr_caption[0], elem.get_text() )
                            else:
                                curr_caption[0] += elem.get_text()
                    else:
                        # check if this is the first character of the line
                        if self._is_start_of_line( elem, lt_page ):
                            # yup - start collecting the caption
                            curr_caption = [ elem.get_text(), ( elem.x0, elem.y1 ) ]
                else:
                    # check if we're currently collecting a caption
                    if curr_caption:
                        # yup - we've just found the end of it, save it
                        self._save_target( curr_caption, page_no, lt_page, elem )
                        curr_caption = None

                # loop back to process the next element
                self._prev_elem = elem

        # add the last caption/footnote (if they haven't already been done)
        self._save_footnote()
        if curr_caption:
            self._save_target( curr_caption, page_no, None, None )

        # check for unused fixups
        if self._target_fixups:
            self._log_msg( "warning", "Unused target fixups: {}", self._target_fixups )
        if self._footnote_fixups:
            self._log_msg( "warning", "Unused footnote fixups: {}", self._footnote_fixups )

    def _save_target( self, caption, page_no, lt_page, elem ):
        """Save a parsed target."""

        # initialize
        orig_caption = caption[0]
        caption_text = re.sub( r"\s+", " ", caption[0] ).strip()
        if len(caption_text) <= 1:
            # NOTE: We're finding text that is part of an image (e.g. the "E" for an Elite MMC),
            # perhaps because the pages were OCR'ed, so we ignore these.
            return

        # check if we've found the start of the chapter's footnotes
        if "FOOTNOTES" in caption_text:
            # yup - notify the main loop
            self._curr_footnote = []
            if elem:
                self._on_footnote_elem( elem, lt_page )
            return

        # check if the entry needs to be fixed up
        fixup = self._target_fixups.get( self._curr_pageid, {} ).get( caption_text )
        if fixup:
            # yup - make it so
            fixup[ "instances" ] = fixup.get( "instances", 1 ) - 1
            if fixup["instances"] <= 0:
                self._target_fixups[ self._curr_pageid ].pop( caption_text )
                if not self._target_fixups[ self._curr_pageid ]:
                    del self._target_fixups[ self._curr_pageid ]
            ruleid = fixup.get( "new_ruleid" )
            if not ruleid:
                return
            caption_text = fixup.get( "new_caption" )
        else:
            # nope - use what was parsed
            # FUDGE! There are a lot of layout problems with things like "12.CONCEALMENT" (i.e. missing space),
            # and it's tricky to detect these and not get tripped up by things like "12.C blah", so we handle it
            # as a separate case.
            mo = re.search( r"^(\d+\.\d*)([^ 0-9].+)", caption_text )
            if mo:
                ruleid, caption_text = mo.group(1), mo.group(2).strip()
            else:
                # check if the caption text starts with something that looks like a ruleid
                # NOTE: A leading "*" indicates an optional rule.
                mo = re.search( r"^\*?([A-Z]\.?)?[1-9][0-9.-]*[A-F]?", caption_text )
                if not mo:
                    return
                ruleid, caption_text = mo.group(), caption_text[mo.end():].strip()
                if ruleid.startswith( "*" ):
                    ruleid = ruleid[1:]
            ruleid = remove_trailing( ruleid, "." )
            caption_text = remove_trailing( caption_text, ":" )

        # save the new target
        if not ruleid.startswith( self._curr_chapter ):
            ruleid = self._curr_chapter + ruleid
        if ruleid in self._targets:
            self._log_msg( "warning", "Ignoring duplicate ruleid: {} (from \"{}\").",
                ruleid, caption[0]
            )
            return
        if caption_text == "\u2014":
            caption_text = "-" # nb: for A7.306 :-/
        self._targets[ ruleid ] = {
            "caption": fixup_text( caption_text ), "page_no": page_no, "pos": caption[1],
            "raw_caption": orig_caption
        }

    def _on_footnote_elem( self, elem, lt_page ):
        """Process an element while we're parsing footnotes."""
        # check if we've found the start of a new footnote
        if self._is_bold( elem ):
            if elem.get_text().isdigit() and self._is_start_of_line( elem, lt_page ):
                # yup - save the current footnote, start collecting the new one
                self._save_footnote()
                self._curr_footnote = [ elem.get_text(), "" ]
            else:
                if self._curr_footnote[1]:
                    # FUDGE! Some footnote content has bold text hard-up at the left margin,
                    # so we collect that as normal content.
                    self._curr_footnote[1] += elem.get_text()
                else:
                    # we're still collecting the footnote's ID
                    # NOTE: Older chapters have only the footnote ID in bold text, while newer chapters have
                    # both the ID and caption in bold. We figure out what's going on later, in _save_footnote().
                    self._curr_footnote[0] += elem.get_text()
        else:
            # nope - we're still collecting the footnote's content
            if not self._prev_elem or elem.x0 < self._prev_elem.x0 or elem.y0 - self._prev_elem.y0 > lt_page.height/2:
                # nb: we just started a new line
                self._curr_footnote[1] = append_text( self._curr_footnote[1], elem.get_text() )
            else:
                self._curr_footnote[1] += elem.get_text()

    def _save_footnote( self ):
        """Save a parsed footnote."""
        if not self._curr_footnote:
            return

        # initialize
        if self._curr_chapter not in self._footnotes:
            # start saving footnotes for the chapter
            self._footnotes[ self._curr_chapter ] = []
        orig_content = self._curr_footnote[1]

        # separate the footnote ID, referenced rule, and content
        if self._curr_chapter in ( "F", "G", "W" ):
            # NOTE: Chapter F/G/W footnote captions are also bold.
            mo = re.search( r"^\d{1,2}\.", self._curr_footnote[0] )
            if mo:
                parts = mo.group(), self._curr_footnote[0][mo.end():]
                self._curr_footnote[0] = parts[0]
                self._curr_footnote[1] = parts[1].strip() + " " + self._curr_footnote[1].strip()
            else:
                self._log_msg( "warning", "Couldn't split footnote caption: {}", self._curr_footnote[0] )
        footnote_id = remove_trailing( self._curr_footnote[0].strip(), "." )
        content = self._curr_footnote[1].strip()
        mo = re.search( r"^(F\.1B|W\.\d+[AB]|[A-Z]?[0-9.]+)", content )
        if mo:
            ruleid, content = mo.group(), content[mo.end():]
            if not ruleid.startswith( self._curr_chapter ):
                ruleid = self._curr_chapter + ruleid
            ruleid = remove_trailing( ruleid, "." )
        else:
            ruleid = None
        if self._curr_chapter == "C":
            # FUDGE! The "29." for Chapter C's footnote #29 is misaligned, and is extracted as two separate
            # footnotes "2" and "9". There isn't really any way to fix this via the normal data-driven mechanism,
            # so we do it in the code here :-/
            footnote_ids = [ f["footnote_id"] for f in self._footnotes[ self._curr_chapter ] ]
            if footnote_id == "2" and "2" in footnote_ids:
                return
            if footnote_id == "9" and "9" in footnote_ids:
                footnote_id = "29"

        # clean up the content
        content = re.sub( r"\s+", " ", content ).strip()
        content = fixup_text( content )
        mo = re.search( r"^[A-Z ]+:\S", content )
        if mo:
            # nb: insert a space after the caption's colon
            content = content[:mo.end()-1] + " " + content[mo.end()-1:]

        # check for any fixups
        captions = []
        fixups = self._footnote_fixups.get( self._curr_chapter, {} ).get( footnote_id )
        if fixups:
            if isinstance( fixups, list ):
                # NOTE: A simple search-and-replace is, by far, the most common fixup, so we provide
                # a simplified way of specifying these in the fixup file.
                fixups = { "replace": [ ( sr[0], sr[1] ) for sr in fixups ] }
            errors = defaultdict( list )
            # do any search-replace's
            if "replace" in fixups:
                for sr in fixups["replace"]:
                    prev_content = content
                    content = content.replace( sr[0], sr[1] )
                    if content == prev_content:
                        self._log_msg( "warning", "Footnote fixup for \"{}:{}\" had no effect: {}",
                            self._curr_chapter, footnote_id, sr[0]
                        )
                        errors["replace"].append( sr )
                del fixups["replace"]
            # replace the captions
            if "captions" in fixups:
                captions = fixups.pop( "captions" )
            # check that all fixups were successfully applied
            if fixups:
                errors.update( fixups ) # nb: anything left over is an unrecognized fixup type
            if errors:
                self._footnote_fixups[ self._curr_chapter ][ footnote_id ] = errors
            else:
                del self._footnote_fixups[ self._curr_chapter ][ footnote_id ]
                if not self._footnote_fixups[ self._curr_chapter ]:
                    del self._footnote_fixups[ self._curr_chapter ]
        content = content.strip()

        # extract the footnote's caption
        if not captions:
            pos = content.find( ":" )
            if pos >= 0:
                captions.append( ( ruleid, content[:pos] ) )
                content = content[pos+1:].strip()
            else:
                self._log_msg( "warning", "Can't extract footnote caption: {}:{} - {}",
                    self._curr_chapter, footnote_id, content
                )

        # check for the credits at the end of the Chapter F footnotes
        pos = content.find( "WEST OF ALAMEIN CREDITS" )
        if pos > 0:
            content = content[:pos]

        # save the footnote
        self._footnotes[ self._curr_chapter ].append( {
            "footnote_id": footnote_id,
            "captions": captions,
            "content": content,
            "raw_content": orig_content
        } )
        self._curr_footnote = None

    def _is_start_of_line( self, elem, lt_page ):
        """Check if the element is at the start of its line."""
        # NOTE: We can't just check the element's x co-ordinate, since there is sometimes a floating image
        # that pushes the text right (e.g. A.12).
        if self._prev_elem is None:
            return True
        if elem.y0 < self._prev_elem.y0:
            return True
        if self._prev_elem.x0 < lt_page.width/2 and elem.x0 > lt_page.width/2:
            return True # nb: the element is at the top of the right column
        return False

    def save_as_raw( self, targets_out, footnotes_out ):
        """Save the raw results."""
        self._save_as_raw_or_text( targets_out, footnotes_out, True )

    def save_as_text( self, targets_out, footnotes_out ):
        """Save the results as plain-text."""
        self._save_as_raw_or_text( targets_out, footnotes_out, False )

    def _save_as_raw_or_text( self, targets_out, footnotes_out, raw ):
        """Save the results as raw or plain-text."""

        # save the targets
        curr_page_no = None
        for ruleid, target in self._targets.items():
            if target["page_no"] != curr_page_no:
                if curr_page_no:
                    print( file=targets_out )
                print( "=== p{} ===".format( target["page_no"] ), file=targets_out )
                curr_page_no = target["page_no"]
            xpos, ypos = self._get_target_pos( target )
            if raw:
                print( "[{},{}] = {}".format(
                    xpos, ypos, target["raw_caption"]
                ), file=targets_out )
            else:
                print( "{} => {} @ p{}:[{},{}]".format(
                    ruleid, target["caption"], target["page_no"], xpos, ypos
                ), file=targets_out )

        # save the footnotes
        def make_caption( caption ):
            buf = []
            if caption[1]:
                buf.append( caption[1] )
                if caption[0]:
                    buf.append( "[{}]".format( caption[0] ) )
            elif caption[0]:
                buf.append( caption[0] )
            return " ".join( buf )
        for chapter, footnotes in self._footnotes.items():
            if chapter != "A":
                print( file=footnotes_out )
            print( "=== CHAPTER {} FOOTNOTES {}".format( chapter, 80*"=" )[:80], file=footnotes_out )
            for footnote in footnotes:
                print( file=footnotes_out )
                print( "--- Footnote {} ---".format( footnote["footnote_id"] ), file=footnotes_out )
                if raw:
                    print( footnote["raw_content"], file=footnotes_out )
                else:
                    print( " ; ".join( make_caption(c) for c in footnote["captions"] ), file=footnotes_out )
                    print( footnote["content"], file=footnotes_out )

    def save_as_json( self, targets_out, footnotes_out ):
        """Save the results as JSON."""

        # save the targets
        targets, curr_chapter = [], None
        for ruleid, target in self._targets.items():
            xpos, ypos = self._get_target_pos( target )
            targets.append( "{}: {{ \"caption\": {}, \"page_no\": {}, \"pos\": [{},{}] }}".format(
                jsonval( ruleid ),
                jsonval( target["caption"] ), target["page_no"], xpos, ypos
            ) )
            if ruleid[0] != curr_chapter:
                targets[-1] = "\n" + targets[-1]
                curr_chapter = ruleid[0]
        print( "{{\n{}\n\n}}".format(
            ",\n".join( targets )
        ), file=targets_out )

        # save the footnotes
        def make_caption( caption ):
            return "{{ \"caption\": {}, \"ruleid\": {} }}".format(
                jsonval( caption[1] ), jsonval( caption[0] )
            )
        chapters = []
        for chapter in self._footnotes:
            footnotes = []
            for footnote in self._footnotes[ chapter ]:
                footnotes.append( "{}: {{\n \"captions\": {},\n \"content\": {}\n}}".format(
                    jsonval( footnote["footnote_id"] ),
                    "[ {} ]".format( ", ".join( make_caption(c) for c in footnote["captions"] ) ),
                    jsonval( footnote["content"] )
                ) )
            chapters.append( "{}: {{\n\n{}\n\n}}".format(
                jsonval( chapter ),
                ",\n".join( footnotes )
            ) )
        print( "{{\n\n{}\n\n}}".format(
            ",\n\n".join( chapters )
        ), file=footnotes_out )

    @staticmethod
    def _get_target_pos( target ):
        """Return a target's X/Y position on the page."""
        xpos = math.floor( target["pos"][0] )
        ypos = math.ceil( target["pos"][1] )
        return xpos, ypos

# ---------------------------------------------------------------------

@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, format, save_targets_fname, save_footnotes_fname ):
    """Extract content from the MMP eASLRB."""

    # initialize
    args = ExtractBase.parse_args( args, _DEFAULT_ARGS )

    # extract the content
    def log_msg( msg_type, msg ):
        if msg_type == "progress" and not progress:
            return
        log_msg_stderr( msg_type, msg )
    extract = ExtractContent( args, log_msg )
    extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
    with PdfDoc( pdf_file ) as pdf:
        extract.extract_content( pdf )

    # save the results
    with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
         open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
        getattr( extract, "save_as_"+format )( targets_out, footnotes_out )

if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter

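nb: For reference, save_as_json() writes the targets file as a single JSON object keyed by ruleid.
The page number and position below are illustrative values, not real extraction output:

    {
        "A9.223": { "caption": "CANCELLATION", "page_no": 21, "pos": [26,715] },
        ...
    }
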
--- /dev/null
+++ b/asl_rulebook2/extract/data/footnote-fixups.json
@@ -0,0 +1,209 @@
{
"A": {
"10A": [
[ "OneHalfFP", "One-Half FP" ],
[ "firstappearedintheASLAnnual'89.(In1998,bothwerereprintedin Classic ASL.)", "first appeared in the ASL Annual '89. (In 1998, both were reprinted in Classic ASL.)" ],
[ "One of the several criticisms", "<p> One of the several criticisms" ]
],
"12": [ [ "TEMto", "TEM to" ] ],
"14": [
[ "bipodmounted", "bipod-mounted" ],
[ "volume o f fire", "volume of fire" ]
],
"17": [ [ "adistinct", "a distinct" ] ],
"19" : [ [ "wellsited", "well-sited" ] ],
"32": [ [ "HWunits", "HW units" ] ],
"33": [ [ "multiLocation", "multi-Location" ] ],
"35": [ [ "The original printing", "<p> The original printing" ] ],
"37": [
[ "- Winter War (vs Soviet Union) 30 November 1939 - 13 March 1940- Continuation War (vs Soviet Union) 25 June 1941 - 4 September 1944- Lapland War (vs Germany) 15 September 1944 - 27 April 1945", " <ul> <li> <b>Winter War</b> (vs Soviet Union) 30 November 1939 - 13 March 1940 <li> <b>Continuation War</b> (vs Soviet Union) 25 June 1941 - 4 September 1944 <li> <b>Lapland War</b> (vs Germany) 15 Se ptember 1944 - 27 April 1945 </ul>" ]
],
"38": [
[ "Romania: Romania,", " <p> <b>Romania</b>: Romania," ],
[ "Hungary: A traditional", " <p> <b>Hungary</b>: A traditional" ],
[ "Slovakia: Urged on", "<p> <b>Slovakia</b>: Urged on" ],
[ "German-Croatian units in Russia:", " <p> <b>German-Croatian units in Russia</b>:" ],
[ "Italian-Croatian units in Russia:", " <p> <b>Italian-Croatian units in Russia</b>:" ],
[ "Croatian units in Yugoslavia:", " <p> <b>Croatian units in Yugoslavia</b>:" ],
[ "CroatianArmyunitswereengagedprimarilyinanti-partisanactivities,fightingmostly", "Croatian Army units were engaged primarily in anti-partisan activities, fighting mostly" ],
[ "Bulgaria: Bulgaria", "<p> <b>Bulgaria</b>: Bulgaria" ],
[ "WhiletheriflecompanydidnothaveaninherentHeavyWeapons(HW)platoon,it", "While the rifle company did not have an inherent Heavy Weapons (HW) platoon, it"]
],
"39": [ [ "generallyapply", "generally apply" ] ],
"41": [ [ "ViceAdmiral", "Vice-Admiral" ] ],
"43": [
[ "ALLIEDMINORS", "ALLIED MINORS" ],
[ "BARrather", "BAR rather" ]
]
},
"B": {
"3B": [ [ "LOWERLEVELLOCATIONS", "LOWER LEVEL LOCATIONS" ] ],
"6": [ [ "The Village Terrain rules", "<p> The Village Terrain rules" ] ],
"13": [ [ "U6U7 U8 U9W6W7W8W9V6 V7 V8Y6 Y7Y8Y9 X6X7X8", "" ] ]
},
"C": {
"1": [ [ "ac tually", "actually" ] ],
"8": [ [ "rep resents", "represents" ] ],
"13": [ [ "0o", "0&deg;" ] ],
"20": [ [ "predetermined", "pre-determined" ] ],
"21": [ [ "p roneto", "prone to" ] ],
"26": [
[ "Mortarsof76-107mm", "Mortars of 76-107mm" ],
[ "areexempted", "are exempted" ],
[ "frommanyof", "from many of" ],
[ "normalGun", "normal Gun" ],
[ "thantowed", "than towed" ]
],
"32": [ [ "PF counters were removed", "<p> PF counters were removed" ] ],
"38": [ [ "U. S. Army", "U.S. Army" ] ]
},
"D": {
"8": [ [ "massproduced", "mass-produced" ] ]
},
"E": {
"3": [
[ "flexibilityis", "flexibility is" ],
[ "for gottenby", "forgotten by" ]
],
"4": [ [ "Th e most", "The most" ] ],
"11": [
[ "Another problem", "<p> Another problem" ],
[ "A Fire cast", "<p> A Fire cast" ],
[ "Finally, to add", "<p> Finally, to add" ]
],
"14B": [ [ "infantrypulled", "infantry-pulled" ] ],
"15": [ [ "shallowdraught", "shallow draught" ] ],
"18": [
[ "reallife", "real-life" ],
[ "the g eneric", "the generic" ]
],
"20": {
"captions": [ [ "E7.51", "LIGHT AA" ], [ "E7.52", "HEAVY AA" ] ],
"replace": [
[ "& 7.52 AA FIRE:", "" ]
]
},
"24": {
"captions": [ [ "E9.2", "DRIFT" ], [ "E9.4", "LANDING" ] ],
"replace": [
[ "DRIFT & 9.4 LANDING:", "" ]
]
}
},
"F": {
"12": [ [ "non- entrenched", "non-entrenched" ] ],
"19": [
[ "Inthewinternight,thenear-freezingtemperaturecauseddewtoform.", "In the winter night, the near-freezing temperature caused dew to form. " ],
[ "Thenextmorningathickmistoftenformedasthesun evaporateditagain.", "The next morning a thick mist often formed as the sun evaporated it again. " ],
[ "Thiscouldhappeneveninthesummertimeundertheproperenvironmentalconditions,", "This could happen even in the summertime under the proper environmental conditions, " ],
[ "butsincethiswasamuchlessfrequentoccurrenceithasbeen ignored.", "but since this was a much less frequent occurrence it has been ignored." ]
],
"21": [
[ "Playerswillprobablyfinditmoreconvenienttoinstead", "Players will probably find it more convenient to instead" ],
[ "addathird,different-coloreddietothisTH/IFTDR,", "add a third, different-colored die to this TH/IFT DR, " ],
[ "usingittodeterminetheDust DRM.", "using it to determine the Dust DRM." ],
[ "Thefamiliarterm\"subsequentdr\"wasusedintherulebecauseitobviates theneed", "The familiar term \"subsequent dr\" was used in the rule because it obviates the need" ],
[ "a\"new\"concept", "a \"new\" concept" ],
[ "thatof rolling athird diesimultaneously", "that of rolling a third die simultaneously" ]
],
"22": [
[ "theDustcounter\"follows\"thevehicleasit movesfromhex to hex", "the Dust counter \"follows\" the vehicle as it moves from hex to hex" ],
[ "itexpends", "it expends " ],
[ "two MPeach timeitdoesso", " two MP each time it does so" ]
],
"23": [
[ "Anotherwind-relatedaspectoftheNorthAfricanenvironmentisthedesertsandstorm,", "Another wind-related aspect of the North African environment is the desert sandstorm, " ],
[ "orkhamsininArabic.", "or khamsin in Arabic. " ],
[ "ChapterFincludesnospecial rulesforitbecause,", "Chapter F includes no special rules for it because, " ],
[ "withvisibilitycutbythestormtoaslittleasthreeyards,", "with visibility cut by the storm to as little as three yards, " ],
[ "allactivitiesgenerallywerereducedtoseekingcoverfromthesandblastingwindandchoking dust.", "all activities generally were reduced to seeking cover from the sandblasting wind and choking dust. " ],
[ "However,thegamedoesnotignorethepossibilityofakhamsin'soccurrence.", "However, the game does not ignore the possibility of a khamsin's occurrence. " ],
[ "The propercombinationofWeather,EC,WindandGustsinaDYOscenariocancreateits effects,", "The proper combination of Weather, EC, Wind and Gusts in a DYO scenario can create its effects, " ],
[ "andtheprobabilityofitsoccurrenceisgreatestinascenariosetinspringor summer", "and the probability of its occurrence is greatest in a scenario set in spring or summer" ],
[ "thetimewhen khamsinsoccurred mostfrequently.", "the time when khamsins occurred most frequently." ]
],
"24": [
[ "Thisoverlay isused in aHOLLOW LEGIONS scenario.", "This overlay is used in a HOLLOW LEGIONS scenario." ]
],
"25": [
[ "ThefamousNorthAfricanescarpmentsaresimilarto cliffs,", "The famous North African escarpments are similar to cliffs, " ],
[ "butwithlesssteep(andveryeroded)slopes.", "but with less steep (and very eroded) slopes. " ],
[ "Somearesixhundredfeethigh", "Some are six hundred feet high" ],
[ "thoughgenerallytheirheightsrangefromonehundredtotwohundredfeet.", "though generally their heights range from one hundred to two hundred feet. " ],
[ "Theirsignificanceinthedesertwarlaymainlyinthattheywerecommandingheights,", "Their significance in the desert war lay mainly in that they were commanding heights, " ] ,
[ "defensivepositionsforinfantry,", "defensive positions for infantry, " ],
[ "andgreatlyrestrictedvehicularmovementacrossthem", "and greatly restricted vehicular movement across them" ],
[ "Hencetheywereoftenthesceneofheavyfighting,", "Hence they were often the scene of heavy fighting, " ],
[ "especiallywherecrossedbya road", "especially where crossed by a road" ]
]
},
"G": {
"4": [ [ "It's also interesting", "<p> It's also interesting" ] ],
"8": [ [ "miniDC,", "mini-DC," ] ],
"33": [ [ "closein", "close-in" ] ],
"45": [
[ "Guomindang(akaKuomintang", "Guomindang (aka Kuomintang" ],
[ "XForce", "X-Force" ],
[ "The two-tone color", "<p> The two-tone color" ]
],
"47" : [ [ "against-allodds", "against-all-odds" ] ],
"48": [ [ "trained-andequipped", "trained-and-equipped" ] ]
},
"W": {
"2": [
[ "Korean National Defense Constabulary:", "<ul> <li> <em>Korean National Defense Constabulary</em>: " ],
[ "ROK Army:", "<li> <em>ROK Army</em>: " ],
[ "Korean Marine Corps:", "<li> <em>Korean Marine Corps</em>: " ],
[ "United States - Army:", "<li> <em>United States</em> <ul> <li> Army:" ],
[ "- Army Airborne:", "<li> Army Airborne:" ],
[ "- Army Rangers:", "<li> Army Rangers:" ],
[ "- KATUSA:", "<li> KATUSA:" ],
[ "- Marine Corps:", "<li> Marine Corps:" ],
[ "British Commonwealth:", "</ul> <li> <em>British Commonwealth</em>: " ],
[ "- 41 Independent Commando, Royal Marines: 9/50-12/51", "<ul> <li> 41 Independent Commando, Royal Marines: 9/50-12/51 </ul>" ],
[ "Other United Nations Command:", "<li> <em>Other United Nations Command</em>: " ],
[ "10/50-7/53", "10/50-7/53 </ul>" ]
],
"3": [
[ "Korean People's Army:", "<ul> <li> <em>Korean People's Army</em>: " ],
[ "Communist Guerillas:", "<li> <em>Communist Guerillas</em>: " ],
[ "Chinese People's Volunteer Army:10/50-7/53", "<li> <em>Chinese People's Volunteer Army</em>: 10/50-7/53 </ul>" ]
],
"9": [ [ "T34/85", "T-34/85" ] ],
"16": [
[ "3 1/3 PP", "3&frac13; PP" ],
[ "24-8 HS", "2-4-8 HS" ]
],
"18": [ [ "The first unit", "<p> The first unit" ] ],
"29" : [ [ "RAT KILLERin which", "RAT KILLER in which" ] ],
"30": [
[ "G.M.D in", "G.M.D. in" ],
[ "as. sumed", "assumed" ]
],
"49": [ [ "SUP-PORT", "SUPPORT" ] ],
"50": [ [ "Speciallytrained", "Specially-trained" ] ]
}
}

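nb: _save_footnote() (in content.py above) accepts two shapes for the entries in this file: a bare
list is shorthand for search-and-replace pairs, while a dict can also supply "captions" to override
the parsed ones. Both forms appear above, e.g.:

    "12": [ [ "TEMto", "TEM to" ] ]
    "20": { "captions": [ [ "E7.51", "LIGHT AA" ], [ "E7.52", "HEAVY AA" ] ],
            "replace": [ [ "& 7.52 AA FIRE:", "" ] ] }

Any fixup that has no effect is logged as a warning, and anything left unapplied is reported as
unused at the end of the run.
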
--- /dev/null
+++ b/asl_rulebook2/extract/data/index-fixups.json
@@ -0,0 +1,288 @@
{
"AirSupport": {
"new_title": "Air Support",
"old_content": "E7,[BRT:TCG6][ChineseDYO:G18.83][ENEMY: S8.9][cannotbeusedvsanyLocationinFog:E3.313][inRB,German AirSupportisalwaysaStukaM42:SSRRB9][JapaneseDYO: G1.6621][inKGP,NAifMistDensity>Light,Night,orOvercast:SSR KGP3][Napalm:G17.4][NightNA:E7.2][OvercastNA:E3.55][during SeaborneAssault/Evacuation:G14.34][SeaborneAssaultDYO: G14.262][TarawaNavalGunfire: TCG3.3]",
"new_content": "E7, [BRT: TCG6] [Chinese DYO: G18.83] [ENEMY: S8.9] [cannot be used vs any Location in Fog: E3.313] [in RB, German Air Support is always a Stuka M42: SSR RB9] [Japanese DYO: G1.6621] [in KGP, NA if Mist Density > Light, Night, or Overcast: SSR KGP3] [Napalm: G17.4] [Night NA: E7.2] [Overcast NA: E3.55] [during Seaborne Assault/Evacuation: G14.34] [Seaborne Assault DYO: G14.262] [Tarawa Naval Gunfire: TCG3.3]"
},
"Ambush": {
"old_content": "A11.4[attacksfirstinCC:A11.32][ATTACKERadds+1 drm to Ambush dr in jungle, kunai, or bamboo Location: G.6] [Banks: G8.212][keeping \"?\" during CC: A12.14] [Dummies are eliminated BEFORE the Ambush dr, and do not qualify for the -2 drm: ASOP 8.11B] [Hand-to-Hand CC: J2.31] [Night: E1.77] [Panic Action: S6.213] [Panjis: G9.21] [in Rubble: SSR RB8] [Street Fighting: A11.8] [T-H Heroes are created after Ambush determination: G1.421]",
"new_content": "A11.4 [attacks first in CC: A11.32] [ATTACKER adds +1 drm to Ambush dr in jungle, kunai, or bamboo Location: G.6] [Banks: G8.212] [keeping \"?\" during CC: A12.14] [Dummies are eliminated BEFORE the Ambush dr, and do not qualify for the -2 drm: ASOP 8.11B] [Hand-to-Hand CC: J2.31] [Night: E1.77] [Panic Action: S6.213] [Panjis: G9.21] [in Rubble: SSR RB8] [Street Fighting: A11.8] [T-H Heroes are created after Ambush determination: G1.421]"
},
"American": {
"old_content": "A25.3[EarlyArmy:G17.2][OBAAccuracy:C1.3] [Paramarine:G17.111][Raider:G17.111][RifleCompany:S18.5][U.S. Marine Corps: G17.1]",
"new_content": "A25.3 [Early Army: G17.2] [OBA Accuracy: C1.3] [Paramarine: G17.111] [Raider: G17.111] [Rifle Company: S18.5] [U.S. Marine Corps: G17.1]"
},
"AmmoPP Reduction": {
"new_title": "Ammo PP Reduction"
},
"APCR": {
"new_title": "APCR/APDS",
"old_content": "(Armor Piercing Composite Rigid)/",
"new_content": "(Armor Piercing Composite Rigid/Discarding Sabot): C8.1-.2 [EXC German 28LL, 40LL: C4.3, C7.32] [vs Guns: C11.52] [HE Equivalency: C8.31] [TH# Modification: C4.3] [To Kill Table: C7.32] [Residual FP NA: C8.31]"
},
"APDS": {
"old_content": "(Armor Piercing Discarding Sabot): C8.1-.2 [EXC German 28LL, 40LL: C4.3, C7.32] [vs Guns: C11.52] [HE Equivalency: C8.31] [TH# Modification: C4.3] [To Kill Table: C7.32] [Residual FP NA: C8.31]",
"new_content": null
},
"Broken Units": {
"replace": [
[ "[Pin NA: A7.8(EXCInterdiction and Huts)]", "[Pin NA (EXC Interdiction and Huts): A7.8]" ]
]
},
"Cellars": {
"replace": [
[ "RBCellars", "RB Cellars" ]
]
},
"DC": {
"replace": [
[ "[Thrown from: (Halftrack: D6.63) (Sidecar: D15.6)]", "[Thrown from Halftrack: D6.63] [Thrown from Sidecar: D15.6]" ]
]
},
"Direct Fire": {
"old_content": "(Any fireattackrequiringaLOSfromthe firerwhichdoesnotuseIndirectFire):C.1,C9.1[InterveningUnits:A6.6][LC: G12.61-.62, G12.671]",
"new_content": "(Any fire attack requiring a LOS from the firer which does not use Indirect Fire): C.1, C9.1 [Intervening Units: A6.6] [LC: G12.61-.62, G12.671]"
},
"Dogfight": {
"old_content": "(AerialCombat):E7.22",
"new_content": "(Aerial Combat): E7.22"
},
"Elite": {
"replace": [
[ "[German (Africa, 1942-43: F.6) (prior to 1944: A25.1) (SS: A25.11)]", "[German (Africa, 1942-43): F.6] [German (prior to 1944): A25.1] [German (SS): A25.11]" ]
]
},
"End of Scenario": {
"replace": [
[ "[in ABtF: R9.4 CG4]", "[in ABtF: R9.4, CG4]" ],
[ "[in KGP: P8.4 CG23]", "[in KGP: P8.4, CG23]" ],
[ "[in PB: Q9.4 CG19 (-1 drm for any Night Scenario and +1 drm for Day II scenario: Turn Record Track)]", "[in PB (-1 drm for any Night Scenario and +1 drm for Day II scenario: Turn Record Track): Q9.4, CG19]" ],
[ "[in RB: O11.4 CG4]", "[in RB: O11.4, CG4]" ]
]
},
"EX": {
"old_content": "ExampleEXC: Exception",
"new_content": "Example",
"_comment_": "The code manually inserts an entry for EXC: Exception"
},
"Fortification": {
"replace": [
[ "[in BRT: SSR1 (BRT Sand: T3.2) (NA in Betio Piers: T9.2)]", "[in BRT: SSR1 (BRT Sand): T3.2] [in BRT: SSR1 (NA in Betio Piers): T9.2]" ]
]
},
"Hazardous Movement": {
"replace": [
[ "[Clearance of: (Debris: O1.5) (Fire: B24.72) (Jungle Path: G2.7) (Roadblock: B24.76) (Rubble: B24.71)]", "[Clearance of Debris: O1.5] [Clearance of Fire: B24.72] [Clearance of Jungle Path: G2.7] [Clearance of Roadblock: B24.76] [Clearance of Rubble: B24.71]" ]
]
},
"Hedges": {
"replace": [
[ "[TEM NA for Ground Support: E7.4; for PRC: B9.3]", "[TEM NA for Ground Support: E7.4] [TEM NA for PRC: B9.3]" ]
]
},
"Immobilization": {
"replace": [
[ "[LC: G12.602; LC Passengers NA: G12.13]", "[LC: G12.602] [LC Passengers NA: G12.13]" ],
[ "[TC: D5.5; TC in BRT: SSR12]", "[TC: D5.5] [TC in BRT: SSR12]" ]
]
},
"Jungle": {
"replace": [
[ "G.2G.6", "G.2-G.6" ]
]
},
"Kunai": {
"replace": [
[ "G.2G.6", "G.2-G.6" ]
]
},
"Leadership": {
"replace": [
[ "[Battle Hardening: A15.3, Finns: A25.71, Japanese: G1.41]", "[Battle Hardening: A15.3] [Battle Hardening (Finns): A25.71] [Battle Hardening (Japanese): G1.41]" ]
]
},
"MG": {
"replace": [
[ "[Vehicular: (see Vehicular MG: D3.5-.54)]", "[Vehicular MG: D3.5-.54]" ],
[ "[Aerial: E7.41, vs AFV: C7.22]", "[Aerial: E7.41] [Aerial (vs AFV): C7.22]" ]
]
},
"Minefield": {
"replace": [
[ "[fully-tracked A FV T B: B 8.61]", "[fully-tracked AFV TB: B8.61]" ]
]
},
"Morale": {
"replace": [
[ "[Gain:", "Gain:" ],
[ "FFE]", "FFE" ]
]
},
"Movement, Vehicle": {
"replace": [
[ "(see Amphibians: D16)", "(see Amphibians)" ]
]
},
"OBA": {
"replace": [
[ "USOrdnance", "US Ordnance" ]
]
},
"Optional Rules": {
"replace": [
[ "A12.16 (see footnote A18)", "A12.16, footnote A18" ]
]
},
"PAATC": {
"old_content": "(Pre-AFVAdvance/AttackTaskCheck;NAtoberserk/Fanatic/Japanese/SMC): A11.6, G1.62 [vs Armored Cupola: O.7] [DC Placement: A23.3] [ENEMY Advance into CC/Melee: S11.4] [1PAATC: Chinese, NonElite Italians, Inexperienced, Allied/Axis Minors] [OVR vs \"?\": A12.41] [CC Reaction Fire: D7.21]",
"new_content": "(Pre-AFV Advance/Attack Task Check; NA to berserk/Fanatic/Japanese/SMC): A11.6, G1.62 [vs Armored Cupola: O.7] [DC Placement: A23.3] [ENEMY Advance into CC/Melee: S11.4] [1PAATC: Chinese, Non-Elite Italians, Inexperienced, Allied/Axis Minors] [OVR vs \"?\": A12.41] [CC Reaction Fire: D7.21]"
},
"PBF": {
"replace": [
[ "A11.l", "A11.1" ]
]
},
"Pillbox": {
"replace": [
[ "[Control: B30.91; in BRT: TCG15]", "[Control: B30.91] [Control (in BRT): TCG15]" ]
]
},
"Pin": {
"replace": [
[ "D6.23.24", "D6.23-.24" ],
[ "[Fire Lanes: A9.22; Cancellation: A9.223]", "[Fire Lanes: A9.22] [Fire Lanes (Cancellation): A9.223]" ]
]
},
"PRC": {
"replace": [
[ "[disembarking in Panji: G9.423; embarking: G9.51]", "[disembarking in Panji: G9.423] [embarking in Panji: G9.51]" ]
]
},
"RMG": {
"replace": [
[ "D1.81 (hull) & D1.82 (turret)", "Hull: D1.81; Turret: D1.82" ]
]
},
"Radio": {
"replace": [
[ "[in KGP: P8.4 CG15]", "[in KGP: P8.4, CG15]" ],
[ "[in RB: O11.4 CG6]", "[in RB: O11.4, CG6]" ]
]
},
"Range": {
"replace": [
[ "see Firing Within Hex", "A7.21" ]
]
},
"Roadblock": {
"replace": [
[ "[TEM NA for Ground Support: E7.4; for PRC: B9.3]", "[TEM NA for Ground Support: E7.4] [TEM NA for PRC: B9.3]" ]
]
},
"Rubble": {
"replace": [
[ "; Stone Blaze:", "] [RePh, Stone Blaze:" ]
]
},
"Scrounging": {
"replace": [
[ "RBCG7", "RB CG7" ]
]
},
"Stacking Limits": {
"replace": [
[ "[Inspecting: see Right of Inspection: (Before Play: A2.9) (During Play: A12.16) (Pillboxes: B30.7)]", "[Inspecting: see Right of Inspection (Before Play): A2.9] [Inspecting: see Right of Inspection (During Play): A12.16] [Inspecting: see Right of Inspection (Pillboxes): B30.7]" ]
]
},
"Stall": {
"old_content": "(Rules are givenin a ChapterH Vehicle Note ifa nationality's AFV are subject to Stall; for example, German Multi-Applicable Vehicle Note H): [Platoon Movement: D14.22]",
"new_content": "(Rules are given in a Chapter H Vehicle Note if a nationality's AFV are subject to Stall; for example, German Multi-Applicable Vehicle Note H): [Platoon Movement: D14.22]"
},
"Target Size": {
"replace": [
[ "[Vehicular: D1.7, Concealment: D1.76]", "[Vehicular: D1.7] [Vehicular (Concealment): D1.76]" ]
]
},
"Uncon irmed Kill": {
"new_title": "Unconfirmed Kill"
},
"Unarmored Vehicles": {
"replace": [
[ "[AFV (vs A-P mines: B28.42) (vs A-T mines: B28.52)]", "[AFV (vs A-P mines): B28.42] [AFV (vs A-T mines): B28.52]" ]
]
},
"Unit": {
"replace": [
[ "[but not horses],", "(but not horses)," ]
]
},
"Voluntary Break": {
"replace": [
[ "[Japanese: G1.13, SMC NA: G1.4]", "[Japanese: G1.13] [Japanese (SMC NA): G1.4]" ]
]
},
"Walls": {
"replace": [
[ "[Bypass LOS across: (Infantry: A4.34) (Vehicle: D2.37)]", "[Bypass LOS across Infantry: A4.34] [Bypass LOS across Vehicle: D2.37]" ],
[ "; for PRC", "] [TEM NA for PRC" ]
]
},
"Winter Camouflage": {
"replace": [
[ "OBA Observer: C 1.6", "OBA Observer: C1.6" ]
]
},
"Wreck Blaze": {
"replace": [
[ "[Creation: (AFV C7.6) (Unarmored: A7.308)]", "[Creation (AFV): C7.6] [Creation (Unarmored): A7.308]" ]
]
}
}

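nb: These entries are consumed by _make_index_entry() (in index.py below): "new_title" renames the
parsed title, "replace" applies search-and-replace pairs, and "old_content"/"new_content" swap the
entire content (skipped with a warning if the parsed content doesn't match "old_content"). A null
"new_content" drops the entry entirely - e.g. "APDS" above is removed after being merged into the
"APCR/APDS" entry. For example:

    # parsed:      title = "Uncon irmed Kill"
    # after fixup: title = "Unconfirmed Kill", content unchanged
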
--- /dev/null
+++ b/asl_rulebook2/extract/data/known-missing-ruleids.json
@@ -0,0 +1,40 @@
{
"chapters": [ "H", "O", "P", "Q", "R", "S", "T" ],
"strings": [
"see appropriate Vehicle Notes",
"Chapter H",
"derived by cross-indexing Target Type & Range on To Hit Table",
"Basic TH# plus any modifications for Gun and Ammo Types",
"Number beneath Gun Caliber & Length on applicable To Kill Table",
"Basic TK# plus applicable To Kill Modifications (Cases A-D)",
"The Modified TK# minus the AF of the Target Facing hit",
"FP-Range",
"Morale-Leadership",
"FP-Range-Morale",
"abbr. for Bow Flamethrower",
"HE NA", "AP NA",
"Chinese, Non-Elite Italians, Inexperienced, Allied/Axis Minors"
],
"regexes": [
"^ASOP .+$",
"^(RB )?OCG[0-9.]+$",
"^PCG[0-9.]+[a-e]?$",
"^(PB )?QCG[0-9.]+$",
"^RCG[0-9.]+$",
"^TCG[0-9.]+[a-e]?$",
"^SSR[0-9.]+$",
"^(RB CG )?SSR .+$",
"^(RB )?CG[0-9.]+$",
"^(SSR |SSRs )?(ABtF|KGP|PB|RB|BRT)[0-9.]+$",
"Chapter [A-Z] [Ii]ntroduction",
"Chapter [A-Z] [Dd]ivider",
"^footnote [A-Z]\\d+",
"^.+ [Oo]verlay$",
" Multi-Applicable Note ",
" (Vehicle|Ordnance) Note "
]
}

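nb: _check_targets() (in all.py above) expands each chapter letter into a ruleid pattern, and compiles
the "regexes" entries as-is. A sketch of the chapter expansion:

    import re
    # "H" becomes ^H[0-9.]+[A-Ea-e]?$, so e.g. "H1.23" and "H9.5a" count as known ruleids
    assert re.search( r"^H[0-9.]+[A-Ea-e]?$", "H1.23" )
    assert not re.search( r"^H[0-9.]+[A-Ea-e]?$", "A1.23" )
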
--- /dev/null
+++ b/asl_rulebook2/extract/data/target-fixups.json
@@ -0,0 +1,400 @@
{
"A1": {
"A.10LEADERSHIP DRM ():": {
"new_ruleid": "A.10",
"new_caption": "LEADERSHIP DRM (&#9651;)"
}
},
"A3": {
"23": { "new_ruleid": null }
},
"A5": {
"3.BASIC SEQUENCE OF PLAY": {
"new_ruleid": "A3",
"new_caption": "BASIC SEQUENCE OF PLAY"
},
"TURN RECORD CHART": { "new_ruleid": null }
},
"A21": {
"9.223CANCELLATION:": {
"new_ruleid": "A9.223",
"new_caption": "CANCELLATION"
}
},
"A28": {
"1 -": { "new_ruleid": null }
},
"A29": {
"1 -": { "new_ruleid": null }
},
"A30": {
"11.CLOSE COMBAT (CC)": {
"new_ruleid": "A11",
"new_caption": "CLOSE COMBAT (CC)"
}
},
"A31": {
"11.2WITHDRAWALFROMMELEE:": {
"new_ruleid": "A11.2",
"new_caption": "WITHDRAWAL FROM MELEE"
}
},
"A34": {
"12.CONCEALMENT": {
"new_ruleid": "A12",
"new_caption": "CONCEALMENT"
}
},
"A37": {
"5 12.2 CONCEALED / COUNTERS:": {
"new_ruleid": "A12.2",
"new_caption": "CONCEALED &frac58;\" COUNTERS"
}
},
"A38": {
"13.CAVALRY": {
"new_ruleid": "A13",
"new_caption": "CAVALRY"
}
},
"A39": {
"6MF 1MF2MF6FP": { "new_ruleid": null },
"4FP6FP4FP": { "new_ruleid": null }
},
"A43": {
"A18.2 LEADER CREATION TABLE*LEADER CREATION drm": { "new_ruleid": null }
},
"A46": {
"21.CAPTURED EQUIPMENT": {
"new_ruleid": "A21",
"new_caption": "CAPTURED EQUIPMENT"
}
},
"A50": {
"C24.5 STRENGTH:": {
"new_ruleid": "A24.5",
"new_caption": "STRENGTH"
}
},
"A51": {
"30 25.NATIONALITY DISTINCTIONS": {
"new_ruleid": "A25",
"new_caption": "NATIONALITY DISTINCTIONS"
}
},
"A54": {
"25.53 FREEFRENCH:": {
"new_ruleid": "A25.53",
"new_caption": "FREE FRENCH"
}
},
"A55": {
"26.VICTORYCONDITIONS": {
"new_ruleid": "A26",
"new_caption": "VICTORY CONDITIONS"
}
},
"B4": {
"6.BRIDGES": {
"new_ruleid": "B6",
"new_caption": "BRIDGES"
}
},
"B6": {
"8.45BROKEN & BERSERK:": {
"new_ruleid": "B8.45",
"new_caption": "BROKEN & BERSERK"
}
},
"B15": {
"11.CLIFFS": {
"new_ruleid": "B11",
"new_caption": "CLIFFS"
}
},
"B17": {
"13.8 PINEWOODS:": {
"new_ruleid": "B13.8",
"new_caption": "PINE WOODS"
},
"13.81 OBSTACLEHEIGHT:": {
"new_ruleid": "B13.81",
"new_caption": "OBSTACLE HEIGHT"
},
"13.82 MFCOST:": {
"new_ruleid": "B13.82",
"new_caption": "MF COST"
}
},
"B19": {
"17.CRAG": {
"new_ruleid": "17",
"new_caption": "CRAG"
}
},
"B22": {
"2 2": { "new_ruleid": null }
},
"B24": {
"23.BUILDINGS": {
"new_ruleid": "B23",
"new_caption": "BUILDINGS"
}
},
"B33": {
"0 25.64 WIND DIRECTION:": {
"new_ruleid": "B25.64",
"new_caption": "WIND DIRECTION"
}
},
"B35": {
"53": { "new_ruleid": null },
"1 2": { "new_ruleid": null }
},
"C7": {
"2.3 360 MOUNT:": {
"new_ruleid": "C2.3",
"new_caption": "360&deg; MOUNT:"
}
},
"C11": {
"5.31 CASE C; BOUNDING FIRST FIRER, RESTRICTED AIM: 1": {
"new_ruleid": "C5.31",
"new_caption": "CASE C<sup>1</sup>; BOUNDING FIRST FIRER, RESTRICTED AIM"
},
"5.32 CASE C; BOUNDING FIRST FIRER, LIMITED AIM:": {
"new_ruleid": "C5.32",
"new_caption": "CASE C<sup>2</sup>; BOUNDING FIRST FIRER, LIMITED AIM"
},
"5.34 CASE C; LATW:": {
"new_ruleid": "C5.34",
"new_caption": "CASE C<sup>3</sup>; LATW"
},
"5.35 CASE C; MOTION FIRER:": {
"new_ruleid": "C5.35",
"new_caption": "CASE C<sup>4</sup>; MOTION FIRER"
}
},
"C12": {
"6.11 CASE J; RESTRICTED AIM:": {
"new_ruleid": "C6.11",
"new_caption": "CASE J<sup>1</sup>; RESTRICTED AIM"
},
"6.12 CASE J; LIMITED AIM:": {
"new_ruleid": "C6.12",
"new_caption": "CASE J<sup>2</sup>; LIMITED AIM"
},
"6.13 CASE J; FFNAM:": {
"new_ruleid": "C6.13",
"new_caption": "CASE J<sup>3</sup>; FFNAM"
},
"6.14 CASE J; FFMO:": {
"new_ruleid": "C6.14",
"new_caption": "CASE J<sup>4</sup>; FFMO"
}
},
"C13": {
"21--": { "new_ruleid": null },
"12": { "new_ruleid": null }
},
"C15": {
"7.7 AFV DESTRU": { "new_ruleid": null },
"1KIA": { "new_ruleid": null }
},
"C16": {
"1819 8.11APCR(A)/APDS (D):": {
"new_ruleid": "C8.11",
"new_caption": "APCR (A)/APDS (D)"
}
},
"C20": {
"10.3 MANHANDLING DRM:": { "new_ruleid": null }
},
"C21": {
"1 GUN DESTRUCTION TABLE": { "new_ruleid": null }
},
"D4": {
"46": { "new_ruleid": null }
},
"D5": {
"13": { "new_ruleid": null }
},
"D6": {
"10 MP": { "new_ruleid": null },
"2 6": { "new_ruleid": null },
"1 21": { "new_ruleid": null },
"1 /": { "new_ruleid": null }
},
"D9": {
"56,": { "new_ruleid": null },
"1 2": { "new_ruleid": null }
},
"D10": {
"3.71 LOW AMMO B# (B # ):": {
"new_ruleid": "D3.71",
"new_caption": "LOW AMMO B#"
}
},
"D11": {
"3 1": { "new_ruleid": null },
"1 1 3": { "new_ruleid": null }
},
"D17": {
"1 9 12.5 2": { "new_ruleid": null }
},
"D22": {
"5 1 1(4)1 1 1(4)(1)(1) 1": { "new_ruleid": null },
"1 1": { "new_ruleid": null },
"1(4)": { "new_ruleid": null, "instances": 4 },
"1(4) 5": { "new_ruleid": null },
"1 1 1 (1) 1": { "new_ruleid": null },
"1 1(4)": { "new_ruleid": null }
},
"E5": {
"1.": { "new_ruleid": null }
},
"E6": {
"2.": { "new_ruleid": null },
"3.": { "new_ruleid": null }
},
"E7": {
"E3. DYO TEMPERATE WEATHER CHART": { "new_ruleid": null }
},
"E14": {
"18 7.AIR SUPPORT Fighter-Bomber/Stuka Counter example": {
"new_ruleid": "E7",
"new_caption": "AIR SUPPORT"
}
},
"E24": {
"1)": { "new_ruleid": null },
"2)": { "new_ruleid": null },
"3)": { "new_ruleid": null }
},
"E27": {
"1)": { "new_ruleid": null, "instances": 2 },
"2)": { "new_ruleid": null, "instances": 2 },
"3)": { "new_ruleid": null }
},
"F18": {
"D3": { "new_ruleid": null },
"W1": { "new_ruleid": null },
"H4": { "new_ruleid": null }
},
"G30": {
"1 ION TABLE": { "new_ruleid": null },
"2 :": { "new_ruleid": null },
"3 :": { "new_ruleid": null },
"4 :": { "new_ruleid": null },
"5 :": { "new_ruleid": null },
"6 :": { "new_ruleid": null },
"8 :": { "new_ruleid": null },
"9 :": { "new_ruleid": null },
"10 :": { "new_ruleid": null },
"10 Armored": { "new_ruleid": null },
"11 :": { "new_ruleid": null },
"11": { "new_ruleid": null, "instances": 4 }
},
"G34": {
"13.2BEACHELEVATION&SLOPE:": {
"new_ruleid": "G13.2",
"new_caption": "BEACH ELEVATION & SLOPE"
}
},
"G42": {
"1133": { "new_ruleid": null, "instances": 10 },
"11233": { "new_ruleid": null },
"10": { "new_ruleid": null },
"11": { "new_ruleid": null },
"12": { "new_ruleid": null }
},
"G45": {
"1 U.S.M.C. DEFENSEBN.SWALLOTMENTCHART": { "new_ruleid": null },
"1 U.S.M.C. PARA/RAIDERSQUADSWALLOTMENTCHART": { "new_ruleid": null },
"1 U.S.M.C. RIFLE/BARSQUADSWALLOTMENTCHART": { "new_ruleid": null },
"1 U.S.M.C. OBA AVAILABILITY CHART": { "new_ruleid": null },
"11/42-11/43YEAR8-10/4210/436/447-12/441945 DR: 2": { "new_ruleid": null },
"234356": { "new_ruleid": null },
"10": { "new_ruleid": null },
"11": { "new_ruleid": null },
"12": { "new_ruleid": null },
"55": { "new_ruleid": null }
},
"J1": {
"1.MINIATURES:": { "new_ruleid": null }
},
"W4": {
"!, 1.37 FORTIFICATIONS:": {
"new_ruleid": "W1.37",
"new_caption": "FORTIFICATIONS:"
}
},
"W5": {
"17": { "new_ruleid": null },
"18 3.2 REPUBLIC OF KOREA ARMY (ROKA):": {
"new_ruleid": "W3.2",
"new_caption": "REPUBLIC OF KOREA ARMY (ROKA)"
}
},
"W6": {
"27": { "new_ruleid": null }
}
}

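nb: These fixups are applied by _save_target() (in content.py above). Entries are keyed by ASL page ID
(e.g. "A34"), then by the caption text exactly as it was parsed. A null "new_ruleid" drops the parsed
caption entirely (used for table/chart debris), "new_caption" replaces the caption, and "instances"
(default 1) says how many occurrences to consume, e.g.:

    "A34": { "12.CONCEALMENT": { "new_ruleid": "A12", "new_caption": "CONCEALMENT" } }

Anything left in this table at the end of the run is reported as an unused fixup.
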
--- /dev/null
+++ b/asl_rulebook2/extract/index.py
@@ -0,0 +1,383 @@

#!/usr/bin/env python3
""" Extract the index from the MMP eASLRB. """

import os
import json
import re

import click
from pdfminer.layout import LTChar

from asl_rulebook2.extract.base import ExtractBase, log_msg_stderr
from asl_rulebook2.pdf import PdfDoc, PageIterator, PageElemIterator
from asl_rulebook2.utils import parse_page_numbers, fixup_text, extract_parens_content, jsonval

# ---------------------------------------------------------------------

_DEFAULT_ARGS = {
    "pages": "10-41",
    "index_vp_left": 0, "index_vp_right": 565, "index_vp_top": 715, "index_vp_bottom": 20, # viewport
    "first_title": "a", "last_title": "X#", # first/last index entries
}

# ---------------------------------------------------------------------

class ExtractIndex( ExtractBase ):
    """Extract the index from the MMP eASLRB."""

    def __init__( self, args, log=None ):
        super().__init__( args, _DEFAULT_ARGS, log )
        self._index_entries = None
        # prepare to fixup problems in the index content
        fname2 = os.path.join( os.path.dirname(__file__), "data/index-fixups.json" )
        with open( fname2, "r", encoding="utf-8" ) as fp:
            self._fixups = json.load( fp )

    def extract_index( self, pdf ):
        """Extract the index from the MMP eASLRB."""

        # initialize
        page_nos = parse_page_numbers( self._args["pages"] )
        curr_title = curr_content = None

        # process each page in the index
        for page_no, page, lt_page in PageIterator( pdf ):

            if page_no > max( page_nos ):
                break
            if page_no not in page_nos:
                self._log_msg( "progress", "- Skipping page {}.", page_no )
                continue
            self._log_msg( "progress", "- Processing page {}...", page_no )

            # process each element on the page
            self._prev_y0 = 99999
            elem_filter = lambda e: isinstance( e, LTChar )
            for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter ):

                # check if we should ignore this element
                if not self._in_viewport( elem, "index" ):
                    continue
                if self._is_ignore( elem ):
                    continue

                # NOTE: We identify the start of a new index entry by bold text at the start of a line.
                # We then collect the remaining bold text as the index entry's title, until we see some
                # non-bold text. This is collected as the index entry's content, until we see the start
                # of the next index entry.

                # figure out what we've got
                if self._is_bold( elem ):
                    if curr_content is not None:
                        # we've found the start of a new index entry
                        if curr_title:
                            # save the index entry we've just finished collecting
                            self._save_index_entry( curr_title, curr_content )
                            if curr_title == self._args["last_title"]:
                                curr_title = curr_content = None
                                break # nb: that was the last one - we're all done
                        curr_title = curr_content = None
                    if curr_title is None:
                        # start collecting the title
                        curr_title = elem.get_text()
                    else:
                        # continue collecting the title
                        curr_title += elem.get_text()
                else:
                    if curr_content is None:
                        # start collecting the content text
                        curr_content = elem.get_text()
                    else:
                        # continue collecting the content text
                        if elem.y0 - self._prev_y0 < -1 and curr_content.endswith( "-" ):
                            # join up hyphenated words
                            curr_content = curr_content[:-1]
                        curr_content += elem.get_text()

                # loop back to process the next element
                self._prev_y0 = elem.y0

        # add the last index entry (if it hasn't already been done)
        if curr_title:
            self._save_index_entry( curr_title, curr_content )

        # check for unused fixups
        if self._fixups:
            self._log_msg( "warning", "Unused fixups: {}", self._fixups )

        # process the content for each index entry
        if not self._index_entries:
            raise RuntimeError( "Didn't find the first title (\"{}\").".format( self._args["first_title"] ) )
        self._process_content()

    def _save_index_entry( self, title, content ):
        """Save a parsed index entry."""

        # check if we've started parsing index entries
        # NOTE: There is some bold text at the start of the index, which we parse as an index title,
        # so we don't save anything until we've actually seen the first index entry.
        if self._index_entries is None:
            if title != self._args["first_title"]:
                return
            self._index_entries = []

        # initialize
        title, content = title.strip(), content.strip()
        if content.startswith( ":" ):
            content = content[1:].strip() # nb: this comes after the title, but we don't need it

        # save the new index entry
        if title == "bold":
            # FUDGE! Some entries have "bold" in their content, using a bold font :-/, which we detect
            # as the start of a new entry. We fix that up here.
            self._index_entries[-1]["content"] = "{} bold {}".format(
                self._index_entries[-1]["content"], fixup_text( content )
            )
        elif title == "C" and self._index_entries[-1]["title"] == "FFE":
            # FUDGE! The colon in the title for "FFE:C" is non-bold, so we parse this as two separate
            # index titles ("FFE" and "C") :-/ We can't fix this up in the normal way, since there is
            # also a real "FFE" entry, so we do it in the code here.
            self._index_entries[-1].update( {
                "title": "FFE:C", "content": fixup_text( content )
            } )
        else:
            # save the new index entry
            index_entry = self._make_index_entry( title, content )
            if index_entry:
                self._index_entries.append( index_entry )
            # FUDGE! EX/EXC are mis-parsed as a single index entry - we correct that in the fixups, and here.
            if title == "EX":
                self._index_entries.append( self._make_index_entry( "EXC", "Exception" ) )

    def _make_index_entry( self, title, content ):
        """Create a new index entry."""

        # initialize
        orig_content = content
        title = fixup_text( title )
        if title.endswith( ":" ):
            title = title[:-1]

        # check for any fixups
        fixup = self._fixups.pop( title, None )
        if fixup:
            # replace the title
            title = fixup.get( "new_title", title )
            # do any search-replace's
            for sr in fixup.get( "replace", [] ):
                new_content = content.replace( sr[0], sr[1] )
                if new_content == content:
                    self._log_msg( "warning", "Content fixup had no effect for \"{}\": {}", title, sr[0] )
                else:
                    content = new_content
            # replace the content
            old_content = fixup.get( "old_content" )
            if old_content:
                if fixup_text( content ) != old_content:
                    self._log_msg( "warning", "Unexpected content for \"{}\" - skipping fixup.", title )
                else:
                    new_content = fixup.get( "new_content" )
                    if not new_content:
                        return None
                    content = new_content

        # FUDGE! There are two "Entry" index entries, but one of them should be "Entry (Offboard)" (the parsing code
        # is actually correct, since the "(Offboard)" is not bold). We can't really fix this via the usual data-driven
        # fixups, so we fix it in the code here.
        if title == "Entry" and content.startswith( "(Offboard): " ):
            title += " (Offboard)"
            content = content[12:]

        return {
            "title": title,
            "content": fixup_text( content ),
            "raw_content": orig_content
        }

    def _process_content( self ):
        """Extract information out of the index entries into a structured form."""
        for index_entry in self._index_entries:

            # initialize
            content = index_entry[ "content" ]

            # extract any "see also"
            mo = re.search( r"\(see (also )?(.+?)\):?", content )
            if mo:
                see_also = [ sa.strip() for sa in mo.group(2).split( "," ) ]
                if "SW" in see_also or "Class" in see_also:
                    # FUDGE! See-also's are normally comma-separated, but we don't want to
                    # split things like "Recovery, SW" or "Class, Personnel Types".
see_also = [ mo.group(2) ]
index_entry[ "see_also" ] = see_also
content = content[:mo.start()] + content[mo.end():]
content = content.strip()
# extract any sub-title
if content.startswith( "(" ):
pos = content.find( ")" )
if pos < 0:
# FUDGE! Some index entries have the closing ) missing :-/
pos = content.find( ":" )
subtitle, content = content[1:pos], content[pos+1:]
else:
subtitle, content = extract_parens_content( content )
index_entry[ "subtitle" ] = subtitle
if content.startswith( ":" ):
content = content[1:]
content = content.strip()
# extract any ruleid's
ruleids = []
while True:
if content == "A./G.":
break # nb: special handling for "NCC" (National Capabilities Chart)
mo = re.search( r"^(SSR )?[A-Z]{1,3}[0-9.-]+[A-Fa-f]?", content )
if not mo:
break
ruleids.append( mo.group() )
content = content[mo.end():].strip()
if content.startswith( "," ):
content = content[1:].strip()
else:
break
if ruleids:
index_entry[ "ruleids" ] = ruleids
# extract any ruleref's
rulerefs = []
matches = list( re.finditer( r"\[(.+?)\]", content ) )
if matches:
for mo in reversed(matches):
val = mo.group(1)
# NOTE: We search for the ":" from the right, to avoid picking it up in the ruleref text.
pos = val.rfind( ":" )
if pos > 0:
vals = re.split( "[;,]", val[pos+1:] )
ruleids = [ v.strip() for v in vals ]
val = val[:pos]
else:
ruleids = None
rulerefs.append( { "caption": val, "ruleids": ruleids } )
content = content[:mo.start()] + content[mo.end():]
index_entry[ "rulerefs" ] = list( reversed( rulerefs ) )
# save the final content
content = re.sub( r"\s+", " ", content ).strip()
if content:
index_entry[ "content" ] = content
else:
del index_entry["content"]
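
The bracketed ruleref syntax handled above can be summarized in isolation. A standalone sketch (parse_ruleref is a made-up helper, and the sample strings are illustrative, not taken from the real index):

import re
def parse_ruleref( val ):
    """Split "[...]" bracket content into a caption and its ruleid's."""
    # nb: we search for the ":" from the right, since the caption may itself contain one
    pos = val.rfind( ":" )
    if pos > 0:
        return val[:pos], [ v.strip() for v in re.split( "[;,]", val[pos+1:] ) ]
    return val, None
print( parse_ruleref( "Gun Duel: C2.2401, C5.2" ) ) # => ('Gun Duel', ['C2.2401', 'C5.2'])
print( parse_ruleref( "FFE:C: B1.1" ) )             # => ('FFE:C', ['B1.1'])
print( parse_ruleref( "errata" ) )                  # => ('errata', None)
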
def _is_ignore( self, elem ):
"""Check if we should ignore an element on the page."""
# check if we have a bold item as the first thing on a line
if self._is_bold( elem ) and elem.y0 - self._prev_y0 < -1:
# yup - check if it's near the start of the line
if self._is_near_start_of_line( elem ):
# yup - this is the title for an index entry
return False
# nope - this is a header that indicates a new section (the index is grouped by letter)
return True
return False
def _is_near_start_of_line( self, elem ):
"""Check if the element is near the start of its line."""
if self._args["index_vp_left"] <= elem.x0 <= self._args["index_vp_left"]+20:
# yup (left column)
return True
left = self._args["index_vp_left"] + (self._args["index_vp_right"]+1 - self._args["index_vp_left"]) / 2
if left <= elem.x0 <= left+20:
# yup (right column)
return True
return False
def save_as_raw( self, out ):
"""Save the raw results."""
for index_entry in self._index_entries:
print( "=== {} ===".format( index_entry["title"] ), file=out )
print( "{}".format( index_entry["raw_content"] ), file=out )
print( file=out )
def save_as_text( self, out ):
"""Save the results as plain-text."""
for index_entry in self._index_entries:
print( "=== {} ===".format( index_entry["title"] ), file=out )
if "subtitle" in index_entry:
print( index_entry["subtitle"], file=out )
if index_entry.get( "ruleids" ):
print( "RULEID'S: {}".format(
" ; ".join( index_entry["ruleids"] )
), file=out )
if index_entry.get( "see_also" ):
print( "SEE ALSO: {}".format(
" ; ".join( index_entry["see_also"] ),
), file=out )
if index_entry.get( "content" ):
print( "CONTENT:", index_entry["content"], file=out )
if index_entry.get( "rulerefs" ):
print( "RULEREF'S:", file=out )
for ruleref in index_entry["rulerefs"]:
if ruleref["ruleids"]:
ruleids = [ "[{}]".format(ri) for ri in ruleref["ruleids"] ]
print( "- {} {}".format( ruleref["caption"], " ".join(ruleids) ), file=out )
else:
print( "- {}".format( ruleref["caption"] ), file=out )
print( file=out )
def save_as_json( self, out ):
"""Save the results as JSON."""
entries = []
for index_entry in self._index_entries:
buf = []
buf.append( "{{ \"title\": {}".format( jsonval(index_entry["title"]) ) )
if "subtitle" in index_entry:
buf.append( " \"subtitle\": {}".format( jsonval(index_entry["subtitle"]) ) )
if index_entry.get( "ruleids" ):
buf.append( " \"ruleids\": {}".format( jsonval(index_entry["ruleids"]) ) )
if index_entry.get( "see_also" ):
buf.append( " \"see_also\": {}".format( jsonval(index_entry["see_also"]) ) )
if index_entry.get( "content" ):
buf.append( " \"content\": {}".format( jsonval(index_entry["content"]) ) )
if index_entry.get( "rulerefs" ):
buf2 = []
for ruleref in index_entry["rulerefs"]:
buf2.append( " {{ \"caption\": {}, \"ruleids\": {} }}".format(
jsonval( ruleref["caption"] ),
jsonval( ruleref["ruleids"] )
) )
buf.append( " \"rulerefs\": [\n{}\n ]".format( ",\n".join(buf2) ) )
entries.append( ",\n".join( buf ) + "\n}" )
print( "[\n\n{}\n\n]".format( ",\n\n".join(entries) ), file=out )
# ---------------------------------------------------------------------
@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--output","-o","output_fname", required=True, help="Where to save the extracted index." )
def main( pdf_file, args, progress, format, output_fname ):
"""Extract the index from the MMP eASLRB."""
# initialize
args = ExtractBase.parse_args( args, _DEFAULT_ARGS )
# extract the index
def log_msg( msg_type, msg ):
if msg_type == "progress" and not progress:
return
log_msg_stderr( msg_type, msg )
extract = ExtractIndex( args, log_msg )
extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
with PdfDoc( pdf_file ) as pdf:
extract.extract_index( pdf )
# save the results
with open( output_fname, "w", encoding="utf-8" ) as out:
getattr( extract, "save_as_"+format )( out )
if __name__ == "__main__":
main() #pylint: disable=no-value-for-parameter
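
The title/content state machine described in the NOTE in extract_index() reduces to something like the following standalone sketch (parse_runs and the flat list of (text, is_bold) runs are hypothetical stand-ins for the per-character LTChar processing, and the sample entries are made up):

def parse_runs( runs ):
    """Group (text, is_bold) runs into (title, content) index entries."""
    entries, title, content = [], None, None
    for text, is_bold in runs:
        if is_bold:
            if content is not None:
                # bold text after content => a new index entry is starting
                entries.append( ( title, content ) )
                title = content = None
            title = text if title is None else title + text
        else:
            content = text if content is None else content + text
    if title:
        entries.append( ( title, content ) )
    return entries
print( parse_runs( [
    ( "Abandonment", True ), ( " of vehicle: D5.41", False ),
    ( "Acquisition", True ), ( " C6.5", False ),
] ) )
# => [('Abandonment', ' of vehicle: D5.41'), ('Acquisition', ' C6.5')]
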

@ -1,7 +1,5 @@
""" Parse and process a PDF. """
import collections
import click
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
@ -10,6 +8,8 @@ from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer
from pdfminer.pdfpage import PDFPage
from asl_rulebook2.utils import remove_quotes, roundf
# ---------------------------------------------------------------------
class PdfDoc:
@ -33,7 +33,7 @@ class PdfDoc:
if self._fp:
self._fp.close()
def dump_pdf( self, dump_toc=True, pages=None, elem_filter=None, out=None ):
def dump_pdf( self, dump_toc=True, page_nos=None, sort_elems=False, elem_filter=None, out=None ):
"""Dump the PDF document."""
# dump the TOC
@ -41,15 +41,14 @@ class PdfDoc:
self._dump_toc( out=out )
# dump each page
max_page_no = max( pages ) if pages else None
first_page = not dump_toc
for page_no, page in PageIterator( self ):
for page_no, page, lt_page in PageIterator( self ): #pylint: disable=unused-variable
# parse the next page
self.interp.process_page( page )
if pages and page_no not in pages:
continue
lt_page = self.device.get_result()
if page_nos:
if page_no > max( page_nos ):
break
if page_no not in page_nos:
continue
# dump the page details
if first_page:
@ -61,15 +60,9 @@ class PdfDoc:
click.echo( file=out )
# dump each element on the page
for depth, elem in PageElemIterator( lt_page ):
if elem_filter and not elem_filter( elem ):
continue
for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):
click.echo( "{}- {}".format( depth*" ", elem ), file=out )
# check if we're done
if max_page_no and page_no >= max_page_no:
break
def _dump_toc( self, out=None ):
"""Dump a PDF document's TOC."""
@ -84,9 +77,7 @@ class PdfDoc:
if depth > 1:
bullet = "*" if depth == 2 else "-"
click.echo( "{}{} ".format( (depth-2)*" ", bullet ), nl=False, file=out )
title = repr( title ).strip()
if title[0] in ('"',"'") and title[-1] == title[0]:
title = title[1:-1]
title = remove_quotes( repr( title ).strip() )
col = "cyan" if depth <= 2 else "green"
click.echo( "{} => {}".format(
click.style( title, fg=col ),
@ -101,41 +92,60 @@ class PageIterator:
def __init__( self, pdf ):
self.pdf = pdf
self._pages = PDFPage.create_pages( pdf.doc )
self._page_no = 0
self._curr_page_no = 0
def __iter__( self ):
return self
def __next__( self ):
"""Return the next page."""
page = next( self._pages )
self._page_no += 1
return self._page_no, page
while True:
self._curr_page_no += 1
page = next( self._pages )
self.pdf.interp.process_page( page )
lt_page = self.pdf.device.get_result()
return self._curr_page_no, page, lt_page
# ---------------------------------------------------------------------
class PageElemIterator:
"""Iterate over each element in a page."""
def __init__( self, lt_page ):
def __init__( self, lt_page, elem_filter=None, sort_elems=False ):
self.lt_page = lt_page
# collect all the elements (so that they can be sorted)
self._elems = collections.deque()
self._elems = []
self._curr_elem_no = -1
def walk( elem, depth ):
for child in elem:
self._elems.append( ( depth, child ) )
# NOTE: If elements are to be sorted, we ignore anything that is not laid out.
if not sort_elems or hasattr( child, "x0" ):
if not elem_filter or elem_filter( child ):
self._elems.append( ( depth, child ) )
if isinstance( child, LTContainer ):
walk( child, depth+1 )
walk( lt_page, 0 )
if sort_elems:
def sort_key( elem ):
col_no = 0 if elem[1].x0 < lt_page.width/2 else 1
# NOTE: Some elements that should be aligned are actually misaligned by a minuscule amount (e.g. 10^-5),
# so to stop this from resulting in the wrong sort order, we truncate the decimal places.
# NOTE: Characters are often rendered in different fonts, with bounding boxes that don't align neatly.
# I tried sorting by the centre of the bounding boxes, but superscripts cause problems :-/
ypos = - roundf( elem[1].y1, 1 )
xpos = roundf( elem[1].x0, 1 )
return col_no, ypos, xpos
self._elems.sort( key=sort_key )
def __iter__( self ):
return self
def __next__( self ):
"""Return the next element on the page."""
if not self._elems:
self._curr_elem_no += 1
if self._curr_elem_no >= len(self._elems):
raise StopIteration()
return self._elems.popleft()
return self._elems[ self._curr_elem_no ]
# ---------------------------------------------------------------------
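
The effect of the sort key above can be demonstrated with stand-in elements (Elem is a hypothetical namedtuple standing in for pdfminer layout objects; roundf is the real helper from asl_rulebook2.utils):

import collections
from asl_rulebook2.utils import roundf
Elem = collections.namedtuple( "Elem", [ "x0", "y1" ] )
def sort_key( elem, page_width=600 ):
    col_no = 0 if elem.x0 < page_width/2 else 1 # left column sorts before right
    return col_no, -roundf( elem.y1, 1 ), roundf( elem.x0, 1 ) # then top-down, left-to-right
elems = [ Elem(x0=400,y1=700), Elem(x0=50,y1=699.99999), Elem(x0=10,y1=700) ]
print( sorted( elems, key=sort_key ) )
# => the two left-column elements (x0=10, then x0=50, their y1's rounding to the
#    same value), followed by the right-column one
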

@ -0,0 +1,3 @@
"""Module definitions."""
pytest_options = None #pylint: disable=invalid-name

@ -0,0 +1,116 @@
""" Test eASLRB extraction. """
import os
import io
import pytest
from asl_rulebook2.pdf import PdfDoc
from asl_rulebook2.extract.index import ExtractIndex
from asl_rulebook2.extract.content import ExtractContent
from asl_rulebook2.extract.all import ExtractAll
from asl_rulebook2.tests import pytest_options
# ---------------------------------------------------------------------
@pytest.mark.skipif( not pytest_options.easlrb_path, reason="eASLRB not available." )
@pytest.mark.skipif( pytest_options.short_tests, reason="--short-tests specified." )
def test_extract_index():
"""Test extracting the index."""
def do_test( dname ):
# extract the index
fname = os.path.join( dname, "eASLRB.pdf" )
with PdfDoc( fname ) as pdf:
extract = ExtractIndex( args={}, log=_check_log_msg )
extract.extract_index( pdf )
buf = io.StringIO()
extract.save_as_text( buf )
buf = buf.getvalue()
# check the results
fname = os.path.join( dname, "index.txt" )
assert open( fname, "r", encoding="utf-8" ).read() == buf
# run the test
_for_each_version( do_test )
# ---------------------------------------------------------------------
@pytest.mark.skipif( not pytest_options.easlrb_path, reason="eASLRB not available." )
@pytest.mark.skipif( pytest_options.short_tests, reason="--short-tests specified." )
def test_extract_content():
"""Test extracting content."""
def do_test( dname ):
# extract the content
fname = os.path.join( dname, "eASLRB.pdf" )
with PdfDoc( fname ) as pdf:
extract = ExtractContent( args={}, log=_check_log_msg )
extract.extract_content( pdf )
targets_buf, footnotes_buf = io.StringIO(), io.StringIO()
extract.save_as_text( targets_buf, footnotes_buf )
targets_buf = targets_buf.getvalue()
footnotes_buf = footnotes_buf.getvalue()
# check the results
fname2 = os.path.join( dname, "targets.txt" )
assert open( fname2, "r", encoding="utf-8" ).read() == targets_buf
fname2 = os.path.join( dname, "footnotes.txt" )
assert open( fname2, "r", encoding="utf-8" ).read() == footnotes_buf
# run the test
_for_each_version( do_test )
# ---------------------------------------------------------------------
@pytest.mark.skipif( not pytest_options.easlrb_path, reason="eASLRB not available." )
@pytest.mark.skipif( pytest_options.short_tests, reason="--short-tests specified." )
def test_extract_all():
"""Test extracting everything."""
def do_test( dname ):
# extract everything
fname = os.path.join( dname, "eASLRB.pdf" )
with PdfDoc( fname ) as pdf:
extract = ExtractAll( args={}, log=_check_log_msg )
extract.extract_all( pdf )
index_buf = io.StringIO()
extract.extract_index.save_as_json( index_buf )
index_buf = index_buf.getvalue()
targets_buf, footnotes_buf = io.StringIO(), io.StringIO()
extract.extract_content.save_as_json( targets_buf, footnotes_buf )
targets_buf = targets_buf.getvalue()
footnotes_buf = footnotes_buf.getvalue()
# check the results
fname2 = os.path.join( dname, "index.json" )
assert open( fname2, "r", encoding="utf-8" ).read() == index_buf
fname2 = os.path.join( dname, "targets.json" )
assert open( fname2, "r", encoding="utf-8" ).read() == targets_buf
fname2 = os.path.join( dname, "footnotes.json" )
assert open( fname2, "r", encoding="utf-8" ).read() == footnotes_buf
# run the test
_for_each_version( do_test )
# ---------------------------------------------------------------------
def _for_each_version( func ):
"""Run tests for each version of the eASLRB."""
base_dir = pytest_options.easlrb_path
ncalls = 0
for name in os.listdir( base_dir ):
dname = os.path.join( base_dir, name )
if os.path.isdir( dname ):
func( dname )
ncalls += 1
assert ncalls > 0
def _check_log_msg( msg_type, msg ):
"""Check a log message."""
assert msg_type not in ( "warning", "error" ), \
"Unexpected {}: {}".format( msg_type, msg )

@ -1,6 +1,51 @@
""" Miscellaneous utilities. """
import pathlib
import re
import math
# ---------------------------------------------------------------------
def fixup_text( val ):
"""Fixup special characters in a string."""
# fixup smart quotes, dashes and other non-ASCII characters
def replace_chars( val, ch, targets ):
for target in targets:
val = val.replace( target, ch )
return val
val = replace_chars( val, '"', [ "\u00ab", "\u00bb", "\u201c", "\u201d", "\u201e", "\u201f", "\u02dd" ] )
val = replace_chars( val, "'", [ "\u2018", "\u2019", "\u201a", "\u201b", "\u2039", "\u203a" ] )
val = replace_chars( val, " - ", [ "\u2013", "\u2014" ] )
val = replace_chars( val, "-", [ "\u2022" ] ) # nb: bullet
val = replace_chars( val, "&le;", [ "\u2264" ] )
val = replace_chars( val, "&ge;", [ "\u2265" ] )
val = replace_chars( val, "&#9651;", [ "\u2206" ] ) # nb: "no leadership DRM" triangle
val = replace_chars( val, "&reg;", [ "\u00ae" ] ) # nb: circled R
val = replace_chars( val, "&deg;", [ "\u00b0" ] ) # nb: degree sign
val = replace_chars( val, "&auml;", [ "\u00e4" ] )
# replace fractions with their corresponding HTML entity
for frac in [ (1,2), (1,3), (2,3), (3,8), (5,8) ]:
val = re.sub(
r"\b{}/{}(?=(\"| MF| MP))".format( frac[0], frac[1] ),
"&frac{}{};".format( frac[0], frac[1] ),
val
)
return val
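
A quick check of the replacements above, using fixup_text as defined:

from asl_rulebook2.utils import fixup_text
print( fixup_text( "\u201cFirefight\u201d\u20141/2\" MF" ) )
# => '"Firefight" - &frac12;" MF'
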
def extract_parens_content( val ):
"""Extract content in parenthesis (including nested parentheses)."""
assert val[0] == "("
nesting = 0
for pos, ch in enumerate(val):
if ch == "(":
nesting += 1
elif ch == ")":
nesting -= 1
if nesting <= 0:
return val[1:pos], val[pos+1:]
return val # nb: if we get here, we have unclosed parentheses :-/
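
For example, with nested parentheses:

from asl_rulebook2.utils import extract_parens_content
print( extract_parens_content( "(Leader (Armor)) A7.5" ) )
# => ('Leader (Armor)', ' A7.5')
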
# ---------------------------------------------------------------------
@ -11,10 +56,62 @@ def parse_page_numbers( val, offset=0 ):
"""
vals = set()
if val:
for v in val.split( "," ):
for v in str(val).split( "," ):
mo = re.search( r"^(\d+)-(\d+)$", v )
if mo:
vals.update( range( int(mo.group(1)), int(mo.group(2))+1 ) )
else:
vals.add( int(v) )
return [ v+offset for v in vals ]
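
For example (the result is an unsorted list, hence the sorted() calls):

from asl_rulebook2.utils import parse_page_numbers
print( sorted( parse_page_numbers( "2,5,9-11" ) ) )         # => [2, 5, 9, 10, 11]
print( sorted( parse_page_numbers( "10-12", offset=-1 ) ) ) # => [9, 10, 11]
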
# ---------------------------------------------------------------------
def jsonval( val ):
"""Return a value in a JSON-safe format."""
if val is None:
return "null"
if isinstance( val, int ):
return val
if isinstance( val, list ):
if not val:
return "[]"
vals = [ jsonval(v) for v in val ]
return "[ {} ]".format( ", ".join( vals ) )
if isinstance( val, str ):
val = "".join(
ch if 32 <= ord(ch) <= 127 else r"\u{:04x}".format(ord(ch))
for ch in val
)
return '"{}"'.format( val.replace('"',r'\"') )
assert False, "Unknown JSON data type: {}".format( type(val) )
return '"???"'
def change_extn( fname, extn ):
"""Change a filename's extension."""
return pathlib.Path( fname ).with_suffix( extn )
def append_text( buf, new ):
"""Append text to a buffer."""
if buf:
if buf[-1] == "-":
return buf[:-1] + new # nb: join hyphenated words
if buf[-1] != "/":
buf += " "
return buf + new
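
For example:

from asl_rulebook2.utils import append_text
print( append_text( "counter-", "battery" ) ) # => 'counterbattery' (hyphenated join)
print( append_text( "A/", "G" ) )             # => 'A/G' (no space after "/")
print( append_text( "", "fire" ) )            # => 'fire' (empty buffer)
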
def remove_quotes( val ):
"""Remove enclosing quotes from a string."""
if val[0] in ('"',"'") and val[-1] == val[0]:
val = val[1:-1]
return val
def remove_trailing( val, ch ):
"""Remove a trailing character from a string."""
if val.endswith( ch ):
val = val[:-1]
return val
def roundf( val, ndigits ):
"""Round a floating-point value."""
pow10 = math.pow( 10, ndigits )
return int( pow10 * val + 0.5 ) / pow10

@ -11,16 +11,15 @@ from asl_rulebook2.utils import parse_page_numbers
@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--toc","dump_toc", is_flag=True, default=False, help="Dump the TOC." )
@click.option( "--pages","-p", help="Page(s) to dump (e.g. 2,5,9-15)." )
def main( pdf_file, dump_toc, pages ):
@click.option( "--pages","-p","page_nos", help="Page(s) to dump (e.g. 2,5,9-15)." )
@click.option( "--sort","-s","sort_elems", is_flag=True, default=False, help="Sort elements within each page." )
def main( pdf_file, dump_toc, page_nos, sort_elems ):
"""Dump a PDF file."""
# process the command-line arguments
pages = parse_page_numbers( pages )
# dump the PDF file
page_nos = parse_page_numbers( page_nos )
with PdfDoc( pdf_file ) as pdf:
pdf.dump_pdf( dump_toc=dump_toc, pages=pages )
pdf.dump_pdf( dump_toc=dump_toc, page_nos=page_nos, sort_elems=sort_elems )
# ---------------------------------------------------------------------

@ -18,8 +18,8 @@ def main( pdf_file, output_fname, pages ):
# NOTE: This extracts pages from the eASLRB, so we can work on specific parts of it without having to load
# the entire document each time. In particular, it maintains the internal PDF structure of each page.
# The files as small as you might expect (e.g. extracting a single page results in a file only about half
# the size), but processing them are significantly faster.
# The files are not as small as you might expect (e.g. extracting a single page results in a file only
# about half the size), but processing them is significantly faster.
# process the command-line arguments
pages = parse_page_numbers( pages, offset=-1 )
@ -34,7 +34,7 @@ def main( pdf_file, output_fname, pages ):
del outline.root[-1]
# extract the specified pages
print( "Extracting pages:", ", ".join( str(p) for p in sorted(pages) ) )
print( "Extracting pages:", ", ".join( str(1+p) for p in sorted(pages) ) )
for page_no in range( len(pdf.pages)-1, -1, -1 ):
if page_no not in pages:
del pdf.pages[ page_no ]

@ -0,0 +1,33 @@
""" pytest support functions. """
import pytest
_pytest_options = None
# ---------------------------------------------------------------------
def pytest_addoption( parser ):
"""Configure pytest options."""
# NOTE: This file needs to be in the project root for this to work :-/
# add test options
parser.addoption(
"--easlrb", action="store", dest="easlrb_path", default=None,
help="Directory containing the MMP eASLRB PDF and extracted data file(s)."
)
# add test options
parser.addoption(
"--short-tests", action="store_true", dest="short_tests", default=False,
help="Skip running the longer tests."
)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def pytest_configure( config ):
"""Called after command-line options have been parsed."""
global _pytest_options
_pytest_options = config.option
import asl_rulebook2.tests
asl_rulebook2.tests.pytest_options = _pytest_options

@ -41,6 +41,9 @@ setup(
( "asl-rulebook2", ["LICENSE.txt"] ),
],
entry_points = {
"console_scripts": "dump-pdf = bin.dump_pdf:main",
"console_scripts": [
"dump-pdf = bin.dump_pdf:main",
"extract-all = asl_rulebook2.extract.all:main"
],
}
)
