Added a tool to dump a PDF document.

3 years ago · 52484240aa
parent 9c360e940a
commit 52484240aa
8 changed files with 267 additions and 0 deletions
--- a/asl_rulebook2/init.py
+++ b/asl_rulebook2/init.py
--- a/asl_rulebook2/pdf.py
+++ b/asl_rulebook2/pdf.py
@ -0,0 +1,161 @@
+""" Parse and process a PDF. """
+
+import collections
+
+import click
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.layout import LAParams, LTContainer
+from pdfminer.pdfpage import PDFPage
+
+# ---------------------------------------------------------------------
+
+class PdfDoc:
+    """Wrapper around a PDF document.
+
+    This is used as a context manager: the file is opened, and the pdfminer objects
+    needed to process it are created, when the context is entered, and the file
+    is closed when the context is exited.
+    """
+
+    def __init__( self, fname ):
+        """Remember the name of the PDF file (it is not opened until the context is entered)."""
+        self.fname = fname
+        self._fp = None
+
+    def __enter__( self ):
+        """Open the PDF file and create the pdfminer objects used to process it."""
+        self._fp = open( self.fname, "rb" )
+        try:
+            #pylint: disable=attribute-defined-outside-init
+            self.parser = PDFParser( self._fp )
+            self.doc = PDFDocument( self.parser )
+            self.rmgr = PDFResourceManager()
+            self.device = PDFPageAggregator( self.rmgr, laparams=LAParams() )
+            self.interp = PDFPageInterpreter( self.rmgr, self.device )
+        except Exception:
+            # NOTE: __exit__ is not called if __enter__ fails, so we have to close the file ourselves.
+            self._fp.close()
+            self._fp = None
+            raise
+        return self
+
+    def __exit__( self, exc_type, exc_value, exc_traceback ):
+        """Close the PDF file (if it is open)."""
+        if self._fp:
+            self._fp.close()
+            self._fp = None
+
+    def dump_pdf( self, dump_toc=True, pages=None, elem_filter=None, out=None ):
+        """Dump the PDF document.
+
+        dump_toc:    if true, the TOC is dumped before the pages.
+        pages:       page numbers to dump (the first page is 1); if empty or None, every page is dumped.
+        elem_filter: optional callable; if given, only elements it returns a true value for are dumped.
+        out:         passed through to click as the "file" argument for all output.
+        """
+
+        # dump the TOC
+        if dump_toc:
+            self._dump_toc( out=out )
+
+        # dump each page
+        max_page_no = max( pages ) if pages else None
+        first_page = not dump_toc
+        for page_no, page in PageIterator( self ):
+
+            # skip pages we're not interested in
+            # NOTE: We check this *before* parsing the page, since there's no point doing that work
+            # for a page that won't be dumped.
+            if pages and page_no not in pages:
+                continue
+
+            # parse the next page
+            self.interp.process_page( page )
+            lt_page = self.device.get_result()
+
+            # dump the page details (with a blank line between this page and whatever came before it)
+            if first_page:
+                first_page = False
+            else:
+                click.echo( file=out )
+            click.secho( "--- PAGE {} {}".format( page_no, 80*"-" )[:80], fg="bright_cyan", file=out )
+            click.echo( "lt_page = {}".format( lt_page ), file=out )
+            click.echo( file=out )
+
+            # dump each element on the page
+            for depth, elem in PageElemIterator( lt_page ):
+                if elem_filter and not elem_filter( elem ):
+                    continue
+                click.echo( "{}- {}".format( depth*"  ", elem ), file=out )
+
+            # check if we're done (there's no need to look at pages after the last one requested)
+            if max_page_no is not None and page_no >= max_page_no:
+                break
+
+    def _dump_toc( self, out=None ):
+        """Dump a PDF document's TOC (or a "No TOC." message, if it doesn't have one)."""
+
+        # initialize
+        toc_iter = TocIterator( self )
+        if not toc_iter.has_toc():
+            click.secho( "No TOC.", fg="red", file=out )
+            return
+
+        # dump each TOC entry
+        for depth, title, dest in toc_iter:
+            # nested entries are indented and bulleted ("*" for the 2nd level, "-" for deeper levels)
+            if depth > 1:
+                bullet = "*" if depth == 2 else "-"
+                click.echo( "{}{} ".format( (depth-2)*"  ", bullet ), nl=False, file=out )
+            # show the repr of the title, minus the enclosing quotes
+            title = repr( title ).strip()
+            if title[0] in ('"',"'") and title[-1] == title[0]:
+                title = title[1:-1]
+            col = "cyan" if depth <= 2 else "green"
+            click.echo( "{} => {}".format(
+                click.style( title, fg=col ),
+                click.style( repr(dest), fg="yellow" )
+            ), file=out )
+
+# ---------------------------------------------------------------------
+
+class PageIterator:
+    """Iterator over the pages of a PDF document, numbering them from 1."""
+
+    def __init__( self, pdf ):
+        self.pdf = pdf
+        self._page_no = 0
+        self._pages = PDFPage.create_pages( pdf.doc )
+
+    def __iter__( self ):
+        return self
+
+    def __next__( self ):
+        """Return a (page number, page) tuple for the next page."""
+        # NOTE: The page counter is only updated once we know there is another page.
+        next_page = next( self._pages )
+        page_no = self._page_no + 1
+        self._page_no = page_no
+        return page_no, next_page
+
+# ---------------------------------------------------------------------
+
+class PageElemIterator:
+    """Iterator over the elements on a page, returned as (depth, element) tuples."""
+
+    def __init__( self, lt_page ):
+        self.lt_page = lt_page
+
+        def descend( parent, depth ):
+            """Generate each child of an element, followed by the children of any container."""
+            for item in parent:
+                yield depth, item
+                if isinstance( item, LTContainer ):
+                    yield from descend( item, depth+1 )
+
+        # gather up every element before we start returning them (so that they can be sorted)
+        self._elems = collections.deque( descend( lt_page, 0 ) )
+
+    def __iter__( self ):
+        return self
+
+    def __next__( self ):
+        """Return the next (depth, element) tuple."""
+        try:
+            return self._elems.popleft()
+        except IndexError:
+            # nothing left in the queue
+            raise StopIteration() from None
+
+# ---------------------------------------------------------------------
+
+class TocIterator:
+    """Iterate over the entries in a TOC.
+
+    Each entry is returned as a (level, title, dest) tuple. If the document
+    doesn't have a TOC, iterating over it returns nothing.
+    """
+
+    def __init__( self, pdf ):
+        try:
+            self._outlines = pdf.doc.get_outlines()
+        except PDFNoOutlines:
+            # the document has no TOC
+            self._outlines = None
+
+    def has_toc( self ):
+        """Check if the document has a TOC."""
+        return self._outlines is not None
+
+    def __iter__( self ):
+        return self
+
+    def __next__( self ):
+        """Return the next entry in the TOC."""
+        if self._outlines is None:
+            # NOTE: Without this check, we would call next(None), and fail with a TypeError.
+            raise StopIteration()
+        # NOTE: The outline entries also contain an action and "se" value, but we don't return those.
+        level, title, dest, _action, _se = next( self._outlines )
+        return level, title, dest
--- a/asl_rulebook2/tests/fixtures/dump/simple-text.docx
+++ b/asl_rulebook2/tests/fixtures/dump/simple-text.docx
--- a/asl_rulebook2/tests/fixtures/dump/simple-text.pdf
+++ b/asl_rulebook2/tests/fixtures/dump/simple-text.pdf
--- a/asl_rulebook2/tests/test_dump.py
+++ b/asl_rulebook2/tests/test_dump.py
@ -0,0 +1,55 @@
+""" Test dumping PDF's. """
+
+import os
+import io
+import re
+
+from pdfminer.layout import LTTextLineHorizontal
+
+from asl_rulebook2.pdf import PdfDoc
+
+# ---------------------------------------------------------------------
+
+def test_dump():
+    """Test dumping PDFs."""
+
+    # dump the PDF (only the horizontal text lines on each page)
+    fname = os.path.join( os.path.dirname(__file__), "fixtures/dump/simple-text.pdf" )
+    buf = io.StringIO()
+    with PdfDoc( fname ) as pdf:
+        pdf.dump_pdf( out=buf,
+            elem_filter = lambda e: isinstance( e, LTTextLineHorizontal )
+        )
+    buf = buf.getvalue()
+
+    # check that no TOC was found
+    mo = re.search( r"^No TOC\.$", buf, re.MULTILINE )
+    assert mo
+
+    # extract the results
+    pages = {}
+    curr_page, curr_page_no = None, None
+    for line in buf.split( "\n" ):
+        # check if we've found the start of a new page
+        mo = re.search( r"^--- PAGE (\d+) ---", line, re.MULTILINE )
+        if mo:
+            # NOTE: We compare against None (rather than checking for truthiness), so that a page
+            # with no content still gets recorded (as an empty list).
+            if curr_page is not None:
+                pages[ curr_page_no ] = curr_page
+            curr_page = []
+            curr_page_no = int( mo.group(1) )
+            continue
+        # check if we've found content we're interested in
+        mo = re.search( r"<LTTextLineHorizontal .*?'(.*?)'>", line )
+        if mo:
+            content = mo.group(1).replace( "\\n", "" ).strip()
+            if content:
+                curr_page.append( content )
+    # save the last page (failing cleanly if no pages were found at all)
+    assert curr_page is not None, "No pages were found in the dump."
+    pages[ curr_page_no ] = curr_page
+    assert pages == {
+        1: [ "This is page 1." ],
+        2: [ "This is page 2.", "Another line on page 2." ],
+        3: [
+            "Line 1a.", "Line 1b.", "Line 1c.", "Line 1d.", "Line 1e.",
+            "Line 2a.", "Line 2b.", "Line 2c."
+        ]
+    }
--- a/asl_rulebook2/utils.py
+++ b/asl_rulebook2/utils.py
@ -0,0 +1,20 @@
+""" Miscellaneous utilities. """
+
+import re
+
+# ---------------------------------------------------------------------
+
+def parse_page_numbers( val ):
+    """Parse a list of page numbers.
+
+    We recognize a list of page numbers, and/or ranges e.g. 1,2,5-9,13.
+    Whitespace around the numbers, commas and dashes is ignored, as are
+    empty entries (e.g. a trailing comma).
+
+    Returns a set of ints (empty if nothing was passed in). A ValueError
+    is raised if an entry is neither a number nor a range.
+    """
+    vals = set()
+    if not val:
+        return vals
+    for entry in val.split( "," ):
+        entry = entry.strip()
+        if not entry:
+            continue
+        mo = re.search( r"^(\d+)\s*-\s*(\d+)$", entry )
+        if mo:
+            # NOTE: Ranges are inclusive at both ends.
+            vals.update( range( int(mo.group(1)), int(mo.group(2))+1 ) )
+        else:
+            vals.add( int(entry) )
+    return vals
--- a/bin/dump_pdf.py
+++ b/bin/dump_pdf.py
@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+""" Dump a PDF file. """
+
+import click
+
+from asl_rulebook2.pdf import PdfDoc
+from asl_rulebook2.utils import parse_page_numbers
+
+# ---------------------------------------------------------------------
+
+@click.command()
+@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
+@click.option( "--toc","dump_toc", is_flag=True, default=False, help="Dump the TOC." )
+@click.option( "--pages","-p", help="Page(s) to dump (e.g. 2,5,9-15)." )
+def main( pdf_file, dump_toc, pages ):
+    """Dump a PDF file."""
+
+    # convert the requested pages (if any) into a set of page numbers
+    page_nos = parse_page_numbers( pages )
+
+    # open the PDF file and dump it
+    with PdfDoc( pdf_file ) as pdf_doc:
+        pdf_doc.dump_pdf( pages=page_nos, dump_toc=dump_toc )
+
+# ---------------------------------------------------------------------
+
+# NOTE: main() is a click command, so its arguments are not passed in here (hence the pylint suppression).
+if __name__ == "__main__":
+    main() #pylint: disable=no-value-for-parameter
--- a/setup.py
+++ b/setup.py
@ -40,4 +40,7 @@ setup(
    data_files = [
        ( "asl-rulebook2", ["LICENSE.txt"] ),
    ],
+    entry_points = {
+        "console_scripts": "dump-pdf = bin.dump_pdf:main",
+    }
 )