Added a tool to dump a PDF document.

master
Pacman Ghost 3 years ago
parent 9c360e940a
commit 52484240aa
  1. 0
      asl_rulebook2/__init__.py
  2. 161
      asl_rulebook2/pdf.py
  3. BIN
      asl_rulebook2/tests/fixtures/dump/simple-text.docx
  4. BIN
      asl_rulebook2/tests/fixtures/dump/simple-text.pdf
  5. 55
      asl_rulebook2/tests/test_dump.py
  6. 20
      asl_rulebook2/utils.py
  7. 28
      bin/dump_pdf.py
  8. 3
      setup.py

@ -0,0 +1,161 @@
""" Parse and process a PDF. """
import collections
import click
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer
from pdfminer.pdfpage import PDFPage
# ---------------------------------------------------------------------
class PdfDoc:
    """Wrapper around a PDF document.

    Used as a context manager: entering opens the file and creates the
    pdfminer objects needed to process it (parser, document, resource manager,
    layout aggregator and page interpreter); leaving closes the file.
    """

    def __init__( self, fname ):
        """Remember the name of the PDF file (nothing is opened yet)."""
        self.fname = fname
        self._fp = None

    def __enter__( self ):
        """Open the PDF file and set up the pdfminer objects.

        Returns this object. If anything fails after the file has been opened,
        the file is closed before the exception is propagated.
        """
        self._fp = open( self.fname, "rb" )
        try:
            #pylint: disable=attribute-defined-outside-init
            self.parser = PDFParser( self._fp )
            self.doc = PDFDocument( self.parser )
            self.rmgr = PDFResourceManager()
            self.device = PDFPageAggregator( self.rmgr, laparams=LAParams() )
            self.interp = PDFPageInterpreter( self.rmgr, self.device )
        except BaseException:
            # NOTE: __exit__() is not called when __enter__() raises, so we have to
            # release the file handle ourselves, otherwise it would leak.
            self._close()
            raise
        return self

    def __exit__( self, exc_type, exc_value, exc_traceback ):
        """Close the PDF file (exceptions are not suppressed)."""
        self._close()

    def _close( self ):
        """Close the underlying file, if it is open."""
        if self._fp:
            self._fp.close()
            # forget the handle, so that closing again is a no-op
            self._fp = None

    def dump_pdf( self, dump_toc=True, pages=None, elem_filter=None, out=None ):
        """Dump the PDF document.

        dump_toc:    if true, the TOC is dumped before the pages.
        pages:       collection of page numbers to dump (the first page is 1);
                     if empty or None, every page is dumped.
        elem_filter: optional callable that is passed each page element; only
                     elements for which it returns a true value are dumped.
        out:         passed through to click as the "file" to write to.
        """
        # dump the TOC
        if dump_toc:
            self._dump_toc( out=out )
        # dump each page
        max_page_no = max( pages ) if pages else None
        first_page = not dump_toc # nb: controls the blank line between sections
        for page_no, page in PageIterator( self ):
            # parse the next page
            # NOTE(review): every page is interpreted, even those that are then
            # filtered out. Presumably this keeps the layout device's own page
            # counter (reported in "lt_page") in sync with the document - confirm
            # before moving the filter ahead of this call.
            self.interp.process_page( page )
            if pages and page_no not in pages:
                continue
            lt_page = self.device.get_result()
            # dump the page details
            if first_page:
                first_page = False
            else:
                click.echo( file=out )
            # nb: the header line is padded with dashes, then cut off at 80 characters
            click.secho( "--- PAGE {} {}".format( page_no, 80*"-" )[:80], fg="bright_cyan", file=out )
            click.echo( "lt_page = {}".format( lt_page ), file=out )
            click.echo( file=out )
            # dump each element on the page
            for depth, elem in PageElemIterator( lt_page ):
                if elem_filter and not elem_filter( elem ):
                    continue
                click.echo( "{}- {}".format( depth*" ", elem ), file=out )
            # check if we're done (no need to look at pages after the last one requested)
            if max_page_no and page_no >= max_page_no:
                break

    def _dump_toc( self, out=None ):
        """Dump a PDF document's TOC.

        Reports "No TOC." if the document has no outlines, otherwise writes
        one line per TOC entry, as "title => destination".
        """
        # initialize
        toc_iter = TocIterator( self )
        if not toc_iter.has_toc():
            click.secho( "No TOC.", fg="red", file=out )
            return
        # dump each TOC entry
        for depth, title, dest in toc_iter:
            if depth > 1:
                # nested entries get a bullet, indented according to their depth
                bullet = "*" if depth == 2 else "-"
                click.echo( "{}{} ".format( (depth-2)*" ", bullet ), nl=False, file=out )
            # show the title's repr (so that unusual characters are visible), without the quotes
            title = repr( title ).strip()
            if title[0] in ('"',"'") and title[-1] == title[0]:
                title = title[1:-1]
            col = "cyan" if depth <= 2 else "green"
            click.echo( "{} => {}".format(
                click.style( title, fg=col ),
                click.style( repr(dest), fg="yellow" )
            ), file=out )
# ---------------------------------------------------------------------
class PageIterator:
    """Walk the pages of a PDF document, yielding (page number, page) pairs.

    Page numbers start at 1.
    """

    def __init__( self, pdf ):
        self.pdf = pdf
        self._page_no = 0 # nb: number of the page most recently returned
        self._pages = PDFPage.create_pages( pdf.doc )

    def __iter__( self ):
        return self

    def __next__( self ):
        """Fetch the next page, together with its page number."""
        # nb: when we run out of pages, the StopIteration raised here ends our iteration as well
        next_page = next( self._pages )
        page_no = self._page_no + 1
        self._page_no = page_no
        return page_no, next_page
# ---------------------------------------------------------------------
class PageElemIterator:
    """Walk the elements on a page, yielding (depth, element) pairs.

    Each container is immediately followed by its children, which are
    reported one level deeper; top-level elements have a depth of 0.
    """

    def __init__( self, lt_page ):
        self.lt_page = lt_page

        def flatten( parent, level ):
            """Generate each descendant of an element, parents before children."""
            for kid in parent:
                yield level, kid
                if isinstance( kid, LTContainer ):
                    yield from flatten( kid, level+1 )

        # gather every element up-front, into a queue that we then consume from the front
        self._elems = collections.deque( flatten( lt_page, 0 ) )

    def __iter__( self ):
        return self

    def __next__( self ):
        """Hand back the next (depth, element) pair."""
        try:
            return self._elems.popleft()
        except IndexError:
            # nb: the queue is empty, so we're done
            raise StopIteration() from None
# ---------------------------------------------------------------------
class TocIterator:
    """Iterate over the entries in a TOC.

    Each entry is returned as a (level, title, destination) tuple. If the
    document has no TOC, the iteration is simply empty.
    """

    def __init__( self, pdf ):
        """Get the document's outlines (if it has any)."""
        try:
            self._outlines = pdf.doc.get_outlines()
        except PDFNoOutlines:
            # nb: None flags that the document has no TOC at all
            self._outlines = None

    def has_toc( self ):
        """Check if the document has a TOC."""
        return self._outlines is not None

    def __iter__( self ):
        return self

    def __next__( self ):
        """Return the next entry in the TOC."""
        if self._outlines is None:
            # nb: no TOC means there are no entries (instead of failing with a TypeError in next())
            raise StopIteration()
        # nb: we only pass on the first 3 fields of each outline entry
        level, title, dest, _action, _se = next( self._outlines )
        return level, title, dest

@ -0,0 +1,55 @@
""" Test dumping PDF's. """
import os
import io
import re
from pdfminer.layout import LTTextLineHorizontal
from asl_rulebook2.pdf import PdfDoc
# ---------------------------------------------------------------------
def test_dump():
    """Test dumping PDFs."""

    # dump the PDF (we're only interested in the lines of text)
    fname = os.path.join( os.path.dirname(__file__), "fixtures/dump/simple-text.pdf" )
    buf = io.StringIO()
    with PdfDoc( fname ) as pdf:
        pdf.dump_pdf( out=buf,
            elem_filter = lambda e: isinstance( e, LTTextLineHorizontal )
        )
    buf = buf.getvalue()

    # check that no TOC was found
    mo = re.search( r"^No TOC\.$", buf, re.MULTILINE )
    assert mo

    # extract the results
    pages = {}
    curr_page = None
    for line in buf.split( "\n" ):
        # check if we've found the start of a new page
        mo = re.search( r"^--- PAGE (\d+) ---", line, re.MULTILINE )
        if mo:
            # NOTE: we record the page as soon as we see its header, so that pages
            # with no content are still reported (as an empty list), and there is
            # nothing left to flush after the loop.
            curr_page = []
            pages[ int( mo.group(1) ) ] = curr_page
            continue
        # check if we've found content we're interested in
        mo = re.search( r"<LTTextLineHorizontal .*?'(.*?)'>", line )
        if mo:
            content = mo.group(1).replace( "\\n", "" ).strip()
            if content:
                assert curr_page is not None, "Found content before the first page header."
                curr_page.append( content )

    # check the results
    assert pages == {
        1: [ "This is page 1." ],
        2: [ "This is page 2.", "Another line on page 2." ],
        3: [
            "Line 1a.", "Line 1b.", "Line 1c.", "Line 1d.", "Line 1e.",
            "Line 2a.", "Line 2b.", "Line 2c."
        ]
    }

@ -0,0 +1,20 @@
""" Miscellaneous utilities. """
import re
# ---------------------------------------------------------------------
def parse_page_numbers( val ):
    """Parse a list of page numbers.

    We recognize a list of page numbers, and/or ranges e.g. 1,2,5-9,13.
    Whitespace around the numbers is ignored, as are empty items (e.g. from
    a trailing comma).

    Returns the page numbers as a set (which will be empty if nothing was
    specified). A range whose end is before its start contributes nothing.
    Raises ValueError if an item is neither a number nor a range.
    """
    vals = set()
    if not val:
        return vals
    for item in val.split( "," ):
        item = item.strip()
        if not item:
            continue # nb: tolerate things like "1,2," or "1,,2"
        # check for a range of page numbers
        mo = re.search( r"^(\d+)\s*-\s*(\d+)$", item )
        if mo:
            vals.update( range( int(mo.group(1)), int(mo.group(2))+1 ) )
            continue
        # nope - it must be a single page number
        try:
            vals.add( int(item) )
        except ValueError as ex:
            raise ValueError( "Invalid page number: {!r}".format( item ) ) from ex
    return vals

@ -0,0 +1,28 @@
#!/usr/bin/env python3
""" Dump a PDF file. """
import click
from asl_rulebook2.pdf import PdfDoc
from asl_rulebook2.utils import parse_page_numbers
# ---------------------------------------------------------------------
@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--toc","dump_toc", is_flag=True, default=False, help="Dump the TOC." )
@click.option( "--pages","-p", help="Page(s) to dump (e.g. 2,5,9-15)." )
def main( pdf_file, dump_toc, pages ):
    """Dump a PDF file."""
    # figure out which pages were requested
    page_nos = parse_page_numbers( pages )
    # open the PDF file and dump it
    with PdfDoc( pdf_file ) as doc:
        doc.dump_pdf( dump_toc=dump_toc, pages=page_nos )

# ---------------------------------------------------------------------

if __name__ == "__main__":
    # nb: click supplies the arguments from the command line
    main() #pylint: disable=no-value-for-parameter

@ -40,4 +40,7 @@ setup(
data_files = [
( "asl-rulebook2", ["LICENSE.txt"] ),
],
entry_points = {
"console_scripts": "dump-pdf = bin.dump_pdf:main",
}
)

Loading…
Cancel
Save