parent
9c360e940a
commit
52484240aa
@ -0,0 +1,161 @@ |
||||
""" Parse and process a PDF. """ |
||||
|
||||
import collections |
||||
|
||||
import click |
||||
from pdfminer.pdfparser import PDFParser |
||||
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines |
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter |
||||
from pdfminer.converter import PDFPageAggregator |
||||
from pdfminer.layout import LAParams, LTContainer |
||||
from pdfminer.pdfpage import PDFPage |
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
class PdfDoc:
    """Wrapper around a PDF document.

    Intended to be used as a context manager: the file is opened (and the pdfminer
    objects needed to process it are created) on entry, and the file is closed on exit.
    """

    def __init__( self, fname ):
        # NOTE: Nothing is opened here, that happens in __enter__().
        self.fname = fname
        self._fp = None

    def __enter__( self ):
        """Open the PDF file and create the pdfminer objects used to process it."""
        self._fp = open( self.fname, "rb" )
        try:
            #pylint: disable=attribute-defined-outside-init
            self.parser = PDFParser( self._fp )
            self.doc = PDFDocument( self.parser )
            self.rmgr = PDFResourceManager()
            self.device = PDFPageAggregator( self.rmgr, laparams=LAParams() )
            self.interp = PDFPageInterpreter( self.rmgr, self.device )
        except Exception:
            # NOTE: __exit__() is not called if __enter__() fails, so we have to
            # close the file ourself, otherwise the handle would be leaked.
            self._close()
            raise
        return self

    def __exit__( self, exc_type, exc_value, exc_traceback ):
        """Close the PDF file (exceptions raised in the with-block are not suppressed)."""
        self._close()

    def _close( self ):
        """Close the underlying file (if it is open)."""
        if self._fp:
            self._fp.close()
            self._fp = None

    def dump_pdf( self, dump_toc=True, pages=None, elem_filter=None, out=None ):
        """Dump the PDF document.

        dump_toc:    if true, the TOC is dumped before the pages.
        pages:       collection of page numbers to dump (numbering starts at 1);
                     if empty or None, every page is dumped.
        elem_filter: optional callable that is passed each page element; only
                     elements for which it returns a true value are dumped.
        out:         passed through to click.echo()/click.secho() as the "file" argument.
        """

        # dump the TOC
        if dump_toc:
            self._dump_toc( out=out )

        # dump each page
        max_page_no = max( pages ) if pages else None
        # NOTE: A blank line separates each page from whatever was output before it,
        # so the first page only gets one if the TOC was dumped.
        first_page = not dump_toc
        for page_no, page in PageIterator( self ):

            # check if we want this page (before doing the work of parsing it)
            if pages and page_no not in pages:
                continue

            # parse the next page
            self.interp.process_page( page )
            lt_page = self.device.get_result()

            # dump the page details
            if first_page:
                first_page = False
            else:
                click.echo( file=out )
            # NOTE: The header line is padded with dashes, then truncated to 80 characters.
            click.secho( "--- PAGE {} {}".format( page_no, 80*"-" )[:80], fg="bright_cyan", file=out )
            click.echo( "lt_page = {}".format( lt_page ), file=out )
            click.echo( file=out )

            # dump each element on the page
            for depth, elem in PageElemIterator( lt_page ):
                if elem_filter and not elem_filter( elem ):
                    continue
                click.echo( "{}- {}".format( depth*" ", elem ), file=out )

            # check if we're done (no need to look at pages after the last requested one)
            if max_page_no and page_no >= max_page_no:
                break

    def _dump_toc( self, out=None ):
        """Dump a PDF document's TOC.

        If the document has no TOC, a message saying so is output instead.
        """

        # initialize
        toc_iter = TocIterator( self )
        if not toc_iter.has_toc():
            click.secho( "No TOC.", fg="red", file=out )
            return

        # dump each TOC entry
        for depth, title, dest in toc_iter:
            # NOTE: Top-level entries are output as-is, deeper ones get an indented bullet.
            if depth > 1:
                bullet = "*" if depth == 2 else "-"
                click.echo( "{}{} ".format( (depth-2)*" ", bullet ), nl=False, file=out )
            # NOTE: We output the repr of the title, minus the surrounding quotes.
            title = repr( title ).strip()
            if title[0] in ('"',"'") and title[-1] == title[0]:
                title = title[1:-1]
            col = "cyan" if depth <= 2 else "green"
            click.echo( "{} => {}".format(
                click.style( title, fg=col ),
                click.style( repr(dest), fg="yellow" )
            ), file=out )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
class PageIterator:
    """Iterator that walks the pages of a PDF document, numbering them from 1."""

    def __init__( self, pdf ):
        self.pdf = pdf
        # the underlying pdfminer page iterator
        self._pages = PDFPage.create_pages( pdf.doc )
        # the number of the most recently returned page (0 = nothing returned yet)
        self._page_no = 0

    def __iter__( self ):
        return self

    def __next__( self ):
        """Return the next page, as a (page number, page) tuple."""
        # NOTE: When the underlying iterator is exhausted, its StopIteration
        # propagates out of here, before the page number is touched.
        next_page = next( self._pages )
        self._page_no = self._page_no + 1
        return ( self._page_no, next_page )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
class PageElemIterator:
    """Iterator that returns every element in a page, as (depth, element) tuples."""

    def __init__( self, lt_page ):
        self.lt_page = lt_page

        def flatten( container, depth ):
            """Generate each descendant of a container (parents before their children)."""
            for child in container:
                yield ( depth, child )
                if isinstance( child, LTContainer ):
                    yield from flatten( child, depth+1 )

        # NOTE: All the elements are collected up-front (rather than walking the page
        # lazily), which would allow them to be re-ordered before being returned.
        self._elems = collections.deque( flatten( lt_page, 0 ) )

    def __iter__( self ):
        return self

    def __next__( self ):
        """Return the next element on the page."""
        try:
            return self._elems.popleft()
        except IndexError:
            # nothing left
            raise StopIteration() from None
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
class TocIterator():
    """Iterate over the entries in a TOC.

    Each entry is returned as a (level, title, dest) tuple. If the document has
    no TOC, has_toc() returns False and iterating returns nothing.
    """

    def __init__( self, pdf ):
        try:
            self._outlines = pdf.doc.get_outlines()
        except PDFNoOutlines:
            # NOTE: None flags that the document has no TOC.
            self._outlines = None

    def has_toc( self ):
        """Check if the document has a TOC."""
        return self._outlines is not None

    def __iter__( self ):
        return self

    def __next__( self ):
        """Return the next entry in the TOC."""
        if self._outlines is None:
            # NOTE: There is no TOC, so there is nothing to iterate over (without
            # this check, we would call next(None) and fail with a TypeError).
            raise StopIteration()
        # NOTE: Each outline entry has 5 fields, but we only return the first 3.
        level, title, dest, _action, _se = next( self._outlines )
        return level, title, dest
Binary file not shown.
Binary file not shown.
@ -0,0 +1,55 @@ |
||||
""" Test dumping PDF's. """ |
||||
|
||||
import os |
||||
import io |
||||
import re |
||||
|
||||
from pdfminer.layout import LTTextLineHorizontal |
||||
|
||||
from asl_rulebook2.pdf import PdfDoc |
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
def test_dump():
    """Test dumping PDF's."""

    # dump the PDF (only the horizontal text lines, which hold the content we check)
    fname = os.path.join( os.path.dirname(__file__), "fixtures/dump/simple-text.pdf" )
    buf = io.StringIO()
    with PdfDoc( fname ) as pdf:
        pdf.dump_pdf( out=buf,
            elem_filter = lambda e: isinstance( e, LTTextLineHorizontal )
        )
    output = buf.getvalue()

    # check that no TOC was found
    assert re.search( r"^No TOC\.$", output, re.MULTILINE )

    # extract the results
    pages = {}
    curr_page = None
    for line in output.split( "\n" ):
        # check if we've found the start of a new page
        mo = re.search( r"^--- PAGE (\d+) ---", line )
        if mo:
            # NOTE: We register the page as soon as we see it, so that a page
            # with no content still appears in the results (as an empty list).
            curr_page = []
            pages[ int( mo.group(1) ) ] = curr_page
            continue
        # check if we've found content we're interested in
        mo = re.search( r"<LTTextLineHorizontal .*?'(.*?)'>", line )
        if mo:
            # NOTE: The dump shows the repr of each element, so line-breaks
            # appear as a literal backslash-n.
            content = mo.group(1).replace( "\\n", "" ).strip()
            if content:
                assert curr_page is not None, "Found content before the first page header."
                curr_page.append( content )
    assert pages == {
        1: [ "This is page 1." ],
        2: [ "This is page 2.", "Another line on page 2." ],
        3: [
            "Line 1a.", "Line 1b.", "Line 1c.", "Line 1d.", "Line 1e.",
            "Line 2a.", "Line 2b.", "Line 2c."
        ]
    }
@ -0,0 +1,20 @@ |
||||
""" Miscellaneous utilities. """ |
||||
|
||||
import re |
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
def parse_page_numbers( val ):
    """Parse a list of page numbers.

    We recognize a list of page numbers, and/or ranges e.g. 1,2,5-9,13.
    Whitespace around each item (and around the dash in a range) is ignored,
    as are empty items (e.g. those caused by a trailing comma).

    Returns a set of ints, which will be empty if nothing (or None) was passed in.
    A range whose end is lower than its start contributes no page numbers.
    Raises a ValueError if an item is neither a number nor a range.
    """
    vals = set()
    if not val:
        return vals
    for item in val.split( "," ):
        item = item.strip()
        if not item:
            continue
        mo = re.search( r"^(\d+)\s*-\s*(\d+)$", item )
        if mo:
            # NOTE: Ranges are inclusive at both ends.
            vals.update( range( int(mo.group(1)), int(mo.group(2))+1 ) )
        else:
            vals.add( int(item) )
    return vals
@ -0,0 +1,28 @@ |
||||
#!/usr/bin/env python3 |
||||
""" Dump a PDF file. """ |
||||
|
||||
import click |
||||
|
||||
from asl_rulebook2.pdf import PdfDoc |
||||
from asl_rulebook2.utils import parse_page_numbers |
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
@click.command()
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--toc","dump_toc", is_flag=True, default=False, help="Dump the TOC." )
@click.option( "--pages","-p", help="Page(s) to dump (e.g. 2,5,9-15)." )
def main( pdf_file, dump_toc, pages ):
    """Dump a PDF file."""
    # NOTE: The docstring above is shown by click as the command's help text.

    # process the command-line arguments
    try:
        page_nos = parse_page_numbers( pages )
    except ValueError as ex:
        # NOTE: We report this as a usage error, instead of letting the user see a stack trace.
        raise click.BadParameter(
            "Invalid page number(s): {}".format( pages ), param_hint="--pages"
        ) from ex

    # dump the PDF file
    # NOTE: If no pages were specified, page_nos will be empty, and every page is dumped.
    with PdfDoc( pdf_file ) as pdf:
        pdf.dump_pdf( dump_toc=dump_toc, pages=page_nos )

# ---------------------------------------------------------------------

if __name__ == "__main__":
    main() #pylint: disable=no-value-for-parameter
Loading…
Reference in new issue