""" Parse and process a PDF. """

import click
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer
from pdfminer.pdfpage import PDFPage

from asl_rulebook2.utils import remove_quotes, roundf

# ---------------------------------------------------------------------

class PdfDoc:
    """Wrapper around a PDF document.

    Intended to be used as a context manager: entering it opens the file and creates
    the pdfminer objects (parser, doc, rmgr, device, interp), leaving it closes the file.
    """

    def __init__( self, fname ):
        self.fname = fname
        self._fp = None

    def __enter__( self ):
        """Open the PDF file and set up the pdfminer objects needed to process it."""
        self._fp = open( self.fname, "rb" )
        #pylint: disable=attribute-defined-outside-init
        try:
            self.parser = PDFParser( self._fp )
            self.doc = PDFDocument( self.parser )
            self.rmgr = PDFResourceManager()
            self.device = PDFPageAggregator( self.rmgr, laparams=LAParams() )
            self.interp = PDFPageInterpreter( self.rmgr, self.device )
        except BaseException:
            # NOTE: __exit__() is not called if __enter__() raises, so we have to close
            # the file ourselves, otherwise the handle would be leaked.
            self._fp.close()
            self._fp = None
            raise
        return self

    def __exit__( self, exc_type, exc_value, exc_traceback ):
        """Close the PDF file (if it is open)."""
        if self._fp:
            self._fp.close()
            self._fp = None

    def dump_pdf( self, dump_toc=True, page_nos=None, sort_elems=False, elem_filter=None, out=None ):
        """Dump the PDF document.

        dump_toc:    if set, the TOC is dumped before the pages.
        page_nos:    if given (and non-empty), only these page numbers are dumped (the first page is 1),
                     and iteration stops once the highest requested page has been passed.
        sort_elems:  passed through to PageElemIterator.
        elem_filter: passed through to PageElemIterator.
        out:         file object to write to (passed to click as file=...).
        """

        # dump the TOC
        if dump_toc:
            self._dump_toc( out=out )

        # NOTE: The highest requested page number doesn't change, so we work it out once, up-front.
        last_page_no = max( page_nos ) if page_nos else None

        # dump each page
        first_page = not dump_toc
        for page_no, _page, lt_page in PageIterator( self ):

            if page_nos:
                if page_no > last_page_no:
                    break
                if page_no not in page_nos:
                    continue

            # dump the page details (with a blank line separating it from any previous output)
            if first_page:
                first_page = False
            else:
                click.echo( file=out )
            click.secho( "--- PAGE {} {}".format( page_no, 80*"-" )[:80], fg="bright_cyan", file=out )
            click.echo( "lt_page = {}".format( lt_page ), file=out )
            click.echo( file=out )

            # dump each element on the page
            for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):
                click.echo( "{}- {}".format( depth*" ", elem ), file=out )

    def _dump_toc( self, out=None ):
        """Dump a PDF document's TOC."""

        # initialize
        toc_iter = TocIterator( self )
        if not toc_iter.has_toc():
            click.secho( "No TOC.", fg="red", file=out )
            return

        # dump each TOC entry
        for depth, title, dest in toc_iter:
            # entries below the top level get an indented bullet
            if depth > 1:
                bullet = "*" if depth == 2 else "-"
                click.echo( "{}{} ".format( (depth-2)*" ", bullet ), nl=False, file=out )
            title = remove_quotes( repr( title ).strip() )
            col = "cyan" if depth <= 2 else "green"
            click.echo( "{} => {}".format(
                click.style( title, fg=col ),
                click.style( repr(dest), fg="yellow" )
            ), file=out )

# ---------------------------------------------------------------------

class PageIterator:
    """Iterate over each page in a PDF document.

    Each iteration returns a (page_no, page, lt_page) tuple, where page numbers start at 1,
    and lt_page is the result of running the page through the PdfDoc's interpreter/device.
    """

    def __init__( self, pdf ):
        self.pdf = pdf
        self._pages = PDFPage.create_pages( pdf.doc )
        self._curr_page_no = 0

    def __iter__( self ):
        return self

    def __next__( self ):
        """Return the next page."""
        # NOTE: We get the page first (this raises StopIteration when there are no more),
        # so that the page counter only advances when there really is another page.
        page = next( self._pages )
        self._curr_page_no += 1
        self.pdf.interp.process_page( page )
        lt_page = self.pdf.device.get_result()
        return self._curr_page_no, page, lt_page

# ---------------------------------------------------------------------

class PageElemIterator:
    """Iterate over each element in a page.

    Each iteration returns a (depth, elem) tuple, where depth is the nesting level
    of the element (children of the page itself are at depth 0).
    """

    def __init__( self, lt_page, elem_filter=None, sort_elems=False, centre_adjust=0 ):
        """Collect the elements on a page.

        lt_page:       the page to walk (must be iterable, and have a width if sorting).
        elem_filter:   optional callable; only elements it returns true for are kept
                       (children of rejected containers are still visited).
        sort_elems:    if set, elements are sorted by column, then vertical position, then horizontal position.
        centre_adjust: added to the page's mid-point when deciding which column an element is in.
        """
        self.lt_page = lt_page
        # collect all the elements (so that they can be sorted)
        self._elems = []
        self._curr_elem_no = -1
        def walk( elem, depth ):
            for child in elem:
                # NOTE: If elements are to be sorted, we ignore anything that is not laid out.
                if not sort_elems or hasattr( child, "x0" ):
                    if not elem_filter or elem_filter( child ):
                        self._elems.append( ( depth, child ) )
                if isinstance( child, LTContainer ):
                    walk( child, depth+1 )
        walk( lt_page, 0 )
        if sort_elems:
            def sort_key( elem ):
                # elements left of the (adjusted) page centre are in column 0, everything else in column 1
                col_no = 0 if elem[1].x0 < lt_page.width/2 + centre_adjust else 1
                # NOTE: Some elements that should be aligned are actually misaligned by a minuscule amount (e.g. 10^-5),
                # so to stop this from resulting in the wrong sort order, we truncate the decimal places.
                # NOTE: Characters are often rendered in different fonts, with bounding boxes that don't align neatly.
                # I tried sorting by the centre of the bounding boxes, but superscripts cause problems :-/
                # NOTE: y1 is negated so that elements with a larger y1 sort first.
                ypos = - roundf( elem[1].y1, 1 )
                xpos = roundf( elem[1].x0, 1 )
                return col_no, ypos, xpos
            self._elems.sort( key=sort_key )

    def __iter__( self ):
        return self

    def __next__( self ):
        """Return the next element on the page."""
        self._curr_elem_no += 1
        if self._curr_elem_no >= len(self._elems):
            raise StopIteration()
        return self._elems[ self._curr_elem_no ]

# ---------------------------------------------------------------------

class TocIterator:
    """Iterate over the entries in a TOC.

    Each iteration returns a (level, title, dest) tuple. If the document has no TOC,
    iterating yields nothing.
    """

    def __init__( self, pdf ):
        try:
            self._outlines = pdf.doc.get_outlines()
        except PDFNoOutlines:
            self._outlines = None

    def has_toc( self ):
        """Check if the document has a TOC."""
        return self._outlines is not None

    def __iter__( self ):
        return self

    def __next__( self ):
        """Return the next entry in the TOC."""
        # NOTE: If there is no TOC, we have nothing to iterate over, so we finish straight away
        # (instead of failing with a TypeError from calling next() on None).
        if self._outlines is None:
            raise StopIteration()
        # NOTE: Each outline entry has more fields than we need; we only return the first 3.
        level, title, dest = next( self._outlines )[:3]
        return level, title, dest