asl-rulebook2/asl_rulebook2/pdf.py

""" Parse and process a PDF. """

import click
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer
from pdfminer.pdfpage import PDFPage

from asl_rulebook2.utils import remove_quotes, roundf

# ---------------------------------------------------------------------

class PdfDoc:
    """Wrapper around a PDF document.

    This is used as a context manager: entering it opens the file and creates the pdfminer
    objects needed to process it (parser, doc, rmgr, device, interp), leaving it closes the file.
    """

    def __init__( self, fname ):
        self.fname = fname
        self._fp = None

    def __enter__( self ):
        """Open the PDF file and set up the pdfminer objects."""
        self._fp = open( self.fname, "rb" )
        #pylint: disable=attribute-defined-outside-init
        try:
            self.parser = PDFParser( self._fp )
            self.doc = PDFDocument( self.parser )
            self.rmgr = PDFResourceManager()
            self.device = PDFPageAggregator( self.rmgr, laparams=LAParams() )
            self.interp = PDFPageInterpreter( self.rmgr, self.device )
        except Exception:
            # NOTE: __exit__() doesn't get called if __enter__() fails, so we have to close the file
            # ourselves, otherwise the handle will leak.
            self._close()
            raise
        return self

    def __exit__( self, exc_type, exc_value, exc_traceback ):
        """Close the PDF file (exceptions are not suppressed)."""
        self._close()

    def _close( self ):
        """Close the underlying file (if it's open)."""
        if self._fp is not None:
            self._fp.close()
            self._fp = None

    def dump_pdf( self, dump_toc=True, page_nos=None, sort_elems=False, elem_filter=None, out=None ):
        """Dump the PDF document.

        dump_toc:    dump the TOC before the pages
        page_nos:    page numbers to dump (counted from 1); if empty or None, every page is dumped
        sort_elems:  passed through to PageElemIterator
        elem_filter: passed through to PageElemIterator
        out:         passed to click as the file to write to
        """

        # dump the TOC
        if dump_toc:
            self._dump_toc( out=out )

        # figure out which pages we want
        # NOTE: We do this once up-front, instead of calling max() and searching the list for every page.
        wanted_page_nos = set( page_nos ) if page_nos else set()
        last_page_no = max( wanted_page_nos ) if wanted_page_nos else None

        # dump each page
        first_page = not dump_toc
        for page_no, _page, lt_page in PageIterator( self ):

            if wanted_page_nos:
                if page_no > last_page_no:
                    # we've gone past the last page we want, no need to look at the rest
                    break
                if page_no not in wanted_page_nos:
                    continue

            # dump the page details (with a blank line between each block of output)
            if first_page:
                first_page = False
            else:
                click.echo( file=out )
            click.secho( "--- PAGE {} {}".format( page_no, 80*"-" )[:80], fg="bright_cyan", file=out )
            click.echo( "lt_page = {}".format( lt_page ), file=out )
            click.echo( file=out )

            # dump each element on the page (indented by its nesting depth)
            for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):
                click.echo( "{}- {}".format( depth*"  ", elem ), file=out )

    def _dump_toc( self, out=None ):
        """Dump a PDF document's TOC.

        Each entry is shown as "title => dest". Entries at depth 1 have no bullet, depth 2 entries
        are bulleted with "*", and deeper entries with "-" (indented by 2 spaces per extra level).
        """

        # initialize
        toc_iter = TocIterator( self )
        if not toc_iter.has_toc():
            click.secho( "No TOC.", fg="red", file=out )
            return

        # dump each TOC entry
        for depth, title, dest in toc_iter:
            if depth > 1:
                bullet = "*" if depth == 2 else "-"
                click.echo( "{}{} ".format( (depth-2)*"  ", bullet ), nl=False, file=out )
            # NOTE: We show the repr of the title, with the surrounding quotes removed.
            title = remove_quotes( repr( title ).strip() )
            col = "cyan" if depth <= 2 else "green"
            click.echo( "{} => {}".format(
                click.style( title, fg=col ),
                click.style( repr(dest), fg="yellow" )
            ), file=out )

# ---------------------------------------------------------------------

class PageIterator:
    """Iterate over each page in a PDF document.

    Each iteration returns a (page_no, page, lt_page) tuple: the page number (counted from 1),
    the page object, and the result the PDF's device returns after the page has been processed.
    """

    def __init__( self, pdf ):
        self.pdf = pdf
        self._pages = PDFPage.create_pages( pdf.doc )
        self._curr_page_no = 0

    def __iter__( self ):
        return self

    def __next__( self ):
        """Return the next page (raises StopIteration when there are no more pages)."""
        # NOTE: We get the next page *before* bumping the page number, so that the counter
        # is still correct after we've run out of pages.
        page = next( self._pages )
        self._curr_page_no += 1
        # run the page through the interpreter, and get the result from the device
        self.pdf.interp.process_page( page )
        lt_page = self.pdf.device.get_result()
        return self._curr_page_no, page, lt_page

# ---------------------------------------------------------------------

class PageElemIterator:
    """Iterate over the elements on a page, returning each one as a (depth, element) tuple."""

    def __init__( self, lt_page, elem_filter=None, sort_elems=False ):
        self.lt_page = lt_page
        # NOTE: The elements are all collected up-front, so that we are able to sort them.
        self._elems = []
        self._curr_elem_no = -1

        def is_wanted( item ):
            """Check if an element should be included in the results."""
            # NOTE: Sorting needs a position, so anything that hasn't been laid out gets dropped.
            if sort_elems and not hasattr( item, "x0" ):
                return False
            return not elem_filter or elem_filter( item )

        def collect( parent, level ):
            """Record the children of an element, descending into any containers."""
            for item in parent:
                if is_wanted( item ):
                    self._elems.append( ( level, item ) )
                if isinstance( item, LTContainer ):
                    collect( item, level+1 )

        collect( lt_page, 0 )
        if not sort_elems:
            return

        def reading_order( entry ):
            """Order elements by column (left half of the page first), then top-down, then left-to-right."""
            item = entry[1]
            in_left_col = item.x0 < lt_page.width/2
            # NOTE: Elements that should line up are sometimes out by a tiny amount (e.g. 10^-5), which
            # would give the wrong order, so the co-ordinates are passed through roundf() first.
            # NOTE: Characters in different fonts often have bounding boxes that don't line up neatly.
            # Sorting on the centre of the bounding box was tried, but superscripts cause problems :-/
            return ( 0 if in_left_col else 1, - roundf( item.y1, 1 ), roundf( item.x0, 1 ) )

        self._elems = sorted( self._elems, key=reading_order )

    def __iter__( self ):
        return self

    def __next__( self ):
        """Return the next (depth, element) entry for the page."""
        self._curr_elem_no += 1
        idx = self._curr_elem_no
        if idx < len( self._elems ):
            return self._elems[ idx ]
        raise StopIteration()

# ---------------------------------------------------------------------

class TocIterator:
    """Iterate over the entries in a TOC.

    Each iteration returns a (level, title, dest) tuple. If the document has no TOC,
    has_toc() returns False and iterating returns nothing.
    """

    def __init__( self, pdf ):
        try:
            self._outlines = iter( pdf.doc.get_outlines() )
        except PDFNoOutlines:
            # the document doesn't have a TOC
            self._outlines = None

    def has_toc( self ):
        """Check if the document has a TOC."""
        return self._outlines is not None

    def __iter__( self ):
        return self

    def __next__( self ):
        """Return the next entry in the TOC (raises StopIteration when there are no more)."""
        # NOTE: If there is no TOC, we just stop (instead of failing with a TypeError in next()).
        if self._outlines is None:
            raise StopIteration()
        # NOTE: Each outline entry has 5 fields, but we only return the first 3.
        level, title, dest, _action, _se = next( self._outlines )
        return level, title, dest
Added a tool to dump a PDF document. 3 years ago			`""" Parse and process a PDF. """`

			`import click`
			`from pdfminer.pdfparser import PDFParser`
			`from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines`
			`from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter`
			`from pdfminer.converter import PDFPageAggregator`
			`from pdfminer.layout import LAParams, LTContainer`
			`from pdfminer.pdfpage import PDFPage`

Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`from asl_rulebook2.utils import remove_quotes, roundf`

Added a tool to dump a PDF document. 3 years ago			`# ---------------------------------------------------------------------`

			`class PdfDoc:`
			`"""Wrapper around a PDF document."""`

			`def __init__( self, fname ):`
			`self.fname = fname`
			`self._fp = None`

			`def __enter__( self ):`
			`self._fp = open( self.fname, "rb" )`
			`#pylint: disable=attribute-defined-outside-init`
			`self.parser = PDFParser( self._fp )`
			`self.doc = PDFDocument( self.parser )`
			`self.rmgr = PDFResourceManager()`
			`self.device = PDFPageAggregator( self.rmgr, laparams=LAParams() )`
			`self.interp = PDFPageInterpreter( self.rmgr, self.device )`
			`return self`

			`def __exit__( self, exc_type, exc_value, exc_traceback ):`
			`if self._fp:`
			`self._fp.close()`

Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`def dump_pdf( self, dump_toc=True, page_nos=None, sort_elems=False, elem_filter=None, out=None ):`
Added a tool to dump a PDF document. 3 years ago			`"""Dump the PDF document."""`

			`# dump the TOC`
			`if dump_toc:`
			`self._dump_toc( out=out )`

			`# dump each page`
			`first_page = not dump_toc`
Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`for page_no, page, lt_page in PageIterator( self ): #pylint: disable=unused-variable`
Added a tool to dump a PDF document. 3 years ago
Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`if page_nos:`
			`if page_no > max( page_nos ):`
			`break`
			`if page_no not in page_nos:`
			`continue`
Added a tool to dump a PDF document. 3 years ago
			`# dump the page details`
			`if first_page:`
			`first_page = False`
			`else:`
			`click.echo( file=out )`
			`click.secho( "--- PAGE {} {}".format( page_no, 80*"-" )[:80], fg="bright_cyan", file=out )`
			`click.echo( "lt_page = {}".format( lt_page ), file=out )`
			`click.echo( file=out )`

			`# dump each element on the page`
Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):`
Added a tool to dump a PDF document. 3 years ago			`click.echo( "{}- {}".format( depth*" ", elem ), file=out )`

			`def _dump_toc( self, out=None ):`
			`"""Dump a PDF document's TOC."""`

			`# initialize`
			`toc_iter = TocIterator( self )`
			`if not toc_iter.has_toc():`
			`click.secho( "No TOC.", fg="red", file=out )`
			`return`

			`# dump each TOC entry`
			`for depth, title, dest in toc_iter:`
			`if depth > 1:`
			`bullet = "*" if depth == 2 else "-"`
			`click.echo( "{}{} ".format( (depth-2)*" ", bullet ), nl=False, file=out )`
Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`title = remove_quotes( repr( title ).strip() )`
Added a tool to dump a PDF document. 3 years ago			`col = "cyan" if depth <= 2 else "green"`
			`click.echo( "{} => {}".format(`
			`click.style( title, fg=col ),`
			`click.style( repr(dest), fg="yellow" )`
			`), file=out )`

			`# ---------------------------------------------------------------------`

			`class PageIterator:`
			`"""Iterate over each page in a PDF document."""`

			`def __init__( self, pdf ):`
			`self.pdf = pdf`
			`self._pages = PDFPage.create_pages( pdf.doc )`
Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`self._curr_page_no = 0`
Added a tool to dump a PDF document. 3 years ago
			`def __iter__( self ):`
			`return self`

			`def __next__( self ):`
			`"""Return the next page."""`
Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`while True:`
			`self._curr_page_no += 1`
			`page = next( self._pages )`
			`self.pdf.interp.process_page( page )`
			`lt_page = self.pdf.device.get_result()`
			`return self._curr_page_no, page, lt_page`
Added a tool to dump a PDF document. 3 years ago
			`# ---------------------------------------------------------------------`

			`class PageElemIterator:`
			`"""Iterate over each element in a page."""`

Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`def __init__( self, lt_page, elem_filter=None, sort_elems=False ):`
Added a tool to dump a PDF document. 3 years ago			`self.lt_page = lt_page`
			`# collect all the elements (so that they can be sorted)`
Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`self._elems = []`
			`self._curr_elem_no = -1`
Added a tool to dump a PDF document. 3 years ago			`def walk( elem, depth ):`
			`for child in elem:`
Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`# NOTE: If elements are to be sorted, we ignore anything that is not laid out.`
			`if not sort_elems or hasattr( child, "x0" ):`
			`if not elem_filter or elem_filter( child ):`
			`self._elems.append( ( depth, child ) )`
Added a tool to dump a PDF document. 3 years ago			`if isinstance( child, LTContainer ):`
			`walk( child, depth+1 )`
			`walk( lt_page, 0 )`
Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`if sort_elems:`
			`def sort_key( elem ):`
			`col_no = 0 if elem[1].x0 < lt_page.width/2 else 1`
			`# NOTE: Some elements that should be aligned are actually misaligned by a miniscule amount (e.g. 10^-5),`
			`# so to stop this from resulting in the wrong sort order, we truncate the decimal places.`
			`# NOTE: Characters are often rendered in different fonts, with bounding boxes that don't align neatly.`
			`# I tried sorting by the centre of the bounding boxes, but superscripts causes problems :-/`
			`ypos = - roundf( elem[1].y1, 1 )`
			`xpos = roundf( elem[1].x0, 1 )`
			`return col_no, ypos, xpos`
			`self._elems.sort( key=sort_key )`
Added a tool to dump a PDF document. 3 years ago
			`def __iter__( self ):`
			`return self`

			`def __next__( self ):`
			`"""Return the next element on the page."""`
Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`self._curr_elem_no += 1`
			`if self._curr_elem_no >= len(self._elems):`
Added a tool to dump a PDF document. 3 years ago			`raise StopIteration()`
Extracted the index, rule targets and footnotes from the eASLRB. 3 years ago			`return self._elems[ self._curr_elem_no ]`
Added a tool to dump a PDF document. 3 years ago
			`# ---------------------------------------------------------------------`

			`class TocIterator():`
			`"""Iterate over the entries in a TOC."""`

			`def __init__( self, pdf ):`
			`try:`
			`self._outlines = pdf.doc.get_outlines()`
			`except PDFNoOutlines:`
			`self._outlines = None`

			`def has_toc( self ):`
			`"""Check if the document has as TOC."""`
			`return self._outlines is not None`

			`def __iter__( self ):`
			`return self`

			`def __next__( self ):`
			`"""Return the next entry in the TOC."""`
			`level, title, dest, action, se = next( self._outlines ) #pylint: disable=unused-variable,invalid-name`
			`return level, title, dest`