A search engine for MMP's eASLRB.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
asl-rulebook2/asl_rulebook2/tests/test_dump.py

55 lines
1.6 KiB

""" Test dumping PDF's. """
import os
import io
import re
from pdfminer.layout import LTTextLineHorizontal
from asl_rulebook2.pdf import PdfDoc
# ---------------------------------------------------------------------
def test_dump():
"""Test dumping PDF's."""
# dump the PDF
fname = os.path.join( os.path.dirname(__file__), "fixtures/dump/simple-text.pdf" )
buf = io.StringIO()
with PdfDoc( fname ) as pdf:
pdf.dump_pdf( out=buf,
elem_filter = lambda e: isinstance( e, LTTextLineHorizontal )
)
buf = buf.getvalue()
# check that no TOC was found
mo = re.search( r"^No TOC\.$", buf, re.MULTILINE )
assert mo
# extract the results
pages = {}
curr_page = None
for line in buf.split( "\n" ):
# check if we've found the start of a new page
mo = re.search( r"^--- PAGE (\d+) ---", line, re.MULTILINE )
if mo:
if curr_page:
pages[ curr_page_no ] = curr_page
curr_page = []
curr_page_no = int( mo.group(1) )
continue
# check if we've found content we're interested in
mo = re.search( r"<LTTextLineHorizontal .*?'(.*?)'>", line )
if mo:
content = mo.group(1).replace( "\\n", "" ).strip()
if content:
curr_page.append( content )
pages[ curr_page_no ] = curr_page
assert pages == {
1: [ "This is page 1." ],
2: [ "This is page 2.", "Another line on page 2." ],
3: [
"Line 1a.", "Line 1b.", "Line 1c.", "Line 1d.", "Line 1e.",
"Line 2a.", "Line 2b.", "Line 2c."
]
}