Extract card images from the PDFs.

Also changed the way we extract card info.
master
Pacman Ghost 7 years ago
parent de6fb25658
commit 3fd7fdc22c
  1. 2
      asl_cards/__main__.py
  2. 180
      asl_cards/parse.py
  3. 2
      asl_cards/requirements.txt
  4. 27
      asl_cards/tests/test_real_data.py

@ -20,7 +20,7 @@ def main( args ) :
pdf_parser = PdfParser( progress_callback )
cards = pdf_parser.parse_file( val )
for c in cards :
print( c )
print( "{} ; img:{} bytes".format( c[0] , len(c[1]) ) )
elif opt in ("-d","--dir") :
pdf_parser = PdfParser( progress_callback )
fcards = pdf_parser.parse_dir( val )

@ -1,5 +1,9 @@
import sys
import os
import re
import itertools
import tempfile
import locale
from collections import namedtuple
from pdfminer.pdfinterp import PDFResourceManager , PDFPageInterpreter
@ -7,6 +11,9 @@ from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams , LTTextBoxHorizontal
from pdfminer.pdfpage import PDFPage
import ghostscript
from PIL import Image , ImageChops
# ---------------------------------------------------------------------
AslCard = namedtuple(
@ -22,14 +29,23 @@ class PdfParser:
# initialize
self.progress = progress
def parse_file( self , fname ) :
def parse_dir( self , dname , progress=None ) :
    """Parse every PDF found directly inside a directory.

    Files are selected by extension (".pdf", compared case-insensitively) and
    each one is handed to parse_file().

    :param dname: directory to scan.
    :param progress: accepted, but not used by this method.
    :return: dict mapping each PDF's file name (without the directory) to
        whatever parse_file() returned for it.
    """
    results = {}
    for entry in os.listdir( dname ) :
        _ , extn = os.path.splitext( entry )
        if extn.lower() == ".pdf" :
            results[ entry ] = self.parse_file( os.path.join( dname , entry ) )
    return results
def parse_file( self , fname , images=True ) :
"""Extract the cards from a PDF file."""
# initialize
# extract the details of each card
rmgr = PDFResourceManager()
laparams = LAParams()
dev = PDFPageAggregator( rmgr , laparams=laparams )
interp = PDFPageInterpreter( rmgr , dev )
# process the file
cards = []
with open(fname,"rb") as fp :
self._progress( 0 , "Loading file: {}".format( fname ) )
@ -39,60 +55,146 @@ class PdfParser:
page_cards = self._parse_page( cards , interp , page_no , page )
cards.extend( page_cards )
self._progress( 1.0 , "Done." )
return cards
# extract the card images
if images :
card_images = self._extract_images( fname )
if len(cards) != len(card_images) :
raise RuntimeError( "Found {} cards, {} card images.".format( len(cards) , len(card_images) ) )
return zip( cards , card_images )
else :
return cards
def _parse_page( self , cards , interp , page_no , page ) :
"""Extract the cards from a PDF page."""
# NOTE(review): this span appears to interleave two versions of the method (an older
# "pending_card" flow and a newer "info_boxes" flow), and its indentation has been lost -
# confirm the real nesting and statement order against the actual file.
# NOTE: rebinding "cards" discards the list passed in by the caller; the list returned
# at the end only holds the cards appended below.
cards = []
interp.process_page( page )
lt_page = interp.device.get_result()
pending_card = None
# locate the info box for each card (in the top-left corner)
info_boxes = []
for item in lt_page :
if type(item) is not LTTextBoxHorizontal : continue
item_text = item.get_text().strip()
# an info box is recognized by its text starting with "Vehicle" or "Ordnance"
if item_text.startswith( ("Vehicle","Ordnance") ) :
vals = item_text.split( "\n" )
# page_pos is 0 when the item's y0 is greater than half the page height, otherwise 1
page_pos = 0 if item.y0 > lt_page.height/2 else 1
if len(vals) >= 3 :
card = AslCard(
lt_page.pageid , page_pos ,
_tidy(vals[0]).replace("# ","#") ,
_tidy(vals[1]) ,
_tidy(vals[2])
)
self._progress( None , "Found card: {}".format( card ) )
cards.append( card )
pending_card = None
else :
# fewer than 3 lines of text: remember what we have, the next text item supplies the last field
pending_card = [
lt_page.pageid , page_pos ,
_tidy(vals[0]).replace("# ","#") ,
_tidy(vals[1])
]
elif pending_card :
pending_card.append( _tidy( item.get_text().strip() ) )
card = AslCard( *pending_card )
self._progress( None , "Found card: {}".format( card ) )
cards.append( card )
pending_card = None
# each info box is a list of text items, starting with the header item
info_boxes.append( [item] )
# get the details from each info box
for item in lt_page :
if type(item) is not LTTextBoxHorizontal : continue
# check if the next item could be part of an info box - it must be within the left/right boundary
# of the first item (within a certain tolerance), and below it (but not too far)
eps = 50 # left/right tolerance
for info_box in info_boxes :
# the strict "item.y1 < info_box[0].y1" test keeps the header item itself from being added again;
# the vertical gap between the header's y0 and the item's y1 must be less than 50
if item.x0 >= info_box[0].x0 - eps and item.x1 <= info_box[0].x1 + eps \
and item.y1 < info_box[0].y1 and info_box[0].y0 - item.y1 < 50 :
# yup - save it
info_box.append( item )
# generate an AslCard from each info box
for info_box in info_boxes :
card = self._make_asl_card( lt_page , info_box )
self._progress( None , "Found card: {}".format( card ) )
cards.append( card )
return cards
def parse_dir( self , dname , progress=None ) :
    """Parse every PDF found directly inside a directory.

    :param dname: directory to scan; only entries whose extension is ".pdf"
        (compared case-insensitively) are processed.
    :param progress: accepted, but not used by this method.
    :return: dict of file name -> result of parse_file() for that file.
    """
    # pick out the PDF's first, then parse them in the order they were listed
    pdf_names = [
        name for name in os.listdir( dname )
        if os.path.splitext( name )[1].lower() == ".pdf"
    ]
    return {
        name : self.parse_file( os.path.join( dname , name ) )
        for name in pdf_names
    }
def _make_asl_card( self , lt_page , items ) :
    """Build an AslCard from the text items that make up one info box.

    :param lt_page: the page the items came from (its height and pageid are used).
    :param items: the info box's text items; NOTE: this list is sorted in place.
    :return: an AslCard built from the page id, the page position and the first
        three lines of text that are at least 5 characters long.
    :raises IndexError: if fewer than three such lines are found.
    """
    # order the items by descending y0 (in place)
    items.sort( key=lambda item: item.y0 , reverse=True )
    # collect the individual lines of text, skipping the short ones
    lines = []
    for item in items :
        for line in item.get_text().strip().split( "\n" ) :
            if len(line) >= 5 :
                lines.append( line )
    # position 0 if the first item's y0 is more than half the page height, otherwise 1
    if items[0].y0 > lt_page.height/2 :
        page_pos = 0
    else :
        page_pos = 1
    # generate the AslCard
    tag = _tidy( lines[0] ).replace( "# ", "#" )
    second_line = _tidy( lines[1] )
    third_line = _tidy( lines[2] )
    return AslCard( lt_page.pageid , page_pos , tag , second_line , third_line )
def _extract_images( self , fname ) :
    """Extract card images from a file.

    Every page of the PDF is rendered to a PNG by Ghostscript, then split into
    an upper and a lower part, each of which is trimmed by _crop_image().

    All intermediate files are written to a private temporary directory that is
    removed before returning (also if an error occurs), so files left behind by
    another run can never be mistaken for pages of this PDF.

    :param fname: path of the PDF file to process.
    :return: list of image buffers (as returned by _crop_image()), in page order.
        Each page contributes two images, except that the last page contributes
        a single image (the whole page, trimmed) when both of its parts are
        less than 1000 pixels high after trimming.
    """
    resolution = 300 # pixels/inch
    card_images = []
    with tempfile.TemporaryDirectory( prefix="asl_cards-" ) as temp_dir :
        # extract each page from the PDF as an image
        fname_template = os.path.join( temp_dir , "asl_cards-%d.png" )
        args = [
            "_ignored_" , "-dQUIET" , "-dSAFER" , "-dNOPAUSE" ,
            "-sDEVICE=png16m" , "-r"+str(resolution) ,
            "-sOutputFile="+fname_template ,
            "-f" , fname
        ]
        args = [ s.encode(locale.getpreferredencoding()) for s in args ]
        # FIXME! stop GhostScript from issuing warnings (stdout).
        self._progress( 0 , "Extracting images..." )
        ghostscript.Ghostscript( *args )
        # figure out how many files were created (so we can show progress)
        # NOTE: the page images are numbered from 1.
        npages = 0
        while os.path.isfile( fname_template % (1+npages) ) :
            npages += 1
        # extract the cards from each page
        for page_no in range( npages ) :
            self._progress( float(page_no)/npages , "Processing page {}...".format( 1+page_no ) )
            page_fname = fname_template % (1+page_no)
            # the cropped images are saved next to the page image, with "a"/"b" appended to its name
            base , extn = os.path.splitext( page_fname )
            # open the next page image (and make sure it gets closed again)
            with Image.open( page_fname ) as img :
                img_width , img_height = img.size
                # extract the cards (by splitting the page at 48% of its height)
                ypos = img_height * 48 / 100
                buf1 , size1 = self._crop_image(
                    img , (0,0,img_width,ypos) , base+"a"+extn
                )
                buf2 , size2 = self._crop_image(
                    img , (0,ypos+1,img_width,img_height) , base+"b"+extn
                )
                # check if this is the last page, and it has just 1 card on it
                if page_no == npages-1 and size1[1] < 1000 and size2[1] < 1000 :
                    # yup - extract it
                    buf , _ = self._crop_image(
                        img , (0,0,img_width,img_height) , base+"a"+extn
                    )
                    card_images.append( buf )
                else :
                    # nope - save the extracted cards
                    card_images.append( buf1 )
                    card_images.append( buf2 )
    self._progress( 1.0 , "Done." )
    return card_images
def _crop_image( self , img , bbox , fname ) :
    """Crop a region out of an image, trim it, and save it to a file.

    The region is trimmed to the bounding box of the pixels that differ
    noticeably from the background colour, which is taken from the top-left
    pixel of the full image.

    :param img: the image to crop.
    :param bbox: the region to crop, as passed to img.crop().
    :param fname: name of the file the trimmed region is saved to.
    :return: tuple of (contents of the saved file, size of the trimmed region).
    """
    # crop the image
    rgn = img.crop( bbox )
    # trim the cropped region
    bgd_col = img.getpixel( (0,0) )
    # NOTE: the background only has to cover the region it is compared with, so we size it
    # to the region, rather than allocating another page-sized image for every crop.
    bgd_img = Image.new( rgn.mode , rgn.size , bgd_col )
    diff = ImageChops.difference( rgn , bgd_img )
    # (diff+diff)/2.0 - 100 : differences of 100 or less are clipped to 0 (i.e. treated as background)
    diff = ImageChops.add( diff , diff , 2.0 , -100 )
    trim_bbox = diff.getbbox()
    if trim_bbox :
        rgn = rgn.crop( trim_bbox )
    # save the cropped image
    rgn.save( fname )
    with open( fname , "rb" ) as fp :
        buf = fp.read()
    # NOTE(review): the saved file is left on disk (the unlink below is disabled) - confirm this is intentional.
    #os.unlink( fname )
    return buf , rgn.size
def _progress( self , progress , msg ) :
    """Pass a progress update on to the progress callback (if there is one).

    :param progress: progress value, passed through unchanged.
    :param msg: progress message, passed through unchanged.
    """
    callback = self.progress
    if not callback :
        return
    callback( progress , msg )
# ---------------------------------------------------------------------
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# punctuation that gets replaced with a space when tidying up extracted text
_tidy_regex = re.compile( r"[,.()+-]" )

def _tidy( val ) :
    """Replace each , . ( ) + - in a string with a space, then strip leading/trailing whitespace."""
    cleaned = _tidy_regex.sub( " " , val )
    return cleaned.strip()

@ -1,2 +1,4 @@
# python == 3.6.1
pdfminer.six == 20170419
python3-ghostscript == 0.5.0
Pillow == 4.1.0

@ -21,7 +21,7 @@ class TestRealData( unittest.TestCase ) :
pdf_parser = PdfParser(
#progress = lambda _,msg: print( msg , file=sys.stderr )
)
cards = pdf_parser.parse_file( fname2 )
cards = pdf_parser.parse_file( fname2 , images=False )
# check the results
if len(cards) != len(expected_cards) :
raise RuntimeError( "{}: got {} cards, expected {}.".format( fname , len(cards) , len(expected_cards) ) )
@ -29,6 +29,31 @@ class TestRealData( unittest.TestCase ) :
if cards[i] != expected_cards[i] :
raise RuntimeError( "{}: Card mismatch ({}): got {}, expected {}.".format( fname , i , cards[i] , expected_cards[i] ) )
def test_italian_ordnance( self ) :
    """Check the cards extracted from ItalianOrdnance.pdf against the expected list."""
    # (page_id, page_pos, tag, name) for each expected card - they are all Italian
    rows = [
        ( 1 , 0 , "Ordnance5#51" , "Mortaio5da545" ) ,
        ( 1 , 1 , "Ordnance5#52" , "Mortaio5da581/14" ) ,
        ( 2 , 0 , "Ordnance3#33" , "Fucile9cc3S" ) ,
        ( 2 , 1 , "Ordnance3#34" , "Cannone9cc3da337/45" ) ,
        ( 3 , 0 , "Ordnance3#35" , "Cannone3da347/32" ) ,
        ( 3 , 1 , "Ordnance3#36" , "Cannone3da365/17" ) ,
        ( 4 , 0 , "Ordnance5#57" , "Cannone5da570/15" ) ,
        ( 4 , 1 , "Ordnance5#58" , "Obice5da575/13" ) ,
        ( 5 , 0 , "Ordnance1#19" , "Cannone1da175/27" ) ,
        ( 5 , 1 , "Ordnance1#110" , "Obice1da175/18" ) ,
        ( 6 , 0 , "Ordnance6#611" , "Cannone6da675/32" ) ,
        ( 6 , 1 , "Ordnance6#612" , "Obice6da6100/17" ) ,
        ( 7 , 0 , "Ordnance2#213" , "Cannone2da2105/28" ) ,
        ( 7 , 1 , "Ordnance2#214" , "Obice2da2149/13" ) ,
        ( 8 , 0 , "Ordnance2#215" , "Cannone2da2149/35" ) ,
        ( 8 , 1 , "Ordnance2#216" , "Cannone2da2149/40" ) ,
        ( 9 , 0 , "Ordnance #17" , "Cannone6mitr 20" ) ,
        ( 9 , 1 , "Ordnance #17" , "Cannone6mitr 20 LF" ) ,
        ( 10 , 0 , "Ordnance1#118" , "Cannone0aa175/39" ) ,
        ( 10 , 1 , "Ordnance1#119" , "Cannone0aa175/46" ) ,
        ( 11 , 0 , "Ordnance*#*20" , "Cannone3aa*90/53" ) ,
    ]
    expected_cards = [
        AslCard( page_id=page_id , page_pos=page_pos , tag=tag , nationality="Italian" , name=name )
        for page_id , page_pos , tag , name in rows
    ]
    self._test_pdf_parser( "ItalianOrdnance.pdf" , expected_cards )
def test_japanese_vehicles( self ) :
self._test_pdf_parser( "JapaneseVehiclesFeb15.pdf" , [
AslCard(page_id=1, page_pos=0, tag="Vehicle #1", nationality="Japanese", name="Type 92A") ,

Loading…
Cancel
Save