Save the extracted cards in a database.

Unified how files and directories are parsed.
Added some helper options to the CLI.
master
Pacman Ghost 7 years ago
parent 3fd7fdc22c
commit 2857beb327
  1. 65
      asl_cards/__main__.py
  2. 99
      asl_cards/db.py
  3. 67
      asl_cards/parse.py
  4. 1
      asl_cards/requirements.txt

@ -6,32 +6,60 @@ import os
import getopt
from parse import PdfParser
import db
# ---------------------------------------------------------------------
def main( args ) :
    """Parse the command-line arguments and run the requested action.

    args: list of command-line arguments (without the program name).

    A database (--db) is always required. If any --file/--dir targets were
    given, they are parsed and the resulting cards are stored in the database;
    otherwise, if --dump was given, the database is dumped. Asking for help
    prints the usage text and returns without doing anything else.

    Raises RuntimeError if the arguments can't be parsed, no database was
    specified, or no action was requested.
    """
    # parse the arguments
    db_fname = None
    parse_targets = []
    max_pages = -1
    extract_images = True
    log_progress = False
    dump = False
    try :
        opts , args = getopt.getopt( args , "f:d:ph?" , ["db=","file=","dir=","maxpages=","noimages","progress","dump","help"] )
    except getopt.GetoptError as err :
        raise RuntimeError( "Can't parse arguments: {}".format( err ) ) from err
    for opt,val in opts :
        if opt in ["--db"] :
            db_fname = val
        elif opt in ["-f","--file","-d","--dir"] :
            # nb: files and directories are handled the same way by the parser
            parse_targets.append( val )
        elif opt in ["--maxpages"] :
            max_pages = int( val )
        elif opt in ["--noimages"] :
            extract_images = False
        elif opt in ["--dump"] :
            # nb: there is no short form, since -d is already taken by --dir
            dump = True
        elif opt in ["-p","--progress"] :
            log_progress = True
        elif opt in ["-h","--help","-?"] :
            # nb: we stop here, so that asking for help doesn't trigger a "no database" error
            print_help()
            return
        else :
            raise RuntimeError( "Unknown argument: {}".format( opt ) )
    if not db_fname : raise RuntimeError( "No database was specified." )

    # initialize
    db.open_database( db_fname )

    # do the requested processing
    pdf_parser = PdfParser( progress_callback if log_progress else None )
    if parse_targets :
        cards = []
        for pt in parse_targets :
            cards.extend(
                pdf_parser.parse( pt , max_pages=max_pages , images=extract_images )
            )
        db.add_cards( cards )
    elif dump :
        db.dump_database()
    else :
        raise RuntimeError( "No action." )
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@ -46,8 +74,13 @@ def progress_callback( progress , msg ) :
def print_help() :
    """Print the command-line usage summary to stdout.

    The first line shows the program name, taken from the last path
    component of sys.argv[0]; each option is then listed once.
    """
    print( "{} {{options}}".format( os.path.split(sys.argv[0])[1] ) )
    print()
    print( " --db Database file." )
    print( " -f --file PDF file to parse." )
    print( " -d --dir Directory with PDF's to parse." )
    print( " --maxpages Maximum number of pages to parse." )
    print( " --noimages Don't extract card images." )
    print( " --dump Dump the database." )
    print( " --progress Log progress during lengthy operations." )
    print()
# ---------------------------------------------------------------------

@ -0,0 +1,99 @@
import sys
import os
from sqlalchemy import sql , orm , create_engine
from sqlalchemy import Column , ForeignKey , String , Integer , Binary
# ---------------------------------------------------------------------
# module-wide SQLAlchemy engine; None until open_database() assigns it
db_engine = None
# module-wide ORM session bound to db_engine; None until open_database() assigns it
db_session = None
# ---------------------------------------------------------------------
from sqlalchemy.ext.declarative import declarative_base
# declarative base class that the model classes in this module derive from
DbBase = declarative_base()
class DbBaseMixin :
    """Add helper functions to database model classes."""

    def _init_db_object( self , **kwargs ) :
        """Initialize ourself from a list of attributes.

        Each keyword argument is assigned to the attribute of the same name.
        """
        for k,v in kwargs.items() :
            setattr( self , k , v )

    def _to_string( self , cls=None ) :
        """Return a string of the form "TypeName[key=value|key=value|...]".

        cls: mapped class whose column keys are listed; defaults to our own
             type, so callers don't have to repeat their class name.
        """
        if cls is None :
            cls = type( self )
        keys = orm.class_mapper( cls ).c.keys()
        buf = "|".join(
            "{}={}".format( k , getattr(self,k) ) for k in keys
        )
        return "{}[{}]".format( type(self).__name__ , buf )
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class AslCard( DbBase , DbBaseMixin ) :
    """ORM model for an ASL card (a row in the "card" table)."""

    __tablename__ = "card"

    # primary key, auto-incremented
    card_id = Column( Integer , primary_key=True , autoincrement=True )

    # descriptive text for the card
    tag = Column( String(40) )
    nationality = Column( String(40) )
    name = Column( String(40) )

    # presumably the card's location in the source PDF (page, position on the page) - TODO confirm
    page_id = Column( Integer )
    page_pos = Column( Integer )

    # at most one image per card; the image object gets a "parent_card" backref
    card_image = orm.relationship(
        "AslCardImage" ,
        uselist = False ,
        backref = "parent_card" ,
        cascade = "all,delete" ,
    )

    def __init__( self , **kwargs ) :
        """Set each keyword argument as an attribute on the new object."""
        self._init_db_object( **kwargs )

    def __str__( self ) :
        """Return a one-line summary of the card's column values."""
        return self._to_string( AslCard )
class AslCardImage( DbBase , DbBaseMixin ) :
    """Models the image data for an ASL card."""

    __tablename__ = "card_image"

    # nb: the card's ID doubles as our primary key, so a card has at most one image row
    card_id = Column( Integer , ForeignKey("card.card_id",ondelete="CASCADE") , primary_key=True )
    image_data = Column( Binary() )
    # nb: AslCard's "card_image" relationship adds a "parent_card" backref to this class

    def __init__( self , **kwargs ) :
        """Set each keyword argument as an attribute on the new object."""
        self._init_db_object( **kwargs )

    def __str__( self ) :
        """Return a one-line summary (card ID and image size in bytes)."""
        # nb: image_data isn't declared NOT NULL, so it may be None
        nbytes = len(self.image_data) if self.image_data is not None else 0
        return "AslCardImage[card_id={}|#bytes={}]".format( self.card_id , nbytes )
# ---------------------------------------------------------------------
def open_database( fname ) :
    """Open the database.

    fname: path to the SQLite database file (created if it doesn't exist).

    Sets the module-level db_engine and db_session, and creates any tables
    that are missing from the database.
    """
    global db_engine , db_session
    # nb: imported here so that this function doesn't rely on the module's import list
    from sqlalchemy import event

    # open the database
    conn_string = "sqlite:///{}".format( fname )
    db_engine = create_engine( conn_string , convert_unicode=True )
    #db_engine.echo = True

    # nb: foreign keys are disabled by default in SQLite, and the setting belongs
    # to a connection (not the database file), so we turn it on for every
    # connection the engine opens, rather than once through the session.
    @event.listens_for( db_engine , "connect" )
    def _enable_foreign_keys( dbapi_conn , conn_record ) :
        cursor = dbapi_conn.cursor()
        cursor.execute( "PRAGMA foreign_keys = on" )
        cursor.close()

    # initialize our session
    db_session = orm.create_session( bind=db_engine , autocommit=False )

    # create our tables
    # nb: create_all() skips tables that already exist, so this is safe for an
    # existing database, and also handles a file that exists but is empty.
    DbBase.metadata.create_all( db_engine )
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def add_cards( cards ) :
    """Build the database from the specified cards.

    cards: the AslCard objects to store.

    Every existing card is deleted first, so afterwards the database holds
    only the given cards. The delete and the inserts are committed together;
    if anything fails, the session is rolled back and the error is re-raised.
    """
    try :
        # clear the database
        # nb: this is a bulk delete, which doesn't run ORM-level cascades; the
        # card images are removed by the ON DELETE CASCADE on their foreign key.
        db_session.query(AslCard).delete()
        # add the cards
        db_session.add_all( cards )
        # commit the changes
        db_session.commit()
    except Exception :
        # nb: don't leave the session stuck in a failed transaction
        db_session.rollback()
        raise
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def dump_database() :
    """Print every card in the database, each followed by its image (if it has one)."""
    all_cards = db_session.query( AslCard ).all()
    for asl_card in all_cards :
        print( asl_card )
        image = asl_card.card_image
        if not image :
            # nb: no image was stored for this card
            continue
        print( "- {}".format( image ) )

@ -14,12 +14,7 @@ from pdfminer.pdfpage import PDFPage
import ghostscript
from PIL import Image , ImageChops
# ---------------------------------------------------------------------
AslCard = namedtuple(
"AslCard" ,
[ "page_id" , "page_pos" , "tag" , "nationality" , "name" ]
)
from db import AslCard , AslCardImage
# ---------------------------------------------------------------------
@ -29,18 +24,24 @@ class PdfParser:
# initialize
self.progress = progress
def parse_dir( self , dname , progress=None ) :
"""Parse all PDF's in a directory."""
fcards = {}
for fname in os.listdir(dname) :
if os.path.splitext(fname)[1].lower() != ".pdf" :
continue
cards = self.parse_file( os.path.join(dname,fname) )
fcards[fname] = cards
return fcards
def parse_file( self , fname , images=True ) :
def parse( self , target , max_pages=-1 , images=True ) :
    """Extract the cards from a PDF file, or from every PDF file in a directory.

    target:    a file (parsed whatever its extension), or a directory, in
               which case the files directly inside it that have a .pdf
               extension (case-insensitive) are parsed, in sorted order.
    max_pages: passed through to the per-file parser.
    images:    passed through to the per-file parser.

    Returns a single list with the cards from all the parsed files.
    """
    # locate the files we're going to parse
    if os.path.isfile( target ) :
        fnames = [ target ]
    else :
        # nb: os.listdir() returns entries in arbitrary order, so we sort them to get repeatable results
        fnames = sorted(
            os.path.join( target , f )
            for f in os.listdir( target )
            if os.path.splitext( f )[1].lower() == ".pdf"
        )
    # parse each file
    cards = []
    for fname in fnames :
        cards.extend( self._do_parse_file( fname , max_pages , images ) )
    return cards
def _do_parse_file( self , fname , max_pages , images ) :
# extract the details of each card
rmgr = PDFResourceManager()
laparams = LAParams()
@ -51,18 +52,20 @@ class PdfParser:
self._progress( 0 , "Loading file: {}".format( fname ) )
pages = list( PDFPage.get_pages( fp ) )
for page_no,page in enumerate(pages) :
self._progress( float(page_no)/len(pages) , "Processing page {}...".format( 1+page_no ) )
self._progress( float(page_no)/len(pages) , "Extracting card info from page {}...".format( 1+page_no ) )
page_cards = self._parse_page( cards , interp , page_no , page )
cards.extend( page_cards )
if max_pages > 0 and 1+page_no >= max_pages :
break
self._progress( 1.0 , "Done." )
# extract the card images
if images :
card_images = self._extract_images( fname )
card_images = self._extract_images( fname , max_pages )
if len(cards) != len(card_images) :
raise RuntimeError( "Found {} cards, {} card images.".format( len(cards) , len(card_images) ) )
return zip( cards , card_images )
else :
return cards
for i in range(0,len(cards)) :
cards[i].card_image = AslCardImage( image_data=card_images[i] )
return cards
def _parse_page( self , cards , interp , page_no , page ) :
"""Extract the cards from a PDF page."""
@ -106,13 +109,14 @@ class PdfParser:
# generate the AslCard
page_pos = 0 if items[0].y0 > lt_page.height/2 else 1
return AslCard(
lt_page.pageid , page_pos ,
_tidy( item_texts[0] ).replace( "# ", "#" ) ,
_tidy( item_texts[1] ) ,
_tidy( item_texts[2] )
tag = _tidy( item_texts[0] ).replace( "# ", "#" ) ,
nationality = _tidy( item_texts[1] ) ,
name = _tidy( item_texts[2] ) ,
page_id = lt_page.pageid ,
page_pos = page_pos ,
)
def _extract_images( self , fname ) :
def _extract_images( self , fname , max_pages ) :
"""Extract card images from a file."""
# extract each page from the PDF as an image
fname_template = os.path.join( tempfile.gettempdir() , "asl_cards-%d.png" )
@ -121,8 +125,11 @@ class PdfParser:
"_ignored_" , "-dQUIET" , "-dSAFER" , "-dNOPAUSE" ,
"-sDEVICE=png16m" , "-r"+str(resolution) ,
"-sOutputFile="+fname_template ,
"-f" , fname
]
if max_pages > 0 :
args.append( "-dLastPage={}".format(max_pages) )
args.extend( [ "-f" , fname ] )
# FIXME! clean up left-over temp files before we start
args = [ s.encode(locale.getpreferredencoding()) for s in args ]
# FIXME! stop GhostScript from issuing warnings (stdout).
self._progress( 0 , "Extracting images..." )
@ -138,7 +145,7 @@ class PdfParser:
card_images = []
for page_no in range(0,npages) :
# open the next page image
self._progress( float(page_no)/npages , "Processing page {}...".format( 1+page_no ) )
self._progress( float(page_no)/npages , "Extracting card images from page {}...".format( 1+page_no ) )
fname = fname_template % (1+page_no)
img = Image.open( fname )
img_width , img_height = img.size
@ -186,7 +193,7 @@ class PdfParser:
rgn.save( fname )
with open( fname , "rb" ) as fp :
buf = fp.read()
#os.unlink( fname )
os.unlink( fname )
return buf , rgn.size
def _progress( self , progress , msg ) :

@ -2,3 +2,4 @@
pdfminer.six == 20170419
python3-ghostscript == 0.5.0
Pillow == 4.1.0
SQLAlchemy == 1.1.9

Loading…
Cancel
Save