def main(args):
    """Parse the command-line arguments and run the requested action.

    Exactly one action is performed: if any --file/--dir targets were given
    they are parsed and the resulting cards are stored in the database,
    otherwise --dump prints the database.

    args: the command-line arguments (without the program name), as
          accepted by getopt.getopt().

    Raises RuntimeError if the arguments can't be parsed, --maxpages is not
    an integer, no database was specified, or no action was requested.
    """
    # parse the arguments
    db_fname = None
    parse_targets = []
    max_pages = -1
    extract_images = True
    log_progress = False
    dump = False
    try:
        opts, _extra = getopt.getopt(
            args,
            "f:d:ph?",
            ["db=", "file=", "dir=", "maxpages=", "noimages", "progress", "dump", "help"],
        )
    except getopt.GetoptError as err:
        raise RuntimeError("Can't parse arguments: {}".format(err)) from err
    for opt, val in opts:
        if opt == "--db":
            db_fname = val
        elif opt in ("-f", "--file", "-d", "--dir"):
            # nb: files and directories are both handled by PdfParser.parse()
            parse_targets.append(val)
        elif opt == "--maxpages":
            try:
                max_pages = int(val)
            except ValueError as err:
                raise RuntimeError(
                    "Invalid value for --maxpages: {}".format(val)
                ) from err
        elif opt == "--noimages":
            extract_images = False
        elif opt == "--dump":
            # nb: there is no short form; "-d" is declared as "d:" and belongs to --dir
            dump = True
        elif opt in ("-p", "--progress"):
            log_progress = True
        elif opt in ("-h", "--help", "-?"):
            # nb: asking for help is not an error, so stop here instead of
            # falling through to the "no database" check
            print_help()
            return
        else:
            raise RuntimeError("Unknown argument: {}".format(opt))
    if not db_fname:
        raise RuntimeError("No database was specified.")
    # nb: check this before opening the database, so that a bad command line
    # doesn't leave a freshly-created database file behind
    if not parse_targets and not dump:
        raise RuntimeError("No action.")

    # nb: these are imported here so that --help and argument errors can be
    # reported without loading the PDF/database modules
    import db
    from parse import PdfParser

    # initialize
    db.open_database(db_fname)

    # do the requested processing
    if parse_targets:
        pdf_parser = PdfParser(progress_callback if log_progress else None)
        cards = []
        for target in parse_targets:
            cards.extend(
                pdf_parser.parse(target, max_pages=max_pages, images=extract_images)
            )
        db.add_cards(cards)
    else:
        db.dump_database()


def print_help():
    """Print the command-line usage summary to stdout."""
    # (short option, long option, description)
    options = [
        ("", "--db", "Database file."),
        ("-f", "--file", "PDF file to parse."),
        ("-d", "--dir", "Directory with PDF's to parse."),
        ("", "--maxpages", "Maximum number of pages to parse."),
        ("", "--noimages", "Don't extract card images."),
        ("", "--dump", "Dump the database."),
        ("-p", "--progress", "Log progress during lengthy operations."),
        ("-h", "--help", "Show this help."),
    ]
    print("{} {{options}}".format(os.path.basename(sys.argv[0])))
    print()
    for short_opt, long_opt, desc in options:
        # nb: pad the option names so that the descriptions line up
        print("  {:<2} {:<10}  {}".format(short_opt, long_opt, desc))
    print()
import sys
import os

from sqlalchemy import sql, orm, create_engine, event
from sqlalchemy import Column, ForeignKey, String, Integer, Binary
from sqlalchemy.ext.declarative import declarative_base

# ---------------------------------------------------------------------

# nb: both of these are set by open_database()
db_engine = None
db_session = None

# ---------------------------------------------------------------------

DbBase = declarative_base()


class DbBaseMixin:
    """Add helper functions to database model classes."""

    def _init_db_object(self, **kwargs):
        """Initialize ourself from a list of attributes."""
        for key, val in kwargs.items():
            setattr(self, key, val)

    def _to_string(self, cls):
        """Return "ClassName[col=val|col=val|...]" for the mapped columns of cls."""
        keys = orm.class_mapper(cls).c.keys()
        buf = "|".join(
            "{}={}".format(k, getattr(self, k)) for k in keys
        )
        return "{}[{}]".format(type(self).__name__, buf)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


class AslCard(DbBase, DbBaseMixin):
    """Models an ASL card."""

    __tablename__ = "card"
    card_id = Column(Integer, primary_key=True, autoincrement=True)
    tag = Column(String(40))
    nationality = Column(String(40))
    name = Column(String(40))
    page_id = Column(Integer)
    page_pos = Column(Integer)
    # nb: one-to-one; this also creates AslCardImage.parent_card
    card_image = orm.relationship(
        "AslCardImage", uselist=False, backref="parent_card", cascade="all,delete"
    )

    def __init__(self, **kwargs):
        self._init_db_object(**kwargs)

    def __str__(self):
        return self._to_string(AslCard)


class AslCardImage(DbBase, DbBaseMixin):
    """Models the image data for an ASL card."""

    __tablename__ = "card_image"
    card_id = Column(
        Integer, ForeignKey("card.card_id", ondelete="CASCADE"), primary_key=True
    )
    image_data = Column(Binary())
    # nb: a relationship for "card_image" is created by AslCard

    def __init__(self, **kwargs):
        self._init_db_object(**kwargs)

    def __str__(self):
        # nb: image_data is nullable, so don't assume we can take its length
        nbytes = len(self.image_data) if self.image_data is not None else 0
        return "AslCardImage[card_id={}|#bytes={}]".format(self.card_id, nbytes)

# ---------------------------------------------------------------------


def open_database(fname):
    """Open the database.

    Creates the engine and session for the SQLite file fname (stored in the
    module-level db_engine/db_session) and creates any tables that don't
    exist yet.
    """
    global db_engine, db_session

    # open the database
    conn_string = "sqlite:///{}".format(fname)
    db_engine = create_engine(conn_string, convert_unicode=True)
    #db_engine.echo = True

    # nb: foreign keys are disabled by default in SQLite, and the setting is
    # per-connection, so turn them on for every connection the engine opens
    # (rather than only for whichever connection the session happens to hold)
    @event.listens_for(db_engine, "connect")
    def _enable_foreign_keys(dbapi_conn, _conn_record):
        cursor = dbapi_conn.cursor()
        cursor.execute("PRAGMA foreign_keys = on")
        cursor.close()

    # initialize our session
    db_session = orm.create_session(bind=db_engine, autocommit=False)

    # nb: create_all() only creates tables that are missing, so this also
    # handles a file that already exists but is empty
    DbBase.metadata.create_all(db_engine)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


def add_cards(cards):
    """Build the database from the specified cards.

    Any existing cards are deleted first. The delete and the inserts are
    committed together; if anything fails the session is rolled back and
    the exception is re-raised.
    """
    try:
        # clear the database
        db_session.query(AslCard).delete()
        # add the cards
        db_session.add_all(cards)
        # commit the changes
        db_session.commit()
    except Exception:
        # nb: leave the session usable for the caller
        db_session.rollback()
        raise

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


def dump_database():
    """Dump the database."""
    # dump the ASL cards
    for card in db_session.query(AslCard).all():
        print(card)
        if card.card_image:
            print("- {}".format(card.card_image))
def parse(self, target, max_pages=-1, images=True):
    """Extract the cards from a PDF file, or from every PDF in a directory.

    target:    path to a PDF file, or to a directory; in a directory, every
               entry with a ".pdf" extension (case-insensitive) is parsed.
    max_pages: passed through to _do_parse_file() for each file.
    images:    passed through to _do_parse_file() for each file.

    Returns a single list with the cards from all the files, in file order.
    """
    # locate the files we're going to parse
    if os.path.isfile(target):
        fnames = [target]
    else:
        # nb: os.listdir() returns entries in arbitrary order; sort them so
        # that the cards always come out in the same order
        fnames = [
            os.path.join(target, f)
            for f in sorted(os.listdir(target))
            if os.path.splitext(f)[1].lower() == ".pdf"
        ]
    # parse each file
    cards = []
    for fname in fnames:
        cards.extend(self._do_parse_file(fname, max_pages, images))
    return cards
) # extract the card images if images : - card_images = self._extract_images( fname ) + card_images = self._extract_images( fname , max_pages ) if len(cards) != len(card_images) : raise RuntimeError( "Found {} cards, {} card images.".format( len(cards) , len(card_images) ) ) - return zip( cards , card_images ) - else : - return cards + for i in range(0,len(cards)) : + cards[i].card_image = AslCardImage( image_data=card_images[i] ) + return cards def _parse_page( self , cards , interp , page_no , page ) : """Extract the cards from a PDF page.""" @@ -106,13 +109,14 @@ class PdfParser: # generate the AslCard page_pos = 0 if items[0].y0 > lt_page.height/2 else 1 return AslCard( - lt_page.pageid , page_pos , - _tidy( item_texts[0] ).replace( "# ", "#" ) , - _tidy( item_texts[1] ) , - _tidy( item_texts[2] ) + tag = _tidy( item_texts[0] ).replace( "# ", "#" ) , + nationality = _tidy( item_texts[1] ) , + name = _tidy( item_texts[2] ) , + page_id = lt_page.pageid , + page_pos = page_pos , ) - def _extract_images( self , fname ) : + def _extract_images( self , fname , max_pages ) : """Extract card images from a file.""" # extract each page from the PDF as an image fname_template = os.path.join( tempfile.gettempdir() , "asl_cards-%d.png" ) @@ -121,8 +125,11 @@ class PdfParser: "_ignored_" , "-dQUIET" , "-dSAFER" , "-dNOPAUSE" , "-sDEVICE=png16m" , "-r"+str(resolution) , "-sOutputFile="+fname_template , - "-f" , fname ] + if max_pages > 0 : + args.append( "-dLastPage={}".format(max_pages) ) + args.extend( [ "-f" , fname ] ) + # FIXME! clean up left-over temp files before we start args = [ s.encode(locale.getpreferredencoding()) for s in args ] # FIXME! stop GhostScript from issuing warnings (stdout). self._progress( 0 , "Extracting images..." 
) @@ -138,7 +145,7 @@ class PdfParser: card_images = [] for page_no in range(0,npages) : # open the next page image - self._progress( float(page_no)/npages , "Processing page {}...".format( 1+page_no ) ) + self._progress( float(page_no)/npages , "Extracting card images from page {}...".format( 1+page_no ) ) fname = fname_template % (1+page_no) img = Image.open( fname ) img_width , img_height = img.size @@ -186,7 +193,7 @@ class PdfParser: rgn.save( fname ) with open( fname , "rb" ) as fp : buf = fp.read() - #os.unlink( fname ) + os.unlink( fname ) return buf , rgn.size def _progress( self , progress , msg ) : diff --git a/asl_cards/requirements.txt b/asl_cards/requirements.txt index 8bd9400..be13710 100644 --- a/asl_cards/requirements.txt +++ b/asl_cards/requirements.txt @@ -2,3 +2,4 @@ pdfminer.six == 20170419 python3-ghostscript == 0.5.0 Pillow == 4.1.0 +SQLAlchemy == 1.1.9