diff --git a/asl_cards/parse.py b/asl_cards/parse.py index fa4983e..e5bd965 100644 --- a/asl_cards/parse.py +++ b/asl_cards/parse.py @@ -2,6 +2,8 @@ import sys import os import re import itertools +import time +import datetime import tempfile import locale from collections import namedtuple @@ -84,7 +86,7 @@ class PdfParser: self.on_error = on_error # nb: for showing the user an error message self.cancelling = False - def parse( self , target , max_pages=-1 , images=True ) : + def parse( self , target , max_pages=-1 , image_res=None ) : """Extract the cards from a PDF file.""" # locate the files we're going to parse if os.path.isfile( target ) : @@ -97,10 +99,11 @@ class PdfParser: ] # parse each file cards = [] + start_time = time.time() for file_no,fname in enumerate(fnames) : if self.cancelling : raise AnalyzeCancelledException() try : - file_cards = self._do_parse_file( float(file_no)/len(fnames) , fname , max_pages , images ) + file_cards = self._do_parse_file( float(file_no)/len(fnames) , fname , max_pages , image_res ) except AnalyzeCancelledException as ex : raise except Exception as ex : @@ -118,9 +121,11 @@ class PdfParser: self._progress( 1.0 , "Done." ) # filter out placeholder cards cards = [ c for c in cards if c.nationality != "_unused_" and c.name != "_unused_" ] + elapsed_time = int( time.time() - start_time ) + #print( "Elapsed time: {}".format( datetime.timedelta( seconds=elapsed_time ) ) ) return cards - def _do_parse_file( self , pval , fname , max_pages , images ) : + def _do_parse_file( self , pval , fname , max_pages , image_res ) : cards = [] # check if we have an index for this file # NOTE: We originally tried to get the details of each card by parsing the PDF files but unfortunately, @@ -184,9 +189,9 @@ class PdfParser: if max_pages > 0 and 1+page_no >= max_pages : break # extract the card images - if images : + if image_res : self._progress( pval , "Extracting images from {}...".format( os.path.split(fname)[1] ) ) - card_images = self._extract_images( fname , max_pages ) + card_images = self._extract_images( fname , max_pages , image_res ) if len(cards) != len(card_images) : raise RuntimeError( "Card mismatch in {}: found {} cards, {} card images.".format( @@ -248,17 +253,16 @@ class PdfParser: page_pos = page_pos , ) - def _extract_images( self , fname , max_pages ) : + def _extract_images( self , fname , max_pages , image_res ) : """Extract card images from a file.""" # clean up any leftover extracted images from a previous run # NOTE: It's important we do this, otherwise we might think they're part of this run. for f in _find_extracted_image_files() : os.unlink( f ) # extract each page from the PDF as an image - resolution = 300 # pixels/inch args = [ "_ignored_" , "-dQUIET" , "-dSAFER" , "-dNOPAUSE" , - "-sDEVICE=png16m" , "-r"+str(resolution) , + "-sDEVICE=png16m" , "-r"+str(image_res) , "-sOutputFile="+_EXTRACTED_IMAGES_FILENAME_TEMPLATE ] if max_pages > 0 : @@ -277,7 +281,7 @@ class PdfParser: # extract the cards (by splitting the page in half) fname2 = list( os.path.split( fname ) ) fname2[1] = os.path.splitext( fname2[1] ) - ypos = img_height * 48 / 100 + ypos = img_height * 48/100 # nb: the cards are not perfectly aligned in the page buf1 , size1 = self._crop_image( img , (0,0,img_width,ypos) , os.path.join( fname2[0] , fname2[1][0]+"a"+fname2[1][1] ) @@ -286,8 +290,11 @@ class PdfParser: img , (0,ypos+1,img_width,img_height) , os.path.join( fname2[0] , fname2[1][0]+"b"+fname2[1][1] ) ) - # check if this is the last page, and it has just 1 card on it - if page_no == len(image_fnames)-1 and size1[1] < 1000 and size2[1] < 1000 : + if not buf1 and not buf2 : + continue # nb: blank page + # check if this is the last page, and it has just 1 card (centred) on it (e.g. ItalianOrdnance.pdf) + cutoff = img_height / 4 + if page_no == len(image_fnames)-1 and size1[1] < cutoff and size2[1] < cutoff : # yup - extract it buf , _ = self._crop_image( img , (0,0,img_width,img_height) , @@ -295,9 +302,11 @@ class PdfParser: ) card_images.append( buf ) else : - # nope - save the extracted cards - card_images.append( buf1 ) - card_images.append( buf2 ) + # nope - save the extracted card(s) + if buf1 : + card_images.append( buf1 ) + if buf2 : + card_images.append( buf2 ) # clean up os.unlink( fname ) return card_images @@ -309,16 +318,19 @@ class PdfParser: bgd_col = img.getpixel( (0,0) ) bgd_img = Image.new( img.mode , img.size , bgd_col ) diff = ImageChops.difference( rgn , bgd_img ) - #diff = ImageChops.add(diff, diff, 2.0, -100) + diff = ImageChops.add(diff, diff, 2.0, -100) bbox = diff.getbbox() if bbox : + # save the cropped image rgn = rgn.crop( bbox ) - # save the cropped image - rgn.save( fname ) - with open( fname , "rb" ) as fp : - buf = fp.read() - os.unlink( fname ) - return buf , rgn.size + rgn.save( fname ) + with open( fname , "rb" ) as fp : + buf = fp.read() + os.unlink( fname ) + return buf , rgn.size + else : + # nb: we get here if the entire region is blank (e.g. the bottom half of a single-card page) + return None , None def _progress( self , pval , msg ) : """Call the progress callback.""" diff --git a/asl_cards/tests/_test_case_base.py b/asl_cards/tests/_test_case_base.py index 12e499b..3ab625c 100755 --- a/asl_cards/tests/_test_case_base.py +++ b/asl_cards/tests/_test_case_base.py @@ -21,10 +21,7 @@ class TestCaseBase( unittest.TestCase ) : None , #progress = lambda _,msg: print( msg , file=sys.stderr , flush=True ) ) - cards = pdf_parser.parse( fname2 , images=False ) - if False : - for c in cards : - print(c) + cards = pdf_parser.parse( fname2 , image_res=None ) # check the results if len(cards) != len(expected_cards) : raise RuntimeError( "{}: got {} cards, expected {}.".format( fname , len(cards) , len(expected_cards) ) ) diff --git a/asl_cards/tests/synthetic-data/1-card.doc b/asl_cards/tests/synthetic-data/1-card.doc index 5b52de6..a37b9a1 100755 Binary files a/asl_cards/tests/synthetic-data/1-card.doc and b/asl_cards/tests/synthetic-data/1-card.doc differ diff --git a/asl_cards/tests/synthetic-data/1-card.pdf b/asl_cards/tests/synthetic-data/1-card.pdf index 02c8f3b..fbf2c61 100755 Binary files a/asl_cards/tests/synthetic-data/1-card.pdf and b/asl_cards/tests/synthetic-data/1-card.pdf differ diff --git a/asl_cards/tests/synthetic-data/2-cards.doc b/asl_cards/tests/synthetic-data/2-cards.doc index e40958b..47c0a5b 100755 Binary files a/asl_cards/tests/synthetic-data/2-cards.doc and b/asl_cards/tests/synthetic-data/2-cards.doc differ diff --git a/asl_cards/tests/synthetic-data/2-cards.pdf b/asl_cards/tests/synthetic-data/2-cards.pdf index 25ebe1b..d4e9cfb 100755 Binary files a/asl_cards/tests/synthetic-data/2-cards.pdf and b/asl_cards/tests/synthetic-data/2-cards.pdf differ diff --git a/asl_cards/tests/synthetic-data/3-cards.doc b/asl_cards/tests/synthetic-data/3-cards.doc index 28a2d6d..a19582d 100755 Binary files a/asl_cards/tests/synthetic-data/3-cards.doc and b/asl_cards/tests/synthetic-data/3-cards.doc differ diff --git a/asl_cards/tests/synthetic-data/3-cards.pdf b/asl_cards/tests/synthetic-data/3-cards.pdf index 65a3e16..677bb4b 100755 Binary files a/asl_cards/tests/synthetic-data/3-cards.pdf and b/asl_cards/tests/synthetic-data/3-cards.pdf differ diff --git a/startup_widget.py b/startup_widget.py index 2ffebd8..fa63921 100644 --- a/startup_widget.py +++ b/startup_widget.py @@ -20,10 +20,11 @@ class AnalyzeThread( QThread ) : progress2_signal = pyqtSignal( float , name="progress2" ) completed_signal = pyqtSignal( str , name="completed" ) - def __init__( self , cards_dir , db_fname ) : + def __init__( self , cards_dir , image_res , db_fname ) : # initialize - super(AnalyzeThread,self).__init__() + super().__init__() self.cards_dir = cards_dir + self.image_res = image_res self.db_fname = db_fname def run( self ) : @@ -40,7 +41,7 @@ class AnalyzeThread( QThread ) : on_ask = self.on_ask , on_error = self.on_error , ) - cards = self.parser.parse( self.cards_dir ) + cards = self.parser.parse( self.cards_dir , image_res=self.image_res ) if not cards : raise RuntimeError( "No cards were found." ) db.open_database( self.db_fname , True ) @@ -137,6 +138,10 @@ class StartupWidget( QWidget ) : ) self.btn_load_db.setText( " " + self.btn_load_db.text() ) # load the widget + self.cbo_resolution.addItem( "150 dpi" ) + self.cbo_resolution.addItem( "300 dpi" ) + self.cbo_resolution.addItem( "600 dpi" ) + self.cbo_resolution.setCurrentIndex( 1 ) if os.path.isfile( db_fname ) : self.le_load_db_fname.setText( db_fname ) else : @@ -187,6 +192,8 @@ class StartupWidget( QWidget ) : MainWindow.show_error_msg( "Please choose where you want to save the results." ) self.le_save_db_fname.setFocus() return + # unload other settings + image_res = int( self.cbo_resolution.currentText().split()[ 0 ] ) # run the analysis (in a worker thread) self.frm_open_db.hide() self.frm_analyze_progress.show() @@ -194,7 +201,7 @@ class StartupWidget( QWidget ) : self._update_analyze_ui( False ) self.btn_cancel_analyze.setEnabled( True ) self.btn_cancel_analyze.clicked.connect( self.on_cancel_analyze ) - self.analyze_thread = AnalyzeThread( cards_dir , fname ) + self.analyze_thread = AnalyzeThread( cards_dir , image_res , fname ) self.analyze_thread.progress_signal.connect( self.on_analyze_progress ) self.analyze_thread.progress2_signal.connect( self.on_analyze_progress2 ) self.analyze_thread.completed_signal.connect( self.on_analyze_completed ) @@ -252,6 +259,7 @@ class StartupWidget( QWidget ) : def _update_analyze_ui( self , enable ) : # update the UI widgets = [ self.lbl_cards_dir , self.le_cards_dir, self.btn_cards_dir ] + widgets.extend( [ self.lbl_resolution , self.cbo_resolution , self.lbl_resolution_hint ] ) widgets.extend( [ self.lbl_save_db_fname , self.le_save_db_fname , self.btn_save_db_fname ] ) widgets.append( self.btn_analyze ) for w in widgets : diff --git a/ui/startup_widget.ui b/ui/startup_widget.ui index b74cec4..5addfc1 100644 --- a/ui/startup_widget.ui +++ b/ui/startup_widget.ui @@ -7,7 +7,7 @@ 0 0 592 - 418 + 471 @@ -64,7 +64,7 @@ 2 - 8 + 0 0 @@ -263,6 +263,65 @@ + + + + 0 + + + + + &Resolution: + + + cbo_resolution + + + + + + + + 80 + 0 + + + + + 80 + 16777215 + + + + + + + + + 8 + true + + + + (higher values are slower, but look better) + + + + + + + Qt::Horizontal + + + + 40 + 20 + + + + + + @@ -654,6 +713,7 @@ le_cards_dir btn_cards_dir + cbo_resolution le_save_db_fname btn_save_db_fname btn_analyze