diff --git a/asl_cards/parse.py b/asl_cards/parse.py
index fa4983e..e5bd965 100644
--- a/asl_cards/parse.py
+++ b/asl_cards/parse.py
@@ -2,6 +2,8 @@ import sys
import os
import re
import itertools
+import time
+import datetime
import tempfile
import locale
from collections import namedtuple
@@ -84,7 +86,7 @@ class PdfParser:
self.on_error = on_error # nb: for showing the user an error message
self.cancelling = False
- def parse( self , target , max_pages=-1 , images=True ) :
+ def parse( self , target , max_pages=-1 , image_res=None ) :
"""Extract the cards from a PDF file."""
# locate the files we're going to parse
if os.path.isfile( target ) :
@@ -97,10 +99,11 @@ class PdfParser:
]
# parse each file
cards = []
+ start_time = time.time()
for file_no,fname in enumerate(fnames) :
if self.cancelling : raise AnalyzeCancelledException()
try :
- file_cards = self._do_parse_file( float(file_no)/len(fnames) , fname , max_pages , images )
+ file_cards = self._do_parse_file( float(file_no)/len(fnames) , fname , max_pages , image_res )
except AnalyzeCancelledException as ex :
raise
except Exception as ex :
@@ -118,9 +121,11 @@ class PdfParser:
self._progress( 1.0 , "Done." )
# filter out placeholder cards
cards = [ c for c in cards if c.nationality != "_unused_" and c.name != "_unused_" ]
+ elapsed_time = int( time.time() - start_time )
+ #print( "Elapsed time: {}".format( datetime.timedelta( seconds=elapsed_time ) ) )
return cards
- def _do_parse_file( self , pval , fname , max_pages , images ) :
+ def _do_parse_file( self , pval , fname , max_pages , image_res ) :
cards = []
# check if we have an index for this file
# NOTE: We originally tried to get the details of each card by parsing the PDF files but unfortunately,
@@ -184,9 +189,9 @@ class PdfParser:
if max_pages > 0 and 1+page_no >= max_pages :
break
# extract the card images
- if images :
+ if image_res :
self._progress( pval , "Extracting images from {}...".format( os.path.split(fname)[1] ) )
- card_images = self._extract_images( fname , max_pages )
+ card_images = self._extract_images( fname , max_pages , image_res )
if len(cards) != len(card_images) :
raise RuntimeError(
"Card mismatch in {}: found {} cards, {} card images.".format(
@@ -248,17 +253,16 @@ class PdfParser:
page_pos = page_pos ,
)
- def _extract_images( self , fname , max_pages ) :
+ def _extract_images( self , fname , max_pages , image_res ) :
"""Extract card images from a file."""
# clean up any leftover extracted images from a previous run
# NOTE: It's important we do this, otherwise we might think they're part of this run.
for f in _find_extracted_image_files() :
os.unlink( f )
# extract each page from the PDF as an image
- resolution = 300 # pixels/inch
args = [
"_ignored_" , "-dQUIET" , "-dSAFER" , "-dNOPAUSE" ,
- "-sDEVICE=png16m" , "-r"+str(resolution) ,
+ "-sDEVICE=png16m" , "-r"+str(image_res) ,
"-sOutputFile="+_EXTRACTED_IMAGES_FILENAME_TEMPLATE
]
if max_pages > 0 :
@@ -277,7 +281,7 @@ class PdfParser:
# extract the cards (by splitting the page in half)
fname2 = list( os.path.split( fname ) )
fname2[1] = os.path.splitext( fname2[1] )
- ypos = img_height * 48 / 100
+ ypos = img_height * 48/100 # nb: the cards are not perfectly aligned in the page
buf1 , size1 = self._crop_image(
img , (0,0,img_width,ypos) ,
os.path.join( fname2[0] , fname2[1][0]+"a"+fname2[1][1] )
@@ -286,8 +290,11 @@ class PdfParser:
img , (0,ypos+1,img_width,img_height) ,
os.path.join( fname2[0] , fname2[1][0]+"b"+fname2[1][1] )
)
- # check if this is the last page, and it has just 1 card on it
- if page_no == len(image_fnames)-1 and size1[1] < 1000 and size2[1] < 1000 :
+ if not buf1 and not buf2 :
+ continue # nb: blank page
+ # check if this is the last page, and it has just 1 card (centred) on it (e.g. ItalianOrdnance.pdf)
+ cutoff = img_height / 4
+ if page_no == len(image_fnames)-1 and size1[1] < cutoff and size2[1] < cutoff :
# yup - extract it
buf , _ = self._crop_image(
img , (0,0,img_width,img_height) ,
@@ -295,9 +302,11 @@ class PdfParser:
)
card_images.append( buf )
else :
- # nope - save the extracted cards
- card_images.append( buf1 )
- card_images.append( buf2 )
+ # nope - save the extracted card(s)
+ if buf1 :
+ card_images.append( buf1 )
+ if buf2 :
+ card_images.append( buf2 )
# clean up
os.unlink( fname )
return card_images
@@ -309,16 +318,19 @@ class PdfParser:
bgd_col = img.getpixel( (0,0) )
bgd_img = Image.new( img.mode , img.size , bgd_col )
diff = ImageChops.difference( rgn , bgd_img )
- #diff = ImageChops.add(diff, diff, 2.0, -100)
+ diff = ImageChops.add(diff, diff, 2.0, -100)
bbox = diff.getbbox()
if bbox :
+ # save the cropped image
rgn = rgn.crop( bbox )
- # save the cropped image
- rgn.save( fname )
- with open( fname , "rb" ) as fp :
- buf = fp.read()
- os.unlink( fname )
- return buf , rgn.size
+ rgn.save( fname )
+ with open( fname , "rb" ) as fp :
+ buf = fp.read()
+ os.unlink( fname )
+ return buf , rgn.size
+ else :
+ # nb: we get here if the entire region is blank (e.g. the bottom half of a single-card page)
+ return None , None
def _progress( self , pval , msg ) :
"""Call the progress callback."""
diff --git a/asl_cards/tests/_test_case_base.py b/asl_cards/tests/_test_case_base.py
index 12e499b..3ab625c 100755
--- a/asl_cards/tests/_test_case_base.py
+++ b/asl_cards/tests/_test_case_base.py
@@ -21,10 +21,7 @@ class TestCaseBase( unittest.TestCase ) :
None ,
#progress = lambda _,msg: print( msg , file=sys.stderr , flush=True )
)
- cards = pdf_parser.parse( fname2 , images=False )
- if False :
- for c in cards :
- print(c)
+ cards = pdf_parser.parse( fname2 , image_res=None )
# check the results
if len(cards) != len(expected_cards) :
raise RuntimeError( "{}: got {} cards, expected {}.".format( fname , len(cards) , len(expected_cards) ) )
diff --git a/asl_cards/tests/synthetic-data/1-card.doc b/asl_cards/tests/synthetic-data/1-card.doc
index 5b52de6..a37b9a1 100755
Binary files a/asl_cards/tests/synthetic-data/1-card.doc and b/asl_cards/tests/synthetic-data/1-card.doc differ
diff --git a/asl_cards/tests/synthetic-data/1-card.pdf b/asl_cards/tests/synthetic-data/1-card.pdf
index 02c8f3b..fbf2c61 100755
Binary files a/asl_cards/tests/synthetic-data/1-card.pdf and b/asl_cards/tests/synthetic-data/1-card.pdf differ
diff --git a/asl_cards/tests/synthetic-data/2-cards.doc b/asl_cards/tests/synthetic-data/2-cards.doc
index e40958b..47c0a5b 100755
Binary files a/asl_cards/tests/synthetic-data/2-cards.doc and b/asl_cards/tests/synthetic-data/2-cards.doc differ
diff --git a/asl_cards/tests/synthetic-data/2-cards.pdf b/asl_cards/tests/synthetic-data/2-cards.pdf
index 25ebe1b..d4e9cfb 100755
Binary files a/asl_cards/tests/synthetic-data/2-cards.pdf and b/asl_cards/tests/synthetic-data/2-cards.pdf differ
diff --git a/asl_cards/tests/synthetic-data/3-cards.doc b/asl_cards/tests/synthetic-data/3-cards.doc
index 28a2d6d..a19582d 100755
Binary files a/asl_cards/tests/synthetic-data/3-cards.doc and b/asl_cards/tests/synthetic-data/3-cards.doc differ
diff --git a/asl_cards/tests/synthetic-data/3-cards.pdf b/asl_cards/tests/synthetic-data/3-cards.pdf
index 65a3e16..677bb4b 100755
Binary files a/asl_cards/tests/synthetic-data/3-cards.pdf and b/asl_cards/tests/synthetic-data/3-cards.pdf differ
diff --git a/startup_widget.py b/startup_widget.py
index 2ffebd8..fa63921 100644
--- a/startup_widget.py
+++ b/startup_widget.py
@@ -20,10 +20,11 @@ class AnalyzeThread( QThread ) :
progress2_signal = pyqtSignal( float , name="progress2" )
completed_signal = pyqtSignal( str , name="completed" )
- def __init__( self , cards_dir , db_fname ) :
+ def __init__( self , cards_dir , image_res , db_fname ) :
# initialize
- super(AnalyzeThread,self).__init__()
+ super().__init__()
self.cards_dir = cards_dir
+ self.image_res = image_res
self.db_fname = db_fname
def run( self ) :
@@ -40,7 +41,7 @@ class AnalyzeThread( QThread ) :
on_ask = self.on_ask ,
on_error = self.on_error ,
)
- cards = self.parser.parse( self.cards_dir )
+ cards = self.parser.parse( self.cards_dir , image_res=self.image_res )
if not cards :
raise RuntimeError( "No cards were found." )
db.open_database( self.db_fname , True )
@@ -137,6 +138,10 @@ class StartupWidget( QWidget ) :
)
self.btn_load_db.setText( " " + self.btn_load_db.text() )
# load the widget
+ self.cbo_resolution.addItem( "150 dpi" )
+ self.cbo_resolution.addItem( "300 dpi" )
+ self.cbo_resolution.addItem( "600 dpi" )
+ self.cbo_resolution.setCurrentIndex( 1 )
if os.path.isfile( db_fname ) :
self.le_load_db_fname.setText( db_fname )
else :
@@ -187,6 +192,8 @@ class StartupWidget( QWidget ) :
MainWindow.show_error_msg( "Please choose where you want to save the results." )
self.le_save_db_fname.setFocus()
return
+ # unload other settings
+ image_res = int( self.cbo_resolution.currentText().split()[ 0 ] )
# run the analysis (in a worker thread)
self.frm_open_db.hide()
self.frm_analyze_progress.show()
@@ -194,7 +201,7 @@ class StartupWidget( QWidget ) :
self._update_analyze_ui( False )
self.btn_cancel_analyze.setEnabled( True )
self.btn_cancel_analyze.clicked.connect( self.on_cancel_analyze )
- self.analyze_thread = AnalyzeThread( cards_dir , fname )
+ self.analyze_thread = AnalyzeThread( cards_dir , image_res , fname )
self.analyze_thread.progress_signal.connect( self.on_analyze_progress )
self.analyze_thread.progress2_signal.connect( self.on_analyze_progress2 )
self.analyze_thread.completed_signal.connect( self.on_analyze_completed )
@@ -252,6 +259,7 @@ class StartupWidget( QWidget ) :
def _update_analyze_ui( self , enable ) :
# update the UI
widgets = [ self.lbl_cards_dir , self.le_cards_dir, self.btn_cards_dir ]
+ widgets.extend( [ self.lbl_resolution , self.cbo_resolution , self.lbl_resolution_hint ] )
widgets.extend( [ self.lbl_save_db_fname , self.le_save_db_fname , self.btn_save_db_fname ] )
widgets.append( self.btn_analyze )
for w in widgets :
diff --git a/ui/startup_widget.ui b/ui/startup_widget.ui
index b74cec4..5addfc1 100644
--- a/ui/startup_widget.ui
+++ b/ui/startup_widget.ui
@@ -7,7 +7,7 @@
0
0
592
- 418
+ 471
@@ -64,7 +64,7 @@
2
- 8
+ 0
0
@@ -263,6 +263,65 @@
+ -
+
+
+ 0
+
+
-
+
+
+ &Resolution:
+
+
+ cbo_resolution
+
+
+
+ -
+
+
+
+ 80
+ 0
+
+
+
+
+ 80
+ 16777215
+
+
+
+
+ -
+
+
+
+ 8
+ true
+
+
+
+ (higher values are slower, but look better)
+
+
+
+ -
+
+
+ Qt::Horizontal
+
+
+
+ 40
+ 20
+
+
+
+
+
+
-
@@ -654,6 +713,7 @@
le_cards_dir
btn_cards_dir
+ cbo_resolution
le_save_db_fname
btn_save_db_fname
btn_analyze