Try to keep going after a file fails processing.

7 years ago · 049eb322f9
parent 21c005aa3e
commit 049eb322f9
2 changed files with 55 additions and 14 deletions
--- a/asl_cards/parse.py
+++ b/asl_cards/parse.py
@ -20,14 +20,21 @@ from asl_cards.db import AslCard , AslCardImage

 # ---------------------------------------------------------------------

+class AnalyzeCancelledException( RuntimeError ) :
+    def __init__( self ) :
+        super().__init__( "Cancelled." )
+
+# ---------------------------------------------------------------------
+
 class PdfParser:

-    def __init__( self , index_dir , ask=None , progress=None  , progress2=None ) :
+    def __init__( self , index_dir , progress=None , progress2=None , on_ask=None , on_error=None ) :
        # initialize
        self.index_dir = index_dir
-        self.ask = ask # nb: for asking the user something during processing
        self.progress = progress # nb: for tracking file progress
        self.progress2 = progress2 # nb: for tracking page progress within a file
+        self.on_ask = on_ask # nb: for asking the user something during processing
+        self.on_error = on_error # nb: for showing the user an error message
        self.cancelling = False

    def parse( self , target , max_pages=-1 , images=True ) :
@ -44,8 +51,21 @@ class PdfParser:
        # parse each file
        cards = []
        for file_no,fname in enumerate(fnames) :
-            if self.cancelling : raise RuntimeError("Cancelled.")
-            file_cards = self._do_parse_file( float(file_no)/len(fnames) , fname , max_pages , images )
+            if self.cancelling : raise AnalyzeCancelledException()
+            try :
+                file_cards = self._do_parse_file( float(file_no)/len(fnames) , fname , max_pages , images )
+            except AnalyzeCancelledException as ex :
+                raise
+            except Exception as ex :
+                # notify the caller of the error
+                if not self.on_error :
+                    raise
+                self.on_error(
+                    "An error occured while processing {}:\n\n{}\n\nThis file will be ignored.".format(
+                        os.path.split(fname)[1] , str(ex)
+                    )
+                )
+                continue
            if file_cards :
                cards.extend( file_cards )
        self._progress( 1.0 , "Done." )
@ -87,8 +107,8 @@ class PdfParser:
                ) )
        else :
            # ask the user if they want to try parsing the PDF
-            if self.ask :
-                rc = self.ask(
+            if self.on_ask :
+                rc = self.on_ask(
                    "Can't find an index file for {}.\n\nDo you want to try parsing the PDF (slow and unreliable)?".format(
                        os.path.split( fname )[ 1 ]
                    ) ,
@ -97,6 +117,11 @@ class PdfParser:
                if rc != QMessageBox.Yes :
                    return None
            # extract each AslCard from the file
+            # NOTE: Some of the PDFs have cards that have not been filled out - we detect this correctly (because
+            # they don't have a "Vehicle" or "Ordnance" tag), but we barf later because the image extractor thinks
+            # they're valid cards, and so we get a different number of cards vs. images.
+            # It's not really worth fixing this, since we're now using index files instead of extracting the info
+            # from the PDFs (because extraction is giving such poor results :-/).
            self._progress( pval , "Analyzing {}...".format( os.path.split(fname)[1] ) )
            rmgr = PDFResourceManager()
            laparams = LAParams()
@ -105,7 +130,7 @@ class PdfParser:
            with open(fname,"rb") as fp :
                pages = list( PDFPage.get_pages( fp ) )
                for page_no,page in enumerate(pages) :
-                    if self.cancelling : raise RuntimeError("Cancelled.")
+                    if self.cancelling : raise AnalyzeCancelledException()
                    self._progress2( float(page_no) / len(pages) )
                    page_cards = self._parse_page( cards , interp , page_no , page )
                    cards.extend( page_cards )
@ -122,7 +147,7 @@ class PdfParser:
                    )
                )
            for i in range(0,len(cards)) :
-                if self.cancelling : raise RuntimeError("Cancelled.")
+                if self.cancelling : raise AnalyzeCancelledException()
                cards[i].card_image = AslCardImage( image_data=card_images[i] )
        return cards

@ -134,14 +159,14 @@ class PdfParser:
        # locate the info box for each card (in the top-left corner)
        info_boxes = []
        for item in lt_page :
-            if self.cancelling : raise RuntimeError("Cancelled.")
+            if self.cancelling : raise AnalyzeCancelledException()
            if type(item) is not LTTextBoxHorizontal : continue
            item_text = item.get_text().strip()
            if item_text.startswith( ("Vehicle","Ordnance") ) :
                info_boxes.append( [item] )
        # get the details from each info box
        for item in lt_page :
-            if self.cancelling : raise RuntimeError("Cancelled.")
+            if self.cancelling : raise AnalyzeCancelledException()
            if type(item) is not LTTextBoxHorizontal : continue
            # check if the next item could be part of an info box - it must be within the left/right boundary
            # of the first item (within a certain tolerance), and below it (but not too far)
@ -203,7 +228,7 @@ class PdfParser:
        # extract the cards from each page
        card_images = []
        for page_no in range(0,npages) :
-            if self.cancelling : raise RuntimeError("Cancelled.")
+            if self.cancelling : raise AnalyzeCancelledException()
            # open the next page image
            self._progress2( float(page_no) / npages )
            fname = fname_template % (1+page_no)
--- a/startup_widget.py
+++ b/startup_widget.py
@ -35,9 +35,10 @@ class AnalyzeThread( QThread ) :
            # parse the files
            self.parser = PdfParser(
                os.path.join( globals.base_dir , "index" ) ,
-                ask = self.ask ,
                progress = lambda pval,msg: self.progress_signal.emit( -1 if pval is None else pval , msg ) ,
-                progress2 = lambda pval: self.progress2_signal.emit( pval )
+                progress2 = lambda pval: self.progress2_signal.emit( pval ) ,
+                on_ask = self.on_ask ,
+                on_error = self.on_error ,
            )
            cards = self.parser.parse( self.cards_dir )
            db.open_database( self.db_fname , True )
@ -52,7 +53,16 @@ class AnalyzeThread( QThread ) :
            # notify slots that we've finished
            self.completed_signal.emit( "" )

-    def ask( self , msg , btns , default ) :
+    def on_error( self , msg ) :
+        """Show the user an error message."""
+        # NOTE: We are running in a worker thread, so we need to delegate showing the message box
+        # to the GUI thread.
+        QMetaObject.invokeMethod(
+            StartupWidget._instance , "on_error" , Qt.BlockingQueuedConnection ,
+            Q_ARG( str , msg )
+        )
+
+    def on_ask( self , msg , btns , default ) :
        """Ask the user a question."""
        # NOTE: We are running in a worker thread, so we need to delegate showing the message box
        # to the GUI thread.
@ -77,6 +87,7 @@ class StartupWidget( QWidget ) :
    def __init__( self , db_fname , parent=None ) :
        # initialize
        super(StartupWidget,self).__init__( parent=parent )
+        assert StartupWidget._instance is None
        StartupWidget._instance = self
        self.analyze_thread = None
        # FUDGE! Work around recursive imports :-/
@ -190,6 +201,11 @@ class StartupWidget( QWidget ) :
        """Update the analysis progress in the UI."""
        self.pb_pages.setValue( int( 100*pval + 0.5 ) )

+    @pyqtSlot( str )
+    def on_error( self , msg ) :
+        """Show an error message box."""
+        MainWindow.show_error_msg( msg )
+
    @pyqtSlot( str , QMessageBox.StandardButtons , QMessageBox.StandardButton , result=QMessageBox.StandardButton )
    def on_ask( self , msg , buttons , default ) :
        """Ask the user a question."""