Clean up old extracted image temp files before doing a new extraction.

7 years ago · fe5ffd4324
parent ea8676e7eb
commit fe5ffd4324
1 changed files with 28 additions and 14 deletions
--- a/asl_cards/parse.py
+++ b/asl_cards/parse.py
@ -26,6 +26,25 @@ class AnalyzeCancelledException( RuntimeError ) :

 # ---------------------------------------------------------------------

+# NOTE: Ghostscript extracts PDF pages to image files - this value defines where to put them.
+_EXTRACTED_IMAGES_FILENAME_TEMPLATE = os.path.join( tempfile.gettempdir() , "asl_cards-%d.png" )
+
+def _find_extracted_image_files() :
+    """Find the image files extracted by Ghostscript."""
+    fnames = []
+    # NOTE: We assume there are never more than 500 of these.
+    # This method is used to clean up files left over from a previous (failed) run, so we can't
+    # just start at 1 and increment as we look for files. We could do some funky stuff with
+    # os.listdir() and regex's, but we need the extracted files in page order, and it's more
+    # trouble than it's worth... :-/
+    for i in range(1,500) :
+        fname = _EXTRACTED_IMAGES_FILENAME_TEMPLATE % i
+        if os.path.isfile( fname ) :
+            fnames.append( fname )
+    return fnames
+
+# ---------------------------------------------------------------------
+
 class PdfParser:

    def __init__( self , index_dir , progress=None , progress2=None , on_ask=None , on_error=None ) :
@ -203,35 +222,30 @@ class PdfParser:

    def _extract_images( self , fname , max_pages ) :
        """Extract card images from a file."""
+        # clean up any leftover extracted images from a previous run
+        # NOTE: It's important we do this, otherwise we might think they're part of this run.
+        for f in _find_extracted_image_files() :
+            os.unlink( f )
        # extract each page from the PDF as an image
-        fname_template = os.path.join( tempfile.gettempdir() , "asl_cards-%d.png" )
        resolution = 300 # pixels/inch
        args = [
            "_ignored_" , "-dQUIET" , "-dSAFER" , "-dNOPAUSE" ,
            "-sDEVICE=png16m" , "-r"+str(resolution) ,
-            "-sOutputFile="+fname_template ,
+            "-sOutputFile="+_EXTRACTED_IMAGES_FILENAME_TEMPLATE
        ]
        if max_pages > 0 :
            args.append( "-dLastPage={}".format(max_pages) )
        args.extend( [ "-f" , fname ] )
-        # FIXME! clean up left-over temp files before we start
        args = [ s.encode(locale.getpreferredencoding()) for s in args ]
        # FIXME! stop GhostScript from issuing warnings (stdout).
        ghostscript.Ghostscript( *args )
-        # figure out how many files were created (so we can show progress)
-        npages = 0
-        for i in range(0,99999) :
-            fname = fname_template % (1+i)
-            if not os.path.isfile( fname ) :
-                break
-            npages += 1
+        image_fnames = _find_extracted_image_files()
        # extract the cards from each page
        card_images = []
-        for page_no in range(0,npages) :
+        for page_no,fname in enumerate(image_fnames) :
            if self.cancelling : raise AnalyzeCancelledException()
            # open the next page image
-            self._progress2( float(page_no) / npages )
-            fname = fname_template % (1+page_no)
+            self._progress2( float(page_no) / len(image_fnames) )
            img = Image.open( fname )
            img_width , img_height = img.size
            # extract the cards (by splitting the page in half)
@ -247,7 +261,7 @@ class PdfParser:
                os.path.join( fname2[0] , fname2[1][0]+"b"+fname2[1][1] )
            )
            # check if this is the last page, and it has just 1 card on it
-            if page_no == npages-1 and size1[1] < 1000 and size2[1] < 1000 :
+            if page_no == len(image_fnames)-1 and size1[1] < 1000 and size2[1] < 1000 :
                # yup - extract it
                buf , _ = self._crop_image(
                    img , (0,0,img_width,img_height) ,