diff --git a/asl_cards/parse.py b/asl_cards/parse.py index 43de795..566e264 100644 --- a/asl_cards/parse.py +++ b/asl_cards/parse.py @@ -116,8 +116,8 @@ class PdfParser: page_pos = 0 if items[0].y0 > lt_page.height/2 else 1 return AslCard( card_tag = _tidy( item_texts[0] ).replace( "# ", "#" ) , - nationality = _tidy( item_texts[1] ) , - name = _tidy( item_texts[2] ) , + nationality = _tidy(item_texts[1]) if len(item_texts) > 1 else "" , + name = _tidy(item_texts[2]) if len(item_texts) > 2 else "" , page_id = lt_page.pageid , page_pos = page_pos , ) diff --git a/asl_cards/tests/_test_case_base.py b/asl_cards/tests/_test_case_base.py new file mode 100755 index 0000000..74c10d7 --- /dev/null +++ b/asl_cards/tests/_test_case_base.py @@ -0,0 +1,39 @@ +import sys +import os +import unittest + +base_dir = os.path.split( __file__ )[ 0 ] + +sys.path.append( "../.." ) # fudge! need this to allow a script to run within a package :-/ +from asl_cards.parse import PdfParser + +# --------------------------------------------------------------------- + +class TestCaseBase( unittest.TestCase ) : + """Base for all test classes.""" + + def _test_pdf_parser( self , fname , expected_cards ) : + # parse the specified PDF + fname2 = os.path.join( base_dir , fname ) + if not os.path.isfile( fname2 ) : + raise RuntimeError( "Missing data file: {}".format( fname2 ) ) + pdf_parser = PdfParser( + #progress = lambda _,msg: print( msg , file=sys.stderr , flush=True ) + ) + cards = pdf_parser.parse( fname2 , images=False ) + if False : + for c in cards : + print(c) + # check the results + if len(cards) != len(expected_cards) : + raise RuntimeError( "{}: got {} cards, expected {}.".format( fname , len(cards) , len(expected_cards) ) ) + if len(cards) == 0 : + return + # get the attributes we're interested in + card = expected_cards[0] + attrs = [ a for a in dir(card) if not a.startswith("_") and not callable(getattr(card,a)) ] + attrs.remove( "card_image" ) # this is messing things up :-/ + # 
compare the extracted cards with the expected results + for i in range(0,len(cards)) : + if not all( getattr(cards[i],a) == getattr(expected_cards[i],a) for a in attrs ) : + raise RuntimeError( "{}: Card mismatch ({}): got {}, expected {}.".format( fname , i , cards[i] , expected_cards[i] ) ) diff --git a/asl_cards/tests/synthetic-data/1-card.doc b/asl_cards/tests/synthetic-data/1-card.doc new file mode 100755 index 0000000..5b52de6 Binary files /dev/null and b/asl_cards/tests/synthetic-data/1-card.doc differ diff --git a/asl_cards/tests/synthetic-data/1-card.pdf b/asl_cards/tests/synthetic-data/1-card.pdf new file mode 100755 index 0000000..02c8f3b Binary files /dev/null and b/asl_cards/tests/synthetic-data/1-card.pdf differ diff --git a/asl_cards/tests/synthetic-data/2-cards.doc b/asl_cards/tests/synthetic-data/2-cards.doc new file mode 100755 index 0000000..e40958b Binary files /dev/null and b/asl_cards/tests/synthetic-data/2-cards.doc differ diff --git a/asl_cards/tests/synthetic-data/2-cards.pdf b/asl_cards/tests/synthetic-data/2-cards.pdf new file mode 100755 index 0000000..25ebe1b Binary files /dev/null and b/asl_cards/tests/synthetic-data/2-cards.pdf differ diff --git a/asl_cards/tests/synthetic-data/3-cards.doc b/asl_cards/tests/synthetic-data/3-cards.doc new file mode 100755 index 0000000..28a2d6d Binary files /dev/null and b/asl_cards/tests/synthetic-data/3-cards.doc differ diff --git a/asl_cards/tests/synthetic-data/3-cards.pdf b/asl_cards/tests/synthetic-data/3-cards.pdf new file mode 100755 index 0000000..65a3e16 Binary files /dev/null and b/asl_cards/tests/synthetic-data/3-cards.pdf differ diff --git a/asl_cards/tests/synthetic-data/bad-spacing.doc b/asl_cards/tests/synthetic-data/bad-spacing.doc new file mode 100755 index 0000000..0b1d17b Binary files /dev/null and b/asl_cards/tests/synthetic-data/bad-spacing.doc differ diff --git a/asl_cards/tests/synthetic-data/bad-spacing.pdf b/asl_cards/tests/synthetic-data/bad-spacing.pdf new file mode 100755 
index 0000000..456aae9 Binary files /dev/null and b/asl_cards/tests/synthetic-data/bad-spacing.pdf differ diff --git a/asl_cards/tests/synthetic-data/empty.doc b/asl_cards/tests/synthetic-data/empty.doc new file mode 100755 index 0000000..a04423b Binary files /dev/null and b/asl_cards/tests/synthetic-data/empty.doc differ diff --git a/asl_cards/tests/synthetic-data/empty.pdf b/asl_cards/tests/synthetic-data/empty.pdf new file mode 100755 index 0000000..c99348b Binary files /dev/null and b/asl_cards/tests/synthetic-data/empty.pdf differ diff --git a/asl_cards/tests/synthetic-data/null.pdf b/asl_cards/tests/synthetic-data/null.pdf new file mode 100644 index 0000000..e69de29 diff --git a/asl_cards/tests/test_real_data.py b/asl_cards/tests/test_real_data.py index b981670..6a558f0 100755 --- a/asl_cards/tests/test_real_data.py +++ b/asl_cards/tests/test_real_data.py @@ -4,36 +4,17 @@ import sys import os import unittest -base_dir = os.path.split( __file__ )[ 0 ] - -sys.path.append( ".." ) # fudge! 
need this to allow a script to run within a package :-/ -from asl_cards.parse import PdfParser , AslCard +from _test_case_base import TestCaseBase +from asl_cards.parse import AslCard # --------------------------------------------------------------------- -class TestRealData( unittest.TestCase ) : +class TestRealData( TestCaseBase ) : """Run tests using the real "ASL Cards" PDF files.""" def _test_pdf_parser( self , fname , expected_cards ) : - # parse the specified PDF - fname2 = os.path.join( base_dir , os.path.join("real-data",fname) ) - if not os.path.isfile( fname2 ) : - raise RuntimeError( "Missing data file: {}".format( fname2 ) ) - pdf_parser = PdfParser( - #progress = lambda _,msg: print( msg , file=sys.stderr , flush=True ) - ) - cards = pdf_parser.parse( fname2 , images=False ) - # check the results - if len(cards) != len(expected_cards) : - raise RuntimeError( "{}: got {} cards, expected {}.".format( fname , len(cards) , len(expected_cards) ) ) - # get the attributes we're interested in - card = expected_cards[0] - attrs = [ a for a in dir(card) if not a.startswith("_") and not callable(getattr(card,a)) ] - attrs.remove( "card_image" ) # this is messing things up :-/ - # compare the extracted cards with the expected results - for i in range(0,len(cards)) : - if not all( getattr(cards[i],a) == getattr(expected_cards[i],a) for a in attrs ) : - raise RuntimeError( "{}: Card mismatch ({}): got {}, expected {}.".format( fname , i , cards[i] , expected_cards[i] ) ) + """Test the PDF parser.""" + super()._test_pdf_parser( os.path.join("real-data",fname) , expected_cards ) def test_italian_ordnance( self ) : self._test_pdf_parser( "ItalianOrdnance.pdf" , [ diff --git a/asl_cards/tests/test_synthetic_data.py b/asl_cards/tests/test_synthetic_data.py new file mode 100755 index 0000000..c40a5a1 --- /dev/null +++ b/asl_cards/tests/test_synthetic_data.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +import sys +import os +import unittest + +from pdfminer.pdfparser 
import PDFSyntaxError + +from _test_case_base import TestCaseBase +from asl_cards.parse import AslCard + +# --------------------------------------------------------------------- + +class TestSyntheticData( TestCaseBase ) : + """Run tests using the synthetic PDF files. + + We test with some generated test files, since the real "ASL Cards" files need to be purchased, + so we can't keep them in source control. + """ + + def _test_pdf_parser( self , fname , expected_cards ) : + """Test the PDF parser.""" + super()._test_pdf_parser( os.path.join("synthetic-data",fname) , expected_cards ) + + def test_null_file( self ) : + # try parsing a zero-byte file + self.assertRaises( + PDFSyntaxError , + self._test_pdf_parser , "null.pdf" , None + ) + + def test_empty_file( self ) : + # try parsing an empty file + self._test_pdf_parser( "empty.pdf" , [] ) + + def test_1card_file( self ) : + # try parsing a file with 1 card + self._test_pdf_parser( "1-card.pdf" , [ + AslCard( page_id=1 , page_pos=0 , card_tag="Vehicle #1" , nationality="Moldovia" , name="Big Tank" ) , + ] ) + + def test_2card_file( self ) : + # try parsing a file with 2 cards + self._test_pdf_parser( "2-cards.pdf" , [ + AslCard( page_id=1 , page_pos=0 , card_tag="Vehicle #1" , nationality="Moldovia" , name="Big Tank" ) , + AslCard( page_id=1 , page_pos=1 , card_tag="Vehicle #2" , nationality="Moldovia" , name="Little Tank" ) , + ] ) + + def test_3card_file( self ) : + # try parsing a file with 3 cards + self._test_pdf_parser( "3-cards.pdf" , [ + AslCard( page_id=1 , page_pos=0 , card_tag="Vehicle #1" , nationality="Moldovia" , name="Big Tank" ) , + AslCard( page_id=1 , page_pos=1 , card_tag="Vehicle #2" , nationality="Moldovia" , name="Little Tank" ) , + AslCard( page_id=2 , page_pos=0 , card_tag="Ordnance #1" , nationality="Moldovia" , name="Big Gun" ) , + ] ) + + def test_bad_spacing( self ) : + # try parsing cards with bad spacing + self._test_pdf_parser( "bad-spacing.pdf" , [ + AslCard( page_id=1 , page_pos=0 , 
card_tag="Vehicle #1" , nationality="" , name="" ) , + AslCard( page_id=1 , page_pos=1 , card_tag="Vehicle #2" , nationality="Moldovia" , name="" ) , + ] ) + +# --------------------------------------------------------------------- + +if __name__ == "__main__" : + unittest.main()