parent
472657325c
commit
de6fb25658
@ -0,0 +1,3 @@ |
||||
.venv-* |
||||
__pycache__/ |
||||
*.py[cod] |
@ -0,0 +1,56 @@ |
||||
#!/usr/bin/env python3 |
||||
""" CLI for the asl_cards module. """ |
||||
|
||||
import sys |
||||
import os |
||||
import getopt |
||||
|
||||
from parse import PdfParser |
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
def main( args ) :
    """Run the command-line interface.

    args is the list of command-line arguments (without the program name).
    The options are processed in the order they were given:
      -f / --file   parse a single PDF file and print each card found in it
      -d / --dir    parse every PDF in a directory and print the cards per file
      -h / --help   print the usage text (-? is accepted as well)

    Raises a RuntimeError if the arguments can't be parsed; the getopt error
    that triggered it is chained as the cause.
    """
    # parse the arguments
    try :
        opts , args = getopt.getopt( args , "f:d:h?" , ["file=","dir=","help"] )
    except getopt.GetoptError as err :
        # chain the original error, so that it's clear the failure came from getopt
        raise RuntimeError( "Can't parse arguments: {}".format( err ) ) from err
    for opt,val in opts :
        if opt in ("-f","--file") :
            pdf_parser = PdfParser( progress_callback )
            cards = pdf_parser.parse_file( val )
            for c in cards :
                print( c )
        elif opt in ("-d","--dir") :
            pdf_parser = PdfParser( progress_callback )
            fcards = pdf_parser.parse_dir( val )
            for fname,cards in fcards.items() :
                print( "{}:".format( fname ) )
                for c in cards :
                    print( "- {}".format( c ) )
        elif opt in ("-h","--help","-?") :
            print_help()
        else :
            # getopt only returns options we asked for, so this is just a safety net
            raise RuntimeError( "Unknown argument: {}".format( opt ) )
||||
|
||||
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
||||
|
||||
def progress_callback( progress , msg ) :
    """Show a progress message on stderr.

    progress is the fraction completed (shown as a whole percentage), or None
    if the message has no percentage to go with it.
    """
    if progress is None :
        line = " | {}".format( msg )
    else :
        percent = int( 100 * progress )
        line = "{:3}% | {}".format( percent , msg )
    # flush immediately, so that the messages show up as the work is being done
    print( line , file=sys.stderr , flush=True )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
def print_help() :
    """Print the usage text to stdout."""
    # the program name is shown without its directory
    prog_name = os.path.split( sys.argv[0] )[1]
    usage_lines = [
        "{} {{options}}".format( prog_name ) ,
        "" ,
        " -f --file PDF file to parse." ,
        " -d --dir Directory with PDF's to parse." ,
        "" ,
    ]
    for line in usage_lines :
        print( line )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
# When run as a script, hand the command-line arguments (minus the program name) to main().
if __name__ == "__main__" :
    cli_args = sys.argv[1:]
    main( cli_args )
@ -0,0 +1,98 @@ |
||||
import os |
||||
import re |
||||
from collections import namedtuple |
||||
|
||||
from pdfminer.pdfinterp import PDFResourceManager , PDFPageInterpreter |
||||
from pdfminer.converter import PDFPageAggregator |
||||
from pdfminer.layout import LAParams , LTTextBoxHorizontal |
||||
from pdfminer.pdfpage import PDFPage |
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
# Describes a single card found in a PDF: the page it was found on, its position
# on that page, and the tag, nationality and name text read from the card.
AslCard = namedtuple( "AslCard" , "page_id page_pos tag nationality name" )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
class PdfParser:
    """Extracts the ASL cards described in PDF files.

    Progress is reported through an optional callback that is invoked as
    callback(progress,msg), where progress is the fraction completed (0 to 1),
    or None for messages that don't have a completion level.
    """

    def __init__( self , progress=None ) :
        """Create a parser that reports to the given progress callback (if any)."""
        # initialize
        self.progress = progress

    def parse_file( self , fname ) :
        """Extract the cards from a PDF file.

        Returns a list of AslCard's, in the order they were found.
        """
        # initialize
        rmgr = PDFResourceManager()
        laparams = LAParams()
        dev = PDFPageAggregator( rmgr , laparams=laparams )
        interp = PDFPageInterpreter( rmgr , dev )
        # process the file
        cards = []
        with open(fname,"rb") as fp :
            self._progress( 0 , "Loading file: {}".format( fname ) )
            # NOTE: all the pages are loaded up-front, since we need the page count to report progress
            pages = list( PDFPage.get_pages( fp ) )
            for page_no,page in enumerate(pages) :
                self._progress( float(page_no)/len(pages) , "Processing page {}...".format( 1+page_no ) )
                page_cards = self._parse_page( cards , interp , page_no , page )
                cards.extend( page_cards )
        self._progress( 1.0 , "Done." )
        return cards

    def _parse_page( self , cards , interp , page_no , page ) :
        """Extract the cards from a PDF page.

        A card starts with a text box whose text begins with "Vehicle" or "Ordnance";
        the lines of that box hold the card's tag, nationality and name. If the box
        only has the tag and nationality, the next text box is used as the name.

        Returns a new list holding only the cards found on this page. The cards and
        page_no arguments are not used.
        """
        page_cards = []
        interp.process_page( page )
        lt_page = interp.device.get_result()
        pending_card = None
        for item in lt_page :
            if not isinstance( item , LTTextBoxHorizontal ) :
                continue
            item_text = item.get_text().strip()
            if item_text.startswith( ("Vehicle","Ordnance") ) :
                # a new card is starting, so forget about any card we were waiting to complete
                pending_card = None
                vals = item_text.split( "\n" )
                if len(vals) < 2 :
                    # there is no nationality line, so this can't be a card header
                    self._progress( None , "Ignoring incomplete card header: {}".format( item_text ) )
                    continue
                # NOTE(review): presumably 0=top half of the page, 1=bottom half (PDF y-coordinates
                # normally grow upwards) - confirm against the PDF's.
                page_pos = 0 if item.y0 > lt_page.height/2 else 1
                fields = [
                    lt_page.pageid , page_pos ,
                    _tidy(vals[0]).replace("# ","#") ,
                    _tidy(vals[1])
                ]
                if len(vals) >= 3 :
                    fields.append( _tidy(vals[2]) )
                    page_cards.append( self._found_card( fields ) )
                else :
                    # the name wasn't in this text box, so we take it from the next one
                    pending_card = fields
            elif pending_card :
                pending_card.append( _tidy( item_text ) )
                page_cards.append( self._found_card( pending_card ) )
                pending_card = None
        return page_cards

    def _found_card( self , fields ) :
        """Create an AslCard from its field values, and report that it was found."""
        card = AslCard( *fields )
        self._progress( None , "Found card: {}".format( card ) )
        return card

    def parse_dir( self , dname , progress=None ) :
        """Parse all PDF's in a directory.

        Returns a dict that maps each PDF's filename (without the directory) to the
        list of cards found in it. The files are processed in sorted order, so that
        the results don't depend on the order the OS lists the directory in.

        The progress argument is not used (progress is reported to the callback that
        was passed in to the constructor); it is kept for existing callers.
        """
        fcards = {}
        for fname in sorted( os.listdir(dname) ) :
            if os.path.splitext(fname)[1].lower() != ".pdf" :
                continue
            fcards[fname] = self.parse_file( os.path.join(dname,fname) )
        return fcards

    def _progress( self , progress , msg ) :
        """Call the progress callback (if one was provided)."""
        if self.progress :
            self.progress( progress , msg )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
_tidy_regex = re.compile( r"[,.()+-]" ) |
||||
def _tidy( val ) : return _tidy_regex.sub(" ",val).strip() |
@ -0,0 +1,2 @@ |
||||
# python == 3.6.1 |
||||
pdfminer.six == 20170419 |
@ -0,0 +1,2 @@ |
||||
# Run from this script's directory; quoted so that paths containing spaces work,
# and bail out if we can't get there (otherwise the wrong directory would be scanned).
cd "$(dirname "$0")" || exit 1
python -m unittest discover
@ -0,0 +1 @@ |
||||
*.pdf |
@ -0,0 +1 @@ |
||||
This directory contains the real "ASL Cards" PDF files. Since these are not free (i.e. they must be purchased), we don't keep them in source control.
@ -0,0 +1,109 @@ |
||||
#!/usr/bin/env python3 |
||||
|
||||
import sys |
||||
import os |
||||
import unittest |
||||
|
||||
# The module being tested lives in the parent directory, so add that to the import path.
base_dir = os.path.dirname( __file__ )
sys.path.append( os.path.join( base_dir , os.pardir ) )
from parse import PdfParser , AslCard
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
class TestRealData( unittest.TestCase ) :
    """Run tests using the real "ASL Cards" PDF files."""

    def _test_pdf_parser( self , fname , expected_cards ) :
        """Parse a PDF in the real-data directory, and check the cards that were found.

        The test is skipped if the PDF is not there. Any difference between the
        cards found and expected_cards is reported as a test failure.
        """
        # parse the specified PDF
        fname2 = os.path.join( base_dir , "real-data" , fname )
        if not os.path.isfile( fname2 ) :
            # the data files have to be supplied separately, so they may not be available
            self.skipTest( "Missing data file: {}".format( fname2 ) )
        pdf_parser = PdfParser(
            # NOTE: enable this to see the parser's progress messages
            #progress = lambda _,msg: print( msg , file=sys.stderr )
        )
        cards = pdf_parser.parse_file( fname2 )
        # check the results
        self.assertEqual(
            len(cards) , len(expected_cards) ,
            "{}: got {} cards, expected {}.".format( fname , len(cards) , len(expected_cards) )
        )
        for i,(card,expected_card) in enumerate( zip( cards , expected_cards ) ) :
            self.assertEqual(
                card , expected_card ,
                "{}: Card mismatch ({}): got {}, expected {}.".format( fname , i , card , expected_card )
            )

    def test_japanese_vehicles( self ) :
        """Check the cards found in the Japanese vehicles PDF."""
        self._test_pdf_parser( "JapaneseVehiclesFeb15.pdf" , [
            AslCard(page_id=1, page_pos=0, tag="Vehicle #1", nationality="Japanese", name="Type 92A") ,
            AslCard(page_id=1, page_pos=1, tag="Vehicle #1", nationality="Japanese", name="Type 92B") ,
            AslCard(page_id=2, page_pos=0, tag="Vehicle #2", nationality="Japanese", name="Type 94") ,
            AslCard(page_id=2, page_pos=1, tag="Vehicle #3", nationality="Japanese", name="Type 95 SO4KI") ,
            AslCard(page_id=3, page_pos=0, tag="Vehicle #4", nationality="Japanese", name="Type 97A TE6KE") ,
            AslCard(page_id=3, page_pos=1, tag="Vehicle #4", nationality="Japanese", name="Type 97B TE6KE") ,
            AslCard(page_id=4, page_pos=0, tag="Vehicle #5", nationality="Japanese", name="Type 95 HA6GO") ,
            AslCard(page_id=4, page_pos=1, tag="Vehicle #6", nationality="Japanese", name="Type 2 KA6MI") ,
            AslCard(page_id=5, page_pos=0, tag="Vehicle #6", nationality="Japanese", name="Type 2 KA:MI w/o") ,
            AslCard(page_id=5, page_pos=1, tag="Vehicle #7", nationality="Japanese", name="Type 89A CHI:RO") ,
            AslCard(page_id=6, page_pos=0, tag="Vehicle #7", nationality="Japanese", name="Type 89B CHI7RO") ,
            AslCard(page_id=6, page_pos=1, tag="Vehicle #8", nationality="Japanese", name="Type 97A CHI7HA") ,
            AslCard(page_id=7, page_pos=0, tag="Vehicle #8", nationality="Japanese", name="Type 97B CHI6HA") ,
            AslCard(page_id=7, page_pos=1, tag="Vehicle #9", nationality="Japanese", name="Type 1 CHI6HE") ,
            AslCard(page_id=8, page_pos=0, tag="Vehicle #10", nationality="Japanese", name="Type 91") ,
            AslCard(page_id=8, page_pos=1, tag="Vehicle #11", nationality="Japanese", name="Type 92") ,
            AslCard(page_id=9, page_pos=0, tag="Vehicle #12", nationality="Japanese", name="Type 1 HO?NI I") ,
            AslCard(page_id=9, page_pos=1, tag="Vehicle #13", nationality="Japanese", name="Type 4 HO?RO") ,
            AslCard(page_id=10, page_pos=0, tag="Vehicle #14", nationality="Japanese", name="Type 1 HO?KI") ,
            AslCard(page_id=10, page_pos=1, tag="Vehicle #15", nationality="Japanese", name="Type 98 SHI?KE") ,
            AslCard(page_id=11, page_pos=0, tag="Vehicle #16", nationality="Japanese", name="Type 92 IBKE") ,
            AslCard(page_id=11, page_pos=1, tag="Vehicle #17", nationality="Japanese", name="Type 95") ,
            AslCard(page_id=12, page_pos=0, tag="Vehicle #18", nationality="Japanese", name="Type 94 Truck") ,
            AslCard(page_id=12, page_pos=1, tag="Vehicle #18", nationality="Japanese", name="Type 97 Truck") ,
        ] )

    def test_chinese_ordnance( self ) :
        """Check the cards found in the Chinese ordnance PDF."""
        self._test_pdf_parser( "ChineseOrdnanceMidApril15.pdf" , [
            AslCard(page_id=1, page_pos=0, tag="Ordnance3#31", nationality="Chinese", name="Type3273GL") ,
            AslCard(page_id=1, page_pos=1, tag="Ordnance3#32", nationality="Chinese", name="Mortaio3da3453 i") ,
            AslCard(page_id=2, page_pos=0, tag="Ordnance0#02", nationality="Chinese", name="5cm0leGrW0360 g") ,
            AslCard(page_id=2, page_pos=1, tag="Ordnance0#02", nationality="Chinese", name="50mm0RM0obr 380 r") ,
            AslCard(page_id=3, page_pos=0, tag="Ordnance1#12", nationality="Chinese", name="Type1891HGL1 j") ,
            AslCard(page_id=3, page_pos=1, tag="Ordnance1#13", nationality="Chinese", name="M2160mm1 a") ,
            AslCard(page_id=4, page_pos=0, tag="Ordnance3#34", nationality="Chinese", name="Stokes33>in3 b") ,
            AslCard(page_id=4, page_pos=1, tag="Ordnance3#34", nationality="Chinese", name="8cm3GrW3343 g") ,
            AslCard(page_id=5, page_pos=0, tag="Ordnance2#24", nationality="Chinese", name="82mm2BM2o 2372 r") ,
            AslCard(page_id=5, page_pos=1, tag="Ordnance2#25", nationality="Chinese", name="M1281mm2 a") ,
            AslCard(page_id=6, page_pos=0, tag="Ordnance5#55", nationality="Chinese", name="M254 2?in5 a") ,
            AslCard(page_id=6, page_pos=1, tag="Ordnance5#56", nationality="Chinese", name="3 7cm5PaK535/365 g") ,
            AslCard(page_id=7, page_pos=0, tag="Ordnance2#26", nationality="Chinese", name="M3A1237mm2 a") ,
            AslCard(page_id=7, page_pos=1, tag="Ordnance2#27", nationality="Chinese", name="37mm2PP2o 215R2 r") ,
            AslCard(page_id=8, page_pos=0, tag="Ordnance2#27", nationality="Chinese", name="Cann 2da270/152 i") ,
            AslCard(page_id=8, page_pos=1, tag="Ordnance2#28", nationality="Chinese", name="7 5cm2Krupp2 g") ,
            AslCard(page_id=9, page_pos=0, tag="Ordnance1#18", nationality="Chinese", name="Obice1da175/131 i") ,
            AslCard(page_id=9, page_pos=1, tag="Ordnance1#19", nationality="Chinese", name="7 5cm1leIG1181 g") ,
            AslCard(page_id=10, page_pos=0, tag="Ordnance2#29", nationality="Chinese", name="76 2mm2PP2o 2272 r") ,
            AslCard(page_id=10, page_pos=1, tag="Ordnance2#210", nationality="Chinese", name="M1A1275mm2 a") ,
            AslCard(page_id=11, page_pos=0, tag="Ordnance #11", nationality="Chinese", name="7 7cm FK 16 g") ,
            AslCard(page_id=11, page_pos=1, tag="Ordnance #11", nationality="Chinese", name="76 2mm o 02/30 r") ,
            AslCard(page_id=12, page_pos=0, tag="Ordnance4#411", nationality="Chinese", name="OQF418pdr4 b") ,
            AslCard(page_id=12, page_pos=1, tag="Ordnance4#412", nationality="Chinese", name="10 5cm4leFH4164 g") ,
            AslCard(page_id=13, page_pos=0, tag="Ordnance1#112", nationality="Chinese", name="Cann 1da1105/281 i") ,
            AslCard(page_id=13, page_pos=1, tag="Ordnance1#112", nationality="Chinese", name="M2A11105mm1 a") ,
            AslCard(page_id=14, page_pos=0, tag="Ordnance1#113", nationality="Chinese", name="122mm1o 10/301 r") ,
            AslCard(page_id=14, page_pos=1, tag="Ordnance1#113", nationality="Chinese", name="122mm1G1o 1381 r") ,
            AslCard(page_id=15, page_pos=0, tag="Ordnance4#414", nationality="Chinese", name="Obice4da4149/134 i") ,
            AslCard(page_id=15, page_pos=1, tag="Ordnance4#415", nationality="Chinese", name="Oerlikon4FF4 g") ,
            AslCard(page_id=16, page_pos=0, tag="Ordnance #15", nationality="Chinese", name="Oerlikon FF LF") ,
            AslCard(page_id=16, page_pos=1, tag="Ordnance #15", nationality="Chinese", name="Cann Amitr 20/65 i") ,
            AslCard(page_id=17, page_pos=0, tag="Ordnance #15", nationality="Chinese", name="Cann 5mitr 20/65 LF") ,
            AslCard(page_id=17, page_pos=1, tag="Ordnance #15", nationality="Chinese", name="2cm FlaK 30 g") ,
            AslCard(page_id=18, page_pos=0, tag="Ordnance #15", nationality="Chinese", name="2cm FlaK 30 LF") ,
            AslCard(page_id=18, page_pos=1, tag="Ordnance #16", nationality="Chinese", name="3 7cm FlaK 36/37 g") ,
            AslCard(page_id=19, page_pos=0, tag="Ordnance #16", nationality="Chinese", name="Bofors 40mm L/60") ,
            AslCard(page_id=19, page_pos=1, tag="Ordnance #16", nationality="Chinese", name="Bofors 40mm L/60 LF") ,
            AslCard(page_id=20, page_pos=0, tag="Ordnance #17", nationality="Chinese", name="Bofors 76mm M29") ,
            AslCard(page_id=20, page_pos=1, tag="Ordnance #17", nationality="Chinese", name="Bofors 76mm M29 LF") ,
            AslCard(page_id=21, page_pos=0, tag="Ordnance #17", nationality="Chinese", name="8 8cm FlaK 18 g") ,
            AslCard(page_id=21, page_pos=1, tag="Ordnance #17", nationality="Chinese", name="8 8cm FlaK 18 LF") ,
        ] )
||||
|
||||
# --------------------------------------------------------------------- |
||||
|
||||
# Run the tests when this file is executed as a script.
if __name__ == "__main__" :
    unittest.main()
Loading…
Reference in new issue