Extract basic card information from some PDF's (WIP).

master
Pacman Ghost 7 years ago
parent 472657325c
commit de6fb25658
  1. 3
      .gitignore
  2. 56
      asl_cards/__main__.py
  3. 98
      asl_cards/parse.py
  4. 2
      asl_cards/requirements.txt
  5. 2
      asl_cards/run_tests.sh
  6. 0
      asl_cards/tests/__init__.py
  7. 1
      asl_cards/tests/real-data/.gitignore
  8. 1
      asl_cards/tests/real-data/READ-ME
  9. 109
      asl_cards/tests/test_real_data.py

3
.gitignore vendored

@ -0,0 +1,3 @@
.venv-*
__pycache__/
*.py[cod]

@ -0,0 +1,56 @@
#!/usr/bin/env python3
""" CLI for the asl_cards module. """
import sys
import os
import getopt
from parse import PdfParser
# ---------------------------------------------------------------------
def main(args):
    """Run the command-line interface.

    args is the argument list without the program name (e.g. sys.argv[1:]).
    Options are handled one at a time, in the order they were given:

      -f/--file FILE   parse one PDF and print each card found
      -d/--dir DIR     parse a directory and print the cards grouped by file
      -h/--help/-?     print the usage summary

    Raises RuntimeError if the arguments cannot be parsed.
    """
    try:
        parsed_opts, _positional = getopt.getopt(
            args, "f:d:h?", ["file=", "dir=", "help"]
        )
    except getopt.GetoptError as err:
        raise RuntimeError("Can't parse arguments: {}".format(err))

    for flag, value in parsed_opts:
        if flag in ("-h", "--help", "-?"):
            print_help()
            continue
        if flag in ("-f", "--file"):
            # a fresh parser per option, reporting progress on stderr
            found = PdfParser(progress_callback).parse_file(value)
            for card in found:
                print(card)
            continue
        if flag in ("-d", "--dir"):
            per_file = PdfParser(progress_callback).parse_dir(value)
            for pdf_name, pdf_cards in per_file.items():
                print("{}:".format(pdf_name))
                for card in pdf_cards:
                    print("- {}".format(card))
            continue
        # getopt only returns options it was told about, so this is a safety net
        raise RuntimeError("Unknown argument: {}".format(flag))
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def progress_callback(progress, msg):
    """Write a progress message to stderr.

    progress is a fraction (shown as a whole-number percentage), or None
    for a message that has no percentage attached.
    """
    if progress is None:
        line = " | {}".format(msg)
    else:
        percent = int(100 * progress)
        line = "{:3}% | {}".format(percent, msg)
    # flush so the message appears immediately, even when stderr is piped
    print(line, file=sys.stderr, flush=True)
# ---------------------------------------------------------------------
def print_help():
    """Print the usage summary to stdout."""
    # show the script's file name only, without any leading directories
    prog_name = os.path.basename(sys.argv[0])
    usage_lines = (
        "{} {{options}}".format(prog_name),
        "",
        " -f --file PDF file to parse.",
        " -d --dir Directory with PDF's to parse.",
        "",
    )
    for line in usage_lines:
        print(line)
# ---------------------------------------------------------------------
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv[1:])

@ -0,0 +1,98 @@
import os
import re
from collections import namedtuple
from pdfminer.pdfinterp import PDFResourceManager , PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams , LTTextBoxHorizontal
from pdfminer.pdfpage import PDFPage
# ---------------------------------------------------------------------
# One card found in a PDF: the page it is on, its position on that page,
# and the three header fields (tag, nationality, name).
AslCard = namedtuple("AslCard", "page_id page_pos tag nationality name")
# ---------------------------------------------------------------------
class PdfParser:
    """Extract card headers from "ASL Cards" PDF files using pdfminer.

    progress, if given, is called as progress(fraction, msg). fraction is a
    number from 0 to 1, or None for messages that carry no percentage.
    """

    def __init__(self, progress=None):
        """Remember the optional progress callback."""
        self.progress = progress

    def parse_file(self, fname):
        """Extract the cards from a PDF file.

        Returns a list of AslCard tuples, in the order they were found.
        """
        # initialize
        rmgr = PDFResourceManager()
        dev = PDFPageAggregator(rmgr, laparams=LAParams())
        interp = PDFPageInterpreter(rmgr, dev)
        # process the file
        cards = []
        with open(fname, "rb") as fp:
            self._progress(0, "Loading file: {}".format(fname))
            # materialize the page list so a completion fraction can be reported
            pages = list(PDFPage.get_pages(fp))
            for page_no, page in enumerate(pages):
                self._progress(
                    float(page_no) / len(pages),
                    "Processing page {}...".format(1 + page_no),
                )
                cards.extend(self._parse_page(cards, interp, page_no, page))
        self._progress(1.0, "Done.")
        return cards

    def _parse_page(self, cards, interp, page_no, page):
        """Extract the cards from a PDF page.

        A card header is a horizontal text box whose text starts with
        "Vehicle" or "Ordnance"; its lines give the tag, the nationality and
        the name. When the box holds fewer than three lines, each following
        text box supplies one missing field (its whole text) until the card
        is complete. A header that is still incomplete at the end of the
        page is dropped.

        cards and page_no are accepted for compatibility but are not used;
        only the cards found on this page are returned, as a new list.
        """
        interp.process_page(page)
        lt_page = interp.device.get_result()
        n_fields = len(AslCard._fields)
        page_cards = []
        pending = None  # fields collected so far for an incomplete card
        for item in lt_page:
            if not isinstance(item, LTTextBoxHorizontal):
                continue
            item_text = item.get_text().strip()
            if item_text.startswith(("Vehicle", "Ordnance")):
                vals = item_text.split("\n")
                # 0 when the box starts above mid-page height, otherwise 1
                # (presumably top/bottom card, given pdfminer's y axis - confirm)
                page_pos = 0 if item.y0 > lt_page.height / 2 else 1
                # a new header always replaces any unfinished one
                pending = [
                    lt_page.pageid,
                    page_pos,
                    _tidy(vals[0]).replace("# ", "#"),
                ]
                # take whatever of nationality/name this box provides; a
                # one-line box used to raise IndexError here
                pending.extend(_tidy(val) for val in vals[1:3])
            elif pending:
                pending.append(_tidy(item_text))
            else:
                continue
            if len(pending) == n_fields:
                card = AslCard(*pending)
                self._progress(None, "Found card: {}".format(card))
                page_cards.append(card)
                pending = None
        return page_cards

    def parse_dir(self, dname, progress=None):
        """Parse all PDFs in a directory.

        Returns a dict mapping each PDF's file name (without the directory)
        to the list of cards found in it. Files are processed in sorted
        order so that output is reproducible; directory entries that are not
        regular files are skipped.

        progress is accepted for backward compatibility but is not used;
        progress is reported through the callback given to the constructor.
        """
        fcards = {}
        for fname in sorted(os.listdir(dname)):
            if os.path.splitext(fname)[1].lower() != ".pdf":
                continue
            path = os.path.join(dname, fname)
            if not os.path.isfile(path):
                # e.g. a sub-directory whose name happens to end in ".pdf"
                continue
            fcards[fname] = self.parse_file(path)
        return fcards

    def _progress(self, progress, msg):
        """Call the progress callback, if one was given."""
        if self.progress:
            self.progress(progress, msg)
# ---------------------------------------------------------------------
_tidy_regex = re.compile( r"[,.()+-]" )
def _tidy( val ) : return _tidy_regex.sub(" ",val).strip()

@ -0,0 +1,2 @@
# python == 3.6.1
pdfminer.six == 20170419

@ -0,0 +1,2 @@
# Run the unit tests from the directory that contains this script.
# The path is quoted so that directories with spaces work, and we stop if
# the cd fails so test discovery never runs from the wrong directory.
cd "$(dirname "$0")" || exit 1
python -m unittest discover

@ -0,0 +1 @@
This directory contains the real "ASL Cards" PDF files. They are not free (they have to be purchased), so we don't keep them in source control.

@ -0,0 +1,109 @@
#!/usr/bin/env python3
import sys
import os
import unittest
# Directory that contains this test file.
base_dir = os.path.split( __file__ )[ 0 ]
# Make the parent directory importable; this must happen before the import
# below, which loads "parse" as a top-level module from that directory.
sys.path.append( os.path.join( base_dir , ".." ) )
from parse import PdfParser , AslCard
# ---------------------------------------------------------------------
class TestRealData(unittest.TestCase):
    """Run tests using the real "ASL Cards" PDF files."""

    def _test_pdf_parser(self, fname, expected_cards):
        """Parse a PDF from the real-data directory and check the cards found.

        fname is the file's name inside "real-data"; expected_cards is the
        list of AslCard tuples the parser should return, in order.

        The test is skipped when the data file is missing, and a mismatch is
        reported as a test failure rather than as an error.
        """
        # parse the specified PDF
        fname2 = os.path.join(base_dir, "real-data", fname)
        if not os.path.isfile(fname2):
            self.skipTest("Missing data file: {}".format(fname2))
        # NOTE: pass a progress callback to PdfParser to trace the parsing.
        pdf_parser = PdfParser()
        cards = pdf_parser.parse_file(fname2)
        # check the results
        self.assertEqual(
            len(cards), len(expected_cards),
            "{}: got {} cards, expected {}.".format(fname, len(cards), len(expected_cards)),
        )
        for i, (card, expected) in enumerate(zip(cards, expected_cards)):
            self.assertEqual(
                card, expected,
                "{}: Card mismatch ({}): got {}, expected {}.".format(fname, i, card, expected),
            )

    def test_japanese_vehicles(self):
        """Check the cards extracted from the Japanese vehicles PDF."""
        self._test_pdf_parser( "JapaneseVehiclesFeb15.pdf" , [
            AslCard(page_id=1, page_pos=0, tag="Vehicle #1", nationality="Japanese", name="Type 92A") ,
            AslCard(page_id=1, page_pos=1, tag="Vehicle #1", nationality="Japanese", name="Type 92B") ,
            AslCard(page_id=2, page_pos=0, tag="Vehicle #2", nationality="Japanese", name="Type 94") ,
            AslCard(page_id=2, page_pos=1, tag="Vehicle #3", nationality="Japanese", name="Type 95 SO4KI") ,
            AslCard(page_id=3, page_pos=0, tag="Vehicle #4", nationality="Japanese", name="Type 97A TE6KE") ,
            AslCard(page_id=3, page_pos=1, tag="Vehicle #4", nationality="Japanese", name="Type 97B TE6KE") ,
            AslCard(page_id=4, page_pos=0, tag="Vehicle #5", nationality="Japanese", name="Type 95 HA6GO") ,
            AslCard(page_id=4, page_pos=1, tag="Vehicle #6", nationality="Japanese", name="Type 2 KA6MI") ,
            AslCard(page_id=5, page_pos=0, tag="Vehicle #6", nationality="Japanese", name="Type 2 KA:MI w/o") ,
            AslCard(page_id=5, page_pos=1, tag="Vehicle #7", nationality="Japanese", name="Type 89A CHI:RO") ,
            AslCard(page_id=6, page_pos=0, tag="Vehicle #7", nationality="Japanese", name="Type 89B CHI7RO") ,
            AslCard(page_id=6, page_pos=1, tag="Vehicle #8", nationality="Japanese", name="Type 97A CHI7HA") ,
            AslCard(page_id=7, page_pos=0, tag="Vehicle #8", nationality="Japanese", name="Type 97B CHI6HA") ,
            AslCard(page_id=7, page_pos=1, tag="Vehicle #9", nationality="Japanese", name="Type 1 CHI6HE") ,
            AslCard(page_id=8, page_pos=0, tag="Vehicle #10", nationality="Japanese", name="Type 91") ,
            AslCard(page_id=8, page_pos=1, tag="Vehicle #11", nationality="Japanese", name="Type 92") ,
            AslCard(page_id=9, page_pos=0, tag="Vehicle #12", nationality="Japanese", name="Type 1 HO?NI I") ,
            AslCard(page_id=9, page_pos=1, tag="Vehicle #13", nationality="Japanese", name="Type 4 HO?RO") ,
            AslCard(page_id=10, page_pos=0, tag="Vehicle #14", nationality="Japanese", name="Type 1 HO?KI") ,
            AslCard(page_id=10, page_pos=1, tag="Vehicle #15", nationality="Japanese", name="Type 98 SHI?KE") ,
            AslCard(page_id=11, page_pos=0, tag="Vehicle #16", nationality="Japanese", name="Type 92 IBKE") ,
            AslCard(page_id=11, page_pos=1, tag="Vehicle #17", nationality="Japanese", name="Type 95") ,
            AslCard(page_id=12, page_pos=0, tag="Vehicle #18", nationality="Japanese", name="Type 94 Truck") ,
            AslCard(page_id=12, page_pos=1, tag="Vehicle #18", nationality="Japanese", name="Type 97 Truck") ,
        ] )

    def test_chinese_ordnance(self):
        """Check the cards extracted from the Chinese ordnance PDF."""
        self._test_pdf_parser( "ChineseOrdnanceMidApril15.pdf" , [
            AslCard(page_id=1, page_pos=0, tag="Ordnance3#31", nationality="Chinese", name="Type3273GL") ,
            AslCard(page_id=1, page_pos=1, tag="Ordnance3#32", nationality="Chinese", name="Mortaio3da3453 i") ,
            AslCard(page_id=2, page_pos=0, tag="Ordnance0#02", nationality="Chinese", name="5cm0leGrW0360 g") ,
            AslCard(page_id=2, page_pos=1, tag="Ordnance0#02", nationality="Chinese", name="50mm0RM0obr 380 r") ,
            AslCard(page_id=3, page_pos=0, tag="Ordnance1#12", nationality="Chinese", name="Type1891HGL1 j") ,
            AslCard(page_id=3, page_pos=1, tag="Ordnance1#13", nationality="Chinese", name="M2160mm1 a") ,
            AslCard(page_id=4, page_pos=0, tag="Ordnance3#34", nationality="Chinese", name="Stokes33>in3 b") ,
            AslCard(page_id=4, page_pos=1, tag="Ordnance3#34", nationality="Chinese", name="8cm3GrW3343 g") ,
            AslCard(page_id=5, page_pos=0, tag="Ordnance2#24", nationality="Chinese", name="82mm2BM2o 2372 r") ,
            AslCard(page_id=5, page_pos=1, tag="Ordnance2#25", nationality="Chinese", name="M1281mm2 a") ,
            AslCard(page_id=6, page_pos=0, tag="Ordnance5#55", nationality="Chinese", name="M254 2?in5 a") ,
            AslCard(page_id=6, page_pos=1, tag="Ordnance5#56", nationality="Chinese", name="3 7cm5PaK535/365 g") ,
            AslCard(page_id=7, page_pos=0, tag="Ordnance2#26", nationality="Chinese", name="M3A1237mm2 a") ,
            AslCard(page_id=7, page_pos=1, tag="Ordnance2#27", nationality="Chinese", name="37mm2PP2o 215R2 r") ,
            AslCard(page_id=8, page_pos=0, tag="Ordnance2#27", nationality="Chinese", name="Cann 2da270/152 i") ,
            AslCard(page_id=8, page_pos=1, tag="Ordnance2#28", nationality="Chinese", name="7 5cm2Krupp2 g") ,
            AslCard(page_id=9, page_pos=0, tag="Ordnance1#18", nationality="Chinese", name="Obice1da175/131 i") ,
            AslCard(page_id=9, page_pos=1, tag="Ordnance1#19", nationality="Chinese", name="7 5cm1leIG1181 g") ,
            AslCard(page_id=10, page_pos=0, tag="Ordnance2#29", nationality="Chinese", name="76 2mm2PP2o 2272 r") ,
            AslCard(page_id=10, page_pos=1, tag="Ordnance2#210", nationality="Chinese", name="M1A1275mm2 a") ,
            AslCard(page_id=11, page_pos=0, tag="Ordnance #11", nationality="Chinese", name="7 7cm FK 16 g") ,
            AslCard(page_id=11, page_pos=1, tag="Ordnance #11", nationality="Chinese", name="76 2mm o 02/30 r") ,
            AslCard(page_id=12, page_pos=0, tag="Ordnance4#411", nationality="Chinese", name="OQF418pdr4 b") ,
            AslCard(page_id=12, page_pos=1, tag="Ordnance4#412", nationality="Chinese", name="10 5cm4leFH4164 g") ,
            AslCard(page_id=13, page_pos=0, tag="Ordnance1#112", nationality="Chinese", name="Cann 1da1105/281 i") ,
            AslCard(page_id=13, page_pos=1, tag="Ordnance1#112", nationality="Chinese", name="M2A11105mm1 a") ,
            AslCard(page_id=14, page_pos=0, tag="Ordnance1#113", nationality="Chinese", name="122mm1o 10/301 r") ,
            AslCard(page_id=14, page_pos=1, tag="Ordnance1#113", nationality="Chinese", name="122mm1G1o 1381 r") ,
            AslCard(page_id=15, page_pos=0, tag="Ordnance4#414", nationality="Chinese", name="Obice4da4149/134 i") ,
            AslCard(page_id=15, page_pos=1, tag="Ordnance4#415", nationality="Chinese", name="Oerlikon4FF4 g") ,
            AslCard(page_id=16, page_pos=0, tag="Ordnance #15", nationality="Chinese", name="Oerlikon FF LF") ,
            AslCard(page_id=16, page_pos=1, tag="Ordnance #15", nationality="Chinese", name="Cann Amitr 20/65 i") ,
            AslCard(page_id=17, page_pos=0, tag="Ordnance #15", nationality="Chinese", name="Cann 5mitr 20/65 LF") ,
            AslCard(page_id=17, page_pos=1, tag="Ordnance #15", nationality="Chinese", name="2cm FlaK 30 g") ,
            AslCard(page_id=18, page_pos=0, tag="Ordnance #15", nationality="Chinese", name="2cm FlaK 30 LF") ,
            AslCard(page_id=18, page_pos=1, tag="Ordnance #16", nationality="Chinese", name="3 7cm FlaK 36/37 g") ,
            AslCard(page_id=19, page_pos=0, tag="Ordnance #16", nationality="Chinese", name="Bofors 40mm L/60") ,
            AslCard(page_id=19, page_pos=1, tag="Ordnance #16", nationality="Chinese", name="Bofors 40mm L/60 LF") ,
            AslCard(page_id=20, page_pos=0, tag="Ordnance #17", nationality="Chinese", name="Bofors 76mm M29") ,
            AslCard(page_id=20, page_pos=1, tag="Ordnance #17", nationality="Chinese", name="Bofors 76mm M29 LF") ,
            AslCard(page_id=21, page_pos=0, tag="Ordnance #17", nationality="Chinese", name="8 8cm FlaK 18 g") ,
            AslCard(page_id=21, page_pos=1, tag="Ordnance #17", nationality="Chinese", name="8 8cm FlaK 18 LF") ,
        ] )
# ---------------------------------------------------------------------
# Allow this test module to be run directly as a script.
if __name__ == "__main__":
    unittest.main()
Loading…
Cancel
Save