Added pylint.

master
Pacman Ghost 3 years ago
parent 082af20d04
commit 725bffb963
  1. 596
      .pylintrc
  2. 27
      asl_rulebook2/extract/all.py
  3. 5
      asl_rulebook2/extract/base.py
  4. 53
      asl_rulebook2/extract/content.py
  5. 55
      asl_rulebook2/extract/index.py
  6. 3
      bin/extract_pages.py
  7. 2
      conftest.py
  8. 3
      pytest.ini

@ -0,0 +1,596 @@
[MASTER]
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=
# Specify a score threshold to be exceeded before program exits with error.
fail-under=10.0
# Add files or directories to the blacklist. They should be base names, not
# paths.
# NOTE: "generated" is for the auto-generated gRPC protobuf stuff.
ignore=generated
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use.
jobs=4
# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100
# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=
# Pickle collected data for later comparisons.
persistent=yes
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
confidence=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=print-statement,
parameter-unpacking,
unpacking-in-except,
old-raise-syntax,
backtick,
long-suffix,
old-ne-operator,
old-octal-literal,
import-star-module-level,
non-ascii-bytes-literal,
raw-checker-failed,
bad-inline-option,
locally-disabled,
file-ignored,
suppressed-message,
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
apply-builtin,
basestring-builtin,
buffer-builtin,
cmp-builtin,
coerce-builtin,
execfile-builtin,
file-builtin,
long-builtin,
raw_input-builtin,
reduce-builtin,
standarderror-builtin,
unicode-builtin,
xrange-builtin,
coerce-method,
delslice-method,
getslice-method,
setslice-method,
no-absolute-import,
old-division,
dict-iter-method,
dict-view-method,
next-method-called,
metaclass-assignment,
indexing-exception,
raising-string,
reload-builtin,
oct-method,
hex-method,
nonzero-method,
cmp-method,
input-builtin,
round-builtin,
intern-builtin,
unichr-builtin,
map-builtin-not-iterating,
zip-builtin-not-iterating,
range-builtin-not-iterating,
filter-builtin-not-iterating,
using-cmp-argument,
eq-without-hash,
div-method,
idiv-method,
rdiv-method,
exception-message-attribute,
invalid-str-codec,
sys-max-int,
bad-python3-import,
deprecated-string-function,
deprecated-str-translate-call,
deprecated-itertools-function,
deprecated-types-field,
next-method-defined,
dict-items-not-iterating,
dict-keys-not-iterating,
dict-values-not-iterating,
deprecated-operator-function,
deprecated-urllib-function,
xreadlines-attribute,
deprecated-sys-function,
exception-escape,
comprehension-escape,
# custom changes follow
import-outside-toplevel,
global-statement,
invalid-name,
too-few-public-methods,
duplicate-code
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by comma (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member
[REPORTS]
# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'error', 'warning', 'refactor', and 'convention'
# which contain the number of messages in each category, as well as 'statement'
# which is the total number of statements analyzed. This score is used by the
# global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
#msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Tells whether to display a full report or only the messages.
reports=no
# Activate the evaluation score.
score=yes
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=8
# Complete names of functions that never return. When checking for
# inconsistent-return-statements if a never returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
_cb
# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored. Default to name
# with leading underscore.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
[STRING]
# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=no
# This flag controls whether the implicit-str-concat should generate a warning
# on implicit string concatenation in sequences defined over several lines.
check-str-concat-over-line-jumps=no
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken into consideration when
# showing a hint for a missing member.
missing-member-max-choices=1
# List of decorators that change the signature of a decorated function.
signature-mutators=
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string='    '
# Maximum number of characters on a single line.
max-line-length=120
# Maximum number of lines in a module.
max-module-lines=1000
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[LOGGING]
# The type of string formatting that logging methods do. `old` means using %
# formatting, `new` is for `{}` formatting.
logging-format-style=old
# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging
[BASIC]
# Naming style matching correct argument names.
argument-naming-style=snake_case
# Regular expression matching correct argument names. Overrides argument-
# naming-style.
#argument-rgx=
# Naming style matching correct attribute names.
attr-naming-style=snake_case
# Regular expression matching correct attribute names. Overrides attr-naming-
# style.
#attr-rgx=
# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
bar,
baz,
toto,
tutu,
tata
# Bad variable names regexes, separated by a comma. If names match any regex,
# they will always be refused
bad-names-rgxs=
# Naming style matching correct class attribute names.
class-attribute-naming-style=any
# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style.
#class-attribute-rgx=
# Naming style matching correct class names.
class-naming-style=PascalCase
# Regular expression matching correct class names. Overrides class-naming-
# style.
#class-rgx=
# Naming style matching correct constant names.
const-naming-style=UPPER_CASE
# Regular expression matching correct constant names. Overrides const-naming-
# style.
#const-rgx=
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming style matching correct function names.
function-naming-style=snake_case
# Regular expression matching correct function names. Overrides function-
# naming-style.
#function-rgx=
# Good variable names which should always be accepted, separated by a comma.
good-names=i,
j,
k,
ex,
Run,
_,
ch,mo,fp,v,s
# Good variable names regexes, separated by a comma. If names match any regex,
# they will always be accepted
good-names-rgxs=
# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
# Naming style matching correct inline iteration names.
inlinevar-naming-style=any
# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style.
#inlinevar-rgx=
# Naming style matching correct method names.
method-naming-style=snake_case
# Regular expression matching correct method names. Overrides method-naming-
# style.
#method-rgx=
# Naming style matching correct module names.
module-naming-style=snake_case
# Regular expression matching correct module names. Overrides module-naming-
# style.
#module-rgx=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken into consideration only for invalid-name.
property-classes=abc.abstractproperty
# Naming style matching correct variable names.
variable-naming-style=snake_case
# Regular expression matching correct variable names. Overrides variable-
# naming-style.
#variable-rgx=
[SIMILARITIES]
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
# Minimum number of lines of a similarity.
min-similarity-lines=4
[SPELLING]
# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no
[MISCELLANEOUS]
# List of note tags to take into consideration, separated by a comma.
notes=FIXME,
XXX,
TODO
# Regular expression of note tags to take into consideration.
#notes-rgx=
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp,
__post_init__
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
_fields,
_replace,
_source,
_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls
[DESIGN]
# Maximum number of arguments for function / method.
max-args=10
# Maximum number of attributes for a class (see R0902).
max-attributes=10
# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
# Maximum number of branches for function / method body.
max-branches=30
# Maximum number of locals for function / method body.
max-locals=30
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body.
max-returns=10
# Maximum number of statements in function / method body.
max-statements=100
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
[IMPORTS]
# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=optparse,tkinter.tix
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled).
ext-import-graph=
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled).
import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled).
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Couples of modules and preferred modules, separated by a comma.
preferred-modules=
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException,
Exception

@ -1,7 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" Extract everything we need from the MMP eASLRB. """ """ Extract everything we need from the MMP eASLRB. """
import sys
import os import os
import json import json
import re import re
@ -22,6 +21,8 @@ class ExtractAll( ExtractBase ):
def __init__( self, args, log=None ): def __init__( self, args, log=None ):
super().__init__( None, None, log ) super().__init__( None, None, log )
self._args = args self._args = args
self.extract_index = None
self.extract_content = None
def extract_all( self, pdf ): def extract_all( self, pdf ):
"""Extract everything from the eASLRB.""" """Extract everything from the eASLRB."""
@ -33,13 +34,13 @@ class ExtractAll( ExtractBase ):
default_args.update( getattr( mod, "_DEFAULT_ARGS" ) ) default_args.update( getattr( mod, "_DEFAULT_ARGS" ) )
# extract the index # extract the index
self._log_msg( "progress", "\nExtracting the index..." ) self.log_msg( "progress", "\nExtracting the index..." )
args = ExtractBase.parse_args( self._args, default_args ) args = ExtractBase.parse_args( self._args, default_args )
self.extract_index = ExtractIndex( args, self._log ) self.extract_index = ExtractIndex( args, self._log )
self.extract_index.extract_index( pdf ) self.extract_index.extract_index( pdf )
# extract the content # extract the content
self._log_msg( "progress", "\nExtracting the content..." ) self.log_msg( "progress", "\nExtracting the content..." )
args = ExtractBase.parse_args( self._args, default_args ) args = ExtractBase.parse_args( self._args, default_args )
self.extract_content = ExtractContent( args, self._log ) self.extract_content = ExtractContent( args, self._log )
self.extract_content.extract_content( pdf ) self.extract_content.extract_content( pdf )
@ -52,7 +53,7 @@ class ExtractAll( ExtractBase ):
# build an index of known targets # build an index of known targets
targets = {} targets = {}
for ruleid, target in self.extract_content._targets.items(): for ruleid, target in self.extract_content.targets.items():
assert ruleid not in targets assert ruleid not in targets
targets[ ruleid ] = target["caption"] targets[ ruleid ] = target["caption"]
@ -82,7 +83,7 @@ class ExtractAll( ExtractBase ):
# check each index entry # check each index entry
first = True first = True
for index_entry in self.extract_index._index_entries: for index_entry in self.extract_index.index_entries:
errors = [] errors = []
@ -106,10 +107,10 @@ class ExtractAll( ExtractBase ):
# log any errors # log any errors
if errors: if errors:
if first: if first:
self._log_msg( "warning", "\n=== Unknown targets ===\n" ) self.log_msg( "warning", "\n=== Unknown targets ===\n" )
first = False first = False
errors = [ "- {}".format( e ) for e in errors ] errors = [ "- {}".format( e ) for e in errors ]
self._log_msg( "warning", "{}:\n{}", self.log_msg( "warning", "{}:\n{}",
index_entry["caption"], "\n".join(errors) index_entry["caption"], "\n".join(errors)
) )
@ -119,11 +120,13 @@ class ExtractAll( ExtractBase ):
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) ) @click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." ) @click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." ) @click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." ) @click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]),
help="Output format."
)
@click.option( "--save-index","save_index_fname", required=True, help="Where to save the extracted index." ) @click.option( "--save-index","save_index_fname", required=True, help="Where to save the extracted index." )
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." ) @click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." ) @click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname, save_footnotes_fname ): def main( pdf_file, args, progress, output_fmt, save_index_fname, save_targets_fname, save_footnotes_fname ):
"""Extract everything we need from the MMP eASLRB.""" """Extract everything we need from the MMP eASLRB."""
# extract everything # extract everything
@ -132,7 +135,7 @@ def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname
return return
log_msg_stderr( msg_type, msg ) log_msg_stderr( msg_type, msg )
extract = ExtractAll( args, log_msg ) extract = ExtractAll( args, log_msg )
extract._log_msg( "progress", "Loading PDF: {}", pdf_file ) extract.log_msg( "progress", "Loading PDF: {}", pdf_file )
with PdfDoc( pdf_file ) as pdf: with PdfDoc( pdf_file ) as pdf:
extract.extract_all( pdf ) extract.extract_all( pdf )
@ -140,8 +143,8 @@ def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname
with open( save_index_fname, "w", encoding="utf-8" ) as index_out, \ with open( save_index_fname, "w", encoding="utf-8" ) as index_out, \
open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \ open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out: open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
getattr( extract.extract_index, "save_as_"+format )( index_out ) getattr( extract.extract_index, "save_as_"+output_fmt )( index_out )
getattr( extract.extract_content, "save_as_"+format )( targets_out, footnotes_out ) getattr( extract.extract_content, "save_as_"+output_fmt )( targets_out, footnotes_out )
if __name__ == "__main__": if __name__ == "__main__":
main() #pylint: disable=no-value-for-parameter main() #pylint: disable=no-value-for-parameter

@ -1,4 +1,4 @@
""" Base class for the extraction tools. """ """ Base class for the extraction classes. """
import sys import sys
@ -7,6 +7,7 @@ import click
# --------------------------------------------------------------------- # ---------------------------------------------------------------------
class ExtractBase: class ExtractBase:
"""Base class for the extraction classes."""
def __init__( self, args, default_args, log ): def __init__( self, args, default_args, log ):
self._args = args self._args = args
@ -43,7 +44,7 @@ class ExtractBase:
"""Check if an element is using a bold font.""" """Check if an element is using a bold font."""
return elem.fontname.endswith( ( "-Bold", ",Bold", "-BoldMT" ) ) return elem.fontname.endswith( ( "-Bold", ",Bold", "-BoldMT" ) )
def _log_msg( self, msg_type, msg, *args, **kwargs ): def log_msg( self, msg_type, msg, *args, **kwargs ):
"""Log a message.""" """Log a message."""
if not self._log: if not self._log:
return return

@ -46,8 +46,10 @@ class ExtractContent( ExtractBase ):
def __init__( self, args, log=None ): def __init__( self, args, log=None ):
super().__init__( args, _DEFAULT_ARGS, log ) super().__init__( args, _DEFAULT_ARGS, log )
self._targets = {} self.targets = {}
self._footnotes = {} self._footnotes = {}
self._curr_chapter = self._curr_footnote = self._curr_pageid = None
self._prev_elem = self._top_left_elem = None
# prepare to fixup problems in the content # prepare to fixup problems in the content
fname2 = os.path.join( os.path.dirname(__file__), "data/target-fixups.json" ) fname2 = os.path.join( os.path.dirname(__file__), "data/target-fixups.json" )
with open( fname2, "r", encoding="utf-8" ) as fp: with open( fname2, "r", encoding="utf-8" ) as fp:
@ -83,13 +85,13 @@ class ExtractContent( ExtractBase ):
# the start of a footnote by a bold number near the start of the line. # the start of a footnote by a bold number near the start of the line.
# process each page # process each page
for page_no, page, lt_page in PageIterator( pdf ): for page_no, _, lt_page in PageIterator( pdf ):
# prepare to process the next page # prepare to process the next page
if page_no > max( page_index.keys() ): if page_no > max( page_index.keys() ):
break break
if page_no not in page_index: if page_no not in page_index:
self._log_msg( "progress", "- Skipping page {}.", page_no ) self.log_msg( "progress", "- Skipping page {}.", page_no )
continue continue
if not self._curr_chapter or self._curr_chapter != page_index[page_no]: if not self._curr_chapter or self._curr_chapter != page_index[page_no]:
# we've found the start of a new chapter # we've found the start of a new chapter
@ -98,17 +100,17 @@ class ExtractContent( ExtractBase ):
curr_chapter_pageno = 1 curr_chapter_pageno = 1
else: else:
curr_chapter_pageno += 1 curr_chapter_pageno += 1
self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page# self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page#
self._curr_chapter, curr_chapter_pageno self._curr_chapter, curr_chapter_pageno
) )
self._log_msg( "progress", "- Processing page {} ({})...", page_no, self._curr_pageid ) self.log_msg( "progress", "- Processing page {} ({})...", page_no, self._curr_pageid )
# process each element on the page # process each element on the page
curr_caption = None curr_caption = None
self._top_left_elem = self._prev_elem = None self._top_left_elem = self._prev_elem = None
elem_filter = lambda e: isinstance( e, LTChar ) elem_filter = lambda e: isinstance( e, LTChar )
sort_elems = self._curr_pageid not in disable_sort_items sort_elems = self._curr_pageid not in disable_sort_items
for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ): for _, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):
# keep track of the top-left-most bold element # keep track of the top-left-most bold element
if self._is_bold( elem ): if self._is_bold( elem ):
@ -128,7 +130,8 @@ class ExtractContent( ExtractBase ):
# figure out what we've got # figure out what we've got
is_bold = self._is_bold( elem ) is_bold = self._is_bold( elem )
if is_bold and curr_caption and curr_caption[0].isdigit() and 1 < elem.y1 - self._prev_elem.y0 < elem.height/2: ch = curr_caption[0] if curr_caption else None #pylint: disable=unsubscriptable-object
if is_bold and ch and ch.isdigit() and 1 < elem.y1 - self._prev_elem.y0 < elem.height/2:
# the previous bold character looks like a footnote superscript - ignore it # the previous bold character looks like a footnote superscript - ignore it
curr_caption = None curr_caption = None
if curr_caption and elem.get_text() == " ": if curr_caption and elem.get_text() == " ":
@ -149,9 +152,11 @@ class ExtractContent( ExtractBase ):
# continue collecting the caption # continue collecting the caption
if self._prev_elem.y0 - elem.y0 > 1: if self._prev_elem.y0 - elem.y0 > 1:
# nb: we just started a new line # nb: we just started a new line
curr_caption[0] = append_text( curr_caption[0], elem.get_text() ) curr_caption[0] = append_text( #pylint: disable=unsupported-assignment-operation
curr_caption[0], elem.get_text() #pylint: disable=unsubscriptable-object
)
else: else:
curr_caption[0] += elem.get_text() curr_caption[0] += elem.get_text() #pylint: disable=unsupported-assignment-operation
else: else:
# check if this is the first character of the line # check if this is the first character of the line
if self._is_start_of_line( elem, lt_page ): if self._is_start_of_line( elem, lt_page ):
@ -174,9 +179,9 @@ class ExtractContent( ExtractBase ):
# check for unused fixups # check for unused fixups
if self._target_fixups: if self._target_fixups:
self._log_msg( "warning", "Unused fixups: {}", self._target_fixups ) self.log_msg( "warning", "Unused fixups: {}", self._target_fixups )
if self._footnote_fixups: if self._footnote_fixups:
self._log_msg( "warning", "Unused fixups: {}", self._footnote_fixups ) self.log_msg( "warning", "Unused fixups: {}", self._footnote_fixups )
def _save_target( self, caption, page_no, lt_page, elem ): def _save_target( self, caption, page_no, lt_page, elem ):
"""Save a parsed target.""" """Save a parsed target."""
@ -233,14 +238,14 @@ class ExtractContent( ExtractBase ):
# save the new target # save the new target
if not ruleid.startswith( self._curr_chapter ): if not ruleid.startswith( self._curr_chapter ):
ruleid = self._curr_chapter + ruleid ruleid = self._curr_chapter + ruleid
if ruleid in self._targets: if ruleid in self.targets:
self._log_msg( "warning", "Ignoring duplicate ruleid: {} (from \"{}\").", self.log_msg( "warning", "Ignoring duplicate ruleid: {} (from \"{}\").",
ruleid, caption[0] ruleid, caption[0]
) )
return return
if caption_text == "\u2014": if caption_text == "\u2014":
caption_text = "-" # nb: for A7.306 :-/ caption_text = "-" # nb: for A7.306 :-/
self._targets[ ruleid ] = { self.targets[ ruleid ] = {
"caption": fixup_text(caption_text), "page_no": page_no, "pos": caption[1], "caption": fixup_text(caption_text), "page_no": page_no, "pos": caption[1],
"raw_caption": orig_caption "raw_caption": orig_caption
} }
@ -292,7 +297,7 @@ class ExtractContent( ExtractBase ):
self._curr_footnote[0] = parts[0] self._curr_footnote[0] = parts[0]
self._curr_footnote[1] = parts[1].strip() + " " + self._curr_footnote[1].strip() self._curr_footnote[1] = parts[1].strip() + " " + self._curr_footnote[1].strip()
else: else:
self._log_msg( "warning", "Couldn't split Chapter F footnote caption: {}", self._curr_footnote[0] ) self.log_msg( "warning", "Couldn't split Chapter F footnote caption: {}", self._curr_footnote[0] )
footnote_id = remove_trailing( self._curr_footnote[0].strip(), "." ) footnote_id = remove_trailing( self._curr_footnote[0].strip(), "." )
content = self._curr_footnote[1].strip() content = self._curr_footnote[1].strip()
mo = re.search( r"^(F\.1B|W\.\d+[AB]|[A-Z]?[0-9.]+)", content ) mo = re.search( r"^(F\.1B|W\.\d+[AB]|[A-Z]?[0-9.]+)", content )
@ -335,7 +340,7 @@ class ExtractContent( ExtractBase ):
prev_content = content prev_content = content
content = content.replace( sr[0], sr[1] ) content = content.replace( sr[0], sr[1] )
if content == prev_content: if content == prev_content:
self._log_msg( "warning", "Footnote fixup for \"{}:{}\" had no effect: {}", self.log_msg( "warning", "Footnote fixup for \"{}:{}\" had no effect: {}",
self._curr_chapter, footnote_id, sr[0] self._curr_chapter, footnote_id, sr[0]
) )
errors["replace"].append( sr ) errors["replace"].append( sr )
@ -361,7 +366,7 @@ class ExtractContent( ExtractBase ):
captions.append( ( ruleid, content[:pos] ) ) captions.append( ( ruleid, content[:pos] ) )
content = content[pos+1:].strip() content = content[pos+1:].strip()
else: else:
self._log_msg( "warning", "Can't extract footnote caption: {}:{} - {}", self.log_msg( "warning", "Can't extract footnote caption: {}:{} - {}",
self._curr_chapter, footnote_id, content self._curr_chapter, footnote_id, content
) )
@ -404,7 +409,7 @@ class ExtractContent( ExtractBase ):
# save the targets # save the targets
curr_page_no = None curr_page_no = None
for ruleid, target in self._targets.items(): for ruleid, target in self.targets.items():
if target["page_no"] != curr_page_no: if target["page_no"] != curr_page_no:
if curr_page_no: if curr_page_no:
print( file=targets_out ) print( file=targets_out )
@ -448,7 +453,7 @@ class ExtractContent( ExtractBase ):
# save the targets # save the targets
targets, curr_chapter = [], None targets, curr_chapter = [], None
for ruleid, target in self._targets.items(): for ruleid, target in self.targets.items():
xpos, ypos = self._get_target_pos( target ) xpos, ypos = self._get_target_pos( target )
targets.append( "{}: {{ \"caption\": {}, \"page_no\": {}, \"pos\": [{},{}] }}".format( targets.append( "{}: {{ \"caption\": {}, \"page_no\": {}, \"pos\": [{},{}] }}".format(
jsonval( ruleid ), jsonval( ruleid ),
@ -496,10 +501,12 @@ class ExtractContent( ExtractBase ):
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) ) @click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." ) @click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." ) @click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." ) @click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]),
help="Output format."
)
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." ) @click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." ) @click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, format, save_targets_fname, save_footnotes_fname ): def main( pdf_file, args, progress, output_fmt, save_targets_fname, save_footnotes_fname ):
"""Extract content from the MMP eASLRB.""" """Extract content from the MMP eASLRB."""
# initialize # initialize
@ -511,14 +518,14 @@ def main( pdf_file, args, progress, format, save_targets_fname, save_footnotes_f
return return
log_msg_stderr( msg_type, msg ) log_msg_stderr( msg_type, msg )
extract = ExtractContent( args, log_msg ) extract = ExtractContent( args, log_msg )
extract._log_msg( "progress", "Loading PDF: {}", pdf_file ) extract.log_msg( "progress", "Loading PDF: {}", pdf_file )
with PdfDoc( pdf_file ) as pdf: with PdfDoc( pdf_file ) as pdf:
extract.extract_content( pdf ) extract.extract_content( pdf )
# save the results # save the results
with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \ with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out: open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
getattr( extract, "save_as_"+format )( targets_out, footnotes_out ) getattr( extract, "save_as_"+output_fmt, )( targets_out, footnotes_out )
if __name__ == "__main__": if __name__ == "__main__":
main() #pylint: disable=no-value-for-parameter main() #pylint: disable=no-value-for-parameter

@ -27,7 +27,8 @@ class ExtractIndex( ExtractBase ):
def __init__( self, args, log=None ): def __init__( self, args, log=None ):
super().__init__( args, _DEFAULT_ARGS, log ) super().__init__( args, _DEFAULT_ARGS, log )
self._index_entries = None self.index_entries = None
self._prev_y0 = None
# prepare to fixup problems in the index content # prepare to fixup problems in the index content
fname2 = os.path.join( os.path.dirname(__file__), "data/index-fixups.json" ) fname2 = os.path.join( os.path.dirname(__file__), "data/index-fixups.json" )
with open( fname2, "r", encoding="utf-8" ) as fp: with open( fname2, "r", encoding="utf-8" ) as fp:
@ -41,19 +42,19 @@ class ExtractIndex( ExtractBase ):
curr_title = curr_content = None curr_title = curr_content = None
# process each page in the index # process each page in the index
for page_no, page, lt_page in PageIterator( pdf ): for page_no, _, lt_page in PageIterator( pdf ):
if page_no > max( page_nos ): if page_no > max( page_nos ):
break break
if page_no not in page_nos: if page_no not in page_nos:
self._log_msg( "progress", "- Skipping page {}.", page_no ) self.log_msg( "progress", "- Skipping page {}.", page_no )
continue continue
self._log_msg( "progress", "- Processing page {}...", page_no ) self.log_msg( "progress", "- Processing page {}...", page_no )
# process each element on the page # process each element on the page
self._prev_y0 = 99999 self._prev_y0 = 99999
elem_filter = lambda e: isinstance( e, LTChar ) elem_filter = lambda e: isinstance( e, LTChar )
for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter ): for _, elem in PageElemIterator( lt_page, elem_filter=elem_filter ):
# check if we should ignore this element # check if we should ignore this element
if not self._in_viewport( elem, "index" ): if not self._in_viewport( elem, "index" ):
@ -91,7 +92,7 @@ class ExtractIndex( ExtractBase ):
# continue collecting the content text # continue collecting the content text
if elem.y0 - self._prev_y0 < -1 and curr_content.endswith( "-" ): if elem.y0 - self._prev_y0 < -1 and curr_content.endswith( "-" ):
# join up hyphenated words # join up hyphenated words
curr_content = curr_content[:-1] curr_content = curr_content[:-1] #pylint: disable=unsubscriptable-object
curr_content += elem.get_text() curr_content += elem.get_text()
# loop back to process the next element # loop back to process the next element
@ -103,10 +104,10 @@ class ExtractIndex( ExtractBase ):
# check for unused fixups # check for unused fixups
if self._fixups: if self._fixups:
self._log_msg( "warning", "Unused fixups: {}", self._fixups ) self.log_msg( "warning", "Unused fixups: {}", self._fixups )
# process the content for each index entry # process the content for each index entry
if not self._index_entries: if not self.index_entries:
raise RuntimeError( "Didn't find the first title (\"{}\").".format( self._args["first_title"] ) ) raise RuntimeError( "Didn't find the first title (\"{}\").".format( self._args["first_title"] ) )
self._process_content() self._process_content()
@ -116,10 +117,10 @@ class ExtractIndex( ExtractBase ):
# check if we've started parsing index entries # check if we've started parsing index entries
# NOTE: There is some bold text at the start of the index, which we parse as an index title, # NOTE: There is some bold text at the start of the index, which we parse as an index title,
# so we don't save anything until we've actually seen the first index entry. # so we don't save anything until we've actually seen the first index entry.
if self._index_entries is None: if self.index_entries is None:
if title != self._args["first_title"]: if title != self._args["first_title"]:
return return
self._index_entries = [] self.index_entries = []
# initialize # initialize
title, content = title.strip(), content.strip() title, content = title.strip(), content.strip()
@ -130,24 +131,24 @@ class ExtractIndex( ExtractBase ):
if title == "bold": if title == "bold":
# FUDGE! Some entries have "bold" in their content, using a bold font :-/, which we detect # FUDGE! Some entries have "bold" in their content, using a bold font :-/, which we detect
# as the start of a new entry. We fix that up here. # as the start of a new entry. We fix that up here.
self._index_entries[-1]["content"] = "{} bold {}".format( self.index_entries[-1]["content"] = "{} bold {}".format(
self._index_entries[-1]["content"], fixup_text(content) self.index_entries[-1]["content"], fixup_text(content)
) )
elif title == "C" and self._index_entries[-1]["title"] == "FFE": elif title == "C" and self.index_entries[-1]["title"] == "FFE":
# FUDGE! The colon in the title for "FFE:C" is non-bold, so we parse this as two separate # FUDGE! The colon in the title for "FFE:C" is non-bold, so we parse this as two separate
# index titles ("FFE" and "C") :-/ We can't fix this up in the normal way, since there is # index titles ("FFE" and "C") :-/ We can't fix this up in the normal way, since there is
# also a real "FFE" entry, so we do it in the code here. # also a real "FFE" entry, so we do it in the code here.
self._index_entries[-1].update( { self.index_entries[-1].update( {
"title": "FFE:C", "content": fixup_text(content) "title": "FFE:C", "content": fixup_text(content)
} ) } )
else: else:
# save the new index entry # save the new index entry
index_entry = self._make_index_entry( title, content ) index_entry = self._make_index_entry( title, content )
if index_entry: if index_entry:
self._index_entries.append( index_entry ) self.index_entries.append( index_entry )
# FUDGE! EX/EXC are mis-parsed as a single index entry - we correct that in the fixups, and here. # FUDGE! EX/EXC are mis-parsed as a single index entry - we correct that in the fixups, and here.
if title == "EX": if title == "EX":
self._index_entries.append( self._make_index_entry( "EXC", "Exception" ) ) self.index_entries.append( self._make_index_entry( "EXC", "Exception" ) )
def _make_index_entry( self, title, content ): def _make_index_entry( self, title, content ):
"""Create a new index entry.""" """Create a new index entry."""
@ -167,14 +168,14 @@ class ExtractIndex( ExtractBase ):
for sr in fixup.get( "replace", [] ): for sr in fixup.get( "replace", [] ):
new_content = content.replace( sr[0], sr[1] ) new_content = content.replace( sr[0], sr[1] )
if new_content == content: if new_content == content:
self._log_msg( "warning", "Content fixup had no effect for \"{}\": {}", title, sr[0] ) self.log_msg( "warning", "Content fixup had no effect for \"{}\": {}", title, sr[0] )
else: else:
content = new_content content = new_content
# replace the content # replace the content
old_content = fixup.get( "old_content" ) old_content = fixup.get( "old_content" )
if old_content: if old_content:
if fixup_text( content ) != old_content: if fixup_text( content ) != old_content:
self._log_msg( "warning", "Unexpected content for \"{}\" - skipping fixup.", title ) self.log_msg( "warning", "Unexpected content for \"{}\" - skipping fixup.", title )
else: else:
new_content = fixup.get( "new_content" ) new_content = fixup.get( "new_content" )
if not new_content: if not new_content:
@ -197,7 +198,7 @@ class ExtractIndex( ExtractBase ):
def _process_content( self ): def _process_content( self ):
"""Extract information out of the index entries into a structured form.""" """Extract information out of the index entries into a structured form."""
for index_entry in self._index_entries: for index_entry in self.index_entries:
# initialize # initialize
content = index_entry[ "content" ] content = index_entry[ "content" ]
@ -295,14 +296,14 @@ class ExtractIndex( ExtractBase ):
def save_as_raw( self, out ): def save_as_raw( self, out ):
"""Save the raw results.""" """Save the raw results."""
for index_entry in self._index_entries: for index_entry in self.index_entries:
print( "=== {} ===".format( index_entry["title"] ), file=out ) print( "=== {} ===".format( index_entry["title"] ), file=out )
print( "{}".format( index_entry["raw_content"] ), file=out ) print( "{}".format( index_entry["raw_content"] ), file=out )
print( file=out ) print( file=out )
def save_as_text( self, out ): def save_as_text( self, out ):
"""Save the results as plain-text.""" """Save the results as plain-text."""
for index_entry in self._index_entries: for index_entry in self.index_entries:
print( "=== {} ===".format( index_entry["title"] ), file=out ) print( "=== {} ===".format( index_entry["title"] ), file=out )
if "subtitle" in index_entry: if "subtitle" in index_entry:
print( index_entry["subtitle"], file=out ) print( index_entry["subtitle"], file=out )
@ -329,7 +330,7 @@ class ExtractIndex( ExtractBase ):
def save_as_json( self, out ): def save_as_json( self, out ):
"""Save the results as JSON.""" """Save the results as JSON."""
entries = [] entries = []
for index_entry in self._index_entries: for index_entry in self.index_entries:
buf = [] buf = []
buf.append( "{{ \"title\": {}".format( jsonval(index_entry["title"]) ) ) buf.append( "{{ \"title\": {}".format( jsonval(index_entry["title"]) ) )
if "subtitle" in index_entry: if "subtitle" in index_entry:
@ -357,9 +358,11 @@ class ExtractIndex( ExtractBase ):
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) ) @click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." ) @click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." ) @click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." ) @click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]),
help="Output format."
)
@click.option( "--output","-o","output_fname", required=True, help="Where to save the extracted index." ) @click.option( "--output","-o","output_fname", required=True, help="Where to save the extracted index." )
def main( pdf_file, args, progress, format, output_fname ): def main( pdf_file, args, progress, output_fmt, output_fname ):
"""Extract the index from the MMP eASLRB.""" """Extract the index from the MMP eASLRB."""
# initialize # initialize
@ -371,13 +374,13 @@ def main( pdf_file, args, progress, format, output_fname ):
return return
log_msg_stderr( msg_type, msg ) log_msg_stderr( msg_type, msg )
extract = ExtractIndex( args, log_msg ) extract = ExtractIndex( args, log_msg )
extract._log_msg( "progress", "Loading PDF: {}", pdf_file ) extract.log_msg( "progress", "Loading PDF: {}", pdf_file )
with PdfDoc( pdf_file ) as pdf: with PdfDoc( pdf_file ) as pdf:
extract.extract_index( pdf ) extract.extract_index( pdf )
# save the results # save the results
with open( output_fname, "w", encoding="utf-8" ) as out: with open( output_fname, "w", encoding="utf-8" ) as out:
getattr( extract, "save_as_"+format )( out ) getattr( extract, "save_as_"+output_fmt )( out )
if __name__ == "__main__": if __name__ == "__main__":
main() #pylint: disable=no-value-for-parameter main() #pylint: disable=no-value-for-parameter

@ -2,9 +2,8 @@
""" Extract pages from a PDF. """ """ Extract pages from a PDF. """
import click import click
from pikepdf import Pdf, Page, OutlineItem, Encryption, make_page_destination from pikepdf import Pdf
from asl_rulebook2.pdf import PdfDoc
from asl_rulebook2.utils import parse_page_numbers from asl_rulebook2.utils import parse_page_numbers
# --------------------------------------------------------------------- # ---------------------------------------------------------------------

@ -159,7 +159,7 @@ def _make_webapp():
# --------------------------------------------------------------------- # ---------------------------------------------------------------------
@pytest.fixture( scope="session" ) @pytest.fixture( scope="session" )
def webdriver( request ): def webdriver():
"""Return a webdriver that can be used to control a browser.""" """Return a webdriver that can be used to control a browser."""
# initialize # initialize

@ -0,0 +1,3 @@
[pytest]
addopts = --pylint
norecursedirs = _work_
Loading…
Cancel
Save