Added pylint.

master
Pacman Ghost 3 years ago
parent 082af20d04
commit 725bffb963
  1. 596
      .pylintrc
  2. 27
      asl_rulebook2/extract/all.py
  3. 5
      asl_rulebook2/extract/base.py
  4. 53
      asl_rulebook2/extract/content.py
  5. 55
      asl_rulebook2/extract/index.py
  6. 3
      bin/extract_pages.py
  7. 2
      conftest.py
  8. 3
      pytest.ini

@ -0,0 +1,596 @@
[MASTER]
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=
# Specify a score threshold to be exceeded before program exits with error.
fail-under=10.0
# Add files or directories to the blacklist. They should be base names, not
# paths.
# NOTE: "generated" is for the auto-generated gRPC protobuf stuff.
ignore=generated
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use.
jobs=4
# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100
# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=
# Pickle collected data for later comparisons.
persistent=yes
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
confidence=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=print-statement,
parameter-unpacking,
unpacking-in-except,
old-raise-syntax,
backtick,
long-suffix,
old-ne-operator,
old-octal-literal,
import-star-module-level,
non-ascii-bytes-literal,
raw-checker-failed,
bad-inline-option,
locally-disabled,
file-ignored,
suppressed-message,
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
apply-builtin,
basestring-builtin,
buffer-builtin,
cmp-builtin,
coerce-builtin,
execfile-builtin,
file-builtin,
long-builtin,
raw_input-builtin,
reduce-builtin,
standarderror-builtin,
unicode-builtin,
xrange-builtin,
coerce-method,
delslice-method,
getslice-method,
setslice-method,
no-absolute-import,
old-division,
dict-iter-method,
dict-view-method,
next-method-called,
metaclass-assignment,
indexing-exception,
raising-string,
reload-builtin,
oct-method,
hex-method,
nonzero-method,
cmp-method,
input-builtin,
round-builtin,
intern-builtin,
unichr-builtin,
map-builtin-not-iterating,
zip-builtin-not-iterating,
range-builtin-not-iterating,
filter-builtin-not-iterating,
using-cmp-argument,
eq-without-hash,
div-method,
idiv-method,
rdiv-method,
exception-message-attribute,
invalid-str-codec,
sys-max-int,
bad-python3-import,
deprecated-string-function,
deprecated-str-translate-call,
deprecated-itertools-function,
deprecated-types-field,
next-method-defined,
dict-items-not-iterating,
dict-keys-not-iterating,
dict-values-not-iterating,
deprecated-operator-function,
deprecated-urllib-function,
xreadlines-attribute,
deprecated-sys-function,
exception-escape,
comprehension-escape,
# custom changes follow
import-outside-toplevel,
global-statement,
invalid-name,
too-few-public-methods,
duplicate-code
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by comma (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member
[REPORTS]
# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'error', 'warning', 'refactor', and 'convention'
# which contain the number of messages in each category, as well as 'statement'
# which is the total number of statements analyzed. This score is used by the
# global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
#msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Tells whether to display a full report or only the messages.
reports=no
# Activate the evaluation score.
score=yes
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=8
# Complete names of functions that never return. When checking for
# inconsistent-return-statements if a never returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
_cb
# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored. Default to name
# with leading underscore.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
[STRING]
# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=no
# This flag controls whether the implicit-str-concat should generate a warning
# on implicit string concatenation in sequences defined over several lines.
check-str-concat-over-line-jumps=no
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Show a hint with possible names when a member name was not found. Candidate
# names for the hint are chosen by edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken into consideration
# when showing a hint for a missing member.
missing-member-max-choices=1
# List of decorators that change the signature of a decorated function.
signature-mutators=
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string=' '
# Maximum number of characters on a single line.
max-line-length=120
# Maximum number of lines in a module.
max-module-lines=1000
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[LOGGING]
# The type of string formatting that logging methods do. `old` means using %
# formatting, `new` is for `{}` formatting.
logging-format-style=old
# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging
[BASIC]
# Naming style matching correct argument names.
argument-naming-style=snake_case
# Regular expression matching correct argument names. Overrides argument-
# naming-style.
#argument-rgx=
# Naming style matching correct attribute names.
attr-naming-style=snake_case
# Regular expression matching correct attribute names. Overrides attr-naming-
# style.
#attr-rgx=
# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
bar,
baz,
toto,
tutu,
tata
# Bad variable names regexes, separated by a comma. If names match any regex,
# they will always be refused
bad-names-rgxs=
# Naming style matching correct class attribute names.
class-attribute-naming-style=any
# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style.
#class-attribute-rgx=
# Naming style matching correct class names.
class-naming-style=PascalCase
# Regular expression matching correct class names. Overrides class-naming-
# style.
#class-rgx=
# Naming style matching correct constant names.
const-naming-style=UPPER_CASE
# Regular expression matching correct constant names. Overrides const-naming-
# style.
#const-rgx=
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming style matching correct function names.
function-naming-style=snake_case
# Regular expression matching correct function names. Overrides function-
# naming-style.
#function-rgx=
# Good variable names which should always be accepted, separated by a comma.
good-names=i,
j,
k,
ex,
Run,
_,
ch,mo,fp,v,s
# Good variable names regexes, separated by a comma. If names match any regex,
# they will always be accepted
good-names-rgxs=
# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
# Naming style matching correct inline iteration names.
inlinevar-naming-style=any
# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style.
#inlinevar-rgx=
# Naming style matching correct method names.
method-naming-style=snake_case
# Regular expression matching correct method names. Overrides method-naming-
# style.
#method-rgx=
# Naming style matching correct module names.
module-naming-style=snake_case
# Regular expression matching correct module names. Overrides module-naming-
# style.
#module-rgx=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty
# Naming style matching correct variable names.
variable-naming-style=snake_case
# Regular expression matching correct variable names. Overrides variable-
# naming-style.
#variable-rgx=
[SIMILARITIES]
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
# Minimum number of lines of a similarity.
min-similarity-lines=4
[SPELLING]
# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,
XXX,
TODO
# Regular expression of note tags to take in consideration.
#notes-rgx=
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp,
__post_init__
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
_fields,
_replace,
_source,
_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls
[DESIGN]
# Maximum number of arguments for function / method.
max-args=10
# Maximum number of attributes for a class (see R0902).
max-attributes=10
# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
# Maximum number of branches for function / method body.
max-branches=30
# Maximum number of locals for function / method body.
max-locals=30
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield statements for function / method body.
max-returns=10
# Maximum number of statements in function / method body.
max-statements=100
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
[IMPORTS]
# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=optparse,tkinter.tix
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled).
ext-import-graph=
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled).
import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled).
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Couples of modules and preferred modules, separated by a comma.
preferred-modules=
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException,
Exception

@ -1,7 +1,6 @@
#!/usr/bin/env python3
""" Extract everything we need from the MMP eASLRB. """
import sys
import os
import json
import re
@ -22,6 +21,8 @@ class ExtractAll( ExtractBase ):
def __init__( self, args, log=None ):
super().__init__( None, None, log )
self._args = args
self.extract_index = None
self.extract_content = None
def extract_all( self, pdf ):
"""Extract everything from the eASLRB."""
@ -33,13 +34,13 @@ class ExtractAll( ExtractBase ):
default_args.update( getattr( mod, "_DEFAULT_ARGS" ) )
# extract the index
self._log_msg( "progress", "\nExtracting the index..." )
self.log_msg( "progress", "\nExtracting the index..." )
args = ExtractBase.parse_args( self._args, default_args )
self.extract_index = ExtractIndex( args, self._log )
self.extract_index.extract_index( pdf )
# extract the content
self._log_msg( "progress", "\nExtracting the content..." )
self.log_msg( "progress", "\nExtracting the content..." )
args = ExtractBase.parse_args( self._args, default_args )
self.extract_content = ExtractContent( args, self._log )
self.extract_content.extract_content( pdf )
@ -52,7 +53,7 @@ class ExtractAll( ExtractBase ):
# build an index of known targets
targets = {}
for ruleid, target in self.extract_content._targets.items():
for ruleid, target in self.extract_content.targets.items():
assert ruleid not in targets
targets[ ruleid ] = target["caption"]
@ -82,7 +83,7 @@ class ExtractAll( ExtractBase ):
# check each index entry
first = True
for index_entry in self.extract_index._index_entries:
for index_entry in self.extract_index.index_entries:
errors = []
@ -106,10 +107,10 @@ class ExtractAll( ExtractBase ):
# log any errors
if errors:
if first:
self._log_msg( "warning", "\n=== Unknown targets ===\n" )
self.log_msg( "warning", "\n=== Unknown targets ===\n" )
first = False
errors = [ "- {}".format( e ) for e in errors ]
self._log_msg( "warning", "{}:\n{}",
self.log_msg( "warning", "{}:\n{}",
index_entry["caption"], "\n".join(errors)
)
@ -119,11 +120,13 @@ class ExtractAll( ExtractBase ):
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]),
help="Output format."
)
@click.option( "--save-index","save_index_fname", required=True, help="Where to save the extracted index." )
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname, save_footnotes_fname ):
def main( pdf_file, args, progress, output_fmt, save_index_fname, save_targets_fname, save_footnotes_fname ):
"""Extract everything we need from the MMP eASLRB."""
# extract everything
@ -132,7 +135,7 @@ def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname
return
log_msg_stderr( msg_type, msg )
extract = ExtractAll( args, log_msg )
extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
extract.log_msg( "progress", "Loading PDF: {}", pdf_file )
with PdfDoc( pdf_file ) as pdf:
extract.extract_all( pdf )
@ -140,8 +143,8 @@ def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname
with open( save_index_fname, "w", encoding="utf-8" ) as index_out, \
open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
getattr( extract.extract_index, "save_as_"+format )( index_out )
getattr( extract.extract_content, "save_as_"+format )( targets_out, footnotes_out )
getattr( extract.extract_index, "save_as_"+output_fmt )( index_out )
getattr( extract.extract_content, "save_as_"+output_fmt )( targets_out, footnotes_out )
if __name__ == "__main__":
main() #pylint: disable=no-value-for-parameter

@ -1,4 +1,4 @@
""" Base class for the extraction tools. """
""" Base class for the extraction classes. """
import sys
@ -7,6 +7,7 @@ import click
# ---------------------------------------------------------------------
class ExtractBase:
"""Base class for the extraction classes."""
def __init__( self, args, default_args, log ):
self._args = args
@ -43,7 +44,7 @@ class ExtractBase:
"""Check if an element is using a bold font."""
return elem.fontname.endswith( ( "-Bold", ",Bold", "-BoldMT" ) )
def _log_msg( self, msg_type, msg, *args, **kwargs ):
def log_msg( self, msg_type, msg, *args, **kwargs ):
"""Log a message."""
if not self._log:
return

@ -46,8 +46,10 @@ class ExtractContent( ExtractBase ):
def __init__( self, args, log=None ):
super().__init__( args, _DEFAULT_ARGS, log )
self._targets = {}
self.targets = {}
self._footnotes = {}
self._curr_chapter = self._curr_footnote = self._curr_pageid = None
self._prev_elem = self._top_left_elem = None
# prepare to fixup problems in the content
fname2 = os.path.join( os.path.dirname(__file__), "data/target-fixups.json" )
with open( fname2, "r", encoding="utf-8" ) as fp:
@ -83,13 +85,13 @@ class ExtractContent( ExtractBase ):
# the start of a footnote by a bold number near the start of the line.
# process each page
for page_no, page, lt_page in PageIterator( pdf ):
for page_no, _, lt_page in PageIterator( pdf ):
# prepare to process the next page
if page_no > max( page_index.keys() ):
break
if page_no not in page_index:
self._log_msg( "progress", "- Skipping page {}.", page_no )
self.log_msg( "progress", "- Skipping page {}.", page_no )
continue
if not self._curr_chapter or self._curr_chapter != page_index[page_no]:
# we've found the start of a new chapter
@ -98,17 +100,17 @@ class ExtractContent( ExtractBase ):
curr_chapter_pageno = 1
else:
curr_chapter_pageno += 1
self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page#
self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page#
self._curr_chapter, curr_chapter_pageno
)
self._log_msg( "progress", "- Processing page {} ({})...", page_no, self._curr_pageid )
self.log_msg( "progress", "- Processing page {} ({})...", page_no, self._curr_pageid )
# process each element on the page
curr_caption = None
self._top_left_elem = self._prev_elem = None
elem_filter = lambda e: isinstance( e, LTChar )
sort_elems = self._curr_pageid not in disable_sort_items
for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):
for _, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ):
# keep track of the top-left-most bold element
if self._is_bold( elem ):
@ -128,7 +130,8 @@ class ExtractContent( ExtractBase ):
# figure out what we've got
is_bold = self._is_bold( elem )
if is_bold and curr_caption and curr_caption[0].isdigit() and 1 < elem.y1 - self._prev_elem.y0 < elem.height/2:
ch = curr_caption[0] if curr_caption else None #pylint: disable=unsubscriptable-object
if is_bold and ch and ch.isdigit() and 1 < elem.y1 - self._prev_elem.y0 < elem.height/2:
# the previous bold character looks like a footnote superscript - ignore it
curr_caption = None
if curr_caption and elem.get_text() == " ":
@ -149,9 +152,11 @@ class ExtractContent( ExtractBase ):
# continue collecting the caption
if self._prev_elem.y0 - elem.y0 > 1:
# nb: we just started a new line
curr_caption[0] = append_text( curr_caption[0], elem.get_text() )
curr_caption[0] = append_text( #pylint: disable=unsupported-assignment-operation
curr_caption[0], elem.get_text() #pylint: disable=unsubscriptable-object
)
else:
curr_caption[0] += elem.get_text()
curr_caption[0] += elem.get_text() #pylint: disable=unsupported-assignment-operation
else:
# check if this is the first character of the line
if self._is_start_of_line( elem, lt_page ):
@ -174,9 +179,9 @@ class ExtractContent( ExtractBase ):
# check for unused fixups
if self._target_fixups:
self._log_msg( "warning", "Unused fixups: {}", self._target_fixups )
self.log_msg( "warning", "Unused fixups: {}", self._target_fixups )
if self._footnote_fixups:
self._log_msg( "warning", "Unused fixups: {}", self._footnote_fixups )
self.log_msg( "warning", "Unused fixups: {}", self._footnote_fixups )
def _save_target( self, caption, page_no, lt_page, elem ):
"""Save a parsed target."""
@ -233,14 +238,14 @@ class ExtractContent( ExtractBase ):
# save the new target
if not ruleid.startswith( self._curr_chapter ):
ruleid = self._curr_chapter + ruleid
if ruleid in self._targets:
self._log_msg( "warning", "Ignoring duplicate ruleid: {} (from \"{}\").",
if ruleid in self.targets:
self.log_msg( "warning", "Ignoring duplicate ruleid: {} (from \"{}\").",
ruleid, caption[0]
)
return
if caption_text == "\u2014":
caption_text = "-" # nb: for A7.306 :-/
self._targets[ ruleid ] = {
self.targets[ ruleid ] = {
"caption": fixup_text(caption_text), "page_no": page_no, "pos": caption[1],
"raw_caption": orig_caption
}
@ -292,7 +297,7 @@ class ExtractContent( ExtractBase ):
self._curr_footnote[0] = parts[0]
self._curr_footnote[1] = parts[1].strip() + " " + self._curr_footnote[1].strip()
else:
self._log_msg( "warning", "Couldn't split Chapter F footnote caption: {}", self._curr_footnote[0] )
self.log_msg( "warning", "Couldn't split Chapter F footnote caption: {}", self._curr_footnote[0] )
footnote_id = remove_trailing( self._curr_footnote[0].strip(), "." )
content = self._curr_footnote[1].strip()
mo = re.search( r"^(F\.1B|W\.\d+[AB]|[A-Z]?[0-9.]+)", content )
@ -335,7 +340,7 @@ class ExtractContent( ExtractBase ):
prev_content = content
content = content.replace( sr[0], sr[1] )
if content == prev_content:
self._log_msg( "warning", "Footnote fixup for \"{}:{}\" had no effect: {}",
self.log_msg( "warning", "Footnote fixup for \"{}:{}\" had no effect: {}",
self._curr_chapter, footnote_id, sr[0]
)
errors["replace"].append( sr )
@ -361,7 +366,7 @@ class ExtractContent( ExtractBase ):
captions.append( ( ruleid, content[:pos] ) )
content = content[pos+1:].strip()
else:
self._log_msg( "warning", "Can't extract footnote caption: {}:{} - {}",
self.log_msg( "warning", "Can't extract footnote caption: {}:{} - {}",
self._curr_chapter, footnote_id, content
)
@ -404,7 +409,7 @@ class ExtractContent( ExtractBase ):
# save the targets
curr_page_no = None
for ruleid, target in self._targets.items():
for ruleid, target in self.targets.items():
if target["page_no"] != curr_page_no:
if curr_page_no:
print( file=targets_out )
@ -448,7 +453,7 @@ class ExtractContent( ExtractBase ):
# save the targets
targets, curr_chapter = [], None
for ruleid, target in self._targets.items():
for ruleid, target in self.targets.items():
xpos, ypos = self._get_target_pos( target )
targets.append( "{}: {{ \"caption\": {}, \"page_no\": {}, \"pos\": [{},{}] }}".format(
jsonval( ruleid ),
@ -496,10 +501,12 @@ class ExtractContent( ExtractBase ):
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]),
help="Output format."
)
@click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." )
@click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." )
def main( pdf_file, args, progress, format, save_targets_fname, save_footnotes_fname ):
def main( pdf_file, args, progress, output_fmt, save_targets_fname, save_footnotes_fname ):
"""Extract content from the MMP eASLRB."""
# initialize
@ -511,14 +518,14 @@ def main( pdf_file, args, progress, format, save_targets_fname, save_footnotes_f
return
log_msg_stderr( msg_type, msg )
extract = ExtractContent( args, log_msg )
extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
extract.log_msg( "progress", "Loading PDF: {}", pdf_file )
with PdfDoc( pdf_file ) as pdf:
extract.extract_content( pdf )
# save the results
with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \
open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out:
getattr( extract, "save_as_"+format )( targets_out, footnotes_out )
getattr( extract, "save_as_"+output_fmt, )( targets_out, footnotes_out )
if __name__ == "__main__":
main() #pylint: disable=no-value-for-parameter

@ -27,7 +27,8 @@ class ExtractIndex( ExtractBase ):
def __init__( self, args, log=None ):
super().__init__( args, _DEFAULT_ARGS, log )
self._index_entries = None
self.index_entries = None
self._prev_y0 = None
# prepare to fixup problems in the index content
fname2 = os.path.join( os.path.dirname(__file__), "data/index-fixups.json" )
with open( fname2, "r", encoding="utf-8" ) as fp:
@ -41,19 +42,19 @@ class ExtractIndex( ExtractBase ):
curr_title = curr_content = None
# process each page in the index
for page_no, page, lt_page in PageIterator( pdf ):
for page_no, _, lt_page in PageIterator( pdf ):
if page_no > max( page_nos ):
break
if page_no not in page_nos:
self._log_msg( "progress", "- Skipping page {}.", page_no )
self.log_msg( "progress", "- Skipping page {}.", page_no )
continue
self._log_msg( "progress", "- Processing page {}...", page_no )
self.log_msg( "progress", "- Processing page {}...", page_no )
# process each element on the page
self._prev_y0 = 99999
elem_filter = lambda e: isinstance( e, LTChar )
for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter ):
for _, elem in PageElemIterator( lt_page, elem_filter=elem_filter ):
# check if we should ignore this element
if not self._in_viewport( elem, "index" ):
@ -91,7 +92,7 @@ class ExtractIndex( ExtractBase ):
# continue collecting the content text
if elem.y0 - self._prev_y0 < -1 and curr_content.endswith( "-" ):
# join up hyphenated words
curr_content = curr_content[:-1]
curr_content = curr_content[:-1] #pylint: disable=unsubscriptable-object
curr_content += elem.get_text()
# loop back to process the next element
@ -103,10 +104,10 @@ class ExtractIndex( ExtractBase ):
# check for unused fixups
if self._fixups:
self._log_msg( "warning", "Unused fixups: {}", self._fixups )
self.log_msg( "warning", "Unused fixups: {}", self._fixups )
# process the content for each index entry
if not self._index_entries:
if not self.index_entries:
raise RuntimeError( "Didn't find the first title (\"{}\").".format( self._args["first_title"] ) )
self._process_content()
@ -116,10 +117,10 @@ class ExtractIndex( ExtractBase ):
# check if we've started parsing index entries
# NOTE: There is some bold text at the start of the index, which we parse as an index title,
# so we don't save anything until we've actually seen the first index entry.
if self._index_entries is None:
if self.index_entries is None:
if title != self._args["first_title"]:
return
self._index_entries = []
self.index_entries = []
# initialize
title, content = title.strip(), content.strip()
@ -130,24 +131,24 @@ class ExtractIndex( ExtractBase ):
if title == "bold":
# FUDGE! Some entries have "bold" in their content, using a bold font :-/, which we detect
# as the start of a new entry. We fix that up here.
self._index_entries[-1]["content"] = "{} bold {}".format(
self._index_entries[-1]["content"], fixup_text(content)
self.index_entries[-1]["content"] = "{} bold {}".format(
self.index_entries[-1]["content"], fixup_text(content)
)
elif title == "C" and self._index_entries[-1]["title"] == "FFE":
elif title == "C" and self.index_entries[-1]["title"] == "FFE":
# FUDGE! The colon in the title for "FFE:C" is non-bold, so we parse this as two separate
# index titles ("FFE" and "C") :-/ We can't fix this up in the normal way, since there is
# also a real "FFE" entry, so we do it in the code here.
self._index_entries[-1].update( {
self.index_entries[-1].update( {
"title": "FFE:C", "content": fixup_text(content)
} )
else:
# save the new index entry
index_entry = self._make_index_entry( title, content )
if index_entry:
self._index_entries.append( index_entry )
self.index_entries.append( index_entry )
# FUDGE! EX/EXC are mis-parsed as a single index entry - we correct that in the fixups, and here.
if title == "EX":
self._index_entries.append( self._make_index_entry( "EXC", "Exception" ) )
self.index_entries.append( self._make_index_entry( "EXC", "Exception" ) )
def _make_index_entry( self, title, content ):
"""Create a new index entry."""
@ -167,14 +168,14 @@ class ExtractIndex( ExtractBase ):
for sr in fixup.get( "replace", [] ):
new_content = content.replace( sr[0], sr[1] )
if new_content == content:
self._log_msg( "warning", "Content fixup had no effect for \"{}\": {}", title, sr[0] )
self.log_msg( "warning", "Content fixup had no effect for \"{}\": {}", title, sr[0] )
else:
content = new_content
# replace the content
old_content = fixup.get( "old_content" )
if old_content:
if fixup_text( content ) != old_content:
self._log_msg( "warning", "Unexpected content for \"{}\" - skipping fixup.", title )
self.log_msg( "warning", "Unexpected content for \"{}\" - skipping fixup.", title )
else:
new_content = fixup.get( "new_content" )
if not new_content:
@ -197,7 +198,7 @@ class ExtractIndex( ExtractBase ):
def _process_content( self ):
"""Extract information out of the index entries into a structured form."""
for index_entry in self._index_entries:
for index_entry in self.index_entries:
# initialize
content = index_entry[ "content" ]
@ -295,14 +296,14 @@ class ExtractIndex( ExtractBase ):
def save_as_raw( self, out ):
"""Save the raw results."""
for index_entry in self._index_entries:
for index_entry in self.index_entries:
print( "=== {} ===".format( index_entry["title"] ), file=out )
print( "{}".format( index_entry["raw_content"] ), file=out )
print( file=out )
def save_as_text( self, out ):
"""Save the results as plain-text."""
for index_entry in self._index_entries:
for index_entry in self.index_entries:
print( "=== {} ===".format( index_entry["title"] ), file=out )
if "subtitle" in index_entry:
print( index_entry["subtitle"], file=out )
@ -329,7 +330,7 @@ class ExtractIndex( ExtractBase ):
def save_as_json( self, out ):
"""Save the results as JSON."""
entries = []
for index_entry in self._index_entries:
for index_entry in self.index_entries:
buf = []
buf.append( "{{ \"title\": {}".format( jsonval(index_entry["title"]) ) )
if "subtitle" in index_entry:
@ -357,9 +358,11 @@ class ExtractIndex( ExtractBase ):
@click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) )
@click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." )
@click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." )
@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." )
@click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]),
help="Output format."
)
@click.option( "--output","-o","output_fname", required=True, help="Where to save the extracted index." )
def main( pdf_file, args, progress, format, output_fname ):
def main( pdf_file, args, progress, output_fmt, output_fname ):
"""Extract the index from the MMP eASLRB."""
# initialize
@ -371,13 +374,13 @@ def main( pdf_file, args, progress, format, output_fname ):
return
log_msg_stderr( msg_type, msg )
extract = ExtractIndex( args, log_msg )
extract._log_msg( "progress", "Loading PDF: {}", pdf_file )
extract.log_msg( "progress", "Loading PDF: {}", pdf_file )
with PdfDoc( pdf_file ) as pdf:
extract.extract_index( pdf )
# save the results
with open( output_fname, "w", encoding="utf-8" ) as out:
getattr( extract, "save_as_"+format )( out )
getattr( extract, "save_as_"+output_fmt )( out )
if __name__ == "__main__":
main() #pylint: disable=no-value-for-parameter

@ -2,9 +2,8 @@
""" Extract pages from a PDF. """
import click
from pikepdf import Pdf, Page, OutlineItem, Encryption, make_page_destination
from pikepdf import Pdf
from asl_rulebook2.pdf import PdfDoc
from asl_rulebook2.utils import parse_page_numbers
# ---------------------------------------------------------------------

@ -159,7 +159,7 @@ def _make_webapp():
# ---------------------------------------------------------------------
@pytest.fixture( scope="session" )
def webdriver( request ):
def webdriver():
"""Return a webdriver that can be used to control a browser."""
# initialize

@ -0,0 +1,3 @@
[pytest]
addopts = --pylint
norecursedirs = _work_
Loading…
Cancel
Save