From 725bffb963f64badac48caf6c5088b2bad5abb4e Mon Sep 17 00:00:00 2001 From: Taka Date: Sun, 14 Mar 2021 23:43:39 +1100 Subject: [PATCH] Added pylint. --- .pylintrc | 596 +++++++++++++++++++++++++++++++ asl_rulebook2/extract/all.py | 27 +- asl_rulebook2/extract/base.py | 5 +- asl_rulebook2/extract/content.py | 53 +-- asl_rulebook2/extract/index.py | 55 +-- bin/extract_pages.py | 3 +- conftest.py | 2 +- pytest.ini | 3 + 8 files changed, 678 insertions(+), 66 deletions(-) create mode 100644 .pylintrc create mode 100644 pytest.ini diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..3361bd0 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,596 @@ +[MASTER] + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-whitelist= + +# Specify a score threshold to be exceeded before program exits with error. +fail-under=10.0 + +# Add files or directories to the blacklist. They should be base names, not +# paths. +# NOTE: "generated" is for the auto-generated gRPC protobuf stuff. +ignore=generated + +# Add files or directories matching the regex patterns to the blacklist. The +# regex matches against base names, not paths. +ignore-patterns= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use. +jobs=4 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. 
+persistent=yes + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. +confidence= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". 
+disable=print-statement, + parameter-unpacking, + unpacking-in-except, + old-raise-syntax, + backtick, + long-suffix, + old-ne-operator, + old-octal-literal, + import-star-module-level, + non-ascii-bytes-literal, + raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + apply-builtin, + basestring-builtin, + buffer-builtin, + cmp-builtin, + coerce-builtin, + execfile-builtin, + file-builtin, + long-builtin, + raw_input-builtin, + reduce-builtin, + standarderror-builtin, + unicode-builtin, + xrange-builtin, + coerce-method, + delslice-method, + getslice-method, + setslice-method, + no-absolute-import, + old-division, + dict-iter-method, + dict-view-method, + next-method-called, + metaclass-assignment, + indexing-exception, + raising-string, + reload-builtin, + oct-method, + hex-method, + nonzero-method, + cmp-method, + input-builtin, + round-builtin, + intern-builtin, + unichr-builtin, + map-builtin-not-iterating, + zip-builtin-not-iterating, + range-builtin-not-iterating, + filter-builtin-not-iterating, + using-cmp-argument, + eq-without-hash, + div-method, + idiv-method, + rdiv-method, + exception-message-attribute, + invalid-str-codec, + sys-max-int, + bad-python3-import, + deprecated-string-function, + deprecated-str-translate-call, + deprecated-itertools-function, + deprecated-types-field, + next-method-defined, + dict-items-not-iterating, + dict-keys-not-iterating, + dict-values-not-iterating, + deprecated-operator-function, + deprecated-urllib-function, + xreadlines-attribute, + deprecated-sys-function, + exception-escape, + comprehension-escape, + # custom changes follow + import-outside-toplevel, + global-statement, + invalid-name, + too-few-public-methods, + duplicate-code + +# Enable the message, report, category or checker with the given id(s). 
You can +# either give multiple identifiers separated by commas (,) or put this option +# multiple times (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'error', 'warning', 'refactor', and 'convention' +# which contain the number of messages in each category, as well as 'statement' +# which is the total number of statements analyzed. This score is used by the +# global evaluation report (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +#msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=8 + +# Complete names of functions that never return. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of strings which can identify a callback function by name.
A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. 
The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )?<?https?://\S+>?$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1 +# tab). +indent-string='    ' + +# Maximum number of characters on a single line. +max-line-length=120 + +# Maximum number of lines in a module.
+max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. +#class-attribute-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. 
+function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + ex, + Run, + _, + ch,mo,fp,v,s + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. +#variable-rgx= + + +[SIMILARITIES] + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. 
+ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. Available dictionaries: none. To make it work, +# install the python-enchant package. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +spelling-store-unknown-words=no + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +#notes-rgx= + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[DESIGN] + +# Maximum number of arguments for function / method. +max-args=10 + +# Maximum number of attributes for a class (see R0902). +max-attributes=10 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=30 + +# Maximum number of locals for function / method body. +max-locals=30 + +# Maximum number of parents for a class (see R0901). 
+max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=10 + +# Maximum number of statements in function / method body. +max-statements=100 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules=optparse,tkinter.tix + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled). +ext-import-graph= + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled). +import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. Defaults to +# "BaseException, Exception". 
+overgeneral-exceptions=BaseException, + Exception diff --git a/asl_rulebook2/extract/all.py b/asl_rulebook2/extract/all.py index b6b888e..581c905 100755 --- a/asl_rulebook2/extract/all.py +++ b/asl_rulebook2/extract/all.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 """ Extract everything we need from the MMP eASLRB. """ -import sys import os import json import re @@ -22,6 +21,8 @@ class ExtractAll( ExtractBase ): def __init__( self, args, log=None ): super().__init__( None, None, log ) self._args = args + self.extract_index = None + self.extract_content = None def extract_all( self, pdf ): """Extract everything from the eASLRB.""" @@ -33,13 +34,13 @@ class ExtractAll( ExtractBase ): default_args.update( getattr( mod, "_DEFAULT_ARGS" ) ) # extract the index - self._log_msg( "progress", "\nExtracting the index..." ) + self.log_msg( "progress", "\nExtracting the index..." ) args = ExtractBase.parse_args( self._args, default_args ) self.extract_index = ExtractIndex( args, self._log ) self.extract_index.extract_index( pdf ) # extract the content - self._log_msg( "progress", "\nExtracting the content..." ) + self.log_msg( "progress", "\nExtracting the content..." 
) args = ExtractBase.parse_args( self._args, default_args ) self.extract_content = ExtractContent( args, self._log ) self.extract_content.extract_content( pdf ) @@ -52,7 +53,7 @@ class ExtractAll( ExtractBase ): # build an index of known targets targets = {} - for ruleid, target in self.extract_content._targets.items(): + for ruleid, target in self.extract_content.targets.items(): assert ruleid not in targets targets[ ruleid ] = target["caption"] @@ -82,7 +83,7 @@ class ExtractAll( ExtractBase ): # check each index entry first = True - for index_entry in self.extract_index._index_entries: + for index_entry in self.extract_index.index_entries: errors = [] @@ -106,10 +107,10 @@ class ExtractAll( ExtractBase ): # log any errors if errors: if first: - self._log_msg( "warning", "\n=== Unknown targets ===\n" ) + self.log_msg( "warning", "\n=== Unknown targets ===\n" ) first = False errors = [ "- {}".format( e ) for e in errors ] - self._log_msg( "warning", "{}:\n{}", + self.log_msg( "warning", "{}:\n{}", index_entry["caption"], "\n".join(errors) ) @@ -119,11 +120,13 @@ class ExtractAll( ExtractBase ): @click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) ) @click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." ) @click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." ) -@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." ) +@click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]), + help="Output format." +) @click.option( "--save-index","save_index_fname", required=True, help="Where to save the extracted index." ) @click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." ) @click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." 
) -def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname, save_footnotes_fname ): +def main( pdf_file, args, progress, output_fmt, save_index_fname, save_targets_fname, save_footnotes_fname ): """Extract everything we need from the MMP eASLRB.""" # extract everything @@ -132,7 +135,7 @@ def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname return log_msg_stderr( msg_type, msg ) extract = ExtractAll( args, log_msg ) - extract._log_msg( "progress", "Loading PDF: {}", pdf_file ) + extract.log_msg( "progress", "Loading PDF: {}", pdf_file ) with PdfDoc( pdf_file ) as pdf: extract.extract_all( pdf ) @@ -140,8 +143,8 @@ def main( pdf_file, args, progress, format, save_index_fname, save_targets_fname with open( save_index_fname, "w", encoding="utf-8" ) as index_out, \ open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \ open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out: - getattr( extract.extract_index, "save_as_"+format )( index_out ) - getattr( extract.extract_content, "save_as_"+format )( targets_out, footnotes_out ) + getattr( extract.extract_index, "save_as_"+output_fmt )( index_out ) + getattr( extract.extract_content, "save_as_"+output_fmt )( targets_out, footnotes_out ) if __name__ == "__main__": main() #pylint: disable=no-value-for-parameter diff --git a/asl_rulebook2/extract/base.py b/asl_rulebook2/extract/base.py index 71eb2fa..a09ce0c 100644 --- a/asl_rulebook2/extract/base.py +++ b/asl_rulebook2/extract/base.py @@ -1,4 +1,4 @@ -""" Base class for the extraction tools. """ +""" Base class for the extraction classes. 
""" import sys @@ -7,6 +7,7 @@ import click # --------------------------------------------------------------------- class ExtractBase: + """Base class for the extraction classes.""" def __init__( self, args, default_args, log ): self._args = args @@ -43,7 +44,7 @@ class ExtractBase: """Check if an element is using a bold font.""" return elem.fontname.endswith( ( "-Bold", ",Bold", "-BoldMT" ) ) - def _log_msg( self, msg_type, msg, *args, **kwargs ): + def log_msg( self, msg_type, msg, *args, **kwargs ): """Log a message.""" if not self._log: return diff --git a/asl_rulebook2/extract/content.py b/asl_rulebook2/extract/content.py index badd82b..655cbce 100755 --- a/asl_rulebook2/extract/content.py +++ b/asl_rulebook2/extract/content.py @@ -46,8 +46,10 @@ class ExtractContent( ExtractBase ): def __init__( self, args, log=None ): super().__init__( args, _DEFAULT_ARGS, log ) - self._targets = {} + self.targets = {} self._footnotes = {} + self._curr_chapter = self._curr_footnote = self._curr_pageid = None + self._prev_elem = self._top_left_elem = None # prepare to fixup problems in the content fname2 = os.path.join( os.path.dirname(__file__), "data/target-fixups.json" ) with open( fname2, "r", encoding="utf-8" ) as fp: @@ -83,13 +85,13 @@ class ExtractContent( ExtractBase ): # the start of a footnote by a bold number near the start of the line. 
# process each page - for page_no, page, lt_page in PageIterator( pdf ): + for page_no, _, lt_page in PageIterator( pdf ): # prepare to process the next page if page_no > max( page_index.keys() ): break if page_no not in page_index: - self._log_msg( "progress", "- Skipping page {}.", page_no ) + self.log_msg( "progress", "- Skipping page {}.", page_no ) continue if not self._curr_chapter or self._curr_chapter != page_index[page_no]: # we've found the start of a new chapter @@ -98,17 +100,17 @@ class ExtractContent( ExtractBase ): curr_chapter_pageno = 1 else: curr_chapter_pageno += 1 - self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page# + self._curr_pageid = "{}{}".format( # nb: this is the ASL page# (e.g. "A42"), not the PDF page# self._curr_chapter, curr_chapter_pageno ) - self._log_msg( "progress", "- Processing page {} ({})...", page_no, self._curr_pageid ) + self.log_msg( "progress", "- Processing page {} ({})...", page_no, self._curr_pageid ) # process each element on the page curr_caption = None self._top_left_elem = self._prev_elem = None elem_filter = lambda e: isinstance( e, LTChar ) sort_elems = self._curr_pageid not in disable_sort_items - for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ): + for _, elem in PageElemIterator( lt_page, elem_filter=elem_filter, sort_elems=sort_elems ): # keep track of the top-left-most bold element if self._is_bold( elem ): @@ -128,7 +130,8 @@ class ExtractContent( ExtractBase ): # figure out what we've got is_bold = self._is_bold( elem ) - if is_bold and curr_caption and curr_caption[0].isdigit() and 1 < elem.y1 - self._prev_elem.y0 < elem.height/2: + ch = curr_caption[0] if curr_caption else None #pylint: disable=unsubscriptable-object + if is_bold and ch and ch.isdigit() and 1 < elem.y1 - self._prev_elem.y0 < elem.height/2: # the previous bold character looks like a footnote superscript - ignore it curr_caption = None if curr_caption 
and elem.get_text() == " ": @@ -149,9 +152,11 @@ class ExtractContent( ExtractBase ): # continue collecting the caption if self._prev_elem.y0 - elem.y0 > 1: # nb: we just started a new line - curr_caption[0] = append_text( curr_caption[0], elem.get_text() ) + curr_caption[0] = append_text( #pylint: disable=unsupported-assignment-operation + curr_caption[0], elem.get_text() #pylint: disable=unsubscriptable-object + ) else: - curr_caption[0] += elem.get_text() + curr_caption[0] += elem.get_text() #pylint: disable=unsupported-assignment-operation else: # check if this is the first character of the line if self._is_start_of_line( elem, lt_page ): @@ -174,9 +179,9 @@ class ExtractContent( ExtractBase ): # check for unused fixups if self._target_fixups: - self._log_msg( "warning", "Unused fixups: {}", self._target_fixups ) + self.log_msg( "warning", "Unused fixups: {}", self._target_fixups ) if self._footnote_fixups: - self._log_msg( "warning", "Unused fixups: {}", self._footnote_fixups ) + self.log_msg( "warning", "Unused fixups: {}", self._footnote_fixups ) def _save_target( self, caption, page_no, lt_page, elem ): """Save a parsed target.""" @@ -233,14 +238,14 @@ class ExtractContent( ExtractBase ): # save the new target if not ruleid.startswith( self._curr_chapter ): ruleid = self._curr_chapter + ruleid - if ruleid in self._targets: - self._log_msg( "warning", "Ignoring duplicate ruleid: {} (from \"{}\").", + if ruleid in self.targets: + self.log_msg( "warning", "Ignoring duplicate ruleid: {} (from \"{}\").", ruleid, caption[0] ) return if caption_text == "\u2014": caption_text = "-" # nb: for A7.306 :-/ - self._targets[ ruleid ] = { + self.targets[ ruleid ] = { "caption": fixup_text(caption_text), "page_no": page_no, "pos": caption[1], "raw_caption": orig_caption } @@ -292,7 +297,7 @@ class ExtractContent( ExtractBase ): self._curr_footnote[0] = parts[0] self._curr_footnote[1] = parts[1].strip() + " " + self._curr_footnote[1].strip() else: - self._log_msg( 
"warning", "Couldn't split Chapter F footnote caption: {}", self._curr_footnote[0] ) + self.log_msg( "warning", "Couldn't split Chapter F footnote caption: {}", self._curr_footnote[0] ) footnote_id = remove_trailing( self._curr_footnote[0].strip(), "." ) content = self._curr_footnote[1].strip() mo = re.search( r"^(F\.1B|W\.\d+[AB]|[A-Z]?[0-9.]+)", content ) @@ -335,7 +340,7 @@ class ExtractContent( ExtractBase ): prev_content = content content = content.replace( sr[0], sr[1] ) if content == prev_content: - self._log_msg( "warning", "Footnote fixup for \"{}:{}\" had no effect: {}", + self.log_msg( "warning", "Footnote fixup for \"{}:{}\" had no effect: {}", self._curr_chapter, footnote_id, sr[0] ) errors["replace"].append( sr ) @@ -361,7 +366,7 @@ class ExtractContent( ExtractBase ): captions.append( ( ruleid, content[:pos] ) ) content = content[pos+1:].strip() else: - self._log_msg( "warning", "Can't extract footnote caption: {}:{} - {}", + self.log_msg( "warning", "Can't extract footnote caption: {}:{} - {}", self._curr_chapter, footnote_id, content ) @@ -404,7 +409,7 @@ class ExtractContent( ExtractBase ): # save the targets curr_page_no = None - for ruleid, target in self._targets.items(): + for ruleid, target in self.targets.items(): if target["page_no"] != curr_page_no: if curr_page_no: print( file=targets_out ) @@ -448,7 +453,7 @@ class ExtractContent( ExtractBase ): # save the targets targets, curr_chapter = [], None - for ruleid, target in self._targets.items(): + for ruleid, target in self.targets.items(): xpos, ypos = self._get_target_pos( target ) targets.append( "{}: {{ \"caption\": {}, \"page_no\": {}, \"pos\": [{},{}] }}".format( jsonval( ruleid ), @@ -496,10 +501,12 @@ class ExtractContent( ExtractBase ): @click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) ) @click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." 
) @click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." ) -@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." ) +@click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]), + help="Output format." +) @click.option( "--save-targets","save_targets_fname", required=True, help="Where to save the extracted targets." ) @click.option( "--save-footnotes","save_footnotes_fname", required=True, help="Where to save the extracted footnotes." ) -def main( pdf_file, args, progress, format, save_targets_fname, save_footnotes_fname ): +def main( pdf_file, args, progress, output_fmt, save_targets_fname, save_footnotes_fname ): """Extract content from the MMP eASLRB.""" # initialize @@ -511,14 +518,14 @@ def main( pdf_file, args, progress, format, save_targets_fname, save_footnotes_f return log_msg_stderr( msg_type, msg ) extract = ExtractContent( args, log_msg ) - extract._log_msg( "progress", "Loading PDF: {}", pdf_file ) + extract.log_msg( "progress", "Loading PDF: {}", pdf_file ) with PdfDoc( pdf_file ) as pdf: extract.extract_content( pdf ) # save the results with open( save_targets_fname, "w", encoding="utf-8" ) as targets_out, \ open( save_footnotes_fname, "w", encoding="utf-8" ) as footnotes_out: - getattr( extract, "save_as_"+format )( targets_out, footnotes_out ) + getattr( extract, "save_as_"+output_fmt, )( targets_out, footnotes_out ) if __name__ == "__main__": main() #pylint: disable=no-value-for-parameter diff --git a/asl_rulebook2/extract/index.py b/asl_rulebook2/extract/index.py index 74104c6..82f0408 100755 --- a/asl_rulebook2/extract/index.py +++ b/asl_rulebook2/extract/index.py @@ -27,7 +27,8 @@ class ExtractIndex( ExtractBase ): def __init__( self, args, log=None ): super().__init__( args, _DEFAULT_ARGS, log ) - self._index_entries = None + self.index_entries = None + self._prev_y0 = None # prepare to fixup 
problems in the index content fname2 = os.path.join( os.path.dirname(__file__), "data/index-fixups.json" ) with open( fname2, "r", encoding="utf-8" ) as fp: @@ -41,19 +42,19 @@ class ExtractIndex( ExtractBase ): curr_title = curr_content = None # process each page in the index - for page_no, page, lt_page in PageIterator( pdf ): + for page_no, _, lt_page in PageIterator( pdf ): if page_no > max( page_nos ): break if page_no not in page_nos: - self._log_msg( "progress", "- Skipping page {}.", page_no ) + self.log_msg( "progress", "- Skipping page {}.", page_no ) continue - self._log_msg( "progress", "- Processing page {}...", page_no ) + self.log_msg( "progress", "- Processing page {}...", page_no ) # process each element on the page self._prev_y0 = 99999 elem_filter = lambda e: isinstance( e, LTChar ) - for depth, elem in PageElemIterator( lt_page, elem_filter=elem_filter ): + for _, elem in PageElemIterator( lt_page, elem_filter=elem_filter ): # check if we should ignore this element if not self._in_viewport( elem, "index" ): @@ -91,7 +92,7 @@ class ExtractIndex( ExtractBase ): # continue collecting the content text if elem.y0 - self._prev_y0 < -1 and curr_content.endswith( "-" ): # join up hyphenated words - curr_content = curr_content[:-1] + curr_content = curr_content[:-1] #pylint: disable=unsubscriptable-object curr_content += elem.get_text() # loop back to process the next element @@ -103,10 +104,10 @@ class ExtractIndex( ExtractBase ): # check for unused fixups if self._fixups: - self._log_msg( "warning", "Unused fixups: {}", self._fixups ) + self.log_msg( "warning", "Unused fixups: {}", self._fixups ) # process the content for each index entry - if not self._index_entries: + if not self.index_entries: raise RuntimeError( "Didn't find the first title (\"{}\").".format( self._args["first_title"] ) ) self._process_content() @@ -116,10 +117,10 @@ class ExtractIndex( ExtractBase ): # check if we've started parsing index entries # NOTE: There is some bold text at 
the start of the index, which we parse as an index title, # so we don't save anything until we've actually seen the first index entry. - if self._index_entries is None: + if self.index_entries is None: if title != self._args["first_title"]: return - self._index_entries = [] + self.index_entries = [] # initialize title, content = title.strip(), content.strip() @@ -130,24 +131,24 @@ class ExtractIndex( ExtractBase ): if title == "bold": # FUDGE! Some entries have "bold" in their content, using a bold font :-/, which we detect # as the start of a new entry. We fix that up here. - self._index_entries[-1]["content"] = "{} bold {}".format( - self._index_entries[-1]["content"], fixup_text(content) + self.index_entries[-1]["content"] = "{} bold {}".format( + self.index_entries[-1]["content"], fixup_text(content) ) - elif title == "C" and self._index_entries[-1]["title"] == "FFE": + elif title == "C" and self.index_entries[-1]["title"] == "FFE": # FUDGE! The colon in the title for "FFE:C" is non-bold, so we parse this as two separate # index titles ("FFE" and "C") :-/ We can't fix this up in the normal way, since there is # also a real "FFE" entry, so we do it in the code here. - self._index_entries[-1].update( { + self.index_entries[-1].update( { "title": "FFE:C", "content": fixup_text(content) } ) else: # save the new index entry index_entry = self._make_index_entry( title, content ) if index_entry: - self._index_entries.append( index_entry ) + self.index_entries.append( index_entry ) # FUDGE! EX/EXC are mis-parsed as a single index entry - we correct that in the fixups, and here. 
if title == "EX": - self._index_entries.append( self._make_index_entry( "EXC", "Exception" ) ) + self.index_entries.append( self._make_index_entry( "EXC", "Exception" ) ) def _make_index_entry( self, title, content ): """Create a new index entry.""" @@ -167,14 +168,14 @@ class ExtractIndex( ExtractBase ): for sr in fixup.get( "replace", [] ): new_content = content.replace( sr[0], sr[1] ) if new_content == content: - self._log_msg( "warning", "Content fixup had no effect for \"{}\": {}", title, sr[0] ) + self.log_msg( "warning", "Content fixup had no effect for \"{}\": {}", title, sr[0] ) else: content = new_content # replace the content old_content = fixup.get( "old_content" ) if old_content: if fixup_text( content ) != old_content: - self._log_msg( "warning", "Unexpected content for \"{}\" - skipping fixup.", title ) + self.log_msg( "warning", "Unexpected content for \"{}\" - skipping fixup.", title ) else: new_content = fixup.get( "new_content" ) if not new_content: @@ -197,7 +198,7 @@ class ExtractIndex( ExtractBase ): def _process_content( self ): """Extract information out of the index entries into a structured form.""" - for index_entry in self._index_entries: + for index_entry in self.index_entries: # initialize content = index_entry[ "content" ] @@ -295,14 +296,14 @@ class ExtractIndex( ExtractBase ): def save_as_raw( self, out ): """Save the raw results.""" - for index_entry in self._index_entries: + for index_entry in self.index_entries: print( "=== {} ===".format( index_entry["title"] ), file=out ) print( "{}".format( index_entry["raw_content"] ), file=out ) print( file=out ) def save_as_text( self, out ): """Save the results as plain-text.""" - for index_entry in self._index_entries: + for index_entry in self.index_entries: print( "=== {} ===".format( index_entry["title"] ), file=out ) if "subtitle" in index_entry: print( index_entry["subtitle"], file=out ) @@ -329,7 +330,7 @@ class ExtractIndex( ExtractBase ): def save_as_json( self, out ): """Save the 
results as JSON.""" entries = [] - for index_entry in self._index_entries: + for index_entry in self.index_entries: buf = [] buf.append( "{{ \"title\": {}".format( jsonval(index_entry["title"]) ) ) if "subtitle" in index_entry: @@ -357,9 +358,11 @@ class ExtractIndex( ExtractBase ): @click.argument( "pdf_file", nargs=1, type=click.Path(exists=True,dir_okay=False) ) @click.option( "--arg","args", multiple=True, help="Configuration parameter(s) (key=val)." ) @click.option( "--progress/--no-progress", is_flag=True, default=False, help="Log progress messages." ) -@click.option( "--format","-f", default="json", type=click.Choice(["raw","text","json"]), help="Output format." ) +@click.option( "--format","-f","output_fmt", default="json", type=click.Choice(["raw","text","json"]), + help="Output format." +) @click.option( "--output","-o","output_fname", required=True, help="Where to save the extracted index." ) -def main( pdf_file, args, progress, format, output_fname ): +def main( pdf_file, args, progress, output_fmt, output_fname ): """Extract the index from the MMP eASLRB.""" # initialize @@ -371,13 +374,13 @@ def main( pdf_file, args, progress, format, output_fname ): return log_msg_stderr( msg_type, msg ) extract = ExtractIndex( args, log_msg ) - extract._log_msg( "progress", "Loading PDF: {}", pdf_file ) + extract.log_msg( "progress", "Loading PDF: {}", pdf_file ) with PdfDoc( pdf_file ) as pdf: extract.extract_index( pdf ) # save the results with open( output_fname, "w", encoding="utf-8" ) as out: - getattr( extract, "save_as_"+format )( out ) + getattr( extract, "save_as_"+output_fmt )( out ) if __name__ == "__main__": main() #pylint: disable=no-value-for-parameter diff --git a/bin/extract_pages.py b/bin/extract_pages.py index 3696886..7d83641 100755 --- a/bin/extract_pages.py +++ b/bin/extract_pages.py @@ -2,9 +2,8 @@ """ Extract pages from a PDF. 
""" import click -from pikepdf import Pdf, Page, OutlineItem, Encryption, make_page_destination +from pikepdf import Pdf -from asl_rulebook2.pdf import PdfDoc from asl_rulebook2.utils import parse_page_numbers # --------------------------------------------------------------------- diff --git a/conftest.py b/conftest.py index fe753bb..6b0b2fc 100644 --- a/conftest.py +++ b/conftest.py @@ -159,7 +159,7 @@ def _make_webapp(): # --------------------------------------------------------------------- @pytest.fixture( scope="session" ) -def webdriver( request ): +def webdriver(): """Return a webdriver that can be used to control a browser.""" # initialize diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..86c3827 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +addopts = --pylint +norecursedirs = _work_