|
|
|
""" Miscellaneous utilities. """
|
|
|
|
|
|
|
|
import os
|
|
|
|
import pathlib
|
|
|
|
import tempfile
|
|
|
|
import re
|
|
|
|
import math
|
|
|
|
from io import StringIO
|
|
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
class TempFile:
    """Manage a temp file that can be closed while it's still being used.

    The underlying file is created with delete=False, so closing it does not
    remove it from disk; it is only removed when close() is called with
    delete=True (which is what leaving the context manager does).
    """

    def __init__( self, mode="wb", extn=None, encoding=None ):
        """Record the settings for the temp file (nothing is created until open() is called).

        mode:     file mode passed to tempfile.NamedTemporaryFile.
        extn:     passed as the temp file's suffix.
        encoding: text encoding; if not set, open() uses UTF-8 for text modes
                  and no encoding for binary modes.
        """
        self.mode = mode
        self.extn = extn
        self.encoding = encoding
        self.temp_file = None # nb: the underlying file object (set by open())
        self.name = None # nb: the temp file's name on disk (set by open())

    def open( self ):
        """Allocate a temp file."""
        if self.encoding:
            encoding = self.encoding
        else:
            # nb: binary modes must not be given an encoding
            encoding = "utf-8" if "b" not in self.mode else None
        assert self.temp_file is None
        self.temp_file = tempfile.NamedTemporaryFile(
            mode = self.mode,
            encoding = encoding,
            suffix = self.extn,
            delete = False
        )
        self.name = self.temp_file.name

    def close( self, delete ):
        """Close the temp file, optionally deleting it from disk.

        This is safe to call more than once (e.g. explicitly, then again when
        the context manager exits), and before the file has been opened.
        """
        if self.temp_file is None:
            return # nb: the file was never opened
        self.temp_file.close() # nb: closing an already-closed file is a no-op
        if delete:
            try:
                os.unlink( self.temp_file.name )
            except FileNotFoundError:
                pass # nb: the file has already been deleted

    def write( self, data ):
        """Write data to the temp file."""
        self.temp_file.write( data )

    def __enter__( self ):
        """Enter the context manager (allocates the temp file)."""
        self.open()
        return self

    def __exit__( self, exc_type, exc_val, exc_tb ):
        """Exit the context manager (closes and deletes the temp file)."""
        self.close( delete=True )
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
def strip_html( val ):
    """Strip HTML markup from a string, returning only its text content.

    Falsy values (None, empty string) are returned unchanged.
    """

    if not val:
        return val

    class StripHtml( HTMLParser ):
        """Collect the text content of the parsed HTML, discarding the markup."""
        def __init__( self ):
            super().__init__()
            self.strict = False # nb: presumably for older Python versions - TODO confirm
            self.chunks = []
        def handle_data( self, data ):
            self.chunks.append( data )
        def error( self, message ):
            pass # nb: parse errors are deliberately ignored

    # strip HTML
    html_stripper = StripHtml()
    html_stripper.feed( val )
    # nb: the parser holds back trailing text that might be an incomplete character
    # reference (e.g. "R&D") until it's told there is no more input, so we must
    # close it, otherwise that text is silently lost.
    html_stripper.close()
    return "".join( html_stripper.chunks )
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
def fixup_text( val ):
    """Fixup special characters in a string.

    Smart quotes, dashes and bullets are replaced with plain ASCII equivalents,
    other known non-ASCII characters with their HTML entity, and certain
    fractions (when followed by a double-quote, " MF" or " MP") with their
    HTML entity. Returns the fixed-up string.
    """

    # fixup smart quotes, dashes and other non-ASCII characters
    # NOTE: Each entry is (replacement, characters to be replaced). The non-ASCII
    # characters are replaced with HTML entities (as is done for fractions below);
    # replacing them with the same literal character would change nothing.
    replacements = [
        ( '"', [ "\u00ab", "\u00bb", "\u201c", "\u201d", "\u201e", "\u201f", "\u02dd" ] ),
        ( "'", [ "\u2018", "\u2019", "\u201a", "\u201b", "\u2039", "\u203a" ] ),
        ( " - ", [ "\u2013", "\u2014" ] ),
        ( "-", [ "\u2022" ] ), # nb: bullet
        ( "&le;", [ "\u2264" ] ),
        ( "&ge;", [ "\u2265" ] ),
        ( "&#9651;", [ "\u2206" ] ), # nb: "no leadership DRM" triangle
        ( "&reg;", [ "\u00ae" ] ), # nb: circled R
        ( "&deg;", [ "\u00b0" ] ), # nb: degree sign
        ( "&auml;", [ "\u00e4" ] ),
    ]
    for ch, targets in replacements:
        for target in targets:
            val = val.replace( target, ch )

    # replace fractions with their corresponding HTML entity
    # NOTE: This must be done after smart quotes have been converted, since
    # the look-ahead checks for a plain double-quote.
    for numer, denom in [ (1,2), (1,3), (2,3), (3,8), (5,8) ]:
        val = re.sub(
            r"\b{}/{}(?=(\"| MF| MP))".format( numer, denom ),
            "&frac{}{};".format( numer, denom ),
            val
        )

    return val
|
|
|
|
|
|
|
|
def extract_parens_content( val ):
    """Split a string that starts with "(" at its matching closing parenthesis.

    Returns a (content, remainder) tuple, where content is the text inside the
    outermost parentheses (nested parentheses are kept intact) and remainder is
    everything after the closing parenthesis. If the opening parenthesis is
    never closed, the input string is returned unchanged.
    """
    assert val[0] == "("
    depth = 0
    for idx, char in enumerate( val ):
        if char == "(":
            depth += 1
            continue
        if char != ")":
            continue
        depth -= 1
        if depth <= 0:
            # nb: we've found the parenthesis that closes the opening one
            return val[1:idx], val[idx+1:]
    # nb: if we get here, we have unclosed parentheses :-/
    return val
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
def parse_page_numbers( val, offset=0 ):
    """Parse a list of page numbers.

    We recognize a list of page numbers, and/or ranges e.g. 1,2,5-9,13.
    Whitespace around items (and around the hyphen in a range) is ignored.

    Returns a sorted list of the unique page numbers, each with the offset added.
    A falsy value (e.g. None or an empty string) gives an empty list.
    Raises ValueError if an item is neither a page number nor a range.
    """
    vals = set()
    if val:
        for v in str(val).split( "," ):
            v = v.strip()
            mo = re.search( r"^(\d+)\s*-\s*(\d+)$", v )
            if mo:
                # nb: ranges are inclusive
                vals.update( range( int(mo.group(1)), int(mo.group(2))+1 ) )
            else:
                vals.add( int(v) )
    # nb: we sort the results, so that the order doesn't depend on how sets iterate
    return sorted( v+offset for v in vals )
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
def jsonval( val ):
    """Return a value in a JSON-safe format.

    None becomes "null", booleans become "true"/"false", integers are returned
    as-is, lists become a "[ ... ]" string of their converted elements, and
    strings become a double-quoted, ASCII-only JSON string literal.
    """

    def escape_char( ch ):
        """Return a character as it should appear inside a JSON string literal."""
        if ch == '"':
            return r'\"'
        if ch == "\\":
            return r"\\" # nb: otherwise the backslash would start an escape sequence
        code = ord( ch )
        if 32 <= code <= 127:
            return ch
        if code > 0xffff:
            # nb: \u escapes only take 4 hex digits, so we need a surrogate pair
            code -= 0x10000
            return r"\u{:04x}\u{:04x}".format( 0xd800 + (code >> 10), 0xdc00 + (code & 0x3ff) )
        return r"\u{:04x}".format( code )

    if val is None:
        return "null"
    if isinstance( val, bool ):
        # nb: this must be checked before int, since bool is a subclass of int
        return "true" if val else "false"
    if isinstance( val, int ):
        return val
    if isinstance( val, list ):
        if not val:
            return "[]"
        # nb: integers are returned as-is, so we need to convert them before joining
        vals = [ str( jsonval(v) ) for v in val ]
        return "[ {} ]".format( ", ".join( vals ) )
    if isinstance( val, str ):
        return '"{}"'.format( "".join( escape_char(ch) for ch in val ) )
    assert False, "Unknown JSON data type: {}".format( type(val) )
    return '"???"'
|
|
|
|
|
|
|
|
def change_extn( fname, extn ):
    """Return the filename (as a pathlib.Path) with its extension replaced by the given one."""
    path = pathlib.Path( fname )
    return path.with_suffix( extn )
|
|
|
|
|
|
|
|
def append_text( buf, new ):
    """Return the buffer with new text appended.

    The two are separated by a space, unless the buffer is empty or ends with
    a "/". If the buffer ends with a hyphen, the hyphen is dropped and the new
    text is joined directly onto it.
    """
    if not buf:
        return buf + new
    last_ch = buf[-1]
    if last_ch == "-":
        # nb: join hyphenated words
        return buf[:-1] + new
    separator = "" if last_ch == "/" else " "
    return buf + separator + new
|
|
|
|
|
|
|
|
def plural( n, name1, name2 ):
    """Return a count followed by a name: name1 if the count is 1, otherwise name2."""
    name = name2
    if n == 1:
        name = name1
    return "{} {}".format( n, name )
|
|
|
|
|
|
|
|
def remove_quotes( val ):
    """Remove enclosing quotes from a string.

    The quotes are only removed if the string starts and ends with the same
    quote character (single or double), otherwise the string is returned unchanged.
    """
    # nb: we need at least 2 characters for there to be an enclosing pair of quotes
    # (this also handles empty strings, and a string that is just a single quote).
    if len(val) >= 2 and val[0] in ('"',"'") and val[-1] == val[0]:
        return val[1:-1]
    return val
|
|
|
|
|
|
|
|
def remove_trailing( val, ch ):
    """Remove a trailing character from a string.

    ch can also be a longer string (the whole suffix is removed), or a tuple
    of alternatives (the first one that matches is removed). Only one
    occurrence is removed. The string is returned unchanged if it doesn't
    end with ch.
    """
    suffixes = ch if isinstance( ch, tuple ) else ( ch, )
    for suffix in suffixes:
        # nb: every string ends with an empty string, so we need to skip those
        if suffix and val.endswith( suffix ):
            return val[ : -len(suffix) ]
    return val
|
|
|
|
|
|
|
|
def roundf( val, ndigits ):
    """Round a floating-point value to the given number of decimal places.

    Values exactly half-way between two candidates are rounded up (towards
    positive infinity).
    """
    pow10 = math.pow( 10, ndigits )
    # nb: we use floor() rather than int(), since int() truncates towards zero,
    # which gives the wrong result for negative values (e.g. -1.27 => -1.2).
    return math.floor( pow10 * val + 0.5 ) / pow10
|