mirror of https://github.com/gryf/ebook-converter.git
Removed as_unicode function
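This removes the Python 2-era as_unicode() helper (its definition is deleted in the first file below) and updates every caller; most call sites only used it to stringify exceptions for log messages, where plain str() is equivalent on Python 3. A minimal sketch of the migration pattern follows; it is not part of the original commit message, preferred_encoding here is a stand-in for ebook_converter.constants_old.preferred_encoding, and the decode step of force_unicode() is inlined:

    preferred_encoding = 'utf-8'  # stand-in for constants_old.preferred_encoding

    def as_unicode(obj, enc=preferred_encoding):
        # Behaviour of the removed helper: decode bytes, str() everything
        # else, and fall back to repr() when __str__() raises.
        if isinstance(obj, bytes):
            return obj.decode(enc, 'replace')
        try:
            return str(obj)
        except Exception:
            return repr(obj)

    try:
        raise ValueError('boom')
    except Exception as e:
        # before this commit:  log.error('failed: %s' % as_unicode(e))
        # after this commit:   log.error('failed: %s' % e)
        assert as_unicode(e) == str(e) == 'boom'

The rest of the diff is mechanical PEP 8 cleanup applied to the touched files: long lines wrapped, spacing around operators, bare "except:" replaced with "except Exception:".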
@@ -13,10 +13,10 @@ try:
except EnvironmentError:
os.chdir(os.path.expanduser('~'))

from ebook_converter.constants_old import (iswindows, isosx, islinux, isfrozen,
isbsd, preferred_encoding, __appname__, __version__, __author__,
win32event, win32api, winerror, fcntl,
filesystem_encoding, plugins, config_dir)
from ebook_converter.constants_old import iswindows, islinux, isfrozen, \
isbsd, preferred_encoding, __appname__, __version__, __author__, \
win32event, win32api, winerror, fcntl, \
filesystem_encoding, plugins, config_dir
from ebook_converter.startup import winutil, winutilerror
from ebook_converter.utils.icu import safe_chr
@@ -51,23 +51,28 @@ def confirm_config_name(name):
return name + '_again'


_filename_sanitize_unicode = frozenset(('\\', '|', '?', '*', '<', # no2to3
'"', ':', '>', '+', '/') + tuple(map(chr, range(32)))) # no2to3
_filename_sanitize_unicode = frozenset(('\\', '|', '?', '*', '<',
'"', ':', '>', '+', '/') +
tuple(map(chr, range(32))))


def sanitize_file_name(name, substitute='_'):
'''
Sanitize the filename `name`. All invalid characters are replaced by `substitute`.
The set of invalid characters is the union of the invalid characters in Windows,
macOS and Linux. Also removes leading and trailing whitespace.
**WARNING:** This function also replaces path separators, so only pass file names
and not full paths to it.
'''
"""
Sanitize the filename `name`. All invalid characters are replaced by
`substitute`. The set of invalid characters is the union of the invalid
characters in Windows, macOS and Linux. Also removes leading and trailing
whitespace.

**WARNING:** This function also replaces path separators, so only pass
file names and not full paths to it.
"""

if isinstance(name, bytes):
name = name.decode(filesystem_encoding, 'replace')
if isinstance(substitute, bytes):
substitute = substitute.decode(filesystem_encoding, 'replace')
chars = (substitute if c in _filename_sanitize_unicode else c for c in name)
chars = (substitute
if c in _filename_sanitize_unicode else c for c in name)
one = ''.join(chars)
one = re.sub(r'\s', ' ', one).strip()
bname, ext = os.path.splitext(one)
@@ -87,8 +92,8 @@ def prints(*args, **kwargs):
"""
Print unicode arguments safely by encoding them to preferred_encoding
Has the same signature as the print function from Python 3, except for the
additional keyword argument safe_encode, which if set to True will cause the
function to use repr when encoding fails.
additional keyword argument safe_encode, which if set to True will cause
the function to use repr when encoding fails.

Returns the number of bytes written.
"""
@@ -120,7 +125,7 @@ def prints(*args, **kwargs):
except UnicodeEncodeError:
try:
arg = arg.encode('utf-8')
except:
except Exception:
if not safe_encode:
raise
arg = repr(arg)
@@ -131,7 +136,7 @@ def prints(*args, **kwargs):
except UnicodeEncodeError:
try:
arg = arg.encode('utf-8')
except:
except Exception:
if not safe_encode:
raise
arg = repr(arg)
@@ -139,7 +144,7 @@ def prints(*args, **kwargs):
try:
file.write(arg)
count += len(arg)
except:
except Exception:
from polyglot import reprlib
arg = reprlib.repr(arg)
file.write(arg)
@@ -168,22 +173,12 @@ def setup_cli_handlers(logger, level):
elif level == logging.DEBUG:
handler = logging.StreamHandler(sys.stderr)
handler.setLevel(logging.DEBUG)
handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s'))
handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:'
'%(lineno)s: %(message)s'))

logger.addHandler(handler)


def load_library(name, cdll):
if iswindows:
return cdll.LoadLibrary(name)
if isosx:
name += '.dylib'
if hasattr(sys, 'frameworks_dir'):
return cdll.LoadLibrary(os.path.join(getattr(sys, 'frameworks_dir'), name))
return cdll.LoadLibrary(name)
return cdll.LoadLibrary(name+'.so')


def extract(path, dir):
extractor = None
# First use the file header to identify its type
@@ -216,7 +211,8 @@ def fit_image(width, height, pwidth, pheight):
@param height: Height of image
@param pwidth: Width of box
@param pheight: Height of box
@return: scaled, new_width, new_height. scaled is True iff new_width and/or new_height is different from width or height.
@return: scaled, new_width, new_height. scaled is True iff new_width
and/or new_height is different from width or height.
'''
scaled = height > pheight or width > pwidth
if height > pheight:
@@ -262,8 +258,10 @@ def walk(dir):


def strftime(fmt, t=None):
''' A version of strftime that returns unicode strings and tries to handle dates
before 1900 '''
"""
A version of strftime that returns unicode strings and tries to handle
dates before 1900
"""
if not fmt:
return ''
if t is None:
@@ -272,7 +270,7 @@ def strftime(fmt, t=None):
t = t.timetuple()
early_year = t[0] < 1900
if early_year:
replacement = 1900 if t[0]%4 == 0 else 1901
replacement = 1900 if t[0] % 4 == 0 else 1901
fmt = fmt.replace('%Y', '_early year hack##')
t = list(t)
orig_year = t[0]
@@ -301,27 +299,33 @@ def my_unichr(num):


def entity_to_unicode(match, exceptions=[], encoding='cp1252',
result_exceptions={}):
'''
result_exceptions={}):
"""
:param match: A match object such that '&'+match.group(1)';' is the entity.

:param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234'
:param exceptions: A list of entities to not convert (Each entry is the
name of the entity, for e.g. 'apos' or '#1234'

:param encoding: The encoding to use to decode numeric entities between 128 and 256.
If None, the Unicode UCS encoding is used. A common encoding is cp1252.
:param encoding: The encoding to use to decode numeric entities between
128 and 256. If None, the Unicode UCS encoding is used.
A common encoding is cp1252.

:param result_exceptions: A mapping of characters to entities. If the
result is in result_exceptions,
result_exception[result] is returned instead.
Convenient way to specify exception for things
like < or > that can be specified by various
actual entities.
"""

:param result_exceptions: A mapping of characters to entities. If the result
is in result_exceptions, result_exception[result] is returned instead.
Convenient way to specify exception for things like < or > that can be
specified by various actual entities.
'''
def check(ch):
return result_exceptions.get(ch, ch)

ent = match.group(1)
if ent in exceptions:
return '&'+ent+';'
if ent in {'apos', 'squot'}: # squot is generated by some broken CMS software
# squot is generated by some broken CMS software
if ent in {'apos', 'squot'}:
return check("'")
if ent == 'hellips':
ent = 'hellip'
@@ -331,7 +335,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
num = int(ent[2:], 16)
else:
num = int(ent[1:])
except:
except Exception:
return '&'+ent+';'
if encoding is None or num > 255:
return check(my_unichr(num))
@@ -394,15 +398,6 @@ def force_unicode(obj, enc=preferred_encoding):
return obj


def as_unicode(obj, enc=preferred_encoding):
if not isinstance(obj, bytes):
try:
obj = str(obj)
except Exception:
obj = repr(obj)
return force_unicode(obj, enc=enc)


def url_slash_cleaner(url):
'''
Removes redundant /'s from url's.
@@ -1,43 +1,37 @@
import functools, re, json
from math import ceil
import functools
import json
import math
import re

from ebook_converter import entity_to_unicode, as_unicode
from ebook_converter import entity_to_unicode


__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'

convert_entities = functools.partial(entity_to_unicode,
result_exceptions={
'<' : '&lt;',
'>' : '&gt;',
"'" : '&apos;',
'"' : '&quot;',
'&' : '&amp;',
})
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
result_exceptions={'<': '&lt;',
'>': '&gt;',
"'": '&apos;',
'"': '&quot;',
'&': '&amp;'})
_span_pat = re.compile('<span.*?</span>', re.DOTALL | re.IGNORECASE)

LIGATURES = {
# '\u00c6': 'AE',
# '\u00e6': 'ae',
# '\u0152': 'OE',
# '\u0153': 'oe',
# '\u0132': 'IJ',
# '\u0133': 'ij',
# '\u1D6B': 'ue',
'\uFB00': 'ff',
'\uFB01': 'fi',
'\uFB02': 'fl',
'\uFB03': 'ffi',
'\uFB04': 'ffl',
'\uFB05': 'ft',
'\uFB06': 'st',
}
LIGATURES = {'\uFB00': 'ff',
'\uFB01': 'fi',
'\uFB02': 'fl',
'\uFB03': 'ffi',
'\uFB04': 'ffl',
'\uFB05': 'ft',
'\uFB06': 'st'}
# '\u00c6': 'AE',
# '\u00e6': 'ae',
# '\u0152': 'OE',
# '\u0153': 'oe',
# '\u0132': 'IJ',
# '\u0133': 'ij',
# '\u1D6B': 'ue',

_ligpat = re.compile('|'.join(LIGATURES))
@@ -83,17 +77,18 @@ def smarten_punctuation(html, log=None):


class DocAnalysis(object):
'''
Provides various text analysis functions to determine how the document is structured.
format is the type of document analysis will be done against.
"""
Provides various text analysis functions to determine how the document is
structured. format is the type of document analysis will be done against.
raw is the raw text to determine the line length to use for wrapping.
Blank lines are excluded from analysis
'''
"""

def __init__(self, format='html', raw=''):
raw = raw.replace('&nbsp;', ' ')
if format == 'html':
linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)',
re.DOTALL)
elif format == 'pdf':
linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html':
@@ -103,13 +98,13 @@ class DocAnalysis(object):
self.lines = linere.findall(raw)

def line_length(self, percent):
'''
"""
Analyses the document to find the median line length.
percentage is a decimal number, 0 - 1 which is used to determine
how far in the list of line lengths to use. The list of line lengths is
ordered smallest to largest and does not include duplicates. 0.5 is the
median value.
'''
"""
lengths = []
for line in self.lines:
if len(line) > 0:
@@ -121,7 +116,7 @@ class DocAnalysis(object):
lengths = list(set(lengths))
total = sum(lengths)
avg = total / len(lengths)
max_line = ceil(avg * 2)
max_line = math.ceil(avg * 2)

lengths = sorted(lengths)
for i in range(len(lengths) - 1, -1, -1):
@@ -138,31 +133,32 @@ class DocAnalysis(object):
return lengths[index]

def line_histogram(self, percent):
'''
Creates a broad histogram of the document to determine whether it incorporates hard
line breaks. Lines are sorted into 20 'buckets' based on length.
percent is the percentage of lines that should be in a single bucket to return true
The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
'''
minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
maxLineLength=1900 # Discard larger than this to stay in range
buckets=20 # Each line is divided into a bucket based on length
"""
Creates a broad histogram of the document to determine whether it
incorporates hard line breaks. Lines are sorted into 20 'buckets'
based on length. percent is the percentage of lines that should be in
a single bucket to return true The majority of the lines will exist in
1-2 buckets in typical docs with hard line breaks
"""
minLineLength = 20  # Ignore lines under 20 chars (typical of spaces)
maxLineLength = 1900  # Discard larger than this to stay in range
buckets = 20  # Each line is divided into a bucket based on length

# print("there are "+str(len(lines))+" lines")
# max = 0
# for line in self.lines:
# l = len(line)
# if l > max:
# max = l
# _l = len(line)
# if _l > max:
# max = _l
# print("max line found is "+str(max))
# Build the line length histogram
hRaw = [0 for i in range(0,buckets)]
hRaw = [0 for i in range(0, buckets)]
for line in self.lines:
l = len(line)
if l > minLineLength and l < maxLineLength:
l = int(l // 100)
# print("adding "+str(l))
hRaw[l]+=1
_l = len(line)
if _l > minLineLength and _l < maxLineLength:
_l = int(_l // 100)
# print("adding "+str(_l))
hRaw[_l] += 1

# Normalize the histogram into percents
totalLines = len(self.lines)
@@ -175,7 +171,7 @@ class DocAnalysis(object):

# Find the biggest bucket
maxValue = 0
for i in range(0,len(h)):
for i in range(0, len(h)):
if h[i] > maxValue:
maxValue = h[i]
@@ -188,36 +184,42 @@ class DocAnalysis(object):


class Dehyphenator(object):
'''
Analyzes words to determine whether hyphens should be retained/removed. Uses the document
itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
scientific words. The primary disadvantage is that words appearing only once in the document
retain hyphens.
'''
"""
Analyzes words to determine whether hyphens should be retained/removed.
Uses the document itself is as a dictionary. This method handles all
languages along with uncommon, made-up, and scientific words. The primary
disadvantage is that words appearing only once in the document retain
hyphens.
"""

def __init__(self, verbose=0, log=None):
self.log = log
self.verbose = verbose
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
# only remove if it's not already the point of hyphenation
self.suffix_string = (
"((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
"(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
"(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
# Add common suffixes to the regex below to increase the likelihood of
# a match - don't add suffixes which are also complete words, such as
# 'able' or 'sex' only remove if it's not already the point of
# hyphenation
self.suffix_string = ("((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?"
"|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|"
"istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
"(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier"
"|al|ex|ian)$")
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.removesuffixes = re.compile(r"%s" % self.suffix_string,
re.IGNORECASE)
# remove prefixes if the prefix was not already the point of
# hyphenation
self.prefix_string = '^(dis|re|un|in|ex)'
self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
self.removeprefix = re.compile(r'%s' % self.prefix_string,
re.IGNORECASE)

def dehyphenate(self, match):
firsthalf = match.group('firstpart')
secondhalf = match.group('secondpart')
try:
wraptags = match.group('wraptags')
except:
except Exception:
wraptags = ''
hyphenated = str(firsthalf) + "-" + str(secondhalf)
dehyphenated = str(firsthalf) + str(secondhalf)
@@ -231,65 +233,84 @@ class Dehyphenator(object):
self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
try:
searchresult = self.html.find(lookupword.lower())
except:
except Exception:
return hyphenated
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1:
if self.verbose > 2:
self.log("    Cleanup:returned dehyphenated word: " + dehyphenated)
self.log("    Cleanup:returned dehyphenated word: " +
dehyphenated)
return dehyphenated
elif self.html.find(hyphenated) != -1:
if self.verbose > 2:
self.log("    Cleanup:returned hyphenated word: " + hyphenated)
self.log("    Cleanup:returned hyphenated word: " +
hyphenated)
return hyphenated
else:
if self.verbose > 2:
self.log("    Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
self.log("    Cleanup:returning original text " +
firsthalf + " + linefeed " + secondhalf)
return firsthalf+'\u2014'+wraptags+secondhalf

else:
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
if (self.format == 'individual_words' and
len(firsthalf) + len(secondhalf) <= 6):
if self.verbose > 2:
self.log("too short, returned hyphenated word: " + hyphenated)
self.log("too short, returned hyphenated word: " +
hyphenated)
return hyphenated
if len(firsthalf) <= 2 and len(secondhalf) <= 2:
if self.verbose > 2:
self.log("too short, returned hyphenated word: " + hyphenated)
self.log("too short, returned hyphenated word: " +
hyphenated)
return hyphenated
if self.html.find(lookupword) != -1 or searchresult != -1:
if self.verbose > 2:
self.log("     returned dehyphenated word: " + dehyphenated)
self.log("     returned dehyphenated word: " +
dehyphenated)
return dehyphenated
else:
if self.verbose > 2:
self.log("      returned hyphenated word: " + hyphenated)
self.log("      returned hyphenated word: " +
hyphenated)
return hyphenated

def __call__(self, html, format, length=1):
self.html = html
self.format = format
if format == 'html':
intextmatch = re.compile((
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)'
r'(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
r'\s*(</[iubp]>\s*){1,2}'
r'(?P<up2threeblanks><(p|div)[^>]*>\s*'
r'(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+)'
r'{0,3}\s*(<[iubp][^>]*>\s*){1,2}'
r'(<span[^>]*>)?)\s*(?P<secondpart>'
r'[\w\d]+)' % length)
elif format == 'pdf':
intextmatch = re.compile((
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|'
r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)'
r'\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*'
r'<[iub]>)\s*(?P<secondpart>[\w\d]+)' %
length)
elif format == 'txt':
intextmatch = re.compile(
'(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
intextmatch = re.compile('(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)'
'(\u0020|\u0009)*(?P<wraptags>'
'(\n(\u0020|\u0009)*)+)(?P<secondpart>'
'[\\w\\d]+)' % length)
elif format == 'individual_words':
intextmatch = re.compile(
r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
intextmatch = re.compile(r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*'
r'(?P<secondpart>\w+)(?![^<]*?>)',
re.UNICODE)
elif format == 'html_cleanup':
intextmatch = re.compile(
r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)'
r'(?P<wraptags></span>\s*(</[iubp]>\s*'
r'<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>'
r'\s*<[iubp][^>]*>)?\s*(?P<secondpart>'
r'[\w\d]+)')
elif format == 'txt_cleanup':
intextmatch = re.compile(
r'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|‐)'
r'(?P<wraptags>\s+)(?P<secondpart>'
r'[\w\d]+)')

html = intextmatch.sub(self.dehyphenate, html)
return html
@@ -299,18 +320,18 @@ class CSSPreProcessor(object):

# Remove some of the broken CSS Microsoft products
# create
MS_PAT = re.compile(r'''
MS_PAT = re.compile(r'''
(?P<start>^|;|\{)\s* # The end of the previous rule or block start
(%s).+? # The invalid selectors
(?P<end>$|;|\}) # The end of the declaration
'''%'mso-|panose-|text-underline|tab-interval',
re.MULTILINE|re.IGNORECASE|re.VERBOSE)
''' % 'mso-|panose-|text-underline|tab-interval',
re.MULTILINE | re.IGNORECASE | re.VERBOSE)

def ms_sub(self, match):
end = match.group('end')
try:
start = match.group('start')
except:
except Exception:
start = ''
if end == ';':
end = ''
@@ -332,7 +353,7 @@ class CSSPreProcessor(object):
for line in data.splitlines():
ll = line.lstrip()
if not (namespaced or ll.startswith('@import') or not ll or
ll.startswith('@charset')):
ll.startswith('@charset')):
ans.append(XHTML_CSS_NAMESPACE.strip())
namespaced = True
ans.append(line)
@@ -359,7 +380,8 @@ def accent_regex(accent_maps, letter_before=False):
args = ''.join(accent_cat), ''.join(letters)
accent_group, letter_group = 1, 2

pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args),
re.UNICODE)

def sub(m):
lmap = accent_maps[m.group(accent_group)]
@@ -371,83 +393,96 @@ def accent_regex(accent_maps, letter_before=False):
def html_preprocess_rules():
ans = getattr(html_preprocess_rules, 'ans', None)
if ans is None:
ans = html_preprocess_rules.ans = [
# Remove huge block of contiguous spaces as they slow down
# the following regexes pretty badly
(re.compile(r'\s{10000,}'), ''),
# Some idiotic HTML generators (Frontpage I'm looking at you)
# Put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
sanitize_head),
# Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling, MS Word
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
]
ans = [
# Remove huge block of contiguous spaces as they slow down
# the following regexes pretty badly
(re.compile(r'\s{10000,}'), ''),
# Some idiotic HTML generators (Frontpage I'm looking at you)
# Put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>\n*(.*?)\n*</head>',
re.IGNORECASE | re.DOTALL), sanitize_head),
# Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling,
# MS Word
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
'')]
html_preprocess_rules.ans = ans
return ans


def pdftohtml_rules():
ans = getattr(pdftohtml_rules, 'ans', None)
if ans is None:
ans = pdftohtml_rules.ans = [
accent_regex({
'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
'´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
'¸': 'cC:çÇ',
'˛': 'aAeE:ąĄęĘ',
'˙': 'zZ:żŻ',
'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
'°': 'uU:ůŮ',
}),
ans = [accent_regex({'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
'´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚ'
'úÚźŹ',
'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
'¸': 'cC:çÇ',
'˛': 'aAeE:ąĄęĘ',
'˙': 'zZ:żŻ',
'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
'°': 'uU:ůŮ'}),
accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'},
letter_before=True),

accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),
# If pdf printed from a browser then the header/footer has a
# reliable pattern
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?'
r'[A-Z].*<br>(?=\s*<hr>))',
re.IGNORECASE), lambda match: ''),

# If pdf printed from a browser then the header/footer has a reliable pattern
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
# Center separator lines
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'),
lambda match: '<p>\n<p style="text-align:center">' +
match.group('break') + '</p>'),

# Center separator lines
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
# Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), ''),

# Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), ''),
# Remove gray background
(re.compile(r'<BODY[^<>]+>'), '<BODY>'),

# Remove gray background
(re.compile(r'<BODY[^<>]+>'), '<BODY>'),
# Convert line breaks to paragraphs
(re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
(re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
(re.compile(r'\s*</body>'), '</p>\n</body>'),

# Convert line breaks to paragraphs
(re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
(re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
(re.compile(r'\s*</body>'), '</p>\n</body>'),

# Clean up spaces
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
# Add space before and after italics
(re.compile(r'(?<!“)<i>'), ' <i>'),
(re.compile(r'</i>(?=\w)'), '</i> '),
]
# Clean up spaces
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
# Add space before and after italics
(re.compile(r'(?<!“)<i>'), ' <i>'),
(re.compile(r'</i>(?=\w)'), '</i> ')]
pdftohtml_rules.ans = ans
return ans


def book_designer_rules():
ans = getattr(book_designer_rules, 'ans', None)
if ans is None:
ans = book_designer_rules.ans = [
# HR
(re.compile('<hr>', re.IGNORECASE),
lambda match : '<span style="page-break-after:always"> </span>'),
# Create header tags
(re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
]
ans = [(re.compile('<hr>', re.IGNORECASE),
lambda match: '<span style="page-break-after:always"> '
'</span>'),
# Create header tags
(re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)'
r'(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match: '<h1 id="BookTitle" align="%s">%s</h1>' %
(match.group(2) if match.group(2) else 'center',
match.group(3))),
(re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)'
r'(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match: '<h2 id="BookAuthor" align="%s">%s</h2>' %
(match.group(2) if match.group(2) else 'center',
match.group(3))),
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>',
re.IGNORECASE | re.DOTALL),
lambda match: '<h2 class="title">%s</h2>' % (match.group(1),)),
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>',
re.IGNORECASE | re.DOTALL),
lambda match: '<h3 class="subtitle">%s</h3>' %
(match.group(1),))]
book_designer_rules.ans = ans
return ans

@@ -470,7 +505,7 @@ class HTMLPreProcessor(object):
return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]

def __call__(self, html, remove_special_chars=None,
get_preprocess_html=False):
get_preprocess_html=False):
if remove_special_chars is not None:
html = remove_special_chars.sub('', html)
html = html.replace('\0', '')
@@ -487,13 +522,14 @@ class HTMLPreProcessor(object):
start_rules = []

if not getattr(self.extra_opts, 'keep_ligatures', False):
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
html = _ligpat.sub(lambda m: LIGATURES[m.group()], html)

user_sr_rules = {}
# Function for processing search and replace

def do_search_replace(search_pattern, replace_txt):
from ebook_converter.ebooks.conversion.search_replace import compile_regular_expression
from ebook_converter.ebooks.conversion.search_replace import \
compile_regular_expression
try:
search_re = compile_regular_expression(search_pattern)
if not replace_txt:
@@ -502,11 +538,11 @@ class HTMLPreProcessor(object):
user_sr_rules[(search_re, replace_txt)] = search_pattern
except Exception as e:
self.log.error('Failed to parse %r regexp because %s' %
(search, as_unicode(e)))
(search, e))

# search / replace using the sr?_search / sr?_replace options
for i in range(1, 4):
search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
search, replace = 'sr%d_search' % i, 'sr%d_replace' % i
search_pattern = getattr(self.extra_opts, search, '')
replace_txt = getattr(self.extra_opts, replace, '')
if search_pattern:
@@ -520,31 +556,35 @@ class HTMLPreProcessor(object):
do_search_replace(search_pattern, replace_txt)

end_rules = []
# delete soft hyphens - moved here so it's executed after header/footer removal
# delete soft hyphens - moved here so it's executed after
# header/footer removal
if is_pdftohtml:
# unwrap/delete soft hyphens
end_rules.append((re.compile(
r'[](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
end_rules.append((re.compile(r'[](</p>\s*<p>\s*)+\s*'
r'(?=[\[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(
r'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))
end_rules.append((re.compile(r'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+'
r'\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'),
lambda match: ''))

length = -1
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
docanalysis = DocAnalysis('pdf', html)
length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
length = docanalysis.line_length(getattr(self.extra_opts,
'unwrap_factor'))
if length:
# print("The pdf line length returned is " + str(length))
# unwrap em/en dashes
end_rules.append((re.compile(
r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
end_rules.append((re.compile(r'(?<=.{%i}[–—])\s*<p>\s*'
r'(?=[\[a-z\d])' % length),
lambda match: ''))
end_rules.append(
# Un wrap using punctuation
(re.compile((
r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]'
r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
)
(re.compile((r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçą'
r'ężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4})'
r';))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*'
r'<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])') %
length, re.UNICODE), wrap_lines))

for rule in html_preprocess_rules() + start_rules:
html = rule[0].sub(rule[1], html)
@@ -567,7 +607,7 @@ class HTMLPreProcessor(object):
name, i = None, 0
while not name or os.path.exists(os.path.join(odir, name)):
i += 1
name = '%04d.html'%i
name = '%04d.html' % i
with open(os.path.join(odir, name), 'wb') as f:
f.write(raw.encode('utf-8'))

@@ -578,20 +618,20 @@ class HTMLPreProcessor(object):
html = rule[0].sub(rule[1], html)
except Exception as e:
if rule in user_sr_rules:
self.log.error(
'User supplied search & replace rule: %s -> %s '
'failed with error: %s, ignoring.'%(
user_sr_rules[rule], rule[1], e))
self.log.error('User supplied search & replace rule: %s '
'-> %s failed with error: %s, ignoring.' %
(user_sr_rules[rule], rule[1], e))
else:
raise

if is_pdftohtml and length > -1:
# Dehyphenate
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
html = dehyphenator(html,'html', length)
html = dehyphenator(html, 'html', length)

if is_pdftohtml:
from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
from ebook_converter.ebooks.conversion.utils import \
HeuristicProcessor
pdf_markup = HeuristicProcessor(self.extra_opts, None)
totalwords = 0
if pdf_markup.get_word_count(html) > 7000:
@@ -613,23 +653,26 @@ class HTMLPreProcessor(object):
from ebook_converter.utils.localization import get_udc
from ebook_converter.utils.mreplace import MReplace
unihandecoder = get_udc()
mr = MReplace(data={'«':'<'*3, '»':'>'*3})
mr = MReplace(data={'«': '<' * 3, '»': '>' * 3})
html = mr.mreplace(html)
html = unihandecoder.decode(html)

if getattr(self.extra_opts, 'enable_heuristics', False):
from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
from ebook_converter.ebooks.conversion.utils import \
HeuristicProcessor
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
html = preprocessor(html)

if is_pdftohtml:
html = html.replace('<!-- created by ebook-converter\'s pdftohtml -->', '')
html = html.replace('<!-- created by ebook-converter\'s '
'pdftohtml -->', '')

if getattr(self.extra_opts, 'smarten_punctuation', False):
html = smarten_punctuation(html, self.log)

try:
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
unsupported_unicode_chars = (self.extra_opts.output_profile
.unsupported_unicode_chars)
except AttributeError:
unsupported_unicode_chars = ''
if unsupported_unicode_chars:
@@ -10,19 +10,13 @@ import urllib.parse
from ebook_converter.ebooks.oeb.base import urlunquote
from ebook_converter.ebooks.chardet import detect_xml_encoding
from ebook_converter.constants_old import iswindows
from ebook_converter import unicode_path, as_unicode, replace_entities


__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from ebook_converter import unicode_path, replace_entities


class Link(object):

'''
"""
Represents a link in a HTML file.
'''
"""

@classmethod
def url_to_local_path(cls, url, base):
@@ -31,7 +25,8 @@ class Link(object):
if iswindows and path.startswith('/'):
path = path[1:]
isabs = True
path = urllib.parse.urlunparse(('', '', path, url.params, url.query, ''))
path = urllib.parse.urlunparse(('', '', path, url.params, url.query,
''))
path = urlunquote(path)
if isabs or os.path.isabs(path):
return path
@@ -39,17 +34,18 @@ class Link(object):

def __init__(self, url, base):
'''
:param url: The url this link points to. Must be an unquoted unicode string.
:param base: The base directory that relative URLs are with respect to.
Must be a unicode string.
:param url: The url this link points to. Must be an unquoted unicode
string.
:param base: The base directory that relative URLs are with respect
to. Must be a unicode string.
'''
assert isinstance(url, str) and isinstance(base, str)
self.url = url
self.parsed_url = urllib.parse.urlparse(self.url)
self.is_local = self.parsed_url.scheme in ('', 'file')
self.url = url
self.parsed_url = urllib.parse.urlparse(self.url)
self.is_local = self.parsed_url.scheme in ('', 'file')
self.is_internal = self.is_local and not bool(self.parsed_url.path)
self.path = None
self.fragment = urlunquote(self.parsed_url.fragment)
self.path = None
self.fragment = urlunquote(self.parsed_url.fragment)
if self.is_local and not self.is_internal:
self.path = self.url_to_local_path(self.parsed_url, base)

@@ -62,7 +58,7 @@ class Link(object):
return self.path == getattr(other, 'path', other)

def __str__(self):
return 'Link: %s --> %s'%(self.url, self.path)
return 'Link: %s --> %s' % (self.url, self.path)


class IgnoreFile(Exception):
@@ -84,24 +80,25 @@ class HTMLFile(object):
The encoding of the file is available as :member:`encoding`.
'''

HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
LINK_PAT = re.compile(
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
re.DOTALL|re.IGNORECASE)
LINK_PAT = re.compile(r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|'
r'(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
re.DOTALL | re.IGNORECASE)

def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
'''
def __init__(self, path_to_html_file, level, encoding, verbose,
referrer=None):
"""
:param level: The level of this file. Should be 0 for the root file.
:param encoding: Use `encoding` to decode HTML.
:param referrer: The :class:`HTMLFile` that first refers to this file.
'''
self.path = unicode_path(path_to_html_file, abs=True)
self.title = os.path.splitext(os.path.basename(self.path))[0]
self.base = os.path.dirname(self.path)
self.level = level
"""
self.path = unicode_path(path_to_html_file, abs=True)
self.title = os.path.splitext(os.path.basename(self.path))[0]
self.base = os.path.dirname(self.path)
self.level = level
self.referrer = referrer
self.links = []
self.links = []

try:
with open(self.path, 'rb') as f:
@@ -112,18 +109,21 @@ class HTMLFile(object):
header = header.decode(encoding)
except ValueError:
pass
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
self.is_binary = level > 0 and not bool(self
.HTML_PAT
.search(header))
if not self.is_binary:
src += f.read()
except IOError as err:
msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
msg = ('Could not read from file: %s with error: %s' %
(self.path, str(err)))
if level == 0:
raise IOError(msg)
raise IgnoreFile(msg, err.errno)

if not src:
if level == 0:
raise ValueError('The file %s is empty'%self.path)
raise ValueError('The file %s is empty' % self.path)
self.is_binary = True

if not self.is_binary:
@@ -145,7 +145,9 @@ class HTMLFile(object):
return hash(self.path)

def __str__(self):
return 'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
return 'HTMLFile:%d:%s:%s' % (self.level,
'b' if self.is_binary else 'a',
self.path)

def __repr__(self):
return str(self)
@@ -191,20 +193,22 @@ def depth_first(root, flat, visited=None):
visited.add(hf)


def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
'''
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0,
encoding=None):
"""
Recursively traverse all links in the HTML file.

:param max_levels: Maximum levels of recursion. Must be non-negative. 0
implies that no links in the root HTML file are followed.
:param encoding: Specify character encoding of HTML files. If `None` it is
auto-detected.
:return: A pair of lists (breadth_first, depth_first). Each list contains
:class:`HTMLFile` objects.
'''
implies that no links in the root HTML file are
followed.
:param encoding: Specify character encoding of HTML files. If `None` it
is auto-detected.
:return: A pair of lists (breadth_first, depth_first). Each list
contains :class:`HTMLFile` objects.
"""
assert max_levels >= 0
level = 0
flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
next_level = list(flat)
while level < max_levels and len(next_level) > 0:
level += 1
@@ -215,9 +219,10 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None
if link.path is None or link.path in flat:
continue
try:
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
nf = HTMLFile(link.path, level, encoding, verbose,
referrer=hf)
if nf.is_binary:
raise IgnoreFile('%s is a binary file'%nf.path, -1)
raise IgnoreFile('%s is a binary file' % nf.path, -1)
nl.append(nf)
flat.append(nf)
except IgnoreFile as err:
@@ -244,7 +249,8 @@ def get_filelist(htmlfile, dir, opts, log):
log.info('Building file list...')
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose,
encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
encoding=opts
.input_encoding)[0 if opts.breadth_first else 1]
if opts.verbose:
log.debug('\tFound files...')
for f in filelist:
@@ -21,7 +21,6 @@ from ebook_converter import force_unicode
from ebook_converter.constants_old import filesystem_encoding, __version__
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ebooks.conversion.preprocess import CSSPreProcessor
from ebook_converter import as_unicode
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.utils.short_uuid import uuid4
@@ -419,7 +418,7 @@ def urlnormalize(href):
parts = urllib.parse.urlparse(href)
except ValueError as e:
raise ValueError('Failed to parse the URL: %r with underlying error: '
'%s' % (href, as_unicode(e)))
'%s' % (href, e))
if not parts.scheme or parts.scheme == 'file':
path, frag = urllib.parse.urldefrag(href)
parts = ('', '', path, '', '', frag)
@@ -723,7 +722,7 @@ class Metadata(object):
% (parse_utils.barename(self.term), self.value, self.attrib)

def __str__(self):
return as_unicode(self.value)
return str(self.value)

def to_opf1(self, dcmeta=None, xmeta=None, nsrmap={}):
attrib = {}
@@ -14,7 +14,7 @@ from lxml.etree import XPath as _XPath
from lxml import etree

from ebook_converter import constants as const
from ebook_converter import as_unicode, force_unicode
from ebook_converter import force_unicode
from ebook_converter.ebooks.epub import rules
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.split import do_split
@@ -126,7 +126,7 @@ class Split(object):
except SelectorError as err:
self.log.warn('Ignoring page breaks specified with invalid '
'CSS selector: %r (%s)' %
(selector, as_unicode(err)))
(selector, err))

for i, elem in enumerate(item.data.iter('*')):
try:
@@ -2,17 +2,13 @@ import os
from collections import defaultdict
from threading import Thread

from ebook_converter import walk, prints, as_unicode
from ebook_converter.constants_old import (config_dir, iswindows, isosx, plugins, DEBUG,
isworker, filesystem_encoding)
from ebook_converter import walk, prints
from ebook_converter.constants_old import iswindows, isosx
from ebook_converter.constants_old import plugins, DEBUG, isworker
from ebook_converter.constants_old import filesystem_encoding
from ebook_converter.utils.fonts.metadata import FontMetadata, UnsupportedFont


__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


class NoFonts(ValueError):
pass

@@ -38,7 +34,7 @@ def fc_list():
return default_font_dirs()
try:
lib = ctypes.CDLL(lib)
except:
except Exception:
return default_font_dirs()

prototype = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p)
@@ -97,7 +93,7 @@ def font_dirs():
if iswindows:
winutil, err = plugins['winutil']
if err:
raise RuntimeError('Failed to load winutil: %s'%err)
raise RuntimeError('Failed to load winutil: %s' % err)
try:
return [winutil.special_folder_path(winutil.CSIDL_FONTS)]
except ValueError:
@@ -126,9 +122,10 @@ def font_priority(font):
width_normal = font['font-stretch'] == 'normal'
weight_normal = font['font-weight'] == 'normal'
num_normal = sum(filter(None, (style_normal, width_normal,
weight_normal)))
weight_normal)))
subfamily_name = (font['wws_subfamily_name'] or
font['preferred_subfamily_name'] or font['subfamily_name'])
font['preferred_subfamily_name'] or
font['subfamily_name'])
if num_normal == 3 and subfamily_name == 'Regular':
return 0
if num_normal == 3:
@@ -167,7 +164,9 @@ def build_families(cached_fonts, folders, family_attr='font-family'):
if fingerprint in fmap:
opath = fmap[fingerprint]['path']
npath = font['path']
if path_significance(npath, folders) >= path_significance(opath, folders):
if path_significance(npath,
folders) >= path_significance(opath,
folders):
remove.append(fmap[fingerprint])
fmap[fingerprint] = font
else:
@@ -214,7 +213,7 @@ class FontScanner(Thread):
try:
return self.font_family_map[family.lower()]
except KeyError:
raise NoFonts('No fonts found for the family: %r'%family)
raise NoFonts('No fonts found for the family: %r' % family)

def legacy_fonts_for_family(self, family):
'''
@@ -247,8 +246,11 @@ class FontScanner(Thread):
with open(path, 'rb') as f:
return f.read()

def find_font_for_text(self, text, allowed_families={'serif', 'sans-serif'},
preferred_families=('serif', 'sans-serif', 'monospace', 'cursive', 'fantasy')):
def find_font_for_text(self, text,
allowed_families={'serif', 'sans-serif'},
preferred_families=('serif', 'sans-serif',
'monospace', 'cursive',
'fantasy')):
'''
Find a font on the system capable of rendering the given text.

@@ -258,10 +260,11 @@ class FontScanner(Thread):

:return: (family name, faces) or None, None
'''
from ebook_converter.utils.fonts.utils import (supports_text,
panose_to_css_generic_family, get_printable_characters)
from ebook_converter.utils.fonts.utils import \
supports_text, panose_to_css_generic_family, \
get_printable_characters
if not isinstance(text, str):
raise TypeError(u'%r is not unicode'%text)
raise TypeError(u'%r is not unicode' % text)
text = get_printable_characters(text)
found = {}

@@ -269,7 +272,7 @@ class FontScanner(Thread):
try:
raw = self.get_font_data(font)
return supports_text(raw, text)
except:
except Exception:
pass
return False

@@ -278,7 +281,8 @@ class FontScanner(Thread):
if not faces:
continue
generic_family = panose_to_css_generic_family(faces[0]['panose'])
if generic_family in allowed_families or generic_family == preferred_families[0]:
if (generic_family in allowed_families or
generic_family == preferred_families[0]):
return (family, faces)
elif generic_family not in found:
found[generic_family] = (family, faces)
@@ -321,18 +325,20 @@ class FontScanner(Thread):
files = tuple(walk(folder))
except EnvironmentError as e:
if DEBUG:
prints('Failed to walk font folder:', folder,
as_unicode(e))
prints('Failed to walk font folder:', folder, str(e))
continue
for candidate in files:
if (candidate.rpartition('.')[-1].lower() not in self.allowed_extensions or not os.path.isfile(candidate)):
if (candidate.rpartition('.')[-1].lower() not in
self.allowed_extensions or
not os.path.isfile(candidate)):
continue
candidate = os.path.normcase(os.path.abspath(candidate))
try:
s = os.stat(candidate)
except EnvironmentError:
continue
fileid = '{0}||{1}:{2}'.format(candidate, s.st_size, s.st_mtime)
fileid = '{0}||{1}:{2}'.format(candidate, s.st_size,
s.st_mtime)
if fileid in cached_fonts:
# Use previously cached metadata, since the file size and
# last modified timestamp have not changed.
@@ -343,7 +349,7 @@ class FontScanner(Thread):
except Exception as e:
if DEBUG:
prints('Failed to read metadata from font file:',
candidate, as_unicode(e))
candidate, str(e))
continue

if frozenset(cached_fonts) != frozenset(self.cached_fonts):
@@ -353,7 +359,8 @@ class FontScanner(Thread):
self.build_families()

def build_families(self):
self.font_family_map, self.font_families = build_families(self.cached_fonts, self.folders)
(self.font_family_map,
self.font_families) = build_families(self.cached_fonts, self.folders)

def write_cache(self):
with self.cache:
@@ -380,14 +387,14 @@ class FontScanner(Thread):
for family in self.font_families:
prints(family)
for font in self.fonts_for_family(family):
prints('\t%s: %s'%(font['full_name'], font['path']))
prints('\t%s: %s' % (font['full_name'], font['path']))
prints(end='\t')
for key in ('font-stretch', 'font-weight', 'font-style'):
prints('%s: %s'%(key, font[key]), end=' ')
prints('%s: %s' % (key, font[key]), end=' ')
prints()
prints('\tSub-family:', font['wws_subfamily_name'] or
font['preferred_subfamily_name'] or
font['subfamily_name'])
font['preferred_subfamily_name'] or
font['subfamily_name'])
prints()
prints()
@@ -1,20 +1,18 @@
"""
A simplified logging system
"""
import sys, traceback, io
import sys
import traceback
import io
from functools import partial
from threading import Lock

from ebook_converter import force_unicode, as_unicode, prints
from ebook_converter import force_unicode, prints


__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

DEBUG = 0
INFO = 1
WARN = 2
INFO = 1
WARN = 2
ERROR = 3


@@ -38,10 +36,10 @@ class ANSIStream(Stream):
def __init__(self, stream=sys.stdout):
Stream.__init__(self, stream)
self.color = {
DEBUG: u'green',
DEBUG: 'green',
INFO: None,
WARN: u'yellow',
ERROR: u'red',
WARN: 'yellow',
ERROR: 'red',
}

def prints(self, level, *args, **kwargs):
@@ -64,12 +62,10 @@ class FileStream(Stream):

class HTMLStream(Stream):

color = {
DEBUG: b'<span style="color:green">',
INFO: b'<span>',
WARN: b'<span style="color:blue">',
ERROR: b'<span style="color:red">'
}
color = {DEBUG: b'<span style="color:green">',
INFO: b'<span>',
WARN: b'<span style="color:blue">',
ERROR: b'<span style="color:red">'}
normal = b'</span>'

def __init__(self, stream=sys.stdout):
@@ -104,14 +100,14 @@ class UnicodeHTMLStream(HTMLStream):
self.data.append(col)
self.last_col = col

sep = kwargs.get(u'sep', u' ')
end = kwargs.get(u'end', u'\n')
sep = kwargs.get('sep', ' ')
end = kwargs.get('end', '\n')

for arg in args:
if isinstance(arg, bytes):
arg = force_unicode(arg)
elif not isinstance(arg, str):
arg = as_unicode(arg)
arg = str(arg)
self.data.append(arg+sep)
self.plain_text.append(arg+sep)
self.data.append(end)
@@ -124,8 +120,8 @@ class UnicodeHTMLStream(HTMLStream):

@property
def html(self):
end = self.normal if self.data else u''
return u''.join(self.data) + end
end = self.normal if self.data else ''
return ''.join(self.data) + end

def dump(self):
return [self.data, self.plain_text, self.last_col]
@@ -143,8 +139,8 @@ class UnicodeHTMLStream(HTMLStream):
class Log(object):

DEBUG = DEBUG
INFO = INFO
WARN = WARN
INFO = INFO
WARN = WARN
ERROR = ERROR

def __init__(self, level=INFO):
@@ -153,8 +149,8 @@ class Log(object):
self.outputs = [default_output]

self.debug = partial(self.print_with_flush, DEBUG)
self.info = partial(self.print_with_flush, INFO)
self.warn = self.warning = partial(self.print_with_flush, WARN)
self.info = partial(self.print_with_flush, INFO)
self.warn = self.warning = partial(self.print_with_flush, WARN)
self.error = partial(self.print_with_flush, ERROR)

def prints(self, level, *args, **kwargs):
@@ -222,7 +218,8 @@ class ThreadSafeLog(Log):
limit = kwargs.pop('limit', None)
with self._lock:
Log.print_with_flush(self, ERROR, *args, **kwargs)
Log.print_with_flush(self, self.exception_traceback_level, traceback.format_exc(limit))
Log.print_with_flush(self, self.exception_traceback_level,
traceback.format_exc(limit))


class ThreadSafeWrapper(Log):
@@ -242,10 +239,9 @@ class ThreadSafeWrapper(Log):

class GUILog(ThreadSafeLog):

'''
"""
Logs in HTML and plain text as unicode. Ideal for display in a GUI context.
'''
"""

def __init__(self):
ThreadSafeLog.__init__(self, level=self.DEBUG)