mirror of https://github.com/gryf/ebook-converter.git
Removed as_unicode function
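The as_unicode() helper decoded bytes via force_unicode() and coerced everything else with str(), falling back to repr() on failure. Every call site changed below passes an exception or an already-textual value, so plain str() (or %-interpolation, which stringifies its arguments anyway) is sufficient under Python 3. A minimal sketch of the migration pattern; the err variable here is illustrative, not from the diff:

    # Before: helper removed by this commit
    msg = 'failed with error: %s' % as_unicode(err)

    # After: %-formatting calls str() on the exception itself
    msg = 'failed with error: %s' % err

The touched files also pick up PEP 8 cleanups along the way: long lines are wrapped, bare except: clauses are narrowed to except Exception:, and several docstrings move to double quotes.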
@@ -13,10 +13,10 @@ try:
 except EnvironmentError:
     os.chdir(os.path.expanduser('~'))

-from ebook_converter.constants_old import (iswindows, isosx, islinux, isfrozen,
-        isbsd, preferred_encoding, __appname__, __version__, __author__,
-        win32event, win32api, winerror, fcntl,
-        filesystem_encoding, plugins, config_dir)
+from ebook_converter.constants_old import iswindows, islinux, isfrozen, \
+        isbsd, preferred_encoding, __appname__, __version__, __author__, \
+        win32event, win32api, winerror, fcntl, \
+        filesystem_encoding, plugins, config_dir
 from ebook_converter.startup import winutil, winutilerror
 from ebook_converter.utils.icu import safe_chr

@@ -51,23 +51,28 @@ def confirm_config_name(name):
     return name + '_again'


-_filename_sanitize_unicode = frozenset(('\\', '|', '?', '*', '<', # no2to3
-    '"', ':', '>', '+', '/') + tuple(map(chr, range(32)))) # no2to3
+_filename_sanitize_unicode = frozenset(('\\', '|', '?', '*', '<',
+                                        '"', ':', '>', '+', '/') +
+                                       tuple(map(chr, range(32))))


 def sanitize_file_name(name, substitute='_'):
-    '''
-    Sanitize the filename `name`. All invalid characters are replaced by `substitute`.
-    The set of invalid characters is the union of the invalid characters in Windows,
-    macOS and Linux. Also removes leading and trailing whitespace.
-    **WARNING:** This function also replaces path separators, so only pass file names
-    and not full paths to it.
-    '''
+    """
+    Sanitize the filename `name`. All invalid characters are replaced by
+    `substitute`. The set of invalid characters is the union of the invalid
+    characters in Windows, macOS and Linux. Also removes leading and trailing
+    whitespace.
+
+    **WARNING:** This function also replaces path separators, so only pass
+    file names and not full paths to it.
+    """
     if isinstance(name, bytes):
         name = name.decode(filesystem_encoding, 'replace')
     if isinstance(substitute, bytes):
         substitute = substitute.decode(filesystem_encoding, 'replace')
-    chars = (substitute if c in _filename_sanitize_unicode else c for c in name)
+    chars = (substitute
+             if c in _filename_sanitize_unicode else c for c in name)
     one = ''.join(chars)
     one = re.sub(r'\s', ' ', one).strip()
     bname, ext = os.path.splitext(one)
@@ -87,8 +92,8 @@ def prints(*args, **kwargs):
     """
     Print unicode arguments safely by encoding them to preferred_encoding
     Has the same signature as the print function from Python 3, except for the
-    additional keyword argument safe_encode, which if set to True will cause the
-    function to use repr when encoding fails.
+    additional keyword argument safe_encode, which if set to True will cause
+    the function to use repr when encoding fails.

     Returns the number of bytes written.
     """
@@ -120,7 +125,7 @@ def prints(*args, **kwargs):
         except UnicodeEncodeError:
             try:
                 arg = arg.encode('utf-8')
-            except:
+            except Exception:
                 if not safe_encode:
                     raise
                 arg = repr(arg)
@@ -131,7 +136,7 @@ def prints(*args, **kwargs):
         except UnicodeEncodeError:
             try:
                 arg = arg.encode('utf-8')
-            except:
+            except Exception:
                 if not safe_encode:
                     raise
                 arg = repr(arg)
@@ -139,7 +144,7 @@ def prints(*args, **kwargs):
         try:
             file.write(arg)
             count += len(arg)
-        except:
+        except Exception:
             from polyglot import reprlib
             arg = reprlib.repr(arg)
             file.write(arg)
@@ -168,22 +173,12 @@ def setup_cli_handlers(logger, level):
     elif level == logging.DEBUG:
         handler = logging.StreamHandler(sys.stderr)
         handler.setLevel(logging.DEBUG)
-        handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s'))
+        handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:'
+                                               '%(lineno)s: %(message)s'))

     logger.addHandler(handler)


-def load_library(name, cdll):
-    if iswindows:
-        return cdll.LoadLibrary(name)
-    if isosx:
-        name += '.dylib'
-        if hasattr(sys, 'frameworks_dir'):
-            return cdll.LoadLibrary(os.path.join(getattr(sys, 'frameworks_dir'), name))
-        return cdll.LoadLibrary(name)
-    return cdll.LoadLibrary(name+'.so')
-
-
 def extract(path, dir):
     extractor = None
     # First use the file header to identify its type
@@ -216,7 +211,8 @@ def fit_image(width, height, pwidth, pheight):
     @param height: Height of image
     @param pwidth: Width of box
     @param pheight: Height of box
-    @return: scaled, new_width, new_height. scaled is True iff new_width and/or new_height is different from width or height.
+    @return: scaled, new_width, new_height. scaled is True iff new_width
+             and/or new_height is different from width or height.
     '''
     scaled = height > pheight or width > pwidth
     if height > pheight:
@@ -262,8 +258,10 @@ def walk(dir):


 def strftime(fmt, t=None):
-    ''' A version of strftime that returns unicode strings and tries to handle dates
-    before 1900 '''
+    """
+    A version of strftime that returns unicode strings and tries to handle
+    dates before 1900
+    """
     if not fmt:
         return ''
     if t is None:
@@ -272,7 +270,7 @@ def strftime(fmt, t=None):
         t = t.timetuple()
     early_year = t[0] < 1900
     if early_year:
-        replacement = 1900 if t[0]%4 == 0 else 1901
+        replacement = 1900 if t[0] % 4 == 0 else 1901
         fmt = fmt.replace('%Y', '_early year hack##')
         t = list(t)
         orig_year = t[0]
@@ -301,27 +299,33 @@ def my_unichr(num):


 def entity_to_unicode(match, exceptions=[], encoding='cp1252',
                       result_exceptions={}):
-    '''
+    """
     :param match: A match object such that '&'+match.group(1)';' is the entity.

-    :param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234'
+    :param exceptions: A list of entities to not convert (Each entry is the
+                       name of the entity, for e.g. 'apos' or '#1234'

-    :param encoding: The encoding to use to decode numeric entities between 128 and 256.
-    If None, the Unicode UCS encoding is used. A common encoding is cp1252.
+    :param encoding: The encoding to use to decode numeric entities between
+                     128 and 256. If None, the Unicode UCS encoding is used.
+                     A common encoding is cp1252.

-    :param result_exceptions: A mapping of characters to entities. If the result
-    is in result_exceptions, result_exception[result] is returned instead.
-    Convenient way to specify exception for things like < or > that can be
-    specified by various actual entities.
-    '''
+    :param result_exceptions: A mapping of characters to entities. If the
+                              result is in result_exceptions,
+                              result_exception[result] is returned instead.
+                              Convenient way to specify exception for things
+                              like < or > that can be specified by various
+                              actual entities.
+    """
     def check(ch):
         return result_exceptions.get(ch, ch)

     ent = match.group(1)
     if ent in exceptions:
         return '&'+ent+';'
-    if ent in {'apos', 'squot'}:  # squot is generated by some broken CMS software
+    # squot is generated by some broken CMS software
+    if ent in {'apos', 'squot'}:
         return check("'")
     if ent == 'hellips':
         ent = 'hellip'
@@ -331,7 +335,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
                 num = int(ent[2:], 16)
             else:
                 num = int(ent[1:])
-        except:
+        except Exception:
            return '&'+ent+';'
        if encoding is None or num > 255:
            return check(my_unichr(num))
@@ -394,15 +398,6 @@ def force_unicode(obj, enc=preferred_encoding):
     return obj


-def as_unicode(obj, enc=preferred_encoding):
-    if not isinstance(obj, bytes):
-        try:
-            obj = str(obj)
-        except Exception:
-            obj = repr(obj)
-    return force_unicode(obj, enc=enc)
-
-
 def url_slash_cleaner(url):
     '''
     Removes redundant /'s from url's.
@@ -1,43 +1,37 @@
-import functools, re, json
-from math import ceil
+import functools
+import json
+import math
+import re

-from ebook_converter import entity_to_unicode, as_unicode
+from ebook_converter import entity_to_unicode


-__license__ = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
 XLINK_NS = 'http://www.w3.org/1999/xlink'

 convert_entities = functools.partial(entity_to_unicode,
-        result_exceptions={
-            '<' : '&lt;',
-            '>' : '&gt;',
-            "'" : '&apos;',
-            '"' : '&quot;',
-            '&' : '&amp;',
-        })
-_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+                                     result_exceptions={'<': '&lt;',
+                                                        '>': '&gt;',
+                                                        "'": '&apos;',
+                                                        '"': '&quot;',
+                                                        '&': '&amp;'})
+_span_pat = re.compile('<span.*?</span>', re.DOTALL | re.IGNORECASE)

-LIGATURES = {
-# '\u00c6': 'AE',
-# '\u00e6': 'ae',
-# '\u0152': 'OE',
-# '\u0153': 'oe',
-# '\u0132': 'IJ',
-# '\u0133': 'ij',
-# '\u1D6B': 'ue',
-        '\uFB00': 'ff',
-        '\uFB01': 'fi',
-        '\uFB02': 'fl',
-        '\uFB03': 'ffi',
-        '\uFB04': 'ffl',
-        '\uFB05': 'ft',
-        '\uFB06': 'st',
-        }
+LIGATURES = {'\uFB00': 'ff',
+             '\uFB01': 'fi',
+             '\uFB02': 'fl',
+             '\uFB03': 'ffi',
+             '\uFB04': 'ffl',
+             '\uFB05': 'ft',
+             '\uFB06': 'st'}
+# '\u00c6': 'AE',
+# '\u00e6': 'ae',
+# '\u0152': 'OE',
+# '\u0153': 'oe',
+# '\u0132': 'IJ',
+# '\u0133': 'ij',
+# '\u1D6B': 'ue',


 _ligpat = re.compile('|'.join(LIGATURES))
@@ -83,17 +77,18 @@ def smarten_punctuation(html, log=None):


 class DocAnalysis(object):
-    '''
-    Provides various text analysis functions to determine how the document is structured.
-    format is the type of document analysis will be done against.
+    """
+    Provides various text analysis functions to determine how the document is
+    structured. format is the type of document analysis will be done against.
     raw is the raw text to determine the line length to use for wrapping.
     Blank lines are excluded from analysis
-    '''
+    """

     def __init__(self, format='html', raw=''):
         raw = raw.replace('&nbsp;', ' ')
         if format == 'html':
-            linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+            linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)',
+                                re.DOTALL)
         elif format == 'pdf':
             linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
         elif format == 'spanned_html':
@@ -103,13 +98,13 @@ class DocAnalysis(object):
         self.lines = linere.findall(raw)

     def line_length(self, percent):
-        '''
+        """
         Analyses the document to find the median line length.
         percentage is a decimal number, 0 - 1 which is used to determine
         how far in the list of line lengths to use. The list of line lengths is
         ordered smallest to largest and does not include duplicates. 0.5 is the
         median value.
-        '''
+        """
         lengths = []
         for line in self.lines:
             if len(line) > 0:
@@ -121,7 +116,7 @@ class DocAnalysis(object):
         lengths = list(set(lengths))
         total = sum(lengths)
         avg = total / len(lengths)
-        max_line = ceil(avg * 2)
+        max_line = math.ceil(avg * 2)

         lengths = sorted(lengths)
         for i in range(len(lengths) - 1, -1, -1):
@@ -138,31 +133,32 @@ class DocAnalysis(object):
         return lengths[index]

     def line_histogram(self, percent):
-        '''
-        Creates a broad histogram of the document to determine whether it incorporates hard
-        line breaks. Lines are sorted into 20 'buckets' based on length.
-        percent is the percentage of lines that should be in a single bucket to return true
-        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
-        '''
-        minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
-        maxLineLength=1900 # Discard larger than this to stay in range
-        buckets=20 # Each line is divided into a bucket based on length
+        """
+        Creates a broad histogram of the document to determine whether it
+        incorporates hard line breaks. Lines are sorted into 20 'buckets'
+        based on length. percent is the percentage of lines that should be in
+        a single bucket to return true The majority of the lines will exist in
+        1-2 buckets in typical docs with hard line breaks
+        """
+        minLineLength = 20  # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength = 1900  # Discard larger than this to stay in range
+        buckets = 20  # Each line is divided into a bucket based on length

         # print("there are "+str(len(lines))+" lines")
         # max = 0
         # for line in self.lines:
-        #     l = len(line)
-        #     if l > max:
-        #         max = l
+        #     _l = len(line)
+        #     if _l > max:
+        #         max = _l
         # print("max line found is "+str(max))
         # Build the line length histogram
-        hRaw = [0 for i in range(0,buckets)]
+        hRaw = [0 for i in range(0, buckets)]
         for line in self.lines:
-            l = len(line)
-            if l > minLineLength and l < maxLineLength:
-                l = int(l // 100)
-                # print("adding "+str(l))
-                hRaw[l]+=1
+            _l = len(line)
+            if _l > minLineLength and _l < maxLineLength:
+                _l = int(_l // 100)
+                # print("adding "+str(_l))
+                hRaw[_l] += 1

         # Normalize the histogram into percents
         totalLines = len(self.lines)
@@ -175,7 +171,7 @@ class DocAnalysis(object):

         # Find the biggest bucket
         maxValue = 0
-        for i in range(0,len(h)):
+        for i in range(0, len(h)):
             if h[i] > maxValue:
                 maxValue = h[i]

@@ -188,36 +184,42 @@ class DocAnalysis(object):


 class Dehyphenator(object):
-    '''
-    Analyzes words to determine whether hyphens should be retained/removed. Uses the document
-    itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
-    scientific words. The primary disadvantage is that words appearing only once in the document
-    retain hyphens.
-    '''
+    """
+    Analyzes words to determine whether hyphens should be retained/removed.
+    Uses the document itself is as a dictionary. This method handles all
+    languages along with uncommon, made-up, and scientific words. The primary
+    disadvantage is that words appearing only once in the document retain
+    hyphens.
+    """

     def __init__(self, verbose=0, log=None):
         self.log = log
         self.verbose = verbose
-        # Add common suffixes to the regex below to increase the likelihood of a match -
-        # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        # only remove if it's not already the point of hyphenation
-        self.suffix_string = (
-            "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
-            "(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
-            "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
+        # Add common suffixes to the regex below to increase the likelihood of
+        # a match - don't add suffixes which are also complete words, such as
+        # 'able' or 'sex' only remove if it's not already the point of
+        # hyphenation
+        self.suffix_string = ("((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?"
+                              "|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|"
+                              "istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
+                              "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier"
+                              "|al|ex|ian)$")
         self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
-        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
-        # remove prefixes if the prefix was not already the point of hyphenation
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string,
+                                         re.IGNORECASE)
+        # remove prefixes if the prefix was not already the point of
+        # hyphenation
         self.prefix_string = '^(dis|re|un|in|ex)'
         self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
-        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string,
+                                       re.IGNORECASE)

     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
         secondhalf = match.group('secondpart')
         try:
             wraptags = match.group('wraptags')
-        except:
+        except Exception:
             wraptags = ''
         hyphenated = str(firsthalf) + "-" + str(secondhalf)
         dehyphenated = str(firsthalf) + str(secondhalf)
@@ -231,65 +233,84 @@ class Dehyphenator(object):
             self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
         try:
             searchresult = self.html.find(lookupword.lower())
-        except:
+        except Exception:
             return hyphenated
         if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 if self.verbose > 2:
-                    self.log(" Cleanup:returned dehyphenated word: " + dehyphenated)
+                    self.log(" Cleanup:returned dehyphenated word: " +
+                             dehyphenated)
                 return dehyphenated
             elif self.html.find(hyphenated) != -1:
                 if self.verbose > 2:
-                    self.log(" Cleanup:returned hyphenated word: " + hyphenated)
+                    self.log(" Cleanup:returned hyphenated word: " +
+                             hyphenated)
                 return hyphenated
             else:
                 if self.verbose > 2:
-                    self.log(" Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
+                    self.log(" Cleanup:returning original text " +
+                             firsthalf + " + linefeed " + secondhalf)
                 return firsthalf+'\u2014'+wraptags+secondhalf

         else:
-            if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
+            if (self.format == 'individual_words' and
+                    len(firsthalf) + len(secondhalf) <= 6):
                 if self.verbose > 2:
-                    self.log("too short, returned hyphenated word: " + hyphenated)
+                    self.log("too short, returned hyphenated word: " +
+                             hyphenated)
                 return hyphenated
             if len(firsthalf) <= 2 and len(secondhalf) <= 2:
                 if self.verbose > 2:
-                    self.log("too short, returned hyphenated word: " + hyphenated)
+                    self.log("too short, returned hyphenated word: " +
+                             hyphenated)
                 return hyphenated
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 if self.verbose > 2:
-                    self.log(" returned dehyphenated word: " + dehyphenated)
+                    self.log(" returned dehyphenated word: " +
+                             dehyphenated)
                 return dehyphenated
             else:
                 if self.verbose > 2:
-                    self.log(" returned hyphenated word: " + hyphenated)
+                    self.log(" returned hyphenated word: " +
+                             hyphenated)
                 return hyphenated

     def __call__(self, html, format, length=1):
         self.html = html
         self.format = format
         if format == 'html':
-            intextmatch = re.compile((
-                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
-                r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
-                r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
+            intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)'
+                                     r'(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
+                                     r'\s*(</[iubp]>\s*){1,2}'
+                                     r'(?P<up2threeblanks><(p|div)[^>]*>\s*'
+                                     r'(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+)'
+                                     r'{0,3}\s*(<[iubp][^>]*>\s*){1,2}'
+                                     r'(<span[^>]*>)?)\s*(?P<secondpart>'
+                                     r'[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile((
-                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|'
-                r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
+            intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)'
+                                     r'\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*'
+                                     r'<[iub]>)\s*(?P<secondpart>[\w\d]+)' %
+                                     length)
         elif format == 'txt':
-            intextmatch = re.compile(
-                '(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
+            intextmatch = re.compile('(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)'
+                                     '(\u0020|\u0009)*(?P<wraptags>'
+                                     '(\n(\u0020|\u0009)*)+)(?P<secondpart>'
+                                     '[\\w\\d]+)' % length)
        elif format == 'individual_words':
-            intextmatch = re.compile(
-                r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
+            intextmatch = re.compile(r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*'
+                                     r'(?P<secondpart>\w+)(?![^<]*?>)',
+                                     re.UNICODE)
         elif format == 'html_cleanup':
-            intextmatch = re.compile(
-                r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
-                r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+            intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)'
+                                     r'(?P<wraptags></span>\s*(</[iubp]>\s*'
+                                     r'<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>'
+                                     r'\s*<[iubp][^>]*>)?\s*(?P<secondpart>'
+                                     r'[\w\d]+)')
         elif format == 'txt_cleanup':
-            intextmatch = re.compile(
-                r'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
+            intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|‐)'
+                                     r'(?P<wraptags>\s+)(?P<secondpart>'
+                                     r'[\w\d]+)')

         html = intextmatch.sub(self.dehyphenate, html)
         return html
@@ -299,18 +320,18 @@ class CSSPreProcessor(object):

     # Remove some of the broken CSS Microsoft products
     # create
     MS_PAT = re.compile(r'''
         (?P<start>^|;|\{)\s*    # The end of the previous rule or block start
         (%s).+?                 # The invalid selectors
         (?P<end>$|;|\})         # The end of the declaration
-        '''%'mso-|panose-|text-underline|tab-interval',
-        re.MULTILINE|re.IGNORECASE|re.VERBOSE)
+        ''' % 'mso-|panose-|text-underline|tab-interval',
+        re.MULTILINE | re.IGNORECASE | re.VERBOSE)

     def ms_sub(self, match):
         end = match.group('end')
         try:
             start = match.group('start')
-        except:
+        except Exception:
             start = ''
         if end == ';':
             end = ''
@@ -332,7 +353,7 @@ class CSSPreProcessor(object):
         for line in data.splitlines():
             ll = line.lstrip()
             if not (namespaced or ll.startswith('@import') or not ll or
-                        ll.startswith('@charset')):
+                    ll.startswith('@charset')):
                 ans.append(XHTML_CSS_NAMESPACE.strip())
                 namespaced = True
             ans.append(line)
@@ -359,7 +380,8 @@ def accent_regex(accent_maps, letter_before=False):
     args = ''.join(accent_cat), ''.join(letters)
     accent_group, letter_group = 1, 2

-    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
+    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args),
+                     re.UNICODE)

     def sub(m):
         lmap = accent_maps[m.group(accent_group)]
@@ -371,83 +393,96 @@ def accent_regex(accent_maps, letter_before=False):
 def html_preprocess_rules():
     ans = getattr(html_preprocess_rules, 'ans', None)
     if ans is None:
-        ans = html_preprocess_rules.ans = [
+        ans = [
             # Remove huge block of contiguous spaces as they slow down
             # the following regexes pretty badly
             (re.compile(r'\s{10000,}'), ''),
             # Some idiotic HTML generators (Frontpage I'm looking at you)
             # Put all sorts of crap into <head>. This messes up lxml
-            (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
-             sanitize_head),
+            (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>',
+                        re.IGNORECASE | re.DOTALL), sanitize_head),
             # Convert all entities, since lxml doesn't handle them well
             (re.compile(r'&(\S+?);'), convert_entities),
-            # Remove the <![if/endif tags inserted by everybody's darling, MS Word
-            (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
-        ]
+            # Remove the <![if/endif tags inserted by everybody's darling,
+            # MS Word
+            (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
+             '')]
+        html_preprocess_rules.ans = ans
     return ans


 def pdftohtml_rules():
     ans = getattr(pdftohtml_rules, 'ans', None)
     if ans is None:
-        ans = pdftohtml_rules.ans = [
-            accent_regex({
-                '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
-                '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
-                '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
-                'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
-                '¸': 'cC:çÇ',
-                '˛': 'aAeE:ąĄęĘ',
-                '˙': 'zZ:żŻ',
-                'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
-                '°': 'uU:ůŮ',
-            }),
-
-            accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),
-
-            # If pdf printed from a browser then the header/footer has a reliable pattern
-            (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
-
-            # Center separator lines
-            (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
-
-            # Remove <hr> tags
-            (re.compile(r'<hr.*?>', re.IGNORECASE), ''),
-
-            # Remove gray background
-            (re.compile(r'<BODY[^<>]+>'), '<BODY>'),
-
-            # Convert line breaks to paragraphs
-            (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
-            (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
-            (re.compile(r'\s*</body>'), '</p>\n</body>'),
-
-            # Clean up spaces
-            (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
-            # Add space before and after italics
-            (re.compile(r'(?<!“)<i>'), ' <i>'),
-            (re.compile(r'</i>(?=\w)'), '</i> '),
-        ]
+        ans = [accent_regex({'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
+                             '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
+                             '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚ'
+                                  'úÚźŹ',
+                             'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
+                             '¸': 'cC:çÇ',
+                             '˛': 'aAeE:ąĄęĘ',
+                             '˙': 'zZ:żŻ',
+                             'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
+                             '°': 'uU:ůŮ'}),
+               accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'},
+                            letter_before=True),
+
+               # If pdf printed from a browser then the header/footer has a
+               # reliable pattern
+               (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?'
+                           r'[A-Z].*<br>(?=\s*<hr>))',
+                           re.IGNORECASE), lambda match: ''),
+
+               # Center separator lines
+               (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'),
+                lambda match: '<p>\n<p style="text-align:center">' +
+                match.group('break') + '</p>'),
+
+               # Remove <hr> tags
+               (re.compile(r'<hr.*?>', re.IGNORECASE), ''),
+
+               # Remove gray background
+               (re.compile(r'<BODY[^<>]+>'), '<BODY>'),
+
+               # Convert line breaks to paragraphs
+               (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
+               (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
+               (re.compile(r'\s*</body>'), '</p>\n</body>'),
+
+               # Clean up spaces
+               (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
+               # Add space before and after italics
+               (re.compile(r'(?<!“)<i>'), ' <i>'),
+               (re.compile(r'</i>(?=\w)'), '</i> ')]
+        pdftohtml_rules.ans = ans
     return ans


 def book_designer_rules():
     ans = getattr(book_designer_rules, 'ans', None)
     if ans is None:
-        ans = book_designer_rules.ans = [
-            # HR
-            (re.compile('<hr>', re.IGNORECASE),
-             lambda match : '<span style="page-break-after:always">&nbsp;</span>'),
-            # Create header tags
-            (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
-             lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-            (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
-             lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-            (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
-             lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
-            (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
-             lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
-        ]
+        ans = [(re.compile('<hr>', re.IGNORECASE),
+                lambda match: '<span style="page-break-after:always">&nbsp;'
+                              '</span>'),
+               # Create header tags
+               (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)'
+                           r'(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
+                lambda match: '<h1 id="BookTitle" align="%s">%s</h1>' %
+                              (match.group(2) if match.group(2) else 'center',
+                               match.group(3))),
+               (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)'
+                           r'(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
+                lambda match: '<h2 id="BookAuthor" align="%s">%s</h2>' %
+                              (match.group(2) if match.group(2) else 'center',
+                               match.group(3))),
+               (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>',
+                           re.IGNORECASE | re.DOTALL),
+                lambda match: '<h2 class="title">%s</h2>' % (match.group(1),)),
+               (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>',
+                           re.IGNORECASE | re.DOTALL),
+                lambda match: '<h3 class="subtitle">%s</h3>' %
+                              (match.group(1),))]
+        book_designer_rules.ans = ans
     return None
@@ -470,7 +505,7 @@ class HTMLPreProcessor(object):
         return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]

     def __call__(self, html, remove_special_chars=None,
-            get_preprocess_html=False):
+                 get_preprocess_html=False):
         if remove_special_chars is not None:
             html = remove_special_chars.sub('', html)
         html = html.replace('\0', '')
@@ -487,13 +522,14 @@ class HTMLPreProcessor(object):
         start_rules = []

         if not getattr(self.extra_opts, 'keep_ligatures', False):
-            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
+            html = _ligpat.sub(lambda m: LIGATURES[m.group()], html)

         user_sr_rules = {}
         # Function for processing search and replace
+
         def do_search_replace(search_pattern, replace_txt):
-            from ebook_converter.ebooks.conversion.search_replace import compile_regular_expression
+            from ebook_converter.ebooks.conversion.search_replace import \
+                compile_regular_expression
             try:
                 search_re = compile_regular_expression(search_pattern)
                 if not replace_txt:
@@ -502,11 +538,11 @@ class HTMLPreProcessor(object):
                 user_sr_rules[(search_re, replace_txt)] = search_pattern
             except Exception as e:
                 self.log.error('Failed to parse %r regexp because %s' %
-                               (search, as_unicode(e)))
+                               (search, e))

         # search / replace using the sr?_search / sr?_replace options
         for i in range(1, 4):
-            search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
+            search, replace = 'sr%d_search' % i, 'sr%d_replace' % i
             search_pattern = getattr(self.extra_opts, search, '')
             replace_txt = getattr(self.extra_opts, replace, '')
             if search_pattern:
@@ -520,31 +556,35 @@ class HTMLPreProcessor(object):
                 do_search_replace(search_pattern, replace_txt)

         end_rules = []
-        # delete soft hyphens - moved here so it's executed after header/footer removal
+        # delete soft hyphens - moved here so it's executed after
+        # header/footer removal
         if is_pdftohtml:
             # unwrap/delete soft hyphens
-            end_rules.append((re.compile(
-                r'[­](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
+            end_rules.append((re.compile(r'[­](</p>\s*<p>\s*)+\s*'
+                                         r'(?=[\[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens with formatting
-            end_rules.append((re.compile(
-                r'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))
+            end_rules.append((re.compile(r'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+'
+                                         r'\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'),
+                              lambda match: ''))

         length = -1
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             docanalysis = DocAnalysis('pdf', html)
-            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
+            length = docanalysis.line_length(getattr(self.extra_opts,
+                                                     'unwrap_factor'))
             if length:
                 # print("The pdf line length returned is " + str(length))
                 # unwrap em/en dashes
-                end_rules.append((re.compile(
-                    r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
+                end_rules.append((re.compile(r'(?<=.{%i}[–—])\s*<p>\s*'
+                                             r'(?=[\[a-z\d])' % length),
+                                  lambda match: ''))
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile((
-                        r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]'
-                        r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
-                        r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
-                )
+                    (re.compile((r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçą'
+                                 r'ężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4})'
+                                 r';))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*'
+                                 r'<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])') %
+                                 length, re.UNICODE), wrap_lines))

         for rule in html_preprocess_rules() + start_rules:
             html = rule[0].sub(rule[1], html)
@@ -567,7 +607,7 @@ class HTMLPreProcessor(object):
             name, i = None, 0
             while not name or os.path.exists(os.path.join(odir, name)):
                 i += 1
-                name = '%04d.html'%i
+                name = '%04d.html' % i
             with open(os.path.join(odir, name), 'wb') as f:
                 f.write(raw.encode('utf-8'))

@@ -578,20 +618,20 @@ class HTMLPreProcessor(object):
                 html = rule[0].sub(rule[1], html)
             except Exception as e:
                 if rule in user_sr_rules:
-                    self.log.error(
-                        'User supplied search & replace rule: %s -> %s '
-                        'failed with error: %s, ignoring.'%(
-                            user_sr_rules[rule], rule[1], e))
+                    self.log.error('User supplied search & replace rule: %s '
+                                   '-> %s failed with error: %s, ignoring.' %
+                                   (user_sr_rules[rule], rule[1], e))
                 else:
                     raise

         if is_pdftohtml and length > -1:
             # Dehyphenate
             dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
-            html = dehyphenator(html,'html', length)
+            html = dehyphenator(html, 'html', length)

         if is_pdftohtml:
-            from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
+            from ebook_converter.ebooks.conversion.utils import \
+                HeuristicProcessor
             pdf_markup = HeuristicProcessor(self.extra_opts, None)
             totalwords = 0
             if pdf_markup.get_word_count(html) > 7000:
@@ -613,23 +653,26 @@ class HTMLPreProcessor(object):
             from ebook_converter.utils.localization import get_udc
             from ebook_converter.utils.mreplace import MReplace
             unihandecoder = get_udc()
-            mr = MReplace(data={'«':'&lt;'*3, '»':'&gt;'*3})
+            mr = MReplace(data={'«': '&lt;' * 3, '»': '&gt;' * 3})
             html = mr.mreplace(html)
             html = unihandecoder.decode(html)

         if getattr(self.extra_opts, 'enable_heuristics', False):
-            from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
+            from ebook_converter.ebooks.conversion.utils import \
+                HeuristicProcessor
             preprocessor = HeuristicProcessor(self.extra_opts, self.log)
             html = preprocessor(html)

         if is_pdftohtml:
-            html = html.replace('<!-- created by ebook-converter\'s pdftohtml -->', '')
+            html = html.replace('<!-- created by ebook-converter\'s '
+                                'pdftohtml -->', '')

         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = smarten_punctuation(html, self.log)

         try:
-            unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
+            unsupported_unicode_chars = (self.extra_opts.output_profile
+                                         .unsupported_unicode_chars)
         except AttributeError:
             unsupported_unicode_chars = ''
         if unsupported_unicode_chars:
@@ -10,19 +10,13 @@ import urllib.parse
 from ebook_converter.ebooks.oeb.base import urlunquote
 from ebook_converter.ebooks.chardet import detect_xml_encoding
 from ebook_converter.constants_old import iswindows
-from ebook_converter import unicode_path, as_unicode, replace_entities
-
-
-__license__ = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
+from ebook_converter import unicode_path, replace_entities


 class Link(object):
-    '''
+    """
     Represents a link in a HTML file.
-    '''
+    """

     @classmethod
     def url_to_local_path(cls, url, base):
@@ -31,7 +25,8 @@ class Link(object):
         if iswindows and path.startswith('/'):
             path = path[1:]
             isabs = True
-        path = urllib.parse.urlunparse(('', '', path, url.params, url.query, ''))
+        path = urllib.parse.urlunparse(('', '', path, url.params, url.query,
+                                        ''))
         path = urlunquote(path)
         if isabs or os.path.isabs(path):
             return path
@@ -39,17 +34,18 @@ class Link(object):

     def __init__(self, url, base):
         '''
-        :param url: The url this link points to. Must be an unquoted unicode string.
-        :param base: The base directory that relative URLs are with respect to.
-                     Must be a unicode string.
+        :param url: The url this link points to. Must be an unquoted unicode
+                    string.
+        :param base: The base directory that relative URLs are with respect
+                     to. Must be a unicode string.
         '''
         assert isinstance(url, str) and isinstance(base, str)
         self.url = url
         self.parsed_url = urllib.parse.urlparse(self.url)
         self.is_local = self.parsed_url.scheme in ('', 'file')
         self.is_internal = self.is_local and not bool(self.parsed_url.path)
         self.path = None
         self.fragment = urlunquote(self.parsed_url.fragment)
         if self.is_local and not self.is_internal:
             self.path = self.url_to_local_path(self.parsed_url, base)

@@ -62,7 +58,7 @@ class Link(object):
         return self.path == getattr(other, 'path', other)

     def __str__(self):
-        return 'Link: %s --> %s'%(self.url, self.path)
+        return 'Link: %s --> %s' % (self.url, self.path)


 class IgnoreFile(Exception):
@@ -84,24 +80,25 @@ class HTMLFile(object):
     The encoding of the file is available as :member:`encoding`.
     '''

     HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
     TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
-    LINK_PAT = re.compile(
-        r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
-        re.DOTALL|re.IGNORECASE)
+    LINK_PAT = re.compile(r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|'
+                          r'(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
+                          re.DOTALL | re.IGNORECASE)

-    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
-        '''
+    def __init__(self, path_to_html_file, level, encoding, verbose,
+                 referrer=None):
+        """
         :param level: The level of this file. Should be 0 for the root file.
         :param encoding: Use `encoding` to decode HTML.
         :param referrer: The :class:`HTMLFile` that first refers to this file.
-        '''
+        """
         self.path = unicode_path(path_to_html_file, abs=True)
         self.title = os.path.splitext(os.path.basename(self.path))[0]
         self.base = os.path.dirname(self.path)
         self.level = level
         self.referrer = referrer
         self.links = []

         try:
             with open(self.path, 'rb') as f:
@@ -112,18 +109,21 @@ class HTMLFile(object):
                         header = header.decode(encoding)
                     except ValueError:
                         pass
-                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
+                self.is_binary = level > 0 and not bool(self
+                                                        .HTML_PAT
+                                                        .search(header))
                 if not self.is_binary:
                     src += f.read()
         except IOError as err:
-            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
+            msg = ('Could not read from file: %s with error: %s' %
+                   (self.path, str(err)))
             if level == 0:
                 raise IOError(msg)
             raise IgnoreFile(msg, err.errno)

         if not src:
             if level == 0:
-                raise ValueError('The file %s is empty'%self.path)
+                raise ValueError('The file %s is empty' % self.path)
             self.is_binary = True

         if not self.is_binary:
@@ -145,7 +145,9 @@ class HTMLFile(object):
         return hash(self.path)

     def __str__(self):
-        return 'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
+        return 'HTMLFile:%d:%s:%s' % (self.level,
+                                      'b' if self.is_binary else 'a',
+                                      self.path)

     def __repr__(self):
         return str(self)
@@ -191,20 +193,22 @@ def depth_first(root, flat, visited=None):
             visited.add(hf)


-def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
-    '''
+def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0,
+             encoding=None):
+    """
     Recursively traverse all links in the HTML file.

     :param max_levels: Maximum levels of recursion. Must be non-negative. 0
-                       implies that no links in the root HTML file are followed.
-    :param encoding: Specify character encoding of HTML files. If `None` it is
-                     auto-detected.
-    :return: A pair of lists (breadth_first, depth_first). Each list contains
-             :class:`HTMLFile` objects.
-    '''
+                       implies that no links in the root HTML file are
+                       followed.
+    :param encoding: Specify character encoding of HTML files. If `None` it
+                     is auto-detected.
+    :return: A pair of lists (breadth_first, depth_first). Each list
+             contains :class:`HTMLFile` objects.
+    """
     assert max_levels >= 0
     level = 0
     flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
     next_level = list(flat)
     while level < max_levels and len(next_level) > 0:
         level += 1
 @@ -215,9 +219,10 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None
                 if link.path is None or link.path in flat:
                     continue
                 try:
-                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
+                    nf = HTMLFile(link.path, level, encoding, verbose,
+                                  referrer=hf)
                     if nf.is_binary:
-                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
+                        raise IgnoreFile('%s is a binary file' % nf.path, -1)
                     nl.append(nf)
                     flat.append(nf)
                 except IgnoreFile as err:
 @@ -244,7 +249,8 @@ def get_filelist(htmlfile, dir, opts, log):
     log.info('Building file list...')
     filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                         verbose=opts.verbose,
-                        encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
+                        encoding=opts
+                        .input_encoding)[0 if opts.breadth_first else 1]
     if opts.verbose:
         log.debug('\tFound files...')
         for f in filelist:
 
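Note: the reflowed traverse() above still returns a pair of lists,
(breadth_first, depth_first), of HTMLFile objects. A minimal usage sketch;
the root path and the module location are illustrative assumptions, not part
of this commit:

    from ebook_converter.ebooks.html.input import traverse  # assumed path

    # Follow links at most two levels deep from a hypothetical root file.
    breadth_first, depth_first = traverse('index.html', max_levels=2)
    for hf in breadth_first:
        print(hf)  # __str__ above gives e.g. 'HTMLFile:1:a:/path/page.html'
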
 @@ -21,7 +21,6 @@ from ebook_converter import force_unicode
 from ebook_converter.constants_old import filesystem_encoding, __version__
 from ebook_converter.ebooks.chardet import xml_to_unicode
 from ebook_converter.ebooks.conversion.preprocess import CSSPreProcessor
-from ebook_converter import as_unicode
 from ebook_converter.ebooks.oeb import parse_utils
 from ebook_converter.utils.cleantext import clean_xml_chars
 from ebook_converter.utils.short_uuid import uuid4
 @@ -419,7 +418,7 @@ def urlnormalize(href):
         parts = urllib.parse.urlparse(href)
     except ValueError as e:
         raise ValueError('Failed to parse the URL: %r with underlying error: '
-                         '%s' % (href, as_unicode(e)))
+                         '%s' % (href, e))
     if not parts.scheme or parts.scheme == 'file':
         path, frag = urllib.parse.urldefrag(href)
         parts = ('', '', path, '', '', frag)
 @@ -723,7 +722,7 @@ class Metadata(object):
                 % (parse_utils.barename(self.term), self.value, self.attrib)
 
         def __str__(self):
-            return as_unicode(self.value)
+            return str(self.value)
 
         def to_opf1(self, dcmeta=None, xmeta=None, nsrmap={}):
             attrib = {}
 
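Note: both hunks in this file lean on the fact that, under Python 3, %s
formatting and str() already give the text of an exception or value, which is
all as_unicode() added once bytes-versus-unicode handling went away. The
equivalence being relied on, as a tiny self-contained sketch:

    try:
        raise ValueError('bad href')
    except ValueError as e:
        msg = 'underlying error: %s' % e  # %-formatting calls str(e)
        assert msg == 'underlying error: bad href'
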
 @@ -14,7 +14,7 @@ from lxml.etree import XPath as _XPath
 from lxml import etree
 
 from ebook_converter import constants as const
-from ebook_converter import as_unicode, force_unicode
+from ebook_converter import force_unicode
 from ebook_converter.ebooks.epub import rules
 from ebook_converter.ebooks.oeb import base
 from ebook_converter.ebooks.oeb.polish.split import do_split
 @@ -126,7 +126,7 @@ class Split(object):
             except SelectorError as err:
                 self.log.warn('Ignoring page breaks specified with invalid '
                               'CSS selector: %r (%s)' %
-                              (selector, as_unicode(err)))
+                              (selector, err))
 
         for i, elem in enumerate(item.data.iter('*')):
             try:
 
 @@ -2,17 +2,13 @@ import os
 from collections import defaultdict
 from threading import Thread
 
-from ebook_converter import walk, prints, as_unicode
-from ebook_converter.constants_old import (config_dir, iswindows, isosx, plugins, DEBUG,
-                                           isworker, filesystem_encoding)
+from ebook_converter import walk, prints
+from ebook_converter.constants_old import iswindows, isosx
+from ebook_converter.constants_old import plugins, DEBUG, isworker
+from ebook_converter.constants_old import filesystem_encoding
 from ebook_converter.utils.fonts.metadata import FontMetadata, UnsupportedFont
 
 
-__license__ = 'GPL v3'
-__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-
 class NoFonts(ValueError):
     pass
 
 @@ -38,7 +34,7 @@ def fc_list():
         return default_font_dirs()
     try:
         lib = ctypes.CDLL(lib)
-    except:
+    except Exception:
         return default_font_dirs()
 
     prototype = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p)
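Note: fc_list() loads the system fontconfig library through ctypes and falls
back to default_font_dirs() when that fails; narrowing the bare except: to
except Exception: keeps the fallback without swallowing SystemExit or
KeyboardInterrupt. The same pattern in isolation (the library name is just an
example):

    import ctypes
    from ctypes.util import find_library

    def load_fontconfig():
        name = find_library('fontconfig')  # None if not installed
        if name is None:
            return None
        try:
            return ctypes.CDLL(name)
        except Exception:  # a load failure raises OSError
            return None
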
 @@ -97,7 +93,7 @@ def font_dirs():
     if iswindows:
         winutil, err = plugins['winutil']
         if err:
-            raise RuntimeError('Failed to load winutil: %s'%err)
+            raise RuntimeError('Failed to load winutil: %s' % err)
         try:
             return [winutil.special_folder_path(winutil.CSIDL_FONTS)]
         except ValueError:
 @@ -126,9 +122,10 @@ def font_priority(font):
     width_normal = font['font-stretch'] == 'normal'
     weight_normal = font['font-weight'] == 'normal'
     num_normal = sum(filter(None, (style_normal, width_normal,
                                    weight_normal)))
     subfamily_name = (font['wws_subfamily_name'] or
-                      font['preferred_subfamily_name'] or font['subfamily_name'])
+                      font['preferred_subfamily_name'] or
+                      font['subfamily_name'])
     if num_normal == 3 and subfamily_name == 'Regular':
         return 0
     if num_normal == 3:
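Note: font_priority() computes a small integer rank, with 0 reserved for a
'Regular' face whose style, stretch and weight are all normal, so it works as
a sort key when choosing a representative face. A sketch with two made-up
face dictionaries (field names taken from the code above, values assumed;
font_priority from this module is in scope):

    faces = [
        {'font-style': 'italic', 'font-stretch': 'normal',
         'font-weight': 'normal', 'wws_subfamily_name': None,
         'preferred_subfamily_name': None, 'subfamily_name': 'Italic'},
        {'font-style': 'normal', 'font-stretch': 'normal',
         'font-weight': 'normal', 'wws_subfamily_name': None,
         'preferred_subfamily_name': None, 'subfamily_name': 'Regular'},
    ]
    faces.sort(key=font_priority)  # the Regular face sorts first (rank 0)
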
 @@ -167,7 +164,9 @@ def build_families(cached_fonts, folders, family_attr='font-family'):
             if fingerprint in fmap:
                 opath = fmap[fingerprint]['path']
                 npath = font['path']
-                if path_significance(npath, folders) >= path_significance(opath, folders):
+                if path_significance(npath,
+                                     folders) >= path_significance(opath,
+                                                                   folders):
                     remove.append(fmap[fingerprint])
                     fmap[fingerprint] = font
                 else:
 @@ -214,7 +213,7 @@ class FontScanner(Thread):
         try:
             return self.font_family_map[family.lower()]
         except KeyError:
-            raise NoFonts('No fonts found for the family: %r'%family)
+            raise NoFonts('No fonts found for the family: %r' % family)
 
     def legacy_fonts_for_family(self, family):
         '''
 @@ -247,8 +246,11 @@ class FontScanner(Thread):
         with open(path, 'rb') as f:
             return f.read()
 
-    def find_font_for_text(self, text, allowed_families={'serif', 'sans-serif'},
-            preferred_families=('serif', 'sans-serif', 'monospace', 'cursive', 'fantasy')):
+    def find_font_for_text(self, text,
+                           allowed_families={'serif', 'sans-serif'},
+                           preferred_families=('serif', 'sans-serif',
+                                               'monospace', 'cursive',
+                                               'fantasy')):
         '''
         Find a font on the system capable of rendering the given text.
 
 @@ -258,10 +260,11 @@ class FontScanner(Thread):
 
         :return: (family name, faces) or None, None
         '''
-        from ebook_converter.utils.fonts.utils import (supports_text,
-                panose_to_css_generic_family, get_printable_characters)
+        from ebook_converter.utils.fonts.utils import \
+            supports_text, panose_to_css_generic_family, \
+            get_printable_characters
         if not isinstance(text, str):
-            raise TypeError(u'%r is not unicode'%text)
+            raise TypeError(u'%r is not unicode' % text)
         text = get_printable_characters(text)
         found = {}
 
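Note: per the docstring, find_font_for_text() yields (family name, faces) or
None, None. A hedged usage sketch; constructing the scanner directly and the
timing of the font scan are assumptions, since only fragments of the class
appear in this diff:

    scanner = FontScanner()  # assumed usable standalone
    # ... after the scanner has populated its font cache ...
    family, faces = scanner.find_font_for_text('Hello, world')
    if family is not None:
        print('render with', family, 'using', len(faces), 'face(s)')
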
 @@ -269,7 +272,7 @@ class FontScanner(Thread):
             try:
                 raw = self.get_font_data(font)
                 return supports_text(raw, text)
-            except:
+            except Exception:
                 pass
             return False
 
 @@ -278,7 +281,8 @@ class FontScanner(Thread):
             if not faces:
                 continue
             generic_family = panose_to_css_generic_family(faces[0]['panose'])
-            if generic_family in allowed_families or generic_family == preferred_families[0]:
+            if (generic_family in allowed_families or
+                    generic_family == preferred_families[0]):
                 return (family, faces)
             elif generic_family not in found:
                 found[generic_family] = (family, faces)
 @@ -321,18 +325,20 @@ class FontScanner(Thread):
                 files = tuple(walk(folder))
             except EnvironmentError as e:
                 if DEBUG:
-                    prints('Failed to walk font folder:', folder,
-                           as_unicode(e))
+                    prints('Failed to walk font folder:', folder, str(e))
                 continue
             for candidate in files:
-                if (candidate.rpartition('.')[-1].lower() not in self.allowed_extensions or not os.path.isfile(candidate)):
+                if (candidate.rpartition('.')[-1].lower() not in
+                        self.allowed_extensions or
+                        not os.path.isfile(candidate)):
                     continue
                 candidate = os.path.normcase(os.path.abspath(candidate))
                 try:
                     s = os.stat(candidate)
                 except EnvironmentError:
                     continue
-                fileid = '{0}||{1}:{2}'.format(candidate, s.st_size, s.st_mtime)
+                fileid = '{0}||{1}:{2}'.format(candidate, s.st_size,
+                                               s.st_mtime)
                 if fileid in cached_fonts:
                     # Use previously cached metadata, since the file size and
                     # last modified timestamp have not changed.
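Note: fileid keys the font-metadata cache on path plus file size and mtime,
so a font is re-parsed only when its bytes change on disk. The same idea in
isolation:

    import os

    def file_cache_key(path):
        st = os.stat(path)
        # Size and mtime change whenever the file content is rewritten.
        return '{0}||{1}:{2}'.format(path, st.st_size, st.st_mtime)
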
 @@ -343,7 +349,7 @@ class FontScanner(Thread):
             except Exception as e:
                 if DEBUG:
                     prints('Failed to read metadata from font file:',
-                           candidate, as_unicode(e))
+                           candidate, str(e))
                 continue
 
         if frozenset(cached_fonts) != frozenset(self.cached_fonts):
 @@ -353,7 +359,8 @@ class FontScanner(Thread):
         self.build_families()
 
     def build_families(self):
-        self.font_family_map, self.font_families = build_families(self.cached_fonts, self.folders)
+        (self.font_family_map,
+         self.font_families) = build_families(self.cached_fonts, self.folders)
 
     def write_cache(self):
         with self.cache:
 @@ -380,14 +387,14 @@ class FontScanner(Thread):
         for family in self.font_families:
             prints(family)
             for font in self.fonts_for_family(family):
-                prints('\t%s: %s'%(font['full_name'], font['path']))
+                prints('\t%s: %s' % (font['full_name'], font['path']))
                 prints(end='\t')
                 for key in ('font-stretch', 'font-weight', 'font-style'):
-                    prints('%s: %s'%(key, font[key]), end=' ')
+                    prints('%s: %s' % (key, font[key]), end=' ')
                 prints()
                 prints('\tSub-family:', font['wws_subfamily_name'] or
                        font['preferred_subfamily_name'] or
                        font['subfamily_name'])
                 prints()
             prints()
 
 @@ -1,20 +1,18 @@
 """
 A simplified logging system
 """
-import sys, traceback, io
+import sys
+import traceback
+import io
 from functools import partial
 from threading import Lock
 
-from ebook_converter import force_unicode, as_unicode, prints
+from ebook_converter import force_unicode, prints
 
 
-__license__ = 'GPL 3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
 DEBUG = 0
 INFO = 1
 WARN = 2
 ERROR = 3
 
 
 @@ -38,10 +36,10 @@ class ANSIStream(Stream):
     def __init__(self, stream=sys.stdout):
         Stream.__init__(self, stream)
         self.color = {
-            DEBUG: u'green',
+            DEBUG: 'green',
             INFO: None,
-            WARN: u'yellow',
-            ERROR: u'red',
+            WARN: 'yellow',
+            ERROR: 'red',
         }
 
     def prints(self, level, *args, **kwargs):
 @@ -64,12 +62,10 @@ class FileStream(Stream):
 
 class HTMLStream(Stream):
 
-    color = {
-        DEBUG: b'<span style="color:green">',
-        INFO: b'<span>',
-        WARN: b'<span style="color:blue">',
-        ERROR: b'<span style="color:red">'
-    }
+    color = {DEBUG: b'<span style="color:green">',
+             INFO: b'<span>',
+             WARN: b'<span style="color:blue">',
+             ERROR: b'<span style="color:red">'}
     normal = b'</span>'
 
     def __init__(self, stream=sys.stdout):
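Note: HTMLStream brackets every message in a level-specific <span> and closes
it with `normal`; the compacted dict literal above is behaviour-neutral. A
simplified stand-in for the prints() plumbing (the real write path comes from
the Stream base class):

    color = {0: b'<span style="color:green">', 1: b'<span>',
             2: b'<span style="color:blue">', 3: b'<span style="color:red">'}
    normal = b'</span>'

    def emit(level, msg):  # hypothetical helper, not part of the class
        return color[level] + msg.encode('utf-8') + normal

    assert emit(3, 'boom') == b'<span style="color:red">boom</span>'
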
 @@ -104,14 +100,14 @@ class UnicodeHTMLStream(HTMLStream):
             self.data.append(col)
             self.last_col = col
 
-        sep = kwargs.get(u'sep', u' ')
-        end = kwargs.get(u'end', u'\n')
+        sep = kwargs.get('sep', ' ')
+        end = kwargs.get('end', '\n')
 
         for arg in args:
             if isinstance(arg, bytes):
                 arg = force_unicode(arg)
             elif not isinstance(arg, str):
-                arg = as_unicode(arg)
+                arg = str(arg)
             self.data.append(arg+sep)
             self.plain_text.append(arg+sep)
         self.data.append(end)
 @@ -124,8 +120,8 @@ class UnicodeHTMLStream(HTMLStream):
 
     @property
     def html(self):
-        end = self.normal if self.data else u''
-        return u''.join(self.data) + end
+        end = self.normal if self.data else ''
+        return ''.join(self.data) + end
 
     def dump(self):
         return [self.data, self.plain_text, self.last_col]
 @@ -143,8 +139,8 @@ class UnicodeHTMLStream(HTMLStream):
 class Log(object):
 
     DEBUG = DEBUG
     INFO = INFO
     WARN = WARN
     ERROR = ERROR
 
     def __init__(self, level=INFO):
 @@ -153,8 +149,8 @@ class Log(object):
         self.outputs = [default_output]
 
         self.debug = partial(self.print_with_flush, DEBUG)
         self.info = partial(self.print_with_flush, INFO)
         self.warn = self.warning = partial(self.print_with_flush, WARN)
         self.error = partial(self.print_with_flush, ERROR)
 
     def prints(self, level, *args, **kwargs):
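Note: the partial() assignments above are what make log.debug(...),
log.info(...) and friends work: each one is print_with_flush with the level
argument pre-bound. In isolation:

    from functools import partial

    def print_with_flush(level, *args):  # simplified stand-in
        print('[level %d]' % level, *args, flush=True)

    debug = partial(print_with_flush, 0)  # DEBUG == 0 in this module
    debug('cache rebuilt')                # -> [level 0] cache rebuilt
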
 @@ -222,7 +218,8 @@ class ThreadSafeLog(Log):
         limit = kwargs.pop('limit', None)
         with self._lock:
             Log.print_with_flush(self, ERROR, *args, **kwargs)
-            Log.print_with_flush(self, self.exception_traceback_level, traceback.format_exc(limit))
+            Log.print_with_flush(self, self.exception_traceback_level,
+                                 traceback.format_exc(limit))
 
 
 class ThreadSafeWrapper(Log):
 @@ -242,10 +239,9 @@ class ThreadSafeWrapper(Log):
 
 
 class GUILog(ThreadSafeLog):
-
-    '''
+    """
     Logs in HTML and plain text as unicode. Ideal for display in a GUI context.
-    '''
+    """
 
     def __init__(self):
         ThreadSafeLog.__init__(self, level=self.DEBUG)
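Note: GUILog records output as HTML and plain text at DEBUG level; assuming
it proxies the html property of UnicodeHTMLStream shown earlier in this diff,
usage would look like:

    log = GUILog()
    log.info('conversion started')
    log.error('missing cover image')
    markup = log.html  # accumulated, colour-coded markup for display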