mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-31 10:55:44 +01:00

Removed as_unicode function

This commit is contained in:
2020-06-14 19:02:23 +02:00
parent add7a8ca56
commit fdd531f6e0
7 changed files with 412 additions and 366 deletions

View File

@@ -13,10 +13,10 @@ try:
 except EnvironmentError:
     os.chdir(os.path.expanduser('~'))
 
-from ebook_converter.constants_old import (iswindows, isosx, islinux, isfrozen,
-        isbsd, preferred_encoding, __appname__, __version__, __author__,
-        win32event, win32api, winerror, fcntl,
-        filesystem_encoding, plugins, config_dir)
+from ebook_converter.constants_old import iswindows, islinux, isfrozen, \
+    isbsd, preferred_encoding, __appname__, __version__, __author__, \
+    win32event, win32api, winerror, fcntl, \
+    filesystem_encoding, plugins, config_dir
 from ebook_converter.startup import winutil, winutilerror
 from ebook_converter.utils.icu import safe_chr
@@ -51,23 +51,28 @@ def confirm_config_name(name):
     return name + '_again'
 
 
-_filename_sanitize_unicode = frozenset(('\\', '|', '?', '*', '<',  # no2to3
-                            '"', ':', '>', '+', '/') + tuple(map(chr, range(32))))  # no2to3
+_filename_sanitize_unicode = frozenset(('\\', '|', '?', '*', '<',
+                                        '"', ':', '>', '+', '/') +
+                                       tuple(map(chr, range(32))))
 
 
 def sanitize_file_name(name, substitute='_'):
-    '''
-    Sanitize the filename `name`. All invalid characters are replaced by `substitute`.
-    The set of invalid characters is the union of the invalid characters in Windows,
-    macOS and Linux. Also removes leading and trailing whitespace.
-    **WARNING:** This function also replaces path separators, so only pass file names
-    and not full paths to it.
-    '''
+    """
+    Sanitize the filename `name`. All invalid characters are replaced by
+    `substitute`. The set of invalid characters is the union of the invalid
+    characters in Windows, macOS and Linux. Also removes leading and trailing
+    whitespace.
+
+    **WARNING:** This function also replaces path separators, so only pass
+    file names and not full paths to it.
+    """
     if isinstance(name, bytes):
         name = name.decode(filesystem_encoding, 'replace')
     if isinstance(substitute, bytes):
         substitute = substitute.decode(filesystem_encoding, 'replace')
-    chars = (substitute if c in _filename_sanitize_unicode else c for c in name)
+    chars = (substitute
+             if c in _filename_sanitize_unicode else c for c in name)
     one = ''.join(chars)
     one = re.sub(r'\s', ' ', one).strip()
     bname, ext = os.path.splitext(one)
@@ -87,8 +92,8 @@ def prints(*args, **kwargs):
     """
     Print unicode arguments safely by encoding them to preferred_encoding
     Has the same signature as the print function from Python 3, except for the
-    additional keyword argument safe_encode, which if set to True will cause the
-    function to use repr when encoding fails.
+    additional keyword argument safe_encode, which if set to True will cause
+    the function to use repr when encoding fails.
 
     Returns the number of bytes written.
     """
@@ -120,7 +125,7 @@ def prints(*args, **kwargs):
             except UnicodeEncodeError:
                 try:
                     arg = arg.encode('utf-8')
-                except:
+                except Exception:
                     if not safe_encode:
                         raise
                     arg = repr(arg)
@@ -131,7 +136,7 @@ def prints(*args, **kwargs):
             except UnicodeEncodeError:
                 try:
                     arg = arg.encode('utf-8')
-                except:
+                except Exception:
                     if not safe_encode:
                         raise
                     arg = repr(arg)
@@ -139,7 +144,7 @@ def prints(*args, **kwargs):
         try:
             file.write(arg)
             count += len(arg)
-        except:
+        except Exception:
             from polyglot import reprlib
             arg = reprlib.repr(arg)
             file.write(arg)
@@ -168,22 +173,12 @@ def setup_cli_handlers(logger, level):
     elif level == logging.DEBUG:
         handler = logging.StreamHandler(sys.stderr)
         handler.setLevel(logging.DEBUG)
-        handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s'))
+        handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:'
+                                               '%(lineno)s: %(message)s'))
 
     logger.addHandler(handler)
 
 
-def load_library(name, cdll):
-    if iswindows:
-        return cdll.LoadLibrary(name)
-    if isosx:
-        name += '.dylib'
-        if hasattr(sys, 'frameworks_dir'):
-            return cdll.LoadLibrary(os.path.join(getattr(sys, 'frameworks_dir'), name))
-        return cdll.LoadLibrary(name)
-    return cdll.LoadLibrary(name+'.so')
-
-
 def extract(path, dir):
     extractor = None
     # First use the file header to identify its type
@@ -216,7 +211,8 @@ def fit_image(width, height, pwidth, pheight):
     @param height: Height of image
     @param pwidth: Width of box
     @param pheight: Height of box
-    @return: scaled, new_width, new_height. scaled is True iff new_width and/or new_height is different from width or height.
+    @return: scaled, new_width, new_height. scaled is True iff new_width
+             and/or new_height is different from width or height.
     '''
     scaled = height > pheight or width > pwidth
     if height > pheight:
@@ -262,8 +258,10 @@ def walk(dir):
 
 def strftime(fmt, t=None):
-    ''' A version of strftime that returns unicode strings and tries to handle dates
-    before 1900 '''
+    """
+    A version of strftime that returns unicode strings and tries to handle
+    dates before 1900
+    """
     if not fmt:
         return ''
     if t is None:
@@ -272,7 +270,7 @@ def strftime(fmt, t=None):
         t = t.timetuple()
     early_year = t[0] < 1900
     if early_year:
-        replacement = 1900 if t[0]%4 == 0 else 1901
+        replacement = 1900 if t[0] % 4 == 0 else 1901
         fmt = fmt.replace('%Y', '_early year hack##')
         t = list(t)
         orig_year = t[0]
@@ -301,27 +299,33 @@ def my_unichr(num):
 
 def entity_to_unicode(match, exceptions=[], encoding='cp1252',
                       result_exceptions={}):
-    '''
-    :param match: A match object such that '&'+match.group(1)';' is the entity.
-    :param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234'
-    :param encoding: The encoding to use to decode numeric entities between 128 and 256.
-    If None, the Unicode UCS encoding is used. A common encoding is cp1252.
-    :param result_exceptions: A mapping of characters to entities. If the result
-    is in result_exceptions, result_exception[result] is returned instead.
-    Convenient way to specify exception for things like < or > that can be
-    specified by various actual entities.
-    '''
+    """
+    :param match: A match object such that '&'+match.group(1)';' is the entity.
+    :param exceptions: A list of entities to not convert (Each entry is the
+                       name of the entity, for e.g. 'apos' or '#1234'
+    :param encoding: The encoding to use to decode numeric entities between
+                     128 and 256. If None, the Unicode UCS encoding is used.
+                     A common encoding is cp1252.
+    :param result_exceptions: A mapping of characters to entities. If the
+                              result is in result_exceptions,
+                              result_exception[result] is returned instead.
+                              Convenient way to specify exception for things
+                              like < or > that can be specified by various
+                              actual entities.
+    """
     def check(ch):
         return result_exceptions.get(ch, ch)
     ent = match.group(1)
     if ent in exceptions:
         return '&'+ent+';'
-    if ent in {'apos', 'squot'}:  # squot is generated by some broken CMS software
+    # squot is generated by some broken CMS software
+    if ent in {'apos', 'squot'}:
         return check("'")
     if ent == 'hellips':
         ent = 'hellip'
@@ -331,7 +335,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
             num = int(ent[2:], 16)
         else:
             num = int(ent[1:])
-    except:
+    except Exception:
         return '&'+ent+';'
     if encoding is None or num > 255:
         return check(my_unichr(num))
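
Note: entity_to_unicode is written as a callback for re.sub; the preprocessing module further down in this diff builds convert_entities exactly that way. A minimal usage sketch of that calling pattern (the sample string and the trimmed result_exceptions mapping are illustrative, not from the repository):

    import functools
    import re

    from ebook_converter import entity_to_unicode

    # Keep markup-significant characters escaped, as convert_entities does.
    convert = functools.partial(entity_to_unicode,
                                result_exceptions={'<': '&lt;', '>': '&gt;',
                                                   '"': '&quot;', '&': '&amp;'})
    text = re.sub(r'&(\S+?);', convert, '&hellip; &#169; &lt;b&gt;')
    # text is now: '… © &lt;b&gt;'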
@@ -394,15 +398,6 @@ def force_unicode(obj, enc=preferred_encoding):
     return obj
 
 
-def as_unicode(obj, enc=preferred_encoding):
-    if not isinstance(obj, bytes):
-        try:
-            obj = str(obj)
-        except Exception:
-            obj = repr(obj)
-    return force_unicode(obj, enc=enc)
-
-
 def url_slash_cleaner(url):
     '''
     Removes redundant /'s from url's.
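
The helper removed above coerced any object, bytes included, to text before handing it to force_unicode. The call sites updated in the remaining hunks of this commit simply pass the exception, or call str() on it, since %s formatting already stringifies it under Python 3. A minimal sketch of the replacement pattern, with illustrative names:

    def report_failure(log, pattern, exc):
        # Previously: log.error('... because %s' % (pattern, as_unicode(exc)))
        # %s formatting calls str() on the exception, which is enough now.
        log.error('Failed to parse %r regexp because %s' % (pattern, exc))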

View File

@@ -1,43 +1,37 @@
-import functools, re, json
-from math import ceil
+import functools
+import json
+import math
+import re
 
-from ebook_converter import entity_to_unicode, as_unicode
-
-__license__ = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
+from ebook_converter import entity_to_unicode
 
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
 XLINK_NS = 'http://www.w3.org/1999/xlink'
 
 convert_entities = functools.partial(entity_to_unicode,
-        result_exceptions={
-            '<' : '&lt;',
-            '>' : '&gt;',
-            "'" : '&apos;',
-            '"' : '&quot;',
-            '&' : '&amp;',
-        })
-_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+                                     result_exceptions={'<': '&lt;',
+                                                        '>': '&gt;',
+                                                        "'": '&apos;',
+                                                        '"': '&quot;',
+                                                        '&': '&amp;'})
+_span_pat = re.compile('<span.*?</span>', re.DOTALL | re.IGNORECASE)
 
-LIGATURES = {
-        # '\u00c6': 'AE',
-        # '\u00e6': 'ae',
-        # '\u0152': 'OE',
-        # '\u0153': 'oe',
-        # '\u0132': 'IJ',
-        # '\u0133': 'ij',
-        # '\u1D6B': 'ue',
-        '\uFB00': 'ff',
-        '\uFB01': 'fi',
-        '\uFB02': 'fl',
-        '\uFB03': 'ffi',
-        '\uFB04': 'ffl',
-        '\uFB05': 'ft',
-        '\uFB06': 'st',
-        }
+LIGATURES = {'\uFB00': 'ff',
+             '\uFB01': 'fi',
+             '\uFB02': 'fl',
+             '\uFB03': 'ffi',
+             '\uFB04': 'ffl',
+             '\uFB05': 'ft',
+             '\uFB06': 'st'}
+# '\u00c6': 'AE',
+# '\u00e6': 'ae',
+# '\u0152': 'OE',
+# '\u0153': 'oe',
+# '\u0132': 'IJ',
+# '\u0133': 'ij',
+# '\u1D6B': 'ue',
 
 _ligpat = re.compile('|'.join(LIGATURES))
@@ -83,17 +77,18 @@ def smarten_punctuation(html, log=None):
 
 class DocAnalysis(object):
-    '''
-    Provides various text analysis functions to determine how the document is structured.
-    format is the type of document analysis will be done against.
-    raw is the raw text to determine the line length to use for wrapping.
-    Blank lines are excluded from analysis
-    '''
+    """
+    Provides various text analysis functions to determine how the document is
+    structured. format is the type of document analysis will be done against.
+    raw is the raw text to determine the line length to use for wrapping.
+    Blank lines are excluded from analysis
+    """
 
     def __init__(self, format='html', raw=''):
         raw = raw.replace('&nbsp;', ' ')
         if format == 'html':
-            linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+            linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)',
+                                re.DOTALL)
         elif format == 'pdf':
             linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
         elif format == 'spanned_html':
@@ -103,13 +98,13 @@ class DocAnalysis(object):
         self.lines = linere.findall(raw)
 
     def line_length(self, percent):
-        '''
+        """
         Analyses the document to find the median line length.
         percentage is a decimal number, 0 - 1 which is used to determine
         how far in the list of line lengths to use. The list of line lengths is
         ordered smallest to largest and does not include duplicates. 0.5 is the
         median value.
-        '''
+        """
         lengths = []
         for line in self.lines:
             if len(line) > 0:
@@ -121,7 +116,7 @@ class DocAnalysis(object):
         lengths = list(set(lengths))
         total = sum(lengths)
         avg = total / len(lengths)
-        max_line = ceil(avg * 2)
+        max_line = math.ceil(avg * 2)
 
         lengths = sorted(lengths)
         for i in range(len(lengths) - 1, -1, -1):
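
For context, line_length() is consumed later in this same diff by HTMLPreProcessor, which feeds the result into its unwrapping regexes. A sketch of that calling pattern (the 0.4 and 0.85 thresholds are illustrative values, not repository defaults):

    docanalysis = DocAnalysis('pdf', html)   # 'pdf' input splits lines on <br>
    length = docanalysis.line_length(0.4)    # line length at the chosen percentile
    hard_breaks = docanalysis.line_histogram(0.85)  # True if lines cluster in few buckets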
@@ -138,31 +133,32 @@ class DocAnalysis(object):
         return lengths[index]
 
     def line_histogram(self, percent):
-        '''
-        Creates a broad histogram of the document to determine whether it incorporates hard
-        line breaks. Lines are sorted into 20 'buckets' based on length.
-        percent is the percentage of lines that should be in a single bucket to return true
-        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
-        '''
-        minLineLength=20  # Ignore lines under 20 chars (typical of spaces)
-        maxLineLength=1900  # Discard larger than this to stay in range
-        buckets=20  # Each line is divided into a bucket based on length
+        """
+        Creates a broad histogram of the document to determine whether it
+        incorporates hard line breaks. Lines are sorted into 20 'buckets'
+        based on length. percent is the percentage of lines that should be in
+        a single bucket to return true The majority of the lines will exist in
+        1-2 buckets in typical docs with hard line breaks
+        """
+        minLineLength = 20  # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength = 1900  # Discard larger than this to stay in range
+        buckets = 20  # Each line is divided into a bucket based on length
 
         # print("there are "+str(len(lines))+" lines")
         # max = 0
         # for line in self.lines:
-        #    l = len(line)
-        #    if l > max:
-        #        max = l
+        #    _l = len(line)
+        #    if _l > max:
+        #        max = _l
         # print("max line found is "+str(max))
         # Build the line length histogram
-        hRaw = [0 for i in range(0,buckets)]
+        hRaw = [0 for i in range(0, buckets)]
         for line in self.lines:
-            l = len(line)
-            if l > minLineLength and l < maxLineLength:
-                l = int(l // 100)
-                # print("adding "+str(l))
-                hRaw[l]+=1
+            _l = len(line)
+            if _l > minLineLength and _l < maxLineLength:
+                _l = int(_l // 100)
+                # print("adding "+str(_l))
+                hRaw[_l] += 1
 
         # Normalize the histogram into percents
         totalLines = len(self.lines)
@@ -175,7 +171,7 @@ class DocAnalysis(object):
 
         # Find the biggest bucket
         maxValue = 0
-        for i in range(0,len(h)):
+        for i in range(0, len(h)):
             if h[i] > maxValue:
                 maxValue = h[i]
@@ -188,36 +184,42 @@ class DocAnalysis(object):
 
 class Dehyphenator(object):
-    '''
-    Analyzes words to determine whether hyphens should be retained/removed. Uses the document
-    itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
-    scientific words. The primary disadvantage is that words appearing only once in the document
-    retain hyphens.
-    '''
+    """
+    Analyzes words to determine whether hyphens should be retained/removed.
+    Uses the document itself is as a dictionary. This method handles all
+    languages along with uncommon, made-up, and scientific words. The primary
+    disadvantage is that words appearing only once in the document retain
+    hyphens.
+    """
 
     def __init__(self, verbose=0, log=None):
         self.log = log
         self.verbose = verbose
-        # Add common suffixes to the regex below to increase the likelihood of a match -
-        # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        # only remove if it's not already the point of hyphenation
-        self.suffix_string = (
-            "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
-            "(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
-            "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
+        # Add common suffixes to the regex below to increase the likelihood of
+        # a match - don't add suffixes which are also complete words, such as
+        # 'able' or 'sex' only remove if it's not already the point of
+        # hyphenation
+        self.suffix_string = ("((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?"
+                              "|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|"
+                              "istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
+                              "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier"
+                              "|al|ex|ian)$")
         self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
-        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
-        # remove prefixes if the prefix was not already the point of hyphenation
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string,
+                                         re.IGNORECASE)
+        # remove prefixes if the prefix was not already the point of
+        # hyphenation
         self.prefix_string = '^(dis|re|un|in|ex)'
         self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
-        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string,
+                                       re.IGNORECASE)
 
     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
         secondhalf = match.group('secondpart')
         try:
             wraptags = match.group('wraptags')
-        except:
+        except Exception:
             wraptags = ''
         hyphenated = str(firsthalf) + "-" + str(secondhalf)
         dehyphenated = str(firsthalf) + str(secondhalf)
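
Dehyphenator is invoked later in this diff from the pdftohtml handling in HTMLPreProcessor, roughly as sketched below; length is the DocAnalysis line length, so only hyphens near the wrap margin are candidates for joining:

    dehyphenator = Dehyphenator(verbose, log)
    html = dehyphenator(html, 'html', length)  # format picks the intextmatch regex in __call__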
@@ -231,65 +233,84 @@ class Dehyphenator(object):
self.log("lookup word is: "+lookupword+", orig is: " + hyphenated) self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
try: try:
searchresult = self.html.find(lookupword.lower()) searchresult = self.html.find(lookupword.lower())
except: except Exception:
return hyphenated return hyphenated
if self.format == 'html_cleanup' or self.format == 'txt_cleanup': if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1: if self.html.find(lookupword) != -1 or searchresult != -1:
if self.verbose > 2: if self.verbose > 2:
self.log(" Cleanup:returned dehyphenated word: " + dehyphenated) self.log(" Cleanup:returned dehyphenated word: " +
dehyphenated)
return dehyphenated return dehyphenated
elif self.html.find(hyphenated) != -1: elif self.html.find(hyphenated) != -1:
if self.verbose > 2: if self.verbose > 2:
self.log(" Cleanup:returned hyphenated word: " + hyphenated) self.log(" Cleanup:returned hyphenated word: " +
hyphenated)
return hyphenated return hyphenated
else: else:
if self.verbose > 2: if self.verbose > 2:
self.log(" Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf) self.log(" Cleanup:returning original text " +
firsthalf + " + linefeed " + secondhalf)
return firsthalf+'\u2014'+wraptags+secondhalf return firsthalf+'\u2014'+wraptags+secondhalf
else: else:
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6: if (self.format == 'individual_words' and
len(firsthalf) + len(secondhalf) <= 6):
if self.verbose > 2: if self.verbose > 2:
self.log("too short, returned hyphenated word: " + hyphenated) self.log("too short, returned hyphenated word: " +
hyphenated)
return hyphenated return hyphenated
if len(firsthalf) <= 2 and len(secondhalf) <= 2: if len(firsthalf) <= 2 and len(secondhalf) <= 2:
if self.verbose > 2: if self.verbose > 2:
self.log("too short, returned hyphenated word: " + hyphenated) self.log("too short, returned hyphenated word: " +
hyphenated)
return hyphenated return hyphenated
if self.html.find(lookupword) != -1 or searchresult != -1: if self.html.find(lookupword) != -1 or searchresult != -1:
if self.verbose > 2: if self.verbose > 2:
self.log(" returned dehyphenated word: " + dehyphenated) self.log(" returned dehyphenated word: " +
dehyphenated)
return dehyphenated return dehyphenated
else: else:
if self.verbose > 2: if self.verbose > 2:
self.log(" returned hyphenated word: " + hyphenated) self.log(" returned hyphenated word: " +
hyphenated)
return hyphenated return hyphenated
def __call__(self, html, format, length=1): def __call__(self, html, format, length=1):
self.html = html self.html = html
self.format = format self.format = format
if format == 'html': if format == 'html':
intextmatch = re.compile(( intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)'
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?' r'(-|)\s*(?=<)(?P<wraptags>(</span>)?'
r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)' r'\s*(</[iubp]>\s*){1,2}'
r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length) r'(?P<up2threeblanks><(p|div)[^>]*>\s*'
r'(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+)'
r'{0,3}\s*(<[iubp][^>]*>\s*){1,2}'
r'(<span[^>]*>)?)\s*(?P<secondpart>'
r'[\w\d]+)' % length)
elif format == 'pdf': elif format == 'pdf':
intextmatch = re.compile(( intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)'
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?P<wraptags><p>|' r'\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*'
r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length) r'<[iub]>)\s*(?P<secondpart>[\w\d]+)' %
length)
elif format == 'txt': elif format == 'txt':
intextmatch = re.compile( intextmatch = re.compile('(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|)'
'(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length) '(\u0020|\u0009)*(?P<wraptags>'
'(\n(\u0020|\u0009)*)+)(?P<secondpart>'
'[\\w\\d]+)' % length)
elif format == 'individual_words': elif format == 'individual_words':
intextmatch = re.compile( intextmatch = re.compile(r'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*'
r'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE) r'(?P<secondpart>\w+)(?![^<]*?>)',
re.UNICODE)
elif format == 'html_cleanup': elif format == 'html_cleanup':
intextmatch = re.compile( intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)'
r'(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>' r'(?P<wraptags></span>\s*(</[iubp]>\s*'
r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)') r'<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>'
r'\s*<[iubp][^>]*>)?\s*(?P<secondpart>'
r'[\w\d]+)')
elif format == 'txt_cleanup': elif format == 'txt_cleanup':
intextmatch = re.compile( intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|)'
r'(?P<firstpart>[^\W\-]+)(-|)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)') r'(?P<wraptags>\s+)(?P<secondpart>'
r'[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html) html = intextmatch.sub(self.dehyphenate, html)
return html return html
@@ -299,18 +320,18 @@ class CSSPreProcessor(object):
 
     # Remove some of the broken CSS Microsoft products
     # create
     MS_PAT = re.compile(r'''
         (?P<start>^|;|\{)\s*    # The end of the previous rule or block start
         (%s).+?                 # The invalid selectors
         (?P<end>$|;|\})         # The end of the declaration
-        '''%'mso-|panose-|text-underline|tab-interval',
-        re.MULTILINE|re.IGNORECASE|re.VERBOSE)
+        ''' % 'mso-|panose-|text-underline|tab-interval',
+        re.MULTILINE | re.IGNORECASE | re.VERBOSE)
 
     def ms_sub(self, match):
         end = match.group('end')
         try:
             start = match.group('start')
-        except:
+        except Exception:
             start = ''
         if end == ';':
             end = ''
@@ -332,7 +353,7 @@ class CSSPreProcessor(object):
         for line in data.splitlines():
             ll = line.lstrip()
             if not (namespaced or ll.startswith('@import') or not ll or
-                        ll.startswith('@charset')):
+                    ll.startswith('@charset')):
                 ans.append(XHTML_CSS_NAMESPACE.strip())
                 namespaced = True
             ans.append(line)
@@ -359,7 +380,8 @@ def accent_regex(accent_maps, letter_before=False):
         args = ''.join(accent_cat), ''.join(letters)
         accent_group, letter_group = 1, 2
 
-    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
+    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args),
+                     re.UNICODE)
 
     def sub(m):
         lmap = accent_maps[m.group(accent_group)]
def html_preprocess_rules(): def html_preprocess_rules():
ans = getattr(html_preprocess_rules, 'ans', None) ans = getattr(html_preprocess_rules, 'ans', None)
if ans is None: if ans is None:
ans = html_preprocess_rules.ans = [ ans = [
# Remove huge block of contiguous spaces as they slow down # Remove huge block of contiguous spaces as they slow down
# the following regexes pretty badly # the following regexes pretty badly
(re.compile(r'\s{10000,}'), ''), (re.compile(r'\s{10000,}'), ''),
# Some idiotic HTML generators (Frontpage I'm looking at you) # Some idiotic HTML generators (Frontpage I'm looking at you)
# Put all sorts of crap into <head>. This messes up lxml # Put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL), (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>',
sanitize_head), re.IGNORECASE | re.DOTALL), sanitize_head),
# Convert all entities, since lxml doesn't handle them well # Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities), (re.compile(r'&(\S+?);'), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling, MS Word # Remove the <![if/endif tags inserted by everybody's darling,
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''), # MS Word
] (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
'')]
html_preprocess_rules.ans = ans
return ans return ans
def pdftohtml_rules(): def pdftohtml_rules():
ans = getattr(pdftohtml_rules, 'ans', None) ans = getattr(pdftohtml_rules, 'ans', None)
if ans is None: if ans is None:
ans = pdftohtml_rules.ans = [ ans = [accent_regex({'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
accent_regex({ '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ', '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚ'
'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ', 'úÚźŹ',
'´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ', 'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ', '¸': 'cC:çÇ',
'¸': 'cC:çÇ', '˛': 'aAeE:ąĄęĘ',
'˛': 'aAeE:ąĄęĘ', '˙': 'zZ:żŻ',
'˙': 'zZ:żŻ', 'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ', '°': 'uU:ůŮ'}),
'°': 'uU:ůŮ', accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'},
}), letter_before=True),
accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True), # If pdf printed from a browser then the header/footer has a
# reliable pattern
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?'
r'[A-Z].*<br>(?=\s*<hr>))',
re.IGNORECASE), lambda match: ''),
# If pdf printed from a browser then the header/footer has a reliable pattern # Center separator lines
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'),
lambda match: '<p>\n<p style="text-align:center">' +
match.group('break') + '</p>'),
# Center separator lines # Remove <hr> tags
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'), (re.compile(r'<hr.*?>', re.IGNORECASE), ''),
# Remove <hr> tags # Remove gray background
(re.compile(r'<hr.*?>', re.IGNORECASE), ''), (re.compile(r'<BODY[^<>]+>'), '<BODY>'),
# Remove gray background # Convert line breaks to paragraphs
(re.compile(r'<BODY[^<>]+>'), '<BODY>'), (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
(re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
(re.compile(r'\s*</body>'), '</p>\n</body>'),
# Convert line breaks to paragraphs # Clean up spaces
(re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'), (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
(re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'), # Add space before and after italics
(re.compile(r'\s*</body>'), '</p>\n</body>'), (re.compile(r'(?<!“)<i>'), ' <i>'),
(re.compile(r'</i>(?=\w)'), '</i> ')]
# Clean up spaces pdftohtml_rules.ans = ans
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
# Add space before and after italics
(re.compile(r'(?<!“)<i>'), ' <i>'),
(re.compile(r'</i>(?=\w)'), '</i> '),
]
return ans return ans
def book_designer_rules(): def book_designer_rules():
ans = getattr(book_designer_rules, 'ans', None) ans = getattr(book_designer_rules, 'ans', None)
if ans is None: if ans is None:
ans = book_designer_rules.ans = [ ans = [(re.compile('<hr>', re.IGNORECASE),
# HR lambda match: '<span style="page-break-after:always"> '
(re.compile('<hr>', re.IGNORECASE), '</span>'),
lambda match : '<span style="page-break-after:always"> </span>'), # Create header tags
# Create header tags (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)'
(re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), r'(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))), lambda match: '<h1 id="BookTitle" align="%s">%s</h1>' %
(re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), (match.group(2) if match.group(2) else 'center',
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))), match.group(3))),
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)'
lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)), r'(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), lambda match: '<h2 id="BookAuthor" align="%s">%s</h2>' %
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)), (match.group(2) if match.group(2) else 'center',
] match.group(3))),
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>',
re.IGNORECASE | re.DOTALL),
lambda match: '<h2 class="title">%s</h2>' % (match.group(1),)),
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>',
re.IGNORECASE | re.DOTALL),
lambda match: '<h3 class="subtitle">%s</h3>' %
(match.group(1),))]
book_designer_rules.ans = ans
return None return None
@@ -470,7 +505,7 @@ class HTMLPreProcessor(object):
return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000] return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]
def __call__(self, html, remove_special_chars=None, def __call__(self, html, remove_special_chars=None,
get_preprocess_html=False): get_preprocess_html=False):
if remove_special_chars is not None: if remove_special_chars is not None:
html = remove_special_chars.sub('', html) html = remove_special_chars.sub('', html)
html = html.replace('\0', '') html = html.replace('\0', '')
@@ -487,13 +522,14 @@ class HTMLPreProcessor(object):
             start_rules = []
 
         if not getattr(self.extra_opts, 'keep_ligatures', False):
-            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
+            html = _ligpat.sub(lambda m: LIGATURES[m.group()], html)
 
         user_sr_rules = {}
 
         # Function for processing search and replace
         def do_search_replace(search_pattern, replace_txt):
-            from ebook_converter.ebooks.conversion.search_replace import compile_regular_expression
+            from ebook_converter.ebooks.conversion.search_replace import \
+                compile_regular_expression
             try:
                 search_re = compile_regular_expression(search_pattern)
                 if not replace_txt:
@@ -502,11 +538,11 @@ class HTMLPreProcessor(object):
                 user_sr_rules[(search_re, replace_txt)] = search_pattern
             except Exception as e:
                 self.log.error('Failed to parse %r regexp because %s' %
-                               (search, as_unicode(e)))
+                               (search, e))
 
         # search / replace using the sr?_search / sr?_replace options
         for i in range(1, 4):
-            search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
+            search, replace = 'sr%d_search' % i, 'sr%d_replace' % i
             search_pattern = getattr(self.extra_opts, search, '')
             replace_txt = getattr(self.extra_opts, replace, '')
             if search_pattern:
@@ -520,31 +556,35 @@ class HTMLPreProcessor(object):
do_search_replace(search_pattern, replace_txt) do_search_replace(search_pattern, replace_txt)
end_rules = [] end_rules = []
# delete soft hyphens - moved here so it's executed after header/footer removal # delete soft hyphens - moved here so it's executed after
# header/footer removal
if is_pdftohtml: if is_pdftohtml:
# unwrap/delete soft hyphens # unwrap/delete soft hyphens
end_rules.append((re.compile( end_rules.append((re.compile(r'[­](</p>\s*<p>\s*)+\s*'
r'[­](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: '')) r'(?=[\[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting # unwrap/delete soft hyphens with formatting
end_rules.append((re.compile( end_rules.append((re.compile(r'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+'
r'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: '')) r'\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'),
lambda match: ''))
length = -1 length = -1
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
docanalysis = DocAnalysis('pdf', html) docanalysis = DocAnalysis('pdf', html)
length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor')) length = docanalysis.line_length(getattr(self.extra_opts,
'unwrap_factor'))
if length: if length:
# print("The pdf line length returned is " + str(length)) # print("The pdf line length returned is " + str(length))
# unwrap em/en dashes # unwrap em/en dashes
end_rules.append((re.compile( end_rules.append((re.compile(r'(?<=.{%i}[–—])\s*<p>\s*'
r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: '')) r'(?=[\[a-z\d])' % length),
lambda match: ''))
end_rules.append( end_rules.append(
# Un wrap using punctuation # Un wrap using punctuation
(re.compile(( (re.compile((r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçą'
r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]' r'ężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4})'
r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?' r';))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*'
r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines), r'<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])') %
) length, re.UNICODE), wrap_lines))
for rule in html_preprocess_rules() + start_rules: for rule in html_preprocess_rules() + start_rules:
html = rule[0].sub(rule[1], html) html = rule[0].sub(rule[1], html)
@@ -567,7 +607,7 @@ class HTMLPreProcessor(object):
                 name, i = None, 0
                 while not name or os.path.exists(os.path.join(odir, name)):
                     i += 1
-                    name = '%04d.html'%i
+                    name = '%04d.html' % i
                 with open(os.path.join(odir, name), 'wb') as f:
                     f.write(raw.encode('utf-8'))
@@ -578,20 +618,20 @@ class HTMLPreProcessor(object):
html = rule[0].sub(rule[1], html) html = rule[0].sub(rule[1], html)
except Exception as e: except Exception as e:
if rule in user_sr_rules: if rule in user_sr_rules:
self.log.error( self.log.error('User supplied search & replace rule: %s '
'User supplied search & replace rule: %s -> %s ' '-> %s failed with error: %s, ignoring.' %
'failed with error: %s, ignoring.'%( (user_sr_rules[rule], rule[1], e))
user_sr_rules[rule], rule[1], e))
else: else:
raise raise
if is_pdftohtml and length > -1: if is_pdftohtml and length > -1:
# Dehyphenate # Dehyphenate
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log) dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
html = dehyphenator(html,'html', length) html = dehyphenator(html, 'html', length)
if is_pdftohtml: if is_pdftohtml:
from ebook_converter.ebooks.conversion.utils import HeuristicProcessor from ebook_converter.ebooks.conversion.utils import \
HeuristicProcessor
pdf_markup = HeuristicProcessor(self.extra_opts, None) pdf_markup = HeuristicProcessor(self.extra_opts, None)
totalwords = 0 totalwords = 0
if pdf_markup.get_word_count(html) > 7000: if pdf_markup.get_word_count(html) > 7000:
@@ -613,23 +653,26 @@ class HTMLPreProcessor(object):
from ebook_converter.utils.localization import get_udc from ebook_converter.utils.localization import get_udc
from ebook_converter.utils.mreplace import MReplace from ebook_converter.utils.mreplace import MReplace
unihandecoder = get_udc() unihandecoder = get_udc()
mr = MReplace(data={'«':'&lt;'*3, '»':'&gt;'*3}) mr = MReplace(data={'«': '&lt;' * 3, '»': '&gt;' * 3})
html = mr.mreplace(html) html = mr.mreplace(html)
html = unihandecoder.decode(html) html = unihandecoder.decode(html)
if getattr(self.extra_opts, 'enable_heuristics', False): if getattr(self.extra_opts, 'enable_heuristics', False):
from ebook_converter.ebooks.conversion.utils import HeuristicProcessor from ebook_converter.ebooks.conversion.utils import \
HeuristicProcessor
preprocessor = HeuristicProcessor(self.extra_opts, self.log) preprocessor = HeuristicProcessor(self.extra_opts, self.log)
html = preprocessor(html) html = preprocessor(html)
if is_pdftohtml: if is_pdftohtml:
html = html.replace('<!-- created by ebook-converter\'s pdftohtml -->', '') html = html.replace('<!-- created by ebook-converter\'s '
'pdftohtml -->', '')
if getattr(self.extra_opts, 'smarten_punctuation', False): if getattr(self.extra_opts, 'smarten_punctuation', False):
html = smarten_punctuation(html, self.log) html = smarten_punctuation(html, self.log)
try: try:
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars unsupported_unicode_chars = (self.extra_opts.output_profile
.unsupported_unicode_chars)
except AttributeError: except AttributeError:
unsupported_unicode_chars = '' unsupported_unicode_chars = ''
if unsupported_unicode_chars: if unsupported_unicode_chars:

View File

@@ -10,19 +10,13 @@ import urllib.parse
 from ebook_converter.ebooks.oeb.base import urlunquote
 from ebook_converter.ebooks.chardet import detect_xml_encoding
 from ebook_converter.constants_old import iswindows
-from ebook_converter import unicode_path, as_unicode, replace_entities
-
-__license__ = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
+from ebook_converter import unicode_path, replace_entities
 
 
 class Link(object):
-    '''
+    """
     Represents a link in a HTML file.
-    '''
+    """
 
     @classmethod
     def url_to_local_path(cls, url, base):
@@ -31,7 +25,8 @@ class Link(object):
if iswindows and path.startswith('/'): if iswindows and path.startswith('/'):
path = path[1:] path = path[1:]
isabs = True isabs = True
path = urllib.parse.urlunparse(('', '', path, url.params, url.query, '')) path = urllib.parse.urlunparse(('', '', path, url.params, url.query,
''))
path = urlunquote(path) path = urlunquote(path)
if isabs or os.path.isabs(path): if isabs or os.path.isabs(path):
return path return path
@@ -39,17 +34,18 @@ class Link(object):
def __init__(self, url, base): def __init__(self, url, base):
''' '''
:param url: The url this link points to. Must be an unquoted unicode string. :param url: The url this link points to. Must be an unquoted unicode
:param base: The base directory that relative URLs are with respect to. string.
Must be a unicode string. :param base: The base directory that relative URLs are with respect
to. Must be a unicode string.
''' '''
assert isinstance(url, str) and isinstance(base, str) assert isinstance(url, str) and isinstance(base, str)
self.url = url self.url = url
self.parsed_url = urllib.parse.urlparse(self.url) self.parsed_url = urllib.parse.urlparse(self.url)
self.is_local = self.parsed_url.scheme in ('', 'file') self.is_local = self.parsed_url.scheme in ('', 'file')
self.is_internal = self.is_local and not bool(self.parsed_url.path) self.is_internal = self.is_local and not bool(self.parsed_url.path)
self.path = None self.path = None
self.fragment = urlunquote(self.parsed_url.fragment) self.fragment = urlunquote(self.parsed_url.fragment)
if self.is_local and not self.is_internal: if self.is_local and not self.is_internal:
self.path = self.url_to_local_path(self.parsed_url, base) self.path = self.url_to_local_path(self.parsed_url, base)
@@ -62,7 +58,7 @@ class Link(object):
         return self.path == getattr(other, 'path', other)
 
     def __str__(self):
-        return 'Link: %s --> %s'%(self.url, self.path)
+        return 'Link: %s --> %s' % (self.url, self.path)
 
 
 class IgnoreFile(Exception):
@@ -84,24 +80,25 @@ class HTMLFile(object):
The encoding of the file is available as :member:`encoding`. The encoding of the file is available as :member:`encoding`.
''' '''
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE) HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE) TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
LINK_PAT = re.compile( LINK_PAT = re.compile(r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|'
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))', r'(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
re.DOTALL|re.IGNORECASE) re.DOTALL | re.IGNORECASE)
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None): def __init__(self, path_to_html_file, level, encoding, verbose,
''' referrer=None):
"""
:param level: The level of this file. Should be 0 for the root file. :param level: The level of this file. Should be 0 for the root file.
:param encoding: Use `encoding` to decode HTML. :param encoding: Use `encoding` to decode HTML.
:param referrer: The :class:`HTMLFile` that first refers to this file. :param referrer: The :class:`HTMLFile` that first refers to this file.
''' """
self.path = unicode_path(path_to_html_file, abs=True) self.path = unicode_path(path_to_html_file, abs=True)
self.title = os.path.splitext(os.path.basename(self.path))[0] self.title = os.path.splitext(os.path.basename(self.path))[0]
self.base = os.path.dirname(self.path) self.base = os.path.dirname(self.path)
self.level = level self.level = level
self.referrer = referrer self.referrer = referrer
self.links = [] self.links = []
try: try:
with open(self.path, 'rb') as f: with open(self.path, 'rb') as f:
@@ -112,18 +109,21 @@ class HTMLFile(object):
                         header = header.decode(encoding)
                     except ValueError:
                         pass
-                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
+                self.is_binary = level > 0 and not bool(self
+                                                        .HTML_PAT
+                                                        .search(header))
                 if not self.is_binary:
                     src += f.read()
         except IOError as err:
-            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
+            msg = ('Could not read from file: %s with error: %s' %
+                   (self.path, str(err)))
             if level == 0:
                 raise IOError(msg)
             raise IgnoreFile(msg, err.errno)
 
         if not src:
             if level == 0:
-                raise ValueError('The file %s is empty'%self.path)
+                raise ValueError('The file %s is empty' % self.path)
             self.is_binary = True
 
         if not self.is_binary:
@@ -145,7 +145,9 @@ class HTMLFile(object):
return hash(self.path) return hash(self.path)
def __str__(self): def __str__(self):
return 'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path) return 'HTMLFile:%d:%s:%s' % (self.level,
'b' if self.is_binary else 'a',
self.path)
def __repr__(self): def __repr__(self):
return str(self) return str(self)
@@ -191,20 +193,22 @@ def depth_first(root, flat, visited=None):
visited.add(hf) visited.add(hf)
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None): def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0,
''' encoding=None):
"""
Recursively traverse all links in the HTML file. Recursively traverse all links in the HTML file.
:param max_levels: Maximum levels of recursion. Must be non-negative. 0 :param max_levels: Maximum levels of recursion. Must be non-negative. 0
implies that no links in the root HTML file are followed. implies that no links in the root HTML file are
:param encoding: Specify character encoding of HTML files. If `None` it is followed.
auto-detected. :param encoding: Specify character encoding of HTML files. If `None` it
:return: A pair of lists (breadth_first, depth_first). Each list contains is auto-detected.
:class:`HTMLFile` objects. :return: A pair of lists (breadth_first, depth_first). Each list
''' contains :class:`HTMLFile` objects.
"""
assert max_levels >= 0 assert max_levels >= 0
level = 0 level = 0
flat = [HTMLFile(path_to_html_file, level, encoding, verbose)] flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
next_level = list(flat) next_level = list(flat)
while level < max_levels and len(next_level) > 0: while level < max_levels and len(next_level) > 0:
level += 1 level += 1
@@ -215,9 +219,10 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None
if link.path is None or link.path in flat: if link.path is None or link.path in flat:
continue continue
try: try:
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf) nf = HTMLFile(link.path, level, encoding, verbose,
referrer=hf)
if nf.is_binary: if nf.is_binary:
raise IgnoreFile('%s is a binary file'%nf.path, -1) raise IgnoreFile('%s is a binary file' % nf.path, -1)
nl.append(nf) nl.append(nf)
flat.append(nf) flat.append(nf)
except IgnoreFile as err: except IgnoreFile as err:
@@ -244,7 +249,8 @@ def get_filelist(htmlfile, dir, opts, log):
log.info('Building file list...') log.info('Building file list...')
filelist = traverse(htmlfile, max_levels=int(opts.max_levels), filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose, verbose=opts.verbose,
encoding=opts.input_encoding)[0 if opts.breadth_first else 1] encoding=opts
.input_encoding)[0 if opts.breadth_first else 1]
if opts.verbose: if opts.verbose:
log.debug('\tFound files...') log.debug('\tFound files...')
for f in filelist: for f in filelist:

View File

@@ -21,7 +21,6 @@ from ebook_converter import force_unicode
 from ebook_converter.constants_old import filesystem_encoding, __version__
 from ebook_converter.ebooks.chardet import xml_to_unicode
 from ebook_converter.ebooks.conversion.preprocess import CSSPreProcessor
-from ebook_converter import as_unicode
 from ebook_converter.ebooks.oeb import parse_utils
 from ebook_converter.utils.cleantext import clean_xml_chars
 from ebook_converter.utils.short_uuid import uuid4
@@ -419,7 +418,7 @@ def urlnormalize(href):
         parts = urllib.parse.urlparse(href)
     except ValueError as e:
         raise ValueError('Failed to parse the URL: %r with underlying error: '
-                         '%s' % (href, as_unicode(e)))
+                         '%s' % (href, e))
     if not parts.scheme or parts.scheme == 'file':
         path, frag = urllib.parse.urldefrag(href)
         parts = ('', '', path, '', '', frag)
@@ -723,7 +722,7 @@ class Metadata(object):
                % (parse_utils.barename(self.term), self.value, self.attrib)
 
     def __str__(self):
-        return as_unicode(self.value)
+        return str(self.value)
 
     def to_opf1(self, dcmeta=None, xmeta=None, nsrmap={}):
         attrib = {}

View File

@@ -14,7 +14,7 @@ from lxml.etree import XPath as _XPath
 from lxml import etree
 
 from ebook_converter import constants as const
-from ebook_converter import as_unicode, force_unicode
+from ebook_converter import force_unicode
 from ebook_converter.ebooks.epub import rules
 from ebook_converter.ebooks.oeb import base
 from ebook_converter.ebooks.oeb.polish.split import do_split
@@ -126,7 +126,7 @@ class Split(object):
             except SelectorError as err:
                 self.log.warn('Ignoring page breaks specified with invalid '
                               'CSS selector: %r (%s)' %
-                              (selector, as_unicode(err)))
+                              (selector, err))
         for i, elem in enumerate(item.data.iter('*')):
             try:

View File

@@ -2,17 +2,13 @@ import os
 from collections import defaultdict
 from threading import Thread
 
-from ebook_converter import walk, prints, as_unicode
-from ebook_converter.constants_old import (config_dir, iswindows, isosx, plugins, DEBUG,
-        isworker, filesystem_encoding)
+from ebook_converter import walk, prints
+from ebook_converter.constants_old import iswindows, isosx
+from ebook_converter.constants_old import plugins, DEBUG, isworker
+from ebook_converter.constants_old import filesystem_encoding
 from ebook_converter.utils.fonts.metadata import FontMetadata, UnsupportedFont
 
-__license__ = 'GPL v3'
-__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
 
 class NoFonts(ValueError):
     pass
@@ -38,7 +34,7 @@ def fc_list():
         return default_font_dirs()
     try:
         lib = ctypes.CDLL(lib)
-    except:
+    except Exception:
         return default_font_dirs()
 
     prototype = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p)
@@ -97,7 +93,7 @@ def font_dirs():
     if iswindows:
         winutil, err = plugins['winutil']
         if err:
-            raise RuntimeError('Failed to load winutil: %s'%err)
+            raise RuntimeError('Failed to load winutil: %s' % err)
         try:
             return [winutil.special_folder_path(winutil.CSIDL_FONTS)]
         except ValueError:
@@ -126,9 +122,10 @@ def font_priority(font):
width_normal = font['font-stretch'] == 'normal' width_normal = font['font-stretch'] == 'normal'
weight_normal = font['font-weight'] == 'normal' weight_normal = font['font-weight'] == 'normal'
num_normal = sum(filter(None, (style_normal, width_normal, num_normal = sum(filter(None, (style_normal, width_normal,
weight_normal))) weight_normal)))
subfamily_name = (font['wws_subfamily_name'] or subfamily_name = (font['wws_subfamily_name'] or
font['preferred_subfamily_name'] or font['subfamily_name']) font['preferred_subfamily_name'] or
font['subfamily_name'])
if num_normal == 3 and subfamily_name == 'Regular': if num_normal == 3 and subfamily_name == 'Regular':
return 0 return 0
if num_normal == 3: if num_normal == 3:
@@ -167,7 +164,9 @@ def build_families(cached_fonts, folders, family_attr='font-family'):
if fingerprint in fmap: if fingerprint in fmap:
opath = fmap[fingerprint]['path'] opath = fmap[fingerprint]['path']
npath = font['path'] npath = font['path']
if path_significance(npath, folders) >= path_significance(opath, folders): if path_significance(npath,
folders) >= path_significance(opath,
folders):
remove.append(fmap[fingerprint]) remove.append(fmap[fingerprint])
fmap[fingerprint] = font fmap[fingerprint] = font
else: else:
@@ -214,7 +213,7 @@ class FontScanner(Thread):
try: try:
return self.font_family_map[family.lower()] return self.font_family_map[family.lower()]
except KeyError: except KeyError:
raise NoFonts('No fonts found for the family: %r'%family) raise NoFonts('No fonts found for the family: %r' % family)
def legacy_fonts_for_family(self, family): def legacy_fonts_for_family(self, family):
''' '''
@@ -247,8 +246,11 @@ class FontScanner(Thread):
with open(path, 'rb') as f: with open(path, 'rb') as f:
return f.read() return f.read()
def find_font_for_text(self, text, allowed_families={'serif', 'sans-serif'}, def find_font_for_text(self, text,
preferred_families=('serif', 'sans-serif', 'monospace', 'cursive', 'fantasy')): allowed_families={'serif', 'sans-serif'},
preferred_families=('serif', 'sans-serif',
'monospace', 'cursive',
'fantasy')):
''' '''
Find a font on the system capable of rendering the given text. Find a font on the system capable of rendering the given text.
@@ -258,10 +260,11 @@ class FontScanner(Thread):
:return: (family name, faces) or None, None :return: (family name, faces) or None, None
''' '''
from ebook_converter.utils.fonts.utils import (supports_text, from ebook_converter.utils.fonts.utils import \
panose_to_css_generic_family, get_printable_characters) supports_text, panose_to_css_generic_family, \
get_printable_characters
if not isinstance(text, str): if not isinstance(text, str):
raise TypeError(u'%r is not unicode'%text) raise TypeError(u'%r is not unicode' % text)
text = get_printable_characters(text) text = get_printable_characters(text)
found = {} found = {}
@@ -269,7 +272,7 @@ class FontScanner(Thread):
             try:
                 raw = self.get_font_data(font)
                 return supports_text(raw, text)
-            except:
+            except Exception:
                 pass
             return False
@@ -278,7 +281,8 @@ class FontScanner(Thread):
if not faces: if not faces:
continue continue
generic_family = panose_to_css_generic_family(faces[0]['panose']) generic_family = panose_to_css_generic_family(faces[0]['panose'])
if generic_family in allowed_families or generic_family == preferred_families[0]: if (generic_family in allowed_families or
generic_family == preferred_families[0]):
return (family, faces) return (family, faces)
elif generic_family not in found: elif generic_family not in found:
found[generic_family] = (family, faces) found[generic_family] = (family, faces)
@@ -321,18 +325,20 @@ class FontScanner(Thread):
files = tuple(walk(folder)) files = tuple(walk(folder))
except EnvironmentError as e: except EnvironmentError as e:
if DEBUG: if DEBUG:
prints('Failed to walk font folder:', folder, prints('Failed to walk font folder:', folder, str(e))
as_unicode(e))
continue continue
for candidate in files: for candidate in files:
if (candidate.rpartition('.')[-1].lower() not in self.allowed_extensions or not os.path.isfile(candidate)): if (candidate.rpartition('.')[-1].lower() not in
self.allowed_extensions or
not os.path.isfile(candidate)):
continue continue
candidate = os.path.normcase(os.path.abspath(candidate)) candidate = os.path.normcase(os.path.abspath(candidate))
try: try:
s = os.stat(candidate) s = os.stat(candidate)
except EnvironmentError: except EnvironmentError:
continue continue
fileid = '{0}||{1}:{2}'.format(candidate, s.st_size, s.st_mtime) fileid = '{0}||{1}:{2}'.format(candidate, s.st_size,
s.st_mtime)
if fileid in cached_fonts: if fileid in cached_fonts:
# Use previously cached metadata, since the file size and # Use previously cached metadata, since the file size and
# last modified timestamp have not changed. # last modified timestamp have not changed.
@@ -343,7 +349,7 @@ class FontScanner(Thread):
except Exception as e: except Exception as e:
if DEBUG: if DEBUG:
prints('Failed to read metadata from font file:', prints('Failed to read metadata from font file:',
candidate, as_unicode(e)) candidate, str(e))
continue continue
if frozenset(cached_fonts) != frozenset(self.cached_fonts): if frozenset(cached_fonts) != frozenset(self.cached_fonts):
@@ -353,7 +359,8 @@ class FontScanner(Thread):
self.build_families() self.build_families()
def build_families(self): def build_families(self):
self.font_family_map, self.font_families = build_families(self.cached_fonts, self.folders) (self.font_family_map,
self.font_families) = build_families(self.cached_fonts, self.folders)
def write_cache(self): def write_cache(self):
with self.cache: with self.cache:
@@ -380,14 +387,14 @@ class FontScanner(Thread):
for family in self.font_families: for family in self.font_families:
prints(family) prints(family)
for font in self.fonts_for_family(family): for font in self.fonts_for_family(family):
prints('\t%s: %s'%(font['full_name'], font['path'])) prints('\t%s: %s' % (font['full_name'], font['path']))
prints(end='\t') prints(end='\t')
for key in ('font-stretch', 'font-weight', 'font-style'): for key in ('font-stretch', 'font-weight', 'font-style'):
prints('%s: %s'%(key, font[key]), end=' ') prints('%s: %s' % (key, font[key]), end=' ')
prints() prints()
prints('\tSub-family:', font['wws_subfamily_name'] or prints('\tSub-family:', font['wws_subfamily_name'] or
font['preferred_subfamily_name'] or font['preferred_subfamily_name'] or
font['subfamily_name']) font['subfamily_name'])
prints() prints()
prints() prints()
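Editor's sketch (not part of the commit): rough usage of the FontScanner API reformatted above. The import path and the construction of the scanner are assumptions; only the method names (fonts_for_family, find_font_for_text, get_font_data) and the NoFonts exception are taken from the diff itself.

    # Hypothetical usage sketch -- import path and constructor are assumptions.
    from ebook_converter.utils.fonts.scanner import FontScanner, NoFonts

    scanner = FontScanner()   # constructor arguments/start-up are not shown in this diff
    # ... assume the background scan of the system font folders has completed ...

    try:
        faces = scanner.fonts_for_family('Liberation Serif')
    except NoFonts:
        faces = []

    # Per the docstring above, find_font_for_text() returns
    # (family name, faces) or (None, None).
    family, faces = scanner.find_font_for_text('Hello, world')
    if family is not None:
        raw = scanner.get_font_data(faces[0])   # raw bytes of the matching font file
        print(family, len(raw))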
View File
@@ -1,20 +1,18 @@
""" """
A simplified logging system A simplified logging system
""" """
import sys, traceback, io import sys
import traceback
import io
from functools import partial from functools import partial
from threading import Lock from threading import Lock
from ebook_converter import force_unicode, as_unicode, prints from ebook_converter import force_unicode, prints
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
DEBUG = 0 DEBUG = 0
INFO = 1 INFO = 1
WARN = 2 WARN = 2
ERROR = 3 ERROR = 3
@@ -38,10 +36,10 @@ class ANSIStream(Stream):
def __init__(self, stream=sys.stdout): def __init__(self, stream=sys.stdout):
Stream.__init__(self, stream) Stream.__init__(self, stream)
self.color = { self.color = {
DEBUG: u'green', DEBUG: 'green',
INFO: None, INFO: None,
WARN: u'yellow', WARN: 'yellow',
ERROR: u'red', ERROR: 'red',
} }
def prints(self, level, *args, **kwargs): def prints(self, level, *args, **kwargs):
@@ -64,12 +62,10 @@ class FileStream(Stream):
class HTMLStream(Stream): class HTMLStream(Stream):
color = { color = {DEBUG: b'<span style="color:green">',
DEBUG: b'<span style="color:green">', INFO: b'<span>',
INFO: b'<span>', WARN: b'<span style="color:blue">',
WARN: b'<span style="color:blue">', ERROR: b'<span style="color:red">'}
ERROR: b'<span style="color:red">'
}
normal = b'</span>' normal = b'</span>'
def __init__(self, stream=sys.stdout): def __init__(self, stream=sys.stdout):
@@ -104,14 +100,14 @@ class UnicodeHTMLStream(HTMLStream):
self.data.append(col) self.data.append(col)
self.last_col = col self.last_col = col
sep = kwargs.get(u'sep', u' ') sep = kwargs.get('sep', ' ')
end = kwargs.get(u'end', u'\n') end = kwargs.get('end', '\n')
for arg in args: for arg in args:
if isinstance(arg, bytes): if isinstance(arg, bytes):
arg = force_unicode(arg) arg = force_unicode(arg)
elif not isinstance(arg, str): elif not isinstance(arg, str):
arg = as_unicode(arg) arg = str(arg)
self.data.append(arg+sep) self.data.append(arg+sep)
self.plain_text.append(arg+sep) self.plain_text.append(arg+sep)
self.data.append(end) self.data.append(end)
@@ -124,8 +120,8 @@ class UnicodeHTMLStream(HTMLStream):
@property @property
def html(self): def html(self):
end = self.normal if self.data else u'' end = self.normal if self.data else ''
return u''.join(self.data) + end return ''.join(self.data) + end
def dump(self): def dump(self):
return [self.data, self.plain_text, self.last_col] return [self.data, self.plain_text, self.last_col]
@@ -143,8 +139,8 @@ class UnicodeHTMLStream(HTMLStream):
class Log(object): class Log(object):
DEBUG = DEBUG DEBUG = DEBUG
INFO = INFO INFO = INFO
WARN = WARN WARN = WARN
ERROR = ERROR ERROR = ERROR
def __init__(self, level=INFO): def __init__(self, level=INFO):
@@ -153,8 +149,8 @@ class Log(object):
self.outputs = [default_output] self.outputs = [default_output]
self.debug = partial(self.print_with_flush, DEBUG) self.debug = partial(self.print_with_flush, DEBUG)
self.info = partial(self.print_with_flush, INFO) self.info = partial(self.print_with_flush, INFO)
self.warn = self.warning = partial(self.print_with_flush, WARN) self.warn = self.warning = partial(self.print_with_flush, WARN)
self.error = partial(self.print_with_flush, ERROR) self.error = partial(self.print_with_flush, ERROR)
def prints(self, level, *args, **kwargs): def prints(self, level, *args, **kwargs):
@@ -222,7 +218,8 @@ class ThreadSafeLog(Log):
limit = kwargs.pop('limit', None) limit = kwargs.pop('limit', None)
with self._lock: with self._lock:
Log.print_with_flush(self, ERROR, *args, **kwargs) Log.print_with_flush(self, ERROR, *args, **kwargs)
Log.print_with_flush(self, self.exception_traceback_level, traceback.format_exc(limit)) Log.print_with_flush(self, self.exception_traceback_level,
traceback.format_exc(limit))
class ThreadSafeWrapper(Log): class ThreadSafeWrapper(Log):
@@ -242,10 +239,9 @@ class ThreadSafeWrapper(Log):
class GUILog(ThreadSafeLog): class GUILog(ThreadSafeLog):
"""
'''
Logs in HTML and plain text as unicode. Ideal for display in a GUI context. Logs in HTML and plain text as unicode. Ideal for display in a GUI context.
''' """
def __init__(self): def __init__(self):
ThreadSafeLog.__init__(self, level=self.DEBUG) ThreadSafeLog.__init__(self, level=self.DEBUG)
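Editor's sketch (not part of the commit): how the as_unicode removal plays out for callers of the simplified logging module above. The import path is an assumption (the file name is not visible in this rendering); the Log and GUILog names, the level constants, and the .info/.warn/.warning/.error partials come from the hunks themselves.

    # Hypothetical usage sketch -- module path is assumed, not shown in the diff.
    from ebook_converter.utils.logging import Log, GUILog

    log = Log()                          # level defaults to INFO per __init__ above
    log.info('conversion started')
    log.warning('missing cover image')   # .warn and .warning are the same partial
    log.debug('presumably filtered out at the default INFO level')

    # In UnicodeHTMLStream.prints() above, non-string arguments are now coerced
    # with str() instead of the removed as_unicode(), so objects such as
    # exceptions can still be passed directly (assuming GUILog routes its
    # output through that stream).
    gui_log = GUILog()                   # GUILog records everything at DEBUG level
    gui_log.error('failed to parse:', ValueError('boom'))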