mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-23 10:35:49 +01:00
Removed as_unicode function
This commit is contained in:
@@ -1,43 +1,37 @@
|
||||
import functools, re, json
|
||||
from math import ceil
|
||||
import functools
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
|
||||
from ebook_converter import entity_to_unicode, as_unicode
|
||||
from ebook_converter import entity_to_unicode
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||
SVG_NS = 'http://www.w3.org/2000/svg'
|
||||
XLINK_NS = 'http://www.w3.org/1999/xlink'
|
||||
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||
SVG_NS = 'http://www.w3.org/2000/svg'
|
||||
XLINK_NS = 'http://www.w3.org/1999/xlink'
|
||||
|
||||
convert_entities = functools.partial(entity_to_unicode,
|
||||
result_exceptions={
|
||||
'<' : '<',
|
||||
'>' : '>',
|
||||
"'" : ''',
|
||||
'"' : '"',
|
||||
'&' : '&',
|
||||
})
|
||||
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
|
||||
result_exceptions={'<': '<',
|
||||
'>': '>',
|
||||
"'": ''',
|
||||
'"': '"',
|
||||
'&': '&'})
|
||||
_span_pat = re.compile('<span.*?</span>', re.DOTALL | re.IGNORECASE)
|
||||
|
||||
LIGATURES = {
|
||||
# '\u00c6': 'AE',
|
||||
# '\u00e6': 'ae',
|
||||
# '\u0152': 'OE',
|
||||
# '\u0153': 'oe',
|
||||
# '\u0132': 'IJ',
|
||||
# '\u0133': 'ij',
|
||||
# '\u1D6B': 'ue',
|
||||
'\uFB00': 'ff',
|
||||
'\uFB01': 'fi',
|
||||
'\uFB02': 'fl',
|
||||
'\uFB03': 'ffi',
|
||||
'\uFB04': 'ffl',
|
||||
'\uFB05': 'ft',
|
||||
'\uFB06': 'st',
|
||||
}
|
||||
LIGATURES = {'\uFB00': 'ff',
|
||||
'\uFB01': 'fi',
|
||||
'\uFB02': 'fl',
|
||||
'\uFB03': 'ffi',
|
||||
'\uFB04': 'ffl',
|
||||
'\uFB05': 'ft',
|
||||
'\uFB06': 'st'}
|
||||
# '\u00c6': 'AE',
|
||||
# '\u00e6': 'ae',
|
||||
# '\u0152': 'OE',
|
||||
# '\u0153': 'oe',
|
||||
# '\u0132': 'IJ',
|
||||
# '\u0133': 'ij',
|
||||
# '\u1D6B': 'ue',
|
||||
|
||||
_ligpat = re.compile('|'.join(LIGATURES))
|
||||
|
||||
@@ -83,17 +77,18 @@ def smarten_punctuation(html, log=None):
|
||||
|
||||
|
||||
class DocAnalysis(object):
|
||||
'''
|
||||
Provides various text analysis functions to determine how the document is structured.
|
||||
format is the type of document analysis will be done against.
|
||||
"""
|
||||
Provides various text analysis functions to determine how the document is
|
||||
structured. format is the type of document the analysis will be done against.
|
||||
raw is the raw text to determine the line length to use for wrapping.
|
||||
Blank lines are excluded from analysis
|
||||
'''
|
||||
"""
|
||||
|
||||
def __init__(self, format='html', raw=''):
|
||||
raw = raw.replace(' ', ' ')
|
||||
if format == 'html':
|
||||
linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
|
||||
linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)',
|
||||
re.DOTALL)
|
||||
elif format == 'pdf':
|
||||
linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
|
||||
elif format == 'spanned_html':
|
||||
@@ -103,13 +98,13 @@ class DocAnalysis(object):
|
||||
self.lines = linere.findall(raw)
|
||||
|
||||
def line_length(self, percent):
|
||||
'''
|
||||
"""
|
||||
Analyses the document to find the median line length.
|
||||
percentage is a decimal number, 0 - 1 which is used to determine
|
||||
how far in the list of line lengths to use. The list of line lengths is
|
||||
ordered smallest to largest and does not include duplicates. 0.5 is the
|
||||
median value.
|
||||
'''
|
||||
"""
|
||||
lengths = []
|
||||
for line in self.lines:
|
||||
if len(line) > 0:
|
||||
@@ -121,7 +116,7 @@ class DocAnalysis(object):
|
||||
lengths = list(set(lengths))
|
||||
total = sum(lengths)
|
||||
avg = total / len(lengths)
|
||||
max_line = ceil(avg * 2)
|
||||
max_line = math.ceil(avg * 2)
|
||||
|
||||
lengths = sorted(lengths)
|
||||
for i in range(len(lengths) - 1, -1, -1):
|
||||
@@ -138,31 +133,32 @@ class DocAnalysis(object):
|
||||
return lengths[index]
|
||||
|
||||
def line_histogram(self, percent):
|
||||
'''
|
||||
Creates a broad histogram of the document to determine whether it incorporates hard
|
||||
line breaks. Lines are sorted into 20 'buckets' based on length.
|
||||
percent is the percentage of lines that should be in a single bucket to return true
|
||||
The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
|
||||
'''
|
||||
minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
|
||||
maxLineLength=1900 # Discard larger than this to stay in range
|
||||
buckets=20 # Each line is divided into a bucket based on length
|
||||
"""
|
||||
Creates a broad histogram of the document to determine whether it
|
||||
incorporates hard line breaks. Lines are sorted into 20 'buckets'
|
||||
based on length. percent is the percentage of lines that should be in
|
||||
a single bucket to return true. The majority of the lines will exist in
|
||||
1-2 buckets in typical docs with hard line breaks
|
||||
"""
|
||||
minLineLength = 20 # Ignore lines under 20 chars (typical of spaces)
|
||||
maxLineLength = 1900 # Discard larger than this to stay in range
|
||||
buckets = 20 # Each line is divided into a bucket based on length
|
||||
|
||||
# print("there are "+str(len(lines))+" lines")
|
||||
# max = 0
|
||||
# for line in self.lines:
|
||||
# l = len(line)
|
||||
# if l > max:
|
||||
# max = l
|
||||
# _l = len(line)
|
||||
# if _l > max:
|
||||
# max = _l
|
||||
# print("max line found is "+str(max))
|
||||
# Build the line length histogram
|
||||
hRaw = [0 for i in range(0,buckets)]
|
||||
hRaw = [0 for i in range(0, buckets)]
|
||||
for line in self.lines:
|
||||
l = len(line)
|
||||
if l > minLineLength and l < maxLineLength:
|
||||
l = int(l // 100)
|
||||
# print("adding "+str(l))
|
||||
hRaw[l]+=1
|
||||
_l = len(line)
|
||||
if _l > minLineLength and _l < maxLineLength:
|
||||
_l = int(_l // 100)
|
||||
# print("adding "+str(_l))
|
||||
hRaw[_l] += 1
|
||||
|
||||
# Normalize the histogram into percents
|
||||
totalLines = len(self.lines)
|
||||
@@ -175,7 +171,7 @@ class DocAnalysis(object):
|
||||
|
||||
# Find the biggest bucket
|
||||
maxValue = 0
|
||||
for i in range(0,len(h)):
|
||||
for i in range(0, len(h)):
|
||||
if h[i] > maxValue:
|
||||
maxValue = h[i]
|
||||
|
||||
@@ -188,36 +184,42 @@ class DocAnalysis(object):
|
||||
|
||||
|
||||
class Dehyphenator(object):
|
||||
'''
|
||||
Analyzes words to determine whether hyphens should be retained/removed. Uses the document
|
||||
itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
|
||||
scientific words. The primary disadvantage is that words appearing only once in the document
|
||||
retain hyphens.
|
||||
'''
|
||||
"""
|
||||
Analyzes words to determine whether hyphens should be retained/removed.
|
||||
Uses the document itself as a dictionary. This method handles all
|
||||
languages along with uncommon, made-up, and scientific words. The primary
|
||||
disadvantage is that words appearing only once in the document retain
|
||||
hyphens.
|
||||
"""
|
||||
|
||||
def __init__(self, verbose=0, log=None):
|
||||
self.log = log
|
||||
self.verbose = verbose
|
||||
# Add common suffixes to the regex below to increase the likelihood of a match -
|
||||
# don't add suffixes which are also complete words, such as 'able' or 'sex'
|
||||
# only remove if it's not already the point of hyphenation
|
||||
self.suffix_string = (
|
||||
"((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
|
||||
"(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
|
||||
"(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
|
||||
# Add common suffixes to the regex below to increase the likelihood of
|
||||
# a match - don't add suffixes which are also complete words, such as
|
||||
# 'able' or 'sex' only remove if it's not already the point of
|
||||
# hyphenation
|
||||
self.suffix_string = ("((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?"
|
||||
"|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|"
|
||||
"istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
|
||||
"(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier"
|
||||
"|al|ex|ian)$")
|
||||
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
|
||||
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
|
||||
# remove prefixes if the prefix was not already the point of hyphenation
|
||||
self.removesuffixes = re.compile(r"%s" % self.suffix_string,
|
||||
re.IGNORECASE)
|
||||
# remove prefixes if the prefix was not already the point of
|
||||
# hyphenation
|
||||
self.prefix_string = '^(dis|re|un|in|ex)'
|
||||
self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
|
||||
self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
|
||||
self.removeprefix = re.compile(r'%s' % self.prefix_string,
|
||||
re.IGNORECASE)
|
||||
|
||||
def dehyphenate(self, match):
|
||||
firsthalf = match.group('firstpart')
|
||||
secondhalf = match.group('secondpart')
|
||||
try:
|
||||
wraptags = match.group('wraptags')
|
||||
except:
|
||||
except Exception:
|
||||
wraptags = ''
|
||||
hyphenated = str(firsthalf) + "-" + str(secondhalf)
|
||||
dehyphenated = str(firsthalf) + str(secondhalf)
|
||||
@@ -231,65 +233,84 @@ class Dehyphenator(object):
|
||||
self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
|
||||
try:
|
||||
searchresult = self.html.find(lookupword.lower())
|
||||
except:
|
||||
except Exception:
|
||||
return hyphenated
|
||||
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
|
||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||
if self.verbose > 2:
|
||||
self.log(" Cleanup:returned dehyphenated word: " + dehyphenated)
|
||||
self.log(" Cleanup:returned dehyphenated word: " +
|
||||
dehyphenated)
|
||||
return dehyphenated
|
||||
elif self.html.find(hyphenated) != -1:
|
||||
if self.verbose > 2:
|
||||
self.log(" Cleanup:returned hyphenated word: " + hyphenated)
|
||||
self.log(" Cleanup:returned hyphenated word: " +
|
||||
hyphenated)
|
||||
return hyphenated
|
||||
else:
|
||||
if self.verbose > 2:
|
||||
self.log(" Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
|
||||
self.log(" Cleanup:returning original text " +
|
||||
firsthalf + " + linefeed " + secondhalf)
|
||||
return firsthalf+'\u2014'+wraptags+secondhalf
|
||||
|
||||
else:
|
||||
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
|
||||
if (self.format == 'individual_words' and
|
||||
len(firsthalf) + len(secondhalf) <= 6):
|
||||
if self.verbose > 2:
|
||||
self.log("too short, returned hyphenated word: " + hyphenated)
|
||||
self.log("too short, returned hyphenated word: " +
|
||||
hyphenated)
|
||||
return hyphenated
|
||||
if len(firsthalf) <= 2 and len(secondhalf) <= 2:
|
||||
if self.verbose > 2:
|
||||
self.log("too short, returned hyphenated word: " + hyphenated)
|
||||
self.log("too short, returned hyphenated word: " +
|
||||
hyphenated)
|
||||
return hyphenated
|
||||
if self.html.find(lookupword) != -1 or searchresult != -1:
|
||||
if self.verbose > 2:
|
||||
self.log(" returned dehyphenated word: " + dehyphenated)
|
||||
self.log(" returned dehyphenated word: " +
|
||||
dehyphenated)
|
||||
return dehyphenated
|
||||
else:
|
||||
if self.verbose > 2:
|
||||
self.log(" returned hyphenated word: " + hyphenated)
|
||||
self.log(" returned hyphenated word: " +
|
||||
hyphenated)
|
||||
return hyphenated
|
||||
|
||||
def __call__(self, html, format, length=1):
    """
    Run the dehyphenation pass over ``html`` and return the result.

    The text and the format are stored on the instance (``self.html``,
    ``self.format``) because ``self.dehyphenate`` is used as the
    substitution callback and reads them from there.

    :param html: the text to process.
    :param format: selects the hyphenation pattern; one of 'html', 'pdf',
                   'txt', 'individual_words', 'html_cleanup' or
                   'txt_cleanup'.
    :param length: interpolated into the ``(?<=.{N})`` lookbehind of the
                   'html', 'pdf' and 'txt' patterns; the other formats do
                   not use it.
    :return: ``html`` with every pattern match replaced by the value
             returned from ``self.dehyphenate``.
    """
    self.html = html
    self.format = format
    # NOTE(review): no branch handles an unrecognised format, so
    # ``intextmatch`` would be unbound at the ``sub`` call below -
    # confirm callers only pass the six names tested here.
    if format == 'html':
        intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)'
                                 r'(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
                                 r'\s*(</[iubp]>\s*){1,2}'
                                 r'(?P<up2threeblanks><(p|div)[^>]*>\s*'
                                 r'(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+)'
                                 r'{0,3}\s*(<[iubp][^>]*>\s*){1,2}'
                                 r'(<span[^>]*>)?)\s*(?P<secondpart>'
                                 r'[\w\d]+)' % length)
    elif format == 'pdf':
        intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)'
                                 r'\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*'
                                 r'<[iub]>)\s*(?P<secondpart>[\w\d]+)' %
                                 length)
    elif format == 'txt':
        intextmatch = re.compile('(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)'
                                 '(\u0020|\u0009)*(?P<wraptags>'
                                 '(\n(\u0020|\u0009)*)+)(?P<secondpart>'
                                 '[\\w\\d]+)' % length)
    elif format == 'individual_words':
        intextmatch = re.compile(r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*'
                                 r'(?P<secondpart>\w+)(?![^<]*?>)',
                                 re.UNICODE)
    elif format == 'html_cleanup':
        intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)'
                                 r'(?P<wraptags></span>\s*(</[iubp]>\s*'
                                 r'<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>'
                                 r'\s*<[iubp][^>]*>)?\s*(?P<secondpart>'
                                 r'[\w\d]+)')
    elif format == 'txt_cleanup':
        intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|‐)'
                                 r'(?P<wraptags>\s+)(?P<secondpart>'
                                 r'[\w\d]+)')

    # Each match is handed to self.dehyphenate, whose return value
    # replaces the matched text.
    html = intextmatch.sub(self.dehyphenate, html)
    return html
|
||||
@@ -299,18 +320,18 @@ class CSSPreProcessor(object):
|
||||
|
||||
# Remove some of the broken CSS Microsoft products
|
||||
# create
|
||||
MS_PAT = re.compile(r'''
|
||||
MS_PAT = re.compile(r'''
|
||||
(?P<start>^|;|\{)\s* # The end of the previous rule or block start
|
||||
(%s).+? # The invalid selectors
|
||||
(?P<end>$|;|\}) # The end of the declaration
|
||||
'''%'mso-|panose-|text-underline|tab-interval',
|
||||
re.MULTILINE|re.IGNORECASE|re.VERBOSE)
|
||||
''' % 'mso-|panose-|text-underline|tab-interval',
|
||||
re.MULTILINE | re.IGNORECASE | re.VERBOSE)
|
||||
|
||||
def ms_sub(self, match):
|
||||
end = match.group('end')
|
||||
try:
|
||||
start = match.group('start')
|
||||
except:
|
||||
except Exception:
|
||||
start = ''
|
||||
if end == ';':
|
||||
end = ''
|
||||
@@ -332,7 +353,7 @@ class CSSPreProcessor(object):
|
||||
for line in data.splitlines():
|
||||
ll = line.lstrip()
|
||||
if not (namespaced or ll.startswith('@import') or not ll or
|
||||
ll.startswith('@charset')):
|
||||
ll.startswith('@charset')):
|
||||
ans.append(XHTML_CSS_NAMESPACE.strip())
|
||||
namespaced = True
|
||||
ans.append(line)
|
||||
@@ -359,7 +380,8 @@ def accent_regex(accent_maps, letter_before=False):
|
||||
args = ''.join(accent_cat), ''.join(letters)
|
||||
accent_group, letter_group = 1, 2
|
||||
|
||||
pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
|
||||
pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args),
|
||||
re.UNICODE)
|
||||
|
||||
def sub(m):
|
||||
lmap = accent_maps[m.group(accent_group)]
|
||||
@@ -371,83 +393,96 @@ def accent_regex(accent_maps, letter_before=False):
|
||||
def html_preprocess_rules():
    """
    Return the list of (compiled pattern, replacement) preprocessing rules.

    The list is built on the first call and cached on the function object
    as ``html_preprocess_rules.ans``; later calls return that same list.
    """
    rules = getattr(html_preprocess_rules, 'ans', None)
    if rules is not None:
        return rules

    # Huge runs of whitespace make the later regexes very slow, so they
    # are dropped first.
    strip_space_runs = (re.compile(r'\s{10000,}'), '')

    # Some HTML generators (Frontpage in particular) put all sorts of
    # junk into <head>, which confuses lxml.
    clean_head = (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>',
                             re.IGNORECASE | re.DOTALL),
                  sanitize_head)

    # lxml does not cope well with entities, so convert all of them.
    resolve_entities = (re.compile(r'&(\S+?);'), convert_entities)

    # Strip the <![if ...]> / <![endif]> markers emitted by MS Word.
    drop_ms_conditionals = (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>',
                                       re.IGNORECASE),
                            '')

    rules = [strip_space_runs, clean_head, resolve_entities,
             drop_ms_conditionals]
    html_preprocess_rules.ans = rules
    return rules
|
||||
|
||||
|
||||
def pdftohtml_rules():
    """
    Return the ordered list of substitution rules for pdftohtml output.

    The first two entries are whatever ``accent_regex`` returns for the
    given accent maps; the remaining entries are
    (compiled pattern, replacement) tuples where the replacement is a
    string or a callable taking the match object. The list is built on
    the first call and cached on the function object as
    ``pdftohtml_rules.ans``; later calls return that same list.
    """
    ans = getattr(pdftohtml_rules, 'ans', None)
    if ans is None:
        ans = [accent_regex({'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
                             '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
                             '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚ'
                             'úÚźŹ',
                             'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
                             '¸': 'cC:çÇ',
                             '˛': 'aAeE:ąĄęĘ',
                             '˙': 'zZ:żŻ',
                             'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
                             '°': 'uU:ůŮ'}),
               accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'},
                            letter_before=True),

               # If pdf printed from a browser then the header/footer has a
               # reliable pattern
               (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?'
                           r'[A-Z].*<br>(?=\s*<hr>))',
                           re.IGNORECASE), lambda match: ''),

               # Center separator lines
               (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'),
                lambda match: '<p>\n<p style="text-align:center">' +
                match.group('break') + '</p>'),

               # Remove <hr> tags
               (re.compile(r'<hr.*?>', re.IGNORECASE), ''),

               # Remove gray background
               (re.compile(r'<BODY[^<>]+>'), '<BODY>'),

               # Convert line breaks to paragraphs
               (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
               (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
               (re.compile(r'\s*</body>'), '</p>\n</body>'),

               # Clean up spaces
               (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
               # Add space before and after italics
               (re.compile(r'(?<!“)<i>'), ' <i>'),
               (re.compile(r'</i>(?=\w)'), '</i> ')]
        pdftohtml_rules.ans = ans
    return ans
|
||||
|
||||
|
||||
def book_designer_rules():
    """
    Return the list of (compiled pattern, replacement callable) rules.

    The list is built on the first call and cached on the function object
    as ``book_designer_rules.ans``; later calls return that same list.

    Each replacement callable takes the match object and returns the
    replacement text, so a rule can be applied with
    ``pattern.sub(replacement, html)``.
    """
    ans = getattr(book_designer_rules, 'ans', None)
    if ans is None:
        ans = [
            # HR
            (re.compile('<hr>', re.IGNORECASE),
             lambda match: '<span style="page-break-after:always"> '
                           '</span>'),
            # Create header tags. The heading text is captured as group 3
            # because the replacement reads match.group(3); without that
            # group every match raised IndexError ("no such group").
            (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)'
                        r'(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
             lambda match: '<h1 id="BookTitle" align="%s">%s</h1>' %
             (match.group(2) if match.group(2) else 'center',
              match.group(3))),
            (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)'
                        r'(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
             lambda match: '<h2 id="BookAuthor" align="%s">%s</h2>' %
             (match.group(2) if match.group(2) else 'center',
              match.group(3))),
            (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>',
                        re.IGNORECASE | re.DOTALL),
             lambda match: '<h2 class="title">%s</h2>' % (match.group(1),)),
            (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>',
                        re.IGNORECASE | re.DOTALL),
             lambda match: '<h3 class="subtitle">%s</h3>' %
             (match.group(1),)),
        ]
        book_designer_rules.ans = ans
    # Return the rule list, as the sibling rule builders do; this
    # function previously returned None unconditionally.
    return ans
|
||||
|
||||
|
||||
@@ -470,7 +505,7 @@ class HTMLPreProcessor(object):
|
||||
return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]
|
||||
|
||||
def __call__(self, html, remove_special_chars=None,
|
||||
get_preprocess_html=False):
|
||||
get_preprocess_html=False):
|
||||
if remove_special_chars is not None:
|
||||
html = remove_special_chars.sub('', html)
|
||||
html = html.replace('\0', '')
|
||||
@@ -487,13 +522,14 @@ class HTMLPreProcessor(object):
|
||||
start_rules = []
|
||||
|
||||
if not getattr(self.extra_opts, 'keep_ligatures', False):
|
||||
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
|
||||
html = _ligpat.sub(lambda m: LIGATURES[m.group()], html)
|
||||
|
||||
user_sr_rules = {}
|
||||
# Function for processing search and replace
|
||||
|
||||
def do_search_replace(search_pattern, replace_txt):
|
||||
from ebook_converter.ebooks.conversion.search_replace import compile_regular_expression
|
||||
from ebook_converter.ebooks.conversion.search_replace import \
|
||||
compile_regular_expression
|
||||
try:
|
||||
search_re = compile_regular_expression(search_pattern)
|
||||
if not replace_txt:
|
||||
@@ -502,11 +538,11 @@ class HTMLPreProcessor(object):
|
||||
user_sr_rules[(search_re, replace_txt)] = search_pattern
|
||||
except Exception as e:
|
||||
self.log.error('Failed to parse %r regexp because %s' %
|
||||
(search, as_unicode(e)))
|
||||
(search, e))
|
||||
|
||||
# search / replace using the sr?_search / sr?_replace options
|
||||
for i in range(1, 4):
|
||||
search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
|
||||
search, replace = 'sr%d_search' % i, 'sr%d_replace' % i
|
||||
search_pattern = getattr(self.extra_opts, search, '')
|
||||
replace_txt = getattr(self.extra_opts, replace, '')
|
||||
if search_pattern:
|
||||
@@ -520,31 +556,35 @@ class HTMLPreProcessor(object):
|
||||
do_search_replace(search_pattern, replace_txt)
|
||||
|
||||
end_rules = []
|
||||
# delete soft hyphens - moved here so it's executed after header/footer removal
|
||||
# delete soft hyphens - moved here so it's executed after
|
||||
# header/footer removal
|
||||
if is_pdftohtml:
|
||||
# unwrap/delete soft hyphens
|
||||
end_rules.append((re.compile(
|
||||
r'[](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
|
||||
end_rules.append((re.compile(r'[](</p>\s*<p>\s*)+\s*'
|
||||
r'(?=[\[a-z\d])'), lambda match: ''))
|
||||
# unwrap/delete soft hyphens with formatting
|
||||
end_rules.append((re.compile(
|
||||
r'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))
|
||||
end_rules.append((re.compile(r'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+'
|
||||
r'\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'),
|
||||
lambda match: ''))
|
||||
|
||||
length = -1
|
||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||
docanalysis = DocAnalysis('pdf', html)
|
||||
length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
|
||||
length = docanalysis.line_length(getattr(self.extra_opts,
|
||||
'unwrap_factor'))
|
||||
if length:
|
||||
# print("The pdf line length returned is " + str(length))
|
||||
# unwrap em/en dashes
|
||||
end_rules.append((re.compile(
|
||||
r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
|
||||
end_rules.append((re.compile(r'(?<=.{%i}[–—])\s*<p>\s*'
|
||||
r'(?=[\[a-z\d])' % length),
|
||||
lambda match: ''))
|
||||
end_rules.append(
|
||||
# Un wrap using punctuation
|
||||
(re.compile((
|
||||
r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]'
|
||||
r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
|
||||
r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
|
||||
)
|
||||
(re.compile((r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçą'
|
||||
r'ężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4})'
|
||||
r';))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*'
|
||||
r'<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])') %
|
||||
length, re.UNICODE), wrap_lines))
|
||||
|
||||
for rule in html_preprocess_rules() + start_rules:
|
||||
html = rule[0].sub(rule[1], html)
|
||||
@@ -567,7 +607,7 @@ class HTMLPreProcessor(object):
|
||||
name, i = None, 0
|
||||
while not name or os.path.exists(os.path.join(odir, name)):
|
||||
i += 1
|
||||
name = '%04d.html'%i
|
||||
name = '%04d.html' % i
|
||||
with open(os.path.join(odir, name), 'wb') as f:
|
||||
f.write(raw.encode('utf-8'))
|
||||
|
||||
@@ -578,20 +618,20 @@ class HTMLPreProcessor(object):
|
||||
html = rule[0].sub(rule[1], html)
|
||||
except Exception as e:
|
||||
if rule in user_sr_rules:
|
||||
self.log.error(
|
||||
'User supplied search & replace rule: %s -> %s '
|
||||
'failed with error: %s, ignoring.'%(
|
||||
user_sr_rules[rule], rule[1], e))
|
||||
self.log.error('User supplied search & replace rule: %s '
|
||||
'-> %s failed with error: %s, ignoring.' %
|
||||
(user_sr_rules[rule], rule[1], e))
|
||||
else:
|
||||
raise
|
||||
|
||||
if is_pdftohtml and length > -1:
|
||||
# Dehyphenate
|
||||
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
|
||||
html = dehyphenator(html,'html', length)
|
||||
html = dehyphenator(html, 'html', length)
|
||||
|
||||
if is_pdftohtml:
|
||||
from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
|
||||
from ebook_converter.ebooks.conversion.utils import \
|
||||
HeuristicProcessor
|
||||
pdf_markup = HeuristicProcessor(self.extra_opts, None)
|
||||
totalwords = 0
|
||||
if pdf_markup.get_word_count(html) > 7000:
|
||||
@@ -613,23 +653,26 @@ class HTMLPreProcessor(object):
|
||||
from ebook_converter.utils.localization import get_udc
|
||||
from ebook_converter.utils.mreplace import MReplace
|
||||
unihandecoder = get_udc()
|
||||
mr = MReplace(data={'«':'<'*3, '»':'>'*3})
|
||||
mr = MReplace(data={'«': '<' * 3, '»': '>' * 3})
|
||||
html = mr.mreplace(html)
|
||||
html = unihandecoder.decode(html)
|
||||
|
||||
if getattr(self.extra_opts, 'enable_heuristics', False):
|
||||
from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
|
||||
from ebook_converter.ebooks.conversion.utils import \
|
||||
HeuristicProcessor
|
||||
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
|
||||
html = preprocessor(html)
|
||||
|
||||
if is_pdftohtml:
|
||||
html = html.replace('<!-- created by ebook-converter\'s pdftohtml -->', '')
|
||||
html = html.replace('<!-- created by ebook-converter\'s '
|
||||
'pdftohtml -->', '')
|
||||
|
||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||
html = smarten_punctuation(html, self.log)
|
||||
|
||||
try:
|
||||
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
|
||||
unsupported_unicode_chars = (self.extra_opts.output_profile
|
||||
.unsupported_unicode_chars)
|
||||
except AttributeError:
|
||||
unsupported_unicode_chars = ''
|
||||
if unsupported_unicode_chars:
|
||||
|
||||
@@ -10,19 +10,13 @@ import urllib.parse
|
||||
from ebook_converter.ebooks.oeb.base import urlunquote
|
||||
from ebook_converter.ebooks.chardet import detect_xml_encoding
|
||||
from ebook_converter.constants_old import iswindows
|
||||
from ebook_converter import unicode_path, as_unicode, replace_entities
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
from ebook_converter import unicode_path, replace_entities
|
||||
|
||||
|
||||
class Link(object):
|
||||
|
||||
'''
|
||||
"""
|
||||
Represents a link in a HTML file.
|
||||
'''
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def url_to_local_path(cls, url, base):
|
||||
@@ -31,7 +25,8 @@ class Link(object):
|
||||
if iswindows and path.startswith('/'):
|
||||
path = path[1:]
|
||||
isabs = True
|
||||
path = urllib.parse.urlunparse(('', '', path, url.params, url.query, ''))
|
||||
path = urllib.parse.urlunparse(('', '', path, url.params, url.query,
|
||||
''))
|
||||
path = urlunquote(path)
|
||||
if isabs or os.path.isabs(path):
|
||||
return path
|
||||
@@ -39,17 +34,18 @@ class Link(object):
|
||||
|
||||
def __init__(self, url, base):
|
||||
'''
|
||||
:param url: The url this link points to. Must be an unquoted unicode string.
|
||||
:param base: The base directory that relative URLs are with respect to.
|
||||
Must be a unicode string.
|
||||
:param url: The url this link points to. Must be an unquoted unicode
|
||||
string.
|
||||
:param base: The base directory that relative URLs are with respect
|
||||
to. Must be a unicode string.
|
||||
'''
|
||||
assert isinstance(url, str) and isinstance(base, str)
|
||||
self.url = url
|
||||
self.parsed_url = urllib.parse.urlparse(self.url)
|
||||
self.is_local = self.parsed_url.scheme in ('', 'file')
|
||||
self.url = url
|
||||
self.parsed_url = urllib.parse.urlparse(self.url)
|
||||
self.is_local = self.parsed_url.scheme in ('', 'file')
|
||||
self.is_internal = self.is_local and not bool(self.parsed_url.path)
|
||||
self.path = None
|
||||
self.fragment = urlunquote(self.parsed_url.fragment)
|
||||
self.path = None
|
||||
self.fragment = urlunquote(self.parsed_url.fragment)
|
||||
if self.is_local and not self.is_internal:
|
||||
self.path = self.url_to_local_path(self.parsed_url, base)
|
||||
|
||||
@@ -62,7 +58,7 @@ class Link(object):
|
||||
return self.path == getattr(other, 'path', other)
|
||||
|
||||
def __str__(self):
|
||||
return 'Link: %s --> %s'%(self.url, self.path)
|
||||
return 'Link: %s --> %s' % (self.url, self.path)
|
||||
|
||||
|
||||
class IgnoreFile(Exception):
|
||||
@@ -84,24 +80,25 @@ class HTMLFile(object):
|
||||
The encoding of the file is available as :member:`encoding`.
|
||||
'''
|
||||
|
||||
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
|
||||
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
|
||||
TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
|
||||
LINK_PAT = re.compile(
|
||||
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
|
||||
re.DOTALL|re.IGNORECASE)
|
||||
LINK_PAT = re.compile(r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|'
|
||||
r'(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
|
||||
re.DOTALL | re.IGNORECASE)
|
||||
|
||||
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
|
||||
'''
|
||||
def __init__(self, path_to_html_file, level, encoding, verbose,
|
||||
referrer=None):
|
||||
"""
|
||||
:param level: The level of this file. Should be 0 for the root file.
|
||||
:param encoding: Use `encoding` to decode HTML.
|
||||
:param referrer: The :class:`HTMLFile` that first refers to this file.
|
||||
'''
|
||||
self.path = unicode_path(path_to_html_file, abs=True)
|
||||
self.title = os.path.splitext(os.path.basename(self.path))[0]
|
||||
self.base = os.path.dirname(self.path)
|
||||
self.level = level
|
||||
"""
|
||||
self.path = unicode_path(path_to_html_file, abs=True)
|
||||
self.title = os.path.splitext(os.path.basename(self.path))[0]
|
||||
self.base = os.path.dirname(self.path)
|
||||
self.level = level
|
||||
self.referrer = referrer
|
||||
self.links = []
|
||||
self.links = []
|
||||
|
||||
try:
|
||||
with open(self.path, 'rb') as f:
|
||||
@@ -112,18 +109,21 @@ class HTMLFile(object):
|
||||
header = header.decode(encoding)
|
||||
except ValueError:
|
||||
pass
|
||||
self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
|
||||
self.is_binary = level > 0 and not bool(self
|
||||
.HTML_PAT
|
||||
.search(header))
|
||||
if not self.is_binary:
|
||||
src += f.read()
|
||||
except IOError as err:
|
||||
msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
|
||||
msg = ('Could not read from file: %s with error: %s' %
|
||||
(self.path, str(err)))
|
||||
if level == 0:
|
||||
raise IOError(msg)
|
||||
raise IgnoreFile(msg, err.errno)
|
||||
|
||||
if not src:
|
||||
if level == 0:
|
||||
raise ValueError('The file %s is empty'%self.path)
|
||||
raise ValueError('The file %s is empty' % self.path)
|
||||
self.is_binary = True
|
||||
|
||||
if not self.is_binary:
|
||||
@@ -145,7 +145,9 @@ class HTMLFile(object):
|
||||
return hash(self.path)
|
||||
|
||||
def __str__(self):
|
||||
return 'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
|
||||
return 'HTMLFile:%d:%s:%s' % (self.level,
|
||||
'b' if self.is_binary else 'a',
|
||||
self.path)
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
@@ -191,20 +193,22 @@ def depth_first(root, flat, visited=None):
|
||||
visited.add(hf)
|
||||
|
||||
|
||||
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
|
||||
'''
|
||||
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0,
|
||||
encoding=None):
|
||||
"""
|
||||
Recursively traverse all links in the HTML file.
|
||||
|
||||
:param max_levels: Maximum levels of recursion. Must be non-negative. 0
|
||||
implies that no links in the root HTML file are followed.
|
||||
:param encoding: Specify character encoding of HTML files. If `None` it is
|
||||
auto-detected.
|
||||
:return: A pair of lists (breadth_first, depth_first). Each list contains
|
||||
:class:`HTMLFile` objects.
|
||||
'''
|
||||
implies that no links in the root HTML file are
|
||||
followed.
|
||||
:param encoding: Specify character encoding of HTML files. If `None` it
|
||||
is auto-detected.
|
||||
:return: A pair of lists (breadth_first, depth_first). Each list
|
||||
contains :class:`HTMLFile` objects.
|
||||
"""
|
||||
assert max_levels >= 0
|
||||
level = 0
|
||||
flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
|
||||
flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
|
||||
next_level = list(flat)
|
||||
while level < max_levels and len(next_level) > 0:
|
||||
level += 1
|
||||
@@ -215,9 +219,10 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None
|
||||
if link.path is None or link.path in flat:
|
||||
continue
|
||||
try:
|
||||
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
|
||||
nf = HTMLFile(link.path, level, encoding, verbose,
|
||||
referrer=hf)
|
||||
if nf.is_binary:
|
||||
raise IgnoreFile('%s is a binary file'%nf.path, -1)
|
||||
raise IgnoreFile('%s is a binary file' % nf.path, -1)
|
||||
nl.append(nf)
|
||||
flat.append(nf)
|
||||
except IgnoreFile as err:
|
||||
@@ -244,7 +249,8 @@ def get_filelist(htmlfile, dir, opts, log):
|
||||
log.info('Building file list...')
|
||||
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
|
||||
verbose=opts.verbose,
|
||||
encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
|
||||
encoding=opts
|
||||
.input_encoding)[0 if opts.breadth_first else 1]
|
||||
if opts.verbose:
|
||||
log.debug('\tFound files...')
|
||||
for f in filelist:
|
||||
|
||||
@@ -21,7 +21,6 @@ from ebook_converter import force_unicode
|
||||
from ebook_converter.constants_old import filesystem_encoding, __version__
|
||||
from ebook_converter.ebooks.chardet import xml_to_unicode
|
||||
from ebook_converter.ebooks.conversion.preprocess import CSSPreProcessor
|
||||
from ebook_converter import as_unicode
|
||||
from ebook_converter.ebooks.oeb import parse_utils
|
||||
from ebook_converter.utils.cleantext import clean_xml_chars
|
||||
from ebook_converter.utils.short_uuid import uuid4
|
||||
@@ -419,7 +418,7 @@ def urlnormalize(href):
|
||||
parts = urllib.parse.urlparse(href)
|
||||
except ValueError as e:
|
||||
raise ValueError('Failed to parse the URL: %r with underlying error: '
|
||||
'%s' % (href, as_unicode(e)))
|
||||
'%s' % (href, e))
|
||||
if not parts.scheme or parts.scheme == 'file':
|
||||
path, frag = urllib.parse.urldefrag(href)
|
||||
parts = ('', '', path, '', '', frag)
|
||||
@@ -723,7 +722,7 @@ class Metadata(object):
|
||||
% (parse_utils.barename(self.term), self.value, self.attrib)
|
||||
|
||||
def __str__(self):
|
||||
return as_unicode(self.value)
|
||||
return str(self.value)
|
||||
|
||||
def to_opf1(self, dcmeta=None, xmeta=None, nsrmap={}):
|
||||
attrib = {}
|
||||
|
||||
@@ -14,7 +14,7 @@ from lxml.etree import XPath as _XPath
|
||||
from lxml import etree
|
||||
|
||||
from ebook_converter import constants as const
|
||||
from ebook_converter import as_unicode, force_unicode
|
||||
from ebook_converter import force_unicode
|
||||
from ebook_converter.ebooks.epub import rules
|
||||
from ebook_converter.ebooks.oeb import base
|
||||
from ebook_converter.ebooks.oeb.polish.split import do_split
|
||||
@@ -126,7 +126,7 @@ class Split(object):
|
||||
except SelectorError as err:
|
||||
self.log.warn('Ignoring page breaks specified with invalid '
|
||||
'CSS selector: %r (%s)' %
|
||||
(selector, as_unicode(err)))
|
||||
(selector, err))
|
||||
|
||||
for i, elem in enumerate(item.data.iter('*')):
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user