Mirror of https://github.com/gryf/ebook-converter.git
Removed as_unicode function
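The functional change is small: on Python 3, %-formatting a caught exception with %s already yields readable text, so calibre's as_unicode() helper (which coerced exceptions and byte strings to unicode) is no longer needed in the error-logging path touched below. A minimal sketch of the before/after pattern (the pattern string and print call are stand-ins, not part of the diff):

    # Hypothetical illustration of the pattern this commit removes.
    import re

    pattern = '(unbalanced'
    try:
        re.compile(pattern)                # raises re.error
    except Exception as e:
        # before: log.error('Failed to parse %r regexp because %s' % (pattern, as_unicode(e)))
        # after:  %s calls str(e), which is already a unicode str on Python 3
        print('Failed to parse %r regexp because %s' % (pattern, e))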
@@ -1,43 +1,37 @@
-import functools, re, json
-from math import ceil
+import functools
+import json
+import math
+import re

-from ebook_converter import entity_to_unicode, as_unicode
+from ebook_converter import entity_to_unicode


-__license__ = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
 XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
 SVG_NS = 'http://www.w3.org/2000/svg'
 XLINK_NS = 'http://www.w3.org/1999/xlink'

 convert_entities = functools.partial(entity_to_unicode,
-        result_exceptions={
-            '<' : '&lt;',
-            '>' : '&gt;',
-            "'" : '&apos;',
-            '"' : '&quot;',
-            '&' : '&amp;',
-        })
-_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+                                     result_exceptions={'<': '&lt;',
+                                                        '>': '&gt;',
+                                                        "'": '&apos;',
+                                                        '"': '&quot;',
+                                                        '&': '&amp;'})
+_span_pat = re.compile('<span.*?</span>', re.DOTALL | re.IGNORECASE)

-LIGATURES = {
-        # '\u00c6': 'AE',
-        # '\u00e6': 'ae',
-        # '\u0152': 'OE',
-        # '\u0153': 'oe',
-        # '\u0132': 'IJ',
-        # '\u0133': 'ij',
-        # '\u1D6B': 'ue',
-        '\uFB00': 'ff',
-        '\uFB01': 'fi',
-        '\uFB02': 'fl',
-        '\uFB03': 'ffi',
-        '\uFB04': 'ffl',
-        '\uFB05': 'ft',
-        '\uFB06': 'st',
-        }
+LIGATURES = {'\uFB00': 'ff',
+             '\uFB01': 'fi',
+             '\uFB02': 'fl',
+             '\uFB03': 'ffi',
+             '\uFB04': 'ffl',
+             '\uFB05': 'ft',
+             '\uFB06': 'st'}
+# '\u00c6': 'AE',
+# '\u00e6': 'ae',
+# '\u0152': 'OE',
+# '\u0153': 'oe',
+# '\u0132': 'IJ',
+# '\u0133': 'ij',
+# '\u1D6B': 'ue',

 _ligpat = re.compile('|'.join(LIGATURES))
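For context, LIGATURES and _ligpat defined above are applied later in HTMLPreProcessor.__call__ (see the keep_ligatures hunk below) to expand single-codepoint typographic ligatures into plain letters. A small, self-contained sketch of that substitution, assuming the same table:

    import re

    LIGATURES = {'\uFB00': 'ff', '\uFB01': 'fi', '\uFB02': 'fl', '\uFB03': 'ffi',
                 '\uFB04': 'ffl', '\uFB05': 'ft', '\uFB06': 'st'}
    _ligpat = re.compile('|'.join(LIGATURES))

    # Each matched ligature codepoint is looked up and replaced by its expansion.
    print(_ligpat.sub(lambda m: LIGATURES[m.group()], 'dif\uFB01cult waf\uFB02e'))
    # -> difficult waffle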
@@ -83,17 +77,18 @@ def smarten_punctuation(html, log=None):


 class DocAnalysis(object):
-    '''
-    Provides various text analysis functions to determine how the document is structured.
-    format is the type of document analysis will be done against.
+    """
+    Provides various text analysis functions to determine how the document is
+    structured. format is the type of document analysis will be done against.
     raw is the raw text to determine the line length to use for wrapping.
     Blank lines are excluded from analysis
-    '''
+    """

     def __init__(self, format='html', raw=''):
         raw = raw.replace('&nbsp;', ' ')
         if format == 'html':
-            linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+            linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)',
+                                re.DOTALL)
         elif format == 'pdf':
             linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
         elif format == 'spanned_html':
@@ -103,13 +98,13 @@ class DocAnalysis(object):
         self.lines = linere.findall(raw)

     def line_length(self, percent):
-        '''
+        """
         Analyses the document to find the median line length.
         percentage is a decimal number, 0 - 1 which is used to determine
         how far in the list of line lengths to use. The list of line lengths is
         ordered smallest to largest and does not include duplicates. 0.5 is the
         median value.
-        '''
+        """
         lengths = []
         for line in self.lines:
             if len(line) > 0:
@@ -121,7 +116,7 @@ class DocAnalysis(object):
         lengths = list(set(lengths))
         total = sum(lengths)
         avg = total / len(lengths)
-        max_line = ceil(avg * 2)
+        max_line = math.ceil(avg * 2)

         lengths = sorted(lengths)
         for i in range(len(lengths) - 1, -1, -1):
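As the docstring above describes, line_length() works on the set of distinct line lengths, ordered smallest to largest, and indexes into that list by a 0-1 fraction. A standalone sketch of the selection step (the averaging and max_line trimming in the real method are omitted):

    def pick_length(lines, percent):
        # distinct line lengths, ordered smallest to largest
        lengths = sorted(set(len(line) for line in lines if line))
        # step `percent` of the way into the ordered list
        index = min(int(len(lengths) * percent), len(lengths) - 1)
        return lengths[index]

    print(pick_length(['a' * 60, 'b' * 65, 'c' * 70, 'd' * 72], 0.5))  # -> 70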
@@ -138,31 +133,32 @@ class DocAnalysis(object):
         return lengths[index]

     def line_histogram(self, percent):
-        '''
-        Creates a broad histogram of the document to determine whether it incorporates hard
-        line breaks. Lines are sorted into 20 'buckets' based on length.
-        percent is the percentage of lines that should be in a single bucket to return true
-        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
-        '''
-        minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
-        maxLineLength=1900 # Discard larger than this to stay in range
-        buckets=20 # Each line is divided into a bucket based on length
+        """
+        Creates a broad histogram of the document to determine whether it
+        incorporates hard line breaks. Lines are sorted into 20 'buckets'
+        based on length. percent is the percentage of lines that should be in
+        a single bucket to return true The majority of the lines will exist in
+        1-2 buckets in typical docs with hard line breaks
+        """
+        minLineLength = 20  # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength = 1900  # Discard larger than this to stay in range
+        buckets = 20  # Each line is divided into a bucket based on length

         # print("there are "+str(len(lines))+" lines")
         # max = 0
         # for line in self.lines:
-        #     l = len(line)
-        #     if l > max:
-        #         max = l
+        #     _l = len(line)
+        #     if _l > max:
+        #         max = _l
         # print("max line found is "+str(max))
         # Build the line length histogram
-        hRaw = [0 for i in range(0,buckets)]
+        hRaw = [0 for i in range(0, buckets)]
         for line in self.lines:
-            l = len(line)
-            if l > minLineLength and l < maxLineLength:
-                l = int(l // 100)
-                # print("adding "+str(l))
-                hRaw[l]+=1
+            _l = len(line)
+            if _l > minLineLength and _l < maxLineLength:
+                _l = int(_l // 100)
+                # print("adding "+str(_l))
+                hRaw[_l] += 1

         # Normalize the histogram into percents
         totalLines = len(self.lines)
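line_histogram() above sorts line lengths into buckets of 100 characters and then checks whether a single bucket dominates, which is the signature of hard-wrapped text. A condensed sketch of that test, assuming the same thresholds and 20-bucket layout:

    def looks_hard_wrapped(lines, percent=0.4, buckets=20):
        h_raw = [0] * buckets
        for line in lines:
            length = len(line)
            if 20 < length < 1900:
                h_raw[length // 100] += 1   # 0-99 -> bucket 0, 100-199 -> bucket 1, ...
        h = [count / len(lines) for count in h_raw]   # normalize into percents
        return max(h) > percent             # one dominant bucket => hard line breaks

    print(looks_hard_wrapped(['x' * 70] * 80 + ['y' * 500] * 20))  # -> True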
@@ -175,7 +171,7 @@ class DocAnalysis(object):

         # Find the biggest bucket
         maxValue = 0
-        for i in range(0,len(h)):
+        for i in range(0, len(h)):
             if h[i] > maxValue:
                 maxValue = h[i]

@@ -188,36 +184,42 @@ class DocAnalysis(object):


 class Dehyphenator(object):
-    '''
-    Analyzes words to determine whether hyphens should be retained/removed. Uses the document
-    itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
-    scientific words. The primary disadvantage is that words appearing only once in the document
-    retain hyphens.
-    '''
+    """
+    Analyzes words to determine whether hyphens should be retained/removed.
+    Uses the document itself is as a dictionary. This method handles all
+    languages along with uncommon, made-up, and scientific words. The primary
+    disadvantage is that words appearing only once in the document retain
+    hyphens.
+    """

     def __init__(self, verbose=0, log=None):
         self.log = log
         self.verbose = verbose
-        # Add common suffixes to the regex below to increase the likelihood of a match -
-        # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        # only remove if it's not already the point of hyphenation
-        self.suffix_string = (
-            "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
-            "(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
-            "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
+        # Add common suffixes to the regex below to increase the likelihood of
+        # a match - don't add suffixes which are also complete words, such as
+        # 'able' or 'sex' only remove if it's not already the point of
+        # hyphenation
+        self.suffix_string = ("((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?"
+                              "|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|"
+                              "istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
+                              "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier"
+                              "|al|ex|ian)$")
         self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
-        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
-        # remove prefixes if the prefix was not already the point of hyphenation
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string,
+                                         re.IGNORECASE)
+        # remove prefixes if the prefix was not already the point of
+        # hyphenation
         self.prefix_string = '^(dis|re|un|in|ex)'
         self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
-        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string,
+                                       re.IGNORECASE)

     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
         secondhalf = match.group('secondpart')
         try:
             wraptags = match.group('wraptags')
-        except:
+        except Exception:
             wraptags = ''
         hyphenated = str(firsthalf) + "-" + str(secondhalf)
         dehyphenated = str(firsthalf) + str(secondhalf)
@@ -231,65 +233,84 @@ class Dehyphenator(object):
             self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
         try:
             searchresult = self.html.find(lookupword.lower())
-        except:
+        except Exception:
             return hyphenated
         if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 if self.verbose > 2:
-                    self.log(" Cleanup:returned dehyphenated word: " + dehyphenated)
+                    self.log(" Cleanup:returned dehyphenated word: " +
+                             dehyphenated)
                 return dehyphenated
             elif self.html.find(hyphenated) != -1:
                 if self.verbose > 2:
-                    self.log(" Cleanup:returned hyphenated word: " + hyphenated)
+                    self.log(" Cleanup:returned hyphenated word: " +
+                             hyphenated)
                 return hyphenated
             else:
                 if self.verbose > 2:
-                    self.log(" Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
+                    self.log(" Cleanup:returning original text " +
+                             firsthalf + " + linefeed " + secondhalf)
                 return firsthalf+'\u2014'+wraptags+secondhalf

         else:
-            if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
+            if (self.format == 'individual_words' and
+                    len(firsthalf) + len(secondhalf) <= 6):
                 if self.verbose > 2:
-                    self.log("too short, returned hyphenated word: " + hyphenated)
+                    self.log("too short, returned hyphenated word: " +
+                             hyphenated)
                 return hyphenated
             if len(firsthalf) <= 2 and len(secondhalf) <= 2:
                 if self.verbose > 2:
-                    self.log("too short, returned hyphenated word: " + hyphenated)
+                    self.log("too short, returned hyphenated word: " +
+                             hyphenated)
                 return hyphenated
             if self.html.find(lookupword) != -1 or searchresult != -1:
                 if self.verbose > 2:
-                    self.log(" returned dehyphenated word: " + dehyphenated)
+                    self.log(" returned dehyphenated word: " +
+                             dehyphenated)
                 return dehyphenated
             else:
                 if self.verbose > 2:
-                    self.log(" returned hyphenated word: " + hyphenated)
+                    self.log(" returned hyphenated word: " +
+                             hyphenated)
                 return hyphenated

     def __call__(self, html, format, length=1):
         self.html = html
         self.format = format
         if format == 'html':
-            intextmatch = re.compile((
-                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
-                r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
-                r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
+            intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)'
+                                     r'(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
+                                     r'\s*(</[iubp]>\s*){1,2}'
+                                     r'(?P<up2threeblanks><(p|div)[^>]*>\s*'
+                                     r'(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+)'
+                                     r'{0,3}\s*(<[iubp][^>]*>\s*){1,2}'
+                                     r'(<span[^>]*>)?)\s*(?P<secondpart>'
+                                     r'[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile((
-                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|'
-                r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
+            intextmatch = re.compile(r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)'
+                                     r'\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*'
+                                     r'<[iub]>)\s*(?P<secondpart>[\w\d]+)' %
+                                     length)
         elif format == 'txt':
-            intextmatch = re.compile(
-                '(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
+            intextmatch = re.compile('(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)'
+                                     '(\u0020|\u0009)*(?P<wraptags>'
+                                     '(\n(\u0020|\u0009)*)+)(?P<secondpart>'
+                                     '[\\w\\d]+)' % length)
         elif format == 'individual_words':
-            intextmatch = re.compile(
-                r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
+            intextmatch = re.compile(r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*'
+                                     r'(?P<secondpart>\w+)(?![^<]*?>)',
+                                     re.UNICODE)
         elif format == 'html_cleanup':
-            intextmatch = re.compile(
-                r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
-                r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+            intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)'
+                                     r'(?P<wraptags></span>\s*(</[iubp]>\s*'
+                                     r'<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>'
+                                     r'\s*<[iubp][^>]*>)?\s*(?P<secondpart>'
+                                     r'[\w\d]+)')
         elif format == 'txt_cleanup':
-            intextmatch = re.compile(
-                r'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
+            intextmatch = re.compile(r'(?P<firstpart>[^\W\-]+)(-|‐)'
+                                     r'(?P<wraptags>\s+)(?P<secondpart>'
+                                     r'[\w\d]+)')

         html = intextmatch.sub(self.dehyphenate, html)
         return html
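The Dehyphenator is driven entirely by the document itself: a word split across a hyphen and wrap tags is rejoined only when the joined form can already be found elsewhere in the text. A rough, self-contained sketch of that idea for plain text (much simpler than the class's suffix/prefix handling above):

    import re

    def dehyphenate_txt(text):
        pattern = re.compile(r'(?P<first>\w+)-\s*\n\s*(?P<second>\w+)')

        def repl(match):
            joined = match.group('first') + match.group('second')
            # keep the hyphen unless the joined word occurs elsewhere in the document
            if joined.lower() in text.lower():
                return joined
            return match.group('first') + '-' + match.group('second')

        return pattern.sub(repl, text)

    sample = "The dictionary defines words.\nEach diction-\nary entry is short."
    print(dehyphenate_txt(sample))  # "diction-\nary" becomes "dictionary"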
@@ -299,18 +320,18 @@ class CSSPreProcessor(object):

     # Remove some of the broken CSS Microsoft products
     # create
-    MS_PAT    = re.compile(r'''
+    MS_PAT = re.compile(r'''
         (?P<start>^|;|\{)\s* # The end of the previous rule or block start
         (%s).+? # The invalid selectors
         (?P<end>$|;|\}) # The end of the declaration
-        '''%'mso-|panose-|text-underline|tab-interval',
-        re.MULTILINE|re.IGNORECASE|re.VERBOSE)
+        ''' % 'mso-|panose-|text-underline|tab-interval',
+        re.MULTILINE | re.IGNORECASE | re.VERBOSE)

     def ms_sub(self, match):
         end = match.group('end')
         try:
             start = match.group('start')
-        except:
+        except Exception:
             start = ''
         if end == ';':
             end = ''
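MS_PAT above matches declarations that only Microsoft tools emit (mso-*, panose-*, text-underline, tab-interval) between rule delimiters; ms_sub then drops the matched declaration while keeping the delimiters. A rough sketch of the effect, with the selector list inlined and a simplified replacement instead of the class's ms_sub:

    import re

    MS_PAT = re.compile(r'''
        (?P<start>^|;|\{)\s*  # the end of the previous rule or block start
        (mso-|panose-|text-underline|tab-interval).+?  # the invalid selectors
        (?P<end>$|;|\})       # the end of the declaration
        ''', re.MULTILINE | re.IGNORECASE | re.VERBOSE)

    css = 'p { mso-style-name:"Normal"; color: black; }'
    print(MS_PAT.sub(lambda m: m.group('start') + m.group('end'), css))
    # -> p {; color: black; }   (the real ms_sub also tidies the leftover ';')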
@@ -332,7 +353,7 @@ class CSSPreProcessor(object):
         for line in data.splitlines():
             ll = line.lstrip()
             if not (namespaced or ll.startswith('@import') or not ll or
-                        ll.startswith('@charset')):
+                    ll.startswith('@charset')):
                 ans.append(XHTML_CSS_NAMESPACE.strip())
                 namespaced = True
             ans.append(line)
@@ -359,7 +380,8 @@ def accent_regex(accent_maps, letter_before=False):
     args = ''.join(accent_cat), ''.join(letters)
     accent_group, letter_group = 1, 2

-    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
+    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args),
+                     re.UNICODE)

     def sub(m):
         lmap = accent_maps[m.group(accent_group)]
@@ -371,83 +393,96 @@ def accent_regex(accent_maps, letter_before=False):
 def html_preprocess_rules():
     ans = getattr(html_preprocess_rules, 'ans', None)
     if ans is None:
-        ans = html_preprocess_rules.ans = [
-            # Remove huge block of contiguous spaces as they slow down
-            # the following regexes pretty badly
-            (re.compile(r'\s{10000,}'), ''),
-            # Some idiotic HTML generators (Frontpage I'm looking at you)
-            # Put all sorts of crap into <head>. This messes up lxml
-            (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
-             sanitize_head),
-            # Convert all entities, since lxml doesn't handle them well
-            (re.compile(r'&(\S+?);'), convert_entities),
-            # Remove the <![if/endif tags inserted by everybody's darling, MS Word
-            (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
-        ]
+        ans = [
+            # Remove huge block of contiguous spaces as they slow down
+            # the following regexes pretty badly
+            (re.compile(r'\s{10000,}'), ''),
+            # Some idiotic HTML generators (Frontpage I'm looking at you)
+            # Put all sorts of crap into <head>. This messes up lxml
+            (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>',
+                        re.IGNORECASE | re.DOTALL), sanitize_head),
+            # Convert all entities, since lxml doesn't handle them well
+            (re.compile(r'&(\S+?);'), convert_entities),
+            # Remove the <![if/endif tags inserted by everybody's darling,
+            # MS Word
+            (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
+             '')]
+        html_preprocess_rules.ans = ans
     return ans


 def pdftohtml_rules():
     ans = getattr(pdftohtml_rules, 'ans', None)
     if ans is None:
-        ans = pdftohtml_rules.ans = [
-            accent_regex({
-                '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
-                '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
-                '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
-                'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
-                '¸': 'cC:çÇ',
-                '˛': 'aAeE:ąĄęĘ',
-                '˙': 'zZ:żŻ',
-                'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
-                '°': 'uU:ůŮ',
-            }),
-
-            accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),
-
-            # If pdf printed from a browser then the header/footer has a reliable pattern
-            (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
-
-            # Center separator lines
-            (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
-
-            # Remove <hr> tags
-            (re.compile(r'<hr.*?>', re.IGNORECASE), ''),
-
-            # Remove gray background
-            (re.compile(r'<BODY[^<>]+>'), '<BODY>'),
-
-            # Convert line breaks to paragraphs
-            (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
-            (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
-            (re.compile(r'\s*</body>'), '</p>\n</body>'),
-
-            # Clean up spaces
-            (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
-            # Add space before and after italics
-            (re.compile(r'(?<!“)<i>'), ' <i>'),
-            (re.compile(r'</i>(?=\w)'), '</i> '),
-        ]
+        ans = [accent_regex({'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
+                             '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
+                             '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚ'
+                                  'úÚźŹ',
+                             'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
+                             '¸': 'cC:çÇ',
+                             '˛': 'aAeE:ąĄęĘ',
+                             '˙': 'zZ:żŻ',
+                             'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
+                             '°': 'uU:ůŮ'}),
+               accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'},
+                            letter_before=True),
+               # If pdf printed from a browser then the header/footer has a
+               # reliable pattern
+               (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?'
+                           r'[A-Z].*<br>(?=\s*<hr>))',
+                           re.IGNORECASE), lambda match: ''),
+               # Center separator lines
+               (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'),
+                lambda match: '<p>\n<p style="text-align:center">' +
+                match.group('break') + '</p>'),
+               # Remove <hr> tags
+               (re.compile(r'<hr.*?>', re.IGNORECASE), ''),
+               # Remove gray background
+               (re.compile(r'<BODY[^<>]+>'), '<BODY>'),
+               # Convert line breaks to paragraphs
+               (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
+               (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
+               (re.compile(r'\s*</body>'), '</p>\n</body>'),
+               # Clean up spaces
+               (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
+               # Add space before and after italics
+               (re.compile(r'(?<!“)<i>'), ' <i>'),
+               (re.compile(r'</i>(?=\w)'), '</i> ')]
+        pdftohtml_rules.ans = ans
    return ans


 def book_designer_rules():
     ans = getattr(book_designer_rules, 'ans', None)
     if ans is None:
-        ans = book_designer_rules.ans = [
-            # HR
-            (re.compile('<hr>', re.IGNORECASE),
-             lambda match : '<span style="page-break-after:always"> </span>'),
-            # Create header tags
-            (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
-             lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-            (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
-             lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-            (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
-             lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
-            (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
-             lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
-        ]
+        ans = [(re.compile('<hr>', re.IGNORECASE),
+                lambda match: '<span style="page-break-after:always"> '
+                              '</span>'),
+               # Create header tags
+               (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)'
+                           r'(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
+                lambda match: '<h1 id="BookTitle" align="%s">%s</h1>' %
+                (match.group(2) if match.group(2) else 'center',
+                 match.group(3))),
+               (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)'
+                           r'(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
+                lambda match: '<h2 id="BookAuthor" align="%s">%s</h2>' %
+                (match.group(2) if match.group(2) else 'center',
+                 match.group(3))),
+               (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>',
+                           re.IGNORECASE | re.DOTALL),
+                lambda match: '<h2 class="title">%s</h2>' % (match.group(1),)),
+               (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>',
+                           re.IGNORECASE | re.DOTALL),
+                lambda match: '<h3 class="subtitle">%s</h3>' %
+                (match.group(1),))]
+        book_designer_rules.ans = ans
     return None

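html_preprocess_rules() and pdftohtml_rules() above share one pattern: build the list of (compiled regex, replacement) pairs once, cache it as an attribute on the function itself, and hand the cached list back on every later call; the caller then applies each pair with sub(), as the loop further down does. A condensed sketch of the pattern with a single throwaway rule:

    import re

    def demo_rules():
        ans = getattr(demo_rules, 'ans', None)
        if ans is None:
            ans = [
                # stand-in rule, not one of the real ones: collapse runs of blank lines
                (re.compile(r'\n{3,}'), '\n\n'),
            ]
            demo_rules.ans = ans   # compiled only on the first call
        return ans

    html = 'one\n\n\n\n\ntwo'
    for rule in demo_rules():
        html = rule[0].sub(rule[1], html)
    print(repr(html))  # -> 'one\n\ntwo'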
@@ -470,7 +505,7 @@ class HTMLPreProcessor(object):
         return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]

     def __call__(self, html, remove_special_chars=None,
-            get_preprocess_html=False):
+                 get_preprocess_html=False):
         if remove_special_chars is not None:
             html = remove_special_chars.sub('', html)
         html = html.replace('\0', '')
@@ -487,13 +522,14 @@ class HTMLPreProcessor(object):
         start_rules = []

         if not getattr(self.extra_opts, 'keep_ligatures', False):
-            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
+            html = _ligpat.sub(lambda m: LIGATURES[m.group()], html)

         user_sr_rules = {}
         # Function for processing search and replace

         def do_search_replace(search_pattern, replace_txt):
-            from ebook_converter.ebooks.conversion.search_replace import compile_regular_expression
+            from ebook_converter.ebooks.conversion.search_replace import \
+                compile_regular_expression
             try:
                 search_re = compile_regular_expression(search_pattern)
                 if not replace_txt:
@@ -502,11 +538,11 @@ class HTMLPreProcessor(object):
                 user_sr_rules[(search_re, replace_txt)] = search_pattern
             except Exception as e:
                 self.log.error('Failed to parse %r regexp because %s' %
-                               (search, as_unicode(e)))
+                               (search, e))

         # search / replace using the sr?_search / sr?_replace options
         for i in range(1, 4):
-            search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
+            search, replace = 'sr%d_search' % i, 'sr%d_replace' % i
             search_pattern = getattr(self.extra_opts, search, '')
             replace_txt = getattr(self.extra_opts, replace, '')
             if search_pattern:
@@ -520,31 +556,35 @@ class HTMLPreProcessor(object):
                 do_search_replace(search_pattern, replace_txt)

         end_rules = []
-        # delete soft hyphens - moved here so it's executed after header/footer removal
+        # delete soft hyphens - moved here so it's executed after
+        # header/footer removal
         if is_pdftohtml:
             # unwrap/delete soft hyphens
-            end_rules.append((re.compile(
-                r'[](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
+            end_rules.append((re.compile(r'[](</p>\s*<p>\s*)+\s*'
+                                         r'(?=[\[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens with formatting
-            end_rules.append((re.compile(
-                r'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))
+            end_rules.append((re.compile(r'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+'
+                                         r'\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'),
+                              lambda match: ''))

         length = -1
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             docanalysis = DocAnalysis('pdf', html)
-            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
+            length = docanalysis.line_length(getattr(self.extra_opts,
+                                                     'unwrap_factor'))
             if length:
                 # print("The pdf line length returned is " + str(length))
                 # unwrap em/en dashes
-                end_rules.append((re.compile(
-                    r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
+                end_rules.append((re.compile(r'(?<=.{%i}[–—])\s*<p>\s*'
+                                             r'(?=[\[a-z\d])' % length),
+                                  lambda match: ''))
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile((
-                        r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]'
-                        r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
-                        r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
-                )
+                    (re.compile((r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçą'
+                                 r'ężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4})'
+                                 r';))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*'
+                                 r'<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])') %
+                                 length, re.UNICODE), wrap_lines))

         for rule in html_preprocess_rules() + start_rules:
             html = rule[0].sub(rule[1], html)
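The pdf unwrap rules above are built at runtime because their lookbehind embeds the measured line length: a paragraph break is only unwrapped when it occurs at least that many characters into a line and follows an em/en dash or suitable punctuation. A small sketch of one such length-parameterised rule, assuming a measured length of 20:

    import re

    length = 20  # in the real code: DocAnalysis('pdf', html).line_length(unwrap_factor)
    # drop a <p> break that follows an em dash at least `length` characters into the line
    unwrap_dash = re.compile(r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length)

    html = '<p>This line was split by pdftohtml right after a dash—<p>and continues here.</p>'
    print(unwrap_dash.sub('', html))
    # -> <p>This line was split by pdftohtml right after a dash—and continues here.</p>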
@@ -567,7 +607,7 @@ class HTMLPreProcessor(object):
             name, i = None, 0
             while not name or os.path.exists(os.path.join(odir, name)):
                 i += 1
-                name = '%04d.html'%i
+                name = '%04d.html' % i
             with open(os.path.join(odir, name), 'wb') as f:
                 f.write(raw.encode('utf-8'))

@@ -578,20 +618,20 @@ class HTMLPreProcessor(object):
                 html = rule[0].sub(rule[1], html)
             except Exception as e:
                 if rule in user_sr_rules:
-                    self.log.error(
-                        'User supplied search & replace rule: %s -> %s '
-                        'failed with error: %s, ignoring.'%(
-                            user_sr_rules[rule], rule[1], e))
+                    self.log.error('User supplied search & replace rule: %s '
+                                   '-> %s failed with error: %s, ignoring.' %
+                                   (user_sr_rules[rule], rule[1], e))
                 else:
                     raise

         if is_pdftohtml and length > -1:
             # Dehyphenate
             dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
-            html = dehyphenator(html,'html', length)
+            html = dehyphenator(html, 'html', length)

         if is_pdftohtml:
-            from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
+            from ebook_converter.ebooks.conversion.utils import \
+                HeuristicProcessor
             pdf_markup = HeuristicProcessor(self.extra_opts, None)
             totalwords = 0
             if pdf_markup.get_word_count(html) > 7000:
@@ -613,23 +653,26 @@ class HTMLPreProcessor(object):
             from ebook_converter.utils.localization import get_udc
             from ebook_converter.utils.mreplace import MReplace
             unihandecoder = get_udc()
-            mr = MReplace(data={'«':'&lt;'*3, '»':'&gt;'*3})
+            mr = MReplace(data={'«': '&lt;' * 3, '»': '&gt;' * 3})
             html = mr.mreplace(html)
             html = unihandecoder.decode(html)

         if getattr(self.extra_opts, 'enable_heuristics', False):
-            from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
+            from ebook_converter.ebooks.conversion.utils import \
+                HeuristicProcessor
             preprocessor = HeuristicProcessor(self.extra_opts, self.log)
             html = preprocessor(html)

         if is_pdftohtml:
-            html = html.replace('<!-- created by ebook-converter\'s pdftohtml -->', '')
+            html = html.replace('<!-- created by ebook-converter\'s '
+                                'pdftohtml -->', '')

         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = smarten_punctuation(html, self.log)

         try:
-            unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
+            unsupported_unicode_chars = (self.extra_opts.output_profile
+                                         .unsupported_unicode_chars)
         except AttributeError:
             unsupported_unicode_chars = ''
         if unsupported_unicode_chars: