import functools
import json
import math
import re
from ebook_converter import entity_to_unicode
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
# Convert HTML entities to their unicode characters, but keep the five
# XML-reserved characters escaped (mapping them back to their named
# entities) so the surrounding markup itself stays well-formed.
convert_entities = functools.partial(
        entity_to_unicode,
        result_exceptions={
            '<': '&lt;',
            '>': '&gt;',
            "'": '&apos;',
            '"': '&quot;',
            '&': '&amp;',
        })
_span_pat = re.compile(' ]*>\s*'+chap+'
\n'
else:
return ''+chap+'
\n'+title+'
\n'
def wrap_lines(match):
    """Regex callback used when unwrapping hard-wrapped lines.

    Replaces the matched line break with a single space.  If the break
    fell inside italic markup (captured by the named group ``ital``),
    the captured markup is re-emitted ahead of the space so the
    formatting is preserved across the join.
    """
    italic_markup = match.group('ital')
    return (italic_markup + ' ') if italic_markup else ' '
def smarten_punctuation(html, log=None):
    """Return ``html`` with straight quotes/dashes/ellipses converted to
    typographic ("smart") equivalents via smartyPants.

    HTML comment delimiters are temporarily swapped for unique placeholder
    tokens so the punctuation transforms cannot mangle comment contents,
    then restored before entities are substituted back.

    :param html: the markup to transform.
    :param log: optional logger passed through to HeuristicProcessor.
    """
    from ebook_converter.utils.smartypants import smartyPants
    from ebook_converter.ebooks.chardet import substitute_entites
    from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
    # UUID-based tokens are effectively guaranteed not to collide with
    # any text already present in the document.
    start = 'calibre-smartypants-'+str(uuid4())
    stop = 'calibre-smartypants-'+str(uuid4())
    # Protect comment delimiters from the punctuation transforms.
    # (The previous code called html.replace('', stop), which inserts the
    # token between every character and corrupts the whole document.)
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    # Restore the comment delimiters.
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
    return substitute_entites(html)
class DocAnalysis(object):
"""
Provides various text analysis functions to determine how the document is
structured. format is the type of document analysis will be done against.
raw is the raw text to determine the line length to use for wrapping.
Blank lines are excluded from analysis
"""
def __init__(self, format='html', raw=''):
raw = raw.replace(' ', ' ')
if format == 'html':
linere = re.compile(r'(?<=
]*>\s*
\s*)?(p|div)>\s+)' r'{0,3}\s*(<[iubp][^>]*>\s*){1,2}' r'(]*>)?)\s*(?P|[iub]>\s*
\s*'
r'<[iub]>)\s*(?P
\n
' + match.group('break') + '
'), # Remove'), (re.compile(r'
]*>\s*'), '\n'), (re.compile(r'\s*'), '
\n'), # Clean up spaces (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '), # Add space before and after italics (re.compile(r'(?'), ' '), (re.compile(r'(?=\w)'), ' ')] pdftohtml_rules.ans = ans return ans def book_designer_rules(): ans = getattr(book_designer_rules, 'ans', None) if ans is None: ans = [(re.compile('\s*)+\s*' r'(?=[\[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens with formatting end_rules.append((re.compile(r'[]\s*((i|u|b)>)+(
\s*\s*)+' r'\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: '')) length = -1 if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: docanalysis = DocAnalysis('pdf', html) length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor')) if length: # print("The pdf line length returned is " + str(length)) # unwrap em/en dashes end_rules.append((re.compile(r'(?<=.{%i}[–—])\s*
\s*' r'(?=[\[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation (re.compile((r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçą' r'ężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?(i|b|u)>)?\s*(
\s*' r'\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])') % length, re.UNICODE), wrap_lines)) for rule in html_preprocess_rules() + start_rules: html = rule[0].sub(rule[1], html) if self.regex_wizard_callback is not None: self.regex_wizard_callback(self.current_href, html) if get_preprocess_html: return html def dump(raw, where): import os dp = getattr(self.extra_opts, 'debug_pipeline', None) if dp and os.path.exists(dp): odir = os.path.join(dp, 'input') if os.path.exists(odir): odir = os.path.join(odir, where) if not os.path.exists(odir): os.makedirs(odir) name, i = None, 0 while not name or os.path.exists(os.path.join(odir, name)): i += 1 name = '%04d.html' % i with open(os.path.join(odir, name), 'wb') as f: f.write(raw.encode('utf-8')) # dump(html, 'pre-preprocess') for rule in rules + end_rules: try: html = rule[0].sub(rule[1], html) except Exception as e: if rule in user_sr_rules: self.log.error('User supplied search & replace rule: %s ' '-> %s failed with error: %s, ignoring.' % (user_sr_rules[rule], rule[1], e)) else: raise if is_pdftohtml and length > -1: # Dehyphenate dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log) html = dehyphenator(html, 'html', length) if is_pdftohtml: from ebook_converter.ebooks.conversion.utils import \ HeuristicProcessor pdf_markup = HeuristicProcessor(self.extra_opts, None) totalwords = 0 if pdf_markup.get_word_count(html) > 7000: html = pdf_markup.markup_chapters(html, totalwords, True) # dump(html, 'post-preprocess') # Handle broken XHTML w/ SVG (ugh) if 'svg:' in html and SVG_NS not in html: html = html.replace( '', '') if getattr(self.extra_opts, 'smarten_punctuation', False): html = smarten_punctuation(html, self.log) try: unsupported_unicode_chars = (self.extra_opts.output_profile .unsupported_unicode_chars) except AttributeError: unsupported_unicode_chars = '' if unsupported_unicode_chars: from ebook_converter.utils.localization import get_udc unihandecoder = get_udc() for char in unsupported_unicode_chars: asciichar = 
unihandecoder.decode(char) html = html.replace(char, asciichar) return html