diff --git a/ebook_converter/__init__.py b/ebook_converter/__init__.py
index 6bd24e5..b6f80e7 100644
--- a/ebook_converter/__init__.py
+++ b/ebook_converter/__init__.py
@@ -13,10 +13,10 @@ try:
except EnvironmentError:
os.chdir(os.path.expanduser('~'))
-from ebook_converter.constants_old import (iswindows, isosx, islinux, isfrozen,
- isbsd, preferred_encoding, __appname__, __version__, __author__,
- win32event, win32api, winerror, fcntl,
- filesystem_encoding, plugins, config_dir)
+from ebook_converter.constants_old import iswindows, islinux, isfrozen, \
+ isbsd, preferred_encoding, __appname__, __version__, __author__, \
+ win32event, win32api, winerror, fcntl, \
+ filesystem_encoding, plugins, config_dir
from ebook_converter.startup import winutil, winutilerror
from ebook_converter.utils.icu import safe_chr
@@ -51,23 +51,28 @@ def confirm_config_name(name):
return name + '_again'
-_filename_sanitize_unicode = frozenset(('\\', '|', '?', '*', '<', # no2to3
- '"', ':', '>', '+', '/') + tuple(map(chr, range(32)))) # no2to3
+_filename_sanitize_unicode = frozenset(('\\', '|', '?', '*', '<',
+ '"', ':', '>', '+', '/') +
+ tuple(map(chr, range(32))))
def sanitize_file_name(name, substitute='_'):
- '''
- Sanitize the filename `name`. All invalid characters are replaced by `substitute`.
- The set of invalid characters is the union of the invalid characters in Windows,
- macOS and Linux. Also removes leading and trailing whitespace.
- **WARNING:** This function also replaces path separators, so only pass file names
- and not full paths to it.
- '''
+ """
+ Sanitize the filename `name`. All invalid characters are replaced by
+ `substitute`. The set of invalid characters is the union of the invalid
+ characters in Windows, macOS and Linux. Also removes leading and trailing
+ whitespace.
+
+ **WARNING:** This function also replaces path separators, so only pass
+ file names and not full paths to it.
+ """
+
if isinstance(name, bytes):
name = name.decode(filesystem_encoding, 'replace')
if isinstance(substitute, bytes):
substitute = substitute.decode(filesystem_encoding, 'replace')
- chars = (substitute if c in _filename_sanitize_unicode else c for c in name)
+ chars = (substitute
+ if c in _filename_sanitize_unicode else c for c in name)
one = ''.join(chars)
one = re.sub(r'\s', ' ', one).strip()
bname, ext = os.path.splitext(one)
@@ -87,8 +92,8 @@ def prints(*args, **kwargs):
"""
Print unicode arguments safely by encoding them to preferred_encoding
Has the same signature as the print function from Python 3, except for the
- additional keyword argument safe_encode, which if set to True will cause the
- function to use repr when encoding fails.
+ additional keyword argument safe_encode, which if set to True will cause
+ the function to use repr when encoding fails.
Returns the number of bytes written.
"""
@@ -120,7 +125,7 @@ def prints(*args, **kwargs):
except UnicodeEncodeError:
try:
arg = arg.encode('utf-8')
- except:
+ except Exception:
if not safe_encode:
raise
arg = repr(arg)
@@ -131,7 +136,7 @@ def prints(*args, **kwargs):
except UnicodeEncodeError:
try:
arg = arg.encode('utf-8')
- except:
+ except Exception:
if not safe_encode:
raise
arg = repr(arg)
@@ -139,7 +144,7 @@ def prints(*args, **kwargs):
try:
file.write(arg)
count += len(arg)
- except:
+ except Exception:
from polyglot import reprlib
arg = reprlib.repr(arg)
file.write(arg)
@@ -168,22 +173,12 @@ def setup_cli_handlers(logger, level):
elif level == logging.DEBUG:
handler = logging.StreamHandler(sys.stderr)
handler.setLevel(logging.DEBUG)
- handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s'))
+ handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:'
+ '%(lineno)s: %(message)s'))
logger.addHandler(handler)
-def load_library(name, cdll):
- if iswindows:
- return cdll.LoadLibrary(name)
- if isosx:
- name += '.dylib'
- if hasattr(sys, 'frameworks_dir'):
- return cdll.LoadLibrary(os.path.join(getattr(sys, 'frameworks_dir'), name))
- return cdll.LoadLibrary(name)
- return cdll.LoadLibrary(name+'.so')
-
-
def extract(path, dir):
extractor = None
# First use the file header to identify its type
@@ -216,7 +211,8 @@ def fit_image(width, height, pwidth, pheight):
@param height: Height of image
@param pwidth: Width of box
@param pheight: Height of box
- @return: scaled, new_width, new_height. scaled is True iff new_width and/or new_height is different from width or height.
+ @return: scaled, new_width, new_height. scaled is True iff new_width
+ and/or new_height is different from width or height.
'''
scaled = height > pheight or width > pwidth
if height > pheight:
@@ -262,8 +258,10 @@ def walk(dir):
def strftime(fmt, t=None):
- ''' A version of strftime that returns unicode strings and tries to handle dates
- before 1900 '''
+ """
+ A version of strftime that returns unicode strings and tries to handle
+ dates before 1900
+ """
if not fmt:
return ''
if t is None:
@@ -272,7 +270,7 @@ def strftime(fmt, t=None):
t = t.timetuple()
early_year = t[0] < 1900
if early_year:
- replacement = 1900 if t[0]%4 == 0 else 1901
+ replacement = 1900 if t[0] % 4 == 0 else 1901
fmt = fmt.replace('%Y', '_early year hack##')
t = list(t)
orig_year = t[0]
@@ -301,27 +299,33 @@ def my_unichr(num):
def entity_to_unicode(match, exceptions=[], encoding='cp1252',
- result_exceptions={}):
- '''
+ result_exceptions={}):
+ """
:param match: A match object such that '&'+match.group(1)';' is the entity.
- :param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234'
+ :param exceptions: A list of entities to not convert (Each entry is the
+ name of the entity, e.g. 'apos' or '#1234')
- :param encoding: The encoding to use to decode numeric entities between 128 and 256.
- If None, the Unicode UCS encoding is used. A common encoding is cp1252.
+ :param encoding: The encoding to use to decode numeric entities between
+ 128 and 256. If None, the Unicode UCS encoding is used.
+ A common encoding is cp1252.
+
+ :param result_exceptions: A mapping of characters to entities. If the
+ result is in result_exceptions,
+ result_exceptions[result] is returned instead.
+ Convenient way to specify exception for things
+ like < or > that can be specified by various
+ actual entities.
+ """
- :param result_exceptions: A mapping of characters to entities. If the result
- is in result_exceptions, result_exception[result] is returned instead.
- Convenient way to specify exception for things like < or > that can be
- specified by various actual entities.
- '''
def check(ch):
return result_exceptions.get(ch, ch)
ent = match.group(1)
if ent in exceptions:
return '&'+ent+';'
- if ent in {'apos', 'squot'}: # squot is generated by some broken CMS software
+ # squot is generated by some broken CMS software
+ if ent in {'apos', 'squot'}:
return check("'")
if ent == 'hellips':
ent = 'hellip'
@@ -331,7 +335,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
num = int(ent[2:], 16)
else:
num = int(ent[1:])
- except:
+ except Exception:
return '&'+ent+';'
if encoding is None or num > 255:
return check(my_unichr(num))
@@ -394,15 +398,6 @@ def force_unicode(obj, enc=preferred_encoding):
return obj
-def as_unicode(obj, enc=preferred_encoding):
- if not isinstance(obj, bytes):
- try:
- obj = str(obj)
- except Exception:
- obj = repr(obj)
- return force_unicode(obj, enc=enc)
-
-
def url_slash_cleaner(url):
'''
Removes redundant /'s from url's.
diff --git a/ebook_converter/ebooks/conversion/preprocess.py b/ebook_converter/ebooks/conversion/preprocess.py
index 1cc42b4..9f8bfcf 100644
--- a/ebook_converter/ebooks/conversion/preprocess.py
+++ b/ebook_converter/ebooks/conversion/preprocess.py
@@ -1,43 +1,37 @@
-import functools, re, json
-from math import ceil
+import functools
+import json
+import math
+import re
-from ebook_converter import entity_to_unicode, as_unicode
+from ebook_converter import entity_to_unicode
-__license__ = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal ]*>\s*
]*>\s*
).*?(?=)', + re.DOTALL) elif format == 'pdf': linere = re.compile(r'(?<=]*>\s*
\s*)' - r'?(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(]*>)?)\s*(?P]*>\s*
\s*)?(p|div)>\s+)' + r'{0,3}\s*(<[iubp][^>]*>\s*){1,2}' + r'(]*>)?)\s*(?P|' - r'[iub]>\s*
\s*<[iub]>)\s*(?P |[iub]>\s* \s*'
+ r'<[iub]>)\s*(?P
\n
' + + match.group('break') + '
'), - # Center separator lines - (re.compile(r'\n
' + match.group('break') + '
'), + # Remove'), + (re.compile(r'
]*>\s*'), '\n'), + (re.compile(r'\s*'), '
\n'), - # Convert line breaks to paragraphs - (re.compile(r''), - (re.compile(r'
]*>\s*'), '\n'), - (re.compile(r'\s*'), '
\n'), - - # Clean up spaces - (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '), - # Add space before and after italics - (re.compile(r'(?'), ' '), - (re.compile(r'(?=\w)'), ' '), - ] + # Clean up spaces + (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '), + # Add space before and after italics + (re.compile(r'(?'), ' '), + (re.compile(r'(?=\w)'), ' ')] + pdftohtml_rules.ans = ans return ans def book_designer_rules(): ans = getattr(book_designer_rules, 'ans', None) if ans is None: - ans = book_designer_rules.ans = [ - # HR - (re.compile('\s*)+\s*(?=[\[a-z\d])'), lambda match: '')) + end_rules.append((re.compile(r'[](
\s*\s*)+\s*' + r'(?=[\[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens with formatting - end_rules.append((re.compile( - r'[]\s*((i|u|b)>)+(
\s*\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: '')) + end_rules.append((re.compile(r'[]\s*((i|u|b)>)+(
\s*\s*)+' + r'\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), + lambda match: '')) length = -1 if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: docanalysis = DocAnalysis('pdf', html) - length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor')) + length = docanalysis.line_length(getattr(self.extra_opts, + 'unwrap_factor')) if length: # print("The pdf line length returned is " + str(length)) # unwrap em/en dashes - end_rules.append((re.compile( - r'(?<=.{%i}[–—])\s*
\s*(?=[\[a-z\d])' % length), lambda match: '')) + end_rules.append((re.compile(r'(?<=.{%i}[–—])\s*
\s*' + r'(?=[\[a-z\d])' % length), + lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(( - r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]' - r'|(?(i|b|u)>)?\s*(
\s*\s*)+\s*(?=(<(i|b|u)>)?' - r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines), - ) + (re.compile((r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçą' + r'ężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?(i|b|u)>)?\s*(
\s*' + r'\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])') %
+ length, re.UNICODE), wrap_lines))
for rule in html_preprocess_rules() + start_rules:
html = rule[0].sub(rule[1], html)
@@ -567,7 +607,7 @@ class HTMLPreProcessor(object):
name, i = None, 0
while not name or os.path.exists(os.path.join(odir, name)):
i += 1
- name = '%04d.html'%i
+ name = '%04d.html' % i
with open(os.path.join(odir, name), 'wb') as f:
f.write(raw.encode('utf-8'))
@@ -578,20 +618,20 @@ class HTMLPreProcessor(object):
html = rule[0].sub(rule[1], html)
except Exception as e:
if rule in user_sr_rules:
- self.log.error(
- 'User supplied search & replace rule: %s -> %s '
- 'failed with error: %s, ignoring.'%(
- user_sr_rules[rule], rule[1], e))
+ self.log.error('User supplied search & replace rule: %s '
+ '-> %s failed with error: %s, ignoring.' %
+ (user_sr_rules[rule], rule[1], e))
else:
raise
if is_pdftohtml and length > -1:
# Dehyphenate
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
- html = dehyphenator(html,'html', length)
+ html = dehyphenator(html, 'html', length)
if is_pdftohtml:
- from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
+ from ebook_converter.ebooks.conversion.utils import \
+ HeuristicProcessor
pdf_markup = HeuristicProcessor(self.extra_opts, None)
totalwords = 0
if pdf_markup.get_word_count(html) > 7000:
@@ -613,23 +653,26 @@ class HTMLPreProcessor(object):
from ebook_converter.utils.localization import get_udc
from ebook_converter.utils.mreplace import MReplace
unihandecoder = get_udc()
- mr = MReplace(data={'«':'<'*3, '»':'>'*3})
+ mr = MReplace(data={'«': '<' * 3, '»': '>' * 3})
html = mr.mreplace(html)
html = unihandecoder.decode(html)
if getattr(self.extra_opts, 'enable_heuristics', False):
- from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
+ from ebook_converter.ebooks.conversion.utils import \
+ HeuristicProcessor
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
html = preprocessor(html)
if is_pdftohtml:
- html = html.replace('', '')
+ html = html.replace('', '')
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = smarten_punctuation(html, self.log)
try:
- unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
+ unsupported_unicode_chars = (self.extra_opts.output_profile
+ .unsupported_unicode_chars)
except AttributeError:
unsupported_unicode_chars = ''
if unsupported_unicode_chars:
diff --git a/ebook_converter/ebooks/html/input.py b/ebook_converter/ebooks/html/input.py
index 3a61e1d..58b69bc 100644
--- a/ebook_converter/ebooks/html/input.py
+++ b/ebook_converter/ebooks/html/input.py
@@ -10,19 +10,13 @@ import urllib.parse
from ebook_converter.ebooks.oeb.base import urlunquote
from ebook_converter.ebooks.chardet import detect_xml_encoding
from ebook_converter.constants_old import iswindows
-from ebook_converter import unicode_path, as_unicode, replace_entities
-
-
-__license__ = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal