mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-17 01:24:12 +01:00
260 lines
8.4 KiB
Python
260 lines
8.4 KiB
Python
import html
|
|
import logging
|
|
import math
|
|
import mimetypes
|
|
import os
|
|
import pkg_resources
|
|
import re
|
|
import sys
|
|
|
|
from functools import partial
|
|
|
|
from ebook_converter import constants_old
|
|
from ebook_converter.constants_old import \
|
|
__appname__, __version__, __author__, \
|
|
config_dir
|
|
from ebook_converter.ebooks.html_entities import html5_entities
|
|
|
|
|
|
if False:
    # Never executed: merely mentioning the names re-exported from
    # constants_old keeps pyflakes from reporting them as unused imports.
    (__appname__, __version__, __author__, config_dir)
|
|
|
|
|
|
def init_mimetypes():
    """
    Initialise the stdlib ``mimetypes`` registry from the ``data/mime.types``
    resource of the ``ebook_converter`` package.
    """
    mime_types_file = pkg_resources.resource_filename('ebook_converter',
                                                      'data/mime.types')
    mimetypes.init([mime_types_file])
|
|
|
|
|
|
def guess_extension(*args, **kwargs):
    """
    Wrapper around ``mimetypes.guess_extension`` that additionally knows
    about ``application/x-palmreader``.

    All arguments are forwarded unchanged. When the stdlib has no answer and
    the first positional argument is ``application/x-palmreader``, ``.pdb``
    is returned; otherwise the stdlib result is returned as is.
    """
    guessed = mimetypes.guess_extension(*args, **kwargs)
    if guessed:
        return guessed
    if args and args[0] == 'application/x-palmreader':
        return '.pdb'
    return guessed
|
|
|
|
|
|
def sanitize_file_name(name, substitute='_'):
    """
    Sanitize the filename `name`. All invalid characters are replaced by
    `substitute`. The set of invalid characters is the union of the invalid
    characters in Windows, macOS and Linux. Also removes leading and trailing
    whitespace.

    **WARNING:** This function also replaces path separators, so only pass
    file names and not full paths to it.

    :param name: File name as str or bytes. Bytes are decoded with the
                 filesystem encoding, replacing undecodable bytes.
    :param substitute: Replacement for invalid characters (str or bytes).
    :return: The sanitized file name as str.
    """
    if isinstance(name, bytes):
        name = name.decode(constants_old.filesystem_encoding, 'replace')
    if isinstance(substitute, bytes):
        substitute = substitute.decode(constants_old.filesystem_encoding,
                                       'replace')

    # Build the lookup set once per call instead of once per character.
    invalid = frozenset('\\|?*<":>+/').union(map(chr, range(32)))
    cleaned = ''.join(substitute if ch in invalid else ch for ch in name)

    # Normalise every kind of whitespace to a plain space, then trim.
    cleaned = re.sub(r'\s', ' ', cleaned).strip()

    stem, ext = os.path.splitext(cleaned)
    # A stem consisting only of periods ('.', '..', ...) becomes '_'
    stem = re.sub(r'^\.+$', '_', stem)
    stem = stem.replace('..', substitute)
    result = stem + ext

    # Windows doesn't like path components that end with a period or space
    if result and result[-1] in ('.', ' '):
        result = result[:-1] + '_'
    # Names starting with a period are hidden on Unix
    if result.startswith('.'):
        result = '_' + result[1:]
    return result
|
|
|
|
|
|
def setup_cli_handlers(logger, level):
    """
    Set `logger` to `level` and attach a console handler suited to it.

    * ``logging.DEBUG``: stderr, ``[LEVEL] file:line: message``
    * ``logging.INFO``: stdout, default formatter
    * any other level: stdout, ``LEVEL: message`` (this is the original
      ``logging.WARNING`` configuration; other levels used to crash with an
      unbound ``handler`` variable)

    Nothing is done when the ``CALIBRE_WORKER`` environment variable is set
    and the logger already has handlers.

    :param logger: The ``logging.Logger`` to configure.
    :param level: Numeric logging level for both logger and handler.
    """
    if os.getenv('CALIBRE_WORKER') and logger.handlers:
        return
    logger.setLevel(level)

    if level == logging.DEBUG:
        stream = sys.stderr
        fmt = '[%(levelname)s] %(filename)s:%(lineno)s: %(message)s'
    elif level == logging.INFO:
        # None selects the default formatter, same as logging.Formatter()
        stream, fmt = sys.stdout, None
    else:
        stream, fmt = sys.stdout, '%(levelname)s: %(message)s'

    handler = logging.StreamHandler(stream)
    handler.setFormatter(logging.Formatter(fmt))
    handler.setLevel(level)
    logger.addHandler(handler)
|
|
|
|
|
|
def fit_image(width, height, pwidth, pheight):
    """
    Scale an image down, if necessary, so that it fits inside a box.

    The image is first clamped to the box height, then to the box width and
    finally to the box height once more; computed dimensions are floored.

    @param width: Width of image
    @param height: Height of image
    @param pwidth: Width of box
    @param pheight: Height of box
    @return: scaled, new_width, new_height. scaled is True iff the image
             was larger than the box in at least one dimension.
    """
    def clamp_height(w, h):
        if h > pheight:
            ratio = pheight / float(h)
            return math.floor(ratio * w), pheight
        return w, h

    def clamp_width(w, h):
        if w > pwidth:
            ratio = pwidth / float(w)
            return pwidth, math.floor(ratio * h)
        return w, h

    too_big = height > pheight or width > pwidth
    new_w, new_h = clamp_height(width, height)
    new_w, new_h = clamp_width(new_w, new_h)
    new_w, new_h = clamp_height(new_w, new_h)
    return too_big, int(new_w), int(new_h)
|
|
|
|
|
|
class CurrentDir:
    """
    Context manager that switches the working directory to `path` for the
    duration of the ``with`` block. The ``as`` target receives the directory
    that was current before the switch.
    """

    def __init__(self, path):
        self.cwd = None
        self.path = path

    def __enter__(self, *args):
        previous = os.getcwd()
        self.cwd = previous
        os.chdir(self.path)
        return previous

    def __exit__(self, *args):
        try:
            os.chdir(self.cwd)
        except OSError:
            # The previous CWD no longer exists; stay where we are.
            # (EnvironmentError is an alias of OSError on Python 3.)
            pass
|
|
|
|
|
|
def walk(dir):
    """Yield the path of every file found by os.walk below `dir`."""
    for root, _subdirs, filenames in os.walk(dir):
        for filename in filenames:
            yield os.path.join(root, filename)
|
|
|
|
|
|
def my_unichr(num):
    """Return chr(num), or '?' when num is not a valid code point."""
    try:
        char = chr(num)
    except (ValueError, OverflowError):
        char = '?'
    return char
|
|
|
|
|
|
def entity_to_unicode(match, exceptions=[], encoding='cp1252',
                      result_exceptions={}):
    """
    Convert a single matched entity to the character it stands for.

    :param match: A match object such that '&'+match.group(1)+';' is the
                  entity.

    :param exceptions: A list of entities to not convert (each entry is the
                       name of the entity, e.g. 'apos' or '#1234').

    :param encoding: The encoding to use to decode numeric entities between
                     128 and 256. If None, the Unicode UCS encoding is used.
                     A common encoding is cp1252.

    :param result_exceptions: A mapping of characters to entities. If the
                              result is in result_exceptions,
                              result_exceptions[result] is returned instead.
                              Convenient way to specify exception for things
                              like < or > that can be specified by various
                              actual entities.

    :return: The replacement text, or the entity unchanged when it is
             excluded, malformed or unknown.

    The mutable defaults are only ever read, never modified.
    """

    def check(ch):
        return result_exceptions.get(ch, ch)

    ent = match.group(1)
    if ent in exceptions:
        return '&'+ent+';'
    # squot is generated by some broken CMS software
    if ent in {'apos', 'squot'}:
        return check("'")
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            if ent[1] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except (IndexError, ValueError):
            # '#' alone, or digits that do not parse
            return '&'+ent+';'
        if num < 0:
            # int() accepts a sign, but '&#-5;' is not a character reference.
            # Without this guard bytearray() below raised ValueError.
            return '&'+ent+';'
        if encoding is None or num > 255:
            return check(my_unichr(num))
        try:
            # 0-255: interpret as a single byte in the legacy encoding
            return check(bytes(bytearray((num,))).decode(encoding))
        except UnicodeDecodeError:
            return check(my_unichr(num))
    try:
        return check(html5_entities[ent])
    except KeyError:
        pass
    try:
        return check(my_unichr(html.entities.name2codepoint[ent]))
    except KeyError:
        return '&'+ent+';'
|
|
|
|
|
|
# Matches anything shaped like an entity: '&', a non-greedy run of
# non-whitespace characters, then ';'. Group 1 is the entity name.
_ent_pat = re.compile(r'&(\S+?);')
# Variant of entity_to_unicode for XML output: whenever the converted result
# is one of the XML special characters, the escaped form is returned instead
# so that the text stays well-formed.
xml_entity_to_unicode = partial(entity_to_unicode,
                                result_exceptions={'"': '&quot;',
                                                   "'": '&apos;',
                                                   '<': '&lt;',
                                                   '>': '&gt;',
                                                   '&': '&amp;'})
|
|
|
|
|
|
def replace_entities(raw, encoding='cp1252'):
    """
    Return `raw` with every match of _ent_pat replaced by the result of
    entity_to_unicode, called with the given `encoding`.
    """
    converter = partial(entity_to_unicode, encoding=encoding)
    return _ent_pat.sub(converter, raw)
|
|
|
|
|
|
def xml_replace_entities(raw, encoding='cp1252'):
    """
    Return `raw` with every match of _ent_pat replaced by the result of
    xml_entity_to_unicode, called with the given `encoding`.
    """
    converter = partial(xml_entity_to_unicode, encoding=encoding)
    return _ent_pat.sub(converter, raw)
|
|
|
|
|
|
def prepare_string_for_xml(raw, attribute=False):
    """
    Escape `raw` for inclusion in XML.

    Existing entities are first converted to characters via
    entity_to_unicode, so that already escaped input is not escaped twice;
    afterwards the XML special characters are escaped.

    :param raw: The text to escape.
    :param attribute: If True, additionally escape double and single quotes
                      so the result can be used inside an attribute value.
    :return: The escaped text.
    """
    raw = _ent_pat.sub(entity_to_unicode, raw)
    # '&' has to be handled first, otherwise the ampersands introduced by
    # the following replacements would be escaped again.
    raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    if attribute:
        # NOTE(review): the original apostrophe escape was lost when this
        # file was extracted; '&#39;' is used as it is valid in both XML and
        # HTML - confirm against upstream.
        raw = raw.replace('"', '&quot;').replace("'", '&#39;')
    return raw
|
|
|
|
|
|
def force_unicode(obj, enc=constants_old.preferred_encoding):
    """
    Return `obj` as text, decoding it when it is a bytes object.

    Decoding is attempted with `enc`, then with the filesystem encoding (or
    the preferred encoding when `enc` is not the preferred one), then with
    UTF-8. If all attempts fail, repr(obj) is returned. Objects that are not
    bytes are returned unchanged.

    :param obj: Any object.
    :param enc: First encoding to try for bytes input.
    """
    if not isinstance(obj, bytes):
        return obj

    # Failures are deliberately swallowed: each one just moves on to the
    # next, more generic, attempt.
    try:
        return obj.decode(enc)
    except Exception:
        pass
    try:
        if enc == constants_old.preferred_encoding:
            fallback = constants_old.filesystem_encoding
        else:
            fallback = constants_old.preferred_encoding
        return obj.decode(fallback)
    except Exception:
        pass
    try:
        return obj.decode('utf-8')
    except Exception:
        # Last resort; always yields a str, so no further decoding is needed.
        return repr(obj)
|
|
|
|
|
|
def human_readable(size, sep=' '):
    """
    Convert a size in bytes into a human readable form.

    The value is expressed in the largest binary unit (B, KB, ... EB; steps
    of 1024) it reaches, truncated (not rounded) to one decimal place, and a
    trailing '.0' is dropped. Sizes beyond the EB range are expressed in EB.

    :param size: Size in bytes.
    :param sep: String placed between the number and the unit.
    :return: For example '1.5 KB'.
    """
    suffixes = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB')
    # Default to the largest unit so that values too big for every unit are
    # not reported in bytes.
    unit = len(suffixes) - 1
    for i in range(len(suffixes)):
        if size < (1 << ((i + 1) * 10)):
            unit = i
            break
    text = str(float(size) / (1 << (unit * 10)))
    dot = text.find('.')
    if dot > -1:
        # keep a single digit after the decimal point
        text = text[:dot + 2]
    if text.endswith('.0'):
        text = text[:-2]
    return text + sep + suffixes[unit]
|