1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-02-20 16:55:50 +01:00

Removed as_unicode function

This commit is contained in:
2020-06-14 19:02:23 +02:00
parent add7a8ca56
commit fdd531f6e0
7 changed files with 412 additions and 366 deletions

View File

@@ -13,10 +13,10 @@ try:
except EnvironmentError:
os.chdir(os.path.expanduser('~'))
from ebook_converter.constants_old import (iswindows, isosx, islinux, isfrozen,
isbsd, preferred_encoding, __appname__, __version__, __author__,
win32event, win32api, winerror, fcntl,
filesystem_encoding, plugins, config_dir)
from ebook_converter.constants_old import iswindows, islinux, isfrozen, \
isbsd, preferred_encoding, __appname__, __version__, __author__, \
win32event, win32api, winerror, fcntl, \
filesystem_encoding, plugins, config_dir
from ebook_converter.startup import winutil, winutilerror
from ebook_converter.utils.icu import safe_chr
@@ -51,23 +51,28 @@ def confirm_config_name(name):
return name + '_again'
_filename_sanitize_unicode = frozenset(('\\', '|', '?', '*', '<', # no2to3
'"', ':', '>', '+', '/') + tuple(map(chr, range(32)))) # no2to3
_filename_sanitize_unicode = frozenset(('\\', '|', '?', '*', '<',
'"', ':', '>', '+', '/') +
tuple(map(chr, range(32))))
def sanitize_file_name(name, substitute='_'):
'''
Sanitize the filename `name`. All invalid characters are replaced by `substitute`.
The set of invalid characters is the union of the invalid characters in Windows,
macOS and Linux. Also removes leading and trailing whitespace.
**WARNING:** This function also replaces path separators, so only pass file names
and not full paths to it.
'''
"""
Sanitize the filename `name`. All invalid characters are replaced by
`substitute`. The set of invalid characters is the union of the invalid
characters in Windows, macOS and Linux. Also removes leading and trailing
whitespace.
**WARNING:** This function also replaces path separators, so only pass
file names and not full paths to it.
"""
if isinstance(name, bytes):
name = name.decode(filesystem_encoding, 'replace')
if isinstance(substitute, bytes):
substitute = substitute.decode(filesystem_encoding, 'replace')
chars = (substitute if c in _filename_sanitize_unicode else c for c in name)
chars = (substitute
if c in _filename_sanitize_unicode else c for c in name)
one = ''.join(chars)
one = re.sub(r'\s', ' ', one).strip()
bname, ext = os.path.splitext(one)
@@ -87,8 +92,8 @@ def prints(*args, **kwargs):
"""
Print unicode arguments safely by encoding them to preferred_encoding
Has the same signature as the print function from Python 3, except for the
additional keyword argument safe_encode, which if set to True will cause the
function to use repr when encoding fails.
additional keyword argument safe_encode, which if set to True will cause
the function to use repr when encoding fails.
Returns the number of bytes written.
"""
@@ -120,7 +125,7 @@ def prints(*args, **kwargs):
except UnicodeEncodeError:
try:
arg = arg.encode('utf-8')
except:
except Exception:
if not safe_encode:
raise
arg = repr(arg)
@@ -131,7 +136,7 @@ def prints(*args, **kwargs):
except UnicodeEncodeError:
try:
arg = arg.encode('utf-8')
except:
except Exception:
if not safe_encode:
raise
arg = repr(arg)
@@ -139,7 +144,7 @@ def prints(*args, **kwargs):
try:
file.write(arg)
count += len(arg)
except:
except Exception:
from polyglot import reprlib
arg = reprlib.repr(arg)
file.write(arg)
@@ -168,22 +173,12 @@ def setup_cli_handlers(logger, level):
elif level == logging.DEBUG:
handler = logging.StreamHandler(sys.stderr)
handler.setLevel(logging.DEBUG)
handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:%(lineno)s: %(message)s'))
handler.setFormatter(logging.Formatter('[%(levelname)s] %(filename)s:'
'%(lineno)s: %(message)s'))
logger.addHandler(handler)
def load_library(name, cdll):
    """
    Load the shared library `name` through `cdll.LoadLibrary`.

    The name handed to the loader depends on the platform flags:

    * `iswindows` set: `name` is used unchanged.
    * `isosx` set: '.dylib' is appended; if `sys` has a `frameworks_dir`
      attribute the library is looked up inside that directory.
    * otherwise: '.so' is appended.

    :param name: Base name of the library, without extension.
    :param cdll: Object providing a `LoadLibrary` method.
    :return: Whatever `cdll.LoadLibrary` returns.
    """
    if iswindows:
        return cdll.LoadLibrary(name)
    if not isosx:
        return cdll.LoadLibrary(name + '.so')

    dylib = name + '.dylib'
    if hasattr(sys, 'frameworks_dir'):
        # Prefer the copy located in the frameworks directory, if one is set.
        dylib = os.path.join(sys.frameworks_dir, dylib)
    return cdll.LoadLibrary(dylib)
def extract(path, dir):
extractor = None
# First use the file header to identify its type
@@ -216,7 +211,8 @@ def fit_image(width, height, pwidth, pheight):
@param height: Height of image
@param pwidth: Width of box
@param pheight: Height of box
@return: scaled, new_width, new_height. scaled is True iff new_width and/or new_height is different from width or height.
@return: scaled, new_width, new_height. scaled is True iff new_width
and/or new_height is different from width or height.
'''
scaled = height > pheight or width > pwidth
if height > pheight:
@@ -262,8 +258,10 @@ def walk(dir):
def strftime(fmt, t=None):
''' A version of strftime that returns unicode strings and tries to handle dates
before 1900 '''
"""
A version of strftime that returns unicode strings and tries to handle
dates before 1900
"""
if not fmt:
return ''
if t is None:
@@ -272,7 +270,7 @@ def strftime(fmt, t=None):
t = t.timetuple()
early_year = t[0] < 1900
if early_year:
replacement = 1900 if t[0]%4 == 0 else 1901
replacement = 1900 if t[0] % 4 == 0 else 1901
fmt = fmt.replace('%Y', '_early year hack##')
t = list(t)
orig_year = t[0]
@@ -301,27 +299,33 @@ def my_unichr(num):
def entity_to_unicode(match, exceptions=[], encoding='cp1252',
result_exceptions={}):
'''
result_exceptions={}):
"""
:param match: A match object such that '&'+match.group(1)';' is the entity.
:param exceptions: A list of entities to not convert (Each entry is the name of the entity, for e.g. 'apos' or '#1234'
:param exceptions: A list of entities to not convert (each entry is the
name of the entity, e.g. 'apos' or '#1234').
:param encoding: The encoding to use to decode numeric entities between 128 and 256.
If None, the Unicode UCS encoding is used. A common encoding is cp1252.
:param encoding: The encoding to use to decode numeric entities between
128 and 256. If None, the Unicode UCS encoding is used.
A common encoding is cp1252.
:param result_exceptions: A mapping of characters to entities. If the
result is in result_exceptions,
result_exceptions[result] is returned instead.
Convenient way to specify exception for things
like < or > that can be specified by various
actual entities.
"""
:param result_exceptions: A mapping of characters to entities. If the result
is in result_exceptions, result_exception[result] is returned instead.
Convenient way to specify exception for things like < or > that can be
specified by various actual entities.
'''
def check(ch):
return result_exceptions.get(ch, ch)
ent = match.group(1)
if ent in exceptions:
return '&'+ent+';'
if ent in {'apos', 'squot'}: # squot is generated by some broken CMS software
# squot is generated by some broken CMS software
if ent in {'apos', 'squot'}:
return check("'")
if ent == 'hellips':
ent = 'hellip'
@@ -331,7 +335,7 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
num = int(ent[2:], 16)
else:
num = int(ent[1:])
except:
except Exception:
return '&'+ent+';'
if encoding is None or num > 255:
return check(my_unichr(num))
@@ -394,15 +398,6 @@ def force_unicode(obj, enc=preferred_encoding):
return obj
def as_unicode(obj, enc=preferred_encoding):
    """
    Turn `obj` into text and pass it through force_unicode().

    Bytes objects are handed to force_unicode() untouched. Anything else is
    first converted with str(); if that raises, repr() is used instead.

    :param obj: Object to convert.
    :param enc: Encoding forwarded to force_unicode().
    :return: The result of force_unicode().
    """
    if isinstance(obj, bytes):
        return force_unicode(obj, enc=enc)
    try:
        text = str(obj)
    except Exception:
        # Objects with a broken __str__ still get a usable representation.
        text = repr(obj)
    return force_unicode(text, enc=enc)
def url_slash_cleaner(url):
'''
Removes redundant /'s from url's.