1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-04-03 11:03:32 +02:00

Move force_unicode to utils package

This commit is contained in:
2020-11-11 19:30:50 +01:00
parent 35c34c3b45
commit 3152c52839
16 changed files with 72 additions and 60 deletions

View File

@@ -111,22 +111,3 @@ def prepare_string_for_xml(raw, attribute=False):
if attribute: if attribute:
raw = raw.replace('"', '"').replace("'", ''') raw = raw.replace('"', '"').replace("'", ''')
return raw return raw
def force_unicode(obj, enc=constants_old.preferred_encoding):
    """Return obj as str, decoding bytes on a best-effort basis.

    Objects that are not ``bytes`` are returned unchanged. For ``bytes``,
    decoding is attempted with ``enc`` first, then with a fallback (the
    filesystem encoding when ``enc`` equals the preferred encoding, the
    preferred encoding otherwise) and finally with UTF-8. If every attempt
    fails, ``repr(obj)`` is returned, so bytes input always yields ``str``.

    :param obj: object to convert; only ``bytes`` instances are touched.
    :param enc: first encoding to try.
    :return: ``str`` for bytes input, otherwise ``obj`` itself.
    """
    if not isinstance(obj, bytes):
        return obj

    if enc == constants_old.preferred_encoding:
        fallback = constants_old.filesystem_encoding
    else:
        fallback = constants_old.preferred_encoding

    for candidate in (enc, fallback, 'utf-8'):
        try:
            return obj.decode(candidate)
        except Exception:
            # Deliberately broad: besides UnicodeDecodeError, an unknown or
            # invalid encoding name must also fall through to the next
            # candidate instead of propagating to the caller.
            continue

    # repr() always returns str on Python 3, so the result needs no further
    # decoding (the old bytes re-check after this point could never fire).
    return repr(obj)

View File

@@ -21,7 +21,7 @@ import math
import bs4 import bs4
from PIL import Image as PILImage from PIL import Image as PILImage
from ebook_converter import entity_to_unicode, force_unicode from ebook_converter import entity_to_unicode
from ebook_converter.constants_old import __appname__, filesystem_encoding, \ from ebook_converter.constants_old import __appname__, filesystem_encoding, \
preferred_encoding preferred_encoding
from ebook_converter.devices.interface import DevicePlugin as Device from ebook_converter.devices.interface import DevicePlugin as Device
@@ -37,6 +37,7 @@ from ebook_converter.ebooks.lrf.pylrs.pylrs import (
RuledLine, Span, Sub, Sup, TextBlock RuledLine, Span, Sub, Sup, TextBlock
) )
from ebook_converter.ptempfile import PersistentTemporaryFile from ebook_converter.ptempfile import PersistentTemporaryFile
from ebook_converter.utils import encoding as uenc
from ebook_converter.utils import img as uimg from ebook_converter.utils import img as uimg
@@ -1935,8 +1936,8 @@ class HTMLConverter(object):
def process_file(path, options, logger): def process_file(path, options, logger):
path = os.path.abspath(path) path = os.path.abspath(path)
default_title = force_unicode(os.path.splitext(os.path.basename(path))[0], default_title = os.path.splitext(os.path .basename(path))[0]
filesystem_encoding) default_title = uenc.force_unicode(default_title, filesystem_encoding)
dirpath = os.path.dirname(path) dirpath = os.path.dirname(path)
tpath = '' tpath = ''

View File

@@ -8,9 +8,9 @@ import re
import sys import sys
import urllib.parse import urllib.parse
from ebook_converter import force_unicode
from ebook_converter.utils.config_base import tweaks from ebook_converter.utils.config_base import tweaks
from ebook_converter.polyglot.urllib import unquote from ebook_converter.polyglot.urllib import unquote
from ebook_converter.utils import encoding as uenc
try: try:
@@ -43,7 +43,7 @@ def remove_bracketed_text(src, brackets=None):
from collections import Counter from collections import Counter
counts = Counter() counts = Counter()
buf = [] buf = []
src = force_unicode(src) src = uenc.force_unicode(src)
rmap = {v: k for k, v in brackets.items()} rmap = {v: k for k, v in brackets.items()}
for char in src: for char in src:
if char in brackets: if char in brackets:
@@ -75,7 +75,7 @@ def author_to_author_sort(author, method=None):
if method == 'copy': if method == 'copy':
return author return author
prefixes = {force_unicode(y).lower() prefixes = {uenc.force_unicode(y).lower()
for y in tweaks['author_name_prefixes']} for y in tweaks['author_name_prefixes']}
prefixes |= {y+'.' for y in prefixes} prefixes |= {y+'.' for y in prefixes}
while True: while True:
@@ -87,7 +87,7 @@ def author_to_author_sort(author, method=None):
else: else:
break break
suffixes = {force_unicode(y).lower() suffixes = {uenc.force_unicode(y).lower()
for y in tweaks['author_name_suffixes']} for y in tweaks['author_name_suffixes']}
suffixes |= {y+'.' for y in suffixes} suffixes |= {y+'.' for y in suffixes}

View File

@@ -12,10 +12,10 @@ from lxml import etree
from ebook_converter.utils.date import parse_only_date from ebook_converter.utils.date import parse_only_date
from ebook_converter.utils.img import save_cover_data_to from ebook_converter.utils.img import save_cover_data_to
from ebook_converter.utils.imghdr import identify from ebook_converter.utils.imghdr import identify
from ebook_converter import force_unicode
from ebook_converter.ebooks.metadata import MetaInformation, check_isbn from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.polyglot.binary import as_base64_unicode from ebook_converter.polyglot.binary import as_base64_unicode
from ebook_converter.utils import encoding as uenc
NAMESPACES = {'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0', NAMESPACES = {'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0',
@@ -110,7 +110,7 @@ def get_metadata(stream):
if book_title: if book_title:
book_title = str(book_title) book_title = str(book_title)
else: else:
book_title = force_unicode(os.path.splitext( book_title = uenc.force_unicode(os.path.splitext(
os.path.basename(getattr(stream, 'name', 'Unknown')))[0]) os.path.basename(getattr(stream, 'name', 'Unknown')))[0])
mi = MetaInformation(book_title, authors) mi = MetaInformation(book_title, authors)

View File

@@ -4,8 +4,8 @@ Edit metadata in RTF files.
import codecs import codecs
import re import re
from ebook_converter import force_unicode
from ebook_converter.ebooks.metadata import MetaInformation from ebook_converter.ebooks.metadata import MetaInformation
from ebook_converter.utils import encoding as uenc
title_pat = re.compile(br'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL) title_pat = re.compile(br'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
author_pat = re.compile(br'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL) author_pat = re.compile(br'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
@@ -74,7 +74,7 @@ def detect_codepage(stream):
def encode(unistr): def encode(unistr):
if not isinstance(unistr, str): if not isinstance(unistr, str):
unistr = force_unicode(unistr) unistr = uenc.force_unicode(unistr)
return ''.join(c if ord(c) < 128 else return ''.join(c if ord(c) < 128 else
'\\u{}?'.format(ord(c)) for c in unistr) '\\u{}?'.format(ord(c)) for c in unistr)

View File

@@ -17,12 +17,12 @@ from lxml import etree
from lxml import html from lxml import html
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.constants_old import filesystem_encoding, __version__ from ebook_converter.constants_old import filesystem_encoding, __version__
from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ebooks.conversion.preprocess import CSSPreProcessor from ebook_converter.ebooks.conversion.preprocess import CSSPreProcessor
from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.utils import encoding as uenc
from ebook_converter.utils.short_uuid import uuid4 from ebook_converter.utils.short_uuid import uuid4
@@ -1074,7 +1074,7 @@ class Manifest(object):
def sort_key(self): def sort_key(self):
href = self.href href = self.href
if isinstance(href, bytes): if isinstance(href, bytes):
href = force_unicode(href) href = uenc.force_unicode(href)
if isinstance(self.spine_position, numbers.Number): if isinstance(self.spine_position, numbers.Number):
sp = self.spine_position sp = self.spine_position

View File

@@ -5,9 +5,10 @@ from lxml import etree
from lxml import html from lxml import html
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter import xml_replace_entities, force_unicode from ebook_converter import xml_replace_entities
from ebook_converter.constants_old import filesystem_encoding from ebook_converter.constants_old import filesystem_encoding
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from ebook_converter.utils import encoding as uenc
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True,
@@ -159,7 +160,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
if log is None: if log is None:
log = LOG log = LOG
filename = force_unicode(filename, enc=filesystem_encoding) filename = uenc.force_unicode(filename, enc=filesystem_encoding)
if not isinstance(data, str): if not isinstance(data, str):
if decoder is not None: if decoder is not None:

View File

@@ -3,12 +3,12 @@ import functools
from css_parser.css import CSSRule, CSSStyleDeclaration from css_parser.css import CSSRule, CSSStyleDeclaration
from ebook_converter import force_unicode
from ebook_converter.css_selectors import parse, SelectorSyntaxError from ebook_converter.css_selectors import parse, SelectorSyntaxError
from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish import pretty from ebook_converter.ebooks.oeb.polish import pretty
from ebook_converter.utils.icu import numeric_sort_key from ebook_converter.utils.icu import numeric_sort_key
from ebook_converter.css_selectors import Select, SelectorError from ebook_converter.css_selectors import Select, SelectorError
from ebook_converter.utils import encoding as uenc
def filter_used_rules(rules, log, select): def filter_used_rules(rules, log, select):
@@ -137,7 +137,7 @@ def remove_unused_css(container, report=None, remove_unused_classes=False,
if unused_rules: if unused_rules:
num_of_removed_rules += len(unused_rules) num_of_removed_rules += len(unused_rules)
[sheet.cssRules.remove(r) for r in unused_rules] [sheet.cssRules.remove(r) for r in unused_rules]
style.text = force_unicode(sheet.cssText, 'utf-8') style.text = uenc.force_unicode(sheet.cssText, 'utf-8')
pretty.pretty_script_or_style(container, style) pretty.pretty_script_or_style(container, style)
container.dirty(name) container.dirty(name)
@@ -241,7 +241,7 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
sheet = container.parse_css(style.text) sheet = container.parse_css(style.text)
if transform_sheet(sheet): if transform_sheet(sheet):
changed = True changed = True
style.text = force_unicode(sheet.cssText, 'utf-8') style.text = uenc.force_unicode(sheet.cssText, 'utf-8')
pretty.pretty_script_or_style(container, style) pretty.pretty_script_or_style(container, style)
for elem in root.xpath('//*[@style]'): for elem in root.xpath('//*[@style]'):
text = elem.get('style', None) text = elem.get('style', None)
@@ -253,8 +253,9 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
del elem.attrib['style'] del elem.attrib['style']
else: else:
elem.set('style', elem.set('style',
force_unicode(style.getCssText(separator=' '), uenc.force_unicode(style
'utf-8')) .getCssText(separator=' '),
'utf-8'))
return changed return changed

View File

@@ -1,10 +1,10 @@
import textwrap import textwrap
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.utils import guess_type from ebook_converter.ebooks.oeb.polish.utils import guess_type
from ebook_converter.utils import encoding as uenc
from ebook_converter.utils.icu import sort_key from ebook_converter.utils.icu import sort_key
@@ -157,8 +157,9 @@ def pretty_script_or_style(container, child):
if child.text: if child.text:
indent = indent_for_tag(child) indent = indent_for_tag(child)
if child.tag.endswith('style'): if child.tag.endswith('style'):
child.text = force_unicode(pretty_css(container, '', child.text), child.text = uenc.force_unicode(pretty_css(container, '',
'utf-8') child.text),
'utf-8')
child.text = textwrap.dedent(child.text) child.text = textwrap.dedent(child.text)
child.text = '\n' + '\n'.join([(indent + x) if x else '' child.text = '\n' + '\n'.join([(indent + x) if x else ''
for x in child.text.splitlines()]) for x in child.text.splitlines()])

View File

@@ -12,12 +12,12 @@ from css_parser import (profile as cssprofiles, parseString, parseStyle, log as
css_parser_log, CSSParser, profiles, replaceUrls) css_parser_log, CSSParser, profiles, replaceUrls)
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.ebooks import unit_convert from ebook_converter.ebooks import unit_convert
from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.normalize_css import DEFAULTS, normalizers from ebook_converter.ebooks.oeb.normalize_css import DEFAULTS, normalizers
from ebook_converter.css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES from ebook_converter.css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES
from ebook_converter.tinycss.media3 import CSSMedia3Parser from ebook_converter.tinycss.media3 import CSSMedia3Parser
from ebook_converter.utils import encoding as uenc
css_parser_log.setLevel(logging.WARN) css_parser_log.setLevel(logging.WARN)
@@ -223,10 +223,10 @@ class Stylizer(object):
for x in elem: for x in elem:
t = getattr(x, 'text', None) t = getattr(x, 'text', None)
if t: if t:
text += '\n\n' + force_unicode(t, 'utf-8') text += '\n\n' + uenc.force_unicode(t, 'utf-8')
t = getattr(x, 'tail', None) t = getattr(x, 'tail', None)
if t: if t:
text += '\n\n' + force_unicode(t, 'utf-8') text += '\n\n' + uenc.force_unicode(t, 'utf-8')
if text: if text:
text = oeb.css_preprocessor(text) text = oeb.css_preprocessor(text)
# We handle @import rules separately # We handle @import rules separately

View File

@@ -14,12 +14,12 @@ from lxml.etree import XPath as _XPath
from lxml import etree from lxml import etree
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.ebooks.epub import rules from ebook_converter.ebooks.epub import rules
from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.split import do_split from ebook_converter.ebooks.oeb.polish.split import do_split
from ebook_converter.polyglot.urllib import unquote from ebook_converter.polyglot.urllib import unquote
from ebook_converter.css_selectors import Select, SelectorError from ebook_converter.css_selectors import Select, SelectorError
from ebook_converter.utils import encoding as uenc
__license__ = 'GPL v3' __license__ = 'GPL v3'
@@ -86,10 +86,12 @@ class Split(object):
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
base.OEB_STYLES] base.OEB_STYLES]
for rule in rules(stylesheets): for rule in rules(stylesheets):
before = force_unicode(getattr(rule.style.getPropertyCSSValue( before = uenc.force_unicode(
'page-break-before'), 'cssText', '').strip().lower()) getattr(rule.style.getPropertyCSSValue(
after = force_unicode(getattr(rule.style.getPropertyCSSValue( 'page-break-before'), 'cssText', '').strip().lower())
'page-break-after'), 'cssText', '').strip().lower()) after = uenc.force_unicode(
getattr(rule.style.getPropertyCSSValue(
'page-break-after'), 'cssText', '').strip().lower())
try: try:
if before and before not in {'avoid', 'auto', 'inherit'}: if before and before not in {'avoid', 'auto', 'inherit'}:
self.page_break_selectors.add((rule.selectorText, self.page_break_selectors.add((rule.selectorText,

View File

@@ -119,21 +119,24 @@ def reset_base_dir():
base_dir() base_dir()
def force_unicode(x): def _force_unicode(x):
# Cannot use the implementation in calibre.__init__ as it causes a circular # Cannot use the implementation in calibre.__init__ as it causes a circular
# dependency # dependency
# NOTE(gryf): Congratulations! That's the 3rd function in this codebase
# called force_unicode! I guess that forcing unicode on text objects is
# some kind of hobby.
if isinstance(x, bytes): if isinstance(x, bytes):
x = x.decode(filesystem_encoding) x = x.decode(filesystem_encoding)
return x return x
def _make_file(suffix, prefix, base): def _make_file(suffix, prefix, base):
suffix, prefix = map(force_unicode, (suffix, prefix)) # no2to3 suffix, prefix = map(_force_unicode, (suffix, prefix)) # no2to3
return tempfile.mkstemp(suffix, prefix, dir=base) return tempfile.mkstemp(suffix, prefix, dir=base)
def _make_dir(suffix, prefix, base): def _make_dir(suffix, prefix, base):
suffix, prefix = map(force_unicode, (suffix, prefix)) # no2to3 suffix, prefix = map(_force_unicode, (suffix, prefix)) # no2to3
return tempfile.mkdtemp(suffix, prefix, base) return tempfile.mkdtemp(suffix, prefix, base)

View File

@@ -7,10 +7,11 @@ import json
import numbers import numbers
import os import os
import pickle import pickle
import pkg_resources
import re import re
import traceback import traceback
import pkg_resources
from ebook_converter.constants_old import config_dir from ebook_converter.constants_old import config_dir
from ebook_converter.constants_old import filesystem_encoding from ebook_converter.constants_old import filesystem_encoding
from ebook_converter.constants_old import preferred_encoding from ebook_converter.constants_old import preferred_encoding
@@ -75,7 +76,8 @@ def from_json(obj):
return obj return obj
def force_unicode(x): def _force_unicode(x):
# TODO(gryf): eliminate the cases where this kind of function is needed.
try: try:
return x.decode(preferred_encoding) return x.decode(preferred_encoding)
except UnicodeDecodeError: except UnicodeDecodeError:
@@ -87,7 +89,7 @@ def force_unicode(x):
def force_unicode_recursive(obj): def force_unicode_recursive(obj):
if isinstance(obj, bytes): if isinstance(obj, bytes):
return force_unicode(obj) return _force_unicode(obj)
if isinstance(obj, (list, tuple)): if isinstance(obj, (list, tuple)):
return type(obj)(map(force_unicode_recursive, obj)) return type(obj)(map(force_unicode_recursive, obj))
if isinstance(obj, dict): if isinstance(obj, dict):

View File

@@ -0,0 +1,20 @@
from ebook_converter import constants_old
def force_unicode(obj, enc=constants_old.preferred_encoding):
    """Return obj as str, decoding bytes on a best-effort basis.

    Objects that are not ``bytes`` are returned unchanged. For ``bytes``,
    decoding is attempted with ``enc`` first, then with a fallback (the
    filesystem encoding when ``enc`` equals the preferred encoding, the
    preferred encoding otherwise) and finally with UTF-8. If every attempt
    fails, ``repr(obj)`` is returned, so bytes input always yields ``str``.

    :param obj: object to convert; only ``bytes`` instances are touched.
    :param enc: first encoding to try.
    :return: ``str`` for bytes input, otherwise ``obj`` itself.
    """
    if not isinstance(obj, bytes):
        return obj

    if enc == constants_old.preferred_encoding:
        fallback = constants_old.filesystem_encoding
    else:
        fallback = constants_old.preferred_encoding

    for candidate in (enc, fallback, 'utf-8'):
        try:
            return obj.decode(candidate)
        except Exception:
            # Deliberately broad: besides UnicodeDecodeError, an unknown or
            # invalid encoding name must also fall through to the next
            # candidate instead of propagating to the caller.
            continue

    # repr() always returns str on Python 3, so the result needs no further
    # decoding (the old bytes re-check after this point could never fire).
    return repr(obj)

View File

@@ -9,9 +9,9 @@ import shutil
from math import ceil from math import ceil
from ebook_converter import constants_old from ebook_converter import constants_old
from ebook_converter import force_unicode
from ebook_converter.constants_old import (filesystem_encoding, from ebook_converter.constants_old import (filesystem_encoding,
preferred_encoding) preferred_encoding)
from ebook_converter.utils import encoding as uenc
from ebook_converter.utils.localization import get_udc from ebook_converter.utils.localization import get_udc
@@ -183,7 +183,7 @@ def case_preserving_open_file(path, mode='wb', mkdir_mode=0o777):
path = os.path.abspath(path) path = os.path.abspath(path)
sep = force_unicode(os.sep, 'ascii') sep = uenc.force_unicode(os.sep, 'ascii')
if path.endswith(sep): if path.endswith(sep):
path = path[:-1] path = path[:-1]

View File

@@ -12,11 +12,11 @@ from threading import Thread
#from PyQt5.QtCore import QBuffer, QByteArray, Qt #from PyQt5.QtCore import QBuffer, QByteArray, Qt
#from PyQt5.QtGui import QColor, QImage, QImageReader, QImageWriter, QPixmap, QTransform #from PyQt5.QtGui import QColor, QImage, QImageReader, QImageWriter, QPixmap, QTransform
from ebook_converter import force_unicode
from ebook_converter.constants_old import plugins from ebook_converter.constants_old import plugins
from ebook_converter.ptempfile import TemporaryDirectory from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.utils.config_base import tweaks from ebook_converter.utils.config_base import tweaks
from ebook_converter.utils.filenames import atomic_rename from ebook_converter.utils.filenames import atomic_rename
from ebook_converter.utils import encoding as uenc
from ebook_converter.utils.imghdr import what from ebook_converter.utils.imghdr import what
# Utilities {{{ # Utilities {{{
@@ -586,7 +586,7 @@ def run_optimizer(file_path, cmd, as_filter=False, input_data=None):
outw = Thread(name='CopyOutput', target=copy, args=(p.stdout, outf)) outw = Thread(name='CopyOutput', target=copy, args=(p.stdout, outf))
outw.daemon = True outw.daemon = True
outw.start() outw.start()
raw = force_unicode(stderr.read()) raw = uenc.force_unicode(stderr.read())
if p.wait() != 0: if p.wait() != 0:
return raw return raw
else: else: