1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-01 07:22:26 +01:00

Get rid of icu string functions in favor of native ones.

This commit is contained in:
2020-05-03 20:19:11 +02:00
parent 212cb56d42
commit da010d7841
10 changed files with 138 additions and 185 deletions

View File

@@ -6,6 +6,7 @@ from collections import defaultdict
from itertools import count
from operator import attrgetter
import urllib.parse
import string
from lxml import etree, html
from ebook_converter import force_unicode
@@ -763,7 +764,7 @@ class Metadata(object):
key = barename(key)
attrib[key] = prefixname(value, nsrmap)
if namespace(self.term) == DC11_NS:
name = DC(icu_title(barename(self.term)))
name = DC(string.capwords(barename(self.term)))
elem = element(dcmeta, name, attrib=attrib)
elem.text = self.value
else:

View File

@@ -93,10 +93,14 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
if num:
container.dirty(name)
num_merged += num
import_map = {name:get_imported_sheets(name, container, sheets) for name in sheets}
import_map = {name: get_imported_sheets(name, container, sheets)
for name in sheets}
if remove_unused_classes:
class_map = {name:{icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} for name, sheet in sheets.items()}
style_rules = {name:tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) for name, sheet in sheets.items()}
class_map = {name: {x.lower() for x in
classes_in_rule_list(sheet.cssRules)}
for name, sheet in sheets.items()}
style_rules = {name: tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
for name, sheet in sheets.items()}
num_of_removed_rules = num_of_removed_classes = 0
@@ -115,7 +119,7 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
num_merged += num
container.dirty(name)
if remove_unused_classes:
used_classes |= {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)}
used_classes |= {x.lower() for x in classes_in_rule_list(sheet.cssRules)}
imports = get_imported_sheets(name, container, sheets, sheet=sheet)
for imported_sheet in imports:
style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
@@ -147,7 +151,7 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
for elem in root.xpath('//*[@class]'):
original_classes, classes = elem.get('class', '').split(), []
for x in original_classes:
if icu_lower(x) in used_classes:
if x.lower() in used_classes:
classes.append(x)
if len(classes) != len(original_classes):
if classes:

View File

@@ -1,5 +1,6 @@
import re, os
from bisect import bisect
import bisect
import os
import re
from ebook_converter import guess_type as _guess_type, replace_entities
@@ -8,6 +9,10 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
def _upper(string):
return string.upper()
def guess_type(x):
    """Return the guessed MIME type for *x*.

    Falls back to the generic ``application/octet-stream`` when the
    underlying guesser yields nothing.
    """
    mime_type = _guess_type(x)[0]
    if not mime_type:
        mime_type = 'application/octet-stream'
    return mime_type
@@ -23,7 +28,8 @@ def setup_css_parser_serialization(tab_width=2):
def actual_case_for_name(container, name):
from ebook_converter.utils.filenames import samefile
if not container.exists(name):
raise ValueError('Cannot get actual case for %s as it does not exist' % name)
raise ValueError('Cannot get actual case for %s as it does not '
'exist' % name)
parts = name.split('/')
base = ''
ans = []
@@ -55,9 +61,12 @@ def corrected_case_for_name(container, name):
correctx = x
else:
try:
candidates = {q for q in os.listdir(os.path.dirname(container.name_to_abspath(base)))}
dirname = os.path.dirname(container.name_to_abspath(base))
candidates = {q for q in os.listdir(dirname)}
except EnvironmentError:
return None # one of the non-terminal components of name is a file instead of a directory
# one of the non-terminal components of name is a file instead
# of a directory
return None
for q in candidates:
if q.lower() == x.lower():
correctx = q
@@ -75,7 +84,7 @@ class PositionFinder(object):
self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))
def __call__(self, pos):
lnum = bisect(self.new_lines, pos)
lnum = bisect.bisect(self.new_lines, pos)
try:
offset = abs(pos - self.new_lines[lnum - 1])
except IndexError:
@@ -94,7 +103,7 @@ class CommentFinder(object):
def __call__(self, offset):
if not self.starts:
return False
q = bisect(self.starts, offset) - 1
q = bisect.bisect(self.starts, offset) - 1
return q >= 0 and self.starts[q] <= offset <= self.ends[q]
@@ -182,13 +191,16 @@ def handle_entities(text, func):
return func(replace_entities(text))
def apply_func_to_match_groups(match, func=icu_upper, handle_entities=handle_entities):
'''Apply the specified function to individual groups in the match object (the result of re.search() or
the whole match if no groups were defined. Returns the replaced string.'''
def apply_func_to_match_groups(match, func=_upper,
handle_entities=handle_entities):
"""
Apply the specified function to individual groups in the match object (the
result of re.search() or
the whole match if no groups were defined. Returns the replaced string.
"""
found_groups = False
i = 0
parts, pos = [], match.start()
f = lambda text:handle_entities(text, func)
while True:
i += 1
try:
@@ -198,19 +210,22 @@ def apply_func_to_match_groups(match, func=icu_upper, handle_entities=handle_ent
found_groups = True
if start > -1:
parts.append(match.string[pos:start])
parts.append(f(match.string[start:end]))
parts.append(handle_entities(match.string[start:end], func))
pos = end
if not found_groups:
return f(match.group())
return handle_entities(match.group(), func)
parts.append(match.string[pos:match.end()])
return ''.join(parts)
def apply_func_to_html_text(match, func=icu_upper, handle_entities=handle_entities):
''' Apply the specified function only to text between HTML tag definitions. '''
f = lambda text:handle_entities(text, func)
def apply_func_to_html_text(match, func=_upper,
                            handle_entities=handle_entities):
    """
    Apply the specified function only to text between HTML tag definitions.
    """
    # Split keeps the tag markup as separate list items because the
    # pattern is a capturing group.
    segments = re.split(r'(<[^>]+>)', match.group())
    out = []
    for segment in segments:
        if segment.startswith('<'):
            out.append(segment)  # tag markup: leave untouched
        else:
            out.append(handle_entities(segment, func))
    return ''.join(out)

View File

@@ -1,7 +1,10 @@
"""
CSS case-mangling transform.
"""
import string
from lxml import etree
from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS
from ebook_converter.ebooks.oeb.base import CSS_MIME
from ebook_converter.ebooks.oeb.base import namespace
@@ -46,16 +49,17 @@ class CaseMangler(object):
relhref = item.relhref(href)
etree.SubElement(html.find(XHTML('head')), XHTML('link'),
rel='stylesheet', href=relhref, type=CSS_MIME)
stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile)
stylizer = Stylizer(html, item.href, self.oeb, self.opts,
self.profile)
self.mangle_elem(html.find(XHTML('body')), stylizer)
def text_transform(self, transform, text):
if transform == 'capitalize':
return icu_title(text)
return string.capwords(text)
elif transform == 'uppercase':
return icu_upper(text)
return text.upper()
elif transform == 'lowercase':
return icu_lower(text)
return text.lower()
return text
def split_text(self, text):

View File

@@ -620,8 +620,8 @@ class FieldMetadata(object):
if label in self._tb_cats:
raise ValueError('Duplicate user field [%s]'%(label))
st = [label]
if icu_lower(label) != label:
st.append(icu_lower(label))
if label.lower() != label:
st.append(label.lower())
self._tb_cats[label] = {'table':None, 'column':None,
'datatype':None, 'is_multiple':{},
'kind':'user', 'name':name,

View File

@@ -104,11 +104,6 @@ if not _run_once:
except:
pass
from ebook_converter.utils.icu import lower as icu_lower, upper as icu_upper
builtins.__dict__['icu_lower'] = icu_lower
builtins.__dict__['icu_upper'] = icu_upper
builtins.__dict__['icu_title'] = lambda s: ' '.join([x.capitalize() for x in s.split(' ')])
def connect_lambda(bound_signal, self, func, **kw):
import weakref
r = weakref.ref(self)

View File

@@ -1220,10 +1220,10 @@ class BuiltinListUnion(BuiltinFormatterFunction):
def evaluate(self, formatter, kwargs, mi, locals, list1, list2, separator):
res = [l.strip() for l in list1.split(separator) if l.strip()]
l2 = [l.strip() for l in list2.split(separator) if l.strip()]
lcl1 = {icu_lower(l) for l in res}
lcl1 = {l.lower() for l in res}
for i in l2:
if icu_lower(i) not in lcl1 and i not in res:
if i.lower() not in lcl1 and i not in res:
res.append(i)
if separator == ',':
return ', '.join(res)
@@ -1241,11 +1241,11 @@ class BuiltinListDifference(BuiltinFormatterFunction):
def evaluate(self, formatter, kwargs, mi, locals, list1, list2, separator):
l1 = [l.strip() for l in list1.split(separator) if l.strip()]
l2 = {icu_lower(l.strip()) for l in list2.split(separator) if l.strip()}
l2 = {l.strip().lower() for l in list2.split(separator) if l.strip()}
res = []
for i in l1:
if icu_lower(i) not in l2 and i not in res:
if i.lower() not in l2 and i not in res:
res.append(i)
if separator == ',':
return ', '.join(res)
@@ -1263,11 +1263,11 @@ class BuiltinListIntersection(BuiltinFormatterFunction):
def evaluate(self, formatter, kwargs, mi, locals, list1, list2, separator):
l1 = [l.strip() for l in list1.split(separator) if l.strip()]
l2 = {icu_lower(l.strip()) for l in list2.split(separator) if l.strip()}
l2 = {l.strip().lower() for l in list2.split(separator) if l.strip()}
res = []
for i in l1:
if icu_lower(i) in l2 and i not in res:
if i.lower() in l2 and i not in res:
res.append(i)
if separator == ',':
return ', '.join(res)
@@ -1302,8 +1302,8 @@ class BuiltinListEquals(BuiltinFormatterFunction):
'The comparison is case insensitive.')
def evaluate(self, formatter, kwargs, mi, locals, list1, sep1, list2, sep2, yes_val, no_val):
s1 = {icu_lower(l.strip()) for l in list1.split(sep1) if l.strip()}
s2 = {icu_lower(l.strip()) for l in list2.split(sep2) if l.strip()}
s1 = {l.strip().lower() for l in list1.split(sep1) if l.strip()}
s2 = {l.strip().lower() for l in list2.split(sep2) if l.strip()}
if s1 == s2:
return yes_val
return no_val
@@ -1426,7 +1426,7 @@ class BuiltinLanguageStrings(BuiltinFormatterFunction):
retval = []
for c in [c.strip() for c in lang_codes.split(',') if c.strip()]:
try:
n = calibre_langcode_to_name(c, localize != '0')
n = calibre_langcode_to_name(c)
if n:
retval.append(n)
except:

View File

@@ -292,7 +292,7 @@ def partition_by_first_letter(items, reverse=False, key=lambda x:x):
ans = OrderedDict()
last_c, last_ordnum = ' ', 0
for item in items:
c = icu_upper(key(item) or ' ')
c = (key(item) or ' ').upper()
ordnum, ordlen = collation_order(c)
if last_ordnum != ordnum:
if not is_narrow_build:

View File

@@ -1,27 +1,6 @@
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
import io
import sys
import json
import pkg_resources
_available_translations = None
def sanitize_lang(lang):
    """Normalise a language code to a usable value.

    Extracts a leading ``xx``/``xxx`` (optionally ``_CC``) code from
    *lang*, maps bare ``zh`` to ``zh_CN``, and falls back to ``en`` when
    nothing usable remains.
    """
    if lang:
        m = re.match('[a-z]{2,3}(_[A-Z]{2}){0,1}', lang)
        if m is not None:
            lang = m.group()
        if lang == 'zh':
            lang = 'zh_CN'
    return lang or 'en'
def get_lang():
    """Return the user interface language code (hard-coded to en_US)."""
    return 'en_US'
@@ -34,121 +13,78 @@ def is_rtl():
_lang_trans = None
lcdata = {
'abday': ('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'),
'abmon': ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
'd_fmt': '%m/%d/%Y',
'd_t_fmt': '%a %d %b %Y %r %Z',
'day': ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'),
'mon': ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'),
'noexpr': '^[nN].*',
'radixchar': '.',
't_fmt': '%r',
't_fmt_ampm': '%I:%M:%S %p',
'thousep': ',',
'yesexpr': '^[yY].*'
}
def load_po(path):
    """Compile the .po translations file at *path* to binary .mo form.

    Returns a fresh BytesIO positioned at the start of the compiled
    data, or None when compilation fails (best-effort: the failure is
    reported on stdout and otherwise ignored).
    """
    from ebook_converter.translations.msgfmt import make
    compiled = io.BytesIO()
    try:
        make(path, compiled)
    except Exception:
        print('Failed to compile translations file: %s, ignoring' % path)
        return None
    # Re-wrap so the caller gets a buffer rewound to position 0.
    return io.BytesIO(compiled.getvalue())
lcdata = {'abday': ('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'),
'abmon': ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
'Sep', 'Oct', 'Nov', 'Dec'),
'd_fmt': '%m/%d/%Y',
'd_t_fmt': '%a %d %b %Y %r %Z',
'day': ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
'Friday', 'Saturday'),
'mon': ('January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November',
'December'),
'noexpr': '^[nN].*',
'radixchar': '.',
't_fmt': '%r',
't_fmt_ampm': '%I:%M:%S %p',
'thousep': ',',
'yesexpr': '^[yY].*'}
_iso639 = None
_extra_lang_codes = {
'pt_BR' : 'Brazilian Portuguese',
'en_GB' : 'English (UK)',
'zh_CN' : 'Simplified Chinese',
'zh_TW' : 'Traditional Chinese',
'en' : 'English',
'en_US' : 'English (United States)',
'en_AR' : 'English (Argentina)',
'en_AU' : 'English (Australia)',
'en_JP' : 'English (Japan)',
'en_DE' : 'English (Germany)',
'en_BG' : 'English (Bulgaria)',
'en_EG' : 'English (Egypt)',
'en_NZ' : 'English (New Zealand)',
'en_CA' : 'English (Canada)',
'en_GR' : 'English (Greece)',
'en_IN' : 'English (India)',
'en_NP' : 'English (Nepal)',
'en_TH' : 'English (Thailand)',
'en_TR' : 'English (Turkey)',
'en_CY' : 'English (Cyprus)',
'en_CZ' : 'English (Czech Republic)',
'en_PH' : 'English (Philippines)',
'en_PK' : 'English (Pakistan)',
'en_PL' : 'English (Poland)',
'en_HR' : 'English (Croatia)',
'en_HU' : 'English (Hungary)',
'en_ID' : 'English (Indonesia)',
'en_IL' : 'English (Israel)',
'en_RU' : 'English (Russia)',
'en_SG' : 'English (Singapore)',
'en_YE' : 'English (Yemen)',
'en_IE' : 'English (Ireland)',
'en_CN' : 'English (China)',
'en_TW' : 'English (Taiwan)',
'en_ZA' : 'English (South Africa)',
'es_PY' : 'Spanish (Paraguay)',
'es_UY' : 'Spanish (Uruguay)',
'es_AR' : 'Spanish (Argentina)',
'es_CR' : 'Spanish (Costa Rica)',
'es_MX' : 'Spanish (Mexico)',
'es_CU' : 'Spanish (Cuba)',
'es_CL' : 'Spanish (Chile)',
'es_EC' : 'Spanish (Ecuador)',
'es_HN' : 'Spanish (Honduras)',
'es_VE' : 'Spanish (Venezuela)',
'es_BO' : 'Spanish (Bolivia)',
'es_NI' : 'Spanish (Nicaragua)',
'es_CO' : 'Spanish (Colombia)',
'de_AT' : 'German (AT)',
'fr_BE' : 'French (BE)',
'nl' : 'Dutch (NL)',
'nl_BE' : 'Dutch (BE)',
'und' : 'Unknown'
}
if False:
# Extra strings needed for Qt
# NOTE: Ante Meridian (i.e. like 10:00 AM)
'AM'
# NOTE: Post Meridian (i.e. like 10:00 PM)
'PM'
# NOTE: Ante Meridian (i.e. like 10:00 am)
'am'
# NOTE: Post Meridian (i.e. like 10:00 pm)
'pm'
'&Copy'
'Select All'
'Copy Link'
'&Select All'
'Copy &Link Location'
'&Undo'
'&Redo'
'Cu&t'
'&Paste'
'Paste and Match Style'
'Directions'
'Left to Right'
'Right to Left'
'Fonts'
'&Step up'
'Step &down'
'Close without Saving'
'Close Tab'
_extra_lang_codes = {'pt_BR': 'Brazilian Portuguese',
'en_GB': 'English (UK)',
'zh_CN': 'Simplified Chinese',
'zh_TW': 'Traditional Chinese',
'en': 'English',
'en_US': 'English (United States)',
'en_AR': 'English (Argentina)',
'en_AU': 'English (Australia)',
'en_JP': 'English (Japan)',
'en_DE': 'English (Germany)',
'en_BG': 'English (Bulgaria)',
'en_EG': 'English (Egypt)',
'en_NZ': 'English (New Zealand)',
'en_CA': 'English (Canada)',
'en_GR': 'English (Greece)',
'en_IN': 'English (India)',
'en_NP': 'English (Nepal)',
'en_TH': 'English (Thailand)',
'en_TR': 'English (Turkey)',
'en_CY': 'English (Cyprus)',
'en_CZ': 'English (Czech Republic)',
'en_PH': 'English (Philippines)',
'en_PK': 'English (Pakistan)',
'en_PL': 'English (Poland)',
'en_HR': 'English (Croatia)',
'en_HU': 'English (Hungary)',
'en_ID': 'English (Indonesia)',
'en_IL': 'English (Israel)',
'en_RU': 'English (Russia)',
'en_SG': 'English (Singapore)',
'en_YE': 'English (Yemen)',
'en_IE': 'English (Ireland)',
'en_CN': 'English (China)',
'en_TW': 'English (Taiwan)',
'en_ZA': 'English (South Africa)',
'es_PY': 'Spanish (Paraguay)',
'es_UY': 'Spanish (Uruguay)',
'es_AR': 'Spanish (Argentina)',
'es_CR': 'Spanish (Costa Rica)',
'es_MX': 'Spanish (Mexico)',
'es_CU': 'Spanish (Cuba)',
'es_CL': 'Spanish (Chile)',
'es_EC': 'Spanish (Ecuador)',
'es_HN': 'Spanish (Honduras)',
'es_VE': 'Spanish (Venezuela)',
'es_BO': 'Spanish (Bolivia)',
'es_NI': 'Spanish (Nicaragua)',
'es_CO': 'Spanish (Colombia)',
'de_AT': 'German (AT)',
'fr_BE': 'French (BE)',
'nl': 'Dutch (NL)',
'nl_BE': 'Dutch (BE)',
'und': 'Unknown'}
_lcase_map = {}
for k in _extra_lang_codes:
@@ -219,11 +155,9 @@ def get_iso_language(lang_trans, lang):
return lang_trans(ans)
def calibre_langcode_to_name(lc, localize=True):
iso639 = _load_iso639()
translate = _ if localize else lambda x: x
translate = lambda x: x
try:
return translate(iso639['by_3'][lc])
except:

View File

@@ -64,23 +64,23 @@ def titlecase(text):
line.append(word)
continue
else:
word = icu_lower(word)
word = word.lower()
if APOS_SECOND.match(word):
word = word.replace(word[0], icu_upper(word[0]), 1)
word = word[:2] + icu_upper(word[2]) + word[3:]
word = word.replace(word[0], word[0].upper(), 1)
word = word[:2] + word[2].upper() + word[3:]
line.append(word)
continue
if INLINE_PERIOD.search(word) or UC_ELSEWHERE.match(word):
line.append(word)
continue
if SMALL_WORDS.match(word):
line.append(icu_lower(word))
line.append(word.lower())
continue
hyphenated = []
for item in word.split('-'):
hyphenated.append(CAPFIRST.sub(lambda m: icu_upper(m.group(0)), item))
hyphenated.append(CAPFIRST.sub(lambda m: m.group(0).upper(), item))
line.append("-".join(hyphenated))
result = "".join(line)