import codecs
import sys
import unicodedata

# Setup code {{{
from ebook_converter.constants import plugins
from ebook_converter.polyglot.builtins import cmp
from ebook_converter.utils.config_base import tweaks


__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'

is_narrow_build = sys.maxunicode < 0x10ffff

# Lazily created collator singletons; reset to None by change_locale().
_locale = _collator = _primary_collator = _sort_collator = _numeric_collator = _case_sensitive_collator = None

# Bare reference: the exec()'d templates below call cmp() through the module
# globals, so the import must stay even though nothing else names it here.
cmp

_none = u''
_none2 = b''

# Cache of contractions, keyed by collator object (see contractions()).
_cmap = {}

# NOTE(review): _icu is currently the placeholder value 1 instead of the
# plugins['icu'] extension module, so attribute access such as _icu.Collator
# raises AttributeError at call time.
_icu, err = 1, None  # plugins['icu']
if _icu is None:
    raise RuntimeError('Failed to load icu with error: %s' % err)
del err
#icu_unicode_version = getattr(_icu, 'unicode_version', None)
# _nmodes = {m:getattr(_icu, m) for m in ('NFC', 'NFD', 'NFKC', 'NFKD')}

# Ensure that the python internal filesystem and default encodings are not ASCII


#def is_ascii(name):
#    try:
#        return codecs.lookup(name).name == b'ascii'
#    except (TypeError, LookupError):
#        return True
#
#
#try:
#    if is_ascii(sys.getdefaultencoding()):
#        _icu.set_default_encoding(b'utf-8')
#except:
#    import traceback
#    traceback.print_exc()
#
#try:
#    if is_ascii(sys.getfilesystemencoding()):
#        _icu.set_filesystem_encoding(b'utf-8')
#except:
#    import traceback
#    traceback.print_exc()
#del is_ascii


def collator():
    """Return the shared base collator, creating it on first use.

    The locale comes from the ``locale_for_sorting`` tweak when it is set,
    otherwise from ``get_lang()``. If a collator cannot be created for that
    locale, a message is printed and an English ('en') collator is used.
    """
    global _collator, _locale
    if _collator is None:
        if _locale is None:
            from ebook_converter.utils.localization import get_lang
            if tweaks['locale_for_sorting']:
                _locale = tweaks['locale_for_sorting']
            else:
                _locale = get_lang()
        try:
            _collator = _icu.Collator(_locale)
        except Exception as e:
            print('Failed to load collator for locale: %r with error %r, using English' % (_locale, e))
            _collator = _icu.Collator('en')
    return _collator


def change_locale(locale=None):
    """Set the sorting locale and drop every cached collator.

    The collators are rebuilt lazily on next use. With ``locale=None`` the
    locale is re-detected by the next call to collator().
    """
    global _locale, _collator, _primary_collator, _sort_collator, _numeric_collator, _case_sensitive_collator
    _collator = _primary_collator = _sort_collator = _numeric_collator = _case_sensitive_collator = None
    _locale = locale


def primary_collator():
    """Ignores case differences and accented characters"""
    global _primary_collator
    if _primary_collator is None:
        _primary_collator = collator().clone()
        _primary_collator.strength = _icu.UCOL_PRIMARY
    return _primary_collator


def sort_collator():
    """Ignores case differences and recognizes numbers in strings (if the tweak is set)"""
    global _sort_collator
    if _sort_collator is None:
        _sort_collator = collator().clone()
        _sort_collator.strength = _icu.UCOL_SECONDARY
        _sort_collator.numeric = tweaks['numeric_collation']
    return _sort_collator


def numeric_collator():
    """Uses natural sorting for numbers inside strings so something2 will sort before something10"""
    global _numeric_collator
    if _numeric_collator is None:
        _numeric_collator = collator().clone()
        _numeric_collator.strength = _icu.UCOL_SECONDARY
        _numeric_collator.numeric = True
    return _numeric_collator


def case_sensitive_collator():
    """Always sorts upper case letter before lower case"""
    global _case_sensitive_collator
    if _case_sensitive_collator is None:
        _case_sensitive_collator = collator().clone()
        # Follow the same numeric setting as the regular sort collator.
        _case_sensitive_collator.numeric = sort_collator().numeric
        _case_sensitive_collator.upper_first = True
    return _case_sensitive_collator


# Templates that will be used to generate various concrete
# function implementations based on different collators, to allow lazy loading
# of collators, with maximum runtime performance

_sort_key_template = '''
def {name}(obj):
    try:
        try:
            return {collator}.{func}(obj)
        except AttributeError:
            pass
        return {collator_func}().{func}(obj)
    except TypeError:
        if isinstance(obj, bytes):
            try:
                obj = obj.decode(sys.getdefaultencoding())
            except ValueError:
                return obj
            return {collator}.{func}(obj)
    return b''
'''

_strcmp_template = '''
def {name}(a, b):
    try:
        try:
            return {collator}.{func}(a, b)
        except AttributeError:
            pass
        return {collator_func}().{func}(a, b)
    except TypeError:
        if isinstance(a, bytes):
            try:
                a = a.decode(sys.getdefaultencoding())
            except ValueError:
                return cmp(a, b)
        elif a is None:
            a = u''
        if isinstance(b, bytes):
            try:
                b = b.decode(sys.getdefaultencoding())
            except ValueError:
                return cmp(a, b)
        elif b is None:
            b = u''
        return {collator}.{func}(a, b)
'''

_change_case_template = '''
def {name}(x):
    try:
        try:
            return _icu.change_case(x, _icu.{which}, _locale)
        except NotImplementedError:
            pass
        collator()  # sets _locale
        return _icu.change_case(x, _icu.{which}, _locale)
    except TypeError:
        if isinstance(x, bytes):
            try:
                x = x.decode(sys.getdefaultencoding())
            except ValueError:
                return x
            return _icu.change_case(x, _icu.{which}, _locale)
        raise
'''


def _make_func(template, name, **kwargs):
    """Instantiate ``template`` as a module-level function called ``name``.

    The template is formatted with ``kwargs`` (``func`` defaults to
    'sort_key') and exec()'d in this module's globals, so the generated
    function sees the collator globals, ``sys`` and ``cmp``.
    """
    l = globals()
    kwargs['name'] = name
    kwargs['func'] = kwargs.get('func', 'sort_key')
    exec(template.format(**kwargs), l)
    return l[name]

# }}}


# ################ The string functions ########################################
sort_key = _make_func(_sort_key_template, 'sort_key', collator='_sort_collator', collator_func='sort_collator')

numeric_sort_key = _make_func(_sort_key_template, 'numeric_sort_key', collator='_numeric_collator', collator_func='numeric_collator')

primary_sort_key = _make_func(_sort_key_template, 'primary_sort_key', collator='_primary_collator', collator_func='primary_collator')

case_sensitive_sort_key = _make_func(_sort_key_template, 'case_sensitive_sort_key',
                                     collator='_case_sensitive_collator', collator_func='case_sensitive_collator')

collation_order = _make_func(_sort_key_template, 'collation_order', collator='_sort_collator', collator_func='sort_collator', func='collation_order')

strcmp = _make_func(_strcmp_template, 'strcmp', collator='_sort_collator', collator_func='sort_collator', func='strcmp')

case_sensitive_strcmp = _make_func(
    _strcmp_template, 'case_sensitive_strcmp', collator='_case_sensitive_collator', collator_func='case_sensitive_collator', func='strcmp')

primary_strcmp = _make_func(_strcmp_template, 'primary_strcmp', collator='_primary_collator', collator_func='primary_collator', func='strcmp')

upper = _make_func(_change_case_template, 'upper', which='UPPER_CASE')

lower = _make_func(_change_case_template, 'lower', which='LOWER_CASE')

title_case = _make_func(_change_case_template, 'title_case', which='TITLE_CASE')


def capitalize(x):
    """Return ``x`` with its first character upper-cased and the rest lower-cased.

    ``x`` is returned unchanged when it is empty or cannot be indexed/sliced.
    """
    try:
        return upper(x[0]) + lower(x[1:])
    except (IndexError, TypeError, AttributeError):
        return x


try:
    swapcase = _icu.swap_case
except AttributeError:  # For people running from source
    swapcase = lambda x:x.swapcase()

find = _make_func(_strcmp_template, 'find', collator='_collator', collator_func='collator', func='find')

primary_find = _make_func(_strcmp_template, 'primary_find', collator='_primary_collator', collator_func='primary_collator', func='find')

contains = _make_func(_strcmp_template, 'contains', collator='_collator', collator_func='collator', func='contains')

primary_contains = _make_func(_strcmp_template, 'primary_contains', collator='_primary_collator', collator_func='primary_collator', func='contains')

startswith = _make_func(_strcmp_template, 'startswith', collator='_collator', collator_func='collator', func='startswith')

primary_startswith = _make_func(_strcmp_template, 'primary_startswith', collator='_primary_collator', collator_func='primary_collator', func='startswith')

safe_chr = chr  # _icu.chr

ord_string = str  # _icu.ord_string


def character_name(string):
    """Return the character name reported by ICU for ``string``.

    Returns None when ICU gives an empty result or raises TypeError,
    ValueError or KeyError.
    """
    try:
        return _icu.character_name(str(string)) or None
    except (TypeError, ValueError, KeyError):
        return None


def character_name_from_code(code):
    """Return the character name reported by ICU for ``code``.

    Returns '' when ICU gives an empty result or raises TypeError,
    ValueError or KeyError.
    """
    try:
        return _icu.character_name_from_code(code) or ''
    except (TypeError, ValueError, KeyError):
        return ''


def normalize(text, mode='NFC'):
    """Return ``str(text)`` normalized to the Unicode form ``mode``.

    ``mode`` is passed straight to unicodedata.normalize ('NFC', 'NFD',
    'NFKC' or 'NFKD').
    """
    # _icu.normalize is very slightly slower than unicodedata.normalize, so
    # stick with the latter unless you have very good reasons not to. Also,
    # its speed decreases on wide python builds, where conversion to/from
    # ICU's string representation is slower.
    # return _icu.normalize(_nmodes[mode], str(text))
    return unicodedata.normalize(mode, str(text))


def contractions(col=None):
    """Return the non-empty contractions of a collator as a frozenset.

    ``col`` defaults to the shared base collator. Results are cached in
    ``_cmap`` per collator object.
    """
    col = col or _collator
    if col is None:
        col = collator()
    # Look up by the collator object itself: that is the key used when the
    # result is stored below.
    ans = _cmap.get(col, None)
    if ans is None:
        ans = col.contractions()
        ans = frozenset(filter(None, ans))
        _cmap[col] = ans
    return ans


def partition_by_first_letter(items, reverse=False, key=lambda x:x):
    """Group ``items`` by the upper-cased first letter of ``key(item)``.

    Items are sorted with sort_key() first (``reverse`` is passed to
    sorted()). Returns an OrderedDict mapping each first letter to the list
    of items filed under it, in sorted order.
    """
    # Build a list of 'equal' first letters by noticing changes
    # in ICU's 'ordinal' for the first letter.
    from collections import OrderedDict
    items = sorted(items, key=lambda x:sort_key(key(x)), reverse=reverse)
    ans = OrderedDict()
    last_c, last_ordnum = ' ', 0
    for item in items:
        # Empty/None keys are filed under a single space.
        c = upper(key(item) or ' ')
        ordnum, ordlen = collation_order(c)
        if last_ordnum != ordnum:
            if not is_narrow_build:
                ordlen = 1
            last_c = c[0:ordlen]
            last_ordnum = ordnum
        try:
            ans[last_c].append(item)
        except KeyError:
            ans[last_c] = [item]
    return ans


# Return the number of unicode codepoints in a string
string_length = len  #_icu.string_length if is_narrow_build else len

# Return the number of UTF-16 codepoints in a string
utf16_length = len  # if is_narrow_build else _icu.utf16_length

################################################################################

# if __name__ == '__main__':
#     from ebook_converter.utils.icu_test import run
#     run(verbosity=4)