Initial import

2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+__docformat__ = 'restructuredtext en'
+__all__ = ["Unihandecoder"]
+
+'''
+Decode unicode text to an ASCII representation of the text.
+Translate unicode characters to ASCII.
+
+Inspired from John Schember's unidecode library which was created as part
+of calibre.
+
+Copyright(c) 2009, John Schember
+
+Transliterate the string from unicode characters to ASCII in Chinese and others.
+
+'''
+import unicodedata
+
+
+class Unihandecoder(object):
+    preferred_encoding = None
+    decoder = None
+
+    def __init__(self, lang="zh", encoding='utf-8'):
+        self.preferred_encoding = encoding
+        lang = lang.lower()
+        if lang[:2] == 'ja':
+            from calibre.ebooks.unihandecode.jadecoder import Jadecoder
+            self.decoder = Jadecoder()
+        elif lang[:2] == 'kr' or lang == 'korean':
+            from calibre.ebooks.unihandecode.krdecoder import Krdecoder
+            self.decoder = Krdecoder()
+        elif lang[:2] == 'vn' or lang == 'vietnum':
+            from calibre.ebooks.unihandecode.vndecoder import Vndecoder
+            self.decoder = Vndecoder()
+        else:  # zh and others
+            from calibre.ebooks.unihandecode.unidecoder import Unidecoder
+            self.decoder = Unidecoder()
+
+    def decode(self, text):
+        if isinstance(text, bytes):
+            try:
+                text = text.decode(self.preferred_encoding)
+            except Exception:
+                text = text.decode('utf-8', 'replace')
+        # at first unicode normalize it. (see Unicode standards)
+        ntext = unicodedata.normalize('NFKC', text)
+        return self.decoder.decode(ntext)
@@ -0,0 +1,43 @@
+# coding:utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Decode unicode text to an ASCII representation of the text for Japanese.
+ Translate unicode string to ASCII roman string.
+
+API is based on the python unidecode,
+which is based on Ruby gem (http://rubyforge.org/projects/unidecode/)
+and  perl module Text::Unidecode
+(http://search.cpan.org/~sburke/Text-Unidecode-0.04/).
+
+This functionality is provided by the Kakasi Japanese processing engine.
+
+Copyright (c) 2010 Hiroshi Miura
+'''
+
+import re
+from calibre.ebooks.unihandecode.unidecoder import Unidecoder
+from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
+from calibre.ebooks.unihandecode.jacodepoints import CODEPOINTS as JACODES
+from calibre.ebooks.unihandecode.pykakasi.kakasi import kakasi
+
+
+class Jadecoder(Unidecoder):
+    kakasi = None
+    codepoints = {}
+
+    def __init__(self):
+        self.codepoints = CODEPOINTS
+        self.codepoints.update(JACODES)
+        self.kakasi = kakasi()
+
+    def decode(self, text):
+        try:
+            result=self.kakasi.do(text)
+            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),result)
+        except:
+            return re.sub('[^\x00-\x7f]', lambda x: self.replace_point(x.group()),text)
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Decode unicode text to an ASCII representation of the text in Korean.
+Based on unidecoder.
+
+'''
+
+from calibre.ebooks.unihandecode.unidecoder import Unidecoder
+from calibre.ebooks.unihandecode.krcodepoints import CODEPOINTS as HANCODES
+from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
+
+
+class Krdecoder(Unidecoder):
+
+    codepoints = {}
+
+    def __init__(self):
+        self.codepoints = CODEPOINTS
+        self.codepoints.update(HANCODES)
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Decode unicode text to an ASCII representation of the text in Chinese.
+Transliterate unicode characters to ASCII based on Chinese pronunciation.
+
+Derived from John Schember's unidecode library, which was created
+as part of calibre.
+
+Copyright(c) 2009, John Schember <john@nachtimwald.com>
+
+Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
+is based on the perl module Text::Unidecode
+(http://search.cpan.org/~sburke/Text-Unidecode-0.04/). More information about
+unidecode can be found at
+http://interglacial.com/~sburke/tpj/as_html/tpj22.html.
+
+The major difference between this implementation and others is that it is
+written in Python and uses a single dictionary instead of loading the code
+group files as needed.
+
+
+Copyright (c) 2007 Russell Norris
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+
+Copyright 2001, Sean M. Burke <sburke@cpan.org>, all rights reserved.
+
+The programs and documentation in this dist are distributed in the
+hope that they will be useful, but without any warranty; without even
+the implied warranty of merchantability or fitness for a particular
+purpose.
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself.
+'''
+
+import re
+from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
+from calibre.ebooks.unihandecode.zhcodepoints import CODEPOINTS as HANCODES
+from polyglot.builtins import unicode_type
+
+
+class Unidecoder(object):
+
+    codepoints = {}
+
+    def __init__(self):
+        self.codepoints = CODEPOINTS
+        self.codepoints.update(HANCODES)
+
+    def decode(self, text):
+        # Replace characters larger than 127 with their ASCII equivelent.
+        return re.sub('[^\x00-\x7f]',lambda x: self.replace_point(x.group()), text)
+
+    def replace_point(self, codepoint):
+        '''
+        Returns the replacement character or ? if none can be found.
+        '''
+        try:
+            # Split the unicode character xABCD into parts 0xAB and 0xCD.
+            # 0xAB represents the group within CODEPOINTS to query and 0xCD
+            # represents the position in the list of characters for the group.
+            return self.codepoints[self.code_group(codepoint)][self.grouped_point(
+                codepoint)]
+        except:
+            return '?'
+
+    def code_group(self, character):
+        '''
+        Find what group character is a part of.
+        '''
+        # Code groups withing CODEPOINTS take the form 'xAB'
+        if not isinstance(character, unicode_type):
+            character = unicode_type(character, "utf-8")
+        return 'x%02x' % (ord(character) >> 8)
+
+    def grouped_point(self, character):
+        '''
+        Return the location the replacement character is in the list for a
+        the group character is a part of.
+        '''
+        if not isinstance(character, unicode_type):
+            character = unicode_type(character, "utf-8")
+        return ord(character) & 255
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Decode unicode text to an ASCII representation of the text in Vietnamese.
+
+'''
+
+from calibre.ebooks.unihandecode.unidecoder import Unidecoder
+from calibre.ebooks.unihandecode.vncodepoints import CODEPOINTS as HANCODES
+from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
+
+
+class Vndecoder(Unidecoder):
+
+    codepoints = {}
+
+    def __init__(self):
+        self.codepoints = CODEPOINTS
+        self.codepoints.update(HANCODES)