mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-23 22:51:30 +02:00
Initial import
This commit is contained in:
@@ -0,0 +1,52 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
__all__ = ["Unihandecoder"]
|
||||
|
||||
'''
|
||||
Decode unicode text to an ASCII representation of the text.
|
||||
Translate unicode characters to ASCII.
|
||||
|
||||
Inspired by John Schember's unidecode library, which was created as part
|
||||
of calibre.
|
||||
|
||||
Copyright(c) 2009, John Schember
|
||||
|
||||
Transliterate strings from Unicode characters to ASCII, for Chinese and other languages.
|
||||
|
||||
'''
|
||||
import unicodedata
|
||||
|
||||
|
||||
class Unihandecoder(object):
    '''
    Front end that selects a language-specific decoder and uses it to
    convert unicode text to an ASCII representation.
    '''

    # Codec tried first by decode() when it is given a byte string.
    preferred_encoding = None
    # Decoder instance chosen in __init__ according to the language.
    decoder = None

    def __init__(self, lang="zh", encoding='utf-8'):
        '''
        :param lang: language or locale name used to pick the decoder
            (case-insensitive). Anything not recognized as Japanese,
            Korean or Vietnamese uses the default (Chinese) decoder.
        :param encoding: codec tried first when decode() receives bytes.
        '''
        self.preferred_encoding = encoding
        kind = self._decoder_kind(lang)
        # Each decoder module is imported inside its own branch, so only
        # the module for the selected language is imported here.
        if kind == 'ja':
            from calibre.ebooks.unihandecode.jadecoder import Jadecoder
            self.decoder = Jadecoder()
        elif kind == 'kr':
            from calibre.ebooks.unihandecode.krdecoder import Krdecoder
            self.decoder = Krdecoder()
        elif kind == 'vn':
            from calibre.ebooks.unihandecode.vndecoder import Vndecoder
            self.decoder = Vndecoder()
        else:  # zh and others
            from calibre.ebooks.unihandecode.unidecoder import Unidecoder
            self.decoder = Unidecoder()

    @staticmethod
    def _decoder_kind(lang):
        '''
        Map a language name to one of 'ja', 'kr', 'vn' or 'zh'.

        Besides the historically accepted spellings ('ja*', 'kr*',
        'korean', 'vn*', 'vietnum'), the ISO 639 language codes for Korean
        ('ko', 'kor') and Vietnamese ('vi', 'vie') are recognized, with or
        without a region suffix such as 'ko_KR' or 'vi-VN'. An empty or
        None value selects the default 'zh'.
        '''
        lang = (lang or '').lower()
        # Primary subtag of a locale-style name: 'ko_kr' -> 'ko'.
        primary = lang.replace('-', '_').split('_', 1)[0]
        if lang[:2] == 'ja':
            return 'ja'
        if lang[:2] == 'kr' or lang == 'korean' or primary in ('ko', 'kor'):
            return 'kr'
        if (lang[:2] == 'vn' or lang == 'vietnum'
                or primary in ('vi', 'vie', 'vietnamese')):
            return 'vn'
        return 'zh'

    def decode(self, text):
        '''
        Return the selected decoder's ASCII rendering of ``text``.

        Byte strings are first decoded with ``preferred_encoding``; if
        that fails for any reason they are decoded as UTF-8 with
        undecodable bytes replaced. The text is NFKC-normalized before it
        is handed to the decoder.
        '''
        if isinstance(text, bytes):
            try:
                text = text.decode(self.preferred_encoding)
            except Exception:
                # Deliberate best effort: a bad codec name or undecodable
                # input must not abort the conversion.
                text = text.decode('utf-8', 'replace')
        # at first unicode normalize it. (see Unicode standards)
        ntext = unicodedata.normalize('NFKC', text)
        return self.decoder.decode(ntext)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,43 @@
|
||||
# coding:utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Decode unicode text to an ASCII representation of the text for Japanese.
|
||||
Translate unicode string to ASCII roman string.
|
||||
|
||||
API is based on the python unidecode,
|
||||
which is based on Ruby gem (http://rubyforge.org/projects/unidecode/)
|
||||
and perl module Text::Unidecode
|
||||
(http://search.cpan.org/~sburke/Text-Unidecode-0.04/).
|
||||
|
||||
This functionality is provided by the Kakasi Japanese processing engine.
|
||||
|
||||
Copyright (c) 2010 Hiroshi Miura
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
|
||||
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
|
||||
from calibre.ebooks.unihandecode.jacodepoints import CODEPOINTS as JACODES
|
||||
from calibre.ebooks.unihandecode.pykakasi.kakasi import kakasi
|
||||
|
||||
|
||||
class Jadecoder(Unidecoder):
    '''
    Unidecoder variant for Japanese: text is passed through kakasi first,
    then any remaining non-ASCII characters are replaced by table lookup.
    '''

    # kakasi converter instance, created in __init__.
    kakasi = None
    # Per-instance lookup table, built in __init__.
    codepoints = {}

    def __init__(self):
        # Work on a copy: updating CODEPOINTS in place would leak the
        # Japanese overrides into every other decoder sharing that dict.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(JACODES)
        self.kakasi = kakasi()

    def decode(self, text):
        '''
        Return an ASCII rendering of ``text``.

        The text is converted with kakasi and the result is decoded with
        the inherited character-by-character replacement. If either step
        fails, the original text is decoded directly instead.
        '''
        try:
            return Unidecoder.decode(self, self.kakasi.do(text))
        except Exception:
            # Best-effort fallback; Exception (rather than a bare except)
            # lets KeyboardInterrupt and SystemExit propagate.
            return Unidecoder.decode(self, text)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,25 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Decode unicode text to an ASCII representation of the text in Korean.
|
||||
Based on unidecoder.
|
||||
|
||||
'''
|
||||
|
||||
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
|
||||
from calibre.ebooks.unihandecode.krcodepoints import CODEPOINTS as HANCODES
|
||||
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
|
||||
|
||||
|
||||
class Krdecoder(Unidecoder):
    '''
    Unidecoder variant whose lookup table is the generic CODEPOINTS
    overlaid with the Korean code points (HANCODES).
    '''

    # Per-instance lookup table, built in __init__.
    codepoints = {}

    def __init__(self):
        # Work on a copy: updating CODEPOINTS in place would leak the
        # Korean overrides into every other decoder sharing that dict.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(HANCODES)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,110 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Decode unicode text to an ASCII representation of the text in Chinese.
|
||||
Transliterate unicode characters to ASCII based on Chinese pronunciation.
|
||||
|
||||
Derived from John Schember's unidecode library, which was created
|
||||
as part of calibre.
|
||||
|
||||
Copyright(c) 2009, John Schember <john@nachtimwald.com>
|
||||
|
||||
Based on the ruby unidecode gem (http://rubyforge.org/projects/unidecode/) which
|
||||
is based on the perl module Text::Unidecode
|
||||
(http://search.cpan.org/~sburke/Text-Unidecode-0.04/). More information about
|
||||
unidecode can be found at
|
||||
http://interglacial.com/~sburke/tpj/as_html/tpj22.html.
|
||||
|
||||
The major difference between this implementation and others is that it is written in
|
||||
python and it uses a single dictionary instead of loading the code group files
|
||||
as needed.
|
||||
|
||||
|
||||
Copyright (c) 2007 Russell Norris
|
||||
|
||||
Permission is hereby granted, free of charge, to any person
|
||||
obtaining a copy of this software and associated documentation
|
||||
files (the "Software"), to deal in the Software without
|
||||
restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
|
||||
Copyright 2001, Sean M. Burke <sburke@cpan.org>, all rights reserved.
|
||||
|
||||
The programs and documentation in this dist are distributed in the
|
||||
hope that they will be useful, but without any warranty; without even
|
||||
the implied warranty of merchantability or fitness for a particular
|
||||
purpose.
|
||||
|
||||
This library is free software; you can redistribute it and/or modify
|
||||
it under the same terms as Perl itself.
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
|
||||
from calibre.ebooks.unihandecode.zhcodepoints import CODEPOINTS as HANCODES
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class Unidecoder(object):
    '''
    Replace non-ASCII characters with ASCII text looked up in a table
    keyed by code group ('xAB') and indexed by the low byte of the
    character's code point.
    '''

    # Per-instance lookup table, built in __init__.
    codepoints = {}

    # Matches any single character outside the 7-bit ASCII range.
    # Compiled once here instead of on every decode() call.
    _non_ascii = re.compile('[^\x00-\x7f]')

    def __init__(self):
        # Work on a copy: updating CODEPOINTS in place would leak the
        # Chinese overrides into every other decoder sharing that dict.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(HANCODES)

    def decode(self, text):
        '''
        Return ``text`` with every character above 127 replaced by its
        ASCII equivalent (or '?' when there is none).
        '''
        return self._non_ascii.sub(
            lambda match: self.replace_point(match.group()), text)

    def replace_point(self, codepoint):
        '''
        Returns the replacement character or ? if none can be found.
        '''
        try:
            # Split the unicode character xABCD into parts 0xAB and 0xCD.
            # 0xAB represents the group within CODEPOINTS to query and 0xCD
            # represents the position in the list of characters for the group.
            group = self.codepoints[self.code_group(codepoint)]
            return group[self.grouped_point(codepoint)]
        except (LookupError, TypeError, ValueError):
            # LookupError: unknown group or index past the end of the group.
            # TypeError / ValueError: input is not one decodable character.
            return '?'

    @staticmethod
    def _ordinal(character):
        '''
        Return the code point of ``character``, decoding UTF-8 bytes first.
        '''
        if isinstance(character, (bytes, bytearray)):
            character = character.decode('utf-8')
        return ord(character)

    def code_group(self, character):
        '''
        Find what group character is a part of.
        '''
        # Code groups within CODEPOINTS take the form 'xAB'
        return 'x%02x' % (self._ordinal(character) >> 8)

    def grouped_point(self, character):
        '''
        Return the position of the replacement character in the list for
        the group that character is a part of.
        '''
        return self._ordinal(character) & 255
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,24 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Decode unicode text to an ASCII representation of the text in Vietnamese.
|
||||
|
||||
'''
|
||||
|
||||
from calibre.ebooks.unihandecode.unidecoder import Unidecoder
|
||||
from calibre.ebooks.unihandecode.vncodepoints import CODEPOINTS as HANCODES
|
||||
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
|
||||
|
||||
|
||||
class Vndecoder(Unidecoder):
    '''
    Unidecoder variant whose lookup table is the generic CODEPOINTS
    overlaid with the Vietnamese code points (HANCODES).
    '''

    # Per-instance lookup table, built in __init__.
    codepoints = {}

    def __init__(self):
        # Work on a copy: updating CODEPOINTS in place would leak the
        # Vietnamese overrides into every other decoder sharing that dict.
        self.codepoints = dict(CODEPOINTS)
        self.codepoints.update(HANCODES)
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user