mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-06 17:35:48 +01:00
Moved entity_to_unicode function to utils.entities module.
This commit is contained in:
@@ -1,11 +1,10 @@
|
|||||||
import html
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
from ebook_converter import constants_old
|
from ebook_converter import constants_old
|
||||||
from ebook_converter.ebooks.html_entities import html5_entities
|
from ebook_converter.utils import entities
|
||||||
|
|
||||||
|
|
||||||
class CurrentDir(object):
|
class CurrentDir(object):
|
||||||
@@ -27,69 +26,8 @@ class CurrentDir(object):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def entity_to_unicode(match, exceptions=[], encoding='cp1252',
|
|
||||||
result_exceptions={}):
|
|
||||||
"""
|
|
||||||
:param match: A match object such that '&'+match.group(1)';' is the entity.
|
|
||||||
|
|
||||||
:param exceptions: A list of entities to not convert (Each entry is the
|
|
||||||
name of the entity, for e.g. 'apos' or '#1234'
|
|
||||||
|
|
||||||
:param encoding: The encoding to use to decode numeric entities between
|
|
||||||
128 and 256. If None, the Unicode UCS encoding is used.
|
|
||||||
A common encoding is cp1252.
|
|
||||||
|
|
||||||
:param result_exceptions: A mapping of characters to entities. If the
|
|
||||||
result is in result_exceptions,
|
|
||||||
result_exception[result] is returned instead.
|
|
||||||
Convenient way to specify exception for things
|
|
||||||
like < or > that can be specified by various
|
|
||||||
actual entities.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def my_unichr(num):
|
|
||||||
try:
|
|
||||||
return chr(num)
|
|
||||||
except (ValueError, OverflowError):
|
|
||||||
return '?'
|
|
||||||
|
|
||||||
def check(ch):
|
|
||||||
return result_exceptions.get(ch, ch)
|
|
||||||
|
|
||||||
ent = match.group(1)
|
|
||||||
if ent in exceptions:
|
|
||||||
return '&'+ent+';'
|
|
||||||
# squot is generated by some broken CMS software
|
|
||||||
if ent in {'apos', 'squot'}:
|
|
||||||
return check("'")
|
|
||||||
if ent == 'hellips':
|
|
||||||
ent = 'hellip'
|
|
||||||
if ent.startswith('#'):
|
|
||||||
try:
|
|
||||||
if ent[1] in ('x', 'X'):
|
|
||||||
num = int(ent[2:], 16)
|
|
||||||
else:
|
|
||||||
num = int(ent[1:])
|
|
||||||
except Exception:
|
|
||||||
return '&'+ent+';'
|
|
||||||
if encoding is None or num > 255:
|
|
||||||
return check(my_unichr(num))
|
|
||||||
try:
|
|
||||||
return check(bytes(bytearray((num,))).decode(encoding))
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
return check(my_unichr(num))
|
|
||||||
try:
|
|
||||||
return check(html5_entities[ent])
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
return check(my_unichr(html.entities.name2codepoint[ent]))
|
|
||||||
except KeyError:
|
|
||||||
return '&'+ent+';'
|
|
||||||
|
|
||||||
|
|
||||||
_ent_pat = re.compile(r'&(\S+?);')
|
_ent_pat = re.compile(r'&(\S+?);')
|
||||||
xml_entity_to_unicode = partial(entity_to_unicode,
|
xml_entity_to_unicode = partial(entities.entity_to_unicode,
|
||||||
result_exceptions={'"': '"',
|
result_exceptions={'"': '"',
|
||||||
"'": ''',
|
"'": ''',
|
||||||
'<': '<',
|
'<': '<',
|
||||||
@@ -98,7 +36,8 @@ xml_entity_to_unicode = partial(entity_to_unicode,
|
|||||||
|
|
||||||
|
|
||||||
def replace_entities(raw, encoding='cp1252'):
|
def replace_entities(raw, encoding='cp1252'):
|
||||||
return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw)
|
return _ent_pat.sub(partial(entities.entity_to_unicode, encoding=encoding),
|
||||||
|
raw)
|
||||||
|
|
||||||
|
|
||||||
def xml_replace_entities(raw, encoding='cp1252'):
|
def xml_replace_entities(raw, encoding='cp1252'):
|
||||||
@@ -106,7 +45,7 @@ def xml_replace_entities(raw, encoding='cp1252'):
|
|||||||
|
|
||||||
|
|
||||||
def prepare_string_for_xml(raw, attribute=False):
|
def prepare_string_for_xml(raw, attribute=False):
|
||||||
raw = _ent_pat.sub(entity_to_unicode, raw)
|
raw = _ent_pat.sub(entities.entity_to_unicode, raw)
|
||||||
raw = raw.replace('&', '&').replace('<', '<').replace('>', '>')
|
raw = raw.replace('&', '&').replace('<', '<').replace('>', '>')
|
||||||
if attribute:
|
if attribute:
|
||||||
raw = raw.replace('"', '"').replace("'", ''')
|
raw = raw.replace('"', '"').replace("'", ''')
|
||||||
|
|||||||
@@ -3,14 +3,14 @@ import json
|
|||||||
import math
|
import math
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from ebook_converter import entity_to_unicode
|
from ebook_converter.utils import entities
|
||||||
|
|
||||||
|
|
||||||
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||||
SVG_NS = 'http://www.w3.org/2000/svg'
|
SVG_NS = 'http://www.w3.org/2000/svg'
|
||||||
XLINK_NS = 'http://www.w3.org/1999/xlink'
|
XLINK_NS = 'http://www.w3.org/1999/xlink'
|
||||||
|
|
||||||
convert_entities = functools.partial(entity_to_unicode,
|
convert_entities = functools.partial(entities.entity_to_unicode,
|
||||||
result_exceptions={'<': '<',
|
result_exceptions={'<': '<',
|
||||||
'>': '>',
|
'>': '>',
|
||||||
"'": ''',
|
"'": ''',
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ import math
|
|||||||
import bs4
|
import bs4
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
from ebook_converter import entity_to_unicode
|
|
||||||
from ebook_converter.constants_old import __appname__, filesystem_encoding, \
|
from ebook_converter.constants_old import __appname__, filesystem_encoding, \
|
||||||
preferred_encoding
|
preferred_encoding
|
||||||
from ebook_converter.devices.interface import DevicePlugin as Device
|
from ebook_converter.devices.interface import DevicePlugin as Device
|
||||||
@@ -39,6 +38,7 @@ from ebook_converter.ebooks.lrf.pylrs.pylrs import (
|
|||||||
from ebook_converter.ptempfile import PersistentTemporaryFile
|
from ebook_converter.ptempfile import PersistentTemporaryFile
|
||||||
from ebook_converter.utils import encoding as uenc
|
from ebook_converter.utils import encoding as uenc
|
||||||
from ebook_converter.utils import img as uimg
|
from ebook_converter.utils import img as uimg
|
||||||
|
from ebook_converter.utils import entities
|
||||||
|
|
||||||
|
|
||||||
def strip_style_comments(match):
|
def strip_style_comments(match):
|
||||||
@@ -90,7 +90,7 @@ MARKUP_MASSAGE = [ # Close <a /> tags
|
|||||||
|
|
||||||
# Replace entities
|
# Replace entities
|
||||||
(re.compile(r'&(\S+?);'),
|
(re.compile(r'&(\S+?);'),
|
||||||
functools.partial(entity_to_unicode,
|
functools.partial(entities.entity_to_unicode,
|
||||||
exceptions=['lt', 'gt', 'amp', 'quot'])),
|
exceptions=['lt', 'gt', 'amp', 'quot'])),
|
||||||
|
|
||||||
# Remove comments from within style tags as they can mess up
|
# Remove comments from within style tags as they can mess up
|
||||||
|
|||||||
@@ -6,8 +6,9 @@ import struct
|
|||||||
import zlib
|
import zlib
|
||||||
|
|
||||||
from ebook_converter.ebooks.lrf import LRFParseError, PRS500_PROFILE
|
from ebook_converter.ebooks.lrf import LRFParseError, PRS500_PROFILE
|
||||||
from ebook_converter import entity_to_unicode, prepare_string_for_xml
|
from ebook_converter import prepare_string_for_xml
|
||||||
from ebook_converter.ebooks.lrf.tags import Tag
|
from ebook_converter.ebooks.lrf.tags import Tag
|
||||||
|
from ebook_converter.utils import entities
|
||||||
|
|
||||||
ruby_tags = {0xF575: ['rubyAlignAndAdjust', 'W'],
|
ruby_tags = {0xF575: ['rubyAlignAndAdjust', 'W'],
|
||||||
0xF576: ['rubyoverhang', 'W', {0: 'none', 1: 'auto'}],
|
0xF576: ['rubyoverhang', 'W', {0: 'none', 1: 'auto'}],
|
||||||
@@ -713,7 +714,8 @@ class Text(LRFStream):
|
|||||||
s = str(text, "utf-16-le")
|
s = str(text, "utf-16-le")
|
||||||
if s:
|
if s:
|
||||||
s = s.translate(self.text_map)
|
s = s.translate(self.text_map)
|
||||||
self.content.append(self.entity_pattern.sub(entity_to_unicode, s))
|
self.content.append(self.entity_pattern
|
||||||
|
.sub(entities.entity_to_unicode, s))
|
||||||
|
|
||||||
def end_container(self, tag, stream):
|
def end_container(self, tag, stream):
|
||||||
self.content.append(None)
|
self.content.append(None)
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ DEFAULT_SOURCE_ENCODING = "cp1252" # default is us-windows character set
|
|||||||
DEFAULT_GENREADING = "fs" # default is yes to both lrf and lrs
|
DEFAULT_GENREADING = "fs" # default is yes to both lrf and lrs
|
||||||
|
|
||||||
from ebook_converter.constants_old import __appname__, __version__
|
from ebook_converter.constants_old import __appname__, __version__
|
||||||
from ebook_converter import entity_to_unicode
|
from ebook_converter.utils import entities
|
||||||
|
|
||||||
|
|
||||||
class LrsError(Exception):
|
class LrsError(Exception):
|
||||||
@@ -737,7 +737,8 @@ class TableOfContents(object):
|
|||||||
class TocLabel(object):
|
class TocLabel(object):
|
||||||
|
|
||||||
def __init__(self, label, textBlock):
|
def __init__(self, label, textBlock):
|
||||||
self.label = escape(re.sub(r'&(\S+?);', entity_to_unicode, label))
|
self.label = escape(re.sub(r'&(\S+?);', entities.entity_to_unicode,
|
||||||
|
label))
|
||||||
self.textBlock = textBlock
|
self.textBlock = textBlock
|
||||||
|
|
||||||
def toElement(self, se):
|
def toElement(self, se):
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ import textwrap
|
|||||||
|
|
||||||
from lxml import html, etree
|
from lxml import html, etree
|
||||||
|
|
||||||
from ebook_converter import xml_entity_to_unicode, entity_to_unicode
|
from ebook_converter import xml_entity_to_unicode
|
||||||
from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
|
from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
|
||||||
from ebook_converter.ebooks import DRMError, unit_convert
|
from ebook_converter.ebooks import DRMError, unit_convert
|
||||||
from ebook_converter.ebooks.chardet import strip_encoding_declarations
|
from ebook_converter.ebooks.chardet import strip_encoding_declarations
|
||||||
@@ -20,6 +20,7 @@ from ebook_converter.ebooks.metadata import MetaInformation
|
|||||||
from ebook_converter.ebooks.metadata.opf2 import OPFCreator, OPF
|
from ebook_converter.ebooks.metadata.opf2 import OPFCreator, OPF
|
||||||
from ebook_converter.ebooks.metadata.toc import TOC
|
from ebook_converter.ebooks.metadata.toc import TOC
|
||||||
from ebook_converter.ebooks.mobi.reader.headers import BookHeader
|
from ebook_converter.ebooks.mobi.reader.headers import BookHeader
|
||||||
|
from ebook_converter.utils import entities
|
||||||
from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data
|
from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data
|
||||||
from ebook_converter.utils.img import AnimatedGIF
|
from ebook_converter.utils.img import AnimatedGIF
|
||||||
from ebook_converter.utils.imghdr import what
|
from ebook_converter.utils.imghdr import what
|
||||||
@@ -759,7 +760,8 @@ class MobiReader(object):
|
|||||||
':text()')])
|
':text()')])
|
||||||
except Exception:
|
except Exception:
|
||||||
text = ''
|
text = ''
|
||||||
text = ent_pat.sub(entity_to_unicode, text)
|
text = ent_pat.sub(entities.entity_to_unicode,
|
||||||
|
text)
|
||||||
item = tocobj.add_item(toc.partition('#')[0],
|
item = tocobj.add_item(toc.partition('#')[0],
|
||||||
href[1:], text)
|
href[1:], text)
|
||||||
item.left_space = int(self.get_left_whitespace(x))
|
item.left_space = int(self.get_left_whitespace(x))
|
||||||
|
|||||||
Reference in New Issue
Block a user