From 018676c02697573a9064f8e28fa9e2024a21f67e Mon Sep 17 00:00:00 2001 From: gryf Date: Sun, 3 Jan 2021 19:47:49 +0100 Subject: [PATCH] Moved prepare_string_for_xml to utils.entities. --- ebook_converter/__init__.py | 11 ------ .../ebooks/conversion/plugins/txt_input.py | 3 +- ebook_converter/ebooks/fb2/fb2ml.py | 39 +++++++++++-------- ebook_converter/ebooks/htmlz/oeb2html.py | 18 +++++---- ebook_converter/ebooks/lrf/objects.py | 4 +- ebook_converter/ebooks/pdb/haodoo/reader.py | 10 ++--- ebook_converter/ebooks/txt/processor.py | 5 ++- ebook_converter/library/comments.py | 4 +- 8 files changed, 45 insertions(+), 49 deletions(-) diff --git a/ebook_converter/__init__.py b/ebook_converter/__init__.py index 8fec1d1..7a1d0f4 100644 --- a/ebook_converter/__init__.py +++ b/ebook_converter/__init__.py @@ -24,14 +24,3 @@ class CurrentDir(object): except EnvironmentError: # The previous CWD no longer exists pass - - -_ent_pat = re.compile(r'&(\S+?);') - - -def prepare_string_for_xml(raw, attribute=False): - raw = _ent_pat.sub(entities.entity_to_unicode, raw) - raw = raw.replace('&', '&').replace('<', '<').replace('>', '>') - if attribute: - raw = raw.replace('"', '"').replace("'", ''') - return raw diff --git a/ebook_converter/ebooks/conversion/plugins/txt_input.py b/ebook_converter/ebooks/conversion/plugins/txt_input.py index fafdfb2..363e937 100644 --- a/ebook_converter/ebooks/conversion/plugins/txt_input.py +++ b/ebook_converter/ebooks/conversion/plugins/txt_input.py @@ -1,6 +1,5 @@ import os -from ebook_converter import _ent_pat from ebook_converter.customize.conversion import InputFormatPlugin from ebook_converter.customize.conversion import OptionRecommendation from ebook_converter.utils import entities @@ -198,7 +197,7 @@ class TXTInput(InputFormatPlugin): txt = txt.decode(ienc, 'replace') # Replace entities - txt = _ent_pat.sub(entities.xml_entity_to_unicode, txt) + txt = entities.ENT_PAT.sub(entities.xml_entity_to_unicode, txt) # Normalize line endings txt = normalize_line_endings(txt) diff --git a/ebook_converter/ebooks/fb2/fb2ml.py b/ebook_converter/ebooks/fb2/fb2ml.py index 6e3c086..2c84af9 100644 --- a/ebook_converter/ebooks/fb2/fb2ml.py +++ b/ebook_converter/ebooks/fb2/fb2ml.py @@ -10,11 +10,11 @@ import uuid from lxml import etree from ebook_converter import constants as const -from ebook_converter import prepare_string_for_xml from ebook_converter.constants_old import __appname__, __version__ from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.polyglot.binary import as_base64_unicode +from ebook_converter.utils import entities from ebook_converter.utils.img import save_cover_data_to from ebook_converter.utils.localization import lang_as_iso639_1 @@ -149,13 +149,16 @@ class FB2MLizer(object): author_middle = ' '.join(author_parts[1:-1]) author_last = author_parts[-1] metadata['author'] += '' - metadata['author'] += ('%s' % - prepare_string_for_xml(author_first)) + metadata['author'] += ( + '%s' % + entities.prepare_string_for_xml(author_first)) if author_middle: - metadata['author'] += ('%s' % - prepare_string_for_xml(author_middle)) - metadata['author'] += ('%s' % - prepare_string_for_xml(author_last)) + metadata['author'] += ( + '%s' % + entities.prepare_string_for_xml(author_middle)) + metadata['author'] += ( + '%s' % + entities.prepare_string_for_xml(author_last)) metadata['author'] += '' if not metadata['author']: metadata['author'] = ('' @@ -164,7 +167,7 @@ class FB2MLizer(object): metadata['keywords'] = '' tags = list(map(str, self.oeb_book.metadata.subject)) if tags: - tags = ', '.join(prepare_string_for_xml(x) for x in tags) + tags = ', '.join(entities.prepare_string_for_xml(x) for x in tags) metadata['keywords'] = '%s' % tags metadata['sequence'] = '' @@ -172,7 +175,8 @@ class FB2MLizer(object): index = '1' if self.oeb_book.metadata.series_index: index = self.oeb_book.metadata.series_index[0] - seq = prepare_string_for_xml(str(self.oeb_book.metadata.series[0])) + seq = entities.prepare_string_for_xml(str(self.oeb_book.metadata + .series[0])) metadata['sequence'] = ('' % (seq, index)) @@ -193,7 +197,8 @@ class FB2MLizer(object): pass else: year = ('%s' % - prepare_string_for_xml(date.value.partition('-')[0])) + entities.prepare_string_for_xml(date.value + .partition('-')[0])) try: publisher = self.oeb_book.metadata['publisher'][0] @@ -201,11 +206,12 @@ class FB2MLizer(object): pass else: publisher = ('%s' % - prepare_string_for_xml(publisher.value)) + entities.prepare_string_for_xml(publisher.value)) for x in identifiers: if x.get(base.tag('opf', 'scheme'), None).lower() == 'isbn': - isbn = '%s' % prepare_string_for_xml(x.value) + isbn = ('%s' % + entities.prepare_string_for_xml(x.value)) metadata['year'] = year metadata['isbn'] = isbn @@ -213,7 +219,7 @@ class FB2MLizer(object): for key, value in metadata.items(): if key not in ('author', 'cover', 'sequence', 'keywords', 'year', 'publisher', 'isbn'): - metadata[key] = prepare_string_for_xml(value) + metadata[key] = entities.prepare_string_for_xml(value) try: comments = self.oeb_book.metadata['description'][0] @@ -221,7 +227,8 @@ class FB2MLizer(object): metadata['comments'] = '' else: from ebook_converter.utils.html2text import html2text - annot = prepare_string_for_xml(html2text(comments.value).strip()) + annot = entities.prepare_string_for_xml(html2text(comments + .value).strip()) metadata['comments'] = f'

{annot}

' # Keep the indentation level of the description the same as the body. @@ -583,7 +590,7 @@ class FB2MLizer(object): if hasattr(elem_tree, 'text') and elem_tree.text: if not self.in_p: fb2_out.append('

') - fb2_out.append(prepare_string_for_xml(elem_tree.text)) + fb2_out.append(entities.prepare_string_for_xml(elem_tree.text)) if not self.in_p: fb2_out.append('

') @@ -600,7 +607,7 @@ class FB2MLizer(object): if hasattr(elem_tree, 'tail') and elem_tree.tail: if not self.in_p: fb2_out.append('

') - fb2_out.append(prepare_string_for_xml(elem_tree.tail)) + fb2_out.append(entities.prepare_string_for_xml(elem_tree.tail)) if not self.in_p: fb2_out.append('

') diff --git a/ebook_converter/ebooks/htmlz/oeb2html.py b/ebook_converter/ebooks/htmlz/oeb2html.py index 42b0742..c27e103 100644 --- a/ebook_converter/ebooks/htmlz/oeb2html.py +++ b/ebook_converter/ebooks/htmlz/oeb2html.py @@ -8,11 +8,11 @@ import urllib.parse from functools import partial from lxml import html -from ebook_converter import prepare_string_for_xml from ebook_converter import constants as const from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.ebooks.oeb.stylizer import Stylizer +from ebook_converter.utils import entities from ebook_converter.utils.logging import default_log from ebook_converter.polyglot.builtins import as_bytes @@ -57,7 +57,7 @@ class OEB2HTML(object): def mlize_spine(self, oeb_book): output = [ u'%s' % ( - prepare_string_for_xml(self.book_title)) + entities.prepare_string_for_xml(self.book_title)) ] for item in oeb_book.spine: self.log.debug('Converting %s to HTML...' % item.href) @@ -136,7 +136,7 @@ class OEB2HTML(object): return css def prepare_string_for_html(self, raw): - raw = prepare_string_for_xml(raw) + raw = entities.prepare_string_for_xml(raw) raw = raw.replace(u'\u00ad', '­') raw = raw.replace(u'\u2014', '—') raw = raw.replace(u'\u2013', '–') @@ -189,7 +189,8 @@ class OEB2HTMLNoCSSizer(OEB2HTML): # Turn the rest of the attributes into a string we can write with the tag. at = '' for k, v in attribs.items(): - at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True)) + at += ' %s="%s"' % (k, entities + .prepare_string_for_xml(v, attribute=True)) # Write the tag. text.append('<%s%s' % (tag, at)) @@ -281,7 +282,8 @@ class OEB2HTMLInlineCSSizer(OEB2HTML): # Turn the rest of the attributes into a string we can write with the tag. at = '' for k, v in attribs.items(): - at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True)) + at += ' %s="%s"' % (k, entities + .prepare_string_for_xml(v, attribute=True)) # Turn style into strings for putting in the tag. style_t = '' @@ -336,7 +338,8 @@ class OEB2HTMLClassCSSizer(OEB2HTML): css = u'' else: css = u'' - title = u'%s' % prepare_string_for_xml(self.book_title) + title = (u'%s' % + entities.prepare_string_for_xml(self.book_title)) output = [u''] + \ [css] + [title, u''] + output + [u''] return ''.join(output) @@ -373,7 +376,8 @@ class OEB2HTMLClassCSSizer(OEB2HTML): # Turn the rest of the attributes into a string we can write with the tag. at = '' for k, v in attribs.items(): - at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True)) + at += ' %s="%s"' % (k, + entities.prepare_string_for_xml(v, attribute=True)) # Write the tag. text.append('<%s%s' % (tag, at)) diff --git a/ebook_converter/ebooks/lrf/objects.py b/ebook_converter/ebooks/lrf/objects.py index 81c93ba..b3773c9 100644 --- a/ebook_converter/ebooks/lrf/objects.py +++ b/ebook_converter/ebooks/lrf/objects.py @@ -6,7 +6,7 @@ import struct import zlib from ebook_converter.ebooks.lrf import LRFParseError, PRS500_PROFILE -from ebook_converter import prepare_string_for_xml +from ebook_converter.utils import entities from ebook_converter.ebooks.lrf.tags import Tag from ebook_converter.utils import entities @@ -877,7 +877,7 @@ class Text(LRFStream): open_containers = collections.deque() for c in self.content: if isinstance(c, str): - s += prepare_string_for_xml(c).replace('\0', '') + s += entities.prepare_string_for_xml(c).replace('\0', '') elif c is None: if open_containers: p = open_containers.pop() diff --git a/ebook_converter/ebooks/pdb/haodoo/reader.py b/ebook_converter/ebooks/pdb/haodoo/reader.py index 74e6a18..ec611f5 100644 --- a/ebook_converter/ebooks/pdb/haodoo/reader.py +++ b/ebook_converter/ebooks/pdb/haodoo/reader.py @@ -1,19 +1,15 @@ """ Read content from Haodoo.net pdb file. """ -import struct import os +import struct -from ebook_converter import prepare_string_for_xml from ebook_converter.ebooks.pdb.formatreader import FormatReader from ebook_converter.ebooks.metadata import MetaInformation from ebook_converter.ebooks.txt.processor import opf_writer, HTML_TEMPLATE +from ebook_converter.utils import entities -__license__ = 'GPL v3' -__copyright__ = '2012, Kan-Ru Chen ' -__docformat__ = 'restructuredtext en' - BPDB_IDENT = b'BOOKMTIT' UPDB_IDENT = b'BOOKMTIU' @@ -133,7 +129,7 @@ class Reader(FormatReader): line = '

' + line + '

\n' title_added = True else: - line = prepare_string_for_xml(line) + line = entities.prepare_string_for_xml(line) lines.append('

%s

' % line) if not title_added: lines.insert(0, '

' + title + '

\n') diff --git a/ebook_converter/ebooks/txt/processor.py b/ebook_converter/ebooks/txt/processor.py index 579159b..ecdfc93 100644 --- a/ebook_converter/ebooks/txt/processor.py +++ b/ebook_converter/ebooks/txt/processor.py @@ -3,11 +3,11 @@ Read content from txt file. """ import os, re -from ebook_converter import prepare_string_for_xml from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.conversion.preprocess import DocAnalysis from ebook_converter.utils.cleantext import clean_ascii_chars +from ebook_converter.utils import entities HTML_TEMPLATE = '%s \n%s\n' @@ -87,7 +87,8 @@ def convert_basic(txt, title='', epub_split_size_kb=0): for line in txt.split('\n'): if line.strip(): blank_count = 0 - lines.append(u'

%s

' % prepare_string_for_xml(line.replace('\n', ' '))) + lines.append(u'

%s

' % entities + .prepare_string_for_xml(line.replace('\n', ' '))) else: blank_count += 1 if blank_count == 2: diff --git a/ebook_converter/library/comments.py b/ebook_converter/library/comments.py index 05a8a64..6e550d3 100644 --- a/ebook_converter/library/comments.py +++ b/ebook_converter/library/comments.py @@ -2,10 +2,10 @@ import re import bs4 -from ebook_converter import prepare_string_for_xml from ebook_converter.constants_old import preferred_encoding from ebook_converter.ebooks.BeautifulSoup import html5_parser from ebook_converter.utils.html2text import html2text +from ebook_converter.utils import entities # Hackish - ignoring sentences ending or beginning in numbers to avoid @@ -51,7 +51,7 @@ def comments_to_html(comments): return comments if '<' not in comments: - comments = prepare_string_for_xml(comments) + comments = entities.prepare_string_for_xml(comments) parts = [u'

%s

'%x.replace(u'\n', u'
') for x in comments.split('\n\n')] return '\n'.join(parts)