1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-04-24 07:01:30 +02:00

Use the real constants module.

This is part of the ongoing refactoring of the calibre code to make it more
readable and to transform it into something more coherent.

In this patch, the imports of some modules are changed so that they no longer
pollute each module's namespace with symbols from other modules, which were
often themselves imported from yet other modules. Yuck.
This commit is contained in:
2020-05-29 17:04:53 +02:00
parent ee4801228f
commit ce89f5c9d1
54 changed files with 2383 additions and 2081 deletions
+38 -27
View File
@@ -1,12 +1,14 @@
__license__ = 'GPL v3' """
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Code for the conversion of ebook formats and the reading of metadata Code for the conversion of ebook formats and the reading of metadata
from various formats. from various formats.
''' """
import numbers
import os
import re
import sys
from lxml import etree
import os, re, numbers, sys
from ebook_converter import prints from ebook_converter import prints
from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.chardet import xml_to_unicode
@@ -30,12 +32,15 @@ class ParserError(ValueError):
pass pass
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'htm', 'xhtm', BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text',
'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'updb', 'pdr', 'prc', 'mobi', 'azw', 'doc', 'htm', 'xhtm', 'html', 'htmlz', 'xhtml', 'pdf', 'pdb',
'epub', 'fb2', 'fbz', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip', 'updb', 'pdr', 'prc', 'mobi', 'azw', 'doc', 'epub', 'fb2',
'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb', 'fbz', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx', 'docm', 'md', 'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz',
'textile', 'markdown', 'ibook', 'ibooks', 'iba', 'azw3', 'ps', 'kepub', 'kfx', 'kpf'] 'mbp', 'tan', 'snb', 'xps', 'oxps', 'azw4', 'book', 'zbf',
'pobi', 'docx', 'docm', 'md', 'textile', 'markdown',
'ibook', 'ibooks', 'iba', 'azw3', 'ps', 'kepub', 'kfx',
'kpf']
def return_raster_image(path): def return_raster_image(path):
@@ -49,8 +54,7 @@ def return_raster_image(path):
def extract_cover_from_embedded_svg(html, base, log): def extract_cover_from_embedded_svg(html, base, log):
from ebook_converter.ebooks.oeb.base import XPath, SVG, XLINK from ebook_converter.ebooks.oeb.base import XPath, SVG, XLINK
from ebook_converter.utils.xml_parse import safe_xml_fromstring root = etree.fromstring(html)
root = safe_xml_fromstring(html)
svg = XPath('//svg:svg')(root) svg = XPath('//svg:svg')(root)
if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'): if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):
@@ -65,10 +69,10 @@ def extract_calibre_cover(raw, base, log):
from ebook_converter.ebooks.BeautifulSoup import BeautifulSoup from ebook_converter.ebooks.BeautifulSoup import BeautifulSoup
soup = BeautifulSoup(raw) soup = BeautifulSoup(raw)
matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
'font', 'br']) 'font', 'br'])
images = soup.findAll('img', src=True) images = soup.findAll('img', src=True)
if matches is None and len(images) == 1 and \ if (matches is None and len(images) == 1 and
images[0].get('alt', '').lower()=='cover': images[0].get('alt', '').lower() == 'cover'):
img = images[0] img = images[0]
img = os.path.join(base, *img['src'].split('/')) img = os.path.join(base, *img['src'].split('/'))
q = return_raster_image(img) q = return_raster_image(img)
@@ -97,13 +101,14 @@ def render_html_svg_workaround(path_to_html, log, width=590, height=750):
data = None data = None
if SVG_NS in raw: if SVG_NS in raw:
try: try:
data = extract_cover_from_embedded_svg(raw, data = extract_cover_from_embedded_svg(
os.path.dirname(path_to_html), log) raw, os.path.dirname(path_to_html), log)
except Exception: except Exception:
pass pass
if data is None: if data is None:
try: try:
data = extract_calibre_cover(raw, os.path.dirname(path_to_html), log) data = extract_calibre_cover(raw, os.path.dirname(path_to_html),
log)
except Exception: except Exception:
pass pass
@@ -118,7 +123,8 @@ def render_html_data(path_to_html, width, height):
result = {} result = {}
def report_error(text=''): def report_error(text=''):
prints('Failed to render', path_to_html, 'with errors:', file=sys.stderr) prints('Failed to render', path_to_html, 'with errors:',
file=sys.stderr)
if text: if text:
prints(text, file=sys.stderr) prints(text, file=sys.stderr)
if result and result['stdout_stderr']: if result and result['stdout_stderr']:
@@ -127,7 +133,8 @@ def render_html_data(path_to_html, width, height):
with TemporaryDirectory('-render-html') as tdir: with TemporaryDirectory('-render-html') as tdir:
try: try:
result = fork_job('ebook_converter.ebooks.render_html', 'main', args=(path_to_html, tdir, 'jpeg')) result = fork_job('ebook_converter.ebooks.render_html', 'main',
args=(path_to_html, tdir, 'jpeg'))
except WorkerError as e: except WorkerError as e:
report_error(e.orig_tb) report_error(e.orig_tb)
else: else:
@@ -156,17 +163,20 @@ def normalize(x):
def calibre_cover(title, author_string, series_string=None, def calibre_cover(title, author_string, series_string=None,
output_format='jpg', title_size=46, author_size=36, logo_path=None): output_format='jpg', title_size=46, author_size=36,
logo_path=None):
title = normalize(title) title = normalize(title)
author_string = normalize(author_string) author_string = normalize(author_string)
series_string = normalize(series_string) series_string = normalize(series_string)
from ebook_converter.ebooks.covers import calibre_cover2 from ebook_converter.ebooks.covers import calibre_cover2
from ebook_converter.utils.img import image_to_data from ebook_converter.utils.img import image_to_data
ans = calibre_cover2(title, author_string or '', series_string or '', logo_path=logo_path, as_qimage=True) ans = calibre_cover2(title, author_string or '', series_string or '',
logo_path=logo_path, as_qimage=True)
return image_to_data(ans, fmt=output_format) return image_to_data(ans, fmt=output_format)
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc|rem|q)$') UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc'
r'|rem|q)$')
def unit_convert(value, base, font, dpi, body_font_size=12): def unit_convert(value, base, font, dpi, body_font_size=12):
@@ -175,7 +185,7 @@ def unit_convert(value, base, font, dpi, body_font_size=12):
return value return value
try: try:
return float(value) * 72.0 / dpi return float(value) * 72.0 / dpi
except: except Exception:
pass pass
result = value result = value
m = UNIT_RE.match(value) m = UNIT_RE.match(value)
@@ -227,7 +237,8 @@ def generate_masthead(title, output_path=None, width=600, height=60):
recs = load_defaults('mobi_output') recs = load_defaults('mobi_output')
masthead_font_family = recs.get('masthead_font', None) masthead_font_family = recs.get('masthead_font', None)
from ebook_converter.ebooks.covers import generate_masthead from ebook_converter.ebooks.covers import generate_masthead
return generate_masthead(title, output_path=output_path, width=width, height=height, font_family=masthead_font_family) return generate_masthead(title, output_path=output_path, width=width,
height=height, font_family=masthead_font_family)
def escape_xpath_attr(value): def escape_xpath_attr(value):
@@ -3,6 +3,7 @@ Based on ideas from comiclrf created by FangornUK.
""" """
import shutil, textwrap, codecs, os import shutil, textwrap, codecs, os
from ebook_converter import constants as const
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
from ebook_converter import CurrentDir from ebook_converter import CurrentDir
from ebook_converter.ptempfile import PersistentTemporaryDirectory from ebook_converter.ptempfile import PersistentTemporaryDirectory
@@ -245,7 +246,6 @@ class ComicInput(InputFormatPlugin):
return os.path.abspath('metadata.opf') return os.path.abspath('metadata.opf')
def create_wrappers(self, pages): def create_wrappers(self, pages):
from ebook_converter.ebooks.oeb.base import XHTML_NS
wrappers = [] wrappers = []
WRAPPER = textwrap.dedent('''\ WRAPPER = textwrap.dedent('''\
<html xmlns="%s"> <html xmlns="%s">
@@ -267,7 +267,8 @@ class ComicInput(InputFormatPlugin):
''') ''')
dir = os.path.dirname(pages[0]) dir = os.path.dirname(pages[0])
for i, page in enumerate(pages): for i, page in enumerate(pages):
wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1) wrapper = WRAPPER%(const.XHTML_NS, i+1, os.path.basename(page),
i+1)
page = os.path.join(dir, 'page_%d.xhtml'%(i+1)) page = os.path.join(dir, 'page_%d.xhtml'%(i+1))
with open(page, 'wb') as f: with open(page, 'wb') as f:
f.write(wrapper.encode('utf-8')) f.write(wrapper.encode('utf-8'))
@@ -275,8 +276,6 @@ class ComicInput(InputFormatPlugin):
return wrappers return wrappers
def create_viewer_wrapper(self, pages): def create_viewer_wrapper(self, pages):
from ebook_converter.ebooks.oeb.base import XHTML_NS
def page(src): def page(src):
return '<img src="{}"></img>'.format(os.path.basename(src)) return '<img src="{}"></img>'.format(os.path.basename(src))
@@ -303,7 +302,7 @@ class ComicInput(InputFormatPlugin):
%s %s
</body> </body>
</html> </html>
''' % (XHTML_NS, pages) ''' % (const.XHTML_NS, pages)
path = os.path.join(base, 'wrapper.xhtml') path = os.path.join(base, 'wrapper.xhtml')
with open(path, 'wb') as f: with open(path, 'wb') as f:
f.write(wrapper.encode('utf-8')) f.write(wrapper.encode('utf-8'))
@@ -1,14 +1,22 @@
from ebook_converter.customize.conversion import OutputFormatPlugin, OptionRecommendation import io
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.customize import conversion
from ebook_converter.ebooks.docx.dump import do_dump
from ebook_converter.ebooks.docx.writer.container import DOCX
from ebook_converter.ebooks.docx.writer.from_html import Convert
from ebook_converter.ebooks.metadata import opf2 as opf_meta
from ebook_converter.ebooks.oeb import base
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
PAGE_SIZES = ['a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1', PAGE_SIZES = ['a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter'] 'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter']
_OPT = conversion.OptionRecommendation
class DOCXOutput(OutputFormatPlugin): class DOCXOutput(conversion.OutputFormatPlugin):
name = 'DOCX Output' name = 'DOCX Output'
author = 'Kovid Goyal' author = 'Kovid Goyal'
@@ -16,75 +24,63 @@ class DOCXOutput(OutputFormatPlugin):
commit_name = 'docx_output' commit_name = 'docx_output'
ui_data = {'page_sizes': PAGE_SIZES} ui_data = {'page_sizes': PAGE_SIZES}
options = { options = {_OPT(name='docx_page_size', recommended_value='letter',
OptionRecommendation(name='docx_page_size', recommended_value='letter', level=_OPT.LOW, choices=PAGE_SIZES,
level=OptionRecommendation.LOW, choices=PAGE_SIZES, help='The size of the page. Default is letter. Choices '
help='The size of the page. Default is letter. Choices ' 'are %s' % PAGE_SIZES),
'are %s' % PAGE_SIZES), _OPT(name='docx_custom_page_size', recommended_value=None,
help='Custom size of the document. Use the form '
OptionRecommendation(name='docx_custom_page_size', recommended_value=None, 'widthxheight EG. `123x321` to specify the width and '
help='Custom size of the document. Use the form widthxheight ' 'height (in pts). This overrides any specified '
'EG. `123x321` to specify the width and height (in pts). ' 'page-size.'),
'This overrides any specified page-size.'), _OPT(name='docx_no_cover', recommended_value=False,
help='Do not insert the book cover as an image at the '
OptionRecommendation(name='docx_no_cover', recommended_value=False, 'start of the document. If you use this option, the book '
help='Do not insert the book cover as an image at the start of the document.' 'cover will be discarded.'),
' If you use this option, the book cover will be discarded.'), _OPT(name='preserve_cover_aspect_ratio',
recommended_value=False, help='Preserve the aspect ratio '
OptionRecommendation(name='preserve_cover_aspect_ratio', recommended_value=False, 'of the cover image instead of stretching it out to cover '
help='Preserve the aspect ratio of the cover image instead of stretching' 'the entire page.'),
' it out to cover the entire page.'), _OPT(name='docx_no_toc', recommended_value=False,
help='Do not insert the table of contents as a page at '
OptionRecommendation(name='docx_no_toc', recommended_value=False, 'the start of the document.'),
help='Do not insert the table of contents as a page at the start of the document.'), _OPT(name='extract_to', help='Extract the contents of the '
'generated DOCX file to the specified directory. The '
OptionRecommendation(name='extract_to', 'contents of the directory are first deleted, so be '
help='Extract the contents of the generated %s file to the ' 'careful.'),
'specified directory. The contents of the directory are first ' _OPT(name='docx_page_margin_left', recommended_value=72.0,
'deleted, so be careful.' % 'DOCX'), level=_OPT.LOW, help='The size of the left page margin, '
'in pts. Default is 72pt. Overrides the common left page '
OptionRecommendation(name='docx_page_margin_left', recommended_value=72.0, 'margin setting.'),
level=OptionRecommendation.LOW, _OPT(name='docx_page_margin_top', recommended_value=72.0,
help='The size of the left page margin, in pts. Default is 72pt.' level=_OPT.LOW, help='The size of the top page margin, '
' Overrides the common left page margin setting.' 'in pts. Default is 72pt. Overrides the common top page '
), 'margin setting, unless set to zero.'),
_OPT(name='docx_page_margin_right', recommended_value=72.0,
OptionRecommendation(name='docx_page_margin_top', recommended_value=72.0, level=_OPT.LOW, help='The size of the right page margin, '
level=OptionRecommendation.LOW, 'in pts. Default is 72pt. Overrides the common right page '
help='The size of the top page margin, in pts. Default is 72pt.' 'margin setting, unless set to zero.'),
' Overrides the common top page margin setting, unless set to zero.' _OPT(name='docx_page_margin_bottom', recommended_value=72.0,
), level=_OPT.LOW, help='The size of the bottom page margin, '
'in pts. Default is 72pt. Overrides the common bottom '
OptionRecommendation(name='docx_page_margin_right', recommended_value=72.0, 'page margin setting, unless set to zero.')}
level=OptionRecommendation.LOW,
help='The size of the right page margin, in pts. Default is 72pt.'
' Overrides the common right page margin setting, unless set to zero.'
),
OptionRecommendation(name='docx_page_margin_bottom', recommended_value=72.0,
level=OptionRecommendation.LOW,
help='The size of the bottom page margin, in pts. Default is 72pt.'
' Overrides the common bottom page margin setting, unless set to zero.'
),
}
def convert_metadata(self, oeb): def convert_metadata(self, oeb):
from lxml import etree
from ebook_converter.ebooks.oeb.base import OPF, OPF2_NS package = etree.Element(base.tag('opf', 'package'),
from ebook_converter.ebooks.metadata.opf2 import OPF as ReadOPF attrib={'version': '2.0'},
from io import BytesIO nsmap={None: const.OPF2_NS})
package = etree.Element(OPF('package'), attrib={'version': '2.0'}, nsmap={None: OPF2_NS})
oeb.metadata.to_opf2(package) oeb.metadata.to_opf2(package)
self.mi = ReadOPF(BytesIO(etree.tostring(package, encoding='utf-8')), populate_spine=False, try_to_guess_cover=False).to_book_metadata() self.mi = opf_meta.OPF(io.BytesIO(etree.tostring(package,
encoding='utf-8')),
populate_spine=False,
try_to_guess_cover=False).to_book_metadata()
def convert(self, oeb, output_path, input_plugin, opts, log): def convert(self, oeb, output_path, input_plugin, opts, log):
from ebook_converter.ebooks.docx.writer.container import DOCX
from ebook_converter.ebooks.docx.writer.from_html import Convert
docx = DOCX(opts, log) docx = DOCX(opts, log)
self.convert_metadata(oeb) self.convert_metadata(oeb)
Convert(oeb, docx, self.mi, not opts.docx_no_cover, not opts.docx_no_toc)() Convert(oeb, docx, self.mi, not opts.docx_no_cover,
not opts.docx_no_toc)()
docx.write(output_path, self.mi) docx.write(output_path, self.mi)
if opts.extract_to: if opts.extract_to:
from ebook_converter.ebooks.docx.dump import do_dump
do_dump(output_path, opts.extract_to) do_dump(output_path, opts.extract_to)
@@ -1,14 +1,19 @@
import os, re, posixpath import hashlib
from itertools import cycle import itertools
import os
import re
import traceback
import uuid
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation from lxml import etree
from ebook_converter.ebooks.metadata import opf2 as opf_meta
from ebook_converter.ebooks.oeb import base
from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
__license__ = 'GPL 3' ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding' IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
@@ -16,8 +21,8 @@ def decrypt_font_data(key, data, algorithm):
is_adobe = algorithm == ADOBE_OBFUSCATION is_adobe = algorithm == ADOBE_OBFUSCATION
crypt_len = 1024 if is_adobe else 1040 crypt_len = 1024 if is_adobe else 1040
crypt = bytearray(data[:crypt_len]) crypt = bytearray(data[:crypt_len])
key = cycle(iter(bytearray(key))) key = itertools.cycle(iter(bytearray(key)))
decrypt = bytes(bytearray(x^next(key) for x in crypt)) decrypt = bytes(bytearray(x ^ next(key) for x in crypt))
return decrypt + data[crypt_len:] return decrypt + data[crypt_len:]
@@ -29,18 +34,16 @@ def decrypt_font(key, path, algorithm):
class EPUBInput(InputFormatPlugin): class EPUBInput(InputFormatPlugin):
name = 'EPUB Input' name = 'EPUB Input'
author = 'Kovid Goyal' author = 'Kovid Goyal'
description = 'Convert EPUB files (.epub) to HTML' description = 'Convert EPUB files (.epub) to HTML'
file_types = {'epub'} file_types = {'epub'}
output_encoding = None output_encoding = None
commit_name = 'epub_input' commit_name = 'epub_input'
recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)} recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
def process_encryption(self, encfile, opf, log): def process_encryption(self, encfile, opf, log):
from lxml import etree
import uuid, hashlib
idpf_key = opf.raw_unique_identifier idpf_key = opf.raw_unique_identifier
if idpf_key: if idpf_key:
idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key) idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
@@ -56,27 +59,28 @@ class EPUBInput(InputFormatPlugin):
try: try:
key = item.text.rpartition(':')[-1] key = item.text.rpartition(':')[-1]
key = uuid.UUID(key).bytes key = uuid.UUID(key).bytes
except: except Exception:
import traceback
traceback.print_exc() traceback.print_exc()
key = None key = None
try: try:
root = etree.parse(encfile) root = etree.parse(encfile)
for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): for em in root.xpath('descendant::*[contains(name(), '
'"EncryptionMethod")]'):
algorithm = em.get('Algorithm', '') algorithm = em.get('Algorithm', '')
if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}: if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
return False return False
cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0] cr = em.getparent().xpath('descendant::*[contains(name(), '
'"CipherReference")]')[0]
uri = cr.get('URI') uri = cr.get('URI')
path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/'))) path = os.path.abspath(os.path.join(os.path.dirname(encfile),
'..', *uri.split('/')))
tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key) tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
if (tkey and os.path.exists(path)): if (tkey and os.path.exists(path)):
self._encrypted_font_uris.append(uri) self._encrypted_font_uris.append(uri)
decrypt_font(tkey, path, algorithm) decrypt_font(tkey, path, algorithm)
return True return True
except: except Exception:
import traceback
traceback.print_exc() traceback.print_exc()
return False return False
@@ -97,8 +101,11 @@ class EPUBInput(InputFormatPlugin):
return t return t
def rationalize_cover3(self, opf, log): def rationalize_cover3(self, opf, log):
''' If there is a reference to the cover/titlepage via manifest properties, convert to """
entries in the <guide> so that the rest of the pipeline picks it up. ''' If there is a reference to the cover/titlepage via manifest
properties, convert to entries in the <guide> so that the rest of the
pipeline picks it up.
"""
from ebook_converter.ebooks.metadata.opf3 import items_with_property from ebook_converter.ebooks.metadata.opf3 import items_with_property
removed = guide_titlepage_href = guide_titlepage_id = None removed = guide_titlepage_href = guide_titlepage_id = None
@@ -128,7 +135,8 @@ class EPUBInput(InputFormatPlugin):
titlepage_id, titlepage_href = tid, href.partition('#')[0] titlepage_id, titlepage_href = tid, href.partition('#')[0]
break break
if titlepage_href is None: if titlepage_href is None:
titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id titlepage_href = guide_titlepage_href
titlepage_id = guide_titlepage_id
if titlepage_href is not None: if titlepage_href is not None:
self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page') self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page')
spine = list(opf.iterspine()) spine = list(opf.iterspine())
@@ -148,7 +156,6 @@ class EPUBInput(InputFormatPlugin):
means, at most one entry with type="cover" that points to a raster means, at most one entry with type="cover" that points to a raster
cover and at most one entry with type="titlepage" that points to an cover and at most one entry with type="titlepage" that points to an
HTML titlepage. ''' HTML titlepage. '''
from ebook_converter.ebooks.oeb.base import OPF
removed = None removed = None
from lxml import etree from lxml import etree
guide_cover, guide_elem = None, None guide_cover, guide_elem = None, None
@@ -160,12 +167,14 @@ class EPUBInput(InputFormatPlugin):
raster_cover = opf.raster_cover raster_cover = opf.raster_cover
if raster_cover: if raster_cover:
if guide_elem is None: if guide_elem is None:
g = opf.root.makeelement(OPF('guide')) g = opf.root.makeelement(base.tag('opf', 'guide'))
opf.root.append(g) opf.root.append(g)
else: else:
g = guide_elem.getparent() g = guide_elem.getparent()
guide_cover = raster_cover guide_cover = raster_cover
guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'}) guide_elem = g.makeelement(base.tag('opf', 'reference'),
attrib={'href': raster_cover,
'type': 'cover'})
g.append(guide_elem) g.append(guide_elem)
return return
spine = list(opf.iterspine()) spine = list(opf.iterspine())
@@ -186,7 +195,8 @@ class EPUBInput(InputFormatPlugin):
# specially # specially
if not self.for_viewer: if not self.for_viewer:
if len(spine) == 1: if len(spine) == 1:
log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.') log.warn('There is only a single spine item and it is marked '
'as the cover. Removing cover marking.')
for guide_elem in tuple(opf.iterguide()): for guide_elem in tuple(opf.iterguide()):
if guide_elem.get('type', '').lower() == 'cover': if guide_elem.get('type', '').lower() == 'cover':
guide_elem.getparent().remove(guide_elem) guide_elem.getparent().remove(guide_elem)
@@ -215,8 +225,9 @@ class EPUBInput(InputFormatPlugin):
# Render the titlepage to create a raster cover # Render the titlepage to create a raster cover
from ebook_converter.ebooks import render_html_svg_workaround from ebook_converter.ebooks import render_html_svg_workaround
guide_elem.set('href', 'calibre_raster_cover.jpg') guide_elem.set('href', 'calibre_raster_cover.jpg')
t = etree.SubElement( t = etree.SubElement(elem[0].getparent(), base.tag('opf', 'item'),
elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover') href=guide_elem.get('href'),
id='calibre_raster_cover')
t.set('media-type', 'image/jpeg') t.set('media-type', 'image/jpeg')
if os.path.exists(guide_cover): if os.path.exists(guide_cover):
renderer = render_html_svg_workaround(guide_cover, log) renderer = render_html_svg_workaround(guide_cover, log)
@@ -229,17 +240,16 @@ class EPUBInput(InputFormatPlugin):
return removed return removed
def find_opf(self): def find_opf(self):
from ebook_converter.utils.xml_parse import safe_xml_fromstring
def attr(n, attr): def attr(n, attr):
for k, v in n.attrib.items(): for k, v in n.attrib.items():
if k.endswith(attr): if k.endswith(attr):
return v return v
try: try:
with open('META-INF/container.xml', 'rb') as f: with open('META-INF/container.xml', 'rb') as f:
root = safe_xml_fromstring(f.read()) root = etree.fromstring(f.read())
for r in root.xpath('//*[local-name()="rootfile"]'): for r in root.xpath('//*[local-name()="rootfile"]'):
if attr(r, 'media-type') != "application/oebps-package+xml": if (attr(r, 'media-type') !=
"application/oebps-package+xml"):
continue continue
path = attr(r, 'full-path') path = attr(r, 'full-path')
if not path: if not path:
@@ -248,20 +258,18 @@ class EPUBInput(InputFormatPlugin):
if os.path.exists(path): if os.path.exists(path):
return path return path
except Exception: except Exception:
import traceback
traceback.print_exc() traceback.print_exc()
def convert(self, stream, options, file_ext, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
from ebook_converter.utils.zipfile import ZipFile from ebook_converter.utils.zipfile import ZipFile
from ebook_converter import walk from ebook_converter import walk
from ebook_converter.ebooks import DRMError from ebook_converter.ebooks import DRMError
from ebook_converter.ebooks.metadata.opf2 import OPF
try: try:
zf = ZipFile(stream) zf = ZipFile(stream)
zf.extractall(os.getcwd()) zf.extractall(os.getcwd())
except: except Exception:
log.exception('EPUB appears to be invalid ZIP file, trying a' log.exception('EPUB appears to be invalid ZIP file, trying a'
' more forgiving ZIP parser') ' more forgiving ZIP parser')
from ebook_converter.utils.localunzip import extractall from ebook_converter.utils.localunzip import extractall
stream.seek(0) stream.seek(0)
extractall(stream) extractall(stream)
@@ -276,11 +284,12 @@ class EPUBInput(InputFormatPlugin):
path = getattr(stream, 'name', 'stream') path = getattr(stream, 'name', 'stream')
if opf is None: if opf is None:
raise ValueError('%s is not a valid EPUB file (could not find opf)'%path) raise ValueError('%s is not a valid EPUB file (could not find '
'opf)' % path)
opf = os.path.relpath(opf, os.getcwd()) opf = os.path.relpath(opf, os.getcwd())
parts = os.path.split(opf) # parts = os.path.split(opf)
opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) opf = opf_meta.OPF(opf, os.path.dirname(os.path.abspath(opf)))
self._encrypted_font_uris = [] self._encrypted_font_uris = []
if os.path.exists(encfile): if os.path.exists(encfile):
@@ -288,18 +297,23 @@ class EPUBInput(InputFormatPlugin):
raise DRMError(os.path.basename(path)) raise DRMError(os.path.basename(path))
self.encrypted_fonts = self._encrypted_font_uris self.encrypted_fonts = self._encrypted_font_uris
if len(parts) > 1 and parts[0]: # XXX(gryf): this code would fail pretty ugly, thus, this part was
delta = '/'.join(parts[:-1])+'/' # never used.
# if len(parts) > 1 and parts[0]:
# delta = '/'.join(parts[:-1])+'/'
def normpath(x): # def normpath(x):
return posixpath.normpath(delta + elem.get('href')) # return posixpath.normpath(delta + elem.get('href'))
for elem in opf.itermanifest(): # for elem in opf.itermanifest():
elem.set('href', normpath(elem.get('href'))) # elem.set('href', normpath(elem.get('href')))
for elem in opf.iterguide(): # for elem in opf.iterguide():
elem.set('href', normpath(elem.get('href'))) # elem.set('href', normpath(elem.get('href')))
f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2 if opf.package_version >= 3.0:
f = self.rationalize_cover3
else:
f = self.rationalize_cover2
self.removed_cover = f(opf, log) self.removed_cover = f(opf, log)
if self.removed_cover: if self.removed_cover:
self.removed_items_to_ignore = (self.removed_cover,) self.removed_items_to_ignore = (self.removed_cover,)
@@ -352,15 +366,18 @@ class EPUBInput(InputFormatPlugin):
from lxml import etree from lxml import etree
from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ebooks.oeb.polish.parsing import parse from ebook_converter.ebooks.oeb.polish.parsing import parse
from ebook_converter.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize from ebook_converter.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, \
NCX, urlnormalize, urlunquote, serialize
from ebook_converter.ebooks.oeb.polish.toc import first_child from ebook_converter.ebooks.oeb.polish.toc import first_child
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
with open(nav_path, 'rb') as f: with open(nav_path, 'rb') as f:
raw = f.read() raw = f.read()
raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0] raw = xml_to_unicode(raw, strip_encoding_pats=True,
assume_utf8=True)[0]
root = parse(raw, log=log) root = parse(raw, log=log)
ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>') ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/'
'ncx/" version="2005-1" xml:lang="eng">'
'<navMap/></ncx>')
navmap = ncx[0] navmap = ncx[0]
et = '{%s}type' % EPUB_NS et = '{%s}type' % EPUB_NS
bn = os.path.basename(nav_path) bn = os.path.basename(nav_path)
@@ -368,8 +385,8 @@ class EPUBInput(InputFormatPlugin):
def add_from_li(li, parent): def add_from_li(li, parent):
href = text = None href = text = None
for x in li.iterchildren(XHTML('a'), XHTML('span')): for x in li.iterchildren(XHTML('a'), XHTML('span')):
text = etree.tostring( text = etree.tostring(x, method='text', encoding='unicode',
x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join( with_tail=False).strip() or ' '.join(
x.xpath('descendant-or-self::*/@title')).strip() x.xpath('descendant-or-self::*/@title')).strip()
href = x.get('href') href = x.get('href')
if href: if href:
@@ -382,7 +399,7 @@ class EPUBInput(InputFormatPlugin):
np[0].append(np.makeelement(NCX('text'))) np[0].append(np.makeelement(NCX('text')))
np[0][0].text = text np[0][0].text = text
if href: if href:
np.append(np.makeelement(NCX('content'), attrib={'src':href})) np.append(np.makeelement(NCX('content'), attrib={'src': href}))
return np return np
def process_nav_node(node, toc_parent): def process_nav_node(node, toc_parent):
@@ -401,20 +418,25 @@ class EPUBInput(InputFormatPlugin):
else: else:
return return
with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f: with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path),
delete=False) as f:
f.write(etree.tostring(ncx, encoding='utf-8')) f.write(etree.tostring(ncx, encoding='utf-8'))
ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/') ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/')
ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id') ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME,
append=True).get('id')
for spine in opf.root.xpath('//*[local-name()="spine"]'): for spine in opf.root.xpath('//*[local-name()="spine"]'):
spine.set('toc', ncx_id) spine.set('toc', ncx_id)
opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/')) url = os.path.relpath(nav_path).replace(os.sep, '/')
opts.epub3_nav_href = urlnormalize(url)
opts.epub3_nav_parsed = root opts.epub3_nav_parsed = root
if getattr(self, 'removed_cover', None): if getattr(self, 'removed_cover', None):
changed = False changed = False
base_path = os.path.dirname(nav_path) base_path = os.path.dirname(nav_path)
for elem in root.xpath('//*[@href]'): for elem in root.xpath('//*[@href]'):
href, frag = elem.get('href').partition('#')[::2] href, frag = elem.get('href').partition('#')[::2]
link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path) link_path = os.path.relpath(os.path.join(base_path,
urlunquote(href)),
base_path)
abs_href = urlnormalize(link_path) abs_href = urlnormalize(link_path)
if abs_href == self.removed_cover: if abs_href == self.removed_cover:
changed = True changed = True
@@ -2,7 +2,11 @@ import os
import re import re
import shutil import shutil
import urllib.parse import urllib.parse
import uuid
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.customize.conversion import OutputFormatPlugin from ebook_converter.customize.conversion import OutputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation from ebook_converter.customize.conversion import OptionRecommendation
@@ -132,39 +136,37 @@ class EPUBOutput(OutputFormatPlugin):
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)} recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
def workaround_webkit_quirks(self): # {{{ def workaround_webkit_quirks(self): # {{{
from ebook_converter.ebooks.oeb.base import XPath
for x in self.oeb.spine: for x in self.oeb.spine:
root = x.data root = x.data
body = XPath('//h:body')(root) body = base.XPath('//h:body')(root)
if body: if body:
body = body[0] body = body[0]
if not hasattr(body, 'xpath'): if not hasattr(body, 'xpath'):
continue continue
for pre in XPath('//h:pre')(body): for pre in base.XPath('//h:pre')(body):
if not pre.text and len(pre) == 0: if not pre.text and len(pre) == 0:
pre.tag = 'div' pre.tag = 'div'
# }}} # }}}
def upshift_markup(self): # {{{ def upshift_markup(self): # {{{
'Upgrade markup to comply with XHTML 1.1 where possible' 'Upgrade markup to comply with XHTML 1.1 where possible'
from ebook_converter.ebooks.oeb.base import XPath, XML
for x in self.oeb.spine: for x in self.oeb.spine:
root = x.data root = x.data
if (not root.get(XML('lang'))) and (root.get('lang')): if (not root.get(base.tag('xml', 'lang'))) and (root.get('lang')):
root.set(XML('lang'), root.get('lang')) root.set(base.tag('xml', 'lang'), root.get('lang'))
body = XPath('//h:body')(root) body = base.XPath('//h:body')(root)
if body: if body:
body = body[0] body = body[0]
if not hasattr(body, 'xpath'): if not hasattr(body, 'xpath'):
continue continue
for u in XPath('//h:u')(root): for u in base.XPath('//h:u')(root):
u.tag = 'span' u.tag = 'span'
seen_ids, seen_names = set(), set() seen_ids, seen_names = set(), set()
for x in XPath('//*[@id or @name]')(root): for x in base.XPath('//*[@id or @name]')(root):
eid, name = x.get('id', None), x.get('name', None) eid, name = x.get('id', None), x.get('name', None)
if eid: if eid:
if eid in seen_ids: if eid in seen_ids:
@@ -223,28 +225,27 @@ class EPUBOutput(OutputFormatPlugin):
first = next(iter(self.oeb.spine)) first = next(iter(self.oeb.spine))
self.oeb.toc.add('Start', first.href) self.oeb.toc.add('Start', first.href)
from ebook_converter.ebooks.oeb.base import OPF
identifiers = oeb.metadata['identifier'] identifiers = oeb.metadata['identifier']
uuid = None _uuid = None
for x in identifiers: for x in identifiers:
if x.get(OPF('scheme'), None).lower() == 'uuid' or str(x).startswith('urn:uuid:'): if (x.get(base.tag('opf', 'scheme'), None).lower() == 'uuid' or
uuid = str(x).split(':')[-1] str(x).startswith('urn:uuid:')):
_uuid = str(x).split(':')[-1]
break break
encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', []) encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
if uuid is None: if _uuid is None:
self.log.warn('No UUID identifier found') self.log.warn('No UUID identifier found')
from uuid import uuid4 _uuid = str(uuid.uuid4())
uuid = str(uuid4()) oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid)
oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)
if encrypted_fonts and not uuid.startswith('urn:uuid:'): if encrypted_fonts and not _uuid.startswith('urn:uuid:'):
# Apparently ADE requires this value to start with urn:uuid: # Apparently ADE requires this value to start with urn:uuid:
# for some absurd reason, or it will throw a hissy fit and refuse # for some absurd reason, or it will throw a hissy fit and refuse
# to use the obfuscated fonts. # to use the obfuscated fonts.
for x in identifiers: for x in identifiers:
if str(x) == uuid: if str(x) == _uuid:
x.content = 'urn:uuid:'+uuid x.content = 'urn:uuid:' + _uuid
with TemporaryDirectory('_epub_output') as tdir: with TemporaryDirectory('_epub_output') as tdir:
from ebook_converter.customize.ui import plugin_for_output_format from ebook_converter.customize.ui import plugin_for_output_format
@@ -264,7 +265,7 @@ class EPUBOutput(OutputFormatPlugin):
self.upgrade_to_epub3(tdir, opf) self.upgrade_to_epub3(tdir, opf)
encryption = None encryption = None
if encrypted_fonts: if encrypted_fonts:
encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid) encryption = self.encrypt_fonts(encrypted_fonts, tdir, _uuid)
from ebook_converter.ebooks.epub import initialize_container from ebook_converter.ebooks.epub import initialize_container
with initialize_container(output_path, os.path.basename(opf), with initialize_container(output_path, os.path.basename(opf),
@@ -312,12 +313,12 @@ class EPUBOutput(OutputFormatPlugin):
except EnvironmentError: except EnvironmentError:
pass pass
def encrypt_fonts(self, uris, tdir, uuid): # {{{ def encrypt_fonts(self, uris, tdir, _uuid): # {{{
from ebook_converter.polyglot.binary import from_hex_bytes from ebook_converter.polyglot.binary import from_hex_bytes
key = re.sub(r'[^a-fA-F0-9]', '', uuid) key = re.sub(r'[^a-fA-F0-9]', '', _uuid)
if len(key) < 16: if len(key) < 16:
raise ValueError('UUID identifier %r is invalid'%uuid) raise ValueError('UUID identifier %r is invalid'% _uuid)
key = bytearray(from_hex_bytes((key + key)[:32])) key = bytearray(from_hex_bytes((key + key)[:32]))
paths = [] paths = []
with CurrentDir(tdir): with CurrentDir(tdir):
@@ -335,7 +336,8 @@ class EPUBOutput(OutputFormatPlugin):
if len(data) >= 1024: if len(data) >= 1024:
data = bytearray(data) data = bytearray(data)
f.seek(0) f.seek(0)
f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024)))) f.write(bytes(bytearray(data[i] ^ key[i%16]
for i in range(1024))))
else: else:
self.log.warn('Font', path, 'is invalid, ignoring') self.log.warn('Font', path, 'is invalid, ignoring')
if not isinstance(uri, str): if not isinstance(uri, str):
@@ -374,11 +376,10 @@ class EPUBOutput(OutputFormatPlugin):
# }}} # }}}
def workaround_ade_quirks(self): # {{{ def workaround_ade_quirks(self): # {{{
''' """
Perform various markup transforms to get the output to render correctly Perform various markup transforms to get the output to render correctly
in the quirky ADE. in the quirky ADE.
''' """
from ebook_converter.ebooks.oeb.base import XPath, XHTML, barename, urlunquote
stylesheet = self.oeb.manifest.main_stylesheet stylesheet = self.oeb.manifest.main_stylesheet
@@ -388,23 +389,23 @@ class EPUBOutput(OutputFormatPlugin):
for node in self.oeb.toc.iter(): for node in self.oeb.toc.iter():
href = getattr(node, 'href', None) href = getattr(node, 'href', None)
if hasattr(href, 'partition'): if hasattr(href, 'partition'):
base, _, frag = href.partition('#') _base, _, frag = href.partition('#')
frag = urlunquote(frag) frag = base.urlunquote(frag)
if frag and frag_pat.match(frag) is None: if frag and frag_pat.match(frag) is None:
self.log.warn( self.log.warn(
'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag) 'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
node.href = base node.href = _base
for x in self.oeb.spine: for x in self.oeb.spine:
root = x.data root = x.data
body = XPath('//h:body')(root) body = base.XPath('//h:body')(root)
if body: if body:
body = body[0] body = body[0]
if hasattr(body, 'xpath'): if hasattr(body, 'xpath'):
# remove <img> tags with empty src elements # remove <img> tags with empty src elements
bad = [] bad = []
for x in XPath('//h:img')(body): for x in base.XPath('//h:img')(body):
src = x.get('src', '').strip() src = x.get('src', '').strip()
if src in ('', '#') or src.startswith('http:'): if src in ('', '#') or src.startswith('http:'):
bad.append(x) bad.append(x)
@@ -412,7 +413,7 @@ class EPUBOutput(OutputFormatPlugin):
img.getparent().remove(img) img.getparent().remove(img)
# Add id attribute to <a> tags that have name # Add id attribute to <a> tags that have name
for x in XPath('//h:a[@name]')(body): for x in base.XPath('//h:a[@name]')(body):
if not x.get('id', False): if not x.get('id', False):
x.set('id', x.get('name')) x.set('id', x.get('name'))
# The delightful epubcheck has started complaining about <a> tags that # The delightful epubcheck has started complaining about <a> tags that
@@ -420,19 +421,19 @@ class EPUBOutput(OutputFormatPlugin):
x.attrib.pop('name') x.attrib.pop('name')
# Replace <br> that are children of <body> as ADE doesn't handle them # Replace <br> that are children of <body> as ADE doesn't handle them
for br in XPath('./h:br')(body): for br in base.XPath('./h:br')(body):
if br.getparent() is None: if br.getparent() is None:
continue continue
try: try:
prior = next(br.itersiblings(preceding=True)) prior = next(br.itersiblings(preceding=True))
priortag = barename(prior.tag) priortag = parse_utils.barename(prior.tag)
priortext = prior.tail priortext = prior.tail
except: except:
priortag = 'body' priortag = 'body'
priortext = body.text priortext = body.text
if priortext: if priortext:
priortext = priortext.strip() priortext = priortext.strip()
br.tag = XHTML('p') br.tag = base.tag('xhtml', 'p')
br.text = '\u00a0' br.text = '\u00a0'
style = br.get('style', '').split(';') style = br.get('style', '').split(';')
style = list(filter(None, map(lambda x: x.strip(), style))) style = list(filter(None, map(lambda x: x.strip(), style)))
@@ -446,44 +447,44 @@ class EPUBOutput(OutputFormatPlugin):
style.append('height:0pt') style.append('height:0pt')
br.set('style', '; '.join(style)) br.set('style', '; '.join(style))
for tag in XPath('//h:embed')(root): for tag in base.XPath('//h:embed')(root):
tag.getparent().remove(tag) tag.getparent().remove(tag)
for tag in XPath('//h:object')(root): for tag in base.XPath('//h:object')(root):
if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}: if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
continue continue
tag.getparent().remove(tag) tag.getparent().remove(tag)
for tag in XPath('//h:title|//h:style')(root): for tag in base.XPath('//h:title|//h:style')(root):
if not tag.text: if not tag.text:
tag.getparent().remove(tag) tag.getparent().remove(tag)
for tag in XPath('//h:script')(root): for tag in base.XPath('//h:script')(root):
if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'): if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
tag.getparent().remove(tag) tag.getparent().remove(tag)
for tag in XPath('//h:body/descendant::h:script')(root): for tag in base.XPath('//h:body/descendant::h:script')(root):
tag.getparent().remove(tag) tag.getparent().remove(tag)
formchildren = XPath('./h:input|./h:button|./h:textarea|' formchildren = base.XPath('./h:input|./h:button|./h:textarea|'
'./h:label|./h:fieldset|./h:legend') './h:label|./h:fieldset|./h:legend')
for tag in XPath('//h:form')(root): for tag in base.XPath('//h:form')(root):
if formchildren(tag): if formchildren(tag):
tag.getparent().remove(tag) tag.getparent().remove(tag)
else: else:
# Not a real form # Not a real form
tag.tag = XHTML('div') tag.tag = base.tag('xhtml', 'div')
for tag in XPath('//h:center')(root): for tag in base.XPath('//h:center')(root):
tag.tag = XHTML('div') tag.tag = base.tag('xhtml', 'div')
tag.set('style', 'text-align:center') tag.set('style', 'text-align:center')
# ADE can't handle &amp; in an img url # ADE can't handle &amp; in an img url
for tag in XPath('//h:img[@src]')(root): for tag in base.XPath('//h:img[@src]')(root):
tag.set('src', tag.get('src', '').replace('&', '')) tag.set('src', tag.get('src', '').replace('&', ''))
# ADE whimpers in fright when it encounters a <td> outside a # ADE whimpers in fright when it encounters a <td> outside a
# <table> # <table>
in_table = XPath('ancestor::h:table') in_table = base.XPath('ancestor::h:table')
for tag in XPath('//h:td|//h:tr|//h:th')(root): for tag in base.XPath('//h:td|//h:tr|//h:th')(root):
if not in_table(tag): if not in_table(tag):
tag.tag = XHTML('div') tag.tag = base.tag('xhtml', 'div')
# ADE fails to render non breaking hyphens/soft hyphens/zero width spaces # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
special_chars = re.compile('[\u200b\u00ad]') special_chars = re.compile('[\u200b\u00ad]')
@@ -498,7 +499,7 @@ class EPUBOutput(OutputFormatPlugin):
if stylesheet is not None: if stylesheet is not None:
# ADE doesn't render lists correctly if they have left margins # ADE doesn't render lists correctly if they have left margins
from css_parser.css import CSSRule from css_parser.css import CSSRule
for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root): for lb in base.XPath('//h:ul[@class]|//h:ol[@class]')(root):
sel = '.'+lb.get('class') sel = '.'+lb.get('class')
for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE): for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
if sel == rule.selectorList.selectorText: if sel == rule.selectorList.selectorText:
@@ -519,11 +520,10 @@ class EPUBOutput(OutputFormatPlugin):
''' '''
Perform toc link transforms to alleviate slow loading. Perform toc link transforms to alleviate slow loading.
''' '''
from ebook_converter.ebooks.oeb.base import XPath
from ebook_converter.ebooks.oeb.polish.toc import item_at_top from ebook_converter.ebooks.oeb.polish.toc import item_at_top
def frag_is_at_top(root, frag): def frag_is_at_top(root, frag):
elem = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root) elem = base.XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
if elem: if elem:
elem = elem[0] elem = elem[0]
else: else:
@@ -1,59 +1,57 @@
""" """
Convert .fb2 files to .lrf Convert .fb2 files to .lrf
""" """
import os, re import os
import pkg_resources import pkg_resources
import re
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation from lxml import etree
from ebook_converter import constants as const
from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
from ebook_converter import guess_type from ebook_converter import guess_type
__license__ = 'GPL v3' FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1' FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1'
class FB2Input(InputFormatPlugin): class FB2Input(InputFormatPlugin):
name = 'FB2 Input' name = 'FB2 Input'
author = 'Anatoly Shipitsin' author = 'Anatoly Shipitsin'
description = 'Convert FB2 and FBZ files to HTML' description = 'Convert FB2 and FBZ files to HTML'
file_types = {'fb2', 'fbz'} file_types = {'fb2', 'fbz'}
commit_name = 'fb2_input' commit_name = 'fb2_input'
recommendations = { recommendations = {('level1_toc', '//h:h1', OptionRecommendation.MED),
('level1_toc', '//h:h1', OptionRecommendation.MED), ('level2_toc', '//h:h2', OptionRecommendation.MED),
('level2_toc', '//h:h2', OptionRecommendation.MED), ('level3_toc', '//h:h3', OptionRecommendation.MED)}
('level3_toc', '//h:h3', OptionRecommendation.MED),
}
options = { options = {OptionRecommendation(name='no_inline_fb2_toc',
OptionRecommendation(name='no_inline_fb2_toc', recommended_value=False,
recommended_value=False, level=OptionRecommendation.LOW, level=OptionRecommendation.LOW,
help='Do not insert a Table of Contents at the beginning of the book.' help='Do not insert a Table of Contents '
)} 'at the beginning of the book.')}
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
from lxml import etree from ebook_converter.ebooks.metadata.fb2 import ensure_namespace
from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.ebooks.metadata.fb2 import get_fb2_data
from ebook_converter.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.metadata.opf2 import OPFCreator
from ebook_converter.ebooks.metadata.meta import get_metadata from ebook_converter.ebooks.metadata.meta import get_metadata
from ebook_converter.ebooks.oeb.base import XLINK_NS, XHTML_NS
from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.chardet import xml_to_unicode
self.log = log self.log = log
log.debug('Parsing XML...') log.debug('Parsing XML...')
raw = get_fb2_data(stream)[0] raw = get_fb2_data(stream)[0]
raw = raw.replace(b'\0', b'') raw = raw.replace(b'\0', b'')
raw = xml_to_unicode(raw, strip_encoding_pats=True, raw = xml_to_unicode(raw, strip_encoding_pats=True,
assume_utf8=True, resolve_entities=True)[0] assume_utf8=True, resolve_entities=True)[0]
try: try:
doc = safe_xml_fromstring(raw) doc = etree.fromstring(raw)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
doc = safe_xml_fromstring(raw.replace('& ', '&amp;')) doc = etree.fromstring(raw.replace('& ', '&amp;'))
if doc is None: if doc is None:
raise ValueError('The FB2 file is not valid XML') raise ValueError('The FB2 file is not valid XML')
doc = ensure_namespace(doc) doc = ensure_namespace(doc)
@@ -62,22 +60,24 @@ class FB2Input(InputFormatPlugin):
except Exception: except Exception:
fb_ns = FB2NS fb_ns = FB2NS
NAMESPACES = {'f':fb_ns, 'l':XLINK_NS} NAMESPACES = {'f': fb_ns, 'l': const.XLINK_NS}
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]') stylesheets = doc.xpath('//*[local-name() = "stylesheet" and '
'@type="text/css"]')
css = '' css = ''
for s in stylesheets: for s in stylesheets:
css += etree.tostring(s, encoding='unicode', method='text', css += etree.tostring(s, encoding='unicode', method='text',
with_tail=False) + '\n\n' with_tail=False) + '\n\n'
if css: if css:
import css_parser, logging import css_parser
import logging
parser = css_parser.CSSParser(fetcher=None, parser = css_parser.CSSParser(fetcher=None,
log=logging.getLogger('calibre.css')) log=logging.getLogger('calibre.css'))
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % const.XHTML_NS
text = XHTML_CSS_NAMESPACE + css text = XHTML_CSS_NAMESPACE + css
log.debug('Parsing stylesheet...') log.debug('Parsing stylesheet...')
stylesheet = parser.parseString(text) stylesheet = parser.parseString(text)
stylesheet.namespaces['h'] = XHTML_NS stylesheet.namespaces['h'] = const.XHTML_NS
css = stylesheet.cssText css = stylesheet.cssText
if isinstance(css, bytes): if isinstance(css, bytes):
css = css.decode('utf-8', 'replace') css = css.decode('utf-8', 'replace')
@@ -92,16 +92,20 @@ class FB2Input(InputFormatPlugin):
if options.no_inline_fb2_toc: if options.no_inline_fb2_toc:
log('Disabling generation of inline FB2 TOC') log('Disabling generation of inline FB2 TOC')
ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->', ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
re.DOTALL).sub('', ss) re.DOTALL).sub('', ss)
styledoc = safe_xml_fromstring(ss) styledoc = etree.fromstring(ss)
transform = etree.XSLT(styledoc) transform = etree.XSLT(styledoc)
result = transform(doc) result = transform(doc)
# Handle links of type note and cite # Handle links of type note and cite
notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')} notes = {a.get('href')[1:]: a
cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')} for a in result.xpath('//a[@link_note and @href]')
if a.get('href').startswith('#')}
cites = {a.get('link_cite'): a
for a in result.xpath('//a[@link_cite]')
if not a.get('href', '')}
all_ids = {x for x in result.xpath('//*/@id')} all_ids = {x for x in result.xpath('//*/@id')}
for cite, a in cites.items(): for cite, a in cites.items():
note = notes.get(cite, None) note = notes.get(cite, None)
@@ -137,8 +141,10 @@ class FB2Input(InputFormatPlugin):
f.write(mi.cover_data[1]) f.write(mi.cover_data[1])
cpath = os.path.abspath('fb2_cover_calibre_mi.jpg') cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
else: else:
for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES): for img in doc.xpath('//f:coverpage/f:image',
href = img.get('{%s}href'%XLINK_NS, img.get('href', None)) namespaces=NAMESPACES):
href = img.get('{%s}href' % const.XLINK_NS,
img.get('href', None))
if href is not None: if href is not None:
if href.startswith('#'): if href.startswith('#'):
href = href[1:] href = href[1:]
@@ -165,15 +171,15 @@ class FB2Input(InputFormatPlugin):
ext = ct.rpartition('/')[-1].lower() ext = ct.rpartition('/')[-1].lower()
if ext in ('png', 'jpeg', 'jpg'): if ext in ('png', 'jpeg', 'jpg'):
if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg', if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
'png'}: 'png'}:
fname += '.' + ext fname += '.' + ext
self.binary_map[elem.get('id')] = fname self.binary_map[elem.get('id')] = fname
raw = elem.text.strip() raw = elem.text.strip()
try: try:
data = base64_decode(raw) data = base64_decode(raw)
except TypeError: except TypeError:
self.log.exception('Binary data with id=%s is corrupted, ignoring'%( self.log.exception('Binary data with id=%s is corrupted, '
elem.get('id'))) 'ignoring' % elem.get('id'))
else: else:
with open(fname, 'wb') as f: with open(fname, 'wb') as f:
f.write(data) f.write(data)
@@ -1,17 +1,17 @@
import copy
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.customize.conversion import InputFormatPlugin from ebook_converter.customize.conversion import InputFormatPlugin
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class LITInput(InputFormatPlugin): class LITInput(InputFormatPlugin):
name = 'LIT Input' name = 'LIT Input'
author = 'Marshall T. Vandegrift' author = 'Marshall T. Vandegrift'
description = 'Convert LIT files to HTML' description = 'Convert LIT files to HTML'
file_types = {'lit'} file_types = {'lit'}
commit_name = 'lit_input' commit_name = 'lit_input'
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
@@ -22,7 +22,7 @@ class LITInput(InputFormatPlugin):
return create_oebbook(log, stream, options, reader=LitReader) return create_oebbook(log, stream, options, reader=LitReader)
def postprocess_book(self, oeb, opts, log): def postprocess_book(self, oeb, opts, log):
from ebook_converter.ebooks.oeb.base import XHTML_NS, XPath, XHTML from ebook_converter.ebooks.oeb.base import XPath, XHTML
for item in oeb.spine: for item in oeb.spine:
root = item.data root = item.data
if not hasattr(root, 'xpath'): if not hasattr(root, 'xpath'):
@@ -37,22 +37,23 @@ class LITInput(InputFormatPlugin):
body = body[0] body = body[0]
if len(body) == 1 and body[0].tag == XHTML('pre'): if len(body) == 1 and body[0].tag == XHTML('pre'):
pre = body[0] pre = body[0]
from ebook_converter.ebooks.txt.processor import convert_basic, \ from ebook_converter.ebooks.txt.processor import \
separate_paragraphs_single_line convert_basic, separate_paragraphs_single_line
from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.utils.xml_parse import safe_xml_fromstring self.log('LIT file with all text in singe <pre> tag '
import copy 'detected')
self.log('LIT file with all text in singe <pre> tag detected')
html = separate_paragraphs_single_line(pre.text) html = separate_paragraphs_single_line(pre.text)
html = convert_basic(html).replace('<html>', html = convert_basic(html).replace('<html>',
'<html xmlns="%s">'%XHTML_NS) '<html xmlns="%s">' %
const.XHTML_NS)
html = xml_to_unicode(html, strip_encoding_pats=True, html = xml_to_unicode(html, strip_encoding_pats=True,
resolve_entities=True)[0] resolve_entities=True)[0]
if opts.smarten_punctuation: if opts.smarten_punctuation:
# SmartyPants skips text inside <pre> tags # SmartyPants skips text inside <pre> tags
from ebook_converter.ebooks.conversion.preprocess import smarten_punctuation from ebook_converter.ebooks.conversion import \
html = smarten_punctuation(html, self.log) preprocess
root = safe_xml_fromstring(html) html = preprocess.smarten_punctuation(html, self.log)
root = etree.fromstring(html)
body = XPath('//h:body')(root) body = XPath('//h:body')(root)
pre.tag = XHTML('div') pre.tag = XHTML('div')
pre.text = '' pre.text = ''
@@ -1,54 +1,52 @@
import os, sys import os
import sys
import pkg_resources import pkg_resources
from lxml import etree
from ebook_converter.customize.conversion import InputFormatPlugin from ebook_converter.customize.conversion import InputFormatPlugin
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class LRFInput(InputFormatPlugin): class LRFInput(InputFormatPlugin):
name = 'LRF Input' name = 'LRF Input'
author = 'Kovid Goyal' author = 'Kovid Goyal'
description = 'Convert LRF files to HTML' description = 'Convert LRF files to HTML'
file_types = {'lrf'} file_types = {'lrf'}
commit_name = 'lrf_input' commit_name = 'lrf_input'
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
from ebook_converter.ebooks.lrf.input import (MediaType, Styles, TextBlock, from ebook_converter.ebooks.lrf.input import MediaType, Styles, \
Canvas, ImageBlock, RuledLine) TextBlock, Canvas, ImageBlock, RuledLine
self.log = log self.log = log
self.log('Generating XML') self.log('Generating XML')
from ebook_converter.ebooks.lrf.lrfparser import LRFDocument from ebook_converter.ebooks.lrf.lrfparser import LRFDocument
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from lxml import etree
d = LRFDocument(stream) d = LRFDocument(stream)
d.parse() d.parse()
xml = d.to_xml(write_files=True) xml = d.to_xml(write_files=True)
if options.verbose > 2: if options.verbose > 2:
open(u'lrs.xml', 'wb').write(xml.encode('utf-8')) open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
doc = safe_xml_fromstring(xml) doc = etree.fromstring(xml)
char_button_map = {} char_button_map = {}
for x in doc.xpath('//CharButton[@refobj]'): for x in doc.xpath('//CharButton[@refobj]'):
ro = x.get('refobj') ro = x.get('refobj')
jump_button = doc.xpath('//*[@objid="%s"]'%ro) jump_button = doc.xpath('//*[@objid="%s"]' % ro)
if jump_button: if jump_button:
jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]') jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage '
'and @refobj]')
if jump_to: if jump_to:
char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'), char_button_map[ro] = ('%s.xhtml#%s' %
jump_to[0].get('refobj')) (jump_to[0].get('refpage'),
jump_to[0].get('refobj')))
plot_map = {} plot_map = {}
for x in doc.xpath('//Plot[@refobj]'): for x in doc.xpath('//Plot[@refobj]'):
ro = x.get('refobj') ro = x.get('refobj')
image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro) image = doc.xpath('//Image[@objid="%s" and @refstream]' % ro)
if image: if image:
imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'% imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]' %
image[0].get('refstream')) image[0].get('refstream'))
if imgstr: if imgstr:
plot_map[ro] = imgstr[0].get('file') plot_map[ro] = imgstr[0].get('file')
@@ -58,21 +56,19 @@ class LRFInput(InputFormatPlugin):
resource_filename('ebook_converter', resource_filename('ebook_converter',
'data/lrf.xsl')) as fobj: 'data/lrf.xsl')) as fobj:
# TODO(gryf): change this nonsense to etree.parse() instead. # TODO(gryf): change this nonsense to etree.parse() instead.
styledoc = safe_xml_fromstring(fobj.read()) styledoc = etree.fromstring(fobj.read())
media_type = MediaType() media_type = MediaType()
styles = Styles() styles = Styles()
text_block = TextBlock(styles, char_button_map, plot_map, log) text_block = TextBlock(styles, char_button_map, plot_map, log)
canvas = Canvas(doc, styles, text_block, log) canvas = Canvas(doc, styles, text_block, log)
image_block = ImageBlock(canvas) image_block = ImageBlock(canvas)
ruled_line = RuledLine() ruled_line = RuledLine()
extensions = { extensions = {('calibre', 'media-type'): media_type,
('calibre', 'media-type') : media_type, ('calibre', 'text-block'): text_block,
('calibre', 'text-block') : text_block, ('calibre', 'ruled-line'): ruled_line,
('calibre', 'ruled-line') : ruled_line, ('calibre', 'styles'): styles,
('calibre', 'styles') : styles, ('calibre', 'canvas'): canvas,
('calibre', 'canvas') : canvas, ('calibre', 'image-block'): image_block}
('calibre', 'image-block'): image_block,
}
transform = etree.XSLT(styledoc, extensions=extensions) transform = etree.XSLT(styledoc, extensions=extensions)
try: try:
result = transform(doc) result = transform(doc)
@@ -1,57 +1,58 @@
import os, glob, re, textwrap import glob
import os
import pkg_resources import pkg_resources
import re
import textwrap
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation from lxml import etree
from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
from ebook_converter.polyglot.builtins import as_bytes from ebook_converter.polyglot.builtins import as_bytes
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
border_style_map = { border_style_map = {'single': 'solid',
'single' : 'solid', 'double-thickness-border': 'double',
'double-thickness-border' : 'double', 'shadowed-border': 'outset',
'shadowed-border': 'outset', 'double-border': 'double',
'double-border': 'double', 'dotted-border': 'dotted',
'dotted-border': 'dotted', 'dashed': 'dashed',
'dashed': 'dashed', 'hairline': 'solid',
'hairline': 'solid', 'inset': 'inset',
'inset': 'inset', 'dash-small': 'dashed',
'dash-small': 'dashed', 'dot-dash': 'dotted',
'dot-dash': 'dotted', 'dot-dot-dash': 'dotted',
'dot-dot-dash': 'dotted', 'outset': 'outset',
'outset': 'outset', 'tripple': 'double',
'tripple': 'double', 'triple': 'double',
'triple': 'double', 'thick-thin-small': 'solid',
'thick-thin-small': 'solid', 'thin-thick-small': 'solid',
'thin-thick-small': 'solid', 'thin-thick-thin-small': 'solid',
'thin-thick-thin-small': 'solid', 'thick-thin-medium': 'solid',
'thick-thin-medium': 'solid', 'thin-thick-medium': 'solid',
'thin-thick-medium': 'solid', 'thin-thick-thin-medium': 'solid',
'thin-thick-thin-medium': 'solid', 'thick-thin-large': 'solid',
'thick-thin-large': 'solid', 'thin-thick-thin-large': 'solid',
'thin-thick-thin-large': 'solid', 'wavy': 'ridge',
'wavy': 'ridge', 'double-wavy': 'ridge',
'double-wavy': 'ridge', 'striped': 'ridge',
'striped': 'ridge', 'emboss': 'inset',
'emboss': 'inset', 'engrave': 'inset',
'engrave': 'inset', 'frame': 'ridge'}
'frame': 'ridge',
}
class RTFInput(InputFormatPlugin): class RTFInput(InputFormatPlugin):
name = 'RTF Input' name = 'RTF Input'
author = 'Kovid Goyal' author = 'Kovid Goyal'
description = 'Convert RTF files to HTML' description = 'Convert RTF files to HTML'
file_types = {'rtf'} file_types = {'rtf'}
commit_name = 'rtf_input' commit_name = 'rtf_input'
options = { options = {OptionRecommendation(name='ignore_wmf', recommended_value=False,
OptionRecommendation(name='ignore_wmf', recommended_value=False, help='Ignore WMF images instead of '
help='Ignore WMF images instead of replacing them with a ' 'replacing them with a placeholder '
'placeholder image.'), 'image.')}
}
def generate_xml(self, stream): def generate_xml(self, stream):
from ebook_converter.ebooks.rtf2xml.ParseRtf import ParseRtf from ebook_converter.ebooks.rtf2xml.ParseRtf import ParseRtf
@@ -64,7 +65,7 @@ class RTFInput(InputFormatPlugin):
run_lev = 4 run_lev = 4
indent_out = 1 indent_out = 1
self.log('Running RTFParser in debug mode') self.log('Running RTFParser in debug mode')
except: except Exception:
self.log.warn('Impossible to run RTFParser in debug mode') self.log.warn('Impossible to run RTFParser in debug mode')
parser = ParseRtf( parser = ParseRtf(
in_file=stream, in_file=stream,
@@ -108,7 +109,8 @@ class RTFInput(InputFormatPlugin):
deb_dir=debug_dir, deb_dir=debug_dir,
# Default encoding # Default encoding
default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252', default_encoding=getattr(self.opts, 'input_encoding',
'cp1252') or 'cp1252',
# Run level # Run level
run_level=run_lev, run_level=run_lev,
@@ -151,7 +153,7 @@ class RTFInput(InputFormatPlugin):
for count, val in imap.items(): for count, val in imap.items():
try: try:
imap[count] = self.convert_image(val) imap[count] = self.convert_image(val)
except: except Exception:
self.log.exception('Failed to convert', val) self.log.exception('Failed to convert', val)
return imap return imap
@@ -161,7 +163,7 @@ class RTFInput(InputFormatPlugin):
try: try:
return self.rasterize_wmf(name) return self.rasterize_wmf(name)
except Exception: except Exception:
self.log.exception('Failed to convert WMF image %r'%name) self.log.exception('Failed to convert WMF image %r' % name)
return self.replace_wmf(name) return self.replace_wmf(name)
def replace_wmf(self, name): def replace_wmf(self, name):
@@ -170,9 +172,11 @@ class RTFInput(InputFormatPlugin):
return '__REMOVE_ME__' return '__REMOVE_ME__'
from ebook_converter.ebooks.covers import message_image from ebook_converter.ebooks.covers import message_image
if self.default_img is None: if self.default_img is None:
self.default_img = message_image('Conversion of WMF images is not supported.' self.default_img = message_image('Conversion of WMF images is not '
' Use Microsoft Word or OpenOffice to save this RTF file' 'supported. Use Microsoft Word '
' as HTML and convert that in calibre.') 'or OpenOffice to save this RTF '
'file as HTML and convert that '
'in calibre.')
name = name.replace('.wmf', '.jpg') name = name.replace('.wmf', '.jpg')
with open(name, 'wb') as f: with open(name, 'wb') as f:
f.write(self.default_img) f.write(self.default_img)
@@ -189,10 +193,10 @@ class RTFInput(InputFormatPlugin):
return name return name
def write_inline_css(self, ic, border_styles): def write_inline_css(self, ic, border_styles):
font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in font_size_classes = ['span.fs%d { font-size: %spt }' % (i, x)
enumerate(ic.font_sizes)] for i, x in enumerate(ic.font_sizes)]
color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in color_classes = ['span.col%d { color: %s }' % (i, x)
enumerate(ic.colors) if x != 'false'] for i, x in enumerate(ic.colors) if x != 'false']
css = textwrap.dedent(''' css = textwrap.dedent('''
span.none { span.none {
text-decoration: none; font-weight: normal; text-decoration: none; font-weight: normal;
@@ -210,11 +214,11 @@ class RTFInput(InputFormatPlugin):
span.strike-through { text-decoration: line-through } span.strike-through { text-decoration: line-through }
''') ''')
css += '\n'+'\n'.join(font_size_classes) css += '\n' + '\n'.join(font_size_classes)
css += '\n' +'\n'.join(color_classes) css += '\n' + '\n'.join(color_classes)
for cls, val in border_styles.items(): for cls, val in border_styles.items():
css += '\n\n.%s {\n%s\n}'%(cls, val) css += '\n\n.%s {\n%s\n}' % (cls, val)
with open(u'styles.css', 'ab') as f: with open(u'styles.css', 'ab') as f:
f.write(css.encode('utf-8')) f.write(css.encode('utf-8'))
@@ -224,35 +228,34 @@ class RTFInput(InputFormatPlugin):
style_map = {} style_map = {}
for elem in doc.xpath(r'//*[local-name()="cell"]'): for elem in doc.xpath(r'//*[local-name()="cell"]'):
style = ['border-style: hidden', 'border-width: 1px', style = ['border-style: hidden', 'border-width: 1px',
'border-color: black'] 'border-color: black']
for x in ('bottom', 'top', 'left', 'right'): for x in ('bottom', 'top', 'left', 'right'):
bs = elem.get('border-cell-%s-style'%x, None) bs = elem.get('border-cell-%s-style' % x, None)
if bs: if bs:
cbs = border_style_map.get(bs, 'solid') cbs = border_style_map.get(bs, 'solid')
style.append('border-%s-style: %s'%(x, cbs)) style.append('border-%s-style: %s' % (x, cbs))
bw = elem.get('border-cell-%s-line-width'%x, None) bw = elem.get('border-cell-%s-line-width' % x, None)
if bw: if bw:
style.append('border-%s-width: %spt'%(x, bw)) style.append('border-%s-width: %spt' % (x, bw))
bc = elem.get('border-cell-%s-color'%x, None) bc = elem.get('border-cell-%s-color' % x, None)
if bc: if bc:
style.append('border-%s-color: %s'%(x, bc)) style.append('border-%s-color: %s' % (x, bc))
style = ';\n'.join(style) style = ';\n'.join(style)
if style not in border_styles: if style not in border_styles:
border_styles.append(style) border_styles.append(style)
idx = border_styles.index(style) idx = border_styles.index(style)
cls = 'border_style%d'%idx cls = 'border_style%d' % idx
style_map[cls] = style style_map[cls] = style
elem.set('class', cls) elem.set('class', cls)
return style_map return style_map
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
from lxml import etree
from ebook_converter.ebooks.metadata.meta import get_metadata from ebook_converter.ebooks.metadata.meta import get_metadata
from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.metadata.opf2 import OPFCreator
from ebook_converter.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException from ebook_converter.ebooks.rtf2xml.ParseRtf import \
RtfInvalidCodeException
from ebook_converter.ebooks.rtf.input import InlineClass from ebook_converter.ebooks.rtf.input import InlineClass
from ebook_converter.utils.xml_parse import safe_xml_fromstring
self.opts = options self.opts = options
self.log = log self.log = log
self.log('Converting RTF to XML...') self.log('Converting RTF to XML...')
@@ -269,14 +272,15 @@ class RTFInput(InputFormatPlugin):
imap = {} imap = {}
try: try:
imap = self.extract_images(d[0]) imap = self.extract_images(d[0])
except: except Exception:
self.log.exception('Failed to extract images...') self.log.exception('Failed to extract images...')
self.log('Parsing XML...') self.log('Parsing XML...')
doc = safe_xml_fromstring(xml) doc = etree.fromstring(xml)
border_styles = self.convert_borders(doc) border_styles = self.convert_borders(doc)
for pict in doc.xpath('//rtf:pict[@num]', for pict in doc.xpath('//rtf:pict[@num]',
namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}): namespaces={'rtf':
'http://rtf2xml.sourceforge.net/'}):
num = int(pict.get('num')) num = int(pict.get('num'))
name = imap.get(num, None) name = imap.get(num, None)
if name is not None: if name is not None:
@@ -286,8 +290,8 @@ class RTFInput(InputFormatPlugin):
inline_class = InlineClass(self.log) inline_class = InlineClass(self.log)
with open(pkg_resources.resource_filename('ebook_converter', with open(pkg_resources.resource_filename('ebook_converter',
'data/rtf.xsl')) as fobj: 'data/rtf.xsl')) as fobj:
styledoc = safe_xml_fromstring(fobj.read()) styledoc = etree.fromstring(fobj.read())
extensions = {('calibre', 'inline-class') : inline_class} extensions = {('calibre', 'inline-class'): inline_class}
transform = etree.XSLT(styledoc, extensions=extensions) transform = etree.XSLT(styledoc, extensions=extensions)
result = transform(doc) result = transform(doc)
html = u'index.xhtml' html = u'index.xhtml'
@@ -296,7 +300,8 @@ class RTFInput(InputFormatPlugin):
# res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
# clean multiple \n # clean multiple \n
res = re.sub(b'\n+', b'\n', res) res = re.sub(b'\n+', b'\n', res)
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines # Replace newlines inserted by the 'empty_paragraphs' option in
# rtf2xml with html blank lines
# res = re.sub('\s*<body>', '<body>', res) # res = re.sub('\s*<body>', '<body>', res)
# res = re.sub('(?<=\n)\n{2}', # res = re.sub('(?<=\n)\n{2}',
# u'<p>\u00a0</p>\n'.encode('utf-8'), res) # u'<p>\u00a0</p>\n'.encode('utf-8'), res)
@@ -316,7 +321,8 @@ class RTFInput(InputFormatPlugin):
def postprocess_book(self, oeb, opts, log): def postprocess_book(self, oeb, opts, log):
for item in oeb.spine: for item in oeb.spine:
for img in item.data.xpath('//*[local-name()="img" and @src="__REMOVE_ME__"]'): for img in item.data.xpath('//*[local-name()="img" and '
'@src="__REMOVE_ME__"]'):
p = img.getparent() p = img.getparent()
idx = p.index(img) idx = p.index(img)
p.remove(img) p.remove(img)
@@ -1,27 +1,33 @@
import os import os
from lxml import etree
from ebook_converter.customize.conversion import InputFormatPlugin from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.ptempfile import TemporaryDirectory from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.utils.filenames import ascii_filename from ebook_converter.utils.filenames import ascii_filename
__license__ = 'GPL 3' HTML_TEMPLATE = ('<html><head><meta http-equiv="Content-Type" '
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>' 'content="text/html; charset=utf-8"/><title>%s</title>'
__docformat__ = 'restructuredtext en' '</head><body>\n%s\n</body></html>')
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
def html_encode(s): def html_encode(s):
return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;').replace('\n', '<br/>').replace(' ', '&nbsp;') # noqa return (s.replace('&', '&amp;')
.replace('<', '&lt;')
.replace('>', '&gt;')
.replace('"', '&quot;')
.replace("'", '&apos;')
.replace('\n', '<br/>')
.replace(' ', '&nbsp;'))
class SNBInput(InputFormatPlugin): class SNBInput(InputFormatPlugin):
name = 'SNB Input' name = 'SNB Input'
author = 'Li Fanxi' author = 'Li Fanxi'
description = 'Convert SNB files to OEB' description = 'Convert SNB files to OEB'
file_types = {'snb'} file_types = {'snb'}
commit_name = 'snb_input' commit_name = 'snb_input'
options = set() options = set()
@@ -32,13 +38,12 @@ class SNBInput(InputFormatPlugin):
from ebook_converter.ebooks.oeb.base import DirContainer from ebook_converter.ebooks.oeb.base import DirContainer
from ebook_converter.ebooks.snb.snbfile import SNBFile from ebook_converter.ebooks.snb.snbfile import SNBFile
from ebook_converter.utils.xml_parse import safe_xml_fromstring
log.debug("Parsing SNB file...") log.debug("Parsing SNB file...")
snbFile = SNBFile() snbFile = SNBFile()
try: try:
snbFile.Parse(stream) snbFile.Parse(stream)
except: except Exception:
raise ValueError("Invalid SNB file") raise ValueError("Invalid SNB file")
if not snbFile.IsValid(): if not snbFile.IsValid():
log.debug("Invalid SNB file") log.debug("Invalid SNB file")
@@ -46,27 +51,28 @@ class SNBInput(InputFormatPlugin):
log.debug("Handle meta data ...") log.debug("Handle meta data ...")
from ebook_converter.ebooks.conversion.plumber import create_oebbook from ebook_converter.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, None, options, oeb = create_oebbook(log, None, options,
encoding=options.input_encoding, populate=False) encoding=options.input_encoding, populate=False)
meta = snbFile.GetFileStream('snbf/book.snbf') meta = snbFile.GetFileStream('snbf/book.snbf')
if meta is not None: if meta is not None:
meta = safe_xml_fromstring(meta) meta = etree.fromstring(meta)
l = {'title' : './/head/name', item_map = {'title': './/head/name',
'creator' : './/head/author', 'creator': './/head/author',
'language' : './/head/language', 'language': './/head/language',
'generator': './/head/generator', 'generator': './/head/generator',
'publisher': './/head/publisher', 'publisher': './/head/publisher',
'cover' : './/head/cover', } 'cover': './/head/cover'}
d = {} d = {}
for item in l: for key, item in item_map.items():
node = meta.find(l[item]) node = meta.find(item)
if node is not None: if node is not None:
d[item] = node.text if node.text is not None else '' d[key] = node.text if node.text is not None else ''
else: else:
d[item] = '' d[key] = ''
oeb.metadata.add('title', d['title']) oeb.metadata.add('title', d['title'])
oeb.metadata.add('creator', d['creator'], attrib={'role':'aut'}) oeb.metadata.add('creator', d['creator'], attrib={'role': 'aut'})
oeb.metadata.add('language', d['language'].lower().replace('_', '-')) oeb.metadata.add('language',
d['language'].lower().replace('_', '-'))
oeb.metadata.add('generator', d['generator']) oeb.metadata.add('generator', d['generator'])
oeb.metadata.add('publisher', d['publisher']) oeb.metadata.add('publisher', d['publisher'])
if d['cover'] != '': if d['cover'] != '':
@@ -84,7 +90,7 @@ class SNBInput(InputFormatPlugin):
toc = snbFile.GetFileStream('snbf/toc.snbf') toc = snbFile.GetFileStream('snbf/toc.snbf')
oeb.container = DirContainer(tdir, log) oeb.container = DirContainer(tdir, log)
if toc is not None: if toc is not None:
toc = safe_xml_fromstring(toc) toc = etree.fromstring(toc)
i = 1 i = 1
for ch in toc.find('.//body'): for ch in toc.find('.//body'):
chapterName = ch.text chapterName = ch.text
@@ -93,18 +99,22 @@ class SNBInput(InputFormatPlugin):
data = snbFile.GetFileStream('snbc/' + chapterSrc) data = snbFile.GetFileStream('snbc/' + chapterSrc)
if data is None: if data is None:
continue continue
snbc = safe_xml_fromstring(data) snbc = etree.fromstring(data)
lines = [] lines = []
for line in snbc.find('.//body'): for line in snbc.find('.//body'):
if line.tag == 'text': if line.tag == 'text':
lines.append('<p>%s</p>' % html_encode(line.text)) lines.append('<p>%s</p>' % html_encode(line.text))
elif line.tag == 'img': elif line.tag == 'img':
lines.append('<p><img src="%s" /></p>' % html_encode(line.text)) lines.append('<p><img src="%s" /></p>' %
html_encode(line.text))
with open(os.path.join(tdir, fname), 'wb') as f: with open(os.path.join(tdir, fname), 'wb') as f:
f.write((HTML_TEMPLATE % (chapterName, '\n'.join(lines))).encode('utf-8', 'replace')) f.write((HTML_TEMPLATE %
(chapterName,
'\n'.join(lines))).encode('utf-8',
'replace'))
oeb.toc.add(ch.text, fname) oeb.toc.add(ch.text, fname)
id, href = oeb.manifest.generate(id='html', id, href = oeb.manifest.generate(
href=ascii_filename(fname)) id='html', href=ascii_filename(fname))
item = oeb.manifest.add(id, href, 'text/html') item = oeb.manifest.add(id, href, 'text/html')
item.html_input_href = fname item.html_input_href = fname
oeb.spine.add(item, True) oeb.spine.add(item, True)
@@ -112,7 +122,7 @@ class SNBInput(InputFormatPlugin):
imageFiles = snbFile.OutputImageFiles(tdir) imageFiles = snbFile.OutputImageFiles(tdir)
for f, m in imageFiles: for f, m in imageFiles:
id, href = oeb.manifest.generate(id='image', id, href = oeb.manifest.generate(id='image',
href=ascii_filename(f)) href=ascii_filename(f))
item = oeb.manifest.add(id, href, m) item = oeb.manifest.add(id, href, m)
item.html_input_href = f item.html_input_href = f
+54 -41
View File
@@ -1,9 +1,12 @@
import os, sys, shutil import os
import shutil
import sys
from lxml import etree from lxml import etree
from ebook_converter import walk, guess_type from ebook_converter import walk, guess_type
from ebook_converter.ebooks.metadata import string_to_authors, authors_to_sort_string from ebook_converter.ebooks.metadata import authors_to_sort_string
from ebook_converter.ebooks.metadata import string_to_authors
from ebook_converter.ebooks.metadata.book.base import Metadata from ebook_converter.ebooks.metadata.book.base import Metadata
from ebook_converter.ebooks.docx import InvalidDOCX from ebook_converter.ebooks.docx import InvalidDOCX
from ebook_converter.ebooks.docx.names import DOCXNamespace from ebook_converter.ebooks.docx.names import DOCXNamespace
@@ -11,21 +14,11 @@ from ebook_converter.ptempfile import PersistentTemporaryDirectory
from ebook_converter.utils.localization import canonicalize_lang from ebook_converter.utils.localization import canonicalize_lang
from ebook_converter.utils.logging import default_log from ebook_converter.utils.logging import default_log
from ebook_converter.utils.zipfile import ZipFile from ebook_converter.utils.zipfile import ZipFile
from ebook_converter.utils.xml_parse import safe_xml_fromstring
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
def fromstring(raw, parser=None):
return safe_xml_fromstring(raw)
# Read metadata {{{ # Read metadata {{{
def read_doc_props(raw, mi, XPath): def read_doc_props(raw, mi, XPath):
root = fromstring(raw) root = etree.fromstring(raw)
titles = XPath('//dc:title')(root) titles = XPath('//dc:title')(root)
if titles: if titles:
title = titles[0].text title = titles[0].text
@@ -53,29 +46,31 @@ def read_doc_props(raw, mi, XPath):
desc = XPath('//dc:description')(root) desc = XPath('//dc:description')(root)
if desc: if desc:
raw = etree.tostring(desc[0], method='text', encoding='unicode') raw = etree.tostring(desc[0], method='text', encoding='unicode')
raw = raw.replace('_x000d_', '') # Word 2007 mangles newlines in the summary # Word 2007 mangles newlines in the summary
raw = raw.replace('_x000d_', '')
mi.comments = raw.strip() mi.comments = raw.strip()
langs = [] langs = []
for lang in XPath('//dc:language')(root): for lang in XPath('//dc:language')(root):
if lang.text and lang.text.strip(): if lang.text and lang.text.strip():
l = canonicalize_lang(lang.text) canonic_lang = canonicalize_lang(lang.text)
if l: if canonic_lang:
langs.append(l) langs.append(canonic_lang)
if langs: if langs:
mi.languages = langs mi.languages = langs
def read_app_props(raw, mi): def read_app_props(raw, mi):
root = fromstring(raw) root = etree.fromstring(raw)
company = root.xpath('//*[local-name()="Company"]') company = root.xpath('//*[local-name()="Company"]')
if company and company[0].text and company[0].text.strip(): if company and company[0].text and company[0].text.strip():
mi.publisher = company[0].text.strip() mi.publisher = company[0].text.strip()
def read_default_style_language(raw, mi, XPath): def read_default_style_language(raw, mi, XPath):
root = fromstring(raw) root = etree.fromstring(raw)
for lang in XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val')(root): for lang in XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/'
'@w:val')(root):
lang = canonicalize_lang(lang) lang = canonicalize_lang(lang)
if lang: if lang:
mi.languages = [lang] mi.languages = [lang]
@@ -87,7 +82,9 @@ class DOCX(object):
def __init__(self, path_or_stream, log=None, extract=True): def __init__(self, path_or_stream, log=None, extract=True):
self.docx_is_transitional = True self.docx_is_transitional = True
stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb') stream = path_or_stream
if not hasattr(path_or_stream, 'read'):
stream = open(path_or_stream, 'rb')
self.name = getattr(stream, 'name', None) or '<stream>' self.name = getattr(stream, 'name', None) or '<stream>'
self.log = log or default_log self.log = log or default_log
if extract: if extract:
@@ -107,9 +104,9 @@ class DOCX(object):
try: try:
zf = ZipFile(stream) zf = ZipFile(stream)
zf.extractall(self.tdir) zf.extractall(self.tdir)
except: except Exception:
self.log.exception('DOCX appears to be invalid ZIP file, trying a' self.log.exception('DOCX appears to be invalid ZIP file, trying a'
' more forgiving ZIP parser') ' more forgiving ZIP parser')
from ebook_converter.utils.localunzip import extractall from ebook_converter.utils.localunzip import extractall
stream.seek(0) stream.seek(0)
extractall(stream, self.tdir) extractall(stream, self.tdir)
@@ -133,13 +130,17 @@ class DOCX(object):
try: try:
raw = self.read('[Content_Types].xml') raw = self.read('[Content_Types].xml')
except KeyError: except KeyError:
raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name) raise InvalidDOCX('The file %s docx file has no '
root = fromstring(raw) '[Content_Types].xml' % self.name)
root = etree.fromstring(raw)
self.content_types = {} self.content_types = {}
self.default_content_types = {} self.default_content_types = {}
for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'): for item in root.xpath('//*[local-name()="Types"]/*[local-name()='
self.default_content_types[item.get('Extension').lower()] = item.get('ContentType') '"Default" and @Extension and @ContentType]'):
for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'): self.default_content_types[item.get('Extension').lower()] = \
item.get('ContentType')
for item in root.xpath('//*[local-name()="Types"]/*[local-name()='
'"Override" and @PartName and @ContentType]'):
name = item.get('PartName').lstrip('/') name = item.get('PartName').lstrip('/')
self.content_types[name] = item.get('ContentType') self.content_types[name] = item.get('ContentType')
@@ -155,15 +156,19 @@ class DOCX(object):
try: try:
raw = self.read('_rels/.rels') raw = self.read('_rels/.rels')
except KeyError: except KeyError:
raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name) raise InvalidDOCX('The file %s docx file has no _rels/.rels' %
root = fromstring(raw) self.name)
root = etree.fromstring(raw)
self.relationships = {} self.relationships = {}
self.relationships_rmap = {} self.relationships_rmap = {}
for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'): for item in root.xpath('//*[local-name()="Relationships"]/*[local-name'
'()="Relationship" and @Type and @Target]'):
target = item.get('Target').lstrip('/') target = item.get('Target').lstrip('/')
typ = item.get('Type') typ = item.get('Type')
if target == 'word/document.xml': if target == 'word/document.xml':
self.docx_is_transitional = typ != 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument' self.docx_is_transitional = (typ != 'http://purl.oclc.org/'
'ooxml/officeDocument/'
'relationships/officeDocument')
self.relationships[typ] = target self.relationships[typ] = target
self.relationships_rmap[target] = typ self.relationships_rmap[target] = typ
@@ -171,15 +176,17 @@ class DOCX(object):
def document_name(self): def document_name(self):
name = self.relationships.get(self.namespace.names['DOCUMENT'], None) name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
if name is None: if name is None:
names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml')) names = tuple(n for n in self.names if n == 'document.xml' or
n.endswith('/document.xml'))
if not names: if not names:
raise InvalidDOCX('The file %s docx file has no main document' % self.name) raise InvalidDOCX('The file %s docx file has no main '
'document' % self.name)
name = names[0] name = names[0]
return name return name
@property @property
def document(self): def document(self):
return fromstring(self.read(self.document_name)) return etree.fromstring(self.read(self.document_name))
@property @property
def document_relationships(self): def document_relationships(self):
@@ -195,10 +202,13 @@ class DOCX(object):
except KeyError: except KeyError:
pass pass
else: else:
root = fromstring(raw) root = etree.fromstring(raw)
for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'): for item in root.xpath('//*[local-name()="Relationships"]/*'
'[local-name()="Relationship" and @Type '
'and @Target]'):
target = item.get('Target') target = item.get('Target')
if item.get('TargetMode', None) != 'External' and not target.startswith('#'): if (item.get('TargetMode', None) != 'External' and not
target.startswith('#')):
target = '/'.join((base, target.lstrip('/'))) target = '/'.join((base, target.lstrip('/')))
typ = item.get('Type') typ = item.get('Type')
Id = item.get('Id') Id = item.get('Id')
@@ -209,13 +219,15 @@ class DOCX(object):
def get_document_properties_names(self): def get_document_properties_names(self):
name = self.relationships.get(self.namespace.names['DOCPROPS'], None) name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
if name is None: if name is None:
names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml') names = tuple(n for n in self.names
if n.lower() == 'docprops/core.xml')
if names: if names:
name = names[0] name = names[0]
yield name yield name
name = self.relationships.get(self.namespace.names['APPPROPS'], None) name = self.relationships.get(self.namespace.names['APPPROPS'], None)
if name is None: if name is None:
names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml') names = tuple(n for n in self.names
if n.lower() == 'docprops/app.xml')
if names: if names:
name = names[0] name = names[0]
yield name yield name
@@ -239,7 +251,8 @@ class DOCX(object):
else: else:
read_default_style_language(raw, mi, self.namespace.XPath) read_default_style_language(raw, mi, self.namespace.XPath)
ap_name = self.relationships.get(self.namespace.names['APPPROPS'], None) ap_name = self.relationships.get(self.namespace.names['APPPROPS'],
None)
if ap_name: if ap_name:
try: try:
raw = self.read(ap_name) raw = self.read(ap_name)
+8 -7
View File
@@ -1,12 +1,13 @@
import sys, os, re, math, errno, uuid, numbers import sys, os, re, math, errno, uuid, numbers
from collections import OrderedDict, defaultdict from collections import OrderedDict, defaultdict
from lxml import etree
from lxml import html from lxml import html
from lxml.html.builder import ( from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1) HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
from ebook_converter import guess_type from ebook_converter import guess_type
from ebook_converter.ebooks.docx.container import DOCX, fromstring from ebook_converter.ebooks.docx.container import DOCX
from ebook_converter.ebooks.docx.names import XML, generate_anchor from ebook_converter.ebooks.docx.names import XML, generate_anchor
from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties
from ebook_converter.ebooks.docx.numbering import Numbering from ebook_converter.ebooks.docx.numbering import Numbering
@@ -311,7 +312,7 @@ class Convert(object):
raise raise
self.log.warn('Settings %s file missing' % sename) self.log.warn('Settings %s file missing' % sename)
else: else:
self.settings(fromstring(seraw)) self.settings(etree.fromstring(seraw))
if foname is not None: if foname is not None:
try: try:
@@ -327,7 +328,7 @@ class Convert(object):
self.log.warn('Endnotes %s do not exist' % enname) self.log.warn('Endnotes %s do not exist' % enname)
else: else:
enrel = self.docx.get_relationships(enname) enrel = self.docx.get_relationships(enname)
footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel) footnotes(etree.fromstring(foraw) if foraw else None, forel, etree.fromstring(enraw) if enraw else None, enrel)
if fname is not None: if fname is not None:
embed_relationships = self.docx.get_relationships(fname)[0] embed_relationships = self.docx.get_relationships(fname)[0]
@@ -336,7 +337,7 @@ class Convert(object):
except KeyError: except KeyError:
self.log.warn('Fonts table %s does not exist' % fname) self.log.warn('Fonts table %s does not exist' % fname)
else: else:
fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir) fonts(etree.fromstring(raw), embed_relationships, self.docx, self.dest_dir)
if tname is not None: if tname is not None:
try: try:
@@ -344,7 +345,7 @@ class Convert(object):
except KeyError: except KeyError:
self.log.warn('Styles %s do not exist' % sname) self.log.warn('Styles %s do not exist' % sname)
else: else:
self.theme(fromstring(raw)) self.theme(etree.fromstring(raw))
styles_loaded = False styles_loaded = False
if sname is not None: if sname is not None:
@@ -353,7 +354,7 @@ class Convert(object):
except KeyError: except KeyError:
self.log.warn('Styles %s do not exist' % sname) self.log.warn('Styles %s do not exist' % sname)
else: else:
self.styles(fromstring(raw), fonts, self.theme) self.styles(etree.fromstring(raw), fonts, self.theme)
styles_loaded = True styles_loaded = True
if not styles_loaded: if not styles_loaded:
self.styles(None, fonts, self.theme) self.styles(None, fonts, self.theme)
@@ -364,7 +365,7 @@ class Convert(object):
except KeyError: except KeyError:
self.log.warn('Numbering styles %s do not exist' % nname) self.log.warn('Numbering styles %s do not exist' % nname)
else: else:
numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0]) numbering(etree.fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])
self.styles.resolve_numbering(numbering) self.styles.resolve_numbering(numbering)
+12 -15
View File
@@ -1,22 +1,19 @@
import collections
import re import re
from collections import Counter
from ebook_converter.ebooks.docx.writer.container import create_skeleton, page_size, page_effective_area from ebook_converter.ebooks.docx.writer.container import create_skeleton, page_size, page_effective_area
from ebook_converter.ebooks.docx.writer.styles import StylesManager, FloatSpec
from ebook_converter.ebooks.docx.writer.links import LinksManager
from ebook_converter.ebooks.docx.writer.images import ImagesManager
from ebook_converter.ebooks.docx.writer.fonts import FontsManager from ebook_converter.ebooks.docx.writer.fonts import FontsManager
from ebook_converter.ebooks.docx.writer.tables import Table from ebook_converter.ebooks.docx.writer.images import ImagesManager
from ebook_converter.ebooks.docx.writer.links import LinksManager
from ebook_converter.ebooks.docx.writer.lists import ListsManager from ebook_converter.ebooks.docx.writer.lists import ListsManager
from ebook_converter.ebooks.docx.writer.styles import StylesManager, FloatSpec
from ebook_converter.ebooks.docx.writer.tables import Table
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer as Sz, Style as St from ebook_converter.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
from ebook_converter.ebooks.oeb.base import XPath, barename
from ebook_converter.utils.localization import lang_as_iso639_1 from ebook_converter.utils.localization import lang_as_iso639_1
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
def lang_for_tag(tag): def lang_for_tag(tag):
for attr in ('lang', '{http://www.w3.org/XML/1998/namespace}lang'): for attr in ('lang', '{http://www.w3.org/XML/1998/namespace}lang'):
val = lang_as_iso639_1(tag.get(attr)) val = lang_as_iso639_1(tag.get(attr))
@@ -140,7 +137,7 @@ class Block(object):
self.numbering_id = None self.numbering_id = None
self.parent_items = None self.parent_items = None
self.html_block = html_block self.html_block = html_block
self.html_tag = barename(html_block.tag) self.html_tag = parse_utils.barename(html_block.tag)
self.float_spec = float_spec self.float_spec = float_spec
if float_spec is not None: if float_spec is not None:
float_spec.blocks.append(self) float_spec.blocks.append(self)
@@ -387,7 +384,7 @@ class Blocks(object):
def resolve_language(self): def resolve_language(self):
default_lang = self.styles_manager.document_lang default_lang = self.styles_manager.document_lang
for block in self.all_blocks: for block in self.all_blocks:
count = Counter() count = collections.Counter()
for run in block.runs: for run in block.runs:
count[run.lang] += 1 count[run.lang] += 1
if count: if count:
@@ -473,13 +470,13 @@ class Convert(object):
self.abshref = self.images_manager.abshref = item.abshref self.abshref = self.images_manager.abshref = item.abshref
self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang
for i, body in enumerate(XPath('//h:body')(item.data)): for i, body in enumerate(base.XPath('//h:body')(item.data)):
with self.blocks: with self.blocks:
self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(self.links_manager.top_anchor, self.current_item, body) self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(self.links_manager.top_anchor, self.current_item, body)
self.process_tag(body, stylizer, is_first_tag=i == 0) self.process_tag(body, stylizer, is_first_tag=i == 0)
def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None): def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
tagname = barename(html_tag.tag) tagname = parse_utils.barename(html_tag.tag)
tag_style = stylizer.style(html_tag) tag_style = stylizer.style(html_tag)
ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
display = tag_style._get('display') display = tag_style._get('display')
@@ -573,7 +570,7 @@ class Convert(object):
text = html_tag.text text = html_tag.text
if text: if text:
block.add_text(text, tag_style, ignore_leading_whitespace=True, is_parent_style=True, link=self.current_link, lang=self.current_lang) block.add_text(text, tag_style, ignore_leading_whitespace=True, is_parent_style=True, link=self.current_link, lang=self.current_lang)
elif tagname == 'li' and len(html_tag) and barename(html_tag[0].tag) in ('ul', 'ol') and len(html_tag[0]): elif tagname == 'li' and len(html_tag) and parse_utils.barename(html_tag[0].tag) in ('ul', 'ol') and len(html_tag[0]):
block.force_not_empty = True block.force_not_empty = True
def add_inline_tag(self, tagname, html_tag, tag_style, stylizer): def add_inline_tag(self, tagname, html_tag, tag_style, stylizer):
+113 -63
View File
@@ -9,10 +9,10 @@ import uuid
from lxml import etree from lxml import etree
from ebook_converter import constants as const
from ebook_converter import prepare_string_for_xml from ebook_converter import prepare_string_for_xml
from ebook_converter.constants_old import __appname__, __version__ from ebook_converter.constants_old import __appname__, __version__
from ebook_converter.utils.localization import lang_as_iso639_1 from ebook_converter.utils.localization import lang_as_iso639_1
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.utils.img import save_cover_data_to from ebook_converter.utils.img import save_cover_data_to
from ebook_converter.ebooks.oeb.base import urlnormalize from ebook_converter.ebooks.oeb.base import urlnormalize
from ebook_converter.polyglot.binary import as_base64_unicode from ebook_converter.polyglot.binary import as_base64_unicode
@@ -36,9 +36,10 @@ class FB2MLizer(object):
def reset_state(self): def reset_state(self):
# Used to ensure text and tags are always within <p> and </p> # Used to ensure text and tags are always within <p> and </p>
self.in_p = False self.in_p = False
# Mapping of image names. OEB allows for images to have the same name but be stored # Mapping of image names. OEB allows for images to have the same name
# in different directories. FB2 images are all in a flat layout so we rename all images # but be stored in different directories. FB2 images are all in a flat
# into a sequential numbering system to ensure there are no collisions between image names. # layout so we rename all images into a sequential numbering system to
# ensure there are no collisions between image names.
self.image_hrefs = {} self.image_hrefs = {}
# Mapping of toc items and their # Mapping of toc items and their
self.toc = {} self.toc = {}
@@ -68,13 +69,15 @@ class FB2MLizer(object):
output = self.clean_text('\n'.join(output)) output = self.clean_text('\n'.join(output))
if self.opts.pretty_print: if self.opts.pretty_print:
output = etree.tostring(safe_xml_fromstring(output), encoding='unicode', pretty_print=True) output = etree.tostring(etree.fromstring(output),
encoding='unicode', pretty_print=True)
return '<?xml version="1.0" encoding="UTF-8"?>\n' + output return '<?xml version="1.0" encoding="UTF-8"?>\n' + output
def clean_text(self, text): def clean_text(self, text):
# Remove pointless tags, but keep their contents. # Remove pointless tags, but keep their contents.
text = re.sub(r'(?mu)<(strong|emphasis|strikethrough|sub|sup)>(\s*)</\1>', r'\2', text) text = re.sub(r'(?mu)<(strong|emphasis|strikethrough|sub|sup)>'
r'(\s*)</\1>', r'\2', text)
# Clean up paragraphs endings. # Clean up paragraphs endings.
text = re.sub(r'(?mu)\s+</p>', '</p>', text) text = re.sub(r'(?mu)\s+</p>', '</p>', text)
@@ -96,7 +99,8 @@ class FB2MLizer(object):
text = re.sub(r'(?mu)</title>\s*<p>', '</title>\n<p>', text) text = re.sub(r'(?mu)</title>\s*<p>', '</title>\n<p>', text)
# Put line breaks between paragraphs on a separate line. # Put line breaks between paragraphs on a separate line.
text = re.sub(r'(?mu)</(p|title)>\s*<empty-line/>', r'</\1>\n<empty-line/>', text) text = re.sub(r'(?mu)</(p|title)>\s*<empty-line/>',
r'</\1>\n<empty-line/>', text)
text = re.sub(r'(?mu)<empty-line/>\s*<p>', '<empty-line/>\n<p>', text) text = re.sub(r'(?mu)<empty-line/>\s*<p>', '<empty-line/>\n<p>', text)
# Remove empty sections. # Remove empty sections.
@@ -115,7 +119,9 @@ class FB2MLizer(object):
metadata['title'] = self.oeb_book.metadata.title[0].value metadata['title'] = self.oeb_book.metadata.title[0].value
metadata['appname'] = __appname__ metadata['appname'] = __appname__
metadata['version'] = __version__ metadata['version'] = __version__
metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year) metadata['date'] = '%i.%i.%i' % (datetime.now().day,
datetime.now().month,
datetime.now().year)
if self.oeb_book.metadata.language: if self.oeb_book.metadata.language:
lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value) lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value)
if not lc: if not lc:
@@ -143,31 +149,38 @@ class FB2MLizer(object):
author_middle = ' '.join(author_parts[1:-1]) author_middle = ' '.join(author_parts[1:-1])
author_last = author_parts[-1] author_last = author_parts[-1]
metadata['author'] += '<author>' metadata['author'] += '<author>'
metadata['author'] += '<first-name>%s</first-name>' % prepare_string_for_xml(author_first) metadata['author'] += ('<first-name>%s</first-name>' %
prepare_string_for_xml(author_first))
if author_middle: if author_middle:
metadata['author'] += '<middle-name>%s</middle-name>' % prepare_string_for_xml(author_middle) metadata['author'] += ('<middle-name>%s</middle-name>' %
metadata['author'] += '<last-name>%s</last-name>' % prepare_string_for_xml(author_last) prepare_string_for_xml(author_middle))
metadata['author'] += ('<last-name>%s</last-name>' %
prepare_string_for_xml(author_last))
metadata['author'] += '</author>' metadata['author'] += '</author>'
if not metadata['author']: if not metadata['author']:
metadata['author'] = '<author><first-name></first-name><last-name></last-name></author>' metadata['author'] = ('<author><first-name></first-name>'
'<last-name></last-name></author>')
metadata['keywords'] = '' metadata['keywords'] = ''
tags = list(map(str, self.oeb_book.metadata.subject)) tags = list(map(str, self.oeb_book.metadata.subject))
if tags: if tags:
tags = ', '.join(prepare_string_for_xml(x) for x in tags) tags = ', '.join(prepare_string_for_xml(x) for x in tags)
metadata['keywords'] = '<keywords>%s</keywords>'%tags metadata['keywords'] = '<keywords>%s</keywords>' % tags
metadata['sequence'] = '' metadata['sequence'] = ''
if self.oeb_book.metadata.series: if self.oeb_book.metadata.series:
index = '1' index = '1'
if self.oeb_book.metadata.series_index: if self.oeb_book.metadata.series_index:
index = self.oeb_book.metadata.series_index[0] index = self.oeb_book.metadata.series_index[0]
metadata['sequence'] = '<sequence name="%s" number="%s"/>' % (prepare_string_for_xml('%s' % self.oeb_book.metadata.series[0]), index) seq = prepare_string_for_xml(str(self.oeb_book.metadata.series[0]))
metadata['sequence'] = ('<sequence name="%s" number="%s"/>' %
(seq, index))
year = publisher = isbn = '' year = publisher = isbn = ''
identifiers = self.oeb_book.metadata['identifier'] identifiers = self.oeb_book.metadata['identifier']
for x in identifiers: for x in identifiers:
if x.get(OPF('scheme'), None).lower() == 'uuid' or str(x).startswith('urn:uuid:'): if (x.get(OPF('scheme'), None).lower() == 'uuid' or
str(x).startswith('urn:uuid:')):
metadata['id'] = str(x).split(':')[-1] metadata['id'] = str(x).split(':')[-1]
break break
if metadata['id'] is None: if metadata['id'] is None:
@@ -179,22 +192,27 @@ class FB2MLizer(object):
except IndexError: except IndexError:
pass pass
else: else:
year = '<year>%s</year>' % prepare_string_for_xml(date.value.partition('-')[0]) year = ('<year>%s</year>' %
prepare_string_for_xml(date.value.partition('-')[0]))
try: try:
publisher = self.oeb_book.metadata['publisher'][0] publisher = self.oeb_book.metadata['publisher'][0]
except IndexError: except IndexError:
pass pass
else: else:
publisher = '<publisher>%s</publisher>' % prepare_string_for_xml(publisher.value) publisher = ('<publisher>%s</publisher>' %
prepare_string_for_xml(publisher.value))
for x in identifiers: for x in identifiers:
if x.get(OPF('scheme'), None).lower() == 'isbn': if x.get(OPF('scheme'), None).lower() == 'isbn':
isbn = '<isbn>%s</isbn>' % prepare_string_for_xml(x.value) isbn = '<isbn>%s</isbn>' % prepare_string_for_xml(x.value)
metadata['year'], metadata['isbn'], metadata['publisher'] = year, isbn, publisher metadata['year'] = year
metadata['isbn'] = isbn
metadata['publisher'] = publisher
for key, value in metadata.items(): for key, value in metadata.items():
if key not in ('author', 'cover', 'sequence', 'keywords', 'year', 'publisher', 'isbn'): if key not in ('author', 'cover', 'sequence', 'keywords', 'year',
'publisher', 'isbn'):
metadata[key] = prepare_string_for_xml(value) metadata[key] = prepare_string_for_xml(value)
try: try:
@@ -203,7 +221,8 @@ class FB2MLizer(object):
metadata['comments'] = '' metadata['comments'] = ''
else: else:
from ebook_converter.utils.html2text import html2text from ebook_converter.utils.html2text import html2text
metadata['comments'] = '<annotation><p>{}</p></annotation>'.format(prepare_string_for_xml(html2text(comments.value).strip())) annot = prepare_string_for_xml(html2text(comments.value).strip())
metadata['comments'] = f'<annotation><p>{annot}</p></annotation>'
# Keep the indentation level of the description the same as the body. # Keep the indentation level of the description the same as the body.
header = textwrap.dedent('''\ header = textwrap.dedent('''\
@@ -245,7 +264,9 @@ class FB2MLizer(object):
cover_href = None cover_href = None
# Get the raster cover if it's available. # Get the raster cover if it's available.
if self.oeb_book.metadata.cover and str(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: if (self.oeb_book.metadata.cover and
str(self.oeb_book.metadata.cover[0]) in
self.oeb_book.manifest.ids):
id = str(self.oeb_book.metadata.cover[0]) id = str(self.oeb_book.metadata.cover[0])
cover_item = self.oeb_book.manifest.ids[id] cover_item = self.oeb_book.manifest.ids[id]
if cover_item.media_type in OEB_RASTER_IMAGES: if cover_item.media_type in OEB_RASTER_IMAGES:
@@ -259,7 +280,8 @@ class FB2MLizer(object):
page_name = 'cover' page_name = 'cover'
if page_name: if page_name:
cover_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[page_name].href] key = self.oeb_book.guide[page_name].href
cover_item = self.oeb_book.manifest.hrefs[key]
# Get the first image in the page # Get the first image in the page
for img in cover_item.xpath('//img'): for img in cover_item.xpath('//img'):
cover_href = cover_item.abshref(img.get('src')) cover_href = cover_item.abshref(img.get('src'))
@@ -267,10 +289,11 @@ class FB2MLizer(object):
if cover_href: if cover_href:
# Only write the image tag if it is in the manifest. # Only write the image tag if it is in the manifest.
if cover_href in self.oeb_book.manifest.hrefs and cover_href not in self.image_hrefs: if (cover_href in self.oeb_book.manifest.hrefs and
cover_href not in self.image_hrefs):
self.image_hrefs[cover_href] = 'img_%s' % len(self.image_hrefs) self.image_hrefs[cover_href] = 'img_%s' % len(self.image_hrefs)
return '<coverpage><image l:href="#%s"/></coverpage>' % self.image_hrefs[cover_href] return ('<coverpage><image l:href="#%s"/></coverpage>' %
self.image_hrefs[cover_href])
return '' return ''
def get_text(self): def get_text(self):
@@ -285,16 +308,20 @@ class FB2MLizer(object):
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href) self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts,
self.opts.output_profile)
# Start a <section> if we must sectionize each file or if the TOC references this page # Start a <section> if we must sectionize each file or if the TOC
# references this page
page_section_open = False page_section_open = False
if self.opts.sectionize == 'files' or None in self.toc.get(item.href, ()): if (self.opts.sectionize == 'files' or
None in self.toc.get(item.href, ())):
text.append('<section>') text.append('<section>')
page_section_open = True page_section_open = True
self.section_level += 1 self.section_level += 1
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) text += self.dump_text(item.data.find(XHTML('body')), stylizer,
item)
if page_section_open: if page_section_open:
text.append('</section>') text.append('</section>')
@@ -309,20 +336,23 @@ class FB2MLizer(object):
return ''.join(text) return ''.join(text)
def fb2mlize_images(self): def fb2mlize_images(self):
''' """
This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. This function uses the self.image_hrefs dictionary mapping. It is
''' populated by the dump_text function.
"""
from ebook_converter.ebooks.oeb.base import OEB_RASTER_IMAGES from ebook_converter.ebooks.oeb.base import OEB_RASTER_IMAGES
images = [] images = []
for item in self.oeb_book.manifest: for item in self.oeb_book.manifest:
# Don't write the image if it's not referenced in the document's text. # Don't write the image if it's not referenced in the document's
# text.
if item.href not in self.image_hrefs: if item.href not in self.image_hrefs:
continue continue
if item.media_type in OEB_RASTER_IMAGES: if item.media_type in OEB_RASTER_IMAGES:
try: try:
if item.media_type not in ('image/jpeg', 'image/png'): if item.media_type not in ('image/jpeg', 'image/png'):
imdata = save_cover_data_to(item.data, compression_quality=70) imdata = save_cover_data_to(item.data,
compression_quality=70)
raw_data = as_base64_unicode(imdata) raw_data = as_base64_unicode(imdata)
content_type = 'image/jpeg' content_type = 'image/jpeg'
else: else:
@@ -330,11 +360,14 @@ class FB2MLizer(object):
content_type = item.media_type content_type = item.media_type
# Don't put the encoded image on a single line. # Don't put the encoded image on a single line.
step = 72 step = 72
data = '\n'.join(raw_data[i:i+step] for i in range(0, len(raw_data), step)) data = '\n'.join(raw_data[i:i+step]
images.append('<binary id="%s" content-type="%s">%s</binary>' % (self.image_hrefs[item.href], content_type, data)) for i in range(0, len(raw_data), step))
images.append('<binary id="%s" content-type="%s">%s'
'</binary>' % (self.image_hrefs[item.href],
content_type, data))
except Exception as e: except Exception as e:
self.log.error('Error: Could not include file %s because ' self.log.error('Error: Could not include file %s because '
'%s.' % (item.href, e)) '%s.' % (item.href, e))
return '\n'.join(images) return '\n'.join(images)
def create_flat_toc(self, nodes, level): def create_flat_toc(self, nodes, level):
@@ -391,26 +424,31 @@ class FB2MLizer(object):
def dump_text(self, elem_tree, stylizer, page, tag_stack=[]): def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
''' '''
This function is intended to be used in a recursive manner. dump_text will This function is intended to be used in a recursive manner. dump_text
run though all elements in the elem_tree and call itself on each element. will run though all elements in the elem_tree and call itself on each
element.
self.image_hrefs will be populated by calling this function. self.image_hrefs will be populated by calling this function.
@param elem_tree: etree representation of XHTML content to be transformed. @param elem_tree: etree representation of XHTML content to be
transformed.
@param stylizer: Used to track the style of elements within the tree. @param stylizer: Used to track the style of elements within the tree.
@param page: OEB page used to determine absolute urls. @param page: OEB page used to determine absolute urls.
@param tag_stack: List of open FB2 tags to take into account. @param tag_stack: List of open FB2 tags to take into account.
@return: List of string representing the XHTML converted to FB2 markup. @return: List of string representing the XHTML converted to FB2 markup.
''' '''
from ebook_converter.ebooks.oeb.base import XHTML_NS, barename, namespace from ebook_converter.ebooks.oeb.base import barename
from ebook_converter.ebooks.oeb.base import namespace
elem = elem_tree elem = elem_tree
# Ensure what we are converting is not a string and that the fist tag is part of the XHTML namespace. # Ensure what we are converting is not a string and that the fist tag
if not isinstance(elem_tree.tag, (str, bytes)) or namespace(elem_tree.tag) != XHTML_NS: # is part of the XHTML namespace.
if (not isinstance(elem_tree.tag, (str, bytes)) or
namespace(elem_tree.tag) != const.XHTML_NS):
p = elem.getparent() p = elem.getparent()
if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == XHTML_NS \ if (p is not None and isinstance(p.tag, (str, bytes)) and
and elem.tail: namespace(p.tag) == const.XHTML_NS and elem.tail):
return [elem.tail] return [elem.tail]
return [] return []
@@ -423,7 +461,8 @@ class FB2MLizer(object):
# FB2 generated output. # FB2 generated output.
fb2_out = [] fb2_out = []
# FB2 tags in the order they are opened. This will be used to close the tags. # FB2 tags in the order they are opened. This will be used to close
# the tags.
tags = [] tags = []
# First tag in tree # First tag in tree
tag = barename(elem_tree.tag) tag = barename(elem_tree.tag)
@@ -432,26 +471,31 @@ class FB2MLizer(object):
ems = int(round((float(style.marginTop) / style.fontSize) - 1)) ems = int(round((float(style.marginTop) / style.fontSize) - 1))
if ems < 0: if ems < 0:
ems = 0 ems = 0
except: except Exception:
ems = 0 ems = 0
# Convert TOC entries to <title>s and add <section>s # Convert TOC entries to <title>s and add <section>s
if self.opts.sectionize == 'toc': if self.opts.sectionize == 'toc':
# A section cannot be a child of any other element than another section, # A section cannot be a child of any other element than another
# so leave the tag alone if there are parents # section, so leave the tag alone if there are parents
if not tag_stack: if not tag_stack:
# There are two reasons to start a new section here: the TOC pointed to # There are two reasons to start a new section here: the TOC
# this page (then we use the first non-<body> on the page as a <title>), or # pointed to this page (then we use the first non-<body> on
# the TOC pointed to a specific element # the page as a <title>), or the TOC pointed to a specific
# element
newlevel = 0 newlevel = 0
toc_entry = self.toc.get(page.href, None) toc_entry = self.toc.get(page.href, None)
if toc_entry is not None: if toc_entry is not None:
if None in toc_entry: if None in toc_entry:
if tag != 'body' and hasattr(elem_tree, 'text') and elem_tree.text: if (tag != 'body' and hasattr(elem_tree, 'text') and
elem_tree.text):
newlevel = 1 newlevel = 1
self.toc[page.href] = None self.toc[page.href] = None
if not newlevel and elem_tree.attrib.get('id', None) is not None: if (not newlevel and
newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None) elem_tree.attrib.get('id', None) is not None):
newlevel = toc_entry.get(elem_tree.attrib.get('id',
None),
None)
# Start a new section if necessary # Start a new section if necessary
if newlevel: if newlevel:
@@ -463,13 +507,14 @@ class FB2MLizer(object):
fb2_out.append('<title>') fb2_out.append('<title>')
tags.append('title') tags.append('title')
if self.section_level == 0: if self.section_level == 0:
# If none of the prior processing made a section, make one now to be FB2 spec compliant # If none of the prior processing made a section, make one now
# to be FB2 spec compliant
fb2_out.append('<section>') fb2_out.append('<section>')
self.section_level += 1 self.section_level += 1
# Process the XHTML tag and styles. Converted to an FB2 tag. # Process the XHTML tag and styles. Converted to an FB2 tag.
# Use individual if statement not if else. There can be # Use individual if statement not if else. There can be only one XHTML
# only one XHTML tag but it can have multiple styles. # tag but it can have multiple styles.
if tag == 'img' and elem_tree.attrib.get('src', None): if tag == 'img' and elem_tree.attrib.get('src', None):
# Only write the image tag if it is in the manifest. # Only write the image tag if it is in the manifest.
ihref = urlnormalize(page.abshref(elem_tree.attrib['src'])) ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
@@ -479,7 +524,8 @@ class FB2MLizer(object):
p_txt, p_tag = self.ensure_p() p_txt, p_tag = self.ensure_p()
fb2_out += p_txt fb2_out += p_txt
tags += p_tag tags += p_tag
fb2_out.append('<image l:href="#%s"/>' % self.image_hrefs[ihref]) fb2_out.append('<image l:href="#%s"/>' %
self.image_hrefs[ihref])
else: else:
self.log.warn(u'Ignoring image not in manifest: %s' % ihref) self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
if tag in ('br', 'hr') or ems >= 1: if tag in ('br', 'hr') or ems >= 1:
@@ -513,7 +559,8 @@ class FB2MLizer(object):
p_txt, p_tag = self.ensure_p() p_txt, p_tag = self.ensure_p()
fb2_out += p_txt fb2_out += p_txt
tags += p_tag tags += p_tag
fb2_out.append('<a l:href="%s">' % urlnormalize(elem_tree.attrib['href'])) fb2_out.append('<a l:href="%s">' %
urlnormalize(elem_tree.attrib['href']))
tags.append('a') tags.append('a')
if tag == 'b' or style['font-weight'] in ('bold', 'bolder'): if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags) s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
@@ -523,8 +570,10 @@ class FB2MLizer(object):
s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags) s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
fb2_out += s_out fb2_out += s_out
tags += s_tags tags += s_tags
if tag in ('del', 'strike') or style['text-decoration'] == 'line-through': if (tag in ('del', 'strike') or
s_out, s_tags = self.handle_simple_tag('strikethrough', tag_stack+tags) style['text-decoration'] == 'line-through'):
s_out, s_tags = self.handle_simple_tag('strikethrough',
tag_stack+tags)
fb2_out += s_out fb2_out += s_out
tags += s_tags tags += s_tags
if tag == 'sub': if tag == 'sub':
@@ -552,7 +601,8 @@ class FB2MLizer(object):
tags.reverse() tags.reverse()
fb2_out += self.close_tags(tags) fb2_out += self.close_tags(tags)
# Process element text that comes after the close of the XHTML tag but before the next XHTML tag. # Process element text that comes after the close of the XHTML tag but
# before the next XHTML tag.
if hasattr(elem_tree, 'tail') and elem_tree.tail: if hasattr(elem_tree, 'tail') and elem_tree.tail:
if not self.in_p: if not self.in_p:
fb2_out.append('<p>') fb2_out.append('<p>')
+21 -20
View File
@@ -9,8 +9,9 @@ from functools import partial
from lxml import html from lxml import html
from ebook_converter import prepare_string_for_xml from ebook_converter import prepare_string_for_xml
from ebook_converter.ebooks.oeb.base import ( from ebook_converter import constants as const
XHTML, XHTML_NS, SVG_NS, barename, namespace, OEB_IMAGES, XLINK, rewrite_links, urlnormalize) from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer from ebook_converter.ebooks.oeb.stylizer import Stylizer
from ebook_converter.utils.logging import default_log from ebook_converter.utils.logging import default_log
from ebook_converter.polyglot.builtins import as_bytes from ebook_converter.polyglot.builtins import as_bytes
@@ -61,9 +62,9 @@ class OEB2HTML(object):
for item in oeb_book.spine: for item in oeb_book.spine:
self.log.debug('Converting %s to HTML...' % item.href) self.log.debug('Converting %s to HTML...' % item.href)
self.rewrite_ids(item.data, item) self.rewrite_ids(item.data, item)
rewrite_links(item.data, partial(self.rewrite_link, page=item)) base.rewrite_links(item.data, partial(self.rewrite_link, page=item))
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), stylizer, item)
output.append('\n\n') output.append('\n\n')
output.append('</body></html>') output.append('</body></html>')
return ''.join(output) return ''.join(output)
@@ -80,7 +81,7 @@ class OEB2HTML(object):
def map_resources(self, oeb_book): def map_resources(self, oeb_book):
for item in oeb_book.manifest: for item in oeb_book.manifest:
if item.media_type in OEB_IMAGES: if item.media_type in base.OEB_IMAGES:
if item.href not in self.images: if item.href not in self.images:
ext = os.path.splitext(item.href)[1] ext = os.path.splitext(item.href)[1]
fname = '%s%s' % (len(self.images), ext) fname = '%s%s' % (len(self.images), ext)
@@ -88,9 +89,9 @@ class OEB2HTML(object):
self.images[item.href] = fname self.images[item.href] = fname
if item in oeb_book.spine: if item in oeb_book.spine:
self.get_link_id(item.href) self.get_link_id(item.href)
root = item.data.find(XHTML('body')) root = item.data.find(base.tag('xhtml', 'body'))
link_attrs = set(html.defs.link_attrs) link_attrs = set(html.defs.link_attrs)
link_attrs.add(XLINK('href')) link_attrs.add(base.tag('xlink', 'href'))
for el in root.iter(): for el in root.iter():
attribs = el.attrib attribs = el.attrib
try: try:
@@ -108,7 +109,7 @@ class OEB2HTML(object):
def rewrite_link(self, url, page=None): def rewrite_link(self, url, page=None):
if not page: if not page:
return url return url
abs_url = page.abshref(urlnormalize(url)) abs_url = page.abshref(base.urlnormalize(url))
if abs_url in self.images: if abs_url in self.images:
return 'images/%s' % self.images[abs_url] return 'images/%s' % self.images[abs_url]
if abs_url in self.links: if abs_url in self.links:
@@ -121,7 +122,7 @@ class OEB2HTML(object):
tag = el.tag tag = el.tag
except UnicodeDecodeError: except UnicodeDecodeError:
continue continue
if tag == XHTML('body'): if tag == base.tag('xhtml', 'body'):
el.attrib['id'] = self.get_link_id(page.href)[1:] el.attrib['id'] = self.get_link_id(page.href)[1:]
continue continue
if 'id' in el.attrib: if 'id' in el.attrib:
@@ -156,9 +157,9 @@ class OEB2HTMLNoCSSizer(OEB2HTML):
# We can only processes tags. If there isn't a tag return any text. # We can only processes tags. If there isn't a tag return any text.
if not isinstance(elem.tag, (str, bytes)) \ if not isinstance(elem.tag, (str, bytes)) \
or namespace(elem.tag) not in (XHTML_NS, SVG_NS): or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS):
p = elem.getparent() p = elem.getparent()
if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) in (XHTML_NS, SVG_NS) \ if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \
and elem.tail: and elem.tail:
return [elem.tail] return [elem.tail]
return [''] return ['']
@@ -167,7 +168,7 @@ class OEB2HTMLNoCSSizer(OEB2HTML):
text = [''] text = ['']
style = stylizer.style(elem) style = stylizer.style(elem)
tags = [] tags = []
tag = barename(elem.tag) tag = parse_utils.barename(elem.tag)
attribs = elem.attrib attribs = elem.attrib
if tag == 'body': if tag == 'body':
@@ -245,9 +246,9 @@ class OEB2HTMLInlineCSSizer(OEB2HTML):
# We can only processes tags. If there isn't a tag return any text. # We can only processes tags. If there isn't a tag return any text.
if not isinstance(elem.tag, (str, bytes)) \ if not isinstance(elem.tag, (str, bytes)) \
or namespace(elem.tag) not in (XHTML_NS, SVG_NS): or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS):
p = elem.getparent() p = elem.getparent()
if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) in (XHTML_NS, SVG_NS) \ if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \
and elem.tail: and elem.tail:
return [elem.tail] return [elem.tail]
return [''] return ['']
@@ -256,7 +257,7 @@ class OEB2HTMLInlineCSSizer(OEB2HTML):
text = [''] text = ['']
style = stylizer.style(elem) style = stylizer.style(elem)
tags = [] tags = []
tag = barename(elem.tag) tag = parse_utils.barename(elem.tag)
attribs = elem.attrib attribs = elem.attrib
style_a = '%s' % style style_a = '%s' % style
@@ -327,9 +328,9 @@ class OEB2HTMLClassCSSizer(OEB2HTML):
for item in oeb_book.spine: for item in oeb_book.spine:
self.log.debug('Converting %s to HTML...' % item.href) self.log.debug('Converting %s to HTML...' % item.href)
self.rewrite_ids(item.data, item) self.rewrite_ids(item.data, item)
rewrite_links(item.data, partial(self.rewrite_link, page=item)) base.rewrite_links(item.data, partial(self.rewrite_link, page=item))
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), stylizer, item)
output.append('\n\n') output.append('\n\n')
if self.opts.htmlz_class_style == 'external': if self.opts.htmlz_class_style == 'external':
css = u'<link href="style.css" rel="stylesheet" type="text/css" />' css = u'<link href="style.css" rel="stylesheet" type="text/css" />'
@@ -348,9 +349,9 @@ class OEB2HTMLClassCSSizer(OEB2HTML):
# We can only processes tags. If there isn't a tag return any text. # We can only processes tags. If there isn't a tag return any text.
if not isinstance(elem.tag, (str, bytes)) \ if not isinstance(elem.tag, (str, bytes)) \
or namespace(elem.tag) not in (XHTML_NS, SVG_NS): or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS):
p = elem.getparent() p = elem.getparent()
if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) in (XHTML_NS, SVG_NS) \ if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \
and elem.tail: and elem.tail:
return [elem.tail] return [elem.tail]
return [''] return ['']
@@ -358,7 +359,7 @@ class OEB2HTMLClassCSSizer(OEB2HTML):
# Setup our variables. # Setup our variables.
text = [''] text = ['']
tags = [] tags = []
tag = barename(elem.tag) tag = parse_utils.barename(elem.tag)
attribs = elem.attrib attribs = elem.attrib
if tag == 'body': if tag == 'body':
+64 -55
View File
@@ -1,38 +1,32 @@
""" """
Read meta information from fb2 files Read meta information from fb2 files
""" """
import os, random import functools
from functools import partial import os
from string import ascii_letters, digits import random
import string
from lxml import etree from lxml import etree
from ebook_converter.utils.date import parse_only_date from ebook_converter.utils.date import parse_only_date
from ebook_converter.utils.img import save_cover_data_to from ebook_converter.utils.img import save_cover_data_to
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.utils.imghdr import identify from ebook_converter.utils.imghdr import identify
from ebook_converter import guess_type, guess_all_extensions, prints, force_unicode from ebook_converter import guess_type, guess_all_extensions, prints, \
force_unicode
from ebook_converter.ebooks.metadata import MetaInformation, check_isbn from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.polyglot.binary import as_base64_unicode from ebook_converter.polyglot.binary import as_base64_unicode
__license__ = 'GPL v3' NAMESPACES = {'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0',
__copyright__ = ('2011, Roman Mukhin <ramses_ru at hotmail.com>, ' 'fb21': 'http://www.gribuser.ru/xml/fictionbook/2.1',
'2008, Anatoly Shipitsin <norguhtar at gmail.com>') 'xlink': 'http://www.w3.org/1999/xlink'}
tostring = functools.partial(etree.tostring, method='text', encoding='unicode')
NAMESPACES = {
'fb2' : 'http://www.gribuser.ru/xml/fictionbook/2.0',
'fb21' : 'http://www.gribuser.ru/xml/fictionbook/2.1',
'xlink' : 'http://www.w3.org/1999/xlink'
}
tostring = partial(etree.tostring, method='text', encoding='unicode')
def XLINK(tag): def XLINK(tag):
return '{%s}%s'%(NAMESPACES['xlink'], tag) return '{%s}%s' % (NAMESPACES['xlink'], tag)
class Context(object): class Context(object):
@@ -52,7 +46,7 @@ class Context(object):
return etree.XPath(*args, namespaces=self.namespaces) return etree.XPath(*args, namespaces=self.namespaces)
def get_or_create(self, parent, tag, attribs={}, at_start=True): def get_or_create(self, parent, tag, attribs={}, at_start=True):
xpathstr='./fb:'+tag xpathstr = './fb:'+tag
for n, v in attribs.items(): for n, v in attribs.items():
xpathstr += '[@%s="%s"]' % (n, v) xpathstr += '[@%s="%s"]' % (n, v)
ans = self.XPath(xpathstr)(parent) ans = self.XPath(xpathstr)(parent)
@@ -73,7 +67,7 @@ class Context(object):
def clear_meta_tags(self, doc, tag): def clear_meta_tags(self, doc, tag):
for parent in ('title-info', 'src-title-info', 'publish-info'): for parent in ('title-info', 'src-title-info', 'publish-info'):
for x in self.XPath('//fb:%s/fb:%s'%(parent, tag))(doc): for x in self.XPath('//fb:%s/fb:%s' % (parent, tag))(doc):
x.getparent().remove(x) x.getparent().remove(x)
def text2fb2(self, parent, text): def text2fb2(self, parent, text):
@@ -117,42 +111,41 @@ def get_metadata(stream):
book_title = str(book_title) book_title = str(book_title)
else: else:
book_title = force_unicode(os.path.splitext( book_title = force_unicode(os.path.splitext(
os.path.basename(getattr(stream, 'name', os.path.basename(getattr(stream, 'name', 'Unknown')))[0])
'Unknown')))[0])
mi = MetaInformation(book_title, authors) mi = MetaInformation(book_title, authors)
try: try:
_parse_cover(root, mi, ctx) _parse_cover(root, mi, ctx)
except: except Exception:
pass pass
try: try:
_parse_comments(root, mi, ctx) _parse_comments(root, mi, ctx)
except: except Exception:
pass pass
try: try:
_parse_tags(root, mi, ctx) _parse_tags(root, mi, ctx)
except: except Exception:
pass pass
try: try:
_parse_series(root, mi, ctx) _parse_series(root, mi, ctx)
except: except Exception:
pass pass
try: try:
_parse_isbn(root, mi, ctx) _parse_isbn(root, mi, ctx)
except: except Exception:
pass pass
try: try:
_parse_publisher(root, mi, ctx) _parse_publisher(root, mi, ctx)
except: except Exception:
pass pass
try: try:
_parse_pubdate(root, mi, ctx) _parse_pubdate(root, mi, ctx)
except: except Exception:
pass pass
try: try:
_parse_language(root, mi, ctx) _parse_language(root, mi, ctx)
except: except Exception:
pass pass
return mi return mi
@@ -160,11 +153,11 @@ def get_metadata(stream):
def _parse_authors(root, ctx): def _parse_authors(root, ctx):
authors = [] authors = []
# pick up authors but only from 1 secrion <title-info>; otherwise it is not consistent! # pick up authors but only from 1 secrion <title-info>; otherwise it is
# Those are fallbacks: <src-title-info>, <document-info> # not consistent! Those are fallbacks: <src-title-info>, <document-info>
author = None author = None
for author_sec in ['title-info', 'src-title-info', 'document-info']: for author_sec in ['title-info', 'src-title-info', 'document-info']:
for au in ctx.XPath('//fb:%s/fb:author'%author_sec)(root): for au in ctx.XPath('//fb:%s/fb:author' % author_sec)(root):
author = _parse_author(au, ctx) author = _parse_author(au, ctx)
if author: if author:
authors.append(author) authors.append(author)
@@ -207,24 +200,26 @@ def _parse_book_title(root, ctx):
xp_ti = '//fb:title-info/fb:book-title/text()' xp_ti = '//fb:title-info/fb:book-title/text()'
xp_pi = '//fb:publish-info/fb:book-title/text()' xp_pi = '//fb:publish-info/fb:book-title/text()'
xp_si = '//fb:src-title-info/fb:book-title/text()' xp_si = '//fb:src-title-info/fb:book-title/text()'
book_title = ctx.XPath('normalize-space(%s|%s|%s)' % (xp_ti, xp_pi, xp_si))(root) book_title = ctx.XPath('normalize-space(%s|%s|%s)' %
(xp_ti, xp_pi, xp_si))(root)
return book_title return book_title
def _parse_cover(root, mi, ctx): def _parse_cover(root, mi, ctx):
# pickup from <title-info>, if not exists it fallbacks to <src-title-info> # pickup from <title-info>, if not exists it fallbacks to <src-title-info>
imgid = ctx.XPath('substring-after(string(//fb:coverpage/fb:image/@xlink:href), "#")')(root) imgid = ctx.XPath('substring-after(string(//fb:coverpage/fb:image/'
'@xlink:href), "#")')(root)
if imgid: if imgid:
try: try:
_parse_cover_data(root, imgid, mi, ctx) _parse_cover_data(root, imgid, mi, ctx)
except: except Exception:
pass pass
def _parse_cover_data(root, imgid, mi, ctx): def _parse_cover_data(root, imgid, mi, ctx):
from ebook_converter.ebooks.fb2 import base64_decode from ebook_converter.ebooks.fb2 import base64_decode
elm_binary = ctx.XPath('//fb:binary[@id="%s"]'%imgid)(root) elm_binary = ctx.XPath('//fb:binary[@id="%s"]' % imgid)(root)
if elm_binary: if elm_binary:
mimetype = elm_binary[0].get('content-type', 'image/jpeg') mimetype = elm_binary[0].get('content-type', 'image/jpeg')
mime_extensions = guess_all_extensions(mimetype) mime_extensions = guess_all_extensions(mimetype)
@@ -241,12 +236,13 @@ def _parse_cover_data(root, imgid, mi, ctx):
fmt = identify(cdata)[0] fmt = identify(cdata)[0]
mi.cover_data = (fmt, cdata) mi.cover_data = (fmt, cdata)
else: else:
prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid)) prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" %
(mimetype, imgid))
def _parse_tags(root, mi, ctx): def _parse_tags(root, mi, ctx):
# pick up genre but only from 1 secrion <title-info>; otherwise it is not consistent! # pick up genre but only from 1 secrion <title-info>; otherwise it is not
# Those are fallbacks: <src-title-info> # consistent! Those are fallbacks: <src-title-info>
for genre_sec in ['title-info', 'src-title-info']: for genre_sec in ['title-info', 'src-title-info']:
# -- i18n Translations-- ? # -- i18n Translations-- ?
tags = ctx.XPath('//fb:%s/fb:genre/text()' % genre_sec)(root) tags = ctx.XPath('//fb:%s/fb:genre/text()' % genre_sec)(root)
@@ -267,16 +263,20 @@ def _parse_series(root, mi, ctx):
mi.series = elms_sequence[0].get('name', None) mi.series = elms_sequence[0].get('name', None)
if mi.series: if mi.series:
try: try:
mi.series_index = float('.'.join(elms_sequence[0].get('number', None).split()[:2])) i = float('.'.join(elms_sequence[0].get('number',
None).split()[:2]))
mi.series_index = i
except Exception: except Exception:
pass pass
def _parse_isbn(root, mi, ctx): def _parse_isbn(root, mi, ctx):
# some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case # some people try to put several isbn in this field, but it is not
# allowed. try to stick to the 1-st one in this case
isbn = ctx.XPath('normalize-space(//fb:publish-info/fb:isbn/text())')(root) isbn = ctx.XPath('normalize-space(//fb:publish-info/fb:isbn/text())')(root)
if isbn: if isbn:
# some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case # some people try to put several isbn in this field, but it is not
# allowed. try to stick to the 1-st one in this case
if ',' in isbn: if ',' in isbn:
isbn = isbn[:isbn.index(',')] isbn = isbn[:isbn.index(',')]
if check_isbn(isbn): if check_isbn(isbn):
@@ -284,9 +284,11 @@ def _parse_isbn(root, mi, ctx):
def _parse_comments(root, mi, ctx): def _parse_comments(root, mi, ctx):
# pick up annotation but only from 1 section <title-info>; fallback: <src-title-info> # pick up annotation but only from 1 section <title-info>;
# fallback: <src-title-info>
for annotation_sec in ['title-info', 'src-title-info']: for annotation_sec in ['title-info', 'src-title-info']:
elms_annotation = ctx.XPath('//fb:%s/fb:annotation' % annotation_sec)(root) elms_annotation = ctx.XPath('//fb:%s/fb:annotation' %
annotation_sec)(root)
if elms_annotation: if elms_annotation:
mi.comments = tostring(elms_annotation[0]) mi.comments = tostring(elms_annotation[0])
# TODO: tags i18n, xslt? # TODO: tags i18n, xslt?
@@ -294,7 +296,8 @@ def _parse_comments(root, mi, ctx):
def _parse_publisher(root, mi, ctx): def _parse_publisher(root, mi, ctx):
publisher = ctx.XPath('string(//fb:publish-info/fb:publisher/text())')(root) publisher = ctx.XPath('string(//fb:publish-info/fb:publisher/'
'text())')(root)
if publisher: if publisher:
mi.publisher = publisher mi.publisher = publisher
@@ -315,7 +318,7 @@ def _parse_language(root, mi, ctx):
def _get_fbroot(raw): def _get_fbroot(raw):
raw = xml_to_unicode(raw, strip_encoding_pats=True)[0] raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
root = safe_xml_fromstring(raw) root = etree.fromstring(raw)
return ensure_namespace(root) return ensure_namespace(root)
@@ -348,10 +351,12 @@ def _set_authors(title_info, mi, ctx):
ctx.create_tag(atag, 'first-name').text = author_parts[0] ctx.create_tag(atag, 'first-name').text = author_parts[0]
author_parts = author_parts[1:] author_parts = author_parts[1:]
if len(author_parts) > 1: if len(author_parts) > 1:
ctx.create_tag(atag, 'middle-name', at_start=False).text = author_parts[0] ctx.create_tag(atag, 'middle-name',
at_start=False).text = author_parts[0]
author_parts = author_parts[1:] author_parts = author_parts[1:]
if author_parts: if author_parts:
ctx.create_tag(atag, 'last-name', at_start=False).text = ' '.join(author_parts) a = ' '.join(author_parts)
ctx.create_tag(atag, 'last-name', at_start=False).text = a
def _set_tags(title_info, mi, ctx): def _set_tags(title_info, mi, ctx):
@@ -368,12 +373,12 @@ def _set_series(title_info, mi, ctx):
seq = ctx.get_or_create(title_info, 'sequence') seq = ctx.get_or_create(title_info, 'sequence')
seq.set('name', mi.series) seq.set('name', mi.series)
try: try:
seq.set('number', '%g'%mi.series_index) seq.set('number', '%g' % mi.series_index)
except: except Exception:
seq.set('number', '1') seq.set('number', '1')
def _rnd_name(size=8, chars=ascii_letters + digits): def _rnd_name(size=8, chars=string.ascii_letters + string.digits):
return ''.join(random.choice(chars) for x in range(size)) return ''.join(random.choice(chars) for x in range(size))
@@ -396,7 +401,9 @@ def _set_cover(title_info, mi, ctx):
cim_filename = _rnd_pic_file_name('cover') cim_filename = _rnd_pic_file_name('cover')
cim_tag.attrib[XLINK('href')] = '#' + cim_filename cim_tag.attrib[XLINK('href')] = '#' + cim_filename
fb2_root = cim_tag.getroottree().getroot() fb2_root = cim_tag.getroottree().getroot()
cim_binary = ctx.get_or_create(fb2_root, 'binary', attribs={'id': cim_filename}, at_start=False) cim_binary = ctx.get_or_create(fb2_root, 'binary',
attribs={'id': cim_filename},
at_start=False)
cim_binary.attrib['content-type'] = 'image/jpeg' cim_binary.attrib['content-type'] = 'image/jpeg'
cim_binary.text = _encode_into_jpeg(mi.cover_data[1]) cim_binary.text = _encode_into_jpeg(mi.cover_data[1])
@@ -425,7 +432,8 @@ def set_metadata(stream, mi, apply_null=False, update_timestamp=False):
# single quotes in xml declaration. Sigh. See # single quotes in xml declaration. Sigh. See
# https://www.mobileread.com/forums/showthread.php?p=2273184#post2273184 # https://www.mobileread.com/forums/showthread.php?p=2273184#post2273184
raw = b'<?xml version="1.0" encoding="UTF-8"?>\n' raw = b'<?xml version="1.0" encoding="UTF-8"?>\n'
raw += etree.tostring(root, method='xml', encoding='utf-8', xml_declaration=False) raw += etree.tostring(root, method='xml', encoding='utf-8',
xml_declaration=False)
stream.seek(0) stream.seek(0)
stream.truncate() stream.truncate()
@@ -449,6 +457,7 @@ def ensure_namespace(doc):
if bare_tags: if bare_tags:
import re import re
raw = etree.tostring(doc, encoding='unicode') raw = etree.tostring(doc, encoding='unicode')
raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw) raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>',
doc = safe_xml_fromstring(raw) raw)
doc = etree.fromstring(raw)
return doc return doc
File diff suppressed because it is too large Load Diff
+28 -28
View File
@@ -5,6 +5,7 @@ from functools import wraps
from lxml import etree from lxml import etree
from ebook_converter import constants as const
from ebook_converter import prints from ebook_converter import prints
from ebook_converter.ebooks.metadata import authors_to_string, check_isbn, string_to_authors from ebook_converter.ebooks.metadata import authors_to_string, check_isbn, string_to_authors
from ebook_converter.ebooks.metadata.book.base import Metadata from ebook_converter.ebooks.metadata.book.base import Metadata
@@ -15,7 +16,6 @@ from ebook_converter.ebooks.metadata.utils import (
create_manifest_item, ensure_unique, normalize_languages, parse_opf, create_manifest_item, ensure_unique, normalize_languages, parse_opf,
pretty_print_opf pretty_print_opf
) )
from ebook_converter.ebooks.oeb.base import DC, OPF, OPF2_NSMAP
from ebook_converter.utils.config import from_json, to_json from ebook_converter.utils.config import from_json, to_json
from ebook_converter.utils.date import ( from ebook_converter.utils.date import (
fix_only_date, is_date_undefined, isoformat, parse_date as parse_date_, utcnow, fix_only_date, is_date_undefined, isoformat, parse_date as parse_date_, utcnow,
@@ -46,7 +46,7 @@ def XPath(x):
try: try:
return _xpath_cache[x] return _xpath_cache[x]
except KeyError: except KeyError:
_xpath_cache[x] = ans = etree.XPath(x, namespaces=OPF2_NSMAP) _xpath_cache[x] = ans = etree.XPath(x, namespaces=const.OPF2_NSMAP)
return ans return ans
@@ -213,7 +213,7 @@ def set_refines(elem, existing_refines, *new_refines):
remove_refines(elem, existing_refines) remove_refines(elem, existing_refines)
for ref in reversed(new_refines): for ref in reversed(new_refines):
prop, val, scheme = ref prop, val, scheme = ref
r = elem.makeelement(OPF('meta')) r = elem.makeelement(const.OPF_META)
r.set('refines', '#' + eid), r.set('property', prop) r.set('refines', '#' + eid), r.set('property', prop)
r.text = val.strip() r.text = val.strip()
if scheme: if scheme:
@@ -249,7 +249,7 @@ def parse_identifier(ident, val, refines):
# Try the OPF 2 style opf:scheme attribute, which will be present, for # Try the OPF 2 style opf:scheme attribute, which will be present, for
# example, in EPUB 3 files that have had their metadata set by an # example, in EPUB 3 files that have had their metadata set by an
# application that only understands EPUB 2. # application that only understands EPUB 2.
scheme = ident.get(OPF('scheme')) scheme = ident.get(const.OPF_SCHEME)
if scheme and not lval.startswith('urn:'): if scheme and not lval.startswith('urn:'):
return finalize(scheme, val) return finalize(scheme, val)
@@ -294,7 +294,7 @@ def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers=
continue continue
metadata = XPath('./opf:metadata')(root)[0] metadata = XPath('./opf:metadata')(root)[0]
for scheme, val in new_identifiers.items(): for scheme, val in new_identifiers.items():
ident = metadata.makeelement(DC('identifier')) ident = metadata.makeelement(const.DC_IDENT)
ident.text = '%s:%s' % (scheme, val) ident.text = '%s:%s' % (scheme, val)
if package_identifier is None: if package_identifier is None:
metadata.append(ident) metadata.append(ident)
@@ -312,11 +312,11 @@ def identifier_writer(name):
if is_package_id: if is_package_id:
package_identifier = ident package_identifier = ident
val = (ident.text or '').strip() val = (ident.text or '').strip()
if (val.startswith(name + ':') or ident.get(OPF('scheme')) == name) and not is_package_id: if (val.startswith(name + ':') or ident.get(const.OPF_SCHEME) == name) and not is_package_id:
remove_element(ident, refines) remove_element(ident, refines)
metadata = XPath('./opf:metadata')(root)[0] metadata = XPath('./opf:metadata')(root)[0]
if ival: if ival:
ident = metadata.makeelement(DC('identifier')) ident = metadata.makeelement(const.DC_IDENT)
ident.text = '%s:%s' % (name, ival) ident.text = '%s:%s' % (name, ival)
if package_identifier is None: if package_identifier is None:
metadata.append(ident) metadata.append(ident)
@@ -376,7 +376,7 @@ def set_title(root, prefixes, refines, title, title_sort=None):
main_title = find_main_title(root, refines, remove_blanks=True) main_title = find_main_title(root, refines, remove_blanks=True)
if main_title is None: if main_title is None:
m = XPath('./opf:metadata')(root)[0] m = XPath('./opf:metadata')(root)[0]
main_title = m.makeelement(DC('title')) main_title = m.makeelement(const.DC_TITLE)
m.insert(0, main_title) m.insert(0, main_title)
main_title.text = title or None main_title.text = title or None
ts = [refdef('file-as', title_sort)] if title_sort else () ts = [refdef('file-as', title_sort)] if title_sort else ()
@@ -411,7 +411,7 @@ def set_languages(root, prefixes, refines, languages):
languages = ['und'] languages = ['und']
metadata = XPath('./opf:metadata')(root)[0] metadata = XPath('./opf:metadata')(root)[0]
for lang in uniq(languages): for lang in uniq(languages):
l = metadata.makeelement(DC('language')) l = metadata.makeelement(const.DC_LANG)
l.text = lang l.text = lang
metadata.append(l) metadata.append(l)
# }}} # }}}
@@ -440,7 +440,7 @@ def read_authors(root, prefixes, refines):
if file_as: if file_as:
aus = file_as[0][-1] aus = file_as[0][-1]
else: else:
aus = item.get(OPF('file-as')) or None aus = item.get(const.OPF_FILE_AS) or None
return Author(normalize_whitespace(val), normalize_whitespace(aus)) return Author(normalize_whitespace(val), normalize_whitespace(aus))
for item in XPath('./opf:metadata/dc:creator')(root): for item in XPath('./opf:metadata/dc:creator')(root):
@@ -448,7 +448,7 @@ def read_authors(root, prefixes, refines):
if val: if val:
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
role = props.get('role') role = props.get('role')
opf_role = item.get(OPF('role')) opf_role = item.get(const.OPF_ROLE)
if role: if role:
if is_relators_role(props, 'aut'): if is_relators_role(props, 'aut'):
roled_authors.append(author(item, props, val)) roled_authors.append(author(item, props, val))
@@ -465,22 +465,22 @@ def set_authors(root, prefixes, refines, authors):
ensure_prefix(root, prefixes, 'marc') ensure_prefix(root, prefixes, 'marc')
for item in XPath('./opf:metadata/dc:creator')(root): for item in XPath('./opf:metadata/dc:creator')(root):
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
opf_role = item.get(OPF('role')) opf_role = item.get(const.OPF_ROLE)
if (opf_role and opf_role.lower() != 'aut') or (props.get('role') and not is_relators_role(props, 'aut')): if (opf_role and opf_role.lower() != 'aut') or (props.get('role') and not is_relators_role(props, 'aut')):
continue continue
remove_element(item, refines) remove_element(item, refines)
metadata = XPath('./opf:metadata')(root)[0] metadata = XPath('./opf:metadata')(root)[0]
for author in authors: for author in authors:
if author.name: if author.name:
a = metadata.makeelement(DC('creator')) a = metadata.makeelement(const.DC_CREATOR)
aid = ensure_id(a) aid = ensure_id(a)
a.text = author.name a.text = author.name
metadata.append(a) metadata.append(a)
m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'}) m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
m.text = 'aut' m.text = 'aut'
metadata.append(m) metadata.append(m)
if author.sort: if author.sort:
m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'file-as'}) m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'file-as'})
m.text = author.sort m.text = author.sort
metadata.append(m) metadata.append(m)
@@ -492,7 +492,7 @@ def read_book_producers(root, prefixes, refines):
if val: if val:
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
role = props.get('role') role = props.get('role')
opf_role = item.get(OPF('role')) opf_role = item.get(const.OPF_ROLE)
if role: if role:
if is_relators_role(props, 'bkp'): if is_relators_role(props, 'bkp'):
ans.append(normalize_whitespace(val)) ans.append(normalize_whitespace(val))
@@ -504,18 +504,18 @@ def read_book_producers(root, prefixes, refines):
def set_book_producers(root, prefixes, refines, producers): def set_book_producers(root, prefixes, refines, producers):
for item in XPath('./opf:metadata/dc:contributor')(root): for item in XPath('./opf:metadata/dc:contributor')(root):
props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) props = properties_for_id_with_scheme(item.get('id'), prefixes, refines)
opf_role = item.get(OPF('role')) opf_role = item.get(const.OPF_ROLE)
if (opf_role and opf_role.lower() != 'bkp') or (props.get('role') and not is_relators_role(props, 'bkp')): if (opf_role and opf_role.lower() != 'bkp') or (props.get('role') and not is_relators_role(props, 'bkp')):
continue continue
remove_element(item, refines) remove_element(item, refines)
metadata = XPath('./opf:metadata')(root)[0] metadata = XPath('./opf:metadata')(root)[0]
for bkp in producers: for bkp in producers:
if bkp: if bkp:
a = metadata.makeelement(DC('contributor')) a = metadata.makeelement(const.DC_CONTRIBUTOR)
aid = ensure_id(a) aid = ensure_id(a)
a.text = bkp a.text = bkp
metadata.append(a) metadata.append(a)
m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'}) m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'})
m.text = 'bkp' m.text = 'bkp'
metadata.append(m) metadata.append(m)
# }}} # }}}
@@ -552,7 +552,7 @@ def set_pubdate(root, prefixes, refines, val):
if not is_date_undefined(val): if not is_date_undefined(val):
val = isoformat(val) val = isoformat(val)
m = XPath('./opf:metadata')(root)[0] m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(DC('date')) d = m.makeelement(const.DC_DATE)
d.text = val d.text = val
m.append(d) m.append(d)
@@ -584,7 +584,7 @@ def create_timestamp(root, prefixes, m, val):
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX) ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
ensure_prefix(root, prefixes, 'dcterms') ensure_prefix(root, prefixes, 'dcterms')
val = w3cdtf(val) val = w3cdtf(val)
d = m.makeelement(OPF('meta'), attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'}) d = m.makeelement(const.OPF_META, attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'})
d.text = val d.text = val
m.append(d) m.append(d)
@@ -625,7 +625,7 @@ def set_last_modified(root, prefixes, refines, val=None):
else: else:
ensure_prefix(root, prefixes, 'dcterms') ensure_prefix(root, prefixes, 'dcterms')
m = XPath('./opf:metadata')(root)[0] m = XPath('./opf:metadata')(root)[0]
meta = m.makeelement(OPF('meta'), attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'}) meta = m.makeelement(const.OPF_META, attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'})
m.append(meta) m.append(meta)
meta.text = val meta.text = val
# }}} # }}}
@@ -648,7 +648,7 @@ def set_comments(root, prefixes, refines, val):
if val: if val:
val = val.strip() val = val.strip()
if val: if val:
c = m.makeelement(DC('description')) c = m.makeelement(const.DC_DESC)
c.text = val c.text = val
m.append(c) m.append(c)
# }}} # }}}
@@ -670,7 +670,7 @@ def set_publisher(root, prefixes, refines, val):
if val: if val:
val = val.strip() val = val.strip()
if val: if val:
c = m.makeelement(DC('publisher')) c = m.makeelement(const.DC_PUBLISHER('publisher'))
c.text = normalize_whitespace(val) c.text = normalize_whitespace(val)
m.append(c) m.append(c)
# }}} # }}}
@@ -693,7 +693,7 @@ def set_tags(root, prefixes, refines, val):
if val: if val:
val = uniq(list(filter(None, val))) val = uniq(list(filter(None, val)))
for x in val: for x in val:
c = m.makeelement(DC('subject')) c = m.makeelement(const.DC_SUBJ)
c.text = normalize_whitespace(x) c.text = normalize_whitespace(x)
if c.text: if c.text:
m.append(c) m.append(c)
@@ -725,7 +725,7 @@ def read_rating(root, prefixes, refines):
def create_rating(root, prefixes, val): def create_rating(root, prefixes, val):
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX) ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
m = XPath('./opf:metadata')(root)[0] m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(OPF('meta'), attrib={'property':'calibre:rating'}) d = m.makeelement(const.OPF_META, attrib={'property':'calibre:rating'})
d.text = val d.text = val
m.append(d) m.append(d)
@@ -772,7 +772,7 @@ def read_series(root, prefixes, refines):
def create_series(root, refines, series, series_index): def create_series(root, refines, series, series_index):
m = XPath('./opf:metadata')(root)[0] m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(OPF('meta'), attrib={'property':'belongs-to-collection'}) d = m.makeelement(const.OPF_META, attrib={'property':'belongs-to-collection'})
d.text = series d.text = series
m.append(d) m.append(d)
set_refines(d, refines, refdef('collection-type', 'series'), refdef('group-position', series_index)) set_refines(d, refines, refdef('collection-type', 'series'), refdef('group-position', series_index))
@@ -836,7 +836,7 @@ def dict_writer(name, serialize=dump_dict, remove2=True):
if val: if val:
ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX) ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
m = XPath('./opf:metadata')(root)[0] m = XPath('./opf:metadata')(root)[0]
d = m.makeelement(OPF('meta'), attrib={'property':'calibre:%s' % name}) d = m.makeelement(const.OPF_META, attrib={'property':'calibre:%s' % name})
d.text = serialize(val) d.text = serialize(val)
m.append(d) m.append(d)
return writer return writer
+44 -40
View File
@@ -10,17 +10,13 @@ from lxml.builder import ElementMaker
from ebook_converter.constants_old import __appname__, __version__ from ebook_converter.constants_old import __appname__, __version__
from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.polyglot.urllib import unquote from ebook_converter.polyglot.urllib import unquote
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid at kovidgoyal.net>'
NCX_NS = "http://www.daisy.org/z3986/2005/ncx/" NCX_NS = "http://www.daisy.org/z3986/2005/ncx/"
CALIBRE_NS = "http://calibre.kovidgoyal.net/2009/metadata" CALIBRE_NS = "http://calibre.kovidgoyal.net/2009/metadata"
NSMAP = {None: NCX_NS, 'calibre':CALIBRE_NS} NSMAP = {None: NCX_NS, 'calibre': CALIBRE_NS}
E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP) E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP)
C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP) C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP)
@@ -30,8 +26,10 @@ def parse_html_toc(data):
from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.utils.cleantext import clean_xml_chars
from lxml import etree from lxml import etree
if isinstance(data, bytes): if isinstance(data, bytes):
data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0] data = xml_to_unicode(data, strip_encoding_pats=True,
root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True) resolve_entities=True)[0]
root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False,
sanitize_names=True)
for a in root.xpath('//*[@href and local-name()="a"]'): for a in root.xpath('//*[@href and local-name()="a"]'):
purl = urllib.parse.urlparse(unquote(a.get('href'))) purl = urllib.parse.urlparse(unquote(a.get('href')))
href, fragment = purl[2], purl[5] href, fragment = purl[2], purl[5]
@@ -48,8 +46,8 @@ def parse_html_toc(data):
class TOC(list): class TOC(list):
def __init__(self, href=None, fragment=None, text=None, parent=None, def __init__(self, href=None, fragment=None, text=None, parent=None,
play_order=0, base_path=os.getcwd(), type='unknown', author=None, play_order=0, base_path=os.getcwd(), type='unknown',
description=None, toc_thumbnail=None): author=None, description=None, toc_thumbnail=None):
self.href = href self.href = href
self.fragment = fragment self.fragment = fragment
if not self.fragment: if not self.fragment:
@@ -64,7 +62,7 @@ class TOC(list):
self.toc_thumbnail = toc_thumbnail self.toc_thumbnail = toc_thumbnail
def __str__(self): def __str__(self):
lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)] lines = ['TOC: %s#%s %s' % (self.href, self.fragment, self.text)]
for child in self: for child in self:
c = str(child).splitlines() c = str(child).splitlines()
for l in c: for l in c:
@@ -91,12 +89,14 @@ class TOC(list):
entry.parent = None entry.parent = None
def add_item(self, href, fragment, text, play_order=None, type='unknown',
             author=None, description=None, toc_thumbnail=None):
    """
    Append a new child TOC entry to this entry and return it.

    When play_order is not supplied it is derived as one more than the
    play order of the last child, or of this entry itself when there are
    no children yet. The child inherits this entry's base_path and gets
    this entry as its parent.
    """
    if play_order is None:
        if len(self):
            previous = self[-1].play_order
        else:
            previous = self.play_order
        play_order = previous + 1
    child = TOC(href=href, fragment=fragment, text=text, parent=self,
                base_path=self.base_path, play_order=play_order,
                type=type, author=author, description=description,
                toc_thumbnail=toc_thumbnail)
    self.append(child)
    return self[-1]
def top_level_items(self): def top_level_items(self):
@@ -121,7 +121,10 @@ class TOC(list):
@property @property
def abspath(self): def abspath(self):
'Return the file this toc entry points to as an absolute path to a file on the system.' """
Return the file this toc entry points to as an absolute path to a file
on the system.
"""
if self.href is None: if self.href is None:
return None return None
@@ -136,8 +139,9 @@ class TOC(list):
toc = toc['toc'] toc = toc['toc']
if toc is None: if toc is None:
try: try:
toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href'] toc = (opfreader.soup.find('guide')
except: .find('reference', attrs={'type': 'toc'})['href'])
except Exception:
for item in opfreader.manifest: for item in opfreader.manifest:
if 'toc' in item.href().lower(): if 'toc' in item.href().lower():
toc = item.href() toc = item.href()
@@ -151,13 +155,15 @@ class TOC(list):
toc = os.path.join(self.base_path, toc) toc = os.path.join(self.base_path, toc)
try: try:
if not os.path.exists(toc): if not os.path.exists(toc):
bn = os.path.basename(toc) bn = os.path.basename(toc)
bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files # Bug in BAEN OPF files
bn = bn.replace('_top.htm', '_toc.htm')
toc = os.path.join(os.path.dirname(toc), bn) toc = os.path.join(os.path.dirname(toc), bn)
self.read_html_toc(toc) self.read_html_toc(toc)
except: except Exception:
print('WARNING: Could not read Table of Contents. Continuing anyway.') print('WARNING: Could not read Table of Contents. '
'Continuing anyway.')
else: else:
path = opfreader.manifest.item(toc.lower()) path = opfreader.manifest.item(toc.lower())
path = getattr(path, 'path', path) path = getattr(path, 'path', path)
@@ -177,9 +183,9 @@ class TOC(list):
self.base_path = os.path.dirname(toc) self.base_path = os.path.dirname(toc)
if root is None: if root is None:
with open(toc, 'rb') as f: with open(toc, 'rb') as f:
raw = xml_to_unicode(f.read(), assume_utf8=True, raw = xml_to_unicode(f.read(), assume_utf8=True,
strip_encoding_pats=True)[0] strip_encoding_pats=True)[0]
root = safe_xml_fromstring(raw) root = etree.fromstring(raw)
xpn = {'re': 'http://exslt.org/regular-expressions'} xpn = {'re': 'http://exslt.org/regular-expressions'}
XPath = functools.partial(etree.XPath, namespaces=xpn) XPath = functools.partial(etree.XPath, namespaces=xpn)
@@ -197,7 +203,7 @@ class TOC(list):
def process_navpoint(np, dest): def process_navpoint(np, dest):
try: try:
play_order = int(get_attr(np, 1)) play_order = int(get_attr(np, 1))
except: except Exception:
play_order = 1 play_order = 1
href = fragment = text = None href = fragment = text = None
nd = dest nd = dest
@@ -207,7 +213,7 @@ class TOC(list):
text = '' text = ''
for txt in txt_path(nl): for txt in txt_path(nl):
text += etree.tostring(txt, method='text', text += etree.tostring(txt, method='text',
encoding='unicode', with_tail=False) encoding='unicode', with_tail=False)
content = content_path(np) content = content_path(np)
if content and text: if content and text:
content = content[0] content = content[0]
@@ -242,17 +248,14 @@ class TOC(list):
self.add_item(href, fragment, txt) self.add_item(href, fragment, txt)
def render(self, stream, uid): def render(self, stream, uid):
root = E.ncx( root = E.ncx(E.head(E.meta(name='dtb:uid', content=str(uid)),
E.head( E.meta(name='dtb:depth',
E.meta(name='dtb:uid', content=str(uid)), content=str(self.depth())),
E.meta(name='dtb:depth', content=str(self.depth())), E.meta(name='dtb:generator', content='%s (%s)' %
E.meta(name='dtb:generator', content='%s (%s)'%(__appname__, (__appname__, __version__)),
__version__)), E.meta(name='dtb:totalPageCount', content='0'),
E.meta(name='dtb:totalPageCount', content='0'), E.meta(name='dtb:maxPageNumber', content='0')),
E.meta(name='dtb:maxPageNumber', content='0'), E.docTitle(E.text('Table of Contents')))
),
E.docTitle(E.text('Table of Contents')),
)
navmap = E.navMap() navmap = E.navMap()
root.append(navmap) root.append(navmap)
root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en') root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en')
@@ -263,12 +266,12 @@ class TOC(list):
if not text: if not text:
text = '' text = ''
c[1] += 1 c[1] += 1
item_id = 'num_%d'%c[1] item_id = 'num_%d' % c[1]
text = clean_xml_chars(text) text = clean_xml_chars(text)
elem = E.navPoint( elem = E.navPoint(
E.navLabel(E.text(re.sub(r'\s+', ' ', text))), E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
E.content(src=str(np.href)+(('#' + str(np.fragment)) E.content(src=str(np.href)+(('#' + str(np.fragment))
if np.fragment else '')), if np.fragment else '')),
id=item_id, id=item_id,
playOrder=str(np.play_order) playOrder=str(np.play_order)
) )
@@ -282,7 +285,8 @@ class TOC(list):
try: try:
elem.append(C.meta(desc, name='description')) elem.append(C.meta(desc, name='description'))
except ValueError: except ValueError:
elem.append(C.meta(clean_xml_chars(desc), name='description')) elem.append(C.meta(clean_xml_chars(desc),
name='description'))
idx = getattr(np, 'toc_thumbnail', None) idx = getattr(np, 'toc_thumbnail', None)
if idx: if idx:
elem.append(C.meta(idx, name='toc_thumbnail')) elem.append(C.meta(idx, name='toc_thumbnail'))
@@ -293,5 +297,5 @@ class TOC(list):
for np in self: for np in self:
navpoint(navmap, np) navpoint(navmap, np)
raw = etree.tostring(root, encoding='utf-8', xml_declaration=True, raw = etree.tostring(root, encoding='utf-8', xml_declaration=True,
pretty_print=True) pretty_print=True)
stream.write(raw) stream.write(raw)
+14 -9
View File
@@ -1,12 +1,13 @@
from collections import namedtuple from collections import namedtuple
from lxml import etree
from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ebooks.oeb.base import OPF from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.utils import guess_type from ebook_converter.ebooks.oeb.polish.utils import guess_type
from ebook_converter.spell import parse_lang_code from ebook_converter.spell import parse_lang_code
from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.utils.localization import lang_as_iso639_1 from ebook_converter.utils.localization import lang_as_iso639_1
from ebook_converter.utils.xml_parse import safe_xml_fromstring
OPFVersion = namedtuple('OPFVersion', 'major minor patch') OPFVersion = namedtuple('OPFVersion', 'major minor patch')
@@ -35,23 +36,26 @@ def parse_opf(stream_or_path):
raw = stream.read() raw = stream.read()
if not raw: if not raw:
raise ValueError('Empty file: '+getattr(stream, 'name', 'stream')) raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True) raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True, assume_utf8=True)
raw = raw[raw.find('<'):] raw = raw[raw.find('<'):]
root = safe_xml_fromstring(clean_xml_chars(raw)) root = etree.fromstring(clean_xml_chars(raw))
if root is None: if root is None:
raise ValueError('Not an OPF file') raise ValueError('Not an OPF file')
return root return root
def normalize_languages(opf_languages, mi_languages): def normalize_languages(opf_languages, mi_languages):
' Preserve original country codes and use 2-letter lang codes where possible ' """
Preserve original country codes and use 2-letter lang codes where possible
"""
def parse(x): def parse(x):
try: try:
return parse_lang_code(x) return parse_lang_code(x)
except ValueError: except ValueError:
return None return None
opf_languages = filter(None, map(parse, opf_languages)) opf_languages = filter(None, map(parse, opf_languages))
cc_map = {c.langcode:c.countrycode for c in opf_languages} cc_map = {c.langcode: c.countrycode for c in opf_languages}
mi_languages = filter(None, map(parse, mi_languages)) mi_languages = filter(None, map(parse, mi_languages))
def norm(x): def norm(x):
@@ -83,9 +87,9 @@ def create_manifest_item(root, href_template, id_template, media_type=None):
all_hrefs = frozenset(root.xpath('//*/@href')) all_hrefs = frozenset(root.xpath('//*/@href'))
href = ensure_unique(href_template, all_hrefs) href = ensure_unique(href_template, all_hrefs)
item_id = ensure_unique(id_template, all_ids) item_id = ensure_unique(id_template, all_ids)
manifest = root.find(OPF('manifest')) manifest = root.find(base.tag('opf', 'manifest'))
if manifest is not None: if manifest is not None:
i = manifest.makeelement(OPF('item')) i = manifest.makeelement(base.tag('opf', 'item'))
i.set('href', href), i.set('id', item_id) i.set('href', href), i.set('id', item_id)
i.set('media-type', media_type or guess_type(href_template)) i.set('media-type', media_type or guess_type(href_template))
manifest.append(i) manifest.append(i)
@@ -93,6 +97,7 @@ def create_manifest_item(root, href_template, id_template, media_type=None):
def pretty_print_opf(root):
    """
    Pretty print the OPF tree in place.

    Runs pretty_opf() followed by pretty_xml_tree() on root. The import is
    kept local to the function, as in the original.
    """
    from ebook_converter.ebooks.oeb.polish import pretty

    pretty.pretty_opf(root)
    pretty.pretty_xml_tree(root)
+125 -79
View File
@@ -1,44 +1,43 @@
import re, sys, copy, json import collections
from itertools import repeat import copy
from collections import defaultdict import itertools
import json
import re
import sys
from lxml import etree from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
from ebook_converter import prints from ebook_converter import prints
from ebook_converter.ebooks.metadata import check_isbn, check_doi from ebook_converter.ebooks.metadata import check_isbn, check_doi
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.ebooks.metadata.book.base import Metadata from ebook_converter.ebooks.metadata.book.base import Metadata
from ebook_converter.ebooks.metadata.opf2 import dump_dict from ebook_converter.ebooks.metadata.opf2 import dump_dict
from ebook_converter.utils.date import parse_date, isoformat, now from ebook_converter.utils.date import parse_date, isoformat, now
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1 from ebook_converter.utils.localization import canonicalize_lang, \
lang_as_iso639_1
__license__ = 'GPL v3' _xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>' r'[\'"][^<>]*>', re.IGNORECASE)
_xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE) NS_MAP = {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'dc': 'http://purl.org/dc/elements/1.1/',
NS_MAP = { 'pdf': 'http://ns.adobe.com/pdf/1.3/',
'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'pdfx': 'http://ns.adobe.com/pdfx/1.3/',
'dc': 'http://purl.org/dc/elements/1.1/', 'xmp': 'http://ns.adobe.com/xap/1.0/',
'pdf': 'http://ns.adobe.com/pdf/1.3/', 'xmpidq': 'http://ns.adobe.com/xmp/Identifier/qual/1.0/',
'pdfx': 'http://ns.adobe.com/pdfx/1.3/', 'xmpMM': 'http://ns.adobe.com/xap/1.0/mm/',
'xmp': 'http://ns.adobe.com/xap/1.0/', 'xmpRights': 'http://ns.adobe.com/xap/1.0/rights/',
'xmpidq': 'http://ns.adobe.com/xmp/Identifier/qual/1.0/', 'xmpBJ': 'http://ns.adobe.com/xap/1.0/bj/',
'xmpMM': 'http://ns.adobe.com/xap/1.0/mm/', 'xmpTPg': 'http://ns.adobe.com/xap/1.0/t/pg/',
'xmpRights': 'http://ns.adobe.com/xap/1.0/rights/', 'xmpDM': 'http://ns.adobe.com/xmp/1.0/DynamicMedia/',
'xmpBJ': 'http://ns.adobe.com/xap/1.0/bj/', 'prism': 'http://prismstandard.org/namespaces/basic/2.0/',
'xmpTPg': 'http://ns.adobe.com/xap/1.0/t/pg/', 'crossmark': 'http://crossref.org/crossmark/1.0/',
'xmpDM': 'http://ns.adobe.com/xmp/1.0/DynamicMedia/', 'xml': 'http://www.w3.org/XML/1998/namespace',
'prism': 'http://prismstandard.org/namespaces/basic/2.0/', 'x': 'adobe:ns:meta/',
'crossmark': 'http://crossref.org/crossmark/1.0/', 'calibre': 'http://calibre-ebook.com/xmp-namespace',
'xml': 'http://www.w3.org/XML/1998/namespace', 'calibreSI': 'http://calibre-ebook.com/xmp-namespace-series-index',
'x': 'adobe:ns:meta/', 'calibreCC': 'http://calibre-ebook.com/xmp-namespace-custom-columns'}
'calibre': 'http://calibre-ebook.com/xmp-namespace',
'calibreSI': 'http://calibre-ebook.com/xmp-namespace-series-index',
'calibreCC': 'http://calibre-ebook.com/xmp-namespace-custom-columns',
}
KNOWN_ID_SCHEMES = {'isbn', 'url', 'doi'} KNOWN_ID_SCHEMES = {'isbn', 'url', 'doi'}
@@ -63,7 +62,7 @@ def parse_xmp_packet(raw_bytes):
pat = r'''<?xpacket\s+[^>]*?begin\s*=\s*['"]([^'"]*)['"]''' pat = r'''<?xpacket\s+[^>]*?begin\s*=\s*['"]([^'"]*)['"]'''
encodings = ('8', '16-le', '16-be', '32-le', '32-be') encodings = ('8', '16-le', '16-be', '32-le', '32-be')
header = raw_bytes[:1024] header = raw_bytes[:1024]
emap = {'\ufeff'.encode('utf-'+x):'utf-'+x for x in encodings} emap = {'\ufeff'.encode('utf-'+x): 'utf-'+x for x in encodings}
emap[b''] = 'utf-8' emap[b''] = 'utf-8'
for q in encodings: for q in encodings:
m = re.search(pat.encode('utf-'+q), header) m = re.search(pat.encode('utf-'+q), header)
@@ -71,15 +70,19 @@ def parse_xmp_packet(raw_bytes):
enc = emap.get(m.group(1), enc) enc = emap.get(m.group(1), enc)
break break
if enc is None: if enc is None:
return safe_xml_fromstring(raw_bytes) return etree.fromstring(raw_bytes)
raw = _xml_declaration.sub('', raw_bytes.decode(enc)) # lxml barfs if encoding declaration present in unicode string # lxml barfs if encoding declaration present in unicode string
return safe_xml_fromstring(raw) raw = _xml_declaration.sub('', raw_bytes.decode(enc))
return etree.fromstring(raw)
def serialize_xmp_packet(root, encoding='utf-8'):
    """
    Serialize an XMP element tree into a complete XMP packet.

    The serialized tree is wrapped between an ``<?xpacket begin=...?>``
    header, whose ``begin`` value is the byte order mark encoded with
    ``encoding``, and an ``<?xpacket end="w"?>`` trailer.

    :param root: root element of the XMP tree; its ``tail`` is overwritten
                 with padding as a side effect.
    :param encoding: encoding passed to ``etree.tostring`` and used for the
                     byte order mark.
    :return: the packet as ``bytes``.
    """
    # Adobe spec recommends inserting padding at the end of the packet
    root.tail = '\n' + '\n'.join([' ' * 100] * 30)
    raw_bytes = etree.tostring(root, encoding=encoding, pretty_print=True,
                               with_tail=True, method='xml')
    # Both interpolated values are bytes, so the template has to be a bytes
    # literal too. With a str template the %s conversion would embed their
    # repr (b'...') into the packet and the function would return str.
    return (b'<?xpacket begin="%s" id="W5M0MpCehiHzreSzNTczkc9d"?>\n%s\n'
            b'<?xpacket end="w"?>' % ('\ufeff'.encode(encoding), raw_bytes))
def read_simple_property(elem): def read_simple_property(elem):
@@ -106,14 +109,15 @@ def read_sequence(parent):
yield read_simple_property(item) yield read_simple_property(item)
def uniq(vals, kmap=lambda x: x):
    """
    Remove all duplicates from vals, while preserving order.

    :param vals: iterable of items, or None (treated as empty). One-shot
                 iterators are supported: every item is read exactly once.
    :param kmap: callable that returns a hashable key for every item in
                 vals; two items are duplicates when their keys are equal.
    :return: tuple with the first occurrence of every distinct key.
    """
    seen = set()
    unique = []
    # Walk vals once, computing each key next to its item. Zipping vals
    # against a generator over the same vals would advance a one-shot
    # iterator twice per step and pair items with the wrong keys.
    for item in vals or ():
        key = kmap(item)
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return tuple(unique)
def multiple_sequences(expr, root): def multiple_sequences(expr, root):
@@ -170,7 +174,8 @@ def read_series(root):
def read_user_metadata(mi, root): def read_user_metadata(mi, root):
from ebook_converter.utils.config import from_json from ebook_converter.utils.config import from_json
from ebook_converter.ebooks.metadata.book.json_codec import decode_is_multiple from ebook_converter.ebooks.metadata.book.json_codec import \
decode_is_multiple
fields = set() fields = set()
for item in XPath('//calibre:custom_metadata')(root): for item in XPath('//calibre:custom_metadata')(root):
for li in XPath('./rdf:Bag/rdf:li')(item): for li in XPath('./rdf:Bag/rdf:li')(item):
@@ -186,7 +191,7 @@ def read_user_metadata(mi, root):
decode_is_multiple(fm) decode_is_multiple(fm)
mi.set_user_metadata(name, fm) mi.set_user_metadata(name, fm)
fields.add(name) fields.add(name)
except: except Exception:
prints('Failed to read user metadata:', name) prints('Failed to read user metadata:', name)
import traceback import traceback
traceback.print_exc() traceback.print_exc()
@@ -194,13 +199,17 @@ def read_user_metadata(mi, root):
def read_xmp_identifers(parent): def read_xmp_identifers(parent):
''' For example: ''' For example:
<rdf:li rdf:parseType="Resource"><xmpidq:Scheme>URL</xmpidq:Scheme><rdf:value>http://foo.com</rdf:value></rdf:li> <rdf:li rdf:parseType="Resource"><xmpidq:Scheme>URL</xmpidq:Scheme>
<rdf:value>http://foo.com</rdf:value></rdf:li>
or the longer form: or the longer form:
<rdf:li><rdf:Description><xmpidq:Scheme>URL</xmpidq:Scheme><rdf:value>http://foo.com</rdf:value></rdf:Description></rdf:li> <rdf:li><rdf:Description><xmpidq:Scheme>URL</xmpidq:Scheme>
<rdf:value>http://foo.com</rdf:value></rdf:Description></rdf:li>
''' '''
for li in XPath('./rdf:Bag/rdf:li')(parent): for li in XPath('./rdf:Bag/rdf:li')(parent):
is_resource = li.attrib.get(expand('rdf:parseType'), None) == 'Resource' is_resource = li.attrib.get(expand('rdf:parseType'),
is_resource = is_resource or (len(li) == 1 and li[0].tag == expand('rdf:Description')) None) == 'Resource'
is_resource = is_resource or (len(li) == 1 and
li[0].tag == expand('rdf:Description'))
if not is_resource: if not is_resource:
yield None, li.text or '' yield None, li.text or ''
value = XPath('descendant::rdf:value')(li) value = XPath('descendant::rdf:value')(li)
@@ -241,12 +250,15 @@ def metadata_from_xmp_packet(raw_bytes):
if title.startswith(r'\376\377'): if title.startswith(r'\376\377'):
# corrupted XMP packet generated by Nitro PDF. See # corrupted XMP packet generated by Nitro PDF. See
# https://bugs.launchpad.net/calibre/+bug/1541981 # https://bugs.launchpad.net/calibre/+bug/1541981
raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF') raise ValueError('Corrupted XMP metadata packet detected, '
'probably generated by Nitro PDF')
mi.title = title mi.title = title
authors = multiple_sequences('//dc:creator', root) authors = multiple_sequences('//dc:creator', root)
if authors: if authors:
mi.authors = authors mi.authors = authors
tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root) tags = multiple_sequences('//dc:subject',
root) or multiple_sequences('//pdf:Keywords',
root)
if tags: if tags:
mi.tags = tags mi.tags = tags
comments = first_alt('//dc:description', root) comments = first_alt('//dc:description', root)
@@ -256,8 +268,10 @@ def metadata_from_xmp_packet(raw_bytes):
if publishers: if publishers:
mi.publisher = publishers[0] mi.publisher = publishers[0]
try: try:
pubdate = parse_date(first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root), assume_utc=False) pubdate = (parse_date(first_sequence('//dc:date', root) or
except: first_simple('//xmp:CreateDate', root),
assume_utc=False))
except Exception:
pass pass
else: else:
mi.pubdate = pubdate mi.pubdate = pubdate
@@ -291,7 +305,7 @@ def metadata_from_xmp_packet(raw_bytes):
if val: if val:
try: try:
setattr(mi, x, json.loads(val)) setattr(mi, x, json.loads(val))
except: except Exception:
pass pass
languages = multiple_sequences('//dc:language', root) languages = multiple_sequences('//dc:language', root)
@@ -319,7 +333,7 @@ def metadata_from_xmp_packet(raw_bytes):
identifiers[scheme] = val identifiers[scheme] = val
# Check Dublin Core for recognizable identifier types # Check Dublin Core for recognizable identifier types
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.items(): for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
if scheme not in identifiers: if scheme not in identifiers:
val = check_func(first_simple('//dc:identifier', root)) val = check_func(first_simple('//dc:identifier', root))
if val: if val:
@@ -359,17 +373,21 @@ def consolidate_metadata(info_mi, info):
else: else:
prefer_info = info_date > xmp_mi.metadata_date prefer_info = info_date > xmp_mi.metadata_date
if prefer_info: if prefer_info:
info_mi.title, info_mi.authors, info_mi.tags = info_title, info_authors, info_tags info_mi.title = info_title
info_mi.authors = info_authors
info_mi.tags = info_tags
else: else:
# We'll use the xmp tags/authors but fallback to the info ones if the # We'll use the xmp tags/authors but fallback to the info ones if the
# xmp does not have tags/authors. smart_update() should have taken care of # xmp does not have tags/authors. smart_update() should have taken care
# the rest # of the rest
info_mi.authors, info_mi.tags = (info_authors if xmp_mi.is_null('authors') else xmp_mi.authors), xmp_mi.tags or info_tags info_mi.authors = (info_authors if xmp_mi.is_null('authors')
else xmp_mi.authors)
info_mi.tags = xmp_mi.tags or info_tags
return info_mi return info_mi
def nsmap(*args):
    """
    Build a prefix -> namespace URI mapping restricted to the given prefixes.

    Every prefix is looked up in NS_MAP; an unknown prefix raises KeyError.
    """
    return dict((prefix, NS_MAP[prefix]) for prefix in args)
def create_simple_property(parent, tag, value): def create_simple_property(parent, tag, value):
@@ -435,7 +453,8 @@ def create_series(calibre, series, series_index):
def create_user_metadata(calibre, all_user_metadata): def create_user_metadata(calibre, all_user_metadata):
from ebook_converter.utils.config import to_json from ebook_converter.utils.config import to_json
from ebook_converter.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple from ebook_converter.ebooks.metadata.book.json_codec import \
object_to_unicode, encode_is_multiple
s = calibre.makeelement(expand('calibre:custom_metadata')) s = calibre.makeelement(expand('calibre:custom_metadata'))
calibre.append(s) calibre.append(s)
@@ -447,7 +466,7 @@ def create_user_metadata(calibre, all_user_metadata):
encode_is_multiple(fm) encode_is_multiple(fm)
fm = object_to_unicode(fm) fm = object_to_unicode(fm)
fm = json.dumps(fm, default=to_json, ensure_ascii=False) fm = json.dumps(fm, default=to_json, ensure_ascii=False)
except: except Exception:
prints('Failed to write user metadata:', name) prints('Failed to write user metadata:', name)
import traceback import traceback
traceback.print_exc() traceback.print_exc()
@@ -471,7 +490,8 @@ def metadata_to_xmp_packet(mi):
dc = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('dc')) dc = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('dc'))
dc.set(expand('rdf:about'), '') dc.set(expand('rdf:about'), '')
rdf.append(dc) rdf.append(dc)
for prop, tag in {'title':'dc:title', 'comments':'dc:description'}.items(): for prop, tag in {'title': 'dc:title',
'comments': 'dc:description'}.items():
val = mi.get(prop) or '' val = mi.get(prop) or ''
create_alt_property(dc, tag, val) create_alt_property(dc, tag, val)
for prop, (tag, ordered) in {'authors': ('dc:creator', True), for prop, (tag, ordered) in {'authors': ('dc:creator', True),
@@ -482,18 +502,23 @@ def metadata_to_xmp_packet(mi):
val = [val] val = [val]
create_sequence_property(dc, tag, val, ordered) create_sequence_property(dc, tag, val, ordered)
if not mi.is_null('pubdate'): if not mi.is_null('pubdate'):
create_sequence_property(dc, 'dc:date', [isoformat(mi.pubdate, as_utc=False)]) # Adobe spec recommends local time # Adobe spec recommends local time
create_sequence_property(dc, 'dc:date',
[isoformat(mi.pubdate, as_utc=False)])
if not mi.is_null('languages'): if not mi.is_null('languages'):
langs = list(filter(None, map(lambda x:lang_as_iso639_1(x) or canonicalize_lang(x), mi.languages))) langs = list(filter(None, map(lambda x: lang_as_iso639_1(x) or
canonicalize_lang(x), mi.languages)))
if langs: if langs:
create_sequence_property(dc, 'dc:language', langs, ordered=False) create_sequence_property(dc, 'dc:language', langs, ordered=False)
xmp = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('xmp', 'xmpidq')) xmp = rdf.makeelement(expand('rdf:Description'),
nsmap=nsmap('xmp', 'xmpidq'))
xmp.set(expand('rdf:about'), '') xmp.set(expand('rdf:about'), '')
rdf.append(xmp) rdf.append(xmp)
extra_ids = {} extra_ids = {}
for x in ('prism', 'pdfx'): for x in ('prism', 'pdfx'):
p = extra_ids[x] = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap(x)) p = extra_ids[x] = rdf.makeelement(expand('rdf:Description'),
nsmap=nsmap(x))
p.set(expand('rdf:about'), '') p.set(expand('rdf:about'), '')
rdf.append(p) rdf.append(p)
@@ -503,7 +528,7 @@ def metadata_to_xmp_packet(mi):
for scheme, val in identifiers.items(): for scheme, val in identifiers.items():
if scheme in {'isbn', 'doi'}: if scheme in {'isbn', 'doi'}:
for prefix, parent in extra_ids.items(): for prefix, parent in extra_ids.items():
ie = parent.makeelement(expand('%s:%s'%(prefix, scheme))) ie = parent.makeelement(expand('%s:%s' % (prefix, scheme)))
ie.text = val ie.text = val
parent.append(ie) parent.append(ie)
@@ -511,7 +536,8 @@ def metadata_to_xmp_packet(mi):
d.text = isoformat(now(), as_utc=False) d.text = isoformat(now(), as_utc=False)
xmp.append(d) xmp.append(d)
calibre = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('calibre', 'calibreSI', 'calibreCC')) calibre = rdf.makeelement(expand('rdf:Description'),
nsmap=nsmap('calibre', 'calibreSI', 'calibreCC'))
calibre.set(expand('rdf:about'), '') calibre.set(expand('rdf:about'), '')
rdf.append(calibre) rdf.append(calibre)
if not mi.is_null('rating'): if not mi.is_null('rating'):
@@ -524,7 +550,8 @@ def metadata_to_xmp_packet(mi):
if not mi.is_null('series'): if not mi.is_null('series'):
create_series(calibre, mi.series, mi.series_index) create_series(calibre, mi.series, mi.series_index)
if not mi.is_null('timestamp'): if not mi.is_null('timestamp'):
create_simple_property(calibre, 'calibre:timestamp', isoformat(mi.timestamp, as_utc=False)) create_simple_property(calibre, 'calibre:timestamp',
isoformat(mi.timestamp, as_utc=False))
for x in ('author_link_map', 'user_categories'): for x in ('author_link_map', 'user_categories'):
val = getattr(mi, x, None) val = getattr(mi, x, None)
if val: if val:
@@ -550,10 +577,11 @@ def find_used_namespaces(elem):
def find_preferred_prefix(namespace, elems): def find_preferred_prefix(namespace, elems):
for elem in elems: for elem in elems:
ans = {v:k for k, v in elem.nsmap.items()}.get(namespace, None) ans = {v: k for k, v in elem.nsmap.items()}.get(namespace, None)
if ans is not None: if ans is not None:
return ans return ans
return find_preferred_prefix(namespace, elem.iterchildren(etree.Element)) return find_preferred_prefix(namespace,
elem.iterchildren(etree.Element))
def find_nsmap(elems): def find_nsmap(elems):
@@ -562,7 +590,7 @@ def find_nsmap(elems):
used_namespaces |= find_used_namespaces(elem) used_namespaces |= find_used_namespaces(elem)
ans = {} ans = {}
used_namespaces -= {NS_MAP['xml'], NS_MAP['x'], None, NS_MAP['rdf']} used_namespaces -= {NS_MAP['xml'], NS_MAP['x'], None, NS_MAP['rdf']}
rmap = {v:k for k, v in NS_MAP.items()} rmap = {v: k for k, v in NS_MAP.items()}
i = 0 i = 0
for ns in used_namespaces: for ns in used_namespaces:
if ns in rmap: if ns in rmap:
@@ -578,7 +606,10 @@ def find_nsmap(elems):
def clone_into(parent, elem): def clone_into(parent, elem):
' Clone the element, assuming that all namespace declarations are present in parent ' """
Clone the element, assuming that all namespace declarations are present
in parent
"""
clone = parent.makeelement(elem.tag) clone = parent.makeelement(elem.tag)
parent.append(clone) parent.append(clone)
if elem.text and not elem.text.isspace(): if elem.text and not elem.text.isspace():
@@ -591,28 +622,38 @@ def clone_into(parent, elem):
def merge_xmp_packet(old, new): def merge_xmp_packet(old, new):
''' Merge metadata present in the old packet that is not present in the new """
Merge metadata present in the old packet that is not present in the new
one into the new one. Assumes the new packet was generated by one into the new one. Assumes the new packet was generated by
metadata_to_xmp_packet() ''' metadata_to_xmp_packet()
"""
old, new = parse_xmp_packet(old), parse_xmp_packet(new) old, new = parse_xmp_packet(old), parse_xmp_packet(new)
# As per the adobe spec all metadata items have to be present inside top-level rdf:Description containers # As per the adobe spec all metadata items have to be present inside
# top-level rdf:Description containers
item_xpath = XPath('//rdf:RDF/rdf:Description/*') item_xpath = XPath('//rdf:RDF/rdf:Description/*')
# First remove all data fields that metadata_to_xmp_packet() knows about, # First remove all data fields that metadata_to_xmp_packet() knows about,
# since either they will have been set or if not present, imply they have # since either they will have been set or if not present, imply they have
# been cleared # been cleared
defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES} defined_tags = {expand(prefix + ':' + scheme)
defined_tags |= {expand('dc:' + x) for x in ('identifier', 'title', 'creator', 'date', 'description', 'language', 'publisher', 'subject')} for prefix in ('prism', 'pdfx')
defined_tags |= {expand('xmp:' + x) for x in ('MetadataDate', 'Identifier')} for scheme in KNOWN_ID_SCHEMES}
defined_tags |= {expand('dc:' + x)
for x in ('identifier', 'title', 'creator', 'date',
'description', 'language', 'publisher',
'subject')}
defined_tags |= {expand('xmp:' + x)
for x in ('MetadataDate', 'Identifier')}
# For redundancy also remove all fields explicitly set in the new packet # For redundancy also remove all fields explicitly set in the new packet
defined_tags |= {x.tag for x in item_xpath(new)} defined_tags |= {x.tag for x in item_xpath(new)}
calibrens = '{%s}' % NS_MAP['calibre'] calibrens = '{%s}' % NS_MAP['calibre']
for elem in item_xpath(old): for elem in item_xpath(old):
if elem.tag in defined_tags or (elem.tag and elem.tag.startswith(calibrens)): if elem.tag in defined_tags or (elem.tag and
elem.tag.startswith(calibrens)):
elem.getparent().remove(elem) elem.getparent().remove(elem)
# Group all items into groups based on their namespaces # Group all items into groups based on their namespaces
groups = defaultdict(list) groups = collections.defaultdict(list)
for item in item_xpath(new): for item in item_xpath(new):
ns = item.nsmap[item.prefix] ns = item.nsmap[item.prefix]
groups[ns].append(item) groups[ns].append(item)
@@ -626,9 +667,14 @@ def merge_xmp_packet(old, new):
root = A.xmpmeta(R.RDF) root = A.xmpmeta(R.RDF)
rdf = root[0] rdf = root[0]
for namespace in sorted(groups, key=lambda x:{NS_MAP['dc']:'a', NS_MAP['xmp']:'b', NS_MAP['calibre']:'c'}.get(x, 'z'+x)): for namespace in sorted(groups,
key=lambda x: {NS_MAP['dc']: 'a',
NS_MAP['xmp']: 'b',
NS_MAP['calibre']: 'c'}.get(x,
'z'+x)):
items = groups[namespace] items = groups[namespace]
desc = rdf.makeelement(expand('rdf:Description'), nsmap=find_nsmap(items)) desc = rdf.makeelement(expand('rdf:Description'),
nsmap=find_nsmap(items))
desc.set(expand('rdf:about'), '') desc.set(expand('rdf:about'), '')
rdf.append(desc) rdf.append(desc)
for item in items: for item in items:
+37 -35
View File
@@ -5,8 +5,10 @@ import copy
import re import re
import numbers import numbers
from lxml import etree from lxml import etree
from ebook_converter.ebooks.oeb.base import namespace, barename
from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS, urlnormalize from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer from ebook_converter.ebooks.oeb.stylizer import Stylizer
from ebook_converter.ebooks.oeb.transforms.flatcss import KeyMapper from ebook_converter.ebooks.oeb.transforms.flatcss import KeyMapper
from ebook_converter.ebooks.mobi.utils import convert_color_for_font_tag from ebook_converter.ebooks.mobi.utils import convert_color_for_font_tag
@@ -23,7 +25,7 @@ def MBP(name):
return '{%s}%s' % (MBP_NS, name) return '{%s}%s' % (MBP_NS, name)
MOBI_NSMAP = {None: XHTML_NS, 'mbp': MBP_NS} MOBI_NSMAP = {None: const.XHTML_NS, 'mbp': const.MBP_NS}
INLINE_TAGS = {'span', 'a', 'code', 'u', 's', 'big', 'strike', 'tt', 'font', 'q', 'i', 'b', 'em', 'strong', 'sup', 'sub'} INLINE_TAGS = {'span', 'a', 'code', 'u', 's', 'big', 'strike', 'tt', 'font', 'q', 'i', 'b', 'em', 'strong', 'sup', 'sub'}
HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'} HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
# GR: Added 'caption' to both sets # GR: Added 'caption' to both sets
@@ -129,9 +131,9 @@ class MobiMLizer(object):
'Iterate over the spine and convert it to MOBIML' 'Iterate over the spine and convert it to MOBIML'
for item in self.oeb.spine: for item in self.oeb.spine:
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile) stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile)
body = item.data.find(XHTML('body')) body = item.data.find(base.tag('xhtml', 'body'))
nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) nroot = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP)
nbody = etree.SubElement(nroot, XHTML('body')) nbody = etree.SubElement(nroot, base.tag('xhtml', 'body'))
self.current_spine_item = item self.current_spine_item = item
self.mobimlize_elem(body, stylizer, BlockState(nbody), self.mobimlize_elem(body, stylizer, BlockState(nbody),
[FormatState()]) [FormatState()])
@@ -162,7 +164,7 @@ class MobiMLizer(object):
lines = text.split('\n') lines = text.split('\n')
result = lines[:1] result = lines[:1]
for line in lines[1:]: for line in lines[1:]:
result.append(etree.Element(XHTML('br'))) result.append(etree.Element(base.tag('xhtml', 'br')))
if line: if line:
result.append(line) result.append(line)
return result return result
@@ -194,7 +196,7 @@ class MobiMLizer(object):
indent = (indent / abs(indent)) * self.profile.fbase indent = (indent / abs(indent)) * self.profile.fbase
if tag in NESTABLE_TAGS and not istate.rendered: if tag in NESTABLE_TAGS and not istate.rendered:
para = wrapper = etree.SubElement( para = wrapper = etree.SubElement(
parent, XHTML(tag), attrib=istate.attrib) parent, base.tag('xhtml', tag), attrib=istate.attrib)
bstate.nested.append(para) bstate.nested.append(para)
if tag == 'li' and len(istates) > 1: if tag == 'li' and len(istates) > 1:
istates[-2].list_num += 1 istates[-2].list_num += 1
@@ -203,21 +205,21 @@ class MobiMLizer(object):
para = wrapper = bstate.nested[-1] para = wrapper = bstate.nested[-1]
elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0: elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0:
ems = self.profile.mobi_ems_per_blockquote ems = self.profile.mobi_ems_per_blockquote
para = wrapper = etree.SubElement(parent, XHTML('blockquote')) para = wrapper = etree.SubElement(parent, base.tag('xhtml', 'blockquote'))
para = wrapper para = wrapper
emleft = int(round(left / self.profile.fbase)) - ems emleft = int(round(left / self.profile.fbase)) - ems
emleft = min((emleft, 10)) emleft = min((emleft, 10))
while emleft > ems / 2: while emleft > ems / 2:
para = etree.SubElement(para, XHTML('blockquote')) para = etree.SubElement(para, base.tag('xhtml', 'blockquote'))
emleft -= ems emleft -= ems
else: else:
para = wrapper = etree.SubElement(parent, XHTML('p')) para = wrapper = etree.SubElement(parent, base.tag('xhtml', 'p'))
bstate.inline = bstate.para = para bstate.inline = bstate.para = para
vspace = bstate.vpadding + bstate.vmargin vspace = bstate.vpadding + bstate.vmargin
bstate.vpadding = bstate.vmargin = 0 bstate.vpadding = bstate.vmargin = 0
if tag not in TABLE_TAGS: if tag not in TABLE_TAGS:
if tag in ('ul', 'ol') and vspace > 0: if tag in ('ul', 'ol') and vspace > 0:
wrapper.addprevious(etree.Element(XHTML('div'), wrapper.addprevious(etree.Element(base.tag('xhtml', 'div'),
height=self.mobimlize_measure(vspace))) height=self.mobimlize_measure(vspace)))
else: else:
wrapper.attrib['height'] = self.mobimlize_measure(vspace) wrapper.attrib['height'] = self.mobimlize_measure(vspace)
@@ -225,7 +227,7 @@ class MobiMLizer(object):
elif tag == 'table' and vspace > 0: elif tag == 'table' and vspace > 0:
vspace = int(round(vspace / self.profile.fbase)) vspace = int(round(vspace / self.profile.fbase))
while vspace > 0: while vspace > 0:
wrapper.addprevious(etree.Element(XHTML('br'))) wrapper.addprevious(etree.Element(base.tag('xhtml', 'br')))
vspace -= 1 vspace -= 1
if istate.halign != 'auto' and isinstance(istate.halign, (bytes, str)): if istate.halign != 'auto' and isinstance(istate.halign, (bytes, str)):
if isinstance(istate.halign, bytes): if isinstance(istate.halign, bytes):
@@ -237,7 +239,7 @@ class MobiMLizer(object):
bstate.inline = para bstate.inline = para
pstate = bstate.istate = None pstate = bstate.istate = None
try: try:
etree.SubElement(para, XHTML(tag), attrib=istate.attrib) etree.SubElement(para, base.tag('xhtml', tag), attrib=istate.attrib)
except: except:
print('Invalid subelement:', para, tag, istate.attrib) print('Invalid subelement:', para, tag, istate.attrib)
raise raise
@@ -245,7 +247,7 @@ class MobiMLizer(object):
para.attrib['valign'] = 'top' para.attrib['valign'] = 'top'
if istate.ids: if istate.ids:
for id_ in istate.ids: for id_ in istate.ids:
anchor = etree.Element(XHTML('a'), attrib={'id': id_}) anchor = etree.Element(base.tag('xhtml', 'a'), attrib={'id': id_})
if tag == 'li': if tag == 'li':
try: try:
last = bstate.body[-1][-1] last = bstate.body[-1][-1]
@@ -262,7 +264,7 @@ class MobiMLizer(object):
# This could potentially break if inserting an anchor at # This could potentially break if inserting an anchor at
# this point in the markup is illegal, but I cannot think # this point in the markup is illegal, but I cannot think
# of such a case offhand. # of such a case offhand.
if barename(last.tag) in LEAF_TAGS: if parse_utils.barename(last.tag) in LEAF_TAGS:
last.addprevious(anchor) last.addprevious(anchor)
else: else:
last.append(anchor) last.append(anchor)
@@ -279,28 +281,28 @@ class MobiMLizer(object):
elif pstate and pstate.href == href: elif pstate and pstate.href == href:
inline = bstate.anchor inline = bstate.anchor
else: else:
inline = etree.SubElement(inline, XHTML('a'), href=href) inline = etree.SubElement(inline, base.tag('xhtml', 'a'), href=href)
bstate.anchor = inline bstate.anchor = inline
if fsize != 3: if fsize != 3:
inline = etree.SubElement(inline, XHTML('font'), inline = etree.SubElement(inline, base.tag('xhtml', 'font'),
size=str(fsize)) size=str(fsize))
if istate.family == 'monospace': if istate.family == 'monospace':
inline = etree.SubElement(inline, XHTML('tt')) inline = etree.SubElement(inline, base.tag('xhtml', 'tt'))
if istate.italic: if istate.italic:
inline = etree.SubElement(inline, XHTML('i')) inline = etree.SubElement(inline, base.tag('xhtml', 'i'))
if istate.bold: if istate.bold:
inline = etree.SubElement(inline, XHTML('b')) inline = etree.SubElement(inline, base.tag('xhtml', 'b'))
if istate.bgcolor is not None and istate.bgcolor != 'transparent' : if istate.bgcolor is not None and istate.bgcolor != 'transparent' :
inline = etree.SubElement(inline, XHTML('span'), inline = etree.SubElement(inline, base.tag('xhtml', 'span'),
bgcolor=convert_color_for_font_tag(istate.bgcolor)) bgcolor=convert_color_for_font_tag(istate.bgcolor))
if istate.fgcolor != 'black': if istate.fgcolor != 'black':
inline = etree.SubElement(inline, XHTML('font'), inline = etree.SubElement(inline, base.tag('xhtml', 'font'),
color=convert_color_for_font_tag(istate.fgcolor)) color=convert_color_for_font_tag(istate.fgcolor))
if istate.strikethrough: if istate.strikethrough:
inline = etree.SubElement(inline, XHTML('s')) inline = etree.SubElement(inline, base.tag('xhtml', 's'))
if istate.underline: if istate.underline:
inline = etree.SubElement(inline, XHTML('u')) inline = etree.SubElement(inline, base.tag('xhtml', 'u'))
bstate.inline = inline bstate.inline = inline
bstate.istate = istate bstate.istate = istate
inline = bstate.inline inline = bstate.inline
@@ -318,7 +320,7 @@ class MobiMLizer(object):
def mobimlize_elem(self, elem, stylizer, bstate, istates, def mobimlize_elem(self, elem, stylizer, bstate, istates,
ignore_valign=False): ignore_valign=False):
if not isinstance(elem.tag, (str, bytes)) \ if not isinstance(elem.tag, (str, bytes)) \
or namespace(elem.tag) != XHTML_NS: or parse_utils.namespace(elem.tag) != const.XHTML_NS:
return return
style = stylizer.style(elem) style = stylizer.style(elem)
# <mbp:frame-set/> does not exist lalalala # <mbp:frame-set/> does not exist lalalala
@@ -333,10 +335,10 @@ class MobiMLizer(object):
elem.text = None elem.text = None
elem.set('id', id_) elem.set('id', id_)
elem.tail = tail elem.tail = tail
elem.tag = XHTML('a') elem.tag = base.tag('xhtml', 'a')
else: else:
return return
tag = barename(elem.tag) tag = parse_utils.barename(elem.tag)
istate = copy.copy(istates[-1]) istate = copy.copy(istates[-1])
istate.rendered = False istate.rendered = False
istate.list_num = 0 istate.list_num = 0
@@ -451,7 +453,7 @@ class MobiMLizer(object):
if 'width' not in istate.attrib or 'height' not in istate.attrib: if 'width' not in istate.attrib or 'height' not in istate.attrib:
href = self.current_spine_item.abshref(elem.attrib['src']) href = self.current_spine_item.abshref(elem.attrib['src'])
try: try:
item = self.oeb.manifest.hrefs[urlnormalize(href)] item = self.oeb.manifest.hrefs[base.urlnormalize(href)]
except: except:
self.oeb.logger.warn('Failed to find image:', self.oeb.logger.warn('Failed to find image:',
href) href)
@@ -534,9 +536,9 @@ class MobiMLizer(object):
isinstance(valign, numbers.Number) and valign > 0) isinstance(valign, numbers.Number) and valign > 0)
vtag = 'sup' if issup else 'sub' vtag = 'sup' if issup else 'sub'
if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock: if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) nroot = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP)
vbstate = BlockState(etree.SubElement(nroot, XHTML('body'))) vbstate = BlockState(etree.SubElement(nroot, base.tag('xhtml', 'body')))
vbstate.para = etree.SubElement(vbstate.body, XHTML('p')) vbstate.para = etree.SubElement(vbstate.body, base.tag('xhtml', 'p'))
self.mobimlize_elem(elem, stylizer, vbstate, istates, self.mobimlize_elem(elem, stylizer, vbstate, istates,
ignore_valign=True) ignore_valign=True)
if len(istates) > 0: if len(istates) > 0:
@@ -548,8 +550,8 @@ class MobiMLizer(object):
self.mobimlize_content('span', '', bstate, istates) self.mobimlize_content('span', '', bstate, istates)
parent = bstate.para if bstate.inline is None else bstate.inline parent = bstate.para if bstate.inline is None else bstate.inline
if parent is not None: if parent is not None:
vtag = etree.SubElement(parent, XHTML(vtag)) vtag = etree.SubElement(parent, base.tag('xhtml', vtag))
vtag = etree.SubElement(vtag, XHTML('small')) vtag = etree.SubElement(vtag, base.tag('xhtml', 'small'))
# Add anchors # Add anchors
for child in vbstate.body: for child in vbstate.body:
if child is not vbstate.para: if child is not vbstate.para:
@@ -601,7 +603,7 @@ class MobiMLizer(object):
para = bstate.para para = bstate.para
if para is not None and para.text == '\xa0' and len(para) < 1: if para is not None and para.text == '\xa0' and len(para) < 1:
if style.height > 2: if style.height > 2:
para.getparent().replace(para, etree.Element(XHTML('br'))) para.getparent().replace(para, etree.Element(base.tag('xhtml', 'br')))
else: else:
# This is too small to be rendered effectively, drop it # This is too small to be rendered effectively, drop it
para.getparent().remove(para) para.getparent().remove(para)
+4 -3
View File
@@ -8,6 +8,7 @@ import uuid
from lxml import etree from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.mobi.reader.headers import NULL_INDEX from ebook_converter.ebooks.mobi.reader.headers import NULL_INDEX
from ebook_converter.ebooks.mobi.reader.index import read_index from ebook_converter.ebooks.mobi.reader.index import read_index
from ebook_converter.ebooks.mobi.reader.ncx import read_ncx, build_toc from ebook_converter.ebooks.mobi.reader.ncx import read_ncx, build_toc
@@ -17,7 +18,7 @@ from ebook_converter.ebooks.metadata.opf2 import Guide, OPFCreator
from ebook_converter.ebooks.metadata.toc import TOC from ebook_converter.ebooks.metadata.toc import TOC
from ebook_converter.ebooks.mobi.utils import read_font_record from ebook_converter.ebooks.mobi.utils import read_font_record
from ebook_converter.ebooks.oeb.parse_utils import parse_html from ebook_converter.ebooks.oeb.parse_utils import parse_html
from ebook_converter.ebooks.oeb.base import XPath, XHTML, xml2text from ebook_converter.ebooks.oeb.base import XPath, xml2text
from ebook_converter.polyglot.builtins import as_unicode from ebook_converter.polyglot.builtins import as_unicode
@@ -553,8 +554,8 @@ class Mobi8Reader(object):
seen = set() seen = set()
links = [] links = []
for elem in root.iterdescendants(etree.Element): for elem in root.iterdescendants(etree.Element):
if reached and elem.tag == XHTML('a') and elem.get('href', if reached and elem.tag == const.XHTML_A and elem.get('href',
False): False):
href = elem.get('href') href = elem.get('href')
href, frag = urllib.parse.urldefrag(href) href, frag = urllib.parse.urldefrag(href)
href = base_href + '/' + href href = base_href + '/' + href
@@ -4,16 +4,11 @@ import re
import unicodedata import unicodedata
import urllib.parse import urllib.parse
from ebook_converter import constants as const
from ebook_converter.ebooks.mobi.mobiml import MBP_NS from ebook_converter.ebooks.mobi.mobiml import MBP_NS
from ebook_converter.ebooks.mobi.utils import is_guide_ref_start from ebook_converter.ebooks.mobi.utils import is_guide_ref_start
from ebook_converter.ebooks.oeb.base import ( from ebook_converter.ebooks.oeb import base
OEB_DOCS, XHTML, XHTML_NS, XML_NS, namespace, prefixname, urlnormalize from ebook_converter.ebooks.oeb import parse_utils
)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class Buf(io.BytesIO): class Buf(io.BytesIO):
@@ -25,9 +20,14 @@ class Buf(io.BytesIO):
class Serializer(object): class Serializer(object):
NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'} NSRMAP = {'': None,
const.XML_NS: 'xml',
const.XHTML_NS: '',
MBP_NS: 'mbp'} # TODO(gryf): check why this is different than
# MBP_NS from const.
def __init__(self, oeb, images, is_periodical, write_page_breaks_after_item=True): def __init__(self, oeb, images, is_periodical,
write_page_breaks_after_item=True):
''' '''
Write all the HTML markup in oeb into a single in memory buffer Write all the HTML markup in oeb into a single in memory buffer
containing a single html document with links replaced by offsets into containing a single html document with links replaced by offsets into
@@ -157,7 +157,8 @@ class Serializer(object):
buf.write(b'<guide>') buf.write(b'<guide>')
for ref in self.oeb.guide.values(): for ref in self.oeb.guide.values():
path = urllib.parse.urldefrag(ref.href)[0] path = urllib.parse.urldefrag(ref.href)[0]
if path not in hrefs or hrefs[path].media_type not in OEB_DOCS: if (path not in hrefs or
hrefs[path].media_type not in base.OEB_DOCS):
continue continue
buf.write(b'<reference type="') buf.write(b'<reference type="')
@@ -178,28 +179,28 @@ class Serializer(object):
buf.write(b'</guide>') buf.write(b'</guide>')
def serialize_href(self, href, base=None): def serialize_href(self, href, _base=None):
''' """
Serialize the href attribute of an <a> or <reference> tag. It is Serialize the href attribute of an <a> or <reference> tag. It is
serialized as filepos="000000000" and a pointer to its location is serialized as filepos="000000000" and a pointer to its location is
stored in self.href_offsets so that the correct value can be filled in stored in self.href_offsets so that the correct value can be filled in
at the end. at the end.
''' """
hrefs = self.oeb.manifest.hrefs hrefs = self.oeb.manifest.hrefs
try: try:
path, frag = urllib.parse.urldefrag(urlnormalize(href)) path, frag = urllib.parse.urldefrag(base.urlnormalize(href))
except ValueError: except ValueError:
# Unparseable URL # Unparseable URL
return False return False
if path and base: if path and _base:
path = base.abshref(path) path = _base.abshref(path)
if path and path not in hrefs: if path and path not in hrefs:
return False return False
buf = self.buf buf = self.buf
item = hrefs[path] if path else None item = hrefs[path] if path else None
if item and item.spine_position is None: if item and item.spine_position is None:
return False return False
path = item.href if item else base.href path = item.href if item else _base.href
href = '#'.join((path, frag)) if frag else path href = '#'.join((path, frag)) if frag else path
buf.write(b'filepos=') buf.write(b'filepos=')
self.href_offsets[href].append(buf.tell()) self.href_offsets[href].append(buf.tell())
@@ -219,7 +220,7 @@ class Serializer(object):
if href is not None: if href is not None:
# resolve the section url in id_offsets # resolve the section url in id_offsets
buf.write(b'<mbp:pagebreak />') buf.write(b'<mbp:pagebreak />')
self.id_offsets[urlnormalize(href)] = buf.tell() self.id_offsets[base.urlnormalize(href)] = buf.tell()
if tocref.klass == "periodical": if tocref.klass == "periodical":
buf.write(b'<div> <div height="1em"></div>') buf.write(b'<div> <div height="1em"></div>')
@@ -267,7 +268,7 @@ class Serializer(object):
if self.is_periodical and item.is_section_start: if self.is_periodical and item.is_section_start:
for section_toc in top_toc.nodes: for section_toc in top_toc.nodes:
if urlnormalize(item.href) == section_toc.href: if base.urlnormalize(item.href) == section_toc.href:
# create section url of the form r'feed_\d+/index.html' # create section url of the form r'feed_\d+/index.html'
section_url = re.sub(r'article_\d+/', '', section_toc.href) section_url = re.sub(r'article_\d+/', '', section_toc.href)
serialize_toc_level(section_toc, section_url) serialize_toc_level(section_toc, section_url)
@@ -287,12 +288,12 @@ class Serializer(object):
buf = self.buf buf = self.buf
if not item.linear: if not item.linear:
self.breaks.append(buf.tell() - 1) self.breaks.append(buf.tell() - 1)
self.id_offsets[urlnormalize(item.href)] = buf.tell() self.id_offsets[base.urlnormalize(item.href)] = buf.tell()
if item.is_section_start: if item.is_section_start:
buf.write(b'<a ></a> ') buf.write(b'<a ></a> ')
if item.is_article_start: if item.is_article_start:
buf.write(b'<a ></a> <a ></a>') buf.write(b'<a ></a> <a ></a>')
for elem in item.data.find(XHTML('body')): for elem in item.data.find(base.tag('xhtml', 'body')):
self.serialize_elem(elem, item) self.serialize_elem(elem, item)
if self.write_page_breaks_after_item: if self.write_page_breaks_after_item:
buf.write(b'<mbp:pagebreak/>') buf.write(b'<mbp:pagebreak/>')
@@ -306,15 +307,15 @@ class Serializer(object):
def serialize_elem(self, elem, item, nsrmap=NSRMAP): def serialize_elem(self, elem, item, nsrmap=NSRMAP):
buf = self.buf buf = self.buf
if not isinstance(elem.tag, (str, bytes)) \ if not isinstance(elem.tag, (str, bytes)) \
or namespace(elem.tag) not in nsrmap: or parse_utils.namespace(elem.tag) not in nsrmap:
return return
tag = prefixname(elem.tag, nsrmap) tag = base.prefixname(elem.tag, nsrmap)
# Previous layers take care of @name # Previous layers take care of @name
id_ = elem.attrib.pop('id', None) id_ = elem.attrib.pop('id', None)
if id_: if id_:
href = '#'.join((item.href, id_)) href = '#'.join((item.href, id_))
offset = self.anchor_offset or buf.tell() offset = self.anchor_offset or buf.tell()
key = urlnormalize(href) key = base.urlnormalize(href)
# Only set this id_offset if it wasn't previously seen # Only set this id_offset if it wasn't previously seen
self.id_offsets[key] = self.id_offsets.get(key, offset) self.id_offsets[key] = self.id_offsets.get(key, offset)
if self.anchor_offset is not None and \ if self.anchor_offset is not None and \
@@ -326,15 +327,15 @@ class Serializer(object):
buf.write(tag.encode('utf-8')) buf.write(tag.encode('utf-8'))
if elem.attrib: if elem.attrib:
for attr, val in elem.attrib.items(): for attr, val in elem.attrib.items():
if namespace(attr) not in nsrmap: if parse_utils.namespace(attr) not in nsrmap:
continue continue
attr = prefixname(attr, nsrmap) attr = base.prefixname(attr, nsrmap)
buf.write(b' ') buf.write(b' ')
if attr == 'href': if attr == 'href':
if self.serialize_href(val, item): if self.serialize_href(val, item):
continue continue
elif attr == 'src': elif attr == 'src':
href = urlnormalize(item.abshref(val)) href = base.urlnormalize(item.abshref(val))
if href in self.images: if href in self.images:
index = self.images[href] index = self.images[href]
self.used_images.add(href) self.used_images.add(href)
+2 -2
View File
@@ -2,6 +2,7 @@ import re
from struct import pack from struct import pack
from io import BytesIO from io import BytesIO
from ebook_converter.ebooks.oeb import base
from ebook_converter.constants_old import iswindows, isosx from ebook_converter.constants_old import iswindows, isosx
from ebook_converter.ebooks.mobi.utils import (utf8_text, to_base) from ebook_converter.ebooks.mobi.utils import (utf8_text, to_base)
from ebook_converter.utils.localization import lang_as_iso639_1 from ebook_converter.utils.localization import lang_as_iso639_1
@@ -95,9 +96,8 @@ def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
# Write UUID as ASIN # Write UUID as ASIN
uuid = None uuid = None
from ebook_converter.ebooks.oeb.base import OPF
for x in metadata['identifier']: for x in metadata['identifier']:
if (x.get(OPF('scheme'), None).lower() == 'uuid' or if (x.get(base.tag('opf', 'scheme'), None).lower() == 'uuid' or
str(x).startswith('urn:uuid:')): str(x).startswith('urn:uuid:')):
uuid = str(x).split(':')[-1] uuid = str(x).split(':')[-1]
break break
+4 -9
View File
@@ -1,11 +1,12 @@
""" """
Convert an ODT file into a Open Ebook Convert an ODT file into a Open Ebook
""" """
import os, logging import logging
import os
from lxml import etree
from css_parser import CSSParser from css_parser import CSSParser
from css_parser.css import CSSRule from css_parser.css import CSSRule
from lxml import etree
from odf.odf2xhtml import ODF2XHTML from odf.odf2xhtml import ODF2XHTML
from odf.opendocument import load as odLoad from odf.opendocument import load as odLoad
@@ -14,15 +15,9 @@ from odf.namespaces import TEXTNS as odTEXTNS
from ebook_converter import CurrentDir, walk from ebook_converter import CurrentDir, walk
from ebook_converter.ebooks.oeb.base import _css_logger from ebook_converter.ebooks.oeb.base import _css_logger
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.polyglot.builtins import as_bytes from ebook_converter.polyglot.builtins import as_bytes
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
class Extract(ODF2XHTML): class Extract(ODF2XHTML):
def extract_pictures(self, zf): def extract_pictures(self, zf):
@@ -46,7 +41,7 @@ class Extract(ODF2XHTML):
ol.set('start', val) ol.set('start', val)
def fix_markup(self, html, log): def fix_markup(self, html, log):
root = safe_xml_fromstring(html) root = etree.fromstring(html)
self.filter_css(root, log) self.filter_css(root, log)
self.extract_css(root, log) self.extract_css(root, log)
self.epubify_markup(root, log) self.epubify_markup(root, log)
File diff suppressed because it is too large Load Diff
+21 -25
View File
@@ -1,20 +1,16 @@
import re import re
from lxml import etree, html from lxml import etree
from lxml import html
from ebook_converter import constants as const
from ebook_converter import xml_replace_entities, force_unicode from ebook_converter import xml_replace_entities, force_unicode
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.constants_old import filesystem_encoding from ebook_converter.constants_old import filesystem_encoding
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
__license__ = 'GPL v3' RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True,
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' resolve_entities=False)
__docformat__ = 'restructuredtext en'
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
XHTML_NS = 'http://www.w3.org/1999/xhtml'
XMLNS_NS = 'http://www.w3.org/2000/xmlns/'
class NotHTML(Exception): class NotHTML(Exception):
@@ -33,15 +29,15 @@ def namespace(name):
def XHTML(name): def XHTML(name):
return '{%s}%s' % (XHTML_NS, name) return '{%s}%s' % (const.XHTML_NS, name)
def xpath(elem, expr): def xpath(elem, expr):
return elem.xpath(expr, namespaces={'h':XHTML_NS}) return elem.xpath(expr, namespaces={'h':const.XHTML_NS})
def XPath(expr): def XPath(expr):
return etree.XPath(expr, namespaces={'h':XHTML_NS}) return etree.XPath(expr, namespaces={'h':const.XHTML_NS})
META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]') META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
@@ -111,7 +107,7 @@ def _html4_parse(data):
elem.text = elem.text.strip('-') elem.text = elem.text.strip('-')
data = etree.tostring(data, encoding='unicode') data = etree.tostring(data, encoding='unicode')
data = safe_xml_fromstring(data) data = etree.fromstring(data)
return data return data
@@ -204,14 +200,14 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
# Try with more & more drastic measures to parse # Try with more & more drastic measures to parse
try: try:
data = safe_xml_fromstring(data, recover=False) data = etree.fromstring(data)
check_for_html5(pre, data) check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError): except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Initial parse failed, using more' log.debug('Initial parse failed, using more'
' forgiving parsers') ' forgiving parsers')
raw = data = xml_replace_entities(raw) raw = data = xml_replace_entities(raw)
try: try:
data = safe_xml_fromstring(data, recover=False) data = etree.fromstring(data)
check_for_html5(pre, data) check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError): except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Parsing %s as HTML' % filename) log.debug('Parsing %s as HTML' % filename)
@@ -240,7 +236,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
if barename(data.tag) in non_html_file_tags: if barename(data.tag) in non_html_file_tags:
raise NotHTML(data.tag) raise NotHTML(data.tag)
log.warn('File %r does not appear to be (X)HTML'%filename) log.warn('File %r does not appear to be (X)HTML'%filename)
nroot = safe_xml_fromstring('<html></html>') nroot = etree.fromstring('<html></html>')
has_body = False has_body = False
for child in list(data): for child in list(data):
if isinstance(child.tag, (str, bytes)) and barename(child.tag) == 'body': if isinstance(child.tag, (str, bytes)) and barename(child.tag) == 'body':
@@ -249,7 +245,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
parent = nroot parent = nroot
if not has_body: if not has_body:
log.warn('File %r appears to be a HTML fragment'%filename) log.warn('File %r appears to be a HTML fragment'%filename)
nroot = safe_xml_fromstring('<html><body/></html>') nroot = etree.fromstring('<html><body/></html>')
parent = nroot[0] parent = nroot[0]
for child in list(data.iter()): for child in list(data.iter()):
oparent = child.getparent() oparent = child.getparent()
@@ -261,16 +257,16 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
# Force into the XHTML namespace # Force into the XHTML namespace
if not namespace(data.tag): if not namespace(data.tag):
log.warn('Forcing', filename, 'into XHTML namespace') log.warn('Forcing', filename, 'into XHTML namespace')
data.attrib['xmlns'] = XHTML_NS data.attrib['xmlns'] = const.XHTML_NS
data = etree.tostring(data, encoding='unicode') data = etree.tostring(data, encoding='unicode')
try: try:
data = safe_xml_fromstring(data, recover=False) data = etree.fromstring(data)
except: except:
data = data.replace(':=', '=').replace(':>', '>') data = data.replace(':=', '=').replace(':>', '>')
data = data.replace('<http:/>', '') data = data.replace('<http:/>', '')
try: try:
data = safe_xml_fromstring(data, recover=False) data = etree.fromstring(data)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
log.warn('Stripping comments from %s'% log.warn('Stripping comments from %s'%
filename) filename)
@@ -281,17 +277,17 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
'') '')
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '') data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
try: try:
data = safe_xml_fromstring(data) data = etree.fromstring(data)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
log.warn('Stripping meta tags from %s'% filename) log.warn('Stripping meta tags from %s'% filename)
data = re.sub(r'<meta\s+[^>]+?>', '', data) data = re.sub(r'<meta\s+[^>]+?>', '', data)
data = safe_xml_fromstring(data) data = etree.fromstring(data)
elif namespace(data.tag) != XHTML_NS: elif namespace(data.tag) != const.XHTML_NS:
# OEB_DOC_NS, but possibly others # OEB_DOC_NS, but possibly others
ns = namespace(data.tag) ns = namespace(data.tag)
attrib = dict(data.attrib) attrib = dict(data.attrib)
nroot = etree.Element(XHTML('html'), nroot = etree.Element(XHTML('html'),
nsmap={None: XHTML_NS}, attrib=attrib) nsmap={None: const.XHTML_NS}, attrib=attrib)
for elem in data.iterdescendants(): for elem in data.iterdescendants():
if isinstance(elem.tag, (str, bytes)) and \ if isinstance(elem.tag, (str, bytes)) and \
namespace(elem.tag) == ns: namespace(elem.tag) == ns:
@@ -301,7 +297,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
data = nroot data = nroot
# Remove non default prefixes referring to the XHTML namespace # Remove non default prefixes referring to the XHTML namespace
data = ensure_namespace_prefixes(data, {None: XHTML_NS}) data = ensure_namespace_prefixes(data, {None: const.XHTML_NS})
data = merge_multiple_html_heads_and_bodies(data, log) data = merge_multiple_html_heads_and_bodies(data, log)
# Ensure has a <head/> # Ensure has a <head/>
+12 -11
View File
@@ -14,7 +14,9 @@ from itertools import count
import urllib.parse import urllib.parse
from css_parser import getUrls, replaceUrls from css_parser import getUrls, replaceUrls
from lxml import etree
from ebook_converter import constants as const
from ebook_converter import CurrentDir, walk from ebook_converter import CurrentDir, walk
from ebook_converter.constants_old import iswindows from ebook_converter.constants_old import iswindows
from ebook_converter.customize.ui import plugin_for_input_format, plugin_for_output_format from ebook_converter.customize.ui import plugin_for_input_format, plugin_for_output_format
@@ -34,7 +36,7 @@ from ebook_converter.ebooks.mobi import MobiError
from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader
from ebook_converter.ebooks.mobi.tweak import set_cover from ebook_converter.ebooks.mobi.tweak import set_cover
from ebook_converter.ebooks.oeb.base import ( from ebook_converter.ebooks.oeb.base import (
DC11_NS, OEB_DOCS, OEB_STYLES, OPF, OPF2_NS, Manifest, itercsslinks, iterlinks, OEB_DOCS, OEB_STYLES, Manifest, itercsslinks, iterlinks,
rewrite_links, serialize, urlquote, urlunquote rewrite_links, serialize, urlquote, urlunquote
) )
from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html
@@ -47,13 +49,11 @@ from ebook_converter.ptempfile import PersistentTemporaryDirectory, PersistentTe
from ebook_converter.utils.filenames import hardlink_file, nlinks_file from ebook_converter.utils.filenames import hardlink_file, nlinks_file
from ebook_converter.utils.ipc.simple_worker import WorkerError, fork_job from ebook_converter.utils.ipc.simple_worker import WorkerError, fork_job
from ebook_converter.utils.logging import default_log from ebook_converter.utils.logging import default_log
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.utils.zipfile import ZipFile from ebook_converter.utils.zipfile import ZipFile
exists, join, relpath = os.path.exists, os.path.join, os.path.relpath exists, join, relpath = os.path.exists, os.path.join, os.path.relpath
OEB_FONTS = {guess_type('a.ttf'), guess_type('b.otf'), guess_type('a.woff'), 'application/x-font-ttf', 'application/x-font-otf', 'application/font-sfnt'} OEB_FONTS = {guess_type('a.ttf'), guess_type('b.otf'), guess_type('a.woff'), 'application/x-font-ttf', 'application/x-font-otf', 'application/font-sfnt'}
OPF_NAMESPACES = {'opf':OPF2_NS, 'dc':DC11_NS}
null = object() null = object()
@@ -195,7 +195,7 @@ class ContainerBase(object): # {{{
data, self.used_encoding = xml_to_unicode( data, self.used_encoding = xml_to_unicode(
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True) data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
data = unicodedata.normalize('NFC', data) data = unicodedata.normalize('NFC', data)
return safe_xml_fromstring(data) return etree.fromstring(data)
def parse_xhtml(self, data, fname='<string>', force_html5_parse=False): def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
if self.tweak_mode: if self.tweak_mode:
@@ -324,7 +324,7 @@ class Container(ContainerBase): # {{{
item_id = 'id' + '%d'%c item_id = 'id' + '%d'%c
manifest = self.opf_xpath('//opf:manifest')[0] manifest = self.opf_xpath('//opf:manifest')[0]
href = self.name_to_href(name, self.opf_name) href = self.name_to_href(name, self.opf_name)
item = manifest.makeelement(OPF('item'), item = manifest.makeelement(const.OPF_ITEM,
id=item_id, href=href) id=item_id, href=href)
item.set('media-type', self.mime_map[name]) item.set('media-type', self.mime_map[name])
self.insert_into_xml(manifest, item) self.insert_into_xml(manifest, item)
@@ -380,7 +380,7 @@ class Container(ContainerBase): # {{{
if mt in OEB_DOCS: if mt in OEB_DOCS:
manifest = self.opf_xpath('//opf:manifest')[0] manifest = self.opf_xpath('//opf:manifest')[0]
spine = self.opf_xpath('//opf:spine')[0] spine = self.opf_xpath('//opf:spine')[0]
si = manifest.makeelement(OPF('itemref'), idref=item_id) si = manifest.makeelement(const.OPF_ITEMREF, idref=item_id)
self.insert_into_xml(spine, si, index=spine_index) self.insert_into_xml(spine, si, index=spine_index)
return name return name
@@ -533,7 +533,7 @@ class Container(ContainerBase): # {{{
def opf_xpath(self, expr): def opf_xpath(self, expr):
' Convenience method to evaluate an XPath expression on the OPF file, has the opf: and dc: namespace prefixes pre-defined. ' ' Convenience method to evaluate an XPath expression on the OPF file, has the opf: and dc: namespace prefixes pre-defined. '
return self.opf.xpath(expr, namespaces=OPF_NAMESPACES) return self.opf.xpath(expr, namespaces=const.OPF_NAMESPACES)
def has_name(self, name): def has_name(self, name):
''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive. ''' ''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive. '''
@@ -813,7 +813,8 @@ class Container(ContainerBase): # {{{
spine = self.opf_xpath('//opf:spine')[0] spine = self.opf_xpath('//opf:spine')[0]
spine.text = tail spine.text = tail
for name, linear in spine_items: for name, linear in spine_items:
i = spine.makeelement('{%s}itemref' % OPF_NAMESPACES['opf'], nsmap={'opf':OPF_NAMESPACES['opf']}) i = spine.makeelement(const.OPF_ITEMREF,
nsmap={'opf': const.OPF2_NS})
i.tail = tail i.tail = tail
i.set('idref', imap[name]) i.set('idref', imap[name])
spine.append(i) spine.append(i)
@@ -944,7 +945,7 @@ class Container(ContainerBase): # {{{
item_id = id_prefix + '%d'%c item_id = id_prefix + '%d'%c
manifest = self.opf_xpath('//opf:manifest')[0] manifest = self.opf_xpath('//opf:manifest')[0]
item = manifest.makeelement(OPF('item'), item = manifest.makeelement(const.OPF_ITEM,
id=item_id, href=href) id=item_id, href=href)
item.set('media-type', media_type) item.set('media-type', media_type)
self.insert_into_xml(manifest, item) self.insert_into_xml(manifest, item)
@@ -993,7 +994,7 @@ class Container(ContainerBase): # {{{
self.format_opf() self.format_opf()
data = serialize(data, self.mime_map[name], pretty_print=name in data = serialize(data, self.mime_map[name], pretty_print=name in
self.pretty_print) self.pretty_print)
if name == self.opf_name and root.nsmap.get(None) == OPF2_NS: if name == self.opf_name and root.nsmap.get(None) == const.OPF2_NS:
# Needed as I can't get lxml to output opf:role and # Needed as I can't get lxml to output opf:role and
# not output <opf:metadata> as well # not output <opf:metadata> as well
data = re.sub(br'(<[/]{0,1})opf:', r'\1', data) data = re.sub(br'(<[/]{0,1})opf:', r'\1', data)
@@ -1172,7 +1173,7 @@ class EpubContainer(Container):
container_path = join(self.root, 'META-INF', 'container.xml') container_path = join(self.root, 'META-INF', 'container.xml')
if not exists(container_path): if not exists(container_path):
raise InvalidEpub('No META-INF/container.xml in epub') raise InvalidEpub('No META-INF/container.xml in epub')
container = safe_xml_fromstring(open(container_path, 'rb').read()) container = etree.fromstring(open(container_path, 'rb').read())
opf_files = container.xpath(( opf_files = container.xpath((
r'child::ocf:rootfiles/ocf:rootfile' r'child::ocf:rootfiles/ocf:rootfile'
'[@media-type="%s" and @full-path]'%guess_type('a.opf') '[@media-type="%s" and @full-path]'%guess_type('a.opf')
+4 -3
View File
@@ -2,10 +2,11 @@ from collections import defaultdict
from functools import partial from functools import partial
from css_parser.css import CSSRule, CSSStyleDeclaration from css_parser.css import CSSRule, CSSStyleDeclaration
from ebook_converter.css_selectors import parse, SelectorSyntaxError
from ebook_converter import constants as const
from ebook_converter import force_unicode from ebook_converter import force_unicode
from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XHTML, css_text from ebook_converter.css_selectors import parse, SelectorSyntaxError
from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, css_text
from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css, normalizers from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
from ebook_converter.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize from ebook_converter.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
from ebook_converter.utils.icu import numeric_sort_key from ebook_converter.utils.icu import numeric_sort_key
@@ -382,7 +383,7 @@ def add_stylesheet_links(container, name, text):
if not sheets: if not sheets:
return return
for sname in sheets: for sname in sheets:
link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(sname, name)) link = head.makeelement(const.XHTML_LINK, type='text/css', rel='stylesheet', href=container.name_to_href(sname, name))
head.append(link) head.append(link)
pretty_xml_tree(head) pretty_xml_tree(head)
return serialize(root, 'text/html') return serialize(root, 'text/html')
+11 -11
View File
@@ -1,13 +1,9 @@
from lxml import etree from lxml import etree
from ebook_converter.ebooks.oeb.polish.container import OPF_NAMESPACES from ebook_converter import constants as const
from ebook_converter.utils.localization import canonicalize_lang from ebook_converter.utils.localization import canonicalize_lang
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
def get_book_language(container): def get_book_language(container):
for lang in container.opf_xpath('//dc:language'): for lang in container.opf_xpath('//dc:language'):
raw = lang.text raw = lang.text
@@ -18,7 +14,7 @@ def get_book_language(container):
def set_guide_item(container, item_type, title, name, frag=None): def set_guide_item(container, item_type, title, name, frag=None):
ref_tag = '{%s}reference' % OPF_NAMESPACES['opf'] ref_tag = const.OPF_REFERENCE
href = None href = None
if name: if name:
href = container.name_to_href(name, container.opf_name) href = container.name_to_href(name, container.opf_name)
@@ -27,23 +23,27 @@ def set_guide_item(container, item_type, title, name, frag=None):
guides = container.opf_xpath('//opf:guide') guides = container.opf_xpath('//opf:guide')
if not guides and href: if not guides and href:
g = container.opf.makeelement('{%s}guide' % OPF_NAMESPACES['opf'], nsmap={'opf':OPF_NAMESPACES['opf']}) g = container.opf.makeelement(const.OPF_GUIDE,
nsmap={'opf': const.OPF2_NS})
container.insert_into_xml(container.opf, g) container.insert_into_xml(container.opf, g)
guides = [g] guides = [g]
for guide in guides: for guide in guides:
matches = [] matches = []
for child in guide.iterchildren(etree.Element): for child in guide.iterchildren(etree.Element):
if child.tag == ref_tag and child.get('type', '').lower() == item_type.lower(): if (child.tag == ref_tag and
child.get('type', '').lower() == item_type.lower()):
matches.append(child) matches.append(child)
if not matches and href: if not matches and href:
r = guide.makeelement(ref_tag, type=item_type, nsmap={'opf':OPF_NAMESPACES['opf']}) r = guide.makeelement(ref_tag, type=item_type,
nsmap={'opf': const.OPF2_NS})
container.insert_into_xml(guide, r) container.insert_into_xml(guide, r)
matches.append(r) matches.append(r)
for m in matches: for m in matches:
if href: if href:
m.set('title', title), m.set('href', href), m.set('type', item_type) m.set('title', title)
m.set('href', href)
m.set('type', item_type)
else: else:
container.remove_from_xml(m) container.remove_from_xml(m)
container.dirty(container.opf_name) container.dirty(container.opf_name)
+34 -25
View File
@@ -1,21 +1,18 @@
import re import re
from lxml.etree import Element as LxmlElement from lxml import etree
import html5_parser import html5_parser
from ebook_converter import constants as const
from ebook_converter import xml_replace_entities from ebook_converter import xml_replace_entities
from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.ebooks.chardet import strip_encoding_declarations
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.utils.cleantext import clean_xml_chars
__license__ = 'GPL v3' def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' line_numbers=True, linenumber_attribute=None,
replace_entities=True, fix_newlines=True):
XHTML_NS = 'http://www.w3.org/1999/xhtml'
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
if isinstance(raw, bytes): if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
if replace_entities: if replace_entities:
@@ -23,10 +20,14 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb
if fix_newlines: if fix_newlines:
raw = raw.replace('\r\n', '\n').replace('\r', '\n') raw = raw.replace('\r\n', '\n').replace('\r', '\n')
raw = clean_xml_chars(raw) raw = clean_xml_chars(raw)
root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True) root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces,
if (discard_namespaces and root.tag != 'html') or ( line_number_attr=linenumber_attribute,
not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)): keep_doctype=False, sanitize_names=True)
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix)) if ((discard_namespaces and root.tag != 'html') or
(not discard_namespaces and
(root.tag != '{%s}%s' % (const.XHTML_NS, 'html') or root.prefix))):
raise ValueError('Failed to parse correctly, root has tag: %s and '
'prefix: %s' % (root.tag, root.prefix))
return root return root
@@ -48,12 +49,14 @@ def handle_private_entities(data):
user_entities[match.group(1)] = val user_entities[match.group(1)] = val
if user_entities: if user_entities:
data = ('\n' * num_of_nl_in_pre) + data[idx:] data = ('\n' * num_of_nl_in_pre) + data[idx:]
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys()))) pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys())))
data = pat.sub(lambda m:user_entities[m.group(1)], data) data = pat.sub(lambda m: user_entities[m.group(1)], data)
return data return data
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False): def parse(raw, decoder=None, log=None, line_numbers=True,
linenumber_attribute=None, replace_entities=True,
force_html5_parse=False):
if isinstance(raw, bytes): if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
raw = handle_private_entities(raw) raw = handle_private_entities(raw)
@@ -70,26 +73,32 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
raw = ('\n' * newlines) + raw[match.start():] raw = ('\n' * newlines) + raw[match.start():]
break break
raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True) raw = strip_encoding_declarations(raw, limit=10*1024,
preserve_newlines=True)
if force_html5_parse: if force_html5_parse:
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False) return parse_html5(raw, log=log, line_numbers=line_numbers,
linenumber_attribute=linenumber_attribute,
replace_entities=False, fix_newlines=False)
try: try:
ans = safe_xml_fromstring(raw, recover=False) ans = etree.fromstring(raw)
if ans.tag != '{%s}html' % XHTML_NS: if ans.tag != '{%s}html' % const.XHTML_NS:
raise ValueError('Root tag is not <html> in the XHTML namespace') raise ValueError('Root tag is not <html> in the XHTML namespace')
if linenumber_attribute: if linenumber_attribute:
for elem in ans.iter(LxmlElement): for elem in ans.iter(etree.Element):
if elem.sourceline is not None: if elem.sourceline is not None:
elem.set(linenumber_attribute, str(elem.sourceline)) elem.set(linenumber_attribute, str(elem.sourceline))
return ans return ans
except Exception: except Exception:
if log is not None: if log is not None:
log.exception('Failed to parse as XML, parsing as tag soup') log.exception('Failed to parse as XML, parsing as tag soup')
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False) return parse_html5(raw, log=log, line_numbers=line_numbers,
linenumber_attribute=linenumber_attribute,
replace_entities=False, fix_newlines=False)
if __name__ == '__main__': if __name__ == '__main__':
from lxml import etree root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0>&nbsp;'
root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0>&nbsp;\n<b>b<svg ass="wipe" viewbox="0">', discard_namespaces=False) '\n<b>b<svg ass="wipe" viewbox="0">',
discard_namespaces=False)
print(etree.tostring(root, encoding='utf-8')) print(etree.tostring(root, encoding='utf-8'))
print() print()
+13 -21
View File
@@ -2,10 +2,10 @@ import textwrap
# from lxml.etree import Element # from lxml.etree import Element
from ebook_converter import constants as const
from ebook_converter import force_unicode from ebook_converter import force_unicode
from ebook_converter.ebooks.oeb.base import ( from ebook_converter.ebooks.oeb import parse_utils
serialize, OEB_DOCS, barename, OEB_STYLES, XPNSMAP, XHTML, SVG) from ebook_converter.ebooks.oeb.base import serialize, OEB_DOCS, OEB_STYLES
from ebook_converter.ebooks.oeb.polish.container import OPF_NAMESPACES
from ebook_converter.ebooks.oeb.polish.utils import guess_type from ebook_converter.ebooks.oeb.polish.utils import guess_type
from ebook_converter.utils.icu import sort_key from ebook_converter.utils.icu import sort_key
@@ -38,15 +38,15 @@ def pretty_opf(root):
# Put all dc: tags first starting with title and author. Preserve order for # Put all dc: tags first starting with title and author. Preserve order for
# the rest. # the rest.
def dckey(x): def dckey(x):
return {'title':0, 'creator':1}.get(barename(x.tag), 2) return {'title':0, 'creator':1}.get(parse_utils.barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata', namespaces=OPF_NAMESPACES): for metadata in root.xpath('//opf:metadata', namespaces=const.OPF_NAMESPACES):
dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc']) dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS)
dc_tags.sort(key=dckey) dc_tags.sort(key=dckey)
for x in reversed(dc_tags): for x in reversed(dc_tags):
metadata.insert(0, x) metadata.insert(0, x)
# Group items in the manifest # Group items in the manifest
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=OPF_NAMESPACES) spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=const.OPF_NAMESPACES)
spine_ids = {x:i for i, x in enumerate(spine_ids)} spine_ids = {x:i for i, x in enumerate(spine_ids)}
def manifest_key(x): def manifest_key(x):
@@ -75,7 +75,7 @@ def pretty_opf(root):
i = sort_key(href) i = sort_key(href)
return (cat, i) return (cat, i)
for manifest in root.xpath('//opf:manifest', namespaces=OPF_NAMESPACES): for manifest in root.xpath('//opf:manifest', namespaces=const.OPF_NAMESPACES):
try: try:
children = sorted(manifest, key=manifest_key) children = sorted(manifest, key=manifest_key)
except AttributeError: except AttributeError:
@@ -84,19 +84,11 @@ def pretty_opf(root):
manifest.insert(0, x) manifest.insert(0, x)
SVG_TAG = SVG('svg')
BLOCK_TAGS = frozenset(map(XHTML, (
'address', 'article', 'aside', 'audio', 'blockquote', 'body', 'canvas', 'col', 'colgroup', 'dd',
'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li',
'noscript', 'ol', 'output', 'p', 'pre', 'script', 'section', 'style', 'table', 'tbody', 'td',
'tfoot', 'th', 'thead', 'tr', 'ul', 'video', 'img'))) | {SVG_TAG}
def isblock(x): def isblock(x):
if callable(x.tag) or not x.tag: if callable(x.tag) or not x.tag:
return True return True
if x.tag in BLOCK_TAGS: if x.tag in const.XHTML_BLOCK_TAGS | {const.SVG_SVG}:
return True return True
return False return False
@@ -141,12 +133,12 @@ def pretty_block(parent, level=1, indent=' '):
that contain only other block tags ''' that contain only other block tags '''
if not parent.text or isspace(parent.text): if not parent.text or isspace(parent.text):
parent.text = '' parent.text = ''
nn = '\n' if hasattr(parent.tag, 'strip') and barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n' nn = '\n' if hasattr(parent.tag, 'strip') and parse_utils.barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
parent.text = parent.text + nn + (indent * level) parent.text = parent.text + nn + (indent * level)
for i, child in enumerate(parent): for i, child in enumerate(parent):
if isblock(child) and has_only_blocks(child): if isblock(child) and has_only_blocks(child):
pretty_block(child, level=level+1, indent=indent) pretty_block(child, level=level+1, indent=indent)
elif child.tag == SVG_TAG: elif child.tag == const.SVG_SVG:
pretty_xml_tree(child, level=level, indent=indent) pretty_xml_tree(child, level=level, indent=indent)
l = level l = level
if i == len(parent) - 1: if i == len(parent) - 1:
@@ -172,13 +164,13 @@ def pretty_html_tree(container, root):
child.tail = '\n\n' child.tail = '\n\n'
if hasattr(child.tag, 'endswith') and child.tag.endswith('}head'): if hasattr(child.tag, 'endswith') and child.tag.endswith('}head'):
pretty_xml_tree(child) pretty_xml_tree(child)
for body in root.findall('h:body', namespaces=XPNSMAP): for body in root.findall('h:body', namespaces=const.XPNSMAP):
pretty_block(body) pretty_block(body)
# Special case the handling of a body that contains a single block tag # Special case the handling of a body that contains a single block tag
# with all content. In this case we prettify the containing block tag # with all content. In this case we prettify the containing block tag
# even if it has non block children. # even if it has non block children.
if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks( if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks(
body[0]) and barename(body[0].tag) not in ( body[0]) and parse_utils.barename(body[0].tag) not in (
'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0): 'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0):
pretty_block(body[0], level=2) pretty_block(body[0], level=2)
+23 -19
View File
@@ -1,7 +1,11 @@
import copy, os, re import copy
import os
import re
import urllib.parse import urllib.parse
from ebook_converter.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.polish.errors import MalformedMarkup from ebook_converter.ebooks.oeb.polish.errors import MalformedMarkup
from ebook_converter.ebooks.oeb.polish.toc import node_from_loc from ebook_converter.ebooks.oeb.polish.toc import node_from_loc
from ebook_converter.ebooks.oeb.polish.replace import LinkRebaser from ebook_converter.ebooks.oeb.polish.replace import LinkRebaser
@@ -35,7 +39,7 @@ def adjust_split_point(split_point, log):
parent = sp.getparent() parent = sp.getparent()
if ( if (
parent is None or parent is None or
barename(parent.tag) in {'body', 'html'} or parse_utils.barename(parent.tag) in {'body', 'html'} or
(parent.text and parent.text.strip()) or (parent.text and parent.text.strip()) or
parent.index(sp) > 0 parent.index(sp) > 0
): ):
@@ -49,7 +53,7 @@ def adjust_split_point(split_point, log):
def get_body(root): def get_body(root):
return root.find('h:body', namespaces=XPNSMAP) return root.find('h:body', namespaces=const.XPNSMAP)
def do_split(split_point, log, before=True): def do_split(split_point, log, before=True):
@@ -113,7 +117,7 @@ def do_split(split_point, log, before=True):
nix_element(elem) nix_element(elem)
# Tree 2 # Tree 2
ancestors = frozenset(XPath('ancestor::*')(split_point2)) ancestors = frozenset(base.XPath('ancestor::*')(split_point2))
for elem in tuple(body2.iterdescendants()): for elem in tuple(body2.iterdescendants()):
if elem is split_point2: if elem is split_point2:
if not before: if not before:
@@ -251,7 +255,7 @@ def split(container, name, loc_or_xpath, before=True, totals=None):
break break
index = spine.index(spine_item) + 1 index = spine.index(spine_item) + 1
si = spine.makeelement(OPF('itemref'), idref=manifest_item.get('id')) si = spine.makeelement(base.tag('opf', 'itemref'), idref=manifest_item.get('id'))
if not linear: if not linear:
si.set('linear', 'no') si.set('linear', 'no')
container.insert_into_xml(spine, si, index=index) container.insert_into_xml(spine, si, index=index)
@@ -268,7 +272,7 @@ def multisplit(container, name, xpath, before=True):
:param before: If True the splits occur before the identified element otherwise after it. :param before: If True the splits occur before the identified element otherwise after it.
''' '''
root = container.parsed(name) root = container.parsed(name)
nodes = root.xpath(xpath, namespaces=XPNSMAP) nodes = root.xpath(xpath, namespaces=const.XPNSMAP)
if not nodes: if not nodes:
raise AbortError('The expression %s did not match any nodes' % xpath) raise AbortError('The expression %s did not match any nodes' % xpath)
for split_point in nodes: for split_point in nodes:
@@ -329,7 +333,7 @@ def all_anchors(root):
def all_stylesheets(container, name): def all_stylesheets(container, name):
for link in XPath('//h:head/h:link[@href]')(container.parsed(name)): for link in base.XPath('//h:head/h:link[@href]')(container.parsed(name)):
name = container.href_to_name(link.get('href'), name) name = container.href_to_name(link.get('href'), name)
typ = link.get('type', 'text/css') typ = link.get('type', 'text/css')
if typ == 'text/css': if typ == 'text/css':
@@ -358,14 +362,14 @@ def merge_html(container, names, master, insert_page_breaks=False):
root = p(master) root = p(master)
# Ensure master has a <head> # Ensure master has a <head>
head = root.find('h:head', namespaces=XPNSMAP) head = root.find('h:head', namespaces=const.XPNSMAP)
if head is None: if head is None:
head = root.makeelement(XHTML('head')) head = root.makeelement(base.tag('xhtml', 'head'))
container.insert_into_xml(root, head, 0) container.insert_into_xml(root, head, 0)
seen_anchors = all_anchors(root) seen_anchors = all_anchors(root)
seen_stylesheets = set(all_stylesheets(container, master)) seen_stylesheets = set(all_stylesheets(container, master))
master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1] master_body = p(master).findall('h:body', namespaces=const.XPNSMAP)[-1]
master_base = os.path.dirname(master) master_base = os.path.dirname(master)
anchor_map = {n:{} for n in names if n != master} anchor_map = {n:{} for n in names if n != master}
first_anchor_map = {} first_anchor_map = {}
@@ -377,7 +381,7 @@ def merge_html(container, names, master, insert_page_breaks=False):
for sheet in all_stylesheets(container, name): for sheet in all_stylesheets(container, name):
if sheet not in seen_stylesheets: if sheet not in seen_stylesheets:
seen_stylesheets.add(sheet) seen_stylesheets.add(sheet)
link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master)) link = head.makeelement(base.tag('xhtml', 'link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
container.insert_into_xml(head, link) container.insert_into_xml(head, link)
# Rebase links if master is in a different directory # Rebase links if master is in a different directory
@@ -386,7 +390,7 @@ def merge_html(container, names, master, insert_page_breaks=False):
root = p(name) root = p(name)
children = [] children = []
for body in p(name).findall('h:body', namespaces=XPNSMAP): for body in p(name).findall('h:body', namespaces=const.XPNSMAP):
children.append(body.text if body.text and body.text.strip() else '\n\n') children.append(body.text if body.text and body.text.strip() else '\n\n')
children.extend(body) children.extend(body)
@@ -396,7 +400,7 @@ def merge_html(container, names, master, insert_page_breaks=False):
break break
if isinstance(first_child, (str, bytes)): if isinstance(first_child, (str, bytes)):
# body contained only text, no tags # body contained only text, no tags
first_child = body.makeelement(XHTML('p')) first_child = body.makeelement(base.tag('xhtml', 'p'))
first_child.text, children[0] = children[0], first_child first_child.text, children[0] = children[0], first_child
amap = anchor_map[name] amap = anchor_map[name]
@@ -424,7 +428,7 @@ def merge_html(container, names, master, insert_page_breaks=False):
amap[''] = first_child.get('id') amap[''] = first_child.get('id')
# Fix links that point to local changed anchors # Fix links that point to local changed anchors
for a in XPath('//h:a[starts-with(@href, "#")]')(root): for a in base.XPath('//h:a[starts-with(@href, "#")]')(root):
q = a.get('href')[1:] q = a.get('href')[1:]
if q in amap: if q in amap:
a.set('href', '#' + amap[q]) a.set('href', '#' + amap[q])
@@ -472,10 +476,10 @@ def merge_css(container, names, master):
# Remove links to merged stylesheets in the html files, replacing with a # Remove links to merged stylesheets in the html files, replacing with a
# link to the master sheet # link to the master sheet
for name, mt in container.mime_map.items(): for name, mt in container.mime_map.items():
if mt in OEB_DOCS: if mt in base.OEB_DOCS:
removed = False removed = False
root = p(name) root = p(name)
for link in XPath('//h:link[@href]')(root): for link in base.XPath('//h:link[@href]')(root):
q = container.href_to_name(link.get('href'), name) q = container.href_to_name(link.get('href'), name)
if q in merged: if q in merged:
container.remove_from_xml(link) container.remove_from_xml(link)
@@ -483,9 +487,9 @@ def merge_css(container, names, master):
if removed: if removed:
container.dirty(name) container.dirty(name)
if removed and master not in set(all_stylesheets(container, name)): if removed and master not in set(all_stylesheets(container, name)):
head = root.find('h:head', namespaces=XPNSMAP) head = root.find('h:head', namespaces=const.XPNSMAP)
if head is not None: if head is not None:
link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name)) link = head.makeelement(base.tag('xhtml', 'link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name))
container.insert_into_xml(head, link) container.insert_into_xml(head, link)
+178 -143
View File
@@ -1,16 +1,16 @@
import re import collections
from collections import Counter, OrderedDict import functools
from functools import partial import operator
from operator import itemgetter
import pkg_resources import pkg_resources
import re
import urllib.parse import urllib.parse
from lxml import etree from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
from ebook_converter import __version__ from ebook_converter import __version__
from ebook_converter.ebooks.oeb.base import ( from ebook_converter import constants as const
XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize, EPUB_NS, XML_NS, OEB_DOCS) from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.errors import MalformedMarkup from ebook_converter.ebooks.oeb.polish.errors import MalformedMarkup
from ebook_converter.ebooks.oeb.polish.utils import guess_type, extract from ebook_converter.ebooks.oeb.polish.utils import guess_type, extract
from ebook_converter.ebooks.oeb.polish.opf import set_guide_item, get_book_language from ebook_converter.ebooks.oeb.polish.opf import set_guide_item, get_book_language
@@ -18,10 +18,6 @@ from ebook_converter.ebooks.oeb.polish.pretty import pretty_html_tree
from ebook_converter.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1 from ebook_converter.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
ns = etree.FunctionNamespace('calibre_xpath_extensions') ns = etree.FunctionNamespace('calibre_xpath_extensions')
ns.prefix = 'calibre' ns.prefix = 'calibre'
ns['lower-case'] = lambda c, x: x.lower() if hasattr(x, 'lower') else x ns['lower-case'] = lambda c, x: x.lower() if hasattr(x, 'lower') else x
@@ -81,7 +77,8 @@ class TOC(object):
seen = set() seen = set()
remove = [] remove = []
for child in self: for child in self:
key = child.title if only_text else (child.title, child.dest, (child.frag or None)) key = child.title if only_text else (child.title, child.dest,
(child.frag or None))
if key in seen: if key in seen:
remove.append(child) remove.append(child)
else: else:
@@ -104,7 +101,7 @@ class TOC(object):
def get_lines(self, lvl=0): def get_lines(self, lvl=0):
frag = ('#'+self.frag) if self.frag else '' frag = ('#'+self.frag) if self.frag else ''
ans = [('\t'*lvl) + 'TOC: %s --> %s%s'%(self.title, self.dest, frag)] ans = [('\t'*lvl) + 'TOC: %s --> %s%s' % (self.title, self.dest, frag)]
for child in self: for child in self:
ans.extend(child.get_lines(lvl+1)) ans.extend(child.get_lines(lvl+1))
return ans return ans
@@ -113,10 +110,8 @@ class TOC(object):
return '\n'.join(self.get_lines()) return '\n'.join(self.get_lines())
def to_dict(self, node_counter=None): def to_dict(self, node_counter=None):
ans = { ans = {'title': self.title, 'dest': self.dest, 'frag': self.frag,
'title':self.title, 'dest':self.dest, 'frag':self.frag, 'children': [c.to_dict(node_counter) for c in self.children]}
'children':[c.to_dict(node_counter) for c in self.children]
}
if self.dest_exists is not None: if self.dest_exists is not None:
ans['dest_exists'] = self.dest_exists ans['dest_exists'] = self.dest_exists
if self.dest_error is not None: if self.dest_error is not None:
@@ -131,7 +126,7 @@ class TOC(object):
def child_xpath(tag, name): def child_xpath(tag, name):
return tag.xpath('./*[calibre:lower-case(local-name()) = "%s"]'%name) return tag.xpath('./*[calibre:lower-case(local-name()) = "%s"]' % name)
def add_from_navpoint(container, navpoint, parent, ncx_name): def add_from_navpoint(container, navpoint, parent, ncx_name):
@@ -142,7 +137,7 @@ def add_from_navpoint(container, navpoint, parent, ncx_name):
text = '' text = ''
for txt in child_xpath(nl, 'text'): for txt in child_xpath(nl, 'text'):
text += etree.tostring(txt, method='text', text += etree.tostring(txt, method='text',
encoding='unicode', with_tail=False) encoding='unicode', with_tail=False)
content = child_xpath(navpoint, 'content') content = child_xpath(navpoint, 'content')
if content: if content:
content = content[0] content = content[0]
@@ -154,7 +149,8 @@ def add_from_navpoint(container, navpoint, parent, ncx_name):
def process_ncx_node(container, node, toc_parent, ncx_name): def process_ncx_node(container, node, toc_parent, ncx_name):
for navpoint in node.xpath('./*[calibre:lower-case(local-name()) = "navpoint"]'): for navpoint in node.xpath('./*[calibre:lower-case(local-name()) '
'= "navpoint"]'):
child = add_from_navpoint(container, navpoint, toc_parent, ncx_name) child = add_from_navpoint(container, navpoint, toc_parent, ncx_name)
if child is not None: if child is not None:
process_ncx_node(container, navpoint, child, ncx_name) process_ncx_node(container, navpoint, child, ncx_name)
@@ -171,29 +167,38 @@ def parse_ncx(container, ncx_name):
if attr.endswith('lang'): if attr.endswith('lang'):
toc_root.lang = str(val) toc_root.lang = str(val)
break break
for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and @name="dtb:uid"]/@content'): for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and '
'@name="dtb:uid"]/@content'):
if uid: if uid:
toc_root.uid = str(uid) toc_root.uid = str(uid)
break break
for pl in root.xpath('//*[calibre:lower-case(local-name()) = "pagelist"]'): for pl in root.xpath('//*[calibre:lower-case(local-name()) = "pagelist"]'):
for pt in pl.xpath('descendant::*[calibre:lower-case(local-name()) = "pagetarget"]'): for pt in pl.xpath('descendant::*[calibre:lower-case(local-name()) = '
'"pagetarget"]'):
pagenum = pt.get('value') pagenum = pt.get('value')
if pagenum: if pagenum:
href = pt.xpath('descendant::*[calibre:lower-case(local-name()) = "content"]/@src') href = pt.xpath('descendant::*[calibre:lower-case(local-name()'
') = "content"]/@src')
if href: if href:
dest = container.href_to_name(href[0], base=ncx_name) dest = container.href_to_name(href[0], base=ncx_name)
frag = urllib.parse.urlparse(href[0]).fragment or None frag = urllib.parse.urlparse(href[0]).fragment or None
toc_root.page_list.append({'dest': dest, 'pagenum': pagenum, 'frag': frag}) toc_root.page_list.append({'dest': dest,
'pagenum': pagenum,
'frag': frag})
return toc_root return toc_root
def add_from_li(container, li, parent, nav_name): def add_from_li(container, li, parent, nav_name):
dest = frag = text = None dest = frag = text = None
for x in li.iterchildren(XHTML('a'), XHTML('span')): for x in li.iterchildren(base.tag('xhtml', 'a'),
text = etree.tostring(x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip() base.tag('xhtml', 'span')):
text = (etree.tostring(x, method='text', encoding='unicode',
with_tail=False).strip() or
' '.join(x.xpath('descendant-or-self::*/@title')).strip())
href = x.get('href') href = x.get('href')
if href: if href:
dest = nav_name if href.startswith('#') else container.href_to_name(href, base=nav_name) dest = (nav_name if href.startswith('#') else
container.href_to_name(href, base=nav_name))
frag = urllib.parse.urlparse(href).fragment or None frag = urllib.parse.urlparse(href).fragment or None
break break
return parent.add(text or None, dest or None, frag or None) return parent.add(text or None, dest or None, frag or None)
@@ -207,9 +212,9 @@ def first_child(parent, tagname):
def process_nav_node(container, node, toc_parent, nav_name): def process_nav_node(container, node, toc_parent, nav_name):
for li in node.iterchildren(XHTML('li')): for li in node.iterchildren(base.tag('xhtml', 'li')):
child = add_from_li(container, li, toc_parent, nav_name) child = add_from_li(container, li, toc_parent, nav_name)
ol = first_child(li, XHTML('ol')) ol = first_child(li, base.tag('xhtml', 'ol'))
if child is not None and ol is not None: if child is not None and ol is not None:
process_nav_node(container, ol, child, nav_name) process_nav_node(container, ol, child, nav_name)
@@ -218,14 +223,16 @@ def parse_nav(container, nav_name):
root = container.parsed(nav_name) root = container.parsed(nav_name)
toc_root = TOC() toc_root = TOC()
toc_root.lang = toc_root.uid = None toc_root.lang = toc_root.uid = None
et = '{%s}type' % EPUB_NS xhtml = functools.partial(base.tag, 'xhtml')
for nav in root.iterdescendants(XHTML('nav')): for nav in root.iterdescendants(base.tag('xhtml', 'nav')):
if nav.get(et) == 'toc': if nav.get(base.tag('epub', 'type')) == 'toc':
ol = first_child(nav, XHTML('ol')) ol = first_child(nav, base.tag('xhtml', 'ol'))
if ol is not None: if ol is not None:
process_nav_node(container, ol, toc_root, nav_name) process_nav_node(container, ol, toc_root, nav_name)
for h in nav.iterchildren(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())): for h in nav.iterchildren(*map(xhtml,
text = etree.tostring(h, method='text', encoding='unicode', with_tail=False) or h.get('title') 'h1 h2 h3 h4 h5 h6'.split())):
text = etree.tostring(h, method='text', encoding='unicode',
with_tail=False) or h.get('title')
if text: if text:
toc_root.toc_title = text toc_root.toc_title = text
break break
@@ -235,7 +242,7 @@ def parse_nav(container, nav_name):
def verify_toc_destinations(container, toc): def verify_toc_destinations(container, toc):
anchor_map = {} anchor_map = {}
anchor_xpath = XPath('//*/@id|//h:a/@name') anchor_xpath = base.XPath('//*/@id|//h:a/@name')
for item in toc.iterdescendants(): for item in toc.iterdescendants():
name = item.dest name = item.dest
if not name: if not name:
@@ -284,7 +291,8 @@ def get_x_toc(container, find_toc, parse_toc, verify_destinations=True):
ans.lang = ans.uid = None ans.lang = ans.uid = None
return ans return ans
toc = find_toc(container) toc = find_toc(container)
ans = empty_toc() if toc is None or not container.has_name(toc) else parse_toc(container, toc) ans = (empty_toc() if toc is None or not container.has_name(toc) else
parse_toc(container, toc))
ans.toc_file_name = toc if toc and container.has_name(toc) else None ans.toc_file_name = toc if toc and container.has_name(toc) else None
if verify_destinations: if verify_destinations:
verify_toc_destinations(container, ans) verify_toc_destinations(container, ans)
@@ -294,11 +302,14 @@ def get_x_toc(container, find_toc, parse_toc, verify_destinations=True):
def get_toc(container, verify_destinations=True): def get_toc(container, verify_destinations=True):
ver = container.opf_version_parsed ver = container.opf_version_parsed
if ver.major < 3: if ver.major < 3:
return get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations) return get_x_toc(container, find_existing_ncx_toc, parse_ncx,
verify_destinations=verify_destinations)
else: else:
ans = get_x_toc(container, find_existing_nav_toc, parse_nav, verify_destinations=verify_destinations) ans = get_x_toc(container, find_existing_nav_toc, parse_nav,
verify_destinations=verify_destinations)
if len(ans) == 0: if len(ans) == 0:
ans = get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations) ans = get_x_toc(container, find_existing_ncx_toc, parse_ncx,
verify_destinations=verify_destinations)
return ans return ans
@@ -308,25 +319,33 @@ def get_guide_landmarks(container):
href, frag = href.partition('#')[::2] href, frag = href.partition('#')[::2]
name = container.href_to_name(href, container.opf_name) name = container.href_to_name(href, container.opf_name)
if container.has_name(name): if container.has_name(name):
yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''} yield {'dest': name,
'frag': frag,
'title': title or '',
'type': rtype or ''}
def get_nav_landmarks(container): def get_nav_landmarks(container):
nav = find_existing_nav_toc(container) nav = find_existing_nav_toc(container)
if nav and container.has_name(nav): if nav and container.has_name(nav):
root = container.parsed(nav) root = container.parsed(nav)
et = '{%s}type' % EPUB_NS et = base.tag('epub', 'type')
for elem in root.iterdescendants(XHTML('nav')): for elem in root.iterdescendants(base.tag('xhtml', 'nav')):
if elem.get(et) == 'landmarks': if elem.get(et) == 'landmarks':
for li in elem.iterdescendants(XHTML('li')): for li in elem.iterdescendants(base.tag('xhtml', 'li')):
for a in li.iterdescendants(XHTML('a')): for a in li.iterdescendants(base.tag('xhtml', 'a')):
href, rtype = a.get('href'), a.get(et) href, rtype = a.get('href'), a.get(et)
if href: if href:
title = etree.tostring(a, method='text', encoding='unicode', with_tail=False).strip() title = etree.tostring(a, method='text',
encoding='unicode',
with_tail=False).strip()
href, frag = href.partition('#')[::2] href, frag = href.partition('#')[::2]
name = container.href_to_name(href, nav) name = container.href_to_name(href, nav)
if container.has_name(name): if container.has_name(name):
yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''} yield {'dest': name,
'frag': frag,
'title': title or '',
'type': rtype or ''}
break break
@@ -344,7 +363,7 @@ def ensure_id(elem, all_ids):
elem_id = elem.get('id') elem_id = elem.get('id')
if elem_id: if elem_id:
return False, elem_id return False, elem_id
if elem.tag == XHTML('a'): if elem.tag == base.tag('xhtml', 'a'):
anchor = elem.get('name', None) anchor = elem.get('name', None)
if anchor: if anchor:
elem.set('id', anchor) elem.set('id', anchor)
@@ -361,7 +380,7 @@ def ensure_id(elem, all_ids):
def elem_to_toc_text(elem): def elem_to_toc_text(elem):
text = xml2text(elem).strip() text = base.xml2text(elem).strip()
if not text: if not text:
text = elem.get('title', '') text = elem.get('title', '')
if not text: if not text:
@@ -375,7 +394,7 @@ def elem_to_toc_text(elem):
def item_at_top(elem): def item_at_top(elem):
try: try:
body = XPath('//h:body')(elem.getroottree().getroot())[0] body = base.XPath('//h:body')(elem.getroottree().getroot())[0]
except (TypeError, IndexError, KeyError, AttributeError): except (TypeError, IndexError, KeyError, AttributeError):
return False return False
tree = body.getroottree() tree = body.getroottree()
@@ -387,7 +406,7 @@ def item_at_top(elem):
try: try:
if el.tag.endswith('}img') or (el.text and el.text.strip()): if el.tag.endswith('}img') or (el.text and el.text.strip()):
return False return False
except: except Exception:
return False return False
if not path.startswith(epath): if not path.startswith(epath):
# Only check tail of non-parent elements # Only check tail of non-parent elements
@@ -404,24 +423,26 @@ def from_xpaths(container, xpaths):
Table of Contents from the ``<h1>``, ``<h2>`` and ``<h3>`` tags. Table of Contents from the ``<h1>``, ``<h2>`` and ``<h3>`` tags.
''' '''
tocroot = TOC() tocroot = TOC()
xpaths = [XPath(xp) for xp in xpaths] xpaths = [base.XPath(xp) for xp in xpaths]
# Find those levels that have no elements in all spine items # Find those levels that have no elements in all spine items
maps = OrderedDict() maps = collections.OrderedDict()
empty_levels = {i+1 for i, xp in enumerate(xpaths)} empty_levels = {i+1 for i, xp in enumerate(xpaths)}
for spinepath in container.spine_items: for spinepath in container.spine_items:
name = container.abspath_to_name(spinepath) name = container.abspath_to_name(spinepath)
root = container.parsed(name) root = container.parsed(name)
level_item_map = maps[name] = {i+1:frozenset(xp(root)) for i, xp in enumerate(xpaths)} level_item_map = maps[name] = {i + 1: frozenset(xp(root))
for i, xp in enumerate(xpaths)}
for lvl, elems in level_item_map.items(): for lvl, elems in level_item_map.items():
if elems: if elems:
empty_levels.discard(lvl) empty_levels.discard(lvl)
# Remove empty levels from all level_maps # Remove empty levels from all level_maps
if empty_levels: if empty_levels:
for name, lmap in tuple(maps.items()): for name, lmap in tuple(maps.items()):
lmap = {lvl:items for lvl, items in lmap.items() if lvl not in empty_levels} lmap = {lvl: items for lvl, items in lmap.items()
lmap = sorted(lmap.items(), key=itemgetter(0)) if lvl not in empty_levels}
lmap = {i+1:items for i, (l, items) in enumerate(lmap)} lmap = sorted(lmap.items(), key=operator.itemgetter(0))
lmap = {i + 1: items for i, (l, items) in enumerate(lmap)}
maps[name] = lmap maps[name] = lmap
node_level_map = {tocroot: 0} node_level_map = {tocroot: 0}
@@ -434,13 +455,15 @@ def from_xpaths(container, xpaths):
if child is None: if child is None:
return node return node
lvl = node_level_map[child] lvl = node_level_map[child]
return node if lvl > limit else child if lvl == limit else process_node(child) return (node if lvl > limit else
child if lvl == limit else process_node(child))
return process_node(tocroot) return process_node(tocroot)
for name, level_item_map in maps.items(): for name, level_item_map in maps.items():
root = container.parsed(name) root = container.parsed(name)
item_level_map = {e:i for i, elems in level_item_map.items() for e in elems} item_level_map = {e: i for i, elems in level_item_map.items()
for e in elems}
item_dirtied = False item_dirtied = False
all_ids = set(root.xpath('//*/@id')) all_ids = set(root.xpath('//*/@id'))
@@ -470,7 +493,7 @@ def from_links(container):
Generate a Table of Contents from links in the book. Generate a Table of Contents from links in the book.
''' '''
toc = TOC() toc = TOC()
link_path = XPath('//h:a[@href]') link_path = base.XPath('//h:a[@href]')
seen_titles, seen_dests = set(), set() seen_titles, seen_dests = set(), set()
for name, is_linear in container.spine_names: for name, is_linear in container.spine_names:
root = container.parsed(name) root = container.parsed(name)
@@ -506,7 +529,7 @@ def find_text(node):
pat = re.compile(r'\s+') pat = re.compile(r'\s+')
for child in node: for child in node:
if isinstance(child, etree._Element): if isinstance(child, etree._Element):
text = xml2text(child).strip() text = base.xml2text(child).strip()
text = pat.sub(' ', text) text = pat.sub(' ', text)
if len(text) < 1: if len(text) < 1:
continue continue
@@ -526,7 +549,7 @@ def from_files(container):
for i, spinepath in enumerate(container.spine_items): for i, spinepath in enumerate(container.spine_items):
name = container.abspath_to_name(spinepath) name = container.abspath_to_name(spinepath)
root = container.parsed(name) root = container.parsed(name)
body = XPath('//h:body')(root) body = base.XPath('//h:body')(root)
if not body: if not body:
continue continue
text = find_text(body[0]) text = find_text(body[0])
@@ -576,42 +599,46 @@ def add_id(container, name, loc, totals=None):
def create_ncx(toc, to_href, btitle, lang, uid): def create_ncx(toc, to_href, btitle, lang, uid):
lang = lang.replace('_', '-') lang = lang.replace('_', '-')
ncx = etree.Element(NCX('ncx'), ncx = etree.Element(base.tag('ncx', 'ncx'),
attrib={'version': '2005-1', XML('lang'): lang}, attrib={'version': '2005-1',
nsmap={None: NCX_NS}) base.tag('xml', 'lang'): lang},
head = etree.SubElement(ncx, NCX('head')) nsmap={None: const.NCX_NS})
etree.SubElement(head, NCX('meta'), head = etree.SubElement(ncx, base.tag('ncx', 'head'))
name='dtb:uid', content=str(uid)) etree.SubElement(head, base.tag('ncx', 'meta'),
etree.SubElement(head, NCX('meta'), name='dtb:uid', content=str(uid))
name='dtb:depth', content=str(toc.depth)) etree.SubElement(head, base.tag('ncx', 'meta'),
name='dtb:depth', content=str(toc.depth))
generator = ''.join(['calibre (', __version__, ')']) generator = ''.join(['calibre (', __version__, ')'])
etree.SubElement(head, NCX('meta'), etree.SubElement(head, base.tag('ncx', 'meta'),
name='dtb:generator', content=generator) name='dtb:generator', content=generator)
etree.SubElement(head, NCX('meta'), name='dtb:totalPageCount', content='0') etree.SubElement(head, base.tag('ncx', 'meta'), name='dtb:totalPageCount',
etree.SubElement(head, NCX('meta'), name='dtb:maxPageNumber', content='0') content='0')
title = etree.SubElement(ncx, NCX('docTitle')) etree.SubElement(head, base.tag('ncx', 'meta'), name='dtb:maxPageNumber',
text = etree.SubElement(title, NCX('text')) content='0')
title = etree.SubElement(ncx, base.tag('ncx', 'docTitle'))
text = etree.SubElement(title, base.tag('ncx', 'text'))
text.text = btitle text.text = btitle
navmap = etree.SubElement(ncx, NCX('navMap')) navmap = etree.SubElement(ncx, base.tag('ncx', 'navMap'))
spat = re.compile(r'\s+') spat = re.compile(r'\s+')
play_order = Counter() play_order = collections.Counter()
def process_node(xml_parent, toc_parent): def process_node(xml_parent, toc_parent):
for child in toc_parent: for child in toc_parent:
play_order['c'] += 1 play_order['c'] += 1
point = etree.SubElement(xml_parent, NCX('navPoint'), id='num_%d' % play_order['c'], point = etree.SubElement(xml_parent, base.tag('ncx', 'navPoint'),
playOrder=str(play_order['c'])) id='num_%d' % play_order['c'],
label = etree.SubElement(point, NCX('navLabel')) playOrder=str(play_order['c']))
label = etree.SubElement(point, base.tag('ncx', 'navLabel'))
title = child.title title = child.title
if title: if title:
title = spat.sub(' ', title) title = spat.sub(' ', title)
etree.SubElement(label, NCX('text')).text = title etree.SubElement(label, base.tag('ncx', 'text')).text = title
if child.dest: if child.dest:
href = to_href(child.dest) href = to_href(child.dest)
if child.frag: if child.frag:
href += '#'+child.frag href += '#'+child.frag
etree.SubElement(point, NCX('content'), src=href) etree.SubElement(point, base.tag('ncx', 'content'), src=href)
process_node(point, child) process_node(point, child)
process_node(navmap, toc) process_node(navmap, toc)
@@ -622,41 +649,43 @@ def commit_ncx_toc(container, toc, lang=None, uid=None):
tocname = find_existing_ncx_toc(container) tocname = find_existing_ncx_toc(container)
if tocname is None: if tocname is None:
item = container.generate_item('toc.ncx', id_prefix='toc') item = container.generate_item('toc.ncx', id_prefix='toc')
tocname = container.href_to_name(item.get('href'), base=container.opf_name) tocname = container.href_to_name(item.get('href'),
base=container.opf_name)
ncx_id = item.get('id') ncx_id = item.get('id')
[s.set('toc', ncx_id) for s in container.opf_xpath('//opf:spine')] [s.set('toc', ncx_id) for s in container.opf_xpath('//opf:spine')]
if not lang: if not lang:
lang = get_lang() lang = get_lang()
for l in container.opf_xpath('//dc:language'): for _l in container.opf_xpath('//dc:language'):
l = canonicalize_lang(xml2text(l).strip()) _l = canonicalize_lang(base.xml2text(_l).strip())
if l: if _l:
lang = l lang = _l
lang = lang_as_iso639_1(l) or l lang = lang_as_iso639_1(_l) or _l
break break
lang = lang_as_iso639_1(lang) or lang lang = lang_as_iso639_1(lang) or lang
if not uid: if not uid:
uid = uuid_id() uid = base.uuid_id()
eid = container.opf.get('unique-identifier', None) eid = container.opf.get('unique-identifier', None)
if eid: if eid:
m = container.opf_xpath('//*[@id="%s"]'%eid) m = container.opf_xpath('//*[@id="%s"]' % eid)
if m: if m:
uid = xml2text(m[0]) uid = base.xml2text(m[0])
title = 'Table of Contents' title = 'Table of Contents'
m = container.opf_xpath('//dc:title') m = container.opf_xpath('//dc:title')
if m: if m:
x = xml2text(m[0]).strip() x = base.xml2text(m[0]).strip()
title = x or title title = x or title
to_href = partial(container.name_to_href, base=tocname) to_href = functools.partial(container.name_to_href, base=tocname)
root = create_ncx(toc, to_href, title, lang, uid) root = create_ncx(toc, to_href, title, lang, uid)
container.replace(tocname, root) container.replace(tocname, root)
container.pretty_print.add(tocname) container.pretty_print.add(tocname)
def ensure_single_nav_of_type(root, ntype='toc'): def ensure_single_nav_of_type(root, ntype='toc'):
et = '{%s}type' % EPUB_NS et = base.tag('epub', 'type')
navs = [n for n in root.iterdescendants(XHTML('nav')) if n.get(et) == ntype] navs = [n for n in root.iterdescendants(base.tag('xhtml', 'nav'))
if n.get(et) == ntype]
for x in navs[1:]: for x in navs[1:]:
extract(x) extract(x)
if navs: if navs:
@@ -667,13 +696,14 @@ def ensure_single_nav_of_type(root, ntype='toc'):
nav.attrib.update(attrib) nav.attrib.update(attrib)
nav.tail = tail nav.tail = tail
else: else:
nav = root.makeelement(XHTML('nav')) nav = root.makeelement(base.tag('xhtml', 'nav'))
first_child(root, XHTML('body')).append(nav) first_child(root, base.tag('xhtml', 'body')).append(nav)
nav.set('{%s}type' % EPUB_NS, ntype) nav.set(et, ntype)
return nav return nav
def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None): def commit_nav_toc(container, toc, lang=None, landmarks=None,
previous_nav=None):
from ebook_converter.ebooks.oeb.polish.pretty import pretty_xml_tree from ebook_converter.ebooks.oeb.polish.pretty import pretty_xml_tree
tocname = find_existing_nav_toc(container) tocname = find_existing_nav_toc(container)
if previous_nav is not None: if previous_nav is not None:
@@ -684,7 +714,8 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
if tocname is None: if tocname is None:
item = container.generate_item('nav.xhtml', id_prefix='nav') item = container.generate_item('nav.xhtml', id_prefix='nav')
item.set('properties', 'nav') item.set('properties', 'nav')
tocname = container.href_to_name(item.get('href'), base=container.opf_name) tocname = container.href_to_name(item.get('href'),
base=container.opf_name)
if previous_nav is not None: if previous_nav is not None:
root = previous_nav[1] root = previous_nav[1]
else: else:
@@ -698,24 +729,25 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
if lang: if lang:
lang = lang_as_iso639_1(lang) or lang lang = lang_as_iso639_1(lang) or lang
root.set('lang', lang) root.set('lang', lang)
root.set('{%s}lang' % XML_NS, lang) root.set(base.tag('xml', 'lang'), lang)
nav = ensure_single_nav_of_type(root, 'toc') nav = ensure_single_nav_of_type(root, 'toc')
if toc.toc_title: if toc.toc_title:
nav.append(nav.makeelement(XHTML('h1'))) nav.append(nav.makeelement(base.tag('xhtml', 'h1')))
nav[-1].text = toc.toc_title nav[-1].text = toc.toc_title
rnode = nav.makeelement(XHTML('ol')) rnode = nav.makeelement(base.tag('xhtml', 'ol'))
nav.append(rnode) nav.append(rnode)
to_href = partial(container.name_to_href, base=tocname) to_href = functools.partial(container.name_to_href, base=tocname)
spat = re.compile(r'\s+') spat = re.compile(r'\s+')
def process_node(xml_parent, toc_parent): def process_node(xml_parent, toc_parent):
for child in toc_parent: for child in toc_parent:
li = xml_parent.makeelement(XHTML('li')) li = xml_parent.makeelement(base.tag('xhtml', 'li'))
xml_parent.append(li) xml_parent.append(li)
title = child.title or '' title = child.title or ''
title = spat.sub(' ', title).strip() title = spat.sub(' ', title).strip()
a = li.makeelement(XHTML('a' if child.dest else 'span')) a = li.makeelement(base.tag('xhtml', 'a'
if child.dest else 'span'))
a.text = title a.text = title
li.append(a) li.append(a)
if child.dest: if child.dest:
@@ -724,14 +756,14 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
href += '#'+child.frag href += '#'+child.frag
a.set('href', href) a.set('href', href)
if len(child): if len(child):
ol = li.makeelement(XHTML('ol')) ol = li.makeelement(base.tag('xhtml', 'ol'))
li.append(ol) li.append(ol)
process_node(ol, child) process_node(ol, child)
process_node(rnode, toc) process_node(rnode, toc)
pretty_xml_tree(nav) pretty_xml_tree(nav)
def collapse_li(parent): def collapse_li(parent):
for li in parent.iterdescendants(XHTML('li')): for li in parent.iterdescendants(base.tag('xhtml', 'li')):
if len(li) == 1: if len(li) == 1:
li.text = None li.text = None
li[0].tail = None li[0].tail = None
@@ -739,9 +771,9 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
nav.tail = '\n' nav.tail = '\n'
def create_li(ol, entry): def create_li(ol, entry):
li = ol.makeelement(XHTML('li')) li = ol.makeelement(base.tag('xhtml', 'li'))
ol.append(li) ol.append(li)
a = li.makeelement(XHTML('a')) a = li.makeelement(base.tag('xhtml', 'a'))
li.append(a) li.append(a)
href = container.name_to_href(entry['dest'], tocname) href = container.name_to_href(entry['dest'], tocname)
if entry['frag']: if entry['frag']:
@@ -752,12 +784,13 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
if landmarks is not None: if landmarks is not None:
nav = ensure_single_nav_of_type(root, 'landmarks') nav = ensure_single_nav_of_type(root, 'landmarks')
nav.set('hidden', '') nav.set('hidden', '')
ol = nav.makeelement(XHTML('ol')) ol = nav.makeelement(base.tag('xhtml', 'ol'))
nav.append(ol) nav.append(ol)
for entry in landmarks: for entry in landmarks:
if entry['type'] and container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS: if (entry['type'] and container.has_name(entry['dest']) and
container.mime_map[entry['dest']] in base.OEB_DOCS):
a = create_li(ol, entry) a = create_li(ol, entry)
a.set('{%s}type' % EPUB_NS, entry['type']) a.set(base.tag('epub', 'type'), entry['type'])
a.text = entry['title'] or None a.text = entry['title'] or None
pretty_xml_tree(nav) pretty_xml_tree(nav)
collapse_li(nav) collapse_li(nav)
@@ -765,10 +798,11 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
if toc.page_list: if toc.page_list:
nav = ensure_single_nav_of_type(root, 'page-list') nav = ensure_single_nav_of_type(root, 'page-list')
nav.set('hidden', '') nav.set('hidden', '')
ol = nav.makeelement(XHTML('ol')) ol = nav.makeelement(base.tag('xhtml', 'ol'))
nav.append(ol) nav.append(ol)
for entry in toc.page_list: for entry in toc.page_list:
if container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS: if (container.has_name(entry['dest']) and
container.mime_map[entry['dest']] in base.OEB_DOCS):
a = create_li(ol, entry) a = create_li(ol, entry)
a.text = str(entry['pagenum']) a.text = str(entry['pagenum'])
pretty_xml_tree(nav) pretty_xml_tree(nav)
@@ -785,11 +819,12 @@ def commit_toc(container, toc, lang=None, uid=None):
def remove_names_from_toc(container, names): def remove_names_from_toc(container, names):
changed = [] changed = []
names = frozenset(names) names = frozenset(names)
for find_toc, parse_toc, commit_toc in ( for find_toc, parse_toc, commit_toc in ((find_existing_ncx_toc,
(find_existing_ncx_toc, parse_ncx, commit_ncx_toc), parse_ncx, commit_ncx_toc),
(find_existing_nav_toc, parse_nav, commit_nav_toc), (find_existing_nav_toc,
): parse_nav, commit_nav_toc)):
toc = get_x_toc(container, find_toc, parse_toc, verify_destinations=False) toc = get_x_toc(container, find_toc, parse_toc,
verify_destinations=False)
if len(toc) > 0: if len(toc) > 0:
remove = [] remove = []
for node in toc.iterdescendants(): for node in toc.iterdescendants():
@@ -805,15 +840,16 @@ def remove_names_from_toc(container, names):
def find_inline_toc(container): def find_inline_toc(container):
for name, linear in container.spine_names: for name, linear in container.spine_names:
if container.parsed(name).xpath('//*[local-name()="body" and @id="calibre_generated_inline_toc"]'): if container.parsed(name).xpath('//*[local-name()="body" and @id='
'"calibre_generated_inline_toc"]'):
return name return name
def toc_to_html(toc, container, toc_name, title, lang=None): def toc_to_html(toc, container, toc_name, title, lang=None):
def process_node(html_parent, toc, level=1, indent=' ', style_level=2): def process_node(html_parent, toc, level=1, indent=' ', style_level=2):
li = html_parent.makeelement(XHTML('li')) li = html_parent.makeelement(base.tag('xhtml', 'li'))
li.tail = '\n'+ (indent*level) li.tail = '\n' + (indent * level)
html_parent.append(li) html_parent.append(li)
name, frag = toc.dest, toc.frag name, frag = toc.dest, toc.frag
href = '#' href = '#'
@@ -821,32 +857,29 @@ def toc_to_html(toc, container, toc_name, title, lang=None):
href = container.name_to_href(name, toc_name) href = container.name_to_href(name, toc_name)
if frag: if frag:
href += '#' + frag href += '#' + frag
a = li.makeelement(XHTML('a'), href=href) a = li.makeelement(base.tag('xhtml', 'a'), href=href)
a.text = toc.title a.text = toc.title
li.append(a) li.append(a)
if len(toc) > 0: if len(toc) > 0:
parent = li.makeelement(XHTML('ul')) parent = li.makeelement(base.tag('xhtml', 'ul'))
parent.set('class', 'level%d' % (style_level)) parent.set('class', 'level%d' % (style_level))
li.append(parent) li.append(parent)
a.tail = '\n\n' + (indent*(level+2)) a.tail = '\n\n' + (indent*(level+2))
parent.text = '\n'+(indent*(level+3)) parent.text = '\n'+(indent*(level+3))
parent.tail = '\n\n' + (indent*(level+1)) parent.tail = '\n\n' + (indent*(level+1))
for child in toc: for child in toc:
process_node(parent, child, level+3, style_level=style_level + 1) process_node(parent, child, level+3,
style_level=style_level + 1)
parent[-1].tail = '\n' + (indent*(level+2)) parent[-1].tail = '\n' + (indent*(level+2))
E = ElementMaker(namespace=XHTML_NS, nsmap={None:XHTML_NS}) E = ElementMaker(namespace=const.XHTML_NS, nsmap={None: const.XHTML_NS})
html = E.html( # TODO(gryf): revisit lack of css.
E.head( css_f = pkg_resources.resource_filename('ebook_converter',
E.title(title), 'data/inline_toc_styles.css')
E.style(P('templates/inline_toc_styles.css', data=True), type='text/css'), html = E.html(E.head(E.title(title),
), E.style(css_f, type='text/css')),
E.body( E.body(E.h2(title), E.ul(),
E.h2(title), id="calibre_generated_inline_toc"))
E.ul(),
id="calibre_generated_inline_toc",
)
)
ul = html[1][1] ul = html[1][1]
ul.set('class', 'level1') ul.set('class', 'level1')
@@ -859,11 +892,12 @@ def toc_to_html(toc, container, toc_name, title, lang=None):
def create_inline_toc(container, title=None): def create_inline_toc(container, title=None):
''' """
Create an inline (HTML) Table of Contents from an existing NCX Table of Contents. Create an inline (HTML) Table of Contents from an existing NCX Table of
Contents.
:param title: The title for this table of contents. :param title: The title for this table of contents.
''' """
lang = get_book_language(container) lang = get_book_language(container)
default_title = 'Table of Contents' default_title = 'Table of Contents'
title = title or default_title title = title or default_title
@@ -874,7 +908,7 @@ def create_inline_toc(container, title=None):
name = toc_name name = toc_name
html = toc_to_html(toc, container, name, title, lang) html = toc_to_html(toc, container, name, title, lang)
raw = serialize(html, 'text/html') raw = base.serialize(html, 'text/html')
if name is None: if name is None:
name, c = 'toc.xhtml', 0 name, c = 'toc.xhtml', 0
while container.has_name(name): while container.has_name(name):
@@ -884,5 +918,6 @@ def create_inline_toc(container, title=None):
else: else:
with container.open(name, 'wb') as f: with container.open(name, 'wb') as f:
f.write(raw) f.write(raw)
set_guide_item(container, 'toc', title, name, frag='calibre_generated_inline_toc') set_guide_item(container, 'toc', title, name,
frag='calibre_generated_inline_toc')
return name return name
+160 -142
View File
@@ -1,21 +1,21 @@
""" """
Container-/OPF-based input OEBBook reader. Container-/OPF-based input OEBBook reader.
""" """
import sys, os, uuid, copy, re, io import collections
from collections import defaultdict import copy
import io
import os
import re
import sys
import urllib.parse import urllib.parse
import uuid
from lxml import etree from lxml import etree
from ebook_converter.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \ from ebook_converter import constants as const
DC_NSES, OPF, xml2text, XHTML_MIME from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \ from ebook_converter.ebooks.oeb import parse_utils
PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME from ebook_converter.ebooks.metadata import opf2 as opf_meta
from ebook_converter.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
MS_COVER_TYPE, iterlinks
from ebook_converter.ebooks.oeb.base import namespace, barename, XPath, xpath, \
urlnormalize, BINARY_MIME, \
OEBError, OEBBook, DirContainer
from ebook_converter.ebooks.oeb.writer import OEBWriter from ebook_converter.ebooks.oeb.writer import OEBWriter
from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.utils.cleantext import clean_xml_chars
@@ -26,18 +26,13 @@ from ebook_converter import guess_type, xml_replace_entities
from ebook_converter.polyglot.urllib import unquote from ebook_converter.polyglot.urllib import unquote
__all__ = ['OEBReader']
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
class OEBReader(object): class OEBReader(object):
"""Read an OEBPS 1.x or OPF/OPS 2.0 file collection.""" """Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') COVER_SVG_XP = base.XPath('h:body//svg:svg[position() = 1]')
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') COVER_OBJECT_XP = base.XPath('h:body//h:object[@data][position() = 1]')
Container = DirContainer Container = base.DirContainer
"""Container type used to access book files. Override in sub-classes.""" """Container type used to access book files. Override in sub-classes."""
DEFAULT_PROFILE = 'PRS505' DEFAULT_PROFILE = 'PRS505'
@@ -75,61 +70,67 @@ class OEBReader(object):
for elem in opf.iter(tag=etree.Element): for elem in opf.iter(tag=etree.Element):
nsmap.update(elem.nsmap) nsmap.update(elem.nsmap)
for elem in opf.iter(tag=etree.Element): for elem in opf.iter(tag=etree.Element):
if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag): if (parse_utils.namespace(elem.tag) in ('', const.OPF1_NS) and
elem.tag = OPF(barename(elem.tag)) ':' not in parse_utils.barename(elem.tag)):
nsmap.update(OPF2_NSMAP) elem.tag = base.tag('opf', parse_utils.barename(elem.tag))
nsmap.update(const.OPF2_NSMAP)
attrib = dict(opf.attrib) attrib = dict(opf.attrib)
nroot = etree.Element(OPF('package'), nroot = etree.Element(base.tag('opf', 'package'),
nsmap={None: OPF2_NS}, attrib=attrib) nsmap={None: const.OPF2_NS}, attrib=attrib)
metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) metadata = etree.SubElement(nroot, base.tag('opf', 'metadata'),
ignored = (OPF('dc-metadata'), OPF('x-metadata')) nsmap=nsmap)
for elem in xpath(opf, 'o2:metadata//*'): ignored = (base.tag('opf', 'dc-metadata'), base.tag('opf', 'x-metadata'))
for elem in base.xpath(opf, 'o2:metadata//*'):
if elem.tag in ignored: if elem.tag in ignored:
continue continue
if namespace(elem.tag) in DC_NSES: if parse_utils.namespace(elem.tag) in const.DC_NSES:
tag = barename(elem.tag).lower() tag = parse_utils.barename(elem.tag).lower()
elem.tag = '{%s}%s' % (DC11_NS, tag) elem.tag = '{%s}%s' % (const.DC11_NS, tag)
if elem.tag.startswith('dc:'): if elem.tag.startswith('dc:'):
tag = elem.tag.partition(':')[-1].lower() tag = elem.tag.partition(':')[-1].lower()
elem.tag = '{%s}%s' % (DC11_NS, tag) elem.tag = '{%s}%s' % (const.DC11_NS, tag)
metadata.append(elem) metadata.append(elem)
for element in xpath(opf, 'o2:metadata//o2:meta'): for element in base.xpath(opf, 'o2:metadata//o2:meta'):
metadata.append(element) metadata.append(element)
for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
for element in xpath(opf, tag): for element in base.xpath(opf, tag):
nroot.append(element) nroot.append(element)
return nroot return nroot
def _read_opf(self): def _read_opf(self):
data = self.oeb.container.read(None) data = self.oeb.container.read(None)
data = self.oeb.decode(data) data = self.oeb.decode(data)
data = XMLDECL_RE.sub('', data) data = base.XMLDECL_RE.sub('', data)
data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)', data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
OPF1_NS, data) const.OPF1_NS, data)
try: try:
opf = safe_xml_fromstring(data) opf = etree.fromstring(data)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
data = xml_replace_entities(clean_xml_chars(data), encoding=None) data = xml_replace_entities(clean_xml_chars(data), encoding=None)
try: try:
opf = safe_xml_fromstring(data) opf = etree.fromstring(data)
self.logger.warn('OPF contains invalid HTML named entities') self.logger.warn('OPF contains invalid HTML named entities')
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
data = re.sub(r'(?is)<tours>.+</tours>', '', data) data = re.sub(r'(?is)<tours>.+</tours>', '', data)
data = data.replace('<dc-metadata>', data = data.replace('<dc-metadata>',
'<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">') '<dc-metadata xmlns:dc="'
opf = safe_xml_fromstring(data) 'http://purl.org/metadata/dublin_core">')
opf = etree.fromstring(data)
self.logger.warn('OPF contains invalid tours section') self.logger.warn('OPF contains invalid tours section')
ns = namespace(opf.tag) ns = parse_utils.namespace(opf.tag)
if ns not in ('', OPF1_NS, OPF2_NS): if ns not in ('', const.OPF1_NS, const.OPF2_NS):
raise OEBError('Invalid namespace %r for OPF document' % ns) raise base.OEBError('Invalid namespace %r for OPF document' % ns)
opf = self._clean_opf(opf) opf = self._clean_opf(opf)
return opf return opf
def _metadata_from_opf(self, opf): def _metadata_from_opf(self, opf):
from ebook_converter.ebooks.metadata.opf2 import OPF from ebook_converter.ebooks.metadata.opf2 import OPF
from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata from ebook_converter.ebooks.oeb.transforms.metadata import \
stream = io.BytesIO(etree.tostring(opf, xml_declaration=True, encoding='utf-8')) meta_info_to_oeb_metadata
stream = io.BytesIO(etree.tostring(opf, xml_declaration=True,
encoding='utf-8'))
# o = opf_meta.OPF(stream)
o = OPF(stream) o = OPF(stream)
pwm = o.primary_writing_mode pwm = o.primary_writing_mode
if pwm: if pwm:
@@ -139,8 +140,8 @@ class OEBReader(object):
mi.language = get_lang().replace('_', '-') mi.language = get_lang().replace('_', '-')
self.oeb.metadata.add('language', mi.language) self.oeb.metadata.add('language', mi.language)
if not mi.book_producer: if not mi.book_producer:
mi.book_producer = '%(a)s (%(v)s) [http://%(a)s-ebook.com]'%\ mi.book_producer = ('%(a)s (%(v)s) [http://%(a)s-ebook.com]' %
dict(a=__appname__, v=__version__) dict(a=__appname__, v=__version__))
meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger) meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger)
m = self.oeb.metadata m = self.oeb.metadata
m.add('identifier', str(uuid.uuid4()), id='uuid_id', scheme='uuid') m.add('identifier', str(uuid.uuid4()), id='uuid_id', scheme='uuid')
@@ -162,16 +163,16 @@ class OEBReader(object):
data. data.
''' '''
bad = [] bad = []
check = OEB_DOCS.union(OEB_STYLES) check = base.OEB_DOCS.union(base.OEB_STYLES)
for item in list(self.oeb.manifest.values()): for item in list(self.oeb.manifest.values()):
if item.media_type in check: if item.media_type in check:
try: try:
item.data item.data
except KeyboardInterrupt: except KeyboardInterrupt:
raise raise
except: except Exception:
self.logger.exception('Failed to parse content in %s'% self.logger.exception('Failed to parse content in %s' %
item.href) item.href)
bad.append(item) bad.append(item)
self.oeb.manifest.remove(item) self.oeb.manifest.remove(item)
return bad return bad
@@ -181,25 +182,28 @@ class OEBReader(object):
manifest = self.oeb.manifest manifest = self.oeb.manifest
known = set(manifest.hrefs) known = set(manifest.hrefs)
unchecked = set(manifest.values()) unchecked = set(manifest.values())
cdoc = OEB_DOCS|OEB_STYLES cdoc = base.OEB_DOCS | base.OEB_STYLES
invalid = set() invalid = set()
while unchecked: while unchecked:
new = set() new = set()
for item in unchecked: for item in unchecked:
data = None data = None
if (item.media_type in cdoc or item.media_type[-4:] in ('/xml', '+xml')): if (item.media_type in cdoc or
item.media_type[-4:] in ('/xml', '+xml')):
try: try:
data = item.data data = item.data
except: except Exception:
self.oeb.log.exception('Failed to read from manifest ' self.oeb.log.exception('Failed to read from manifest '
'entry with id: %s, ignoring'%item.id) 'entry with id: %s, ignoring' %
item.id)
invalid.add(item) invalid.add(item)
continue continue
if data is None: if data is None:
continue continue
if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')): if (item.media_type in base.OEB_DOCS or
hrefs = [r[2] for r in iterlinks(data)] item.media_type[-4:] in ('/xml', '+xml')):
hrefs = [r[2] for r in base.iterlinks(data)]
for href in hrefs: for href in hrefs:
if isinstance(href, bytes): if isinstance(href, bytes):
href = href.decode('utf-8') href = href.decode('utf-8')
@@ -207,22 +211,22 @@ class OEBReader(object):
if not href: if not href:
continue continue
try: try:
href = item.abshref(urlnormalize(href)) href = item.abshref(base.urlnormalize(href))
scheme = urllib.parse.urlparse(href).scheme scheme = urllib.parse.urlparse(href).scheme
except: except Exception:
self.oeb.log.exception( self.oeb.log.exception('Skipping invalid href: '
'Skipping invalid href: %r'%href) '%r' % href)
continue continue
if not scheme and href not in known: if not scheme and href not in known:
new.add(href) new.add(href)
elif item.media_type in OEB_STYLES: elif item.media_type in base.OEB_STYLES:
try: try:
urls = list(css_parser.getUrls(data)) urls = list(css_parser.getUrls(data))
except: except Exception:
urls = [] urls = []
for url in urls: for url in urls:
href, _ = urllib.parse.urldefrag(url) href, _ = urllib.parse.urldefrag(url)
href = item.abshref(urlnormalize(href)) href = item.abshref(base.urlnormalize(href))
scheme = urllib.parse.urlparse(href).scheme scheme = urllib.parse.urlparse(href).scheme
if not scheme and href not in known: if not scheme and href not in known:
new.add(href) new.add(href)
@@ -232,7 +236,7 @@ class OEBReader(object):
known.add(href) known.add(href)
is_invalid = False is_invalid = False
for item in invalid: for item in invalid:
if href == item.abshref(urlnormalize(href)): if href == item.abshref(base.urlnormalize(href)):
is_invalid = True is_invalid = True
break break
if is_invalid: if is_invalid:
@@ -243,11 +247,12 @@ class OEBReader(object):
warned.add(href) warned.add(href)
continue continue
if href not in warned: if href not in warned:
self.logger.warn('Referenced file %r not in manifest' % href) self.logger.warn('Referenced file %r not in manifest' %
href)
warned.add(href) warned.add(href)
id, _ = manifest.generate(id='added') id, _ = manifest.generate(id='added')
guessed = guess_type(href)[0] guessed = guess_type(href)[0]
media_type = guessed or BINARY_MIME media_type = guessed or base.BINARY_MIME
added = manifest.add(id, href, media_type) added = manifest.add(id, href, media_type)
unchecked.add(added) unchecked.add(added)
@@ -256,7 +261,7 @@ class OEBReader(object):
def _manifest_from_opf(self, opf): def _manifest_from_opf(self, opf):
manifest = self.oeb.manifest manifest = self.oeb.manifest
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): for elem in base.xpath(opf, '/o2:package/o2:manifest/o2:item'):
id = elem.get('id') id = elem.get('id')
href = elem.get('href') href = elem.get('href')
media_type = elem.get('media-type', None) media_type = elem.get('media-type', None)
@@ -264,7 +269,7 @@ class OEBReader(object):
media_type = elem.get('mediatype', None) media_type = elem.get('mediatype', None)
if not media_type or media_type == 'text/xml': if not media_type or media_type == 'text/xml':
guessed = guess_type(href)[0] guessed = guess_type(href)[0]
media_type = guessed or media_type or BINARY_MIME media_type = guessed or media_type or base.BINARY_MIME
if hasattr(media_type, 'lower'): if hasattr(media_type, 'lower'):
media_type = media_type.lower() media_type = media_type.lower()
fallback = elem.get('fallback') fallback = elem.get('fallback')
@@ -285,12 +290,12 @@ class OEBReader(object):
manifest = self.oeb.manifest manifest = self.oeb.manifest
spine = self.oeb.spine spine = self.oeb.spine
unchecked = set(spine) unchecked = set(spine)
selector = XPath('h:body//h:a/@href') selector = base.XPath('h:body//h:a/@href')
extras = set() extras = set()
while unchecked: while unchecked:
new = set() new = set()
for item in unchecked: for item in unchecked:
if item.media_type not in OEB_DOCS: if item.media_type not in base.OEB_DOCS:
# TODO: handle fallback chains # TODO: handle fallback chains
continue continue
for href in selector(item.data): for href in selector(item.data):
@@ -298,20 +303,21 @@ class OEBReader(object):
if not href: if not href:
continue continue
try: try:
href = item.abshref(urlnormalize(href)) href = item.abshref(base.urlnormalize(href))
except ValueError: # Malformed URL except ValueError: # Malformed URL
continue continue
if href not in manifest.hrefs: if href not in manifest.hrefs:
continue continue
found = manifest.hrefs[href] found = manifest.hrefs[href]
if found.media_type not in OEB_DOCS or \ if found.media_type not in base.OEB_DOCS or \
found in spine or found in extras: found in spine or found in extras:
continue continue
new.add(found) new.add(found)
extras.update(new) extras.update(new)
unchecked = new unchecked = new
version = int(self.oeb.version[0]) version = int(self.oeb.version[0])
removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore', ()) removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore',
())
for item in sorted(extras): for item in sorted(extras):
if item.href in removed_items_to_ignore: if item.href in removed_items_to_ignore:
continue continue
@@ -323,34 +329,38 @@ class OEBReader(object):
def _spine_from_opf(self, opf): def _spine_from_opf(self, opf):
spine = self.oeb.spine spine = self.oeb.spine
manifest = self.oeb.manifest manifest = self.oeb.manifest
for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): for elem in base.xpath(opf, '/o2:package/o2:spine/o2:itemref'):
idref = elem.get('idref') idref = elem.get('idref')
if idref not in manifest.ids: if idref not in manifest.ids:
self.logger.warn('Spine item %r not found' % idref) self.logger.warn('Spine item %r not found' % idref)
continue continue
item = manifest.ids[idref] item = manifest.ids[idref]
if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath') and not getattr(item.data, 'tag', '').endswith('}ncx'): if (item.media_type.lower() in base.OEB_DOCS and
hasattr(item.data, 'xpath') and not
getattr(item.data, 'tag', '').endswith('}ncx')):
spine.add(item, elem.get('linear')) spine.add(item, elem.get('linear'))
else: else:
if hasattr(item.data, 'tag') and item.data.tag and item.data.tag.endswith('}html'): if (hasattr(item.data, 'tag') and
item.media_type = XHTML_MIME item.data.tag and item.data.tag.endswith('}html')):
item.media_type = base.XHTML_MIME
spine.add(item, elem.get('linear')) spine.add(item, elem.get('linear'))
else: else:
self.oeb.log.warn('The item %s is not a XML document.' self.oeb.log.warn('The item %s is not a XML document.'
' Removing it from spine.'%item.href) ' Removing it from spine.' % item.href)
if len(spine) == 0: if len(spine) == 0:
raise OEBError("Spine is empty") raise base.OEBError("Spine is empty")
self._spine_add_extra() self._spine_add_extra()
for val in xpath(opf, '/o2:package/o2:spine/@page-progression-direction'): for val in base.xpath(opf,
'/o2:package/o2:spine/@page-progression-direction'):
if val in {'ltr', 'rtl'}: if val in {'ltr', 'rtl'}:
spine.page_progression_direction = val spine.page_progression_direction = val
def _guide_from_opf(self, opf): def _guide_from_opf(self, opf):
guide = self.oeb.guide guide = self.oeb.guide
manifest = self.oeb.manifest manifest = self.oeb.manifest
for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): for elem in base.xpath(opf, '/o2:package/o2:guide/o2:reference'):
ref_href = elem.get('href') ref_href = elem.get('href')
path = urlnormalize(urllib.parse.urldefrag(ref_href)[0]) path = base.urlnormalize(urllib.parse.urldefrag(ref_href)[0])
if path not in manifest.hrefs: if path not in manifest.hrefs:
corrected_href = None corrected_href = None
for href in manifest.hrefs: for href in manifest.hrefs:
@@ -366,7 +376,7 @@ class OEBReader(object):
guide.add(typ, elem.get('title'), ref_href) guide.add(typ, elem.get('title'), ref_href)
def _find_ncx(self, opf): def _find_ncx(self, opf):
result = xpath(opf, '/o2:package/o2:spine/@toc') result = base.xpath(opf, '/o2:package/o2:spine/@toc')
if result: if result:
id = result[0] id = result[0]
if id not in self.oeb.manifest.ids: if id not in self.oeb.manifest.ids:
@@ -375,30 +385,33 @@ class OEBReader(object):
self.oeb.manifest.remove(item) self.oeb.manifest.remove(item)
return item return item
for item in self.oeb.manifest.values(): for item in self.oeb.manifest.values():
if item.media_type == NCX_MIME: if item.media_type == base.NCX_MIME:
self.oeb.manifest.remove(item) self.oeb.manifest.remove(item)
return item return item
return None return None
def _toc_from_navpoint(self, item, toc, navpoint): def _toc_from_navpoint(self, item, toc, navpoint):
children = xpath(navpoint, 'ncx:navPoint') children = base.xpath(navpoint, 'ncx:navPoint')
for child in children: for child in children:
title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) title = ''.join(base.xpath(child, 'ncx:navLabel/ncx:text/text()'))
title = COLLAPSE_RE.sub(' ', title.strip()) title = base.COLLAPSE_RE.sub(' ', title.strip())
href = xpath(child, 'ncx:content/@src') href = base.xpath(child, 'ncx:content/@src')
if not title: if not title:
self._toc_from_navpoint(item, toc, child) self._toc_from_navpoint(item, toc, child)
continue continue
if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'): if (not href or not href[0]) and not base.xpath(child, 'ncx:navPoint'):
# This node is useless # This node is useless
continue continue
href = item.abshref(urlnormalize(href[0])) if href and href[0] else '' if href and href[0]:
href = item.abshref(base.urlnormalize(href[0]))
else:
href = ''
path, _ = urllib.parse.urldefrag(href) path, _ = urllib.parse.urldefrag(href)
if path and path not in self.oeb.manifest.hrefs: if path and path not in self.oeb.manifest.hrefs:
path = urlnormalize(path) path = base.urlnormalize(path)
if href and path not in self.oeb.manifest.hrefs: if href and path not in self.oeb.manifest.hrefs:
self.logger.warn('TOC reference %r not found' % href) self.logger.warn('TOC reference %r not found' % href)
gc = xpath(child, 'ncx:navPoint') gc = base.xpath(child, 'ncx:navPoint')
if not gc: if not gc:
# This node is useless # This node is useless
continue continue
@@ -406,36 +419,40 @@ class OEBReader(object):
klass = child.get('class', 'chapter') klass = child.get('class', 'chapter')
try: try:
po = int(child.get('playOrder', self.oeb.toc.next_play_order())) po = int(child.get('playOrder',
except: self.oeb.toc.next_play_order()))
except Exception:
po = self.oeb.toc.next_play_order() po = self.oeb.toc.next_play_order()
authorElement = xpath(child, authorElement = base.xpath(child,
'descendant::calibre:meta[@name = "author"]') 'descendant::calibre:meta[@name = "author"]')
if authorElement: if authorElement:
author = authorElement[0].text author = authorElement[0].text
else: else:
author = None author = None
descriptionElement = xpath(child, descriptionElement = base.xpath(child,
'descendant::calibre:meta[@name = "description"]') 'descendant::calibre:meta[@name = '
'"description"]')
if descriptionElement: if descriptionElement:
description = etree.tostring(descriptionElement[0], description = etree.tostring(descriptionElement[0],
method='text', encoding='unicode').strip() method='text',
encoding='unicode').strip()
if not description: if not description:
description = None description = None
else: else:
description = None description = None
index_image = xpath(child, index_image = base.xpath(child,
'descendant::calibre:meta[@name = "toc_thumbnail"]') 'descendant::calibre:meta[@name = '
'"toc_thumbnail"]')
toc_thumbnail = (index_image[0].text if index_image else None) toc_thumbnail = (index_image[0].text if index_image else None)
if not toc_thumbnail or not toc_thumbnail.strip(): if not toc_thumbnail or not toc_thumbnail.strip():
toc_thumbnail = None toc_thumbnail = None
node = toc.add(title, href, id=id, klass=klass, node = toc.add(title, href, id=id, klass=klass,
play_order=po, description=description, author=author, play_order=po, description=description,
toc_thumbnail=toc_thumbnail) author=author, toc_thumbnail=toc_thumbnail)
self._toc_from_navpoint(item, node, child) self._toc_from_navpoint(item, node, child)
@@ -444,31 +461,31 @@ class OEBReader(object):
return False return False
self.log.debug('Reading TOC from NCX...') self.log.debug('Reading TOC from NCX...')
ncx = item.data ncx = item.data
title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) title = ''.join(base.xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
title = COLLAPSE_RE.sub(' ', title.strip()) title = base.COLLAPSE_RE.sub(' ', title.strip())
title = title or str(self.oeb.metadata.title[0]) title = title or str(self.oeb.metadata.title[0])
toc = self.oeb.toc toc = self.oeb.toc
toc.title = title toc.title = title
navmaps = xpath(ncx, 'ncx:navMap') navmaps = base.xpath(ncx, 'ncx:navMap')
for navmap in navmaps: for navmap in navmaps:
self._toc_from_navpoint(item, toc, navmap) self._toc_from_navpoint(item, toc, navmap)
return True return True
def _toc_from_tour(self, opf): def _toc_from_tour(self, opf):
result = xpath(opf, 'o2:tours/o2:tour') result = base.xpath(opf, 'o2:tours/o2:tour')
if not result: if not result:
return False return False
self.log.debug('Reading TOC from tour...') self.log.debug('Reading TOC from tour...')
tour = result[0] tour = result[0]
toc = self.oeb.toc toc = self.oeb.toc
toc.title = tour.get('title') toc.title = tour.get('title')
sites = xpath(tour, 'o2:site') sites = base.xpath(tour, 'o2:site')
for site in sites: for site in sites:
title = site.get('title') title = site.get('title')
href = site.get('href') href = site.get('href')
if not title or not href: if not title or not href:
continue continue
path, _ = urllib.parse.urldefrag(urlnormalize(href)) path, _ = urllib.parse.urldefrag(base.urlnormalize(href))
if path not in self.oeb.manifest.hrefs: if path not in self.oeb.manifest.hrefs:
self.logger.warn('TOC reference %r not found' % href) self.logger.warn('TOC reference %r not found' % href)
continue continue
@@ -484,23 +501,23 @@ class OEBReader(object):
item = self.oeb.manifest.hrefs[itempath] item = self.oeb.manifest.hrefs[itempath]
html = item.data html = item.data
if frag: if frag:
elems = xpath(html, './/*[@id="%s"]' % frag) elems = base.xpath(html, './/*[@id="%s"]' % frag)
if not elems: if not elems:
elems = xpath(html, './/*[@name="%s"]' % frag) elems = base.xpath(html, './/*[@name="%s"]' % frag)
elem = elems[0] if elems else html elem = elems[0] if elems else html
while elem != html and not xpath(elem, './/h:a[@href]'): while elem != html and not base.xpath(elem, './/h:a[@href]'):
elem = elem.getparent() elem = elem.getparent()
html = elem html = elem
titles = defaultdict(list) titles = collections.defaultdict(list)
order = [] order = []
for anchor in xpath(html, './/h:a[@href]'): for anchor in base.xpath(html, './/h:a[@href]'):
href = anchor.attrib['href'] href = anchor.attrib['href']
href = item.abshref(urlnormalize(href)) href = item.abshref(base.urlnormalize(href))
path, frag = urllib.parse.urldefrag(href) path, frag = urllib.parse.urldefrag(href)
if path not in self.oeb.manifest.hrefs: if path not in self.oeb.manifest.hrefs:
continue continue
title = xml2text(anchor) title = base.xml2text(anchor)
title = COLLAPSE_RE.sub(' ', title.strip()) title = base.COLLAPSE_RE.sub(' ', title.strip())
if href not in titles: if href not in titles:
order.append(href) order.append(href)
titles[href].append(title) titles[href].append(title)
@@ -518,15 +535,15 @@ class OEBReader(object):
if not item.linear: if not item.linear:
continue continue
html = item.data html = item.data
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) title = ''.join(base.xpath(html, '/h:html/h:head/h:title/text()'))
title = COLLAPSE_RE.sub(' ', title.strip()) title = base.COLLAPSE_RE.sub(' ', title.strip())
if title: if title:
titles.append(title) titles.append(title)
headers.append('(unlabled)') headers.append('(unlabled)')
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
expr = '/h:html/h:body//h:%s[position()=1]/text()' expr = '/h:html/h:body//h:%s[position()=1]/text()'
header = ''.join(xpath(html, expr % tag)) header = ''.join(base.xpath(html, expr % tag))
header = COLLAPSE_RE.sub(' ', header.strip()) header = base.COLLAPSE_RE.sub(' ', header.strip())
if header: if header:
headers[-1] = header headers[-1] = header
break break
@@ -558,17 +575,17 @@ class OEBReader(object):
ncx = item.data ncx = item.data
if ncx is None: if ncx is None:
return False return False
ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') ptargets = base.xpath(ncx, 'ncx:pageList/ncx:pageTarget')
if not ptargets: if not ptargets:
return False return False
pages = self.oeb.pages pages = self.oeb.pages
for ptarget in ptargets: for ptarget in ptargets:
name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) name = ''.join(base.xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
name = COLLAPSE_RE.sub(' ', name.strip()) name = base.COLLAPSE_RE.sub(' ', name.strip())
href = xpath(ptarget, 'ncx:content/@src') href = base.xpath(ptarget, 'ncx:content/@src')
if not href: if not href:
continue continue
href = item.abshref(urlnormalize(href[0])) href = item.abshref(base.urlnormalize(href[0]))
id = ptarget.get('id') id = ptarget.get('id')
type = ptarget.get('type', 'normal') type = ptarget.get('type', 'normal')
klass = ptarget.get('class') klass = ptarget.get('class')
@@ -576,7 +593,7 @@ class OEBReader(object):
return True return True
def _find_page_map(self, opf): def _find_page_map(self, opf):
result = xpath(opf, '/o2:package/o2:spine/@page-map') result = base.xpath(opf, '/o2:package/o2:spine/@page-map')
if result: if result:
id = result[0] id = result[0]
if id not in self.oeb.manifest.ids: if id not in self.oeb.manifest.ids:
@@ -585,7 +602,7 @@ class OEBReader(object):
self.oeb.manifest.remove(item) self.oeb.manifest.remove(item)
return item return item
for item in self.oeb.manifest.values(): for item in self.oeb.manifest.values():
if item.media_type == PAGE_MAP_MIME: if item.media_type == base.PAGE_MAP_MIME:
self.oeb.manifest.remove(item) self.oeb.manifest.remove(item)
return item return item
return None return None
@@ -596,13 +613,13 @@ class OEBReader(object):
return False return False
pmap = item.data pmap = item.data
pages = self.oeb.pages pages = self.oeb.pages
for page in xpath(pmap, 'o2:page'): for page in base.xpath(pmap, 'o2:page'):
name = page.get('name', '') name = page.get('name', '')
href = page.get('href') href = page.get('href')
if not href: if not href:
continue continue
name = COLLAPSE_RE.sub(' ', name.strip()) name = base.COLLAPSE_RE.sub(' ', name.strip())
href = item.abshref(urlnormalize(href)) href = item.abshref(base.urlnormalize(href))
type = 'normal' type = 'normal'
if not name: if not name:
type = 'special' type = 'special'
@@ -628,14 +645,14 @@ class OEBReader(object):
if not data: if not data:
data = b'' data = b''
id, href = self.oeb.manifest.generate('cover', 'cover.jpg') id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data) item = self.oeb.manifest.add(id, href, base.JPEG_MIME, data=data)
return item return item
def _locate_cover_image(self): def _locate_cover_image(self):
if self.oeb.metadata.cover: if self.oeb.metadata.cover:
id = str(self.oeb.metadata.cover[0]) id = str(self.oeb.metadata.cover[0])
item = self.oeb.manifest.ids.get(id, None) item = self.oeb.manifest.ids.get(id, None)
if item is not None and item.media_type in OEB_IMAGES: if item is not None and item.media_type in base.OEB_IMAGES:
return item return item
else: else:
self.logger.warn('Invalid cover image @id %r' % id) self.logger.warn('Invalid cover image @id %r' % id)
@@ -644,27 +661,27 @@ class OEBReader(object):
href = self.oeb.guide['cover'].href href = self.oeb.guide['cover'].href
item = self.oeb.manifest.hrefs[href] item = self.oeb.manifest.hrefs[href]
media_type = item.media_type media_type = item.media_type
if media_type in OEB_IMAGES: if media_type in base.OEB_IMAGES:
return item return item
elif media_type in OEB_DOCS: elif media_type in base.OEB_DOCS:
hcover = item hcover = item
html = hcover.data html = hcover.data
if MS_COVER_TYPE in self.oeb.guide: if base.MS_COVER_TYPE in self.oeb.guide:
href = self.oeb.guide[MS_COVER_TYPE].href href = self.oeb.guide[base.MS_COVER_TYPE].href
item = self.oeb.manifest.hrefs.get(href, None) item = self.oeb.manifest.hrefs.get(href, None)
if item is not None and item.media_type in OEB_IMAGES: if item is not None and item.media_type in base.OEB_IMAGES:
return item return item
if self.COVER_SVG_XP(html): if self.COVER_SVG_XP(html):
svg = copy.deepcopy(self.COVER_SVG_XP(html)[0]) svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
href = os.path.splitext(hcover.href)[0] + '.svg' href = os.path.splitext(hcover.href)[0] + '.svg'
id, href = self.oeb.manifest.generate(hcover.id, href) id, href = self.oeb.manifest.generate(hcover.id, href)
item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg) item = self.oeb.manifest.add(id, href, base.SVG_MIME, data=svg)
return item return item
if self.COVER_OBJECT_XP(html): if self.COVER_OBJECT_XP(html):
object = self.COVER_OBJECT_XP(html)[0] object = self.COVER_OBJECT_XP(html)[0]
href = hcover.abshref(object.get('data')) href = hcover.abshref(object.get('data'))
item = self.oeb.manifest.hrefs.get(href, None) item = self.oeb.manifest.hrefs.get(href, None)
if item is not None and item.media_type in OEB_IMAGES: if item is not None and item.media_type in base.OEB_IMAGES:
return item return item
return self._cover_from_html(hcover) return self._cover_from_html(hcover)
@@ -687,7 +704,8 @@ class OEBReader(object):
items = [x for x in self.oeb.manifest if x.href == href] items = [x for x in self.oeb.manifest if x.href == href]
for x in items: for x in items:
if x not in self.oeb.spine: if x not in self.oeb.spine:
self.oeb.log.warn('Removing duplicate manifest item with id:', x.id) self.oeb.log.warn('Removing duplicate manifest item with '
'id:', x.id)
self.oeb.manifest.remove_duplicate_item(x) self.oeb.manifest.remove_duplicate_item(x)
def _all_from_opf(self, opf): def _all_from_opf(self, opf):
@@ -706,7 +724,7 @@ class OEBReader(object):
def main(argv=sys.argv): def main(argv=sys.argv):
reader = OEBReader() reader = OEBReader()
for arg in argv[1:]: for arg in argv[1:]:
oeb = reader(OEBBook(), arg) oeb = reader(base.OEBBook(), arg)
for name, doc in oeb.to_opf1().values(): for name, doc in oeb.to_opf1().values():
print(etree.tostring(doc, pretty_print=True)) print(etree.tostring(doc, pretty_print=True))
for name, doc in oeb.to_opf2(page_map=True).values(): for name, doc in oeb.to_opf2(page_map=True).values():
+14 -14
View File
@@ -10,17 +10,16 @@ from css_parser.css import (CSSStyleRule, CSSPageRule, CSSFontFaceRule,
cssproperties) cssproperties)
from css_parser import (profile as cssprofiles, parseString, parseStyle, log as from css_parser import (profile as cssprofiles, parseString, parseStyle, log as
css_parser_log, CSSParser, profiles, replaceUrls) css_parser_log, CSSParser, profiles, replaceUrls)
from ebook_converter import constants as const
from ebook_converter import force_unicode, as_unicode from ebook_converter import force_unicode, as_unicode
from ebook_converter.ebooks import unit_convert from ebook_converter.ebooks import unit_convert
from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, xpath, urlnormalize from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.normalize_css import DEFAULTS, normalizers from ebook_converter.ebooks.oeb.normalize_css import DEFAULTS, normalizers
from ebook_converter.css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES from ebook_converter.css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES
from ebook_converter.tinycss.media3 import CSSMedia3Parser from ebook_converter.tinycss.media3 import CSSMedia3Parser
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
css_parser_log.setLevel(logging.WARN) css_parser_log.setLevel(logging.WARN)
_html_css_stylesheet = None _html_css_stylesheet = None
@@ -208,7 +207,7 @@ class Stylizer(object):
stylesheets = [html_css_stylesheet()] stylesheets = [html_css_stylesheet()]
if base_css: if base_css:
stylesheets.append(parseString(base_css, validate=False)) stylesheets.append(parseString(base_css, validate=False))
style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]') style_tags = base.xpath(tree, '//*[local-name()="style" or local-name()="link"]')
# Add css_parser parsing profiles from output_profile # Add css_parser parsing profiles from output_profile
for profile in self.opts.output_profile.extra_css_modules: for profile in self.opts.output_profile.extra_css_modules:
@@ -219,7 +218,7 @@ class Stylizer(object):
parser = CSSParser(fetcher=self._fetch_css_file, parser = CSSParser(fetcher=self._fetch_css_file,
log=logging.getLogger('calibre.css')) log=logging.getLogger('calibre.css'))
for elem in style_tags: for elem in style_tags:
if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))): if (elem.tag == base.tag('xhtml', 'style') and elem.get('type', base.CSS_MIME) in base.OEB_STYLES and media_ok(elem.get('media'))):
text = elem.text if elem.text else '' text = elem.text if elem.text else ''
for x in elem: for x in elem:
t = getattr(x, 'text', None) t = getattr(x, 'text', None)
@@ -245,7 +244,7 @@ class Stylizer(object):
self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href) self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
continue continue
sitem = hrefs[ihref] sitem = hrefs[ihref]
if sitem.media_type not in OEB_STYLES: if sitem.media_type not in base.OEB_STYLES:
self.logger.warn('CSS @import of non-CSS file %r' % rule.href) self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
continue continue
stylesheets.append(sitem.data) stylesheets.append(sitem.data)
@@ -254,11 +253,11 @@ class Stylizer(object):
replaceUrls(stylesheet, item.abshref, replaceUrls(stylesheet, item.abshref,
ignoreImportRules=True) ignoreImportRules=True)
stylesheets.append(stylesheet) stylesheets.append(stylesheet)
elif (elem.tag == XHTML('link') and elem.get('href') and elem.get( elif (elem.tag == base.tag('xhtml', 'link') and elem.get('href') and elem.get(
'rel', 'stylesheet').lower() == 'stylesheet' and elem.get( 'rel', 'stylesheet').lower() == 'stylesheet' and elem.get(
'type', CSS_MIME).lower() in OEB_STYLES and media_ok(elem.get('media')) 'type', base.CSS_MIME).lower() in base.OEB_STYLES and media_ok(elem.get('media'))
): ):
href = urlnormalize(elem.attrib['href']) href = base.urlnormalize(elem.attrib['href'])
path = item.abshref(href) path = item.abshref(href)
sitem = oeb.manifest.hrefs.get(path, None) sitem = oeb.manifest.hrefs.get(path, None)
if sitem is None: if sitem is None:
@@ -326,7 +325,8 @@ class Stylizer(object):
special_text = ''.join(punctuation_chars) + \ special_text = ''.join(punctuation_chars) + \
(text[0] if text else '') (text[0] if text else '')
span = x.makeelement('{%s}span' % XHTML_NS) span = x.makeelement('{%s}span' %
const.XHTML_NS)
span.text = special_text span.text = special_text
span.set('data-fake-first-letter', '1') span.set('data-fake-first-letter', '1')
span.tail = text[1:] span.tail = text[1:]
@@ -340,10 +340,10 @@ class Stylizer(object):
else: else:
for elem in matches: for elem in matches:
self.style(elem)._update_cssdict(cssdict) self.style(elem)._update_cssdict(cssdict)
for elem in xpath(tree, '//h:*[@style]'): for elem in base.xpath(tree, '//h:*[@style]'):
self.style(elem)._apply_style_attr(url_replacer=item.abshref) self.style(elem)._apply_style_attr(url_replacer=item.abshref)
num_pat = re.compile(r'[0-9.]+$') num_pat = re.compile(r'[0-9.]+$')
for elem in xpath(tree, '//h:img[@width or @height]'): for elem in base.xpath(tree, '//h:img[@width or @height]'):
style = self.style(elem) style = self.style(elem)
# Check if either height or width is not default # Check if either height or width is not default
is_styled = style._style.get('width', 'auto') != 'auto' or \ is_styled = style._style.get('width', 'auto') != 'auto' or \
@@ -370,7 +370,7 @@ class Stylizer(object):
self.logger.warn('CSS import of missing file %r' % path) self.logger.warn('CSS import of missing file %r' % path)
return (None, None) return (None, None)
item = hrefs[path] item = hrefs[path]
if item.media_type not in OEB_STYLES: if item.media_type not in base.OEB_STYLES:
self.logger.warn('CSS import of non-CSS file %r' % path) self.logger.warn('CSS import of non-CSS file %r' % path)
return (None, None) return (None, None)
data = item.data.cssText data = item.data.cssText
+52 -56
View File
@@ -1,66 +1,61 @@
import textwrap import textwrap
import urllib.parse import urllib.parse
from lxml import etree
from ebook_converter import guess_type from ebook_converter import guess_type
from ebook_converter.utils.imghdr import identify from ebook_converter.utils.imghdr import identify
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.polyglot.urllib import unquote from ebook_converter.polyglot.urllib import unquote
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class CoverManager(object): class CoverManager(object):
SVG_TEMPLATE = textwrap.dedent('''\ SVG_TEMPLATE = textwrap.dedent('''\
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head> <head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="calibre:cover" content="true" /> <meta name="calibre:cover" content="true" />
<title>Cover</title> <title>Cover</title>
<style type="text/css" title="override_css"> <style type="text/css" title="override_css">
@page {padding: 0pt; margin:0pt} @page {padding: 0pt; margin:0pt}
body { text-align: center; padding:0pt; margin: 0pt; } body { text-align: center; padding:0pt; margin: 0pt; }
</style> </style>
</head> </head>
<body> <body>
<div> <div>
<svg version="1.1" xmlns="http://www.w3.org/2000/svg" <svg version="1.1" xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xlink="http://www.w3.org/1999/xlink"
width="100%%" height="100%%" viewBox="__viewbox__" width="100%%" height="100%%" viewBox="__viewbox__"
preserveAspectRatio="__ar__"> preserveAspectRatio="__ar__">
<image width="__width__" height="__height__" xlink:href="%s"/> <image width="__width__" height="__height__" xlink:href="%s"/>
</svg> </svg>
</div> </div>
</body> </body>
</html> </html>''')
''')
NONSVG_TEMPLATE = textwrap.dedent('''\ NONSVG_TEMPLATE = textwrap.dedent('''\
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head> <head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="calibre:cover" content="true" /> <meta name="calibre:cover" content="true" />
<title>Cover</title> <title>Cover</title>
<style type="text/css" title="override_css"> <style type="text/css" title="override_css">
@page {padding: 0pt; margin:0pt} @page {padding: 0pt; margin:0pt}
body { text-align: center; padding:0pt; margin: 0pt } body { text-align: center; padding:0pt; margin: 0pt }
div { padding:0pt; margin: 0pt } div { padding:0pt; margin: 0pt }
img { padding:0pt; margin: 0pt } img { padding:0pt; margin: 0pt }
</style> </style>
</head> </head>
<body> <body>
<div> <div>
<img src="%s" alt="cover" __style__ /> <img src="%s" alt="cover" __style__ />
</div> </div>
</body> </body>
</html> </html>
''') ''')
def __init__(self, no_default_cover=False, no_svg_cover=False, def __init__(self, no_default_cover=False, no_svg_cover=False,
preserve_aspect_ratio=False, fixed_size=None): preserve_aspect_ratio=False, fixed_size=None):
self.no_default_cover = no_default_cover self.no_default_cover = no_default_cover
self.no_svg_cover = no_svg_cover self.no_svg_cover = no_svg_cover
self.preserve_aspect_ratio = preserve_aspect_ratio self.preserve_aspect_ratio = preserve_aspect_ratio
@@ -72,9 +67,9 @@ class CoverManager(object):
style = 'style="height: 100%%"' style = 'style="height: 100%%"'
else: else:
width, height = fixed_size width, height = fixed_size
style = 'style="height: %s; width: %s"'%(height, width) style = 'style="height: %s; width: %s"' % (height, width)
self.non_svg_template = self.NONSVG_TEMPLATE.replace('__style__', self.non_svg_template = self.NONSVG_TEMPLATE.replace('__style__',
style) style)
def __call__(self, oeb, opts, log): def __call__(self, oeb, opts, log):
self.oeb = oeb self.oeb = oeb
@@ -108,22 +103,23 @@ class CoverManager(object):
# if self.preserve_aspect_ratio: # if self.preserve_aspect_ratio:
# width, height = 600, 800 # width, height = 600, 800
self.svg_template = self.svg_template.replace('__viewbox__', self.svg_template = self.svg_template.replace('__viewbox__',
'0 0 %d %d'%(width, height)) '0 0 %d %d' %
(width, height))
self.svg_template = self.svg_template.replace('__width__', self.svg_template = self.svg_template.replace('__width__',
str(width)) str(width))
self.svg_template = self.svg_template.replace('__height__', self.svg_template = self.svg_template.replace('__height__',
str(height)) str(height))
if href is not None: if href is not None:
templ = self.non_svg_template if self.no_svg_cover \ templ = self.non_svg_template if self.no_svg_cover \
else self.svg_template else self.svg_template
tp = templ%unquote(href) tp = templ % unquote(href)
id, href = m.generate('titlepage', 'titlepage.xhtml') id, href = m.generate('titlepage', 'titlepage.xhtml')
item = m.add(id, href, guess_type('t.xhtml')[0], item = m.add(id, href, guess_type('t.xhtml')[0],
data=safe_xml_fromstring(tp)) data=etree.fromstring(tp))
else: else:
item = self.oeb.manifest.hrefs[ key = urllib.parse.urldefrag(self.oeb.guide['titlepage'].href)[0]
urllib.parse.urldefrag(self.oeb.guide['titlepage'].href)[0]] item = self.oeb.manifest.hrefs[key]
if item is not None: if item is not None:
self.oeb.spine.insert(0, item, True) self.oeb.spine.insert(0, item, True)
if 'cover' not in self.oeb.guide.refs: if 'cover' not in self.oeb.guide.refs:
@@ -1,26 +1,27 @@
""" """
CSS flattening transform. CSS flattening transform.
""" """
import re, operator, math, numbers import collections
from collections import defaultdict import math
from xml.dom import SyntaxErr import numbers
import operator
import re
from xml import dom
from lxml import etree from lxml import etree
import css_parser import css_parser
from css_parser.css import Property from css_parser import css as cp_css
from ebook_converter import constants as const
from ebook_converter import guess_type from ebook_converter import guess_type
from ebook_converter.ebooks import unit_convert from ebook_converter.ebooks import unit_convert
from ebook_converter.ebooks.oeb.base import (XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, from ebook_converter.ebooks.oeb import base
namespace, barename, XPath, css_text) from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer from ebook_converter.ebooks.oeb.stylizer import Stylizer
from ebook_converter.utils.filenames import ascii_filename, ascii_text from ebook_converter.utils.filenames import ascii_filename, ascii_text
from ebook_converter.utils.icu import numeric_sort_key
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
COLLAPSE = re.compile(r'[ \t\r\n\v]+') COLLAPSE = re.compile(r'[ \t\r\n\v]+')
STRIPNUM = re.compile(r'[-0-9]+$') STRIPNUM = re.compile(r'[-0-9]+$')
@@ -121,7 +122,7 @@ class EmbedFontsCSSRules(object):
return None return None
if not self.href: if not self.href:
iid, href = oeb.manifest.generate('page_styles', 'page_styles.css') iid, href = oeb.manifest.generate('page_styles', 'page_styles.css')
rules = [css_text(x) for x in self.rules] rules = [base.css_text(x) for x in self.rules]
rules = '\n\n'.join(rules) rules = '\n\n'.join(rules)
sheet = css_parser.parseString(rules, validate=False) sheet = css_parser.parseString(rules, validate=False)
self.href = oeb.manifest.add(iid, href, guess_type(href)[0], self.href = oeb.manifest.add(iid, href, guess_type(href)[0],
@@ -186,7 +187,7 @@ class CSSFlattener(object):
for item in oeb.manifest.values(): for item in oeb.manifest.values():
# Make all links to resources absolute, as these sheets will be # Make all links to resources absolute, as these sheets will be
# consolidated into a single stylesheet at the root of the document # consolidated into a single stylesheet at the root of the document
if item.media_type in OEB_STYLES: if item.media_type in base.OEB_STYLES:
css_parser.replaceUrls(item.data, item.abshref, css_parser.replaceUrls(item.data, item.abshref,
ignoreImportRules=True) ignoreImportRules=True)
@@ -273,7 +274,7 @@ class CSSFlattener(object):
css = '' css = ''
for item in self.items: for item in self.items:
html = item.data html = item.data
body = html.find(XHTML('body')) body = html.find(base.tag('xhtml', 'body'))
if 'style' in html.attrib: if 'style' in html.attrib:
b = body.attrib.get('style', '') b = body.attrib.get('style', '')
body.set('style', html.get('style') + ';' + b) body.set('style', html.get('style') + ';' + b)
@@ -310,11 +311,11 @@ class CSSFlattener(object):
sizes[csize] += len(COLLAPSE.sub(' ', child.tail)) sizes[csize] += len(COLLAPSE.sub(' ', child.tail))
def baseline_spine(self): def baseline_spine(self):
sizes = defaultdict(float) sizes = collections.defaultdict(float)
for item in self.items: for item in self.items:
html = item.data html = item.data
stylizer = self.stylizers[item] stylizer = self.stylizers[item]
body = html.find(XHTML('body')) body = html.find(base.tag('xhtml', 'body'))
fsize = self.context.source.fbase fsize = self.context.source.fbase
self.baseline_node(body, stylizer, sizes, fsize) self.baseline_node(body, stylizer, sizes, fsize)
try: try:
@@ -351,9 +352,9 @@ class CSSFlattener(object):
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id, recurse=True): def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id, recurse=True):
if not isinstance(node.tag, (str, bytes)) \ if not isinstance(node.tag, (str, bytes)) \
or namespace(node.tag) != XHTML_NS: or parse_utils.namespace(node.tag) != const.XHTML_NS:
return return
tag = barename(node.tag) tag = parse_utils.barename(node.tag)
style = stylizer.style(node) style = stylizer.style(node)
cssdict = style.cssdict() cssdict = style.cssdict()
try: try:
@@ -375,7 +376,7 @@ class CSSFlattener(object):
if 'margin-left' not in cssdict and 'margin-right' not in cssdict: if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
cssdict['margin-left'] = cssdict['margin-right'] = 'auto' cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
else: else:
for table in node.iterchildren(XHTML("table")): for table in node.iterchildren(base.tag('xhtml', "table")):
ts = stylizer.style(table) ts = stylizer.style(table)
if ts.get('margin-left') is None and ts.get('margin-right') is None: if ts.get('margin-left') is None and ts.get('margin-right') is None:
ts.set('margin-left', 'auto') ts.set('margin-left', 'auto')
@@ -391,11 +392,12 @@ class CSSFlattener(object):
if cssdict.get('vertical-align') == 'inherit': if cssdict.get('vertical-align') == 'inherit':
cssdict['vertical-align'] = node.attrib['valign'] cssdict['vertical-align'] = node.attrib['valign']
del node.attrib['valign'] del node.attrib['valign']
if node.tag == XHTML('font'): if node.tag == base.tag('xhtml', 'font'):
tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1', tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')] 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
tag = 'div' if XPath('|'.join(tags))(node) else 'span' # TODO(gryf): this will override tag from line 355. On purpose?
node.tag = XHTML(tag) tag = 'div' if base.XPath('|'.join(tags))(node) else 'span'
node.tag = base.tag('xhtml', tag)
if 'size' in node.attrib: if 'size' in node.attrib:
def force_int(raw): def force_int(raw):
return int(re.search(r'([0-9+-]+)', raw).group(1)) return int(re.search(r'([0-9+-]+)', raw).group(1))
@@ -425,14 +427,14 @@ class CSSFlattener(object):
del node.attrib['face'] del node.attrib['face']
if 'color' in node.attrib: if 'color' in node.attrib:
try: try:
cssdict['color'] = Property('color', node.attrib['color']).value cssdict['color'] = cp_css.Property('color', node.attrib['color']).value
except (ValueError, SyntaxErr): except (ValueError, dom.SyntaxErr):
pass pass
del node.attrib['color'] del node.attrib['color']
if 'bgcolor' in node.attrib: if 'bgcolor' in node.attrib:
try: try:
cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value cssdict['background-color'] = cp_css.Property('background-color', node.attrib['bgcolor']).value
except (ValueError, SyntaxErr): except (ValueError, dom.SyntaxErr):
pass pass
del node.attrib['bgcolor'] del node.attrib['bgcolor']
if tag == 'ol' and 'type' in node.attrib: if tag == 'ol' and 'type' in node.attrib:
@@ -573,7 +575,7 @@ class CSSFlattener(object):
def flatten_head(self, item, href, global_href): def flatten_head(self, item, href, global_href):
html = item.data html = item.data
head = html.find(XHTML('head')) head = html.find(base.tag('xhtml', 'head'))
def safe_lower(x): def safe_lower(x):
try: try:
@@ -583,39 +585,39 @@ class CSSFlattener(object):
return x return x
for node in html.xpath('//*[local-name()="style" or local-name()="link"]'): for node in html.xpath('//*[local-name()="style" or local-name()="link"]'):
if node.tag == XHTML('link') \ if node.tag == base.tag('xhtml', 'link') \
and safe_lower(node.get('rel', 'stylesheet')) == 'stylesheet' \ and safe_lower(node.get('rel', 'stylesheet')) == 'stylesheet' \
and safe_lower(node.get('type', CSS_MIME)) in OEB_STYLES: and safe_lower(node.get('type', base.CSS_MIME)) in base.OEB_STYLES:
node.getparent().remove(node) node.getparent().remove(node)
elif node.tag == XHTML('style') \ elif node.tag == base.tag('xhtml', 'style') \
and node.get('type', CSS_MIME) in OEB_STYLES: and node.get('type', base.CSS_MIME) in base.OEB_STYLES:
node.getparent().remove(node) node.getparent().remove(node)
href = item.relhref(href) href = item.relhref(href)
l = etree.SubElement(head, XHTML('link'), l = etree.SubElement(head, base.tag('xhtml', 'link'),
rel='stylesheet', type=CSS_MIME, href=href) rel='stylesheet', type=base.CSS_MIME, href=href)
l.tail='\n' l.tail='\n'
if global_href: if global_href:
href = item.relhref(global_href) href = item.relhref(global_href)
l = etree.SubElement(head, XHTML('link'), l = etree.SubElement(head, base.tag('xhtml', 'link'),
rel='stylesheet', type=CSS_MIME, href=href) rel='stylesheet', type=base.CSS_MIME, href=href)
l.tail = '\n' l.tail = '\n'
def replace_css(self, css): def replace_css(self, css):
manifest = self.oeb.manifest manifest = self.oeb.manifest
for item in manifest.values(): for item in manifest.values():
if item.media_type in OEB_STYLES: if item.media_type in base.OEB_STYLES:
manifest.remove(item) manifest.remove(item)
id, href = manifest.generate('css', 'stylesheet.css') id, href = manifest.generate('css', 'stylesheet.css')
sheet = css_parser.parseString(css, validate=False) sheet = css_parser.parseString(css, validate=False)
if self.transform_css_rules: if self.transform_css_rules:
from ebook_converter.ebooks.css_transform_rules import transform_sheet from ebook_converter.ebooks.css_transform_rules import transform_sheet
transform_sheet(self.transform_css_rules, sheet) transform_sheet(self.transform_css_rules, sheet)
item = manifest.add(id, href, CSS_MIME, data=sheet) item = manifest.add(id, href, base.CSS_MIME, data=sheet)
self.oeb.manifest.main_stylesheet = item self.oeb.manifest.main_stylesheet = item
return href return href
def collect_global_css(self): def collect_global_css(self):
global_css = defaultdict(list) global_css = collections.defaultdict(list)
for item in self.items: for item in self.items:
stylizer = self.stylizers[item] stylizer = self.stylizers[item]
if float(self.context.margin_top) >= 0: if float(self.context.margin_top) >= 0:
@@ -627,7 +629,7 @@ class CSSFlattener(object):
items = sorted(stylizer.page_rule.items()) items = sorted(stylizer.page_rule.items())
css = ';\n'.join("%s: %s" % (key, val) for key, val in items) css = ';\n'.join("%s: %s" % (key, val) for key, val in items)
css = ('@page {\n%s\n}\n'%css) if items else '' css = ('@page {\n%s\n}\n'%css) if items else ''
rules = [css_text(r) for r in stylizer.font_face_rules + self.embed_font_rules] rules = [base.css_text(r) for r in stylizer.font_face_rules + self.embed_font_rules]
raw = '\n\n'.join(rules) raw = '\n\n'.join(rules)
css += '\n\n' + raw css += '\n\n' + raw
global_css[css].append(item) global_css[css].append(item)
@@ -642,7 +644,7 @@ class CSSFlattener(object):
if self.transform_css_rules: if self.transform_css_rules:
from ebook_converter.ebooks.css_transform_rules import transform_sheet from ebook_converter.ebooks.css_transform_rules import transform_sheet
transform_sheet(self.transform_css_rules, sheet) transform_sheet(self.transform_css_rules, sheet)
manifest.add(id_, href, CSS_MIME, data=sheet) manifest.add(id_, href, base.CSS_MIME, data=sheet)
gc_map[css] = href gc_map[css] = href
ans = {} ans = {}
@@ -652,8 +654,8 @@ class CSSFlattener(object):
return ans return ans
def flatten_spine(self): def flatten_spine(self):
names = defaultdict(int) names = collections.defaultdict(int)
styles, pseudo_styles = {}, defaultdict(dict) styles, pseudo_styles = {}, collections.defaultdict(dict)
for item in self.items: for item in self.items:
html = item.data html = item.data
stylizer = self.stylizers[item] stylizer = self.stylizers[item]
@@ -661,7 +663,7 @@ class CSSFlattener(object):
self.specializer(item, stylizer) self.specializer(item, stylizer)
fsize = self.context.dest.fbase fsize = self.context.dest.fbase
self.flatten_node(html, stylizer, names, styles, pseudo_styles, fsize, item.id, recurse=False) self.flatten_node(html, stylizer, names, styles, pseudo_styles, fsize, item.id, recurse=False)
self.flatten_node(html.find(XHTML('body')), stylizer, names, styles, pseudo_styles, fsize, item.id) self.flatten_node(html.find(base.tag('xhtml', 'body')), stylizer, names, styles, pseudo_styles, fsize, item.id)
items = sorted(((key, val) for (val, key) in styles.items())) items = sorted(((key, val) for (val, key) in styles.items()))
# :hover must come after link and :active must come after :hover # :hover must come after link and :active must come after :hover
psels = sorted(pseudo_styles, key=lambda x : psels = sorted(pseudo_styles, key=lambda x :
@@ -1,46 +1,20 @@
""" """
HTML-TOC-adding transform. HTML-TOC-adding transform.
""" """
from ebook_converter.ebooks.oeb.base import XML, XHTML, XHTML_NS from ebook_converter import constants as const
from ebook_converter.ebooks.oeb.base import XHTML_MIME, CSS_MIME from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.base import element, XPath
__all__ = ['HTMLTOCAdder']
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
DEFAULT_TITLE = 'Table of Contents' DEFAULT_TITLE = 'Table of Contents'
STYLE_CSS = {'nested': '.calibre_toc_header {\n text-align: center;\n}\n'
'.calibre_toc_block {\n margin-left: 1.2em;\n text-indent: '
'-1.2em;\n}\n.calibre_toc_block .calibre_toc_block {\n '
'margin-left: 2.4em;\n}\n.calibre_toc_block .calibre_toc_block '
'.calibre_toc_block {\n margin-left: 3.6em;\n}\n',
STYLE_CSS = { 'centered': '.calibre_toc_header {\n text-align: center;\n}\n'
'nested': """ '.calibre_toc_block {\n text-align: center;\n}\nbody > '
.calibre_toc_header { '.calibre_toc_block {\n margin-top: 1.2em;\n}\n'}
text-align: center;
}
.calibre_toc_block {
margin-left: 1.2em;
text-indent: -1.2em;
}
.calibre_toc_block .calibre_toc_block {
margin-left: 2.4em;
}
.calibre_toc_block .calibre_toc_block .calibre_toc_block {
margin-left: 3.6em;
}
""",
'centered': """
.calibre_toc_header {
text-align: center;
}
.calibre_toc_block {
text-align: center;
}
body > .calibre_toc_block {
margin-top: 1.2em;
}
"""
}
class HTMLTOCAdder(object): class HTMLTOCAdder(object):
@@ -71,7 +45,7 @@ class HTMLTOCAdder(object):
if href in oeb.manifest.hrefs: if href in oeb.manifest.hrefs:
item = oeb.manifest.hrefs[href] item = oeb.manifest.hrefs[href]
if (hasattr(item.data, 'xpath') and if (hasattr(item.data, 'xpath') and
XPath('//h:a[@href]')(item.data)): base.XPath('//h:a[@href]')(item.data)):
if oeb.spine.index(item) < 0: if oeb.spine.index(item) < 0:
if self.position == 'end': if self.position == 'end':
oeb.spine.add(item, linear=False) oeb.spine.add(item, linear=False)
@@ -91,23 +65,24 @@ class HTMLTOCAdder(object):
oeb.logger.error('Unknown TOC style %r' % style) oeb.logger.error('Unknown TOC style %r' % style)
style = 'nested' style = 'nested'
id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css') id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css')
oeb.manifest.add(id, css_href, CSS_MIME, data=STYLE_CSS[style]) oeb.manifest.add(id, css_href, base.CSS_MIME, data=STYLE_CSS[style])
language = str(oeb.metadata.language[0]) language = str(oeb.metadata.language[0])
contents = element(None, XHTML('html'), nsmap={None: XHTML_NS}, contents = base.element(None, base.tag('xhtml', 'html'),
attrib={XML('lang'): language}) nsmap={None: const.XHTML_NS},
head = element(contents, XHTML('head')) attrib={base.tag('xml', 'lang'): language})
htitle = element(head, XHTML('title')) head = base.element(contents, base.tag('xhtml', 'head'))
htitle = base.element(head, base.tag('xhtml', 'title'))
htitle.text = title htitle.text = title
element(head, XHTML('link'), rel='stylesheet', type=CSS_MIME, base.element(head, base.tag('xhtml', 'link'), rel='stylesheet',
href=css_href) type=base.CSS_MIME, href=css_href)
body = element(contents, XHTML('body'), body = base.element(contents, base.tag('xhtml', 'body'),
attrib={'class': 'calibre_toc'}) attrib={'class': 'calibre_toc'})
h1 = element(body, XHTML('h2'), h1 = base.element(body, base.tag('xhtml', 'h2'),
attrib={'class': 'calibre_toc_header'}) attrib={'class': 'calibre_toc_header'})
h1.text = title h1.text = title
self.add_toc_level(body, oeb.toc) self.add_toc_level(body, oeb.toc)
id, href = oeb.manifest.generate('contents', 'contents.xhtml') id, href = oeb.manifest.generate('contents', 'contents.xhtml')
item = oeb.manifest.add(id, href, XHTML_MIME, data=contents) item = oeb.manifest.add(id, href, base.XHTML_MIME, data=contents)
if self.position == 'end': if self.position == 'end':
oeb.spine.add(item, linear=False) oeb.spine.add(item, linear=False)
else: else:
@@ -116,10 +91,10 @@ class HTMLTOCAdder(object):
def add_toc_level(self, elem, toc): def add_toc_level(self, elem, toc):
for node in toc: for node in toc:
block = element(elem, XHTML('div'), block = base.element(elem, base.tag('xhtml', 'div'),
attrib={'class': 'calibre_toc_block'}) attrib={'class': 'calibre_toc_block'})
line = element(block, XHTML('a'), line = base.element(block, base.tag('xhtml', 'a'),
attrib={'href': node.href, attrib={'href': node.href,
'class': 'calibre_toc_line'}) 'class': 'calibre_toc_line'})
line.text = node.title line.text = node.title
self.add_toc_level(block, node) self.add_toc_level(block, node)
@@ -4,9 +4,10 @@ from string import Formatter
import pkg_resources import pkg_resources
import urllib.parse import urllib.parse
from ebook_converter import constants as const
from ebook_converter import guess_type, strftime from ebook_converter import guess_type, strftime
from ebook_converter.constants_old import iswindows from ebook_converter.constants_old import iswindows
from ebook_converter.ebooks.oeb.base import XPath, XHTML_NS, XHTML, xml2text, urlnormalize from ebook_converter.ebooks.oeb.base import XPath, xml2text, urlnormalize
from ebook_converter.library.comments import comments_to_html, markdown from ebook_converter.library.comments import comments_to_html, markdown
from ebook_converter.utils.date import is_date_undefined, as_local_time from ebook_converter.utils.date import is_date_undefined, as_local_time
from ebook_converter.ebooks.chardet import strip_encoding_declarations from ebook_converter.ebooks.chardet import strip_encoding_declarations
@@ -303,7 +304,7 @@ def render_jacket(mi, output_profile,
'tags_label': 'Tags', 'tags_label': 'Tags',
'title': title, 'title': title,
'title_str': title_str, 'title_str': title_str,
'xmlns': XHTML_NS} 'xmlns': const.XHTML_NS}
for key in mi.custom_field_keys(): for key in mi.custom_field_keys():
m = mi.get_user_metadata(key, False) or {} m = mi.get_user_metadata(key, False) or {}
@@ -370,7 +371,7 @@ def render_jacket(mi, output_profile,
# We cannot use data-calibre-rescale 100 on the body tag as that will just # We cannot use data-calibre-rescale 100 on the body tag as that will just
# give the body tag a font size of 1em, which is useless. # give the body tag a font size of 1em, which is useless.
for body in root.xpath('//*[local-name()="body"]'): for body in root.xpath('//*[local-name()="body"]'):
fw = body.makeelement(XHTML('div')) fw = body.makeelement(const.XHTML_DIV)
fw.set('data-calibre-rescale', '100') fw.set('data-calibre-rescale', '100')
for child in body: for child in body:
fw.append(child) fw.append(child)
@@ -387,9 +388,9 @@ def linearize_jacket(oeb):
for x in oeb.spine[:4]: for x in oeb.spine[:4]:
if XPath(JACKET_XPATH)(x.data): if XPath(JACKET_XPATH)(x.data):
for e in XPath('//h:table|//h:tr|//h:th')(x.data): for e in XPath('//h:table|//h:tr|//h:th')(x.data):
e.tag = XHTML('div') e.tag = const.XHTML_DIV
for e in XPath('//h:td')(x.data): for e in XPath('//h:td')(x.data):
e.tag = XHTML('span') e.tag = const.XHTML_SPAN
break break
@@ -5,9 +5,9 @@ import string
from lxml import etree from lxml import etree
from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS from ebook_converter import constants as const
from ebook_converter.ebooks.oeb.base import CSS_MIME from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.base import namespace from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer from ebook_converter.ebooks.oeb.stylizer import Stylizer
@@ -43,15 +43,16 @@ class CaseMangler(object):
def mangle_spine(self): def mangle_spine(self):
id, href = self.oeb.manifest.generate('manglecase', 'manglecase.css') id, href = self.oeb.manifest.generate('manglecase', 'manglecase.css')
self.oeb.manifest.add(id, href, CSS_MIME, data=CASE_MANGLER_CSS) self.oeb.manifest.add(id, href, base.CSS_MIME, data=CASE_MANGLER_CSS)
for item in self.oeb.spine: for item in self.oeb.spine:
html = item.data html = item.data
relhref = item.relhref(href) relhref = item.relhref(href)
etree.SubElement(html.find(XHTML('head')), XHTML('link'), etree.SubElement(html.find(base.tag('xhtml', 'head')),
rel='stylesheet', href=relhref, type=CSS_MIME) base.tag('xhtml', 'link'), rel='stylesheet',
href=relhref, type=base.CSS_MIME)
stylizer = Stylizer(html, item.href, self.oeb, self.opts, stylizer = Stylizer(html, item.href, self.oeb, self.opts,
self.profile) self.profile)
self.mangle_elem(html.find(XHTML('body')), stylizer) self.mangle_elem(html.find(base.tag('xhtml', 'body')), stylizer)
def text_transform(self, transform, text): def text_transform(self, transform, text):
if transform == 'capitalize': if transform == 'capitalize':
@@ -85,7 +86,8 @@ class CaseMangler(object):
else: else:
last.tail = text last.tail = text
else: else:
child = elem.makeelement(XHTML('span'), attrib=attrib) child = elem.makeelement(base.tag('xhtml', 'span'),
attrib=attrib)
child.text = text.upper() child.text = text.upper()
if last is None: if last is None:
elem.insert(0, child) elem.insert(0, child)
@@ -99,7 +101,7 @@ class CaseMangler(object):
def mangle_elem(self, elem, stylizer): def mangle_elem(self, elem, stylizer):
if not isinstance(elem.tag, (str, bytes)) or \ if not isinstance(elem.tag, (str, bytes)) or \
namespace(elem.tag) != XHTML_NS: parse_utils.namespace(elem.tag) != const.XHTML_NS:
return return
children = list(elem) children = list(elem)
style = stylizer.style(elem) style = stylizer.style(elem)
@@ -1,15 +1,12 @@
import os, re import os
import re
from ebook_converter.ebooks.oeb import base
from ebook_converter.utils.date import isoformat, now from ebook_converter.utils.date import isoformat, now
from ebook_converter import guess_type from ebook_converter import guess_type
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False): def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
from ebook_converter.ebooks.oeb.base import OPF
if not mi.is_null('title'): if not mi.is_null('title'):
m.clear('title') m.clear('title')
m.add('title', mi.title) m.add('title', mi.title)
@@ -19,17 +16,17 @@ def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
m.clear('title_sort') m.clear('title_sort')
m.add('title_sort', mi.title_sort) m.add('title_sort', mi.title_sort)
if not mi.is_null('authors'): if not mi.is_null('authors'):
m.filter('creator', lambda x : x.role.lower() in ['aut', '']) m.filter('creator', lambda x: x.role.lower() in ['aut', ''])
for a in mi.authors: for a in mi.authors:
attrib = {'role':'aut'} attrib = {'role': 'aut'}
if mi.author_sort: if mi.author_sort:
attrib[OPF('file-as')] = mi.author_sort attrib[base.tag('opf', 'file-as')] = mi.author_sort
m.add('creator', a, attrib=attrib) m.add('creator', a, attrib=attrib)
if not mi.is_null('book_producer'): if not mi.is_null('book_producer'):
m.filter('contributor', lambda x : x.role.lower() == 'bkp') m.filter('contributor', lambda x: x.role.lower() == 'bkp')
m.add('contributor', mi.book_producer, role='bkp') m.add('contributor', mi.book_producer, role='bkp')
elif override_input_metadata: elif override_input_metadata:
m.filter('contributor', lambda x : x.role.lower() == 'bkp') m.filter('contributor', lambda x: x.role.lower() == 'bkp')
if not mi.is_null('comments'): if not mi.is_null('comments'):
m.clear('description') m.clear('description')
m.add('description', mi.comments) m.add('description', mi.comments)
@@ -71,7 +68,7 @@ def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
m.clear('series_index') m.clear('series_index')
if not mi.is_null('rating'): if not mi.is_null('rating'):
m.clear('rating') m.clear('rating')
m.add('rating', '%.2f'%mi.rating) m.add('rating', '%.2f' % mi.rating)
elif override_input_metadata: elif override_input_metadata:
m.clear('rating') m.clear('rating')
if not mi.is_null('tags'): if not mi.is_null('tags'):
@@ -101,23 +98,25 @@ class MergeMetadata(object):
'Merge in user metadata, including cover' 'Merge in user metadata, including cover'
def __call__(self, oeb, mi, opts, override_input_metadata=False): def __call__(self, oeb, mi, opts, override_input_metadata=False):
_oim = override_input_metadata
self.oeb, self.log = oeb, oeb.log self.oeb, self.log = oeb, oeb.log
m = self.oeb.metadata m = self.oeb.metadata
self.log('Merging user specified metadata...') self.log('Merging user specified metadata...')
meta_info_to_oeb_metadata(mi, m, oeb.log, meta_info_to_oeb_metadata(mi, m, oeb.log,
override_input_metadata=override_input_metadata) override_input_metadata=_oim)
cover_id = self.set_cover(mi, opts.prefer_metadata_cover) cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
m.clear('cover') m.clear('cover')
if cover_id is not None: if cover_id is not None:
m.add('cover', cover_id) m.add('cover', cover_id)
if mi.uuid is not None: if mi.uuid is not None:
m.filter('identifier', lambda x:x.id=='uuid_id') m.filter('identifier', lambda x: x.id == 'uuid_id')
self.oeb.metadata.add('identifier', mi.uuid, id='uuid_id', self.oeb.metadata.add('identifier', mi.uuid, id='uuid_id',
scheme='uuid') scheme='uuid')
self.oeb.uid = self.oeb.metadata.identifier[-1] self.oeb.uid = self.oeb.metadata.identifier[-1]
if mi.application_id is not None: if mi.application_id is not None:
m.filter('identifier', lambda x:x.scheme=='calibre') m.filter('identifier', lambda x: x.scheme == 'calibre')
self.oeb.metadata.add('identifier', mi.application_id, scheme='calibre') self.oeb.metadata.add('identifier', mi.application_id,
scheme='calibre')
def set_cover(self, mi, prefer_metadata_cover): def set_cover(self, mi, prefer_metadata_cover):
cdata, ext = b'', 'jpg' cdata, ext = b'', 'jpg'
@@ -138,7 +137,8 @@ class MergeMetadata(object):
if cdata: if cdata:
self.oeb.guide.remove('cover') self.oeb.guide.remove('cover')
self.oeb.guide.remove('titlepage') self.oeb.guide.remove('titlepage')
elif self.oeb.plumber_output_format in {'mobi', 'azw3'} and old_cover is not None: elif (self.oeb.plumber_output_format in {'mobi', 'azw3'} and
old_cover is not None):
# The amazon formats don't support html cover pages, so remove them # The amazon formats don't support html cover pages, so remove them
# even if no cover was specified. # even if no cover was specified.
self.oeb.guide.remove('titlepage') self.oeb.guide.remove('titlepage')
@@ -156,7 +156,9 @@ class MergeMetadata(object):
new_cover_item = None new_cover_item = None
if cdata: if cdata:
id, href = self.oeb.manifest.generate('cover', 'cover.'+ext) id, href = self.oeb.manifest.generate('cover', 'cover.'+ext)
new_cover_item = self.oeb.manifest.add(id, href, guess_type('cover.'+ext)[0], data=cdata) new_cover_item = self.oeb.manifest.add(id, href,
guess_type('cover.'+ext)[0],
data=cdata)
self.oeb.guide.add('cover', 'Cover', href) self.oeb.guide.add('cover', 'Cover', href)
if do_remove_old_cover: if do_remove_old_cover:
self.remove_old_cover(item, new_cover_item.href) self.remove_old_cover(item, new_cover_item.href)
@@ -186,7 +188,8 @@ class MergeMetadata(object):
if href == cover_item.href: if href == cover_item.href:
if new_cover_href is not None: if new_cover_href is not None:
replacement_href = item.relhref(new_cover_href) replacement_href = item.relhref(new_cover_href)
attr = 'src' if img.tag.endswith('img') else XLINK('href') attr = ('src' if img.tag.endswith('img')
else XLINK('href'))
img.set(attr, replacement_href) img.set(attr, replacement_href)
else: else:
p = img.getparent() p = img.getparent()
@@ -202,13 +205,14 @@ class MergeMetadata(object):
for item in affected_items: for item in affected_items:
body = XPath('//h:body')(item.data) body = XPath('//h:body')(item.data)
if body: if body:
text = etree.tostring(body[0], method='text', encoding='unicode') text = etree.tostring(body[0], method='text',
encoding='unicode')
else: else:
text = '' text = ''
text = re.sub(r'\s+', '', text) text = re.sub(r'\s+', '', text)
if not text and not XPath('//h:img|//svg:svg')(item.data): if not text and not XPath('//h:img|//svg:svg')(item.data):
self.log('Removing %s as it is a wrapper around' self.log('Removing %s as it is a wrapper around the cover '
' the cover image'%item.href) 'image' % item.href)
self.oeb.spine.remove(item) self.oeb.spine.remove(item)
self.oeb.manifest.remove(item) self.oeb.manifest.remove(item)
self.oeb.guide.remove_by_href(item.href) self.oeb.guide.remove_by_href(item.href)
@@ -1,7 +1,8 @@
import numbers import numbers
from collections import Counter from collections import Counter
from ebook_converter.ebooks.oeb.base import barename, XPath from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import XPath
__license__ = 'GPL v3' __license__ = 'GPL v3'
@@ -142,7 +143,7 @@ class RemoveFakeMargins(object):
for p in paras(body): for p in paras(body):
level = level_of(p, body) level = level_of(p, body)
level = '%s_%d'%(barename(p.tag), level) level = '%s_%d' % (parse_utils.barename(p.tag), level)
if level not in self.levels: if level not in self.levels:
self.levels[level] = [] self.levels[level] = []
self.levels[level].append(p) self.levels[level].append(p)
@@ -5,10 +5,8 @@ import os
import re import re
import urllib.parse import urllib.parse
# from PyQt5.Qt import ( from ebook_converter import constants as const
# Qt, QByteArray, QBuffer, QIODevice, QColor, QImage, QPainter, QSvgRenderer) from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.base import XHTML, XLINK
from ebook_converter.ebooks.oeb.base import SVG_MIME, PNG_MIME from ebook_converter.ebooks.oeb.base import SVG_MIME, PNG_MIME
from ebook_converter.ebooks.oeb.base import xml2str, xpath from ebook_converter.ebooks.oeb.base import xml2str, xpath
from ebook_converter.ebooks.oeb.base import urlnormalize from ebook_converter.ebooks.oeb.base import urlnormalize
@@ -17,10 +15,7 @@ from ebook_converter.ptempfile import PersistentTemporaryFile
from ebook_converter.utils.imghdr import what from ebook_converter.utils.imghdr import what
__license__ = 'GPL v3' IMAGE_TAGS = {base.tag('xhtml', 'img'), base.tag('xhtml', 'object')}
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
IMAGE_TAGS = {XHTML('img'), XHTML('object')}
KEEP_ATTRS = {'class', 'style', 'width', 'height', 'align'} KEEP_ATTRS = {'class', 'style', 'width', 'height', 'align'}
@@ -113,7 +108,7 @@ class SVGRasterizer(object):
svg = item.data svg = item.data
hrefs = self.oeb.manifest.hrefs hrefs = self.oeb.manifest.hrefs
for elem in xpath(svg, '//svg:*[@xl:href]'): for elem in xpath(svg, '//svg:*[@xl:href]'):
href = urlnormalize(elem.attrib[XLINK('href')]) href = urlnormalize(elem.attrib[base.tag('xlink', 'href')])
path = urllib.parse.urldefrag(href)[0] path = urllib.parse.urldefrag(href)[0]
if not path: if not path:
continue continue
@@ -126,7 +121,7 @@ class SVGRasterizer(object):
with PersistentTemporaryFile(suffix='.'+ext) as pt: with PersistentTemporaryFile(suffix='.'+ext) as pt:
pt.write(data) pt.write(data)
self.temp_files.append(pt.name) self.temp_files.append(pt.name)
elem.attrib[XLINK('href')] = pt.name elem.attrib[base.tag('xlink', 'href')] = pt.name
return svg return svg
def stylizer(self, item): def stylizer(self, item):
@@ -171,7 +166,7 @@ class SVGRasterizer(object):
href = os.path.splitext(item.href)[0] + '.png' href = os.path.splitext(item.href)[0] + '.png'
id, href = manifest.generate(item.id, href) id, href = manifest.generate(item.id, href)
manifest.add(id, href, PNG_MIME, data=data) manifest.add(id, href, PNG_MIME, data=data)
img = elem.makeelement(XHTML('img'), src=item.relhref(href)) img = elem.makeelement(base.tag('xhtml', 'img'), src=item.relhref(href))
elem.getparent().replace(elem, img) elem.getparent().replace(elem, img)
for prop in ('width', 'height'): for prop in ('width', 'height'):
if prop in elem.attrib: if prop in elem.attrib:
@@ -208,7 +203,7 @@ class SVGRasterizer(object):
id, href = manifest.generate(svgitem.id, href) id, href = manifest.generate(svgitem.id, href)
manifest.add(id, href, PNG_MIME, data=data) manifest.add(id, href, PNG_MIME, data=data)
self.images[key] = href self.images[key] = href
elem.tag = XHTML('img') elem.tag = base.tag('xhtml', 'img')
for attr in elem.attrib: for attr in elem.attrib:
if attr not in KEEP_ATTRS: if attr not in KEEP_ATTRS:
del elem.attrib[attr] del elem.attrib[attr]
+10 -9
View File
@@ -10,10 +10,11 @@ import urllib.parse
from lxml.etree import XPath as _XPath from lxml.etree import XPath as _XPath
from lxml import etree from lxml import etree
from ebook_converter import constants as const
from ebook_converter import as_unicode, force_unicode from ebook_converter import as_unicode, force_unicode
from ebook_converter.ebooks.epub import rules from ebook_converter.ebooks.epub import rules
from ebook_converter.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES, from ebook_converter.ebooks.oeb.base import \
rewrite_links, XHTML, urlnormalize) OEB_STYLES, rewrite_links, urlnormalize
from ebook_converter.ebooks.oeb.polish.split import do_split from ebook_converter.ebooks.oeb.polish.split import do_split
from ebook_converter.polyglot.urllib import unquote from ebook_converter.polyglot.urllib import unquote
from ebook_converter.css_selectors import Select, SelectorError from ebook_converter.css_selectors import Select, SelectorError
@@ -22,7 +23,7 @@ from ebook_converter.css_selectors import Select, SelectorError
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
XPath = functools.partial(_XPath, namespaces=NAMESPACES) XPath = functools.partial(_XPath, namespaces=const.XPNSMAP)
SPLIT_POINT_ATTR = 'csp' SPLIT_POINT_ATTR = 'csp'
@@ -104,7 +105,7 @@ class Split(object):
select = Select(item.data) select = Select(item.data)
if not self.page_break_selectors: if not self.page_break_selectors:
return [], [] return [], []
body = item.data.xpath('//h:body', namespaces=NAMESPACES) body = item.data.xpath('//h:body', namespaces=const.XPNSMAP)
if not body: if not body:
return [], [] return [], []
descendants = frozenset(body[0].iterdescendants('*')) descendants = frozenset(body[0].iterdescendants('*'))
@@ -268,13 +269,13 @@ class FlowSplitter(object):
if body is not None: if body is not None:
existing_ids = frozenset(body.xpath('//*/@id')) existing_ids = frozenset(body.xpath('//*/@id'))
for x in ids - existing_ids: for x in ids - existing_ids:
body.insert(0, body.makeelement(XHTML('div'), id=x, style='height:0pt')) body.insert(0, body.makeelement(const.XHTML_div, id=x, style='height:0pt'))
ids = set() ids = set()
trees.append(tree) trees.append(tree)
self.trees = trees self.trees = trees
def get_body(self, root): def get_body(self, root):
body = root.xpath('//h:body', namespaces=NAMESPACES) body = root.xpath('//h:body', namespaces=const.XPNSMAP)
if not body: if not body:
return None return None
return body[0] return body[0]
@@ -296,7 +297,7 @@ class FlowSplitter(object):
etree.tostring(body, method='text', encoding='unicode')) etree.tostring(body, method='text', encoding='unicode'))
if len(txt) > 1: if len(txt) > 1:
return False return False
for img in root.xpath('//h:img', namespaces=NAMESPACES): for img in root.xpath('//h:img', namespaces=const.XPNSMAP):
if img.get('style', '') != 'display:none': if img.get('style', '') != 'display:none':
return False return False
if root.xpath('//*[local-name() = "svg"]'): if root.xpath('//*[local-name() = "svg"]'):
@@ -401,7 +402,7 @@ class FlowSplitter(object):
'//h:br', '//h:br',
'//h:li', '//h:li',
): ):
elems = root.xpath(path, namespaces=NAMESPACES) elems = root.xpath(path, namespaces=const.XPNSMAP)
elem = pick_elem(elems) elem = pick_elem(elems)
if elem is not None: if elem is not None:
try: try:
@@ -436,7 +437,7 @@ class FlowSplitter(object):
spine_pos = self.item.spine_position spine_pos = self.item.spine_position
for current, tree in zip(*map(reversed, (self.files, self.trees))): for current, tree in zip(*map(reversed, (self.files, self.trees))):
for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES): for a in tree.getroot().xpath('//h:a[@href]', namespaces=const.XPNSMAP):
href = a.get('href').strip() href = a.get('href').strip()
if href.startswith('#'): if href.startswith('#'):
anchor = href[1:] anchor = href[1:]
@@ -1,22 +1,19 @@
import collections
import re import re
import uuid
import urllib.parse import urllib.parse
import uuid
from lxml import etree from lxml import etree
from collections import OrderedDict, Counter
from ebook_converter.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text, barename from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import TOC, xml2text
from ebook_converter.ebooks import ConversionError from ebook_converter.ebooks import ConversionError
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
def XPath(x): def XPath(x):
try: try:
return etree.XPath(x, namespaces=XPNSMAP) return etree.XPath(x, namespaces=const.XPNSMAP)
except etree.XPathSyntaxError: except etree.XPathSyntaxError:
raise ConversionError( raise ConversionError(
'The syntax of the XPath expression %s is invalid.' % repr(x)) 'The syntax of the XPath expression %s is invalid.' % repr(x))
@@ -84,7 +81,7 @@ class DetectStructure(object):
try: try:
prev = next(elem.itersiblings(tag=etree.Element, prev = next(elem.itersiblings(tag=etree.Element,
preceding=True)) preceding=True))
if (barename(elem.tag) in {'h1', 'h2'} and barename( if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and parse_utils.barename(
prev.tag) in {'h1', 'h2'} and (not prev.tail or prev.tag) in {'h1', 'h2'} and (not prev.tail or
not prev.tail.split())): not prev.tail.split())):
# We have two adjacent headings, do not put a page # We have two adjacent headings, do not put a page
@@ -165,7 +162,7 @@ class DetectStructure(object):
chapter_mark = self.opts.chapter_mark chapter_mark = self.opts.chapter_mark
page_break_before = 'display: block; page-break-before: always' page_break_before = 'display: block; page-break-before: always'
page_break_after = 'display: block; page-break-after: always' page_break_after = 'display: block; page-break-after: always'
c = Counter() c = collections.Counter()
for item, elem in self.detected_chapters: for item, elem in self.detected_chapters:
c[item] += 1 c[item] += 1
text = xml2text(elem).strip() text = xml2text(elem).strip()
@@ -174,7 +171,7 @@ class DetectStructure(object):
if chapter_mark == 'none': if chapter_mark == 'none':
continue continue
if chapter_mark == 'rule': if chapter_mark == 'rule':
mark = elem.makeelement(XHTML('hr')) mark = elem.makeelement(const.XHTML_HR)
elif chapter_mark == 'pagebreak': elif chapter_mark == 'pagebreak':
if c[item] < 3 and at_start(elem): if c[item] < 3 and at_start(elem):
# For the first two elements in this item, check if they # For the first two elements in this item, check if they
@@ -184,9 +181,9 @@ class DetectStructure(object):
# feedbooks epubs match both a heading tag and its # feedbooks epubs match both a heading tag and its
# containing div with the default chapter expression. # containing div with the default chapter expression.
continue continue
mark = elem.makeelement(XHTML('div'), style=page_break_after) mark = elem.makeelement(const.XHTML_DIV, style=page_break_after)
else: # chapter_mark == 'both': else: # chapter_mark == 'both':
mark = elem.makeelement(XHTML('hr'), style=page_break_before) mark = elem.makeelement(const.XHTML_HR, style=page_break_before)
try: try:
elem.addprevious(mark) elem.addprevious(mark)
except TypeError: except TypeError:
@@ -254,8 +251,8 @@ class DetectStructure(object):
return text, href return text, href
def add_leveled_toc_items(self): def add_leveled_toc_items(self):
added = OrderedDict() added = collections.OrderedDict()
added2 = OrderedDict() added2 = collections.OrderedDict()
counter = 1 counter = 1
def find_matches(expr, doc): def find_matches(expr, doc):
+38 -25
View File
@@ -5,10 +5,10 @@ import shutil
import subprocess import subprocess
import sys import sys
from lxml import etree
from ebook_converter import CurrentDir, xml_replace_entities, prints from ebook_converter import CurrentDir, xml_replace_entities, prints
from ebook_converter.constants_old import ( from ebook_converter.constants_old import isbsd, islinux, isosx, iswindows
filesystem_encoding, isbsd, islinux, isosx, iswindows
)
from ebook_converter.ebooks import ConversionError, DRMError from ebook_converter.ebooks import ConversionError, DRMError
from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ptempfile import PersistentTemporaryFile from ebook_converter.ptempfile import PersistentTemporaryFile
@@ -26,10 +26,13 @@ def popen(cmd, **kw):
if isosx and hasattr(sys, 'frameworks_dir'): if isosx and hasattr(sys, 'frameworks_dir'):
base = os.path.join(os.path.dirname(sys.frameworks_dir), 'utils.app', 'Contents', 'MacOS') base = os.path.join(os.path.dirname(sys.frameworks_dir), 'utils.app',
'Contents', 'MacOS')
PDFTOHTML = os.path.join(base, PDFTOHTML) PDFTOHTML = os.path.join(base, PDFTOHTML)
if iswindows and hasattr(sys, 'frozen'): if iswindows and hasattr(sys, 'frozen'):
base = sys.extensions_location if hasattr(sys, 'new_app_layout') else os.path.dirname(sys.executable) base = os.path.dirname(sys.executable)
if hasattr(sys, 'new_app_layout'):
base = sys.extensions_location
PDFTOHTML = os.path.join(base, 'pdftohtml.exe') PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
if (islinux or isbsd) and getattr(sys, 'frozen', False): if (islinux or isbsd) and getattr(sys, 'frozen', False):
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml') PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
@@ -55,7 +58,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
exe = PDFTOHTML exe = PDFTOHTML
cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
'-nodrm', a(pdfsrc), a(index)] '-nodrm', a(pdfsrc), a(index)]
if isbsd: if isbsd:
cmd.remove('-nodrm') cmd.remove('-nodrm')
@@ -67,7 +70,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
logf = PersistentTemporaryFile('pdftohtml_log') logf = PersistentTemporaryFile('pdftohtml_log')
try: try:
p = popen(cmd, stderr=logf._fd, stdout=logf._fd, p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
stdin=subprocess.PIPE) stdin=subprocess.PIPE)
except OSError as err: except OSError as err:
if err.errno == errno.ENOENT: if err.errno == errno.ENOENT:
raise ConversionError('Could not find pdftohtml, check it is ' raise ConversionError('Could not find pdftohtml, check it is '
@@ -79,7 +82,8 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
logf.close() logf.close()
out = open(logf.name, 'rb').read().decode('utf-8', 'replace').strip() out = open(logf.name, 'rb').read().decode('utf-8', 'replace').strip()
if ret != 0: if ret != 0:
raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out)) raise ConversionError('pdftohtml failed with return code: '
'%d\n%s' % (ret, out))
if out: if out:
prints("pdftohtml log:") prints("pdftohtml log:")
prints(out) prints(out)
@@ -90,22 +94,27 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
with open(index, 'r+b') as i: with open(index, 'r+b') as i:
raw = i.read().decode('utf-8', 'replace') raw = i.read().decode('utf-8', 'replace')
raw = flip_images(raw) raw = flip_images(raw)
raw = raw.replace('<head', '<!-- created by ebook-converter\'s pdftohtml -->\n <head', 1) raw = raw.replace('<head', '<!-- created by ebook-converter\'s'
' pdftohtml -->\n <head', 1)
i.seek(0) i.seek(0)
i.truncate() i.truncate()
# versions of pdftohtml >= 0.20 output self closing <br> tags, this # versions of pdftohtml >= 0.20 output self closing <br> tags,
# breaks the pdf heuristics regexps, so replace them # this breaks the pdf heuristics regexps, so replace them
raw = raw.replace('<br/>', '<br>') raw = raw.replace('<br/>', '<br>')
raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I) raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw,
raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I) flags=re.I)
raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I) raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw,
flags=re.I)
raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"',
raw, flags=re.I)
raw = xml_replace_entities(raw) raw = xml_replace_entities(raw)
raw = raw.replace('\u00a0', ' ') raw = raw.replace('\u00a0', ' ')
i.write(raw.encode('utf-8')) i.write(raw.encode('utf-8'))
cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8',
'-nodrm', '-q', '-stdout', a(pdfsrc)] '-noframes', '-p', '-nomerge', '-nodrm', '-q', '-stdout',
a(pdfsrc)]
if isbsd: if isbsd:
cmd.remove('-nodrm') cmd.remove('-nodrm')
p = popen(cmd, stdout=subprocess.PIPE) p = popen(cmd, stdout=subprocess.PIPE)
@@ -115,15 +124,14 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
try: try:
os.remove(pdfsrc) os.remove(pdfsrc)
except: except Exception:
pass pass
def parse_outline(raw, output_dir): def parse_outline(raw, output_dir):
from lxml import etree raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True,
from ebook_converter.utils.xml_parse import safe_xml_fromstring assume_utf8=True)[0])
raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]) outline = etree.fromstring(raw).xpath('(//outline)[1]')
outline = safe_xml_fromstring(raw).xpath('(//outline)[1]')
if outline: if outline:
from ebook_converter.ebooks.oeb.polish.toc import TOC, create_ncx from ebook_converter.ebooks.oeb.polish.toc import TOC, create_ncx
outline = outline[0] outline = outline[0]
@@ -142,13 +150,18 @@ def parse_outline(raw, output_dir):
count[0] += 1 count[0] += 1
process_node(outline, toc) process_node(outline, toc)
if count[0] > 2: if count[0] > 2:
root = create_ncx(toc, (lambda x:x), 'pdftohtml', 'en', 'pdftohtml') root = create_ncx(toc, (lambda x: x), 'pdftohtml', 'en',
'pdftohtml')
with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f: with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
f.write(etree.tostring(root, pretty_print=True, with_tail=False, encoding='utf-8', xml_declaration=True)) f.write(etree.tostring(root, pretty_print=True,
with_tail=False, encoding='utf-8',
xml_declaration=True))
def flip_image(img, flip): def flip_image(img, flip):
from ebook_converter.utils.img import flip_image, image_and_format_from_data, image_to_data from ebook_converter.utils.img import image_to_data
from ebook_converter.utils.img import image_and_format_from_data
from ebook_converter.utils.img import flip_image
with open(img, 'r+b') as f: with open(img, 'r+b') as f:
img, fmt = image_and_format_from_data(f.read()) img, fmt = image_and_format_from_data(f.read())
img = flip_image(img, horizontal='x' in flip, vertical='y' in flip) img = flip_image(img, horizontal='x' in flip, vertical='y' in flip)
@@ -170,5 +183,5 @@ def flip_images(raw):
if not os.path.exists(img): if not os.path.exists(img):
continue continue
flip_image(img, flip) flip_image(img, flip)
raw = re.sub(r'<STYLE.+?</STYLE>\s*', '', raw, flags=re.I|re.DOTALL) raw = re.sub(r'<STYLE.+?</STYLE>\s*', '', raw, flags=re.I | re.DOTALL)
return raw return raw
+4 -3
View File
@@ -5,8 +5,9 @@ import re
from functools import partial from functools import partial
from ebook_converter import constants as const
from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTML from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTML
from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from ebook_converter.ebooks.oeb.base import XHTML, barename, namespace, rewrite_links
from ebook_converter.ebooks.oeb.stylizer import Stylizer from ebook_converter.ebooks.oeb.stylizer import Stylizer
@@ -110,9 +111,9 @@ class MarkdownMLizer(OEB2HTML):
# We can only process tags. If there isn't a tag, return any text. # We can only process tags. If there isn't a tag, return any text.
if not isinstance(elem.tag, (str, bytes)) \ if not isinstance(elem.tag, (str, bytes)) \
or namespace(elem.tag) != XHTML_NS: or namespace(elem.tag) != const.XHTML_NS:
p = elem.getparent() p = elem.getparent()
if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == XHTML_NS \ if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == const.XHTML_NS \
and elem.tail: and elem.tail:
return [elem.tail] return [elem.tail]
return [''] return ['']
+5 -3
View File
@@ -5,8 +5,10 @@ import re
from functools import partial from functools import partial
from ebook_converter import constants as const
from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTML from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTML
from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links from ebook_converter.ebooks.oeb.base import XHTML, barename, namespace, \
rewrite_links
from ebook_converter.ebooks.oeb.stylizer import Stylizer from ebook_converter.ebooks.oeb.stylizer import Stylizer
from ebook_converter.ebooks import unit_convert from ebook_converter.ebooks import unit_convert
from ebook_converter.ebooks.textile.unsmarten import unsmarten from ebook_converter.ebooks.textile.unsmarten import unsmarten
@@ -225,9 +227,9 @@ class TextileMLizer(OEB2HTML):
# We can only process tags. If there isn't a tag, return any text. # We can only process tags. If there isn't a tag, return any text.
if not isinstance(elem.tag, (str, bytes)) \ if not isinstance(elem.tag, (str, bytes)) \
or namespace(elem.tag) != XHTML_NS: or namespace(elem.tag) != const.XHTML_NS:
p = elem.getparent() p = elem.getparent()
if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == XHTML_NS \ if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == const.XHTML_NS \
and elem.tail: and elem.tail:
return [elem.tail] return [elem.tail]
return [''] return ['']
+25 -19
View File
@@ -5,10 +5,11 @@ import re
from lxml import etree from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
BLOCK_TAGS = [ BLOCK_TAGS = [
'div', 'div',
@@ -60,9 +61,6 @@ class TXTMLizer(object):
return self.mlize_spine() return self.mlize_spine()
def mlize_spine(self): def mlize_spine(self):
from ebook_converter.ebooks.oeb.base import XHTML
from ebook_converter.ebooks.oeb.stylizer import Stylizer
from ebook_converter.utils.xml_parse import safe_xml_fromstring
output = [u''] output = [u'']
output.append(self.get_toc()) output.append(self.get_toc())
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
@@ -72,9 +70,11 @@ class TXTMLizer(object):
x.text = x.text.replace('--', '__') x.text = x.text.replace('--', '__')
content = etree.tostring(item.data, encoding='unicode') content = etree.tostring(item.data, encoding='unicode')
content = self.remove_newlines(content) content = self.remove_newlines(content)
content = safe_xml_fromstring(content) content = etree.fromstring(content)
stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(content, item.href, self.oeb_book, self.opts,
output += self.dump_text(content.find(XHTML('body')), stylizer, item) self.opts.output_profile)
output += self.dump_text(content.find(base.tag('xhtml', 'body')),
stylizer, item)
output += '\n\n\n\n\n\n' output += '\n\n\n\n\n\n'
output = ''.join(output) output = ''.join(output)
output = '\n'.join(l.rstrip() for l in output.splitlines()) output = '\n'.join(l.rstrip() for l in output.splitlines())
@@ -130,8 +130,12 @@ class TXTMLizer(object):
text = re.sub('\n[ ]+\n', '\n\n', text) text = re.sub('\n[ ]+\n', '\n\n', text)
if self.opts.remove_paragraph_spacing: if self.opts.remove_paragraph_spacing:
text = re.sub('\n{2,}', '\n', text) text = re.sub('\n{2,}', '\n', text)
text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text) text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' %
text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text) mo.group('t'), text)
text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)',
lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'),
mo.group('t')),
text)
else: else:
text = re.sub('\n{7,}', '\n\n\n\n\n\n', text) text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)
@@ -146,7 +150,8 @@ class TXTMLizer(object):
if self.opts.max_line_length: if self.opts.max_line_length:
max_length = self.opts.max_line_length max_length = self.opts.max_line_length
if self.opts.max_line_length < 25 and not self.opts.force_max_line_length: if (self.opts.max_line_length < 25 and not
self.opts.force_max_line_length):
max_length = 25 max_length = 25
short_lines = [] short_lines = []
lines = text.splitlines() lines = text.splitlines()
@@ -186,13 +191,13 @@ class TXTMLizer(object):
@stylizer: The style information attached to the element. @stylizer: The style information attached to the element.
@page: OEB page used to determine absolute urls. @page: OEB page used to determine absolute urls.
''' '''
from ebook_converter.ebooks.oeb.base import XHTML_NS, barename, namespace
if not isinstance(elem.tag, (str, bytes)) \ if not isinstance(elem.tag, (str, bytes)) \
or namespace(elem.tag) != XHTML_NS: or parse_utils.namespace(elem.tag) != const.XHTML_NS:
p = elem.getparent() p = elem.getparent()
if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == XHTML_NS \ if (p is not None and isinstance(p.tag, (str, bytes)) and
and elem.tail: parse_utils.namespace(p.tag) == const.XHTML_NS and
elem.tail):
return [elem.tail] return [elem.tail]
return [''] return ['']
@@ -205,14 +210,15 @@ class TXTMLizer(object):
return [elem.tail] return [elem.tail]
return [''] return ['']
tag = barename(elem.tag) tag = parse_utils.barename(elem.tag)
tag_id = elem.attrib.get('id', None) tag_id = elem.attrib.get('id', None)
in_block = False in_block = False
in_heading = False in_heading = False
# Are we in a heading? # Are we in a heading?
# This can either be a heading tag or a TOC item. # This can either be a heading tag or a TOC item.
if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids: if tag in HEADING_TAGS or '%s#%s' % (page.href,
tag_id) in self.toc_ids:
in_heading = True in_heading = True
if not self.last_was_heading: if not self.last_was_heading:
text.append('\n\n\n\n\n\n') text.append('\n\n\n\n\n\n')
@@ -234,7 +240,7 @@ class TXTMLizer(object):
ems = int(round((float(style.marginTop) / style.fontSize) - 1)) ems = int(round((float(style.marginTop) / style.fontSize) - 1))
if ems >= 1: if ems >= 1:
text.append('\n' * ems) text.append('\n' * ems)
except: except Exception:
pass pass
# Process tags that contain text. # Process tags that contain text.
+10 -10
View File
@@ -10,7 +10,7 @@ import builtins
import locale import locale
import sys import sys
from ebook_converter import constants from ebook_converter import constants_old
# For backwards compat with some third party plugins # For backwards compat with some third party plugins
builtins.__dict__['dynamic_property'] = lambda func: func(None) builtins.__dict__['dynamic_property'] = lambda func: func(None)
@@ -41,8 +41,8 @@ if not _run_once:
# #
# Platform specific modules # Platform specific modules
if constants.iswindows: if constants_old.iswindows:
winutil, winutilerror = constants.plugins['winutil'] winutil, winutilerror = constants_old.plugins['winutil']
if not winutil: if not winutil:
raise RuntimeError('Failed to load the winutil plugin: %s'%winutilerror) raise RuntimeError('Failed to load the winutil plugin: %s'%winutilerror)
if len(sys.argv) > 1 and not isinstance(sys.argv[1], str): if len(sys.argv) > 1 and not isinstance(sys.argv[1], str):
@@ -57,8 +57,8 @@ if not _run_once:
# #
# Convert command line arguments to unicode # Convert command line arguments to unicode
enc = constants.preferred_encoding enc = constants_old.preferred_encoding
if constants.isosx: if constants_old.isosx:
enc = 'utf-8' enc = 'utf-8'
for i in range(1, len(sys.argv)): for i in range(1, len(sys.argv)):
if not isinstance(sys.argv[i], str): if not isinstance(sys.argv[i], str):
@@ -66,7 +66,7 @@ if not _run_once:
# #
# Ensure that the max number of open files is at least 1024 # Ensure that the max number of open files is at least 1024
if constants.iswindows: if constants_old.iswindows:
# See https://msdn.microsoft.com/en-us/library/6e3b887c.aspx # See https://msdn.microsoft.com/en-us/library/6e3b887c.aspx
if hasattr(winutil, 'setmaxstdio'): if hasattr(winutil, 'setmaxstdio'):
winutil.setmaxstdio(max(1024, winutil.getmaxstdio())) winutil.setmaxstdio(max(1024, winutil.getmaxstdio()))
@@ -77,7 +77,7 @@ if not _run_once:
try: try:
resource.setrlimit(resource.RLIMIT_NOFILE, (min(1024, hard), hard)) resource.setrlimit(resource.RLIMIT_NOFILE, (min(1024, hard), hard))
except Exception: except Exception:
if constants.DEBUG: if constants_old.DEBUG:
import traceback import traceback
traceback.print_exc() traceback.print_exc()
@@ -122,7 +122,7 @@ if not _run_once:
bound_signal.connect(slot, **kw) bound_signal.connect(slot, **kw)
builtins.__dict__['connect_lambda'] = connect_lambda builtins.__dict__['connect_lambda'] = connect_lambda
if constants.islinux or constants.isosx or constants.isfreebsd: if constants_old.islinux or constants_old.isosx or constants_old.isfreebsd:
# Name all threads at the OS level created using the threading module, see # Name all threads at the OS level created using the threading module, see
# http://bugs.python.org/issue15500 # http://bugs.python.org/issue15500
import threading import threading
@@ -140,7 +140,7 @@ if not _run_once:
if name: if name:
if isinstance(name, str): if isinstance(name, str):
name = name.encode('ascii', 'replace').decode('ascii') name = name.encode('ascii', 'replace').decode('ascii')
constants.plugins['speedup'][0].set_thread_name(name[:15]) constants_old.plugins['speedup'][0].set_thread_name(name[:15])
except Exception: except Exception:
pass # Don't care about failure to set name pass # Don't care about failure to set name
threading.Thread.start = new_start threading.Thread.start = new_start
@@ -152,7 +152,7 @@ def test_lopen():
n = 'f\xe4llen' n = 'f\xe4llen'
print('testing open()') print('testing open()')
if constants.iswindows: if constants_old.iswindows:
import msvcrt, win32api import msvcrt, win32api
def assert_not_inheritable(f): def assert_not_inheritable(f):
+10 -4
View File
@@ -1,3 +1,7 @@
import os
import tempfile
import unittest
from lxml import etree from lxml import etree
@@ -24,7 +28,6 @@ def safe_xml_fromstring(string_or_bytes, recover=True):
def find_tests(): def find_tests():
import unittest, tempfile, os
class TestXMLParse(unittest.TestCase): class TestXMLParse(unittest.TestCase):
@@ -37,9 +40,11 @@ def find_tests():
os.remove(self.temp_file) os.remove(self.temp_file)
def test_safe_xml_fromstring(self): def test_safe_xml_fromstring(self):
templ = '''<!DOCTYPE foo [ <!ENTITY e {id} "{val}" > ]><r>&e;</r>''' templ = '<!DOCTYPE foo [ <!ENTITY e {id} "{val}" > ]><r>&e;</r>'
external = 'file:///' + self.temp_file.replace(os.sep, '/') external = 'file:///' + self.temp_file.replace(os.sep, '/')
self.assertEqual(etree.fromstring(templ.format(id='SYSTEM', val=external)).text, 'external') self.assertEqual(etree.fromstring(templ.format(id='SYSTEM',
val=external)).text,
'external')
for eid, val, expected in ( for eid, val, expected in (
('', 'normal entity', 'normal entity'), ('', 'normal entity', 'normal entity'),
('', external, external), ('', external, external),
@@ -50,7 +55,8 @@ def find_tests():
('PUBLIC', external, None), ('PUBLIC', external, None),
('PUBLIC', 'http://example.com', None), ('PUBLIC', 'http://example.com', None),
): ):
got = getattr(safe_xml_fromstring(templ.format(id=eid, val=val)), 'text', None) got = getattr(etree.fromstring(templ.format(id=eid, val=val)),
'text', None)
self.assertEqual(got, expected) self.assertEqual(got, expected)
return unittest.defaultTestLoader.loadTestsFromTestCase(TestXMLParse) return unittest.defaultTestLoader.loadTestsFromTestCase(TestXMLParse)
+1 -1
View File
@@ -8,7 +8,7 @@ from contextlib import closing
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from ebook_converter import sanitize_file_name from ebook_converter import sanitize_file_name
from ebook_converter.constants import filesystem_encoding from ebook_converter.constants_old import filesystem_encoding
from ebook_converter.ebooks.chardet import detect from ebook_converter.ebooks.chardet import detect
from ebook_converter.polyglot.builtins import as_bytes from ebook_converter.polyglot.builtins import as_bytes