1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-22 18:33:34 +01:00

Use the real constants module.

This is part of an ongoing refactor of the calibre code, meant to make it
more readable and to transform it into something more coherent.

In this patch, the imports of some modules are changed: modules are now
imported as a whole, instead of polluting the namespace of each module with
symbols from other modules, which were often themselves imported from yet
other modules. Yuck.
This commit is contained in:
2020-05-29 17:04:53 +02:00
parent ee4801228f
commit ce89f5c9d1
54 changed files with 2383 additions and 2081 deletions

View File

@@ -3,6 +3,7 @@ Based on ideas from comiclrf created by FangornUK.
"""
import shutil, textwrap, codecs, os
from ebook_converter import constants as const
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
from ebook_converter import CurrentDir
from ebook_converter.ptempfile import PersistentTemporaryDirectory
@@ -245,7 +246,6 @@ class ComicInput(InputFormatPlugin):
return os.path.abspath('metadata.opf')
def create_wrappers(self, pages):
from ebook_converter.ebooks.oeb.base import XHTML_NS
wrappers = []
WRAPPER = textwrap.dedent('''\
<html xmlns="%s">
@@ -267,7 +267,8 @@ class ComicInput(InputFormatPlugin):
''')
dir = os.path.dirname(pages[0])
for i, page in enumerate(pages):
wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1)
wrapper = WRAPPER%(const.XHTML_NS, i+1, os.path.basename(page),
i+1)
page = os.path.join(dir, 'page_%d.xhtml'%(i+1))
with open(page, 'wb') as f:
f.write(wrapper.encode('utf-8'))
@@ -275,8 +276,6 @@ class ComicInput(InputFormatPlugin):
return wrappers
def create_viewer_wrapper(self, pages):
from ebook_converter.ebooks.oeb.base import XHTML_NS
def page(src):
return '<img src="{}"></img>'.format(os.path.basename(src))
@@ -303,7 +302,7 @@ class ComicInput(InputFormatPlugin):
%s
</body>
</html>
''' % (XHTML_NS, pages)
''' % (const.XHTML_NS, pages)
path = os.path.join(base, 'wrapper.xhtml')
with open(path, 'wb') as f:
f.write(wrapper.encode('utf-8'))

View File

@@ -1,14 +1,22 @@
from ebook_converter.customize.conversion import OutputFormatPlugin, OptionRecommendation
import io
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.customize import conversion
from ebook_converter.ebooks.docx.dump import do_dump
from ebook_converter.ebooks.docx.writer.container import DOCX
from ebook_converter.ebooks.docx.writer.from_html import Convert
from ebook_converter.ebooks.metadata import opf2 as opf_meta
from ebook_converter.ebooks.oeb import base
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
PAGE_SIZES = ['a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter']
_OPT = conversion.OptionRecommendation
class DOCXOutput(OutputFormatPlugin):
class DOCXOutput(conversion.OutputFormatPlugin):
name = 'DOCX Output'
author = 'Kovid Goyal'
@@ -16,75 +24,63 @@ class DOCXOutput(OutputFormatPlugin):
commit_name = 'docx_output'
ui_data = {'page_sizes': PAGE_SIZES}
options = {
OptionRecommendation(name='docx_page_size', recommended_value='letter',
level=OptionRecommendation.LOW, choices=PAGE_SIZES,
help='The size of the page. Default is letter. Choices '
'are %s' % PAGE_SIZES),
OptionRecommendation(name='docx_custom_page_size', recommended_value=None,
help='Custom size of the document. Use the form widthxheight '
'EG. `123x321` to specify the width and height (in pts). '
'This overrides any specified page-size.'),
OptionRecommendation(name='docx_no_cover', recommended_value=False,
help='Do not insert the book cover as an image at the start of the document.'
' If you use this option, the book cover will be discarded.'),
OptionRecommendation(name='preserve_cover_aspect_ratio', recommended_value=False,
help='Preserve the aspect ratio of the cover image instead of stretching'
' it out to cover the entire page.'),
OptionRecommendation(name='docx_no_toc', recommended_value=False,
help='Do not insert the table of contents as a page at the start of the document.'),
OptionRecommendation(name='extract_to',
help='Extract the contents of the generated %s file to the '
'specified directory. The contents of the directory are first '
'deleted, so be careful.' % 'DOCX'),
OptionRecommendation(name='docx_page_margin_left', recommended_value=72.0,
level=OptionRecommendation.LOW,
help='The size of the left page margin, in pts. Default is 72pt.'
' Overrides the common left page margin setting.'
),
OptionRecommendation(name='docx_page_margin_top', recommended_value=72.0,
level=OptionRecommendation.LOW,
help='The size of the top page margin, in pts. Default is 72pt.'
' Overrides the common top page margin setting, unless set to zero.'
),
OptionRecommendation(name='docx_page_margin_right', recommended_value=72.0,
level=OptionRecommendation.LOW,
help='The size of the right page margin, in pts. Default is 72pt.'
' Overrides the common right page margin setting, unless set to zero.'
),
OptionRecommendation(name='docx_page_margin_bottom', recommended_value=72.0,
level=OptionRecommendation.LOW,
help='The size of the bottom page margin, in pts. Default is 72pt.'
' Overrides the common bottom page margin setting, unless set to zero.'
),
}
options = {_OPT(name='docx_page_size', recommended_value='letter',
level=_OPT.LOW, choices=PAGE_SIZES,
help='The size of the page. Default is letter. Choices '
'are %s' % PAGE_SIZES),
_OPT(name='docx_custom_page_size', recommended_value=None,
help='Custom size of the document. Use the form '
'widthxheight EG. `123x321` to specify the width and '
'height (in pts). This overrides any specified '
'page-size.'),
_OPT(name='docx_no_cover', recommended_value=False,
help='Do not insert the book cover as an image at the '
'start of the document. If you use this option, the book '
'cover will be discarded.'),
_OPT(name='preserve_cover_aspect_ratio',
recommended_value=False, help='Preserve the aspect ratio '
'of the cover image instead of stretching it out to cover '
'the entire page.'),
_OPT(name='docx_no_toc', recommended_value=False,
help='Do not insert the table of contents as a page at '
'the start of the document.'),
_OPT(name='extract_to', help='Extract the contents of the '
'generated DOCX file to the specified directory. The '
'contents of the directory are first deleted, so be '
'careful.'),
_OPT(name='docx_page_margin_left', recommended_value=72.0,
level=_OPT.LOW, help='The size of the left page margin, '
'in pts. Default is 72pt. Overrides the common left page '
'margin setting.'),
_OPT(name='docx_page_margin_top', recommended_value=72.0,
level=_OPT.LOW, help='The size of the top page margin, '
'in pts. Default is 72pt. Overrides the common top page '
'margin setting, unless set to zero.'),
_OPT(name='docx_page_margin_right', recommended_value=72.0,
level=_OPT.LOW, help='The size of the right page margin, '
'in pts. Default is 72pt. Overrides the common right page '
'margin setting, unless set to zero.'),
_OPT(name='docx_page_margin_bottom', recommended_value=72.0,
level=_OPT.LOW, help='The size of the bottom page margin, '
'in pts. Default is 72pt. Overrides the common bottom '
'page margin setting, unless set to zero.')}
def convert_metadata(self, oeb):
    """
    Build the book metadata used by the DOCX writer and store it on
    self.mi.

    The OEB metadata is written into a temporary OPF 2.0 <package>
    element, that element is serialised to UTF-8 bytes, and the bytes are
    read back through opf_meta.OPF to obtain the book metadata object.
    """
    # Only the module-level imports (io, etree, const, base, opf_meta) are
    # used; the former function-local imports and the duplicated
    # pre-refactor statements are gone.
    package = etree.Element(base.tag('opf', 'package'),
                            attrib={'version': '2.0'},
                            nsmap={None: const.OPF2_NS})
    oeb.metadata.to_opf2(package)
    raw = etree.tostring(package, encoding='utf-8')
    self.mi = opf_meta.OPF(io.BytesIO(raw),
                           populate_spine=False,
                           try_to_guess_cover=False).to_book_metadata()
def convert(self, oeb, output_path, input_plugin, opts, log):
    """
    Write the OEB book to output_path as a DOCX file.

    The metadata is prepared first via convert_metadata(); the cover and
    table-of-contents flags passed to Convert are the negation of
    opts.docx_no_cover and opts.docx_no_toc. When opts.extract_to is set,
    the generated file is additionally passed to do_dump() together with
    that directory.
    """
    # DOCX, Convert and do_dump come from the module-level imports; the
    # redundant function-local imports and the duplicated Convert(...)
    # call were removed.
    docx = DOCX(opts, log)
    self.convert_metadata(oeb)
    Convert(oeb, docx, self.mi, not opts.docx_no_cover,
            not opts.docx_no_toc)()
    docx.write(output_path, self.mi)
    if opts.extract_to:
        do_dump(output_path, opts.extract_to)

View File

@@ -1,14 +1,19 @@
import os, re, posixpath
from itertools import cycle
import hashlib
import itertools
import os
import re
import traceback
import uuid
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
from lxml import etree
from ebook_converter.ebooks.metadata import opf2 as opf_meta
from ebook_converter.ebooks.oeb import base
from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
@@ -16,8 +21,8 @@ def decrypt_font_data(key, data, algorithm):
is_adobe = algorithm == ADOBE_OBFUSCATION
crypt_len = 1024 if is_adobe else 1040
crypt = bytearray(data[:crypt_len])
key = cycle(iter(bytearray(key)))
decrypt = bytes(bytearray(x^next(key) for x in crypt))
key = itertools.cycle(iter(bytearray(key)))
decrypt = bytes(bytearray(x ^ next(key) for x in crypt))
return decrypt + data[crypt_len:]
@@ -29,18 +34,16 @@ def decrypt_font(key, path, algorithm):
class EPUBInput(InputFormatPlugin):
name = 'EPUB Input'
author = 'Kovid Goyal'
name = 'EPUB Input'
author = 'Kovid Goyal'
description = 'Convert EPUB files (.epub) to HTML'
file_types = {'epub'}
file_types = {'epub'}
output_encoding = None
commit_name = 'epub_input'
recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
def process_encryption(self, encfile, opf, log):
from lxml import etree
import uuid, hashlib
idpf_key = opf.raw_unique_identifier
if idpf_key:
idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
@@ -56,27 +59,28 @@ class EPUBInput(InputFormatPlugin):
try:
key = item.text.rpartition(':')[-1]
key = uuid.UUID(key).bytes
except:
import traceback
except Exception:
traceback.print_exc()
key = None
try:
root = etree.parse(encfile)
for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
for em in root.xpath('descendant::*[contains(name(), '
'"EncryptionMethod")]'):
algorithm = em.get('Algorithm', '')
if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
return False
cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
cr = em.getparent().xpath('descendant::*[contains(name(), '
'"CipherReference")]')[0]
uri = cr.get('URI')
path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
path = os.path.abspath(os.path.join(os.path.dirname(encfile),
'..', *uri.split('/')))
tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
if (tkey and os.path.exists(path)):
self._encrypted_font_uris.append(uri)
decrypt_font(tkey, path, algorithm)
return True
except:
import traceback
except Exception:
traceback.print_exc()
return False
@@ -97,8 +101,11 @@ class EPUBInput(InputFormatPlugin):
return t
def rationalize_cover3(self, opf, log):
''' If there is a reference to the cover/titlepage via manifest properties, convert to
entries in the <guide> so that the rest of the pipeline picks it up. '''
"""
If there is a reference to the cover/titlepage via manifest
properties, convert to entries in the <guide> so that the rest of the
pipeline picks it up.
"""
from ebook_converter.ebooks.metadata.opf3 import items_with_property
removed = guide_titlepage_href = guide_titlepage_id = None
@@ -128,7 +135,8 @@ class EPUBInput(InputFormatPlugin):
titlepage_id, titlepage_href = tid, href.partition('#')[0]
break
if titlepage_href is None:
titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
titlepage_href = guide_titlepage_href
titlepage_id = guide_titlepage_id
if titlepage_href is not None:
self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page')
spine = list(opf.iterspine())
@@ -148,7 +156,6 @@ class EPUBInput(InputFormatPlugin):
means, at most one entry with type="cover" that points to a raster
cover and at most one entry with type="titlepage" that points to an
HTML titlepage. '''
from ebook_converter.ebooks.oeb.base import OPF
removed = None
from lxml import etree
guide_cover, guide_elem = None, None
@@ -160,12 +167,14 @@ class EPUBInput(InputFormatPlugin):
raster_cover = opf.raster_cover
if raster_cover:
if guide_elem is None:
g = opf.root.makeelement(OPF('guide'))
g = opf.root.makeelement(base.tag('opf', 'guide'))
opf.root.append(g)
else:
g = guide_elem.getparent()
guide_cover = raster_cover
guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
guide_elem = g.makeelement(base.tag('opf', 'reference'),
attrib={'href': raster_cover,
'type': 'cover'})
g.append(guide_elem)
return
spine = list(opf.iterspine())
@@ -186,7 +195,8 @@ class EPUBInput(InputFormatPlugin):
# specially
if not self.for_viewer:
if len(spine) == 1:
log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
log.warn('There is only a single spine item and it is marked '
'as the cover. Removing cover marking.')
for guide_elem in tuple(opf.iterguide()):
if guide_elem.get('type', '').lower() == 'cover':
guide_elem.getparent().remove(guide_elem)
@@ -215,8 +225,9 @@ class EPUBInput(InputFormatPlugin):
# Render the titlepage to create a raster cover
from ebook_converter.ebooks import render_html_svg_workaround
guide_elem.set('href', 'calibre_raster_cover.jpg')
t = etree.SubElement(
elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
t = etree.SubElement(elem[0].getparent(), base.tag('opf', 'item'),
href=guide_elem.get('href'),
id='calibre_raster_cover')
t.set('media-type', 'image/jpeg')
if os.path.exists(guide_cover):
renderer = render_html_svg_workaround(guide_cover, log)
@@ -229,17 +240,16 @@ class EPUBInput(InputFormatPlugin):
return removed
def find_opf(self):
from ebook_converter.utils.xml_parse import safe_xml_fromstring
def attr(n, attr):
for k, v in n.attrib.items():
if k.endswith(attr):
return v
try:
with open('META-INF/container.xml', 'rb') as f:
root = safe_xml_fromstring(f.read())
root = etree.fromstring(f.read())
for r in root.xpath('//*[local-name()="rootfile"]'):
if attr(r, 'media-type') != "application/oebps-package+xml":
if (attr(r, 'media-type') !=
"application/oebps-package+xml"):
continue
path = attr(r, 'full-path')
if not path:
@@ -248,20 +258,18 @@ class EPUBInput(InputFormatPlugin):
if os.path.exists(path):
return path
except Exception:
import traceback
traceback.print_exc()
def convert(self, stream, options, file_ext, log, accelerators):
from ebook_converter.utils.zipfile import ZipFile
from ebook_converter import walk
from ebook_converter.ebooks import DRMError
from ebook_converter.ebooks.metadata.opf2 import OPF
try:
zf = ZipFile(stream)
zf.extractall(os.getcwd())
except:
except Exception:
log.exception('EPUB appears to be invalid ZIP file, trying a'
' more forgiving ZIP parser')
' more forgiving ZIP parser')
from ebook_converter.utils.localunzip import extractall
stream.seek(0)
extractall(stream)
@@ -276,11 +284,12 @@ class EPUBInput(InputFormatPlugin):
path = getattr(stream, 'name', 'stream')
if opf is None:
raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)
raise ValueError('%s is not a valid EPUB file (could not find '
'opf)' % path)
opf = os.path.relpath(opf, os.getcwd())
parts = os.path.split(opf)
opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))
# parts = os.path.split(opf)
opf = opf_meta.OPF(opf, os.path.dirname(os.path.abspath(opf)))
self._encrypted_font_uris = []
if os.path.exists(encfile):
@@ -288,18 +297,23 @@ class EPUBInput(InputFormatPlugin):
raise DRMError(os.path.basename(path))
self.encrypted_fonts = self._encrypted_font_uris
if len(parts) > 1 and parts[0]:
delta = '/'.join(parts[:-1])+'/'
# XXX(gryf): this code would fail rather badly, so evidently this part
# was never used.
# if len(parts) > 1 and parts[0]:
# delta = '/'.join(parts[:-1])+'/'
def normpath(x):
return posixpath.normpath(delta + elem.get('href'))
# def normpath(x):
# return posixpath.normpath(delta + elem.get('href'))
for elem in opf.itermanifest():
elem.set('href', normpath(elem.get('href')))
for elem in opf.iterguide():
elem.set('href', normpath(elem.get('href')))
# for elem in opf.itermanifest():
# elem.set('href', normpath(elem.get('href')))
# for elem in opf.iterguide():
# elem.set('href', normpath(elem.get('href')))
f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
if opf.package_version >= 3.0:
f = self.rationalize_cover3
else:
f = self.rationalize_cover2
self.removed_cover = f(opf, log)
if self.removed_cover:
self.removed_items_to_ignore = (self.removed_cover,)
@@ -352,15 +366,18 @@ class EPUBInput(InputFormatPlugin):
from lxml import etree
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ebooks.oeb.polish.parsing import parse
from ebook_converter.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
from ebook_converter.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, \
NCX, urlnormalize, urlunquote, serialize
from ebook_converter.ebooks.oeb.polish.toc import first_child
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from tempfile import NamedTemporaryFile
with open(nav_path, 'rb') as f:
raw = f.read()
raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
raw = xml_to_unicode(raw, strip_encoding_pats=True,
assume_utf8=True)[0]
root = parse(raw, log=log)
ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/'
'ncx/" version="2005-1" xml:lang="eng">'
'<navMap/></ncx>')
navmap = ncx[0]
et = '{%s}type' % EPUB_NS
bn = os.path.basename(nav_path)
@@ -368,8 +385,8 @@ class EPUBInput(InputFormatPlugin):
def add_from_li(li, parent):
href = text = None
for x in li.iterchildren(XHTML('a'), XHTML('span')):
text = etree.tostring(
x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
text = etree.tostring(x, method='text', encoding='unicode',
with_tail=False).strip() or ' '.join(
x.xpath('descendant-or-self::*/@title')).strip()
href = x.get('href')
if href:
@@ -382,7 +399,7 @@ class EPUBInput(InputFormatPlugin):
np[0].append(np.makeelement(NCX('text')))
np[0][0].text = text
if href:
np.append(np.makeelement(NCX('content'), attrib={'src':href}))
np.append(np.makeelement(NCX('content'), attrib={'src': href}))
return np
def process_nav_node(node, toc_parent):
@@ -401,20 +418,25 @@ class EPUBInput(InputFormatPlugin):
else:
return
with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path),
delete=False) as f:
f.write(etree.tostring(ncx, encoding='utf-8'))
ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/')
ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME,
append=True).get('id')
for spine in opf.root.xpath('//*[local-name()="spine"]'):
spine.set('toc', ncx_id)
opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
url = os.path.relpath(nav_path).replace(os.sep, '/')
opts.epub3_nav_href = urlnormalize(url)
opts.epub3_nav_parsed = root
if getattr(self, 'removed_cover', None):
changed = False
base_path = os.path.dirname(nav_path)
for elem in root.xpath('//*[@href]'):
href, frag = elem.get('href').partition('#')[::2]
link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
link_path = os.path.relpath(os.path.join(base_path,
urlunquote(href)),
base_path)
abs_href = urlnormalize(link_path)
if abs_href == self.removed_cover:
changed = True

View File

@@ -2,7 +2,11 @@ import os
import re
import shutil
import urllib.parse
import uuid
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.customize.conversion import OutputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
@@ -132,39 +136,37 @@ class EPUBOutput(OutputFormatPlugin):
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
def workaround_webkit_quirks(self):  # {{{
    """
    Retag empty <pre> elements as <div> in every spine item.

    A <pre> counts as empty when it has no text and no child elements.
    """
    for item in self.oeb.spine:
        root = item.data
        body = base.XPath('//h:body')(root)
        if body:
            body = body[0]

        # When the query matched nothing, body is still the (empty) result
        # and has no xpath attribute, so the item is skipped.
        if not hasattr(body, 'xpath'):
            continue

        for pre in base.XPath('//h:pre')(body):
            if not pre.text and len(pre) == 0:
                pre.tag = 'div'
# }}}
def upshift_markup(self): # {{{
'Upgrade markup to comply with XHTML 1.1 where possible'
from ebook_converter.ebooks.oeb.base import XPath, XML
for x in self.oeb.spine:
root = x.data
if (not root.get(XML('lang'))) and (root.get('lang')):
root.set(XML('lang'), root.get('lang'))
body = XPath('//h:body')(root)
if (not root.get(base.tag('xml', 'lang'))) and (root.get('lang')):
root.set(base.tag('xml', 'lang'), root.get('lang'))
body = base.XPath('//h:body')(root)
if body:
body = body[0]
if not hasattr(body, 'xpath'):
continue
for u in XPath('//h:u')(root):
for u in base.XPath('//h:u')(root):
u.tag = 'span'
seen_ids, seen_names = set(), set()
for x in XPath('//*[@id or @name]')(root):
for x in base.XPath('//*[@id or @name]')(root):
eid, name = x.get('id', None), x.get('name', None)
if eid:
if eid in seen_ids:
@@ -223,28 +225,27 @@ class EPUBOutput(OutputFormatPlugin):
first = next(iter(self.oeb.spine))
self.oeb.toc.add('Start', first.href)
from ebook_converter.ebooks.oeb.base import OPF
identifiers = oeb.metadata['identifier']
uuid = None
_uuid = None
for x in identifiers:
if x.get(OPF('scheme'), None).lower() == 'uuid' or str(x).startswith('urn:uuid:'):
uuid = str(x).split(':')[-1]
if (x.get(base.tag('opf', 'scheme'), None).lower() == 'uuid' or
str(x).startswith('urn:uuid:')):
_uuid = str(x).split(':')[-1]
break
encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
if uuid is None:
if _uuid is None:
self.log.warn('No UUID identifier found')
from uuid import uuid4
uuid = str(uuid4())
oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)
_uuid = str(uuid.uuid4())
oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid)
if encrypted_fonts and not uuid.startswith('urn:uuid:'):
if encrypted_fonts and not _uuid.startswith('urn:uuid:'):
# Apparently ADE requires this value to start with urn:uuid:
# for some absurd reason, or it will throw a hissy fit and refuse
# to use the obfuscated fonts.
for x in identifiers:
if str(x) == uuid:
x.content = 'urn:uuid:'+uuid
if str(x) == _uuid:
x.content = 'urn:uuid:' + _uuid
with TemporaryDirectory('_epub_output') as tdir:
from ebook_converter.customize.ui import plugin_for_output_format
@@ -264,7 +265,7 @@ class EPUBOutput(OutputFormatPlugin):
self.upgrade_to_epub3(tdir, opf)
encryption = None
if encrypted_fonts:
encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)
encryption = self.encrypt_fonts(encrypted_fonts, tdir, _uuid)
from ebook_converter.ebooks.epub import initialize_container
with initialize_container(output_path, os.path.basename(opf),
@@ -312,12 +313,12 @@ class EPUBOutput(OutputFormatPlugin):
except EnvironmentError:
pass
def encrypt_fonts(self, uris, tdir, uuid): # {{{
def encrypt_fonts(self, uris, tdir, _uuid): # {{{
from ebook_converter.polyglot.binary import from_hex_bytes
key = re.sub(r'[^a-fA-F0-9]', '', uuid)
key = re.sub(r'[^a-fA-F0-9]', '', _uuid)
if len(key) < 16:
raise ValueError('UUID identifier %r is invalid'%uuid)
raise ValueError('UUID identifier %r is invalid'% _uuid)
key = bytearray(from_hex_bytes((key + key)[:32]))
paths = []
with CurrentDir(tdir):
@@ -335,7 +336,8 @@ class EPUBOutput(OutputFormatPlugin):
if len(data) >= 1024:
data = bytearray(data)
f.seek(0)
f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024))))
f.write(bytes(bytearray(data[i] ^ key[i%16]
for i in range(1024))))
else:
self.log.warn('Font', path, 'is invalid, ignoring')
if not isinstance(uri, str):
@@ -374,11 +376,10 @@ class EPUBOutput(OutputFormatPlugin):
# }}}
def workaround_ade_quirks(self): # {{{
'''
"""
Perform various markup transforms to get the output to render correctly
in the quirky ADE.
'''
from ebook_converter.ebooks.oeb.base import XPath, XHTML, barename, urlunquote
"""
stylesheet = self.oeb.manifest.main_stylesheet
@@ -388,23 +389,23 @@ class EPUBOutput(OutputFormatPlugin):
for node in self.oeb.toc.iter():
href = getattr(node, 'href', None)
if hasattr(href, 'partition'):
base, _, frag = href.partition('#')
frag = urlunquote(frag)
_base, _, frag = href.partition('#')
frag = base.urlunquote(frag)
if frag and frag_pat.match(frag) is None:
self.log.warn(
'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
node.href = base
node.href = _base
for x in self.oeb.spine:
root = x.data
body = XPath('//h:body')(root)
body = base.XPath('//h:body')(root)
if body:
body = body[0]
if hasattr(body, 'xpath'):
# remove <img> tags with empty src elements
bad = []
for x in XPath('//h:img')(body):
for x in base.XPath('//h:img')(body):
src = x.get('src', '').strip()
if src in ('', '#') or src.startswith('http:'):
bad.append(x)
@@ -412,7 +413,7 @@ class EPUBOutput(OutputFormatPlugin):
img.getparent().remove(img)
# Add id attribute to <a> tags that have name
for x in XPath('//h:a[@name]')(body):
for x in base.XPath('//h:a[@name]')(body):
if not x.get('id', False):
x.set('id', x.get('name'))
# The delightful epubcheck has started complaining about <a> tags that
@@ -420,19 +421,19 @@ class EPUBOutput(OutputFormatPlugin):
x.attrib.pop('name')
# Replace <br> that are children of <body> as ADE doesn't handle them
for br in XPath('./h:br')(body):
for br in base.XPath('./h:br')(body):
if br.getparent() is None:
continue
try:
prior = next(br.itersiblings(preceding=True))
priortag = barename(prior.tag)
priortag = parse_utils.barename(prior.tag)
priortext = prior.tail
except:
priortag = 'body'
priortext = body.text
if priortext:
priortext = priortext.strip()
br.tag = XHTML('p')
br.tag = base.tag('xhtml', 'p')
br.text = '\u00a0'
style = br.get('style', '').split(';')
style = list(filter(None, map(lambda x: x.strip(), style)))
@@ -446,44 +447,44 @@ class EPUBOutput(OutputFormatPlugin):
style.append('height:0pt')
br.set('style', '; '.join(style))
for tag in XPath('//h:embed')(root):
for tag in base.XPath('//h:embed')(root):
tag.getparent().remove(tag)
for tag in XPath('//h:object')(root):
for tag in base.XPath('//h:object')(root):
if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
continue
tag.getparent().remove(tag)
for tag in XPath('//h:title|//h:style')(root):
for tag in base.XPath('//h:title|//h:style')(root):
if not tag.text:
tag.getparent().remove(tag)
for tag in XPath('//h:script')(root):
for tag in base.XPath('//h:script')(root):
if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
tag.getparent().remove(tag)
for tag in XPath('//h:body/descendant::h:script')(root):
for tag in base.XPath('//h:body/descendant::h:script')(root):
tag.getparent().remove(tag)
formchildren = XPath('./h:input|./h:button|./h:textarea|'
formchildren = base.XPath('./h:input|./h:button|./h:textarea|'
'./h:label|./h:fieldset|./h:legend')
for tag in XPath('//h:form')(root):
for tag in base.XPath('//h:form')(root):
if formchildren(tag):
tag.getparent().remove(tag)
else:
# Not a real form
tag.tag = XHTML('div')
tag.tag = base.tag('xhtml', 'div')
for tag in XPath('//h:center')(root):
tag.tag = XHTML('div')
for tag in base.XPath('//h:center')(root):
tag.tag = base.tag('xhtml', 'div')
tag.set('style', 'text-align:center')
# ADE can't handle &amp; in an img url
for tag in XPath('//h:img[@src]')(root):
for tag in base.XPath('//h:img[@src]')(root):
tag.set('src', tag.get('src', '').replace('&', ''))
# ADE whimpers in fright when it encounters a <td> outside a
# <table>
in_table = XPath('ancestor::h:table')
for tag in XPath('//h:td|//h:tr|//h:th')(root):
in_table = base.XPath('ancestor::h:table')
for tag in base.XPath('//h:td|//h:tr|//h:th')(root):
if not in_table(tag):
tag.tag = XHTML('div')
tag.tag = base.tag('xhtml', 'div')
# ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
special_chars = re.compile('[\u200b\u00ad]')
@@ -498,7 +499,7 @@ class EPUBOutput(OutputFormatPlugin):
if stylesheet is not None:
# ADE doesn't render lists correctly if they have left margins
from css_parser.css import CSSRule
for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
for lb in base.XPath('//h:ul[@class]|//h:ol[@class]')(root):
sel = '.'+lb.get('class')
for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
if sel == rule.selectorList.selectorText:
@@ -519,11 +520,10 @@ class EPUBOutput(OutputFormatPlugin):
'''
Perform toc link transforms to alleviate slow loading.
'''
from ebook_converter.ebooks.oeb.base import XPath
from ebook_converter.ebooks.oeb.polish.toc import item_at_top
def frag_is_at_top(root, frag):
elem = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
elem = base.XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
if elem:
elem = elem[0]
else:

View File

@@ -1,59 +1,57 @@
"""
Convert .fb2 files to .lrf
"""
import os, re
import os
import pkg_resources
import re
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
from ebook_converter import guess_type
__license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1'
class FB2Input(InputFormatPlugin):
name = 'FB2 Input'
author = 'Anatoly Shipitsin'
name = 'FB2 Input'
author = 'Anatoly Shipitsin'
description = 'Convert FB2 and FBZ files to HTML'
file_types = {'fb2', 'fbz'}
file_types = {'fb2', 'fbz'}
commit_name = 'fb2_input'
recommendations = {
('level1_toc', '//h:h1', OptionRecommendation.MED),
('level2_toc', '//h:h2', OptionRecommendation.MED),
('level3_toc', '//h:h3', OptionRecommendation.MED),
}
recommendations = {('level1_toc', '//h:h1', OptionRecommendation.MED),
('level2_toc', '//h:h2', OptionRecommendation.MED),
('level3_toc', '//h:h3', OptionRecommendation.MED)}
options = {
OptionRecommendation(name='no_inline_fb2_toc',
recommended_value=False, level=OptionRecommendation.LOW,
help='Do not insert a Table of Contents at the beginning of the book.'
)}
options = {OptionRecommendation(name='no_inline_fb2_toc',
recommended_value=False,
level=OptionRecommendation.LOW,
help='Do not insert a Table of Contents '
'at the beginning of the book.')}
def convert(self, stream, options, file_ext, log,
accelerators):
from lxml import etree
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
from ebook_converter.ebooks.metadata.fb2 import ensure_namespace
from ebook_converter.ebooks.metadata.fb2 import get_fb2_data
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
from ebook_converter.ebooks.metadata.meta import get_metadata
from ebook_converter.ebooks.oeb.base import XLINK_NS, XHTML_NS
from ebook_converter.ebooks.chardet import xml_to_unicode
self.log = log
log.debug('Parsing XML...')
raw = get_fb2_data(stream)[0]
raw = raw.replace(b'\0', b'')
raw = xml_to_unicode(raw, strip_encoding_pats=True,
assume_utf8=True, resolve_entities=True)[0]
assume_utf8=True, resolve_entities=True)[0]
try:
doc = safe_xml_fromstring(raw)
doc = etree.fromstring(raw)
except etree.XMLSyntaxError:
doc = safe_xml_fromstring(raw.replace('& ', '&amp;'))
doc = etree.fromstring(raw.replace('& ', '&amp;'))
if doc is None:
raise ValueError('The FB2 file is not valid XML')
doc = ensure_namespace(doc)
@@ -62,22 +60,24 @@ class FB2Input(InputFormatPlugin):
except Exception:
fb_ns = FB2NS
NAMESPACES = {'f':fb_ns, 'l':XLINK_NS}
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
NAMESPACES = {'f': fb_ns, 'l': const.XLINK_NS}
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and '
'@type="text/css"]')
css = ''
for s in stylesheets:
css += etree.tostring(s, encoding='unicode', method='text',
with_tail=False) + '\n\n'
with_tail=False) + '\n\n'
if css:
import css_parser, logging
import css_parser
import logging
parser = css_parser.CSSParser(fetcher=None,
log=logging.getLogger('calibre.css'))
log=logging.getLogger('calibre.css'))
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % const.XHTML_NS
text = XHTML_CSS_NAMESPACE + css
log.debug('Parsing stylesheet...')
stylesheet = parser.parseString(text)
stylesheet.namespaces['h'] = XHTML_NS
stylesheet.namespaces['h'] = const.XHTML_NS
css = stylesheet.cssText
if isinstance(css, bytes):
css = css.decode('utf-8', 'replace')
@@ -92,16 +92,20 @@ class FB2Input(InputFormatPlugin):
if options.no_inline_fb2_toc:
log('Disabling generation of inline FB2 TOC')
ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
re.DOTALL).sub('', ss)
re.DOTALL).sub('', ss)
styledoc = safe_xml_fromstring(ss)
styledoc = etree.fromstring(ss)
transform = etree.XSLT(styledoc)
result = transform(doc)
# Handle links of type note and cite
notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')}
cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')}
notes = {a.get('href')[1:]: a
for a in result.xpath('//a[@link_note and @href]')
if a.get('href').startswith('#')}
cites = {a.get('link_cite'): a
for a in result.xpath('//a[@link_cite]')
if not a.get('href', '')}
all_ids = {x for x in result.xpath('//*/@id')}
for cite, a in cites.items():
note = notes.get(cite, None)
@@ -137,8 +141,10 @@ class FB2Input(InputFormatPlugin):
f.write(mi.cover_data[1])
cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
else:
for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
for img in doc.xpath('//f:coverpage/f:image',
namespaces=NAMESPACES):
href = img.get('{%s}href' % const.XLINK_NS,
img.get('href', None))
if href is not None:
if href.startswith('#'):
href = href[1:]
@@ -165,15 +171,15 @@ class FB2Input(InputFormatPlugin):
ext = ct.rpartition('/')[-1].lower()
if ext in ('png', 'jpeg', 'jpg'):
if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
'png'}:
'png'}:
fname += '.' + ext
self.binary_map[elem.get('id')] = fname
raw = elem.text.strip()
try:
data = base64_decode(raw)
except TypeError:
self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
elem.get('id')))
self.log.exception('Binary data with id=%s is corrupted, '
'ignoring' % elem.get('id'))
else:
with open(fname, 'wb') as f:
f.write(data)

View File

@@ -1,17 +1,17 @@
import copy
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.customize.conversion import InputFormatPlugin
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class LITInput(InputFormatPlugin):
name = 'LIT Input'
author = 'Marshall T. Vandegrift'
name = 'LIT Input'
author = 'Marshall T. Vandegrift'
description = 'Convert LIT files to HTML'
file_types = {'lit'}
file_types = {'lit'}
commit_name = 'lit_input'
def convert(self, stream, options, file_ext, log,
@@ -22,7 +22,7 @@ class LITInput(InputFormatPlugin):
return create_oebbook(log, stream, options, reader=LitReader)
def postprocess_book(self, oeb, opts, log):
from ebook_converter.ebooks.oeb.base import XHTML_NS, XPath, XHTML
from ebook_converter.ebooks.oeb.base import XPath, XHTML
for item in oeb.spine:
root = item.data
if not hasattr(root, 'xpath'):
@@ -37,22 +37,23 @@ class LITInput(InputFormatPlugin):
body = body[0]
if len(body) == 1 and body[0].tag == XHTML('pre'):
pre = body[0]
from ebook_converter.ebooks.txt.processor import convert_basic, \
separate_paragraphs_single_line
from ebook_converter.ebooks.txt.processor import \
convert_basic, separate_paragraphs_single_line
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.utils.xml_parse import safe_xml_fromstring
import copy
self.log('LIT file with all text in singe <pre> tag detected')
self.log('LIT file with all text in singe <pre> tag '
'detected')
html = separate_paragraphs_single_line(pre.text)
html = convert_basic(html).replace('<html>',
'<html xmlns="%s">'%XHTML_NS)
'<html xmlns="%s">' %
const.XHTML_NS)
html = xml_to_unicode(html, strip_encoding_pats=True,
resolve_entities=True)[0]
resolve_entities=True)[0]
if opts.smarten_punctuation:
# SmartyPants skips text inside <pre> tags
from ebook_converter.ebooks.conversion.preprocess import smarten_punctuation
html = smarten_punctuation(html, self.log)
root = safe_xml_fromstring(html)
from ebook_converter.ebooks.conversion import \
preprocess
html = preprocess.smarten_punctuation(html, self.log)
root = etree.fromstring(html)
body = XPath('//h:body')(root)
pre.tag = XHTML('div')
pre.text = ''

View File

@@ -1,54 +1,52 @@
import os, sys
import os
import sys
import pkg_resources
from lxml import etree
from ebook_converter.customize.conversion import InputFormatPlugin
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class LRFInput(InputFormatPlugin):
name = 'LRF Input'
author = 'Kovid Goyal'
name = 'LRF Input'
author = 'Kovid Goyal'
description = 'Convert LRF files to HTML'
file_types = {'lrf'}
file_types = {'lrf'}
commit_name = 'lrf_input'
def convert(self, stream, options, file_ext, log,
accelerators):
from ebook_converter.ebooks.lrf.input import (MediaType, Styles, TextBlock,
Canvas, ImageBlock, RuledLine)
from ebook_converter.ebooks.lrf.input import MediaType, Styles, \
TextBlock, Canvas, ImageBlock, RuledLine
self.log = log
self.log('Generating XML')
from ebook_converter.ebooks.lrf.lrfparser import LRFDocument
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from lxml import etree
d = LRFDocument(stream)
d.parse()
xml = d.to_xml(write_files=True)
if options.verbose > 2:
open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
doc = safe_xml_fromstring(xml)
doc = etree.fromstring(xml)
char_button_map = {}
for x in doc.xpath('//CharButton[@refobj]'):
ro = x.get('refobj')
jump_button = doc.xpath('//*[@objid="%s"]'%ro)
jump_button = doc.xpath('//*[@objid="%s"]' % ro)
if jump_button:
jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]')
jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage '
'and @refobj]')
if jump_to:
char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'),
jump_to[0].get('refobj'))
char_button_map[ro] = ('%s.xhtml#%s' %
(jump_to[0].get('refpage'),
jump_to[0].get('refobj')))
plot_map = {}
for x in doc.xpath('//Plot[@refobj]'):
ro = x.get('refobj')
image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro)
image = doc.xpath('//Image[@objid="%s" and @refstream]' % ro)
if image:
imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'%
image[0].get('refstream'))
imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]' %
image[0].get('refstream'))
if imgstr:
plot_map[ro] = imgstr[0].get('file')
@@ -58,21 +56,19 @@ class LRFInput(InputFormatPlugin):
resource_filename('ebook_converter',
'data/lrf.xsl')) as fobj:
# TODO(gryf): change this nonsense to etree.parse() instead.
styledoc = safe_xml_fromstring(fobj.read())
styledoc = etree.fromstring(fobj.read())
media_type = MediaType()
styles = Styles()
text_block = TextBlock(styles, char_button_map, plot_map, log)
canvas = Canvas(doc, styles, text_block, log)
image_block = ImageBlock(canvas)
ruled_line = RuledLine()
extensions = {
('calibre', 'media-type') : media_type,
('calibre', 'text-block') : text_block,
('calibre', 'ruled-line') : ruled_line,
('calibre', 'styles') : styles,
('calibre', 'canvas') : canvas,
('calibre', 'image-block'): image_block,
}
extensions = {('calibre', 'media-type'): media_type,
('calibre', 'text-block'): text_block,
('calibre', 'ruled-line'): ruled_line,
('calibre', 'styles'): styles,
('calibre', 'canvas'): canvas,
('calibre', 'image-block'): image_block}
transform = etree.XSLT(styledoc, extensions=extensions)
try:
result = transform(doc)

View File

@@ -1,57 +1,58 @@
import os, glob, re, textwrap
import glob
import os
import pkg_resources
import re
import textwrap
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
from lxml import etree
from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
from ebook_converter.polyglot.builtins import as_bytes
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
# Maps the border style names found in the ``border-cell-*-style`` attributes
# of the intermediate RTF XML onto CSS ``border-style`` keywords.  Lookups are
# done with ``border_style_map.get(name, 'solid')``, so any name missing from
# this table is rendered as a solid border.
# NOTE(review): both 'tripple' and 'triple' are listed; presumably the
# misspelled variant is what some producer emits -- confirm before removing.
# NOTE(review): there is no 'thin-thick-large' entry, unlike the small and
# medium groups; it currently falls back to the 'solid' default.
border_style_map = {'single': 'solid',
                    'double-thickness-border': 'double',
                    'shadowed-border': 'outset',
                    'double-border': 'double',
                    'dotted-border': 'dotted',
                    'dashed': 'dashed',
                    'hairline': 'solid',
                    'inset': 'inset',
                    'dash-small': 'dashed',
                    'dot-dash': 'dotted',
                    'dot-dot-dash': 'dotted',
                    'outset': 'outset',
                    'tripple': 'double',
                    'triple': 'double',
                    'thick-thin-small': 'solid',
                    'thin-thick-small': 'solid',
                    'thin-thick-thin-small': 'solid',
                    'thick-thin-medium': 'solid',
                    'thin-thick-medium': 'solid',
                    'thin-thick-thin-medium': 'solid',
                    'thick-thin-large': 'solid',
                    'thin-thick-thin-large': 'solid',
                    'wavy': 'ridge',
                    'double-wavy': 'ridge',
                    'striped': 'ridge',
                    'emboss': 'inset',
                    'engrave': 'inset',
                    'frame': 'ridge'}
class RTFInput(InputFormatPlugin):
name = 'RTF Input'
author = 'Kovid Goyal'
name = 'RTF Input'
author = 'Kovid Goyal'
description = 'Convert RTF files to HTML'
file_types = {'rtf'}
file_types = {'rtf'}
commit_name = 'rtf_input'
options = {
OptionRecommendation(name='ignore_wmf', recommended_value=False,
help='Ignore WMF images instead of replacing them with a '
'placeholder image.'),
}
options = {OptionRecommendation(name='ignore_wmf', recommended_value=False,
help='Ignore WMF images instead of '
'replacing them with a placeholder '
'image.')}
def generate_xml(self, stream):
from ebook_converter.ebooks.rtf2xml.ParseRtf import ParseRtf
@@ -64,7 +65,7 @@ class RTFInput(InputFormatPlugin):
run_lev = 4
indent_out = 1
self.log('Running RTFParser in debug mode')
except:
except Exception:
self.log.warn('Impossible to run RTFParser in debug mode')
parser = ParseRtf(
in_file=stream,
@@ -108,7 +109,8 @@ class RTFInput(InputFormatPlugin):
deb_dir=debug_dir,
# Default encoding
default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',
default_encoding=getattr(self.opts, 'input_encoding',
'cp1252') or 'cp1252',
# Run level
run_level=run_lev,
@@ -151,7 +153,7 @@ class RTFInput(InputFormatPlugin):
for count, val in imap.items():
try:
imap[count] = self.convert_image(val)
except:
except Exception:
self.log.exception('Failed to convert', val)
return imap
@@ -161,7 +163,7 @@ class RTFInput(InputFormatPlugin):
try:
return self.rasterize_wmf(name)
except Exception:
self.log.exception('Failed to convert WMF image %r'%name)
self.log.exception('Failed to convert WMF image %r' % name)
return self.replace_wmf(name)
def replace_wmf(self, name):
@@ -170,9 +172,11 @@ class RTFInput(InputFormatPlugin):
return '__REMOVE_ME__'
from ebook_converter.ebooks.covers import message_image
if self.default_img is None:
self.default_img = message_image('Conversion of WMF images is not supported.'
' Use Microsoft Word or OpenOffice to save this RTF file'
' as HTML and convert that in calibre.')
self.default_img = message_image('Conversion of WMF images is not '
'supported. Use Microsoft Word '
'or OpenOffice to save this RTF '
'file as HTML and convert that '
'in calibre.')
name = name.replace('.wmf', '.jpg')
with open(name, 'wb') as f:
f.write(self.default_img)
@@ -189,10 +193,10 @@ class RTFInput(InputFormatPlugin):
return name
def write_inline_css(self, ic, border_styles):
font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in
enumerate(ic.font_sizes)]
color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in
enumerate(ic.colors) if x != 'false']
font_size_classes = ['span.fs%d { font-size: %spt }' % (i, x)
for i, x in enumerate(ic.font_sizes)]
color_classes = ['span.col%d { color: %s }' % (i, x)
for i, x in enumerate(ic.colors) if x != 'false']
css = textwrap.dedent('''
span.none {
text-decoration: none; font-weight: normal;
@@ -210,11 +214,11 @@ class RTFInput(InputFormatPlugin):
span.strike-through { text-decoration: line-through }
''')
css += '\n'+'\n'.join(font_size_classes)
css += '\n' +'\n'.join(color_classes)
css += '\n' + '\n'.join(font_size_classes)
css += '\n' + '\n'.join(color_classes)
for cls, val in border_styles.items():
css += '\n\n.%s {\n%s\n}'%(cls, val)
css += '\n\n.%s {\n%s\n}' % (cls, val)
with open(u'styles.css', 'ab') as f:
f.write(css.encode('utf-8'))
@@ -224,35 +228,34 @@ class RTFInput(InputFormatPlugin):
style_map = {}
for elem in doc.xpath(r'//*[local-name()="cell"]'):
style = ['border-style: hidden', 'border-width: 1px',
'border-color: black']
'border-color: black']
for x in ('bottom', 'top', 'left', 'right'):
bs = elem.get('border-cell-%s-style'%x, None)
bs = elem.get('border-cell-%s-style' % x, None)
if bs:
cbs = border_style_map.get(bs, 'solid')
style.append('border-%s-style: %s'%(x, cbs))
bw = elem.get('border-cell-%s-line-width'%x, None)
style.append('border-%s-style: %s' % (x, cbs))
bw = elem.get('border-cell-%s-line-width' % x, None)
if bw:
style.append('border-%s-width: %spt'%(x, bw))
bc = elem.get('border-cell-%s-color'%x, None)
style.append('border-%s-width: %spt' % (x, bw))
bc = elem.get('border-cell-%s-color' % x, None)
if bc:
style.append('border-%s-color: %s'%(x, bc))
style.append('border-%s-color: %s' % (x, bc))
style = ';\n'.join(style)
if style not in border_styles:
border_styles.append(style)
idx = border_styles.index(style)
cls = 'border_style%d'%idx
cls = 'border_style%d' % idx
style_map[cls] = style
elem.set('class', cls)
return style_map
def convert(self, stream, options, file_ext, log,
accelerators):
from lxml import etree
from ebook_converter.ebooks.metadata.meta import get_metadata
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
from ebook_converter.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
from ebook_converter.ebooks.rtf2xml.ParseRtf import \
RtfInvalidCodeException
from ebook_converter.ebooks.rtf.input import InlineClass
from ebook_converter.utils.xml_parse import safe_xml_fromstring
self.opts = options
self.log = log
self.log('Converting RTF to XML...')
@@ -269,14 +272,15 @@ class RTFInput(InputFormatPlugin):
imap = {}
try:
imap = self.extract_images(d[0])
except:
except Exception:
self.log.exception('Failed to extract images...')
self.log('Parsing XML...')
doc = safe_xml_fromstring(xml)
doc = etree.fromstring(xml)
border_styles = self.convert_borders(doc)
for pict in doc.xpath('//rtf:pict[@num]',
namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
namespaces={'rtf':
'http://rtf2xml.sourceforge.net/'}):
num = int(pict.get('num'))
name = imap.get(num, None)
if name is not None:
@@ -286,8 +290,8 @@ class RTFInput(InputFormatPlugin):
inline_class = InlineClass(self.log)
with open(pkg_resources.resource_filename('ebook_converter',
'data/rtf.xsl')) as fobj:
styledoc = safe_xml_fromstring(fobj.read())
extensions = {('calibre', 'inline-class') : inline_class}
styledoc = etree.fromstring(fobj.read())
extensions = {('calibre', 'inline-class'): inline_class}
transform = etree.XSLT(styledoc, extensions=extensions)
result = transform(doc)
html = u'index.xhtml'
@@ -296,7 +300,8 @@ class RTFInput(InputFormatPlugin):
# res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
# clean multiple \n
res = re.sub(b'\n+', b'\n', res)
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
# Replace newlines inserted by the 'empty_paragraphs' option in
# rtf2xml with html blank lines
# res = re.sub('\s*<body>', '<body>', res)
# res = re.sub('(?<=\n)\n{2}',
# u'<p>\u00a0</p>\n'.encode('utf-8'), res)
@@ -316,7 +321,8 @@ class RTFInput(InputFormatPlugin):
def postprocess_book(self, oeb, opts, log):
for item in oeb.spine:
for img in item.data.xpath('//*[local-name()="img" and @src="__REMOVE_ME__"]'):
for img in item.data.xpath('//*[local-name()="img" and '
'@src="__REMOVE_ME__"]'):
p = img.getparent()
idx = p.index(img)
p.remove(img)

View File

@@ -1,27 +1,33 @@
import os
from lxml import etree
from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.utils.filenames import ascii_filename
__license__ = 'GPL 3'
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'
# Skeleton of every generated chapter page.  It takes two ``%s`` values, in
# order: the chapter title and the already-encoded body markup.
HTML_TEMPLATE = ('<html><head><meta http-equiv="Content-Type" '
                 'content="text/html; charset=utf-8"/><title>%s</title>'
                 '</head><body>\n%s\n</body></html>')
def html_encode(s):
    """Escape plain text so it can be embedded in the generated HTML.

    The five XML special characters are replaced by their named entities,
    every newline becomes ``<br/>`` and every space becomes ``&nbsp;``.

    :param s: text to encode; ``None`` (what lxml reports as ``.text`` for
              an element without text) is treated as an empty string.
    :returns: the encoded string.
    """
    if s is None:
        return ''
    # Order matters: '&' has to be handled first, otherwise the ampersands
    # of the entities inserted by the later steps would be escaped again.
    substitutions = (('&', '&amp;'),
                     ('<', '&lt;'),
                     ('>', '&gt;'),
                     ('"', '&quot;'),
                     ("'", '&apos;'),
                     ('\n', '<br/>'),
                     (' ', '&nbsp;'))
    for plain, encoded in substitutions:
        s = s.replace(plain, encoded)
    return s
class SNBInput(InputFormatPlugin):
name = 'SNB Input'
author = 'Li Fanxi'
name = 'SNB Input'
author = 'Li Fanxi'
description = 'Convert SNB files to OEB'
file_types = {'snb'}
file_types = {'snb'}
commit_name = 'snb_input'
options = set()
@@ -32,13 +38,12 @@ class SNBInput(InputFormatPlugin):
from ebook_converter.ebooks.oeb.base import DirContainer
from ebook_converter.ebooks.snb.snbfile import SNBFile
from ebook_converter.utils.xml_parse import safe_xml_fromstring
log.debug("Parsing SNB file...")
snbFile = SNBFile()
try:
snbFile.Parse(stream)
except:
except Exception:
raise ValueError("Invalid SNB file")
if not snbFile.IsValid():
log.debug("Invalid SNB file")
@@ -46,27 +51,28 @@ class SNBInput(InputFormatPlugin):
log.debug("Handle meta data ...")
from ebook_converter.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, None, options,
encoding=options.input_encoding, populate=False)
encoding=options.input_encoding, populate=False)
meta = snbFile.GetFileStream('snbf/book.snbf')
if meta is not None:
meta = safe_xml_fromstring(meta)
l = {'title' : './/head/name',
'creator' : './/head/author',
'language' : './/head/language',
'generator': './/head/generator',
'publisher': './/head/publisher',
'cover' : './/head/cover', }
meta = etree.fromstring(meta)
item_map = {'title': './/head/name',
'creator': './/head/author',
'language': './/head/language',
'generator': './/head/generator',
'publisher': './/head/publisher',
'cover': './/head/cover'}
d = {}
for item in l:
node = meta.find(l[item])
for key, item in item_map.items():
node = meta.find(item)
if node is not None:
d[item] = node.text if node.text is not None else ''
d[key] = node.text if node.text is not None else ''
else:
d[item] = ''
d[key] = ''
oeb.metadata.add('title', d['title'])
oeb.metadata.add('creator', d['creator'], attrib={'role':'aut'})
oeb.metadata.add('language', d['language'].lower().replace('_', '-'))
oeb.metadata.add('creator', d['creator'], attrib={'role': 'aut'})
oeb.metadata.add('language',
d['language'].lower().replace('_', '-'))
oeb.metadata.add('generator', d['generator'])
oeb.metadata.add('publisher', d['publisher'])
if d['cover'] != '':
@@ -84,7 +90,7 @@ class SNBInput(InputFormatPlugin):
toc = snbFile.GetFileStream('snbf/toc.snbf')
oeb.container = DirContainer(tdir, log)
if toc is not None:
toc = safe_xml_fromstring(toc)
toc = etree.fromstring(toc)
i = 1
for ch in toc.find('.//body'):
chapterName = ch.text
@@ -93,18 +99,22 @@ class SNBInput(InputFormatPlugin):
data = snbFile.GetFileStream('snbc/' + chapterSrc)
if data is None:
continue
snbc = safe_xml_fromstring(data)
snbc = etree.fromstring(data)
lines = []
for line in snbc.find('.//body'):
if line.tag == 'text':
lines.append('<p>%s</p>' % html_encode(line.text))
elif line.tag == 'img':
lines.append('<p><img src="%s" /></p>' % html_encode(line.text))
lines.append('<p><img src="%s" /></p>' %
html_encode(line.text))
with open(os.path.join(tdir, fname), 'wb') as f:
f.write((HTML_TEMPLATE % (chapterName, '\n'.join(lines))).encode('utf-8', 'replace'))
f.write((HTML_TEMPLATE %
(chapterName,
'\n'.join(lines))).encode('utf-8',
'replace'))
oeb.toc.add(ch.text, fname)
id, href = oeb.manifest.generate(id='html',
href=ascii_filename(fname))
id, href = oeb.manifest.generate(
id='html', href=ascii_filename(fname))
item = oeb.manifest.add(id, href, 'text/html')
item.html_input_href = fname
oeb.spine.add(item, True)
@@ -112,7 +122,7 @@ class SNBInput(InputFormatPlugin):
imageFiles = snbFile.OutputImageFiles(tdir)
for f, m in imageFiles:
id, href = oeb.manifest.generate(id='image',
href=ascii_filename(f))
href=ascii_filename(f))
item = oeb.manifest.add(id, href, m)
item.html_input_href = f