mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-28 00:35:45 +01:00
This is an ongoing refactor of the calibre code to make it more readable and to transform it into something more coherent. This patch changes the imports of some modules, so that each module's namespace is no longer polluted with symbols of other modules, which were often themselves imported from yet other modules. Yuck.
305 lines
12 KiB
Python
305 lines
12 KiB
Python
"""
|
|
Convert an ODT file into an Open Ebook
|
|
"""
|
|
import logging
|
|
import os
|
|
|
|
from css_parser import CSSParser
|
|
from css_parser.css import CSSRule
|
|
from lxml import etree
|
|
|
|
from odf.odf2xhtml import ODF2XHTML
|
|
from odf.opendocument import load as odLoad
|
|
from odf.draw import Frame as odFrame, Image as odImage
|
|
from odf.namespaces import TEXTNS as odTEXTNS
|
|
|
|
from ebook_converter import CurrentDir, walk
|
|
from ebook_converter.ebooks.oeb.base import _css_logger
|
|
from ebook_converter.polyglot.builtins import as_bytes
|
|
|
|
|
|
class Extract(ODF2XHTML):
|
|
|
|
def extract_pictures(self, zf):
    """Write the picture members of an archive below ``./Pictures``.

    Every member of ``zf`` whose name starts with ``Pictures`` is written,
    under its own relative name, below the current working directory.

    :param zf: an opened zip archive providing ``namelist()`` and
               ``read(name)``.

    The archive comes from the document being converted and is therefore
    not trusted: directory entries are skipped, missing sub-directories
    are created on demand, and members whose name would resolve outside
    of the current working directory (``Pictures/../../x``) are ignored.
    """
    os.makedirs('Pictures', exist_ok=True)
    # Trailing separator, so that "/work" is not taken as a prefix of
    # "/workshop".
    allowed_prefix = os.path.join(os.path.abspath(os.getcwd()), '')
    for name in zf.namelist():
        if not name.startswith('Pictures'):
            continue
        if name == 'Pictures' or name.endswith('/'):
            # The picture directory itself or a nested directory entry:
            # there is no file content to write.
            continue
        dest = os.path.abspath(name)
        if not dest.startswith(allowed_prefix):
            # Path traversal attempt, never write outside of the cwd.
            continue
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        with open(dest, 'wb') as f:
            f.write(zf.read(name))
|
|
|
|
def apply_list_starts(self, root, log):
    """Set the ``start`` attribute on ``<ol>`` elements with a known start.

    ``self.list_starts`` is looked up with keys of the form ``.classname``.
    For every ``ol`` element carrying a ``class`` attribute, the first of
    its classes (in attribute order) that has an entry decides the value
    written to ``start``. Picking the first match, rather than an arbitrary
    member of a set intersection, keeps the result stable between runs.

    :param root: parsed document providing ``xpath()``; modified in place.
    :param log: unused, kept for a uniform signature with the other
                markup passes.
    """
    starts = self.list_starts
    if not starts:
        return
    for ol in root.xpath('//*[local-name() = "ol" and @class]'):
        for cls in ol.get('class', '').split():
            key = '.' + cls
            if key in starts:
                ol.set('start', starts[key])
                break
|
|
|
|
def fix_markup(self, html, log):
    """Parse the generated markup, run the clean-up passes, serialize it.

    The passes are applied to the parsed tree in this order:
    ``filter_css``, ``extract_css``, ``epubify_markup`` and
    ``apply_list_starts``.

    :param html: markup handed to ``etree.fromstring``.
    :param log: logger passed on to every pass.
    :returns: the tree serialized by ``etree.tostring`` as UTF-8 with an
              XML declaration.
    """
    tree = etree.fromstring(html)
    passes = (self.filter_css, self.extract_css, self.epubify_markup,
              self.apply_list_starts)
    for run_pass in passes:
        run_pass(tree, log)
    return etree.tostring(tree, encoding='utf-8', xml_declaration=True)
|
|
|
|
def extract_css(self, root, log):
    """Move the embedded stylesheets of the document to ``odfpy.css``.

    All ``style`` elements with ``type="text/css"`` are removed from
    ``root`` and their text is joined with blank lines. When the document
    has a ``head`` element, a ``link`` element pointing to ``odfpy.css`` is
    appended to the first one. The joined CSS is parsed into ``self.css``
    and written, UTF-8 encoded, to ``odfpy.css`` in the current working
    directory.

    :param root: parsed document; modified in place.
    :param log: unused, kept for a uniform signature with the other
                markup passes.
    """
    chunks = []
    for node in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
        # An empty <style/> element has no text (None); joining None
        # would raise TypeError.
        chunks.append(node.text or '')
        node.getparent().remove(node)

    heads = root.xpath('//*[local-name() = "head"]')
    if heads:
        head = heads[0]
        # Create the link in the default namespace of <head>, if any.
        ns = head.nsmap.get(None, '')
        tag = '{%s}link' % ns if ns else 'link'
        etree.SubElement(head, tag, {'type': 'text/css',
                                     'rel': 'stylesheet',
                                     'href': 'odfpy.css'})

    css = '\n\n'.join(chunks)
    parser = CSSParser(loglevel=logging.WARNING, log=_css_logger)
    self.css = parser.parseString(css, validate=False)

    with open('odfpy.css', 'wb') as f:
        f.write(css.encode('utf-8'))
|
|
|
|
def get_css_for_class(self, cls):
    """Return the first style rule of ``self.css`` selecting ``.<cls>``.

    :param cls: class name without the leading dot.
    :returns: the first style rule having a selector whose text is exactly
              ``'.' + cls``; ``None`` when ``cls`` is empty or no rule
              matches.
    """
    if not cls:
        return None
    wanted = '.' + cls
    for rule in self.css.cssRules.rulesOfType(CSSRule.STYLE_RULE):
        if any(sel.selectorText == wanted for sel in rule.selectorList):
            return rule
    return None
|
|
|
|
def epubify_markup(self, root, log):
    """Adjust the generated markup, in place, for EPUB renderers.

    The following changes are made to ``root``:

    * ``title`` elements without text get a single space as text;
    * a ``p`` that directly contains a ``div`` is retagged as ``div``;
    * a ``div`` whose only child is an ``img`` with a style attribute gets
      ``position:static`` appended to its own style, and the style of the
      image is replaced by ``max-width: 100%; max-height: 100%``;
    * for ``div/div/img`` chains where both divs have exactly one child,
      auto side margins found in the CSS rules of the inner div are mapped
      to a ``text-align`` declaration on the outer div, and the inner div
      is made ``display:inline``.

    :param root: parsed document.
    :param log: unused, kept for a uniform signature with the other
                markup passes.
    """
    from ebook_converter.ebooks.oeb.base import XPath, XHTML
    # Fix empty title tags
    for t in XPath('//h:title')(root):
        if not t.text:
            t.text = ' '
    # Fix <p><div> constructs as the asinine epubchecker complains
    # about them
    for div in XPath('//h:p/h:div')(root):
        div.getparent().tag = XHTML('div')

    # Remove the position:relative as it causes problems with some epub
    # renderers. Remove display: block on an image inside a div as it is
    # redundant and prevents text-align:center from working in ADE
    # Also ensure that the img is contained in its containing div
    for img in XPath('//h:div/h:img[@style]')(root):
        div = img.getparent()
        if len(div) != 1:
            continue
        style = div.attrib.get('style', '')
        if style and not style.endswith(';'):
            style = style + ';'
        # Ensures position of containing div is static
        style += 'position:static'
        div.attrib['style'] = style
        # Ensure that the img is always contained in its frame
        img.attrib['style'] = 'max-width: 100%; max-height: 100%'

    # Handle anchored images. The default markup + CSS produced by
    # odf2xhtml works with WebKit but not with ADE. So we convert the
    # common cases of left/right/center aligned block images to work on
    # both webkit and ADE. We detect the case of setting the side margins
    # to auto and map it to an appropriate text-align directive, which
    # works in both WebKit and ADE.
    # https://bugs.launchpad.net/bugs/1063207
    # https://bugs.launchpad.net/calibre/+bug/859343
    for img in XPath('descendant::h:div/h:div/h:img')(root):
        div2 = img.getparent()
        div1 = div2.getparent()
        if (len(div1), len(div2)) != (1, 1):
            continue

        outer_rules = filter(None, [self.get_css_for_class(x)
                                    for x in div1.get('class', '').split()])
        has_align = any(r.style.getProperty('text-align') is not None
                        for r in outer_rules)

        if not has_align:
            ml = mr = None
            inner_rules = filter(None,
                                 [self.get_css_for_class(x)
                                  for x in div2.get('class', '').split()])
            for r in inner_rules:
                left = r.style.getPropertyCSSValue('margin-left')
                right = r.style.getPropertyCSSValue('margin-right')
                # A rule that does not set a margin must not discard the
                # value found in an earlier rule.
                if left:
                    ml = getattr(left, 'value', None)
                if right:
                    mr = getattr(right, 'value', None)

            aval = None
            if ml == mr == 'auto':
                aval = 'center'
            elif ml == 'auto' and mr != 'auto':
                aval = 'right'
            elif ml != 'auto' and mr == 'auto':
                aval = 'left'

            if aval is not None:
                style = div1.attrib.get('style', '').strip()
                if style and not style.endswith(';'):
                    style = style + ';'
                style += 'text-align:%s' % aval
                has_align = True
                div1.attrib['style'] = style

        if has_align:
            # This is needed for ADE, without it the text-align has no
            # effect. The inner div may carry no style attribute at all,
            # so do not index the attribute mapping directly.
            style = div2.attrib.get('style', '')
            div2.attrib['style'] = 'display:inline;' + style
|
|
|
|
def filter_css(self, root, log):
    """Simplify the first embedded stylesheet and patch class attributes.

    The text of the first ``style`` element with ``type="text/css"`` is
    passed through ``do_filter_css`` and replaced by the result. Every
    element with a ``class`` attribute then gets the replacement classes
    that ``do_filter_css`` reported for its classes appended. Nothing
    happens when there is no such ``style`` element or it has no text.

    :param root: parsed document; modified in place.
    :param log: unused, kept for a uniform signature with the other
                markup passes.
    """
    matches = root.xpath('//*[local-name() = "style" and @type="text/css"]')
    if not matches:
        return
    node = matches[0]
    css = node.text
    if not css:
        return

    css, sel_map = self.do_filter_css(css)
    if not isinstance(css, str):
        css = css.decode('utf-8', 'ignore')
    node.text = css

    for elem in root.xpath('//*[@class]'):
        current = elem.get('class')
        additions = []
        for name in current.split():
            additions += sel_map.get(name, [])
        if additions:
            elem.set('class', current + ' ' + ' '.join(additions))
|
|
|
|
def do_filter_css(self, css):
    """Collapse grouped class selectors into one generated class each.

    A style rule qualifies when it has more than one selector and the text
    of every selector starts with a dot. Its selector list is replaced by
    a single ``.c_odt<N>`` selector, ``N`` counting the rewritten rules
    from 0.

    :param css: stylesheet text.
    :returns: a tuple ``(css_text, sel_map)``: the ``cssText`` of the
              rewritten sheet and a dict mapping each original class name
              (without the dot) to the list of generated class names that
              replace it.
    """
    from css_parser import parseString
    from css_parser.css import CSSRule
    sheet = parseString(css, validate=False)
    sel_map = {}
    count = 0
    for rule in list(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)):
        selectors = rule.selectorList
        # Only rules made up exclusively of class selectors are rewritten
        only_classes = all(s.selectorText.startswith('.')
                           for s in selectors)
        if len(selectors) <= 1 or not only_classes:
            continue
        # The generated class is added later to the class attribute of
        # all elements that use one of the replaced selectors.
        replace_name = 'c_odt%d' % count
        count += 1
        for s in selectors:
            sel_map.setdefault(s.selectorText[1:], []).append(replace_name)
        rule.selectorText = '.' + replace_name
    return sheet.cssText, sel_map
|
|
|
|
def search_page_img(self, mi, log):
    """Warn once if the document contains a frame anchored to a page.

    The frames of ``self.document`` are scanned until the first one whose
    ``anchor-type`` text attribute equals ``page``; a warning is logged
    and the scan stops. A ``ValueError`` raised while handling a frame is
    ignored and the scan goes on with the next frame.

    :param mi: unused.
    :param log: logger providing ``warn()``.
    """
    frames = self.document.topnode.getElementsByType(odFrame)
    for frame in frames:
        try:
            anchored_to_page = frame.getAttrNS(odTEXTNS,
                                               'anchor-type') == 'page'
            if anchored_to_page:
                log.warn('Document has Pictures anchored to Page, will all end up before first page!')
                return
        except ValueError:
            continue
|
|
|
|
def filter_cover(self, mi, log):
    """Remove the paragraph holding the detected cover image.

    Only acts when both ``mi.cover`` and ``mi.odf_cover_frame`` are set.
    The first frame named ``mi.odf_cover_frame`` that holds exactly one
    image whose ``href`` equals ``mi.cover`` ends the search. Its parent
    is removed from the document only when the frame has a single child
    and the parent is a ``text:p`` with a single child.

    :param mi: metadata object providing ``cover`` and ``odf_cover_frame``.
    :param log: callable used to report the removal.
    """
    if not (mi.cover and mi.odf_cover_frame):
        return
    # filter the Element tree (remove the detected cover)
    for frame in self.document.topnode.getElementsByType(odFrame):
        # search the right frame
        if frame.getAttribute('name') != mi.odf_cover_frame:
            continue
        images = frame.getElementsByType(odImage)
        # only one draw:image allowed in the draw:frame
        if len(images) != 1 or images[0].getAttribute('href') != mi.cover:
            continue
        # This is the right frame with the right image; give up if the
        # frame has more children.
        if len(frame.childNodes) != 1:
            return
        # Give up as well if the parent is not a paragraph or has more
        # children.
        paragraph = frame.parentNode
        if paragraph.tagName != 'text:p' or len(paragraph.childNodes) != 1:
            return
        # now it should be safe to remove the text:p
        paragraph.parentNode.removeChild(paragraph)
        log("Removed cover image paragraph from document...")
        return
|
|
|
|
def filter_load(self, odffile, mi, log):
    """Load the document, filter its element tree and generate the XHTML.

    This is an adaption from ODF2XHTML. It adds a step between load and
    parse of the document where the Element tree can be modified.

    :param odffile: a ``str``/``bytes`` value or an object with a ``read``
                    attribute, which is loaded with ``odLoad``; anything
                    else is used as the document as is.
    :param mi: metadata object handed to the filter steps.
    :param log: logger handed to the filter steps.

    The generated output is collected in ``self.lines``.
    """
    # first load the odf structure
    self.lines = []
    self._wfunc = self._wlines
    if isinstance(odffile, (str, bytes)) \
            or hasattr(odffile, 'read'):  # Added by Kovid
        self.document = odLoad(odffile)
    else:
        self.document = odffile
    # filter stuff
    self.search_page_img(mi, log)
    try:
        self.filter_cover(mi, log)
    except Exception:
        # Removing the cover is best effort only: carry on with the
        # conversion, but leave a trace of what went wrong. A bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        log.exception('Failed to remove the cover image from the document')
    # parse the modified tree and generate xhtml
    self._walknode(self.document.topnode)
|
|
|
|
def __call__(self, stream, odir, log):
    """Convert the ODT document in ``stream`` into an OEB tree in ``odir``.

    Inside ``odir`` (created when missing) this writes ``index.xhtml``,
    the files extracted by ``extract_pictures`` and ``metadata.opf``,
    whose manifest lists every file found below ``odir`` and whose spine
    holds ``index.xhtml``. ``odfpy.css`` is written as well when
    ``fix_markup`` succeeds.

    :param stream: seekable file object with the ODT data.
    :param odir: output directory.
    :param log: logger; called directly and used through ``exception()``.
    :returns: absolute path of the written ``metadata.opf``.
    """
    from xml.sax.saxutils import escape

    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.ebooks.metadata.odt import get_metadata
    from ebook_converter.ebooks.metadata.opf2 import OPFCreator

    os.makedirs(odir, exist_ok=True)
    with CurrentDir(odir):
        log('Extracting ODT file...')
        stream.seek(0)
        mi = get_metadata(stream, 'odt')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']
        self.filter_load(stream, mi, log)

        # NOTE(gryf): Here is a workaround for ODF2XHTML.xhtml() method,
        # which expects, that all lines are strings.
        html = ''.join(str(line) for line in self.lines)

        # A blanket img specification like this causes problems
        # with EPUB output as the containing element often has
        # an absolute height and width set that is larger than
        # the available screen real estate
        html = html.replace('img { width: 100%; height: 100%; }', '')
        # odf2xhtml creates empty title tag. The title is plain text, so
        # it has to be escaped: a bare "&" or "<" would make the markup
        # unparseable.
        html = html.replace('<title></title>',
                            '<title>%s</title>' % escape(mi.title))
        try:
            html = self.fix_markup(html, log)
        except Exception:
            # Keep the unfiltered markup and carry on.
            log.exception('Failed to filter CSS, conversion may be slow')
        with open('index.xhtml', 'wb') as f:
            f.write(as_bytes(html))

        zf = ZipFile(stream, 'r')
        try:
            self.extract_pictures(zf)
        finally:
            zf.close()

        opf = OPFCreator(os.path.abspath(os.getcwd()), mi)
        opf.create_manifest([(os.path.abspath(f2), None) for f2 in
                             walk(os.getcwd())])
        opf.create_spine([os.path.abspath('index.xhtml')])
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.abspath('metadata.opf')
|