1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-03 09:14:11 +01:00
Files
ebook-converter/ebook_converter/ebooks/odt/input.py

310 lines
13 KiB
Python

"""
Convert an ODT file into an Open Ebook
"""
import os, logging
from lxml import etree
from css_parser import CSSParser
from css_parser.css import CSSRule
from odf.odf2xhtml import ODF2XHTML
from odf.opendocument import load as odLoad
from odf.draw import Frame as odFrame, Image as odImage
from odf.namespaces import TEXTNS as odTEXTNS
from ebook_converter import CurrentDir, walk
from ebook_converter.ebooks.oeb.base import _css_logger
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.polyglot.builtins import as_bytes
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
class Extract(ODF2XHTML):
def extract_pictures(self, zf):
    """Write the pictures stored in the ODT archive below ``./Pictures``.

    Every archive member whose name starts with ``Pictures`` is written,
    under its normalized name, relative to the current working directory.
    Intermediate directories are created as needed.

    :param zf: Open zip archive exposing ``namelist()`` and ``read(name)``.
    """
    os.makedirs('Pictures', exist_ok=True)
    for name in zf.namelist():
        if not name.startswith('Pictures'):
            continue
        # Member names come straight from the document and cannot be
        # trusted. Resolve ``..`` components first and skip every name
        # that afterwards no longer starts with ``Pictures``, so a crafted
        # archive cannot write outside of the picture area (for example
        # ``Pictures/../index.xhtml``).
        dest = os.path.normpath(name)
        if dest == 'Pictures' or not dest.startswith('Pictures'):
            continue
        if name.endswith('/'):
            # Directory entry of a nested folder, there is no data to write
            os.makedirs(dest, exist_ok=True)
            continue
        parent = os.path.dirname(dest)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(dest, 'wb') as f:
            f.write(zf.read(name))
def apply_list_starts(self, root, log):
    """Set the ``start`` attribute on ordered lists with a custom start.

    ``self.list_starts`` maps a class selector (``'.' + class name``) to
    the start value of the list style; presumably it is filled in while
    the ODF styles are parsed (TODO confirm against the base class).

    When an ``<ol>`` carries several matching classes, the first one in
    the ``class`` attribute wins, so the output does not depend on set
    iteration order.

    :param root: Root element of the parsed XHTML document, modified in
                 place.
    :param log: Logger; not used by this method.
    """
    list_starts = self.list_starts
    if not list_starts:
        return
    for ol in root.xpath('//*[local-name() = "ol" and @class]'):
        for cls in ol.get('class', '').split():
            selector = '.' + cls
            if selector in list_starts:
                ol.set('start', list_starts[selector])
                break
def fix_markup(self, html, log):
    """Run all markup clean-up steps over the generated XHTML.

    The document is parsed, passed in turn through ``filter_css``,
    ``extract_css``, ``epubify_markup`` and ``apply_list_starts``, and
    serialized again.

    :param html: XHTML document as produced by the ODF conversion.
    :param log: Logger handed to every clean-up step.
    :return: The cleaned document as UTF-8 encoded bytes, including the
             XML declaration.
    """
    tree = safe_xml_fromstring(html)
    steps = (
        self.filter_css,
        self.extract_css,
        self.epubify_markup,
        self.apply_list_starts,
    )
    for step in steps:
        step(tree, log)
    return etree.tostring(tree, encoding='utf-8', xml_declaration=True)
def extract_css(self, root, log):
    """Move the inline stylesheets of the document into ``odfpy.css``.

    All ``<style type="text/css">`` elements are removed from the tree and
    their text is concatenated. A ``<link>`` to ``odfpy.css`` is added to
    the first ``<head>`` element, the combined CSS is parsed into
    ``self.css`` and written, UTF-8 encoded, to ``odfpy.css`` in the
    current working directory.

    :param root: Root element of the parsed XHTML document, modified in
                 place.
    :param log: Logger; not used by this method.
    """
    chunks = []
    for style in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
        # An empty <style/> element has no text (None); without the
        # fallback the join below raises TypeError.
        chunks.append(style.text or '')
        style.getparent().remove(style)

    heads = root.xpath('//*[local-name() = "head"]')
    if heads:
        head = heads[0]
        # Create the link in the default namespace of <head>, if any
        ns = head.nsmap.get(None, '')
        if ns:
            ns = '{%s}' % ns
        etree.SubElement(head, ns + 'link', {'type': 'text/css',
                                             'rel': 'stylesheet',
                                             'href': 'odfpy.css'})

    css = '\n\n'.join(chunks)
    parser = CSSParser(loglevel=logging.WARNING, log=_css_logger)
    self.css = parser.parseString(css, validate=False)

    with open('odfpy.css', 'wb') as f:
        f.write(css.encode('utf-8'))
def get_css_for_class(self, cls):
    """Return the first style rule of ``self.css`` selecting ``.cls``.

    A rule matches when one of its selectors is exactly ``'.' + cls``.

    :param cls: Class name without the leading dot.
    :return: The matching rule, or ``None`` when ``cls`` is empty or no
             rule matches.
    """
    if not cls:
        return None
    wanted = '.' + cls
    for rule in self.css.cssRules.rulesOfType(CSSRule.STYLE_RULE):
        if any(sel.selectorText == wanted for sel in rule.selectorList):
            return rule
    return None
def epubify_markup(self, root, log):
    """Rework the generated XHTML so that it renders well as EPUB.

    The tree is modified in place:

    * empty ``<title>`` elements get a single space as text,
    * every ``<p>`` that directly contains a ``<div>`` becomes a ``<div>``,
    * a ``<div>`` whose only child is a styled ``<img>`` gets
      ``position:static`` and the image is limited to the size of its
      container,
    * for ``div > div > img`` frames the alignment expressed through auto
      side margins is mapped to ``text-align`` on the outer div, and the
      inner div is switched to ``display:inline``.

    :param root: Root element of the parsed XHTML document.
    :param log: Logger; not used by this method.
    """
    from ebook_converter.ebooks.oeb.base import XPath, XHTML

    # Fix empty title tags
    for title in XPath('//h:title')(root):
        if not title.text:
            title.text = ' '

    # Fix <p><div> constructs as epubcheck complains about them
    for div in XPath('//h:p/h:div')(root):
        div.getparent().tag = XHTML('div')

    # Remove the position:relative as it causes problems with some epub
    # renderers. Remove display: block on an image inside a div as it is
    # redundant and prevents text-align:center from working in ADE
    # Also ensure that the img is contained in its containing div
    for img in XPath('//h:div/h:img[@style]')(root):
        div = img.getparent()
        if len(div) == 1:
            style = div.attrib.get('style', '')
            if style and not style.endswith(';'):
                style = style + ';'
            # Ensures position of containing div is static
            style += 'position:static'
            div.attrib['style'] = style
            # Ensure that the img is always contained in its frame
            img.attrib['style'] = 'max-width: 100%; max-height: 100%'

    # Handle anchored images. The default markup + CSS produced by
    # odf2xhtml works with WebKit but not with ADE. So we convert the
    # common cases of left/right/center aligned block images to work on
    # both webkit and ADE. We detect the case of setting the side margins
    # to auto and map it to an appropriate text-align directive, which
    # works in both WebKit and ADE.
    # https://bugs.launchpad.net/bugs/1063207
    # https://bugs.launchpad.net/calibre/+bug/859343
    for img in XPath('descendant::h:div/h:div/h:img')(root):
        div2 = img.getparent()
        div1 = div2.getparent()
        if (len(div1), len(div2)) != (1, 1):
            continue

        outer_rules = [rule for rule in
                       (self.get_css_for_class(x)
                        for x in div1.get('class', '').split())
                       if rule]
        has_align = any(rule.style.getProperty('text-align') is not None
                        for rule in outer_rules)

        if not has_align:
            inner_rules = [rule for rule in
                           (self.get_css_for_class(x)
                            for x in div2.get('class', '').split())
                           if rule]
            ml = mr = None
            for rule in inner_rules:
                ml = rule.style.getPropertyCSSValue('margin-left') or ml
                mr = rule.style.getPropertyCSSValue('margin-right') or mr
            # Read the plain values only after all rules were merged, so a
            # margin found in one rule is not lost when a later rule does
            # not define it.
            ml = getattr(ml, 'value', None)
            mr = getattr(mr, 'value', None)

            aval = None
            if ml == mr == 'auto':
                aval = 'center'
            elif ml == 'auto':
                aval = 'right'
            elif mr == 'auto':
                aval = 'left'

            if aval is not None:
                style = div1.attrib.get('style', '').strip()
                if style and not style.endswith(';'):
                    style = style + ';'
                style += 'text-align:%s' % aval
                has_align = True
                div1.attrib['style'] = style

        if has_align:
            # This is needed for ADE, without it the text-align has no
            # effect. The inner div may come without a style attribute,
            # so do not insist on one.
            style = div2.attrib.get('style', '')
            div2.attrib['style'] = 'display:inline;' + style
def filter_css(self, root, log):
    """Simplify the first inline stylesheet and patch ``class`` attributes.

    The text of the first ``<style type="text/css">`` element is passed
    through ``do_filter_css`` and replaced by the result. Every element
    with a ``class`` attribute then gets the replacement class names
    appended that ``do_filter_css`` reported for its classes.

    Nothing happens when there is no such element or it has no text.

    :param root: Root element of the parsed XHTML document, modified in
                 place.
    :param log: Logger; not used by this method.
    """
    styles = root.xpath('//*[local-name() = "style" and @type="text/css"]')
    if not styles:
        return
    first = styles[0]
    css = first.text
    if not css:
        return

    css, sel_map = self.do_filter_css(css)
    if not isinstance(css, str):
        css = css.decode('utf-8', 'ignore')
    first.text = css

    for elem in root.xpath('//*[@class]'):
        current = elem.get('class')
        additions = [name
                     for cls in current.split()
                     for name in sel_map.get(cls, [])]
        if additions:
            elem.set('class', current + ' ' + ' '.join(additions))
def do_filter_css(self, css):
    """Collapse rules with several class selectors into one class each.

    A style rule that has more than one selector, all of them starting
    with ``.``, gets its selector list replaced by a single generated
    class ``.c_odtN`` (``N`` counts from 0).

    :param css: Stylesheet text.
    :return: Tuple ``(css_text, sel_map)``; ``css_text`` is the serialized
             stylesheet and ``sel_map`` maps every replaced selector
             (without the leading dot) to the list of generated class
             names that now stand for it.
    """
    from css_parser import parseString
    from css_parser.css import CSSRule

    sheet = parseString(css, validate=False)
    sel_map = {}
    merged = 0
    for rule in list(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)):
        selectors = rule.selectorList
        if len(selectors) <= 1:
            continue
        # Only rules made up purely of class selectors are rewritten
        if not all(sel.selectorText.startswith('.') for sel in selectors):
            continue
        # The generated class is added later to the class attribute of
        # all elements that carry one of the replaced classes.
        replacement = 'c_odt%d' % merged
        merged += 1
        for sel in selectors:
            sel_map.setdefault(sel.selectorText[1:], []).append(replacement)
        rule.selectorText = '.' + replacement
    return sheet.cssText, sel_map
def search_page_img(self, mi, log):
    """Warn once when the document contains a frame anchored to a page.

    The search stops at the first frame whose ``text:anchor-type`` is
    ``page``. Frames for which the lookup raises ``ValueError`` are
    skipped.

    :param mi: Metadata of the document; not used by this method.
    :param log: Logger providing ``warn``.
    """
    for frame in self.document.topnode.getElementsByType(odFrame):
        try:
            on_page = frame.getAttrNS(odTEXTNS, u'anchor-type') == 'page'
            if on_page:
                log.warn('Document has Pictures anchored to Page, will all end up before first page!')
        except ValueError:
            continue
        if on_page:
            break
def filter_cover(self, mi, log):
    """Remove the paragraph holding the detected cover image.

    Does nothing unless both ``mi.cover`` and ``mi.odf_cover_frame`` are
    set. The paragraph is only removed when the frame named
    ``mi.odf_cover_frame`` holds exactly one image whose ``href`` is
    ``mi.cover``, the frame has a single child, and its parent is a
    ``text:p`` with the frame as its only child.

    :param mi: Metadata of the document.
    :param log: Callable logger.
    """
    if not (mi.cover and mi.odf_cover_frame):
        return
    for frame in self.document.topnode.getElementsByType(odFrame):
        # Search for the right frame
        if frame.getAttribute('name') != mi.odf_cover_frame:
            continue
        # Only one draw:image is allowed in the draw:frame
        images = frame.getElementsByType(odImage)
        if len(images) != 1 or images[0].getAttribute('href') != mi.cover:
            continue
        # This is the right frame with the right image. Give up if the
        # frame has more children ...
        if len(frame.childNodes) != 1:
            break
        # ... or if the parent paragraph holds anything besides the frame
        para = frame.parentNode
        if para.tagName != 'text:p' or len(para.childNodes) != 1:
            break
        # Now it should be safe to remove the text:p
        para.parentNode.removeChild(para)
        log("Removed cover image paragraph from document...")
        break
def filter_load(self, odffile, mi, log):
    """Load the document, filter its element tree and generate XHTML.

    This is an adaptation of the loading done by ODF2XHTML. It adds a step
    between load and parse of the document where the element tree can be
    modified.

    :param odffile: Path (``str``/``bytes``) or readable file object of the
                    document; any other object is used as the already
                    loaded document.
    :param mi: Metadata of the document, handed to the filters.
    :param log: Logger handed to the filters.
    """
    # First load the odf structure
    self.lines = []
    self._wfunc = self._wlines
    if isinstance(odffile, (str, bytes)) \
            or hasattr(odffile, 'read'):  # Added by Kovid
        self.document = odLoad(odffile)
    else:
        self.document = odffile

    # Filter stuff
    self.search_page_img(mi, log)
    try:
        self.filter_cover(mi, log)
    except Exception:
        # Removing the cover is best effort only, but leave a trace of the
        # failure and do not swallow KeyboardInterrupt/SystemExit.
        log.exception('Failed to remove the cover image from the document')

    # Parse the modified tree and generate xhtml
    self._walknode(self.document.topnode)
def __call__(self, stream, odir, log):
    """Convert the ODT document in ``stream`` into files below ``odir``.

    Writes ``index.xhtml``, the extracted pictures and ``metadata.opf``
    into ``odir``; the markup clean-up additionally writes ``odfpy.css``.

    :param stream: Seekable binary file object holding the ODT document.
    :param odir: Output directory, created when missing.
    :param log: Logger; it is called directly and its ``exception`` method
                is used.
    :return: Absolute path of the generated ``metadata.opf``.
    """
    from xml.sax.saxutils import escape

    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.ebooks.metadata.odt import get_metadata
    from ebook_converter.ebooks.metadata.opf2 import OPFCreator

    os.makedirs(odir, exist_ok=True)
    with CurrentDir(odir):
        log('Extracting ODT file...')
        stream.seek(0)
        mi = get_metadata(stream, 'odt')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        self.filter_load(stream, mi, log)
        # NOTE(gryf): Here is a workaround for ODF2XHTML.xhtml() method,
        # which expects, that all lines are strings.
        html = ''.join(str(line) for line in self.lines)
        # A blanket img specification like this causes problems
        # with EPUB output as the containing element often has
        # an absolute height and width set that is larger than
        # the available screen real estate
        html = html.replace('img { width: 100%; height: 100%; }', '')
        # odf2xhtml creates empty title tag. The title is plain text, so
        # it has to be escaped, otherwise a title containing "&" or "<"
        # makes the document unparsable.
        html = html.replace('<title></title>',
                            '<title>%s</title>' % escape(str(mi.title)))
        try:
            html = self.fix_markup(html, log)
        except Exception:
            log.exception('Failed to filter CSS, conversion may be slow')
        with open('index.xhtml', 'wb') as f:
            f.write(as_bytes(html))
        zf = ZipFile(stream, 'r')
        try:
            self.extract_pictures(zf)
        finally:
            zf.close()
        cwd = os.getcwd()
        opf = OPFCreator(os.path.abspath(cwd), mi)
        opf.create_manifest([(os.path.abspath(path), None)
                             for path in walk(cwd)])
        opf.create_spine([os.path.abspath('index.xhtml')])
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.abspath('metadata.opf')