""" Convert an ODT file into a Open Ebook """ import os, logging from lxml import etree from css_parser import CSSParser from css_parser.css import CSSRule from odf.odf2xhtml import ODF2XHTML from odf.opendocument import load as odLoad from odf.draw import Frame as odFrame, Image as odImage from odf.namespaces import TEXTNS as odTEXTNS from ebook_converter import CurrentDir, walk from ebook_converter.ebooks.oeb.base import _css_logger from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.polyglot.builtins import as_bytes __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' class Extract(ODF2XHTML): def extract_pictures(self, zf): if not os.path.exists('Pictures'): os.makedirs('Pictures') for name in zf.namelist(): if name.startswith('Pictures') and name not in {'Pictures', 'Pictures/'}: data = zf.read(name) with open(name, 'wb') as f: f.write(data) def apply_list_starts(self, root, log): if not self.list_starts: return list_starts = frozenset(self.list_starts) for ol in root.xpath('//*[local-name() = "ol" and @class]'): classes = {'.' + x for x in ol.get('class', '').split()} found = classes & list_starts if found: val = self.list_starts[next(iter(found))] ol.set('start', val) def fix_markup(self, html, log): root = safe_xml_fromstring(html) self.filter_css(root, log) self.extract_css(root, log) self.epubify_markup(root, log) self.apply_list_starts(root, log) html = etree.tostring(root, encoding='utf-8', xml_declaration=True) return html def extract_css(self, root, log): ans = [] for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'): ans.append(s.text) s.getparent().remove(s) head = root.xpath('//*[local-name() = "head"]') if head: head = head[0] ns = head.nsmap.get(None, '') if ns: ns = '{%s}'%ns etree.SubElement(head, ns+'link', {'type':'text/css', 'rel':'stylesheet', 'href':'odfpy.css'}) css = u'\n\n'.join(ans) parser = CSSParser(loglevel=logging.WARNING, log=_css_logger) self.css = parser.parseString(css, validate=False) with open('odfpy.css', 'wb') as f: f.write(css.encode('utf-8')) def get_css_for_class(self, cls): if not cls: return None for rule in self.css.cssRules.rulesOfType(CSSRule.STYLE_RULE): for sel in rule.selectorList: q = sel.selectorText if q == '.' + cls: return rule def epubify_markup(self, root, log): from ebook_converter.ebooks.oeb.base import XPath, XHTML # Fix empty title tags for t in XPath('//h:title')(root): if not t.text: t.text = u' ' # Fix

constructs as the asinine epubchecker complains # about them pdiv = XPath('//h:p/h:div') for div in pdiv(root): div.getparent().tag = XHTML('div') # Remove the position:relative as it causes problems with some epub # renderers. Remove display: block on an image inside a div as it is # redundant and prevents text-align:center from working in ADE # Also ensure that the img is contained in its containing div imgpath = XPath('//h:div/h:img[@style]') for img in imgpath(root): div = img.getparent() if len(div) == 1: style = div.attrib.get('style', '') if style and not style.endswith(';'): style = style + ';' style += 'position:static' # Ensures position of containing div is static # Ensure that the img is always contained in its frame div.attrib['style'] = style img.attrib['style'] = 'max-width: 100%; max-height: 100%' # Handle anchored images. The default markup + CSS produced by # odf2xhtml works with WebKit but not with ADE. So we convert the # common cases of left/right/center aligned block images to work on # both webkit and ADE. We detect the case of setting the side margins # to auto and map it to an appropriate text-align directive, which # works in both WebKit and ADE. # https://bugs.launchpad.net/bugs/1063207 # https://bugs.launchpad.net/calibre/+bug/859343 imgpath = XPath('descendant::h:div/h:div/h:img') for img in imgpath(root): div2 = img.getparent() div1 = div2.getparent() if (len(div1), len(div2)) != (1, 1): continue cls = div1.get('class', '') first_rules = list(filter(None, [self.get_css_for_class(x) for x in cls.split()])) has_align = False for r in first_rules: if r.style.getProperty(u'text-align') is not None: has_align = True ml = mr = None if not has_align: aval = None cls = div2.get(u'class', u'') rules = list(filter(None, [self.get_css_for_class(x) for x in cls.split()])) for r in rules: ml = r.style.getPropertyCSSValue(u'margin-left') or ml mr = r.style.getPropertyCSSValue(u'margin-right') or mr ml = getattr(ml, 'value', None) mr = getattr(mr, 'value', None) if ml == mr == u'auto': aval = u'center' elif ml == u'auto' and mr != u'auto': aval = 'right' elif ml != u'auto' and mr == u'auto': aval = 'left' if aval is not None: style = div1.attrib.get('style', '').strip() if style and not style.endswith(';'): style = style + ';' style += 'text-align:%s'%aval has_align = True div1.attrib['style'] = style if has_align: # This is needed for ADE, without it the text-align has no # effect style = div2.attrib['style'] div2.attrib['style'] = 'display:inline;'+style def filter_css(self, root, log): style = root.xpath('//*[local-name() = "style" and @type="text/css"]') if style: style = style[0] css = style.text if css: css, sel_map = self.do_filter_css(css) if not isinstance(css, str): css = css.decode('utf-8', 'ignore') style.text = css for x in root.xpath('//*[@class]'): extra = [] orig = x.get('class') for cls in orig.split(): extra.extend(sel_map.get(cls, [])) if extra: x.set('class', orig + ' ' + ' '.join(extra)) def do_filter_css(self, css): from css_parser import parseString from css_parser.css import CSSRule sheet = parseString(css, validate=False) rules = list(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) sel_map = {} count = 0 for r in rules: # Check if we have only class selectors for this rule nc = [x for x in r.selectorList if not x.selectorText.startswith('.')] if len(r.selectorList) > 1 and not nc: # Replace all the class selectors with a single class selector # This will be added to the class attribute of all elements # that have one of these selectors. replace_name = 'c_odt%d'%count count += 1 for sel in r.selectorList: s = sel.selectorText[1:] if s not in sel_map: sel_map[s] = [] sel_map[s].append(replace_name) r.selectorText = '.'+replace_name return sheet.cssText, sel_map def search_page_img(self, mi, log): for frm in self.document.topnode.getElementsByType(odFrame): try: if frm.getAttrNS(odTEXTNS,u'anchor-type') == 'page': log.warn('Document has Pictures anchored to Page, will all end up before first page!') break except ValueError: pass def filter_cover(self, mi, log): # filter the Element tree (remove the detected cover) if mi.cover and mi.odf_cover_frame: for frm in self.document.topnode.getElementsByType(odFrame): # search the right frame if frm.getAttribute('name') == mi.odf_cover_frame: img = frm.getElementsByType(odImage) # only one draw:image allowed in the draw:frame if len(img) == 1 and img[0].getAttribute('href') == mi.cover: # ok, this is the right frame with the right image # check if there are more childs if len(frm.childNodes) != 1: break # check if the parent paragraph more childs para = frm.parentNode if para.tagName != 'text:p' or len(para.childNodes) != 1: break # now it should be safe to remove the text:p parent = para.parentNode parent.removeChild(para) log("Removed cover image paragraph from document...") break def filter_load(self, odffile, mi, log): """ This is an adaption from ODF2XHTML. It adds a step between load and parse of the document where the Element tree can be modified. """ # first load the odf structure self.lines = [] self._wfunc = self._wlines if isinstance(odffile, (str, bytes)) \ or hasattr(odffile, 'read'): # Added by Kovid self.document = odLoad(odffile) else: self.document = odffile # filter stuff self.search_page_img(mi, log) try: self.filter_cover(mi, log) except: pass # parse the modified tree and generate xhtml self._walknode(self.document.topnode) def __call__(self, stream, odir, log): from ebook_converter.utils.zipfile import ZipFile from ebook_converter.ebooks.metadata.odt import get_metadata from ebook_converter.ebooks.metadata.opf2 import OPFCreator if not os.path.exists(odir): os.makedirs(odir) with CurrentDir(odir): log('Extracting ODT file...') stream.seek(0) mi = get_metadata(stream, 'odt') if not mi.title: mi.title = _('Unknown') if not mi.authors: mi.authors = [_('Unknown')] self.filter_load(stream, mi, log) # NOTE(gryf): Here is a workaround for ODF2XHTML.xhtml() method, # which expects, that all lines are strings. html = ''.join([str(l) for l in self.lines]) # A blanket img specification like this causes problems # with EPUB output as the containing element often has # an absolute height and width set that is larger than # the available screen real estate html = html.replace('img { width: 100%; height: 100%; }', '') # odf2xhtml creates empty title tag html = html.replace('','%s'%(mi.title,)) try: html = self.fix_markup(html, log) except: log.exception('Failed to filter CSS, conversion may be slow') with open('index.xhtml', 'wb') as f: f.write(as_bytes(html)) zf = ZipFile(stream, 'r') self.extract_pictures(zf) opf = OPFCreator(os.path.abspath(os.getcwd()), mi) opf.create_manifest([(os.path.abspath(f2), None) for f2 in walk(os.getcwd())]) opf.create_spine([os.path.abspath('index.xhtml')]) with open('metadata.opf', 'wb') as f: opf.render(f) return os.path.abspath('metadata.opf')