ebook-converter/ebook_converter/ebooks/odt/input.py

"""
Convert an ODT file into an Open Ebook
"""
import logging
import os

from css_parser import CSSParser
from css_parser.css import CSSRule
from lxml import etree

from odf.odf2xhtml import ODF2XHTML
from odf.opendocument import load as odLoad
from odf.draw import Frame as odFrame, Image as odImage
from odf.namespaces import TEXTNS as odTEXTNS

from ebook_converter import CurrentDir, walk
from ebook_converter.ebooks.oeb.base import _css_logger
from ebook_converter.polyglot.builtins import as_bytes


class Extract(ODF2XHTML):

    def extract_pictures(self, zf):
        if not os.path.exists('Pictures'):
            os.makedirs('Pictures')
        for name in zf.namelist():
            if name.startswith('Pictures') and name not in {'Pictures', 'Pictures/'}:
                data = zf.read(name)
                with open(name, 'wb') as f:
                    f.write(data)

    def apply_list_starts(self, root, log):
        if not self.list_starts:
            return
        list_starts = frozenset(self.list_starts)
        for ol in root.xpath('//*[local-name() = "ol" and @class]'):
            classes = {'.' + x for x in ol.get('class', '').split()}
            found = classes & list_starts
            if found:
                val = self.list_starts[next(iter(found))]
                ol.set('start', val)

    def fix_markup(self, html, log):
        """Parse *html*, run all markup clean-up passes and serialize it again.

        :param html: the XHTML document generated from the ODT file.
        :param log: logger handed on to every pass.
        :return: the cleaned document as UTF-8 encoded bytes including an
            XML declaration.
        """
        root = etree.fromstring(html)
        # Order matters: filter_css rewrites the <style> text before
        # extract_css removes those elements and sets self.css, which
        # epubify_markup in turn reads through get_css_for_class.
        for step in (self.filter_css, self.extract_css,
                     self.epubify_markup, self.apply_list_starts):
            step(root, log)
        return etree.tostring(root, encoding='utf-8', xml_declaration=True)

    def extract_css(self, root, log):
        """Move the inline stylesheets of the document into ``odfpy.css``.

        All ``<style type="text/css">`` elements are removed from *root*, a
        ``<link>`` to ``odfpy.css`` is appended to the ``<head>`` (if there
        is one), the collected CSS is parsed into ``self.css`` and written
        to ``odfpy.css`` in the current directory.

        :param root: root element of the parsed XHTML; modified in place.
        :param log: logger, unused here.
        """
        sheets = []
        for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
            # An empty <style/> element has text None, which would make the
            # join below raise TypeError
            if s.text:
                sheets.append(s.text)
            s.getparent().remove(s)

        head = root.xpath('//*[local-name() = "head"]')
        if head:
            head = head[0]
            # Create the link in the same (default) namespace as the head
            ns = head.nsmap.get(None, '')
            if ns:
                ns = '{%s}' % ns
            etree.SubElement(head, ns + 'link', {'type': 'text/css',
                                                 'rel': 'stylesheet',
                                                 'href': 'odfpy.css'})

        css = u'\n\n'.join(sheets)
        parser = CSSParser(loglevel=logging.WARNING,
                           log=_css_logger)
        self.css = parser.parseString(css, validate=False)

        with open('odfpy.css', 'wb') as f:
            f.write(css.encode('utf-8'))

    def get_css_for_class(self, cls):
        if not cls:
            return None
        for rule in self.css.cssRules.rulesOfType(CSSRule.STYLE_RULE):
            for sel in rule.selectorList:
                q = sel.selectorText
                if q == '.' + cls:
                    return rule

    def epubify_markup(self, root, log):
        """Adjust the XHTML tree in place so that EPUB renderers cope with it.

        The passes are: give empty ``<title>`` tags some content, turn
        ``<p>`` elements containing a ``<div>`` into ``<div>``, constrain
        images to their frame and translate auto side margins of anchored
        images into ``text-align`` directives.

        Rules are looked up with :meth:`get_css_for_class`, which reads
        ``self.css``.

        :param root: root element of the parsed XHTML; modified in place.
        :param log: logger, unused here.
        """
        from ebook_converter.ebooks.oeb.base import XPath, XHTML
        # Fix empty title tags
        for t in XPath('//h:title')(root):
            if not t.text:
                t.text = u' '
        # Fix <p><div> constructs as the asinine epubchecker complains
        # about them
        for div in XPath('//h:p/h:div')(root):
            div.getparent().tag = XHTML('div')

        # Force position:static on the containing div, as position:relative
        # causes problems with some epub renderers. The style of the image
        # is replaced altogether: this drops display:block, which is
        # redundant and prevents text-align:center from working in ADE, and
        # ensures that the img is contained in its containing div
        for img in XPath('//h:div/h:img[@style]')(root):
            div = img.getparent()
            if len(div) != 1:
                continue
            style = div.attrib.get('style', '')
            if style and not style.endswith(';'):
                style = style + ';'
            style += 'position:static'  # Ensures position of containing div is static
            div.attrib['style'] = style
            # Ensure that the img is always contained in its frame
            img.attrib['style'] = 'max-width: 100%; max-height: 100%'

        # Handle anchored images. The default markup + CSS produced by
        # odf2xhtml works with WebKit but not with ADE. So we convert the
        # common cases of left/right/center aligned block images to work on
        # both webkit and ADE. We detect the case of setting the side margins
        # to auto and map it to an appropriate text-align directive, which
        # works in both WebKit and ADE.
        # https://bugs.launchpad.net/bugs/1063207
        # https://bugs.launchpad.net/calibre/+bug/859343
        for img in XPath('descendant::h:div/h:div/h:img')(root):
            div2 = img.getparent()
            div1 = div2.getparent()
            if (len(div1), len(div2)) != (1, 1):
                continue
            outer_rules = [rule for rule in
                           (self.get_css_for_class(x)
                            for x in div1.get('class', '').split())
                           if rule]
            has_align = any(
                rule.style.getProperty(u'text-align') is not None
                for rule in outer_rules)
            if not has_align:
                ml = mr = None
                inner_rules = [rule for rule in
                               (self.get_css_for_class(x)
                                for x in div2.get(u'class', u'').split())
                               if rule]
                for rule in inner_rules:
                    # A later rule overrides an earlier one, but only for
                    # the margins it actually declares. Reducing to the
                    # plain value per rule keeps a previously found margin
                    # from being reset by a rule that does not mention it.
                    val = getattr(
                        rule.style.getPropertyCSSValue(u'margin-left'),
                        'value', None)
                    if val is not None:
                        ml = val
                    val = getattr(
                        rule.style.getPropertyCSSValue(u'margin-right'),
                        'value', None)
                    if val is not None:
                        mr = val
                aval = None
                if ml == mr == u'auto':
                    aval = u'center'
                elif ml == u'auto' and mr != u'auto':
                    aval = 'right'
                elif ml != u'auto' and mr == u'auto':
                    aval = 'left'
                if aval is not None:
                    style = div1.attrib.get('style', '').strip()
                    if style and not style.endswith(';'):
                        style = style + ';'
                    style += 'text-align:%s' % aval
                    has_align = True
                    div1.attrib['style'] = style

            if has_align:
                # This is needed for ADE, without it the text-align has no
                # effect. The inner div may have no style attribute at all.
                style = div2.attrib.get('style', '')
                div2.attrib['style'] = 'display:inline;' + style

    def filter_css(self, root, log):
        """Simplify the first inline stylesheet and patch class attributes.

        The CSS of the first ``<style type="text/css">`` element is passed
        through :meth:`do_filter_css`. Every element carrying a class that
        was folded into a replacement class gets the replacement class
        names appended to its ``class`` attribute.

        :param root: root element of the parsed XHTML; modified in place.
        :param log: logger, unused here.
        """
        styles = root.xpath('//*[local-name() = "style" and @type="text/css"]')
        if not styles:
            return
        style = styles[0]
        css = style.text
        if not css:
            return
        css, sel_map = self.do_filter_css(css)
        if not isinstance(css, str):
            css = css.decode('utf-8', 'ignore')
        style.text = css
        for elem in root.xpath('//*[@class]'):
            orig = elem.get('class')
            extra = [name
                     for cls in orig.split()
                     for name in sel_map.get(cls, ())]
            if extra:
                elem.set('class', ' '.join([orig] + extra))

    def do_filter_css(self, css):
        """Collapse rules with several class selectors into single-class rules.

        A style rule that has more than one selector, all of them starting
        with a dot, gets its selector list replaced by one generated class
        selector ``.c_odtN``.

        :param css: stylesheet source to filter.
        :return: tuple of the ``cssText`` of the rewritten sheet and a dict
            mapping each replaced selector (without its leading dot) to the
            list of generated class names that now stand in for it.
        """
        from css_parser import parseString
        from css_parser.css import CSSRule
        sheet = parseString(css, validate=False)
        sel_map = {}
        counter = 0
        for rule in list(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)):
            selectors = rule.selectorList
            if len(selectors) <= 1:
                continue
            # Only rules made up purely of class selectors are rewritten
            if not all(sel.selectorText.startswith('.') for sel in selectors):
                continue
            # The generated class is added to the class attribute of all
            # elements that have one of the replaced classes.
            replace_name = 'c_odt%d' % counter
            counter += 1
            for sel in selectors:
                sel_map.setdefault(sel.selectorText[1:], []).append(
                    replace_name)
            rule.selectorText = '.' + replace_name
        return sheet.cssText, sel_map

    def search_page_img(self, mi, log):
        """Warn once if the document contains a frame anchored to the page.

        :param mi: document metadata, unused here.
        :param log: logger used to emit the warning.
        """
        frames = self.document.topnode.getElementsByType(odFrame)
        for frame in frames:
            try:
                is_page = frame.getAttrNS(odTEXTNS, u'anchor-type') == 'page'
                if is_page:
                    log.warn('Document has Pictures anchored to Page, will all end up before first page!')
            except ValueError:
                # Treated as "not anchored to page", look at the next frame
                continue
            if is_page:
                # One warning is enough
                return

    def filter_cover(self, mi, log):
        # filter the Element tree (remove the detected cover)
        if mi.cover and mi.odf_cover_frame:
            for frm in self.document.topnode.getElementsByType(odFrame):
                # search the right frame
                if frm.getAttribute('name') == mi.odf_cover_frame:
                    img = frm.getElementsByType(odImage)
                    # only one draw:image allowed in the draw:frame
                    if len(img) == 1 and img[0].getAttribute('href') == mi.cover:
                        # ok, this is the right frame with the right image
                        # check if there are more childs
                        if len(frm.childNodes) != 1:
                            break
                        # check if the parent paragraph more childs
                        para = frm.parentNode
                        if para.tagName != 'text:p' or len(para.childNodes) != 1:
                            break
                        # now it should be safe to remove the text:p
                        parent = para.parentNode
                        parent.removeChild(para)
                        log("Removed cover image paragraph from document...")
                        break

    def filter_load(self, odffile, mi, log):
        """Load the document, filter its element tree and generate the XHTML.

        This is an adaptation of the ODF2XHTML loading code. It adds a step
        between load and parse of the document where the Element tree can
        be modified.

        :param odffile: a path, a file-like object with a ``read`` method or
            an already loaded document.
        :param mi: document metadata, passed to the filter steps.
        :param log: logger, passed to the filter steps.
        """
        # first load the odf structure
        self.lines = []
        self._wfunc = self._wlines
        if isinstance(odffile, (str, bytes)) \
                or hasattr(odffile, 'read'):  # Added by Kovid
            self.document = odLoad(odffile)
        else:
            self.document = odffile
        # filter stuff
        self.search_page_img(mi, log)
        try:
            self.filter_cover(mi, log)
        except Exception:
            # Removing the cover is best effort only, the conversion goes on
            # without it. Record the failure instead of hiding it, and let
            # KeyboardInterrupt/SystemExit propagate.
            log.exception('Failed to remove the cover image from the document')
        # parse the modified tree and generate xhtml
        self._walknode(self.document.topnode)

    def __call__(self, stream, odir, log):
        """Convert the ODT document in *stream* into files below *odir*.

        Writes ``index.xhtml``, the extracted pictures and ``metadata.opf``
        (plus ``odfpy.css`` when the markup clean-up succeeds) into *odir*.

        :param stream: seekable file object containing the ODT document.
        :param odir: output directory, created if it does not exist.
        :param log: logger; called directly for progress messages.
        :return: absolute path of the generated ``metadata.opf``.
        """
        from xml.sax.saxutils import escape

        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks.metadata.odt import get_metadata
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator

        os.makedirs(odir, exist_ok=True)
        with CurrentDir(odir):
            log('Extracting ODT file...')
            stream.seek(0)
            mi = get_metadata(stream, 'odt')
            if not mi.title:
                mi.title = 'Unknown'
            if not mi.authors:
                mi.authors = ['Unknown']
            self.filter_load(stream, mi, log)

            # NOTE(gryf): Here is a workaround for ODF2XHTML.xhtml() method,
            # which expects, that all lines are strings.
            html = ''.join([str(line) for line in self.lines])

            # A blanket img specification like this causes problems
            # with EPUB output as the containing element often has
            # an absolute height and width set that is larger than
            # the available screen real estate
            html = html.replace('img { width: 100%; height: 100%; }', '')
            # odf2xhtml creates empty title tag. The title is escaped, as a
            # raw '&' or '<' would make the document unparseable.
            html = html.replace('<title></title>',
                                '<title>%s</title>' % (escape(mi.title),))
            try:
                html = self.fix_markup(html, log)
            except Exception:
                # Best effort: carry on with the unfiltered markup
                log.exception('Failed to filter CSS, conversion may be slow')
            with open('index.xhtml', 'wb') as f:
                f.write(as_bytes(html))
            zf = ZipFile(stream, 'r')
            self.extract_pictures(zf)
            opf = OPFCreator(os.path.abspath(os.getcwd()), mi)
            opf.create_manifest([(os.path.abspath(f2), None) for f2 in
                                 walk(os.getcwd())])
            opf.create_spine([os.path.abspath('index.xhtml')])
            with open('metadata.opf', 'wb') as f:
                opf.render(f)
            return os.path.abspath('metadata.opf')