ebook-converter/ebook_converter/ebooks/mobi/mobiml.py

"""
Transform XHTML/OPS-ish content into Mobipocket HTML 3.2.
"""
import copy
import re
import numbers
from lxml import etree

from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer
from ebook_converter.ebooks.oeb.transforms.flatcss import KeyMapper
from ebook_converter.ebooks.mobi.utils import convert_color_for_font_tag
from ebook_converter.utils.imghdr import identify


__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'

MBP_NS = 'http://mobipocket.com/ns/mbp'


def MBP(name):
    return '{%s}%s' % (MBP_NS, name)


MOBI_NSMAP = {None: const.XHTML_NS, 'mbp': const.MBP_NS}
INLINE_TAGS = {'span', 'a', 'code', 'u', 's', 'big', 'strike', 'tt', 'font', 'q', 'i', 'b', 'em', 'strong', 'sup', 'sub'}
HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
# GR: Added 'caption' to both sets
NESTABLE_TAGS = {'ol', 'ul', 'li', 'table', 'tr', 'td', 'th', 'caption'}
TABLE_TAGS = {'table', 'tr', 'td', 'th', 'caption'}

SPECIAL_TAGS = {'hr', 'br'}
CONTENT_TAGS = {'img', 'hr', 'br'}

NOT_VTAGS = HEADER_TAGS | NESTABLE_TAGS | TABLE_TAGS | SPECIAL_TAGS | \
    CONTENT_TAGS
LEAF_TAGS = {'base', 'basefont', 'frame', 'link', 'meta', 'area', 'br',
'col', 'hr', 'img', 'input', 'param'}
PAGE_BREAKS = {'always', 'left', 'right'}

COLLAPSE = re.compile(r'[ \t\r\n\v]+')


def asfloat(value):
    if not isinstance(value, numbers.Number):
        return 0.0
    return float(value)


def isspace(text):
    if not text:
        return True
    if '\xa0' in text:
        return False
    return text.isspace()


class BlockState(object):

    def __init__(self, body):
        self.body = body
        self.nested = []
        self.para = None
        self.inline = None
        self.anchor = None
        self.vpadding = 0.
        self.vmargin = 0.
        self.pbreak = False
        self.istate = None
        self.content = False


class FormatState(object):

    def __init__(self):
        self.rendered = False
        self.left = 0.
        self.halign = 'auto'
        self.indent = 0.
        self.fsize = 3
        self.ids = set()
        self.italic = False
        self.bold = False
        self.strikethrough = False
        self.underline = False
        self.preserve = False
        self.pre_wrap = False
        self.family = 'serif'
        self.bgcolor = 'transparent'
        self.fgcolor = 'black'
        self.href = None
        self.list_num = 0
        self.attrib = {}

    def __eq__(self, other):
        return self.fsize == other.fsize \
               and self.italic == other.italic \
               and self.bold == other.bold \
               and self.href == other.href \
               and self.preserve == other.preserve \
               and self.pre_wrap == other.pre_wrap \
               and self.family == other.family \
               and self.bgcolor == other.bgcolor \
               and self.fgcolor == other.fgcolor \
               and self.strikethrough == other.strikethrough \
               and self.underline == other.underline

    def __ne__(self, other):
        return not self.__eq__(other)


class MobiMLizer(object):

    def __init__(self, ignore_tables=False):
        self.ignore_tables = ignore_tables

    def __call__(self, oeb, context):
        oeb.logger.info('Converting XHTML to Mobipocket markup...')
        self.oeb = oeb
        self.log = self.oeb.logger
        self.opts = context
        self.profile = profile = context.dest
        self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items())
        self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())
        self.mobimlize_spine()

    def mobimlize_spine(self):
        'Iterate over the spine and convert it to MOBIML'
        for item in self.oeb.spine:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile)
            body = item.data.find(base.tag('xhtml', 'body'))
            nroot = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP)
            nbody = etree.SubElement(nroot, base.tag('xhtml', 'body'))
            self.current_spine_item = item
            self.mobimlize_elem(body, stylizer, BlockState(nbody),
                                [FormatState()])
            item.data = nroot
            # print(etree.tostring(nroot))

    def mobimlize_font(self, ptsize):
        return self.fnums[self.fmap[ptsize]]

    def mobimlize_measure(self, ptsize):
        if isinstance(ptsize, (str, bytes)):
            return ptsize
        embase = self.profile.fbase
        if round(ptsize) < embase:
            return "%dpt" % int(round(ptsize))
        return "%dem" % int(round(ptsize / embase))

    def preize_text(self, text, pre_wrap=False):
        text = str(text)
        if pre_wrap:
            # Replace n consecutive spaces with n-1 NBSP + space
            text = re.sub(r' {2,}', lambda m:('\xa0'*(len(m.group())-1) + ' '), text)
        else:
            text = text.replace(' ', '\xa0')

        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        lines = text.split('\n')
        result = lines[:1]
        for line in lines[1:]:
            result.append(etree.Element(base.tag('xhtml', 'br')))
            if line:
                result.append(line)
        return result

    def mobimlize_content(self, tag, text, bstate, istates):
        'Convert text content'
        if text or tag != 'br':
            bstate.content = True
        istate = istates[-1]
        para = bstate.para
        if tag in SPECIAL_TAGS and not text:
            para = para if para is not None else bstate.body
        elif para is None or tag in ('td', 'th'):
            body = bstate.body
            if bstate.pbreak:
                etree.SubElement(body, MBP('pagebreak'))
                bstate.pbreak = False
            bstate.istate = None
            bstate.anchor = None
            parent = bstate.nested[-1] if bstate.nested else bstate.body
            indent = istate.indent
            left = istate.left
            if isinstance(indent, (str, bytes)):
                indent = 0
            if indent < 0 and abs(indent) < left:
                left += indent
                indent = 0
            elif indent != 0 and abs(indent) < self.profile.fbase:
                indent = (indent / abs(indent)) * self.profile.fbase
            if tag in NESTABLE_TAGS and not istate.rendered:
                para = wrapper = etree.SubElement(
                    parent, base.tag('xhtml', tag), attrib=istate.attrib)
                bstate.nested.append(para)
                if tag == 'li' and len(istates) > 1:
                    istates[-2].list_num += 1
                    para.attrib['value'] = str(istates[-2].list_num)
            elif tag in NESTABLE_TAGS and istate.rendered:
                para = wrapper = bstate.nested[-1]
            elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0:
                ems = self.profile.mobi_ems_per_blockquote
                para = wrapper = etree.SubElement(parent, base.tag('xhtml', 'blockquote'))
                para = wrapper
                emleft = int(round(left / self.profile.fbase)) - ems
                emleft = min((emleft, 10))
                while emleft > ems / 2:
                    para = etree.SubElement(para, base.tag('xhtml', 'blockquote'))
                    emleft -= ems
            else:
                para = wrapper = etree.SubElement(parent, base.tag('xhtml', 'p'))
            bstate.inline = bstate.para = para
            vspace = bstate.vpadding + bstate.vmargin
            bstate.vpadding = bstate.vmargin = 0
            if tag not in TABLE_TAGS:
                if tag in ('ul', 'ol') and vspace > 0:
                    wrapper.addprevious(etree.Element(base.tag('xhtml', 'div'),
                        height=self.mobimlize_measure(vspace)))
                else:
                    wrapper.attrib['height'] = self.mobimlize_measure(vspace)
                para.attrib['width'] = self.mobimlize_measure(indent)
            elif tag == 'table' and vspace > 0:
                vspace = int(round(vspace / self.profile.fbase))
                while vspace > 0:
                    wrapper.addprevious(etree.Element(base.tag('xhtml', 'br')))
                    vspace -= 1
            if istate.halign != 'auto' and isinstance(istate.halign, (bytes, str)):
                if isinstance(istate.halign, bytes):
                    istate.halign = istate.halign.decode('utf-8')
                para.attrib['align'] = istate.halign
        istate.rendered = True
        pstate = bstate.istate
        if tag in CONTENT_TAGS:
            bstate.inline = para
            pstate = bstate.istate = None
            try:
                etree.SubElement(para, base.tag('xhtml', tag), attrib=istate.attrib)
            except:
                print('Invalid subelement:', para, tag, istate.attrib)
                raise
        elif tag in TABLE_TAGS:
            para.attrib['valign'] = 'top'
        if istate.ids:
            for id_ in istate.ids:
                anchor = etree.Element(base.tag('xhtml', 'a'), attrib={'id': id_})
                if tag == 'li':
                    try:
                        last = bstate.body[-1][-1]
                    except:
                        break
                    last.insert(0, anchor)
                    anchor.tail = last.text
                    last.text = None
                else:
                    last = bstate.body[-1]
                    # We use append instead of addprevious so that inline
                    # anchors in large blocks point to the correct place. See
                    # https://bugs.launchpad.net/calibre/+bug/899831
                    # This could potentially break if inserting an anchor at
                    # this point in the markup is illegal, but I cannot think
                    # of such a case offhand.
                    if parse_utils.barename(last.tag) in LEAF_TAGS:
                        last.addprevious(anchor)
                    else:
                        last.append(anchor)

            istate.ids.clear()
        if not text:
            return
        if not pstate or istate != pstate:
            inline = para
            fsize = istate.fsize
            href = istate.href
            if not href:
                bstate.anchor = None
            elif pstate and pstate.href == href:
                inline = bstate.anchor
            else:
                inline = etree.SubElement(inline, base.tag('xhtml', 'a'), href=href)
                bstate.anchor = inline

            if fsize != 3:
                inline = etree.SubElement(inline, base.tag('xhtml', 'font'),
                                          size=str(fsize))
            if istate.family == 'monospace':
                inline = etree.SubElement(inline, base.tag('xhtml', 'tt'))
            if istate.italic:
                inline = etree.SubElement(inline, base.tag('xhtml', 'i'))
            if istate.bold:
                inline = etree.SubElement(inline, base.tag('xhtml', 'b'))
            if istate.bgcolor is not None and istate.bgcolor != 'transparent' :
                inline = etree.SubElement(inline, base.tag('xhtml', 'span'),
                        bgcolor=convert_color_for_font_tag(istate.bgcolor))
            if istate.fgcolor != 'black':
                inline = etree.SubElement(inline, base.tag('xhtml', 'font'),
                        color=convert_color_for_font_tag(istate.fgcolor))
            if istate.strikethrough:
                inline = etree.SubElement(inline, base.tag('xhtml', 's'))
            if istate.underline:
                inline = etree.SubElement(inline, base.tag('xhtml', 'u'))
            bstate.inline = inline
        bstate.istate = istate
        inline = bstate.inline
        content = self.preize_text(text, pre_wrap=istate.pre_wrap) if istate.preserve or istate.pre_wrap else [text]
        for item in content:
            if isinstance(item, (str, bytes)):
                if len(inline) == 0:
                    inline.text = (inline.text or '') + item
                else:
                    last = inline[-1]
                    last.tail = (last.tail or '') + item
            else:
                inline.append(item)

    def mobimlize_elem(self, elem, stylizer, bstate, istates,
            ignore_valign=False):
        if not isinstance(elem.tag, (str, bytes)) \
           or parse_utils.namespace(elem.tag) != const.XHTML_NS:
            return
        style = stylizer.style(elem)
        # <mbp:frame-set/> does not exist lalalala
        if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden') and
                elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
            id_ = elem.get('id', None)
            if id_:
                # Keep anchors so people can use display:none
                # to generate hidden TOCs
                tail = elem.tail
                elem.clear()
                elem.text = None
                elem.set('id', id_)
                elem.tail = tail
                elem.tag = base.tag('xhtml', 'a')
            else:
                return
        tag = parse_utils.barename(elem.tag)
        istate = copy.copy(istates[-1])
        istate.rendered = False
        istate.list_num = 0
        if tag == 'ol' and 'start' in elem.attrib:
            try:
                istate.list_num = int(elem.attrib['start'])-1
            except:
                pass
        istates.append(istate)
        left = 0
        display = style['display']
        if display == 'table-cell':
            display = 'inline'
        elif display.startswith('table'):
            display = 'block'
        isblock = (not display.startswith('inline') and style['display'] !=
                'none')
        isblock = isblock and style['float'] == 'none'
        isblock = isblock and tag != 'br'
        if isblock:
            bstate.para = None
            istate.halign = style['text-align']
            rawti = style._get('text-indent')
            istate.indent = style['text-indent']
            if hasattr(rawti, 'strip') and '%' in rawti:
                # We have a percentage text indent, these can come out looking
                # too large if the user chooses a wide output profile like
                # tablet
                istate.indent = min(style._unit_convert(rawti, base=500), istate.indent)
            if style['margin-left'] == 'auto' \
               and style['margin-right'] == 'auto':
                istate.halign = 'center'
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            if tag != 'body':
                left = margin + padding
            istate.left += left
            vmargin = asfloat(style['margin-top'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-top'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        elif not istate.href:
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            lspace = margin + padding
            if lspace > 0:
                spaces = int(round((lspace * 3) / style['font-size']))
                elem.text = ('\xa0' * spaces) + (elem.text or '')
            margin = asfloat(style['margin-right'])
            padding = asfloat(style['padding-right'])
            rspace = margin + padding
            if rspace > 0:
                spaces = int(round((rspace * 3) / style['font-size']))
                if len(elem) == 0:
                    elem.text = (elem.text or '') + ('\xa0' * spaces)
                else:
                    last = elem[-1]
                    last.text = (last.text or '') + ('\xa0' * spaces)
        if bstate.content and style['page-break-before'] in PAGE_BREAKS:
            bstate.pbreak = True
        istate.fsize = self.mobimlize_font(style['font-size'])
        istate.italic = True if style['font-style'] == 'italic' else False
        weight = style['font-weight']
        istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
        istate.preserve = style['white-space'] == 'pre'
        istate.pre_wrap = style['white-space'] == 'pre-wrap'
        istate.bgcolor  = style['background-color']
        istate.fgcolor  = style['color']
        istate.strikethrough = style.effective_text_decoration == 'line-through'
        istate.underline = style.effective_text_decoration == 'underline'
        ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else ''
        if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
            istate.family = 'monospace'
        elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
                'arial' in ff or 'helvetica' in ff):
            istate.family = 'sans-serif'
        else:
            istate.family = 'serif'
        if 'id' in elem.attrib:
            istate.ids.add(elem.attrib['id'])
        if 'name' in elem.attrib:
            istate.ids.add(elem.attrib['name'])
        if tag == 'a' and 'href' in elem.attrib:
            istate.href = elem.attrib['href']
        istate.attrib.clear()
        if tag == 'img' and 'src' in elem.attrib:
            istate.attrib['src'] = elem.attrib['src']
            istate.attrib['align'] = 'baseline'
            cssdict = style.cssdict()
            valign = cssdict.get('vertical-align', None)
            if valign in ('top', 'bottom', 'middle'):
                istate.attrib['align'] = valign
            for prop in ('width', 'height'):
                if cssdict[prop] != 'auto':
                    value = style[prop]
                    if value == getattr(self.profile, prop):
                        result = '100%'
                    else:
                        # Amazon's renderer does not support
                        # img sizes in units other than px
                        # See #7520 for test case
                        try:
                            pixs = int(round(float(value) /
                                (72/self.profile.dpi)))
                        except:
                            continue
                        result = str(pixs)
                    istate.attrib[prop] = result
            if 'width' not in istate.attrib or 'height' not in istate.attrib:
                href = self.current_spine_item.abshref(elem.attrib['src'])
                try:
                    item = self.oeb.manifest.hrefs[base.urlnormalize(href)]
                except:
                    self.oeb.logger.warn('Failed to find image:',
                            href)
                else:
                    try:
                        width, height = identify(item.data)[1:]
                    except Exception:
                        self.oeb.logger.warn('Invalid image:', href)
                    else:
                        if 'width' not in istate.attrib and 'height' not in \
                                    istate.attrib:
                            istate.attrib['width'] = str(width)
                            istate.attrib['height'] = str(height)
                        else:
                            ar = width / height
                            if 'width' not in istate.attrib:
                                try:
                                    width = int(istate.attrib['height'])*ar
                                except:
                                    pass
                                istate.attrib['width'] = str(int(width))
                            else:
                                try:
                                    height = int(istate.attrib['width'])/ar
                                except:
                                    pass
                                istate.attrib['height'] = str(int(height))
                        item.unload_data_from_memory()
        elif tag == 'hr' and asfloat(style['width']) > 0 and style._get('width') not in {'100%', 'auto'}:
            raww = style._get('width')
            if hasattr(raww, 'strip') and '%' in raww:
                istate.attrib['width'] = raww
            else:
                prop = style['width'] / self.profile.width
                istate.attrib['width'] = "%d%%" % int(round(prop * 100))
        elif display == 'table':
            tag = 'table'
        elif display == 'table-row':
            tag = 'tr'
        elif display == 'table-cell':
            tag = 'td'
        if tag in TABLE_TAGS and self.ignore_tables:
            tag = 'span' if tag == 'td' else 'div'

        if tag in ('table', 'td', 'tr'):
            col = style.backgroundColor
            if col:
                elem.set('bgcolor', col)
            css = style.cssdict()
            if 'border' in css or 'border-width' in css:
                elem.set('border', '1')
        if tag in TABLE_TAGS:
            for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                    'bgcolor'):
                if attr in elem.attrib:
                    istate.attrib[attr] = elem.attrib[attr]
        if tag == 'q':
            t = elem.text
            if not t:
                t = ''
            elem.text = '\u201c' + t
            t = elem.tail
            if not t:
                t = ''
            elem.tail = '\u201d' + t
        text = None
        if elem.text:
            if istate.preserve or istate.pre_wrap:
                text = elem.text
            elif (len(elem) > 0 and isspace(elem.text) and hasattr(elem[0].tag, 'rpartition') and
                  elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
                text = None
            else:
                text = COLLAPSE.sub(' ', elem.text)
        valign = style['vertical-align']
        not_baseline = valign in ('super', 'sub', 'text-top',
                'text-bottom', 'top', 'bottom') or (
                isinstance(valign, numbers.Number) and abs(valign) != 0)
        issup = valign in ('super', 'text-top', 'top') or (
            isinstance(valign, numbers.Number) and valign > 0)
        vtag = 'sup' if issup else 'sub'
        if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
            nroot = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP)
            vbstate = BlockState(etree.SubElement(nroot, base.tag('xhtml', 'body')))
            vbstate.para = etree.SubElement(vbstate.body, base.tag('xhtml', 'p'))
            self.mobimlize_elem(elem, stylizer, vbstate, istates,
                    ignore_valign=True)
            if len(istates) > 0:
                istates.pop()
            if len(istates) == 0:
                istates.append(FormatState())
            at_start = bstate.para is None
            if at_start:
                self.mobimlize_content('span', '', bstate, istates)
            parent = bstate.para if bstate.inline is None else bstate.inline
            if parent is not None:
                vtag = etree.SubElement(parent, base.tag('xhtml', vtag))
                vtag = etree.SubElement(vtag, base.tag('xhtml', 'small'))
                # Add anchors
                for child in vbstate.body:
                    if child is not vbstate.para:
                        vtag.append(child)
                    else:
                        break
                if vbstate.para is not None:
                    if vbstate.para.text:
                        vtag.text = vbstate.para.text
                    for child in vbstate.para:
                        vtag.append(child)
                return

        if tag == 'blockquote':
            old_mim = self.opts.mobi_ignore_margins
            self.opts.mobi_ignore_margins = False

        if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
                # We have an id but no text and no children, the id should still
                # be added.
                istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
                len(elem)==0)):
            if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
                try:
                    value = int(elem.attrib['value'])
                    istates[-2].list_num = value - 1
                except:
                    pass
            self.mobimlize_content(tag, text, bstate, istates)
        for child in elem:
            self.mobimlize_elem(child, stylizer, bstate, istates)
            tail = None
            if child.tail:
                if istate.preserve or istate.pre_wrap:
                    tail = child.tail
                elif bstate.para is None and isspace(child.tail):
                    tail = None
                else:
                    tail = COLLAPSE.sub(' ', child.tail)
            if tail:
                self.mobimlize_content(tag, tail, bstate, istates)

        if tag == 'blockquote':
            self.opts.mobi_ignore_margins = old_mim

        if bstate.content and style['page-break-after'] in PAGE_BREAKS:
            bstate.pbreak = True
        if isblock:
            para = bstate.para
            if para is not None and para.text == '\xa0' and len(para) < 1:
                if style.height > 2:
                    para.getparent().replace(para, etree.Element(base.tag('xhtml', 'br')))
                else:
                    # This is too small to be rendered effectively, drop it
                    para.getparent().remove(para)
            bstate.para = None
            bstate.istate = None
            vmargin = asfloat(style['margin-bottom'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-bottom'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        if bstate.nested and bstate.nested[-1].tag == elem.tag:
            bstate.nested.pop()
        istates.pop()