ebook-converter/ebook_converter/ebooks/oeb/transforms/page_margin.py

import numbers
from collections import Counter

from ebook_converter.ebooks.oeb.base import barename, XPath
from ebook_converter.polyglot.builtins import iteritems


__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


class RemoveAdobeMargins(object):
    '''
    Strip the margin attributes found in Adobe page templates.

    Every manifest item whose media type is one of the Adobe page template
    types, and whose data exposes an ``xpath`` method, has the
    ``margin-left``, ``margin-right``, ``margin-top`` and ``margin-bottom``
    attributes removed from all of its elements.
    '''

    def __call__(self, oeb, log, opts):
        '''
        Apply the transform to ``oeb``; ``log`` is called once for every
        page template that gets processed. ``opts`` is only stored.
        '''
        self.oeb, self.opts, self.log = oeb, opts, log

        template_types = frozenset((
            'application/vnd.adobe-page-template+xml',
            'application/vnd.adobe.page-template+xml',
            'application/adobe-page-template+xml',
            'application/adobe.page-template+xml',
        ))
        margin_attrs = tuple(
            'margin-' + side for side in ('left', 'right', 'top', 'bottom'))
        margin_query = ('//*[@margin-bottom or @margin-top '
                        'or @margin-left or @margin-right]')

        for item in self.oeb.manifest:
            # Check the media type first so that item.data is only touched
            # for items that really are page templates.
            if item.media_type not in template_types:
                continue
            if not hasattr(item.data, 'xpath'):
                continue

            self.log('Removing page margins specified in the'
                     ' Adobe page template')
            for node in item.data.xpath(margin_query):
                for name in margin_attrs:
                    node.attrib.pop(name, None)


class NegativeTextIndent(Exception):
    '''
    Signals that an element's style has a negative text-indent.
    '''


class RemoveFakeMargins(object):

    '''
    Remove left and right margins from paragraph/divs if the same margin is specified
    on almost all the elements at that level.

    Must be called only after CSS flattening
    '''

    def __call__(self, oeb, log, opts):
        '''
        Run the transform on ``oeb``.

        Does nothing when ``opts.remove_fake_margins`` is false or when the
        manifest has no main stylesheet. Otherwise the style rules of the
        main stylesheet are indexed by selector text, the paragraphs/divs of
        the spine are grouped into levels and each level is processed in
        turn. A level that contains a negative text-indent is skipped.
        '''
        if not opts.remove_fake_margins:
            return
        self.oeb, self.log, self.opts = oeb, log, opts
        self.levels = {}
        self.stats = {}
        self.selector_map = {}

        stylesheet = self.oeb.manifest.main_stylesheet
        if stylesheet is None:
            return

        self.log('Removing fake margins...')

        stylesheet = stylesheet.data

        from css_parser.css import CSSRule
        for rule in stylesheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
            self.selector_map[rule.selectorList.selectorText] = rule.style

        self.find_levels()

        for level in self.levels:
            try:
                self.process_level(level)
            except NegativeTextIndent:
                self.log.debug('Negative text indent detected at level '
                        ' %s, ignoring this level'%level)

    def get_margins(self, elem):
        '''
        Return ``(margin_left, margin_right, style)`` for ``elem``.

        The style is looked up in ``self.selector_map`` under the key
        ``'.' + class attribute``. When the element has no class, or no
        (truthy) style is registered for it, ``('', '', None)`` is returned.

        Raises NegativeTextIndent if the style's text-indent is a string
        starting with ``-`` or a negative number.
        '''
        cls = elem.get('class', None)
        if not cls:
            return '', '', None
        style = self.selector_map.get('.'+cls, None)
        if not style:
            return '', '', None

        try:
            ti = style['text-indent']
        except Exception:
            # Best effort: a style whose text-indent cannot be read is
            # treated as having no text-indent at all.
            ti = None

        if ((hasattr(ti, 'startswith') and ti.startswith('-')) or
                (isinstance(ti, numbers.Number) and ti < 0)):
            raise NegativeTextIndent()
        return style.marginLeft, style.marginRight, style

    def process_level(self, level):
        '''
        Collect margin statistics for the elements of ``level`` and remove
        the dominant left and/or right margin from their styles.

        The statistics are stored in ``self.stats`` under the keys
        ``level + '_left'`` and ``level + '_right'``.

        NegativeTextIndent raised by get_margins() propagates to the caller.
        '''
        elems = self.levels[level]
        left_stats = self.stats[level+'_left'] = Counter()
        right_stats = self.stats[level+'_right'] = Counter()

        for elem in elems:
            lm, rm = self.get_margins(elem)[:2]
            left_stats[lm] += 1
            right_stats[rm] += 1

        self.log.debug(level, ' left margin stats:', left_stats)
        self.log.debug(level, ' right margin stats:', right_stats)

        remove_left = self.analyze_stats(left_stats)
        remove_right = self.analyze_stats(right_stats)

        # mcl/mcr are only bound (and only read below) when the matching
        # remove_* flag is set.
        if remove_left:
            mcl = left_stats.most_common(1)[0][0]
            self.log('Removing level %s left margin of:'%level, mcl)

        if remove_right:
            mcr = right_stats.most_common(1)[0][0]
            self.log('Removing level %s right margin of:'%level, mcr)

        if remove_left or remove_right:
            for elem in elems:
                lm, rm, style = self.get_margins(elem)
                # analyze_stats() never selects an empty margin, so an
                # element without a style (lm == rm == '') cannot match and
                # style is never None when removeProperty is called.
                if remove_left and lm == mcl:
                    style.removeProperty('margin-left')
                if remove_right and rm == mcr:
                    style.removeProperty('margin-right')

    def find_levels(self):
        '''
        Populate ``self.levels`` with the p/div elements of every spine item.

        Keys have the form ``'<tag>_<depth>'`` where depth 1 means a direct
        child of the body. Levels with too few elements are dropped
        afterwards.
        '''

        def level_of(elem, body):
            # Number of steps from elem up to body; a direct child is 1.
            ans = 1
            while elem.getparent() is not body:
                ans += 1
                elem = elem.getparent()
            return ans

        paras = XPath('descendant::h:p|descendant::h:div')

        for item in self.oeb.spine:
            body = XPath('//h:body')(item.data)
            if not body:
                continue
            body = body[0]

            for p in paras(body):
                level = '%s_%d'%(barename(p.tag), level_of(p, body))
                self.levels.setdefault(level, []).append(p)

        remove = set()
        for k, v in iteritems(self.levels):
            num = len(v)
            self.log.debug('Found %d items of level:'%num, k)
            level = int(k.split('_')[-1])
            tag = k.split('_')[0]
            if tag == 'p' and num < 25:
                remove.add(k)
            if tag == 'div':
                if level > 2 and num < 25:
                    remove.add(k)
                elif level < 3:
                    # Check each level < 3 element and only keep those
                    # that have at least 5 descendant p/div elements.
                    # Iterate over a copy because v is modified in place.
                    for elem in list(v):
                        children = len(paras(elem))
                        if children < 5:
                            v.remove(elem)

        # Levels are dropped only after the loop above so that self.levels
        # is not modified while it is being iterated.
        for k in remove:
            self.levels.pop(k)
            self.log.debug('Ignoring level', k)

    def analyze_stats(self, stats):
        '''
        Return True if the most common margin in ``stats`` (a Counter
        mapping margin value to number of elements) should be removed, i.e.
        it is neither empty nor ``'0'`` and is used by more than 95% of the
        counted elements. An empty ``stats`` yields False.
        '''
        if not stats:
            return False
        most_common, most_common_count = stats.most_common(1)[0]
        if not most_common or most_common == '0':
            return False
        total = sum(stats.values())
        # True if greater than 95% of elements have the same margin
        return most_common_count/total > 0.95