mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-27 16:25:46 +01:00
This is an ongoing refactor of the calibre code to make it more readable and to transform it into something more coherent. This patch changes the imports of some modules: instead of polluting each module's namespace with symbols from other modules (which were often themselves imported from yet other modules - yuck), the modules are imported and referenced explicitly.
187 lines
6.2 KiB
Python
187 lines
6.2 KiB
Python
import numbers
|
|
from collections import Counter
|
|
|
|
from ebook_converter.ebooks.oeb import parse_utils
|
|
from ebook_converter.ebooks.oeb.base import XPath
|
|
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
|
|
class RemoveAdobeMargins(object):
    '''
    Strip the margin attributes declared in Adobe page template files.
    '''

    def __call__(self, oeb, log, opts):
        '''
        Walk the manifest of `oeb` and, for every item whose media type is
        one of the Adobe page template types and whose data exposes an
        `xpath` method, drop all `margin-*` attributes from the elements
        that carry at least one of them.

        `oeb`, `log` and `opts` are stored on the instance; `log` is called
        once for every template that gets processed.
        '''
        self.oeb = oeb
        self.opts = opts
        self.log = log

        template_types = frozenset((
            'application/vnd.adobe-page-template+xml',
            'application/vnd.adobe.page-template+xml',
            'application/adobe-page-template+xml',
            'application/adobe.page-template+xml',
        ))
        margin_attrs = ('margin-left', 'margin-right',
                        'margin-top', 'margin-bottom')
        margin_query = ('//*[@margin-bottom or @margin-top '
                        'or @margin-left or @margin-right]')

        for item in self.oeb.manifest:
            # Check the media type first so that the data of unrelated items
            # is never touched.
            if item.media_type not in template_types:
                continue
            if not hasattr(item.data, 'xpath'):
                continue

            self.log('Removing page margins specified in the'
                     ' Adobe page template')
            for node in item.data.xpath(margin_query):
                for name in margin_attrs:
                    node.attrib.pop(name, None)
|
|
|
|
|
|
class NegativeTextIndent(Exception):
    '''
    Signals that a style with a negative text-indent was encountered.
    '''
|
|
|
|
|
|
class RemoveFakeMargins(object):

    '''
    Remove left and right margins from paragraph/divs if the same margin is
    specified on almost all the elements at that level.

    Must be called only after CSS flattening
    '''

    def __call__(self, oeb, log, opts):
        '''
        Run the transform on `oeb`.

        Does nothing when `opts.remove_fake_margins` is false or when the
        manifest has no main stylesheet. Otherwise builds a map from selector
        text to style declaration out of the style rules of the main
        stylesheet, groups the paragraphs/divs of the spine by level and
        processes every level. A level that raises NegativeTextIndent is
        left untouched.
        '''
        if not opts.remove_fake_margins:
            return
        self.oeb, self.log, self.opts = oeb, log, opts
        # '<tag>_<depth>' -> list of elements found at that level
        self.levels = {}
        # '<level>_left' / '<level>_right' -> Counter of margin values
        self.stats = {}
        # selector text -> style declaration of the matching rule
        self.selector_map = {}

        stylesheet = self.oeb.manifest.main_stylesheet
        if stylesheet is None:
            return

        self.log('Removing fake margins...')

        stylesheet = stylesheet.data

        from css_parser.css import CSSRule
        for rule in stylesheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
            self.selector_map[rule.selectorList.selectorText] = rule.style

        self.find_levels()

        for level in self.levels:
            try:
                self.process_level(level)
            except NegativeTextIndent:
                self.log.debug('Negative text indent detected at level '
                               ' %s, ignoring this level'%level)

    def get_margins(self, elem):
        '''
        Return `(margin_left, margin_right, style)` for `elem`.

        The style is looked up in `self.selector_map` under the key
        `'.' + <value of the class attribute>`. When the element has no class
        or no (truthy) style is registered for it, `('', '', None)` is
        returned.

        Raises NegativeTextIndent when the text-indent of the style is a
        string starting with '-' or a negative number.
        '''
        cls = elem.get('class', None)
        if cls:
            style = self.selector_map.get('.'+cls, None)
            if style:
                try:
                    ti = style['text-indent']
                except Exception:
                    # Best effort: a style whose text-indent cannot be read
                    # is treated as having no negative indent. Only ordinary
                    # errors are swallowed, so that KeyboardInterrupt and
                    # SystemExit still propagate.
                    pass
                else:
                    if ((hasattr(ti, 'startswith') and ti.startswith('-')) or
                            (isinstance(ti, numbers.Number) and ti < 0)):
                        raise NegativeTextIndent()
                return style.marginLeft, style.marginRight, style
        return '', '', None

    def process_level(self, level):
        '''
        Collect the left/right margin statistics of all elements stored in
        `self.levels[level]` and remove the dominant left and/or right margin
        from the styles that carry it, when `analyze_stats` approves.

        The statistics are stored in `self.stats` under `level + '_left'` and
        `level + '_right'`. NegativeTextIndent raised by `get_margins`
        propagates to the caller.
        '''
        elems = self.levels[level]
        left_stats = self.stats[level+'_left'] = Counter()
        right_stats = self.stats[level+'_right'] = Counter()

        for elem in elems:
            lm, rm = self.get_margins(elem)[:2]
            left_stats[lm] += 1
            right_stats[rm] += 1

        self.log.debug(level, ' left margin stats:', left_stats)
        self.log.debug(level, ' right margin stats:', right_stats)

        remove_left = self.analyze_stats(left_stats)
        remove_right = self.analyze_stats(right_stats)

        mcl = mcr = None
        if remove_left:
            mcl = left_stats.most_common(1)[0][0]
            self.log('Removing level %s left margin of:'%level, mcl)

        if remove_right:
            mcr = right_stats.most_common(1)[0][0]
            self.log('Removing level %s right margin of:'%level, mcr)

        if not (remove_left or remove_right):
            return

        for elem in elems:
            # The margins are queried again on purpose: styles are shared
            # between elements, so an earlier removal is visible here.
            lm, rm, style = self.get_margins(elem)
            if style is None:
                # No style means empty margins, which never equal the
                # (non-empty) dominant margin.
                continue
            if remove_left and lm == mcl:
                style.removeProperty('margin-left')
            if remove_right and rm == mcr:
                style.removeProperty('margin-right')

    def find_levels(self):
        '''
        Fill `self.levels` with the paragraphs and divs of every spine item,
        keyed by '<tag>_<depth>' where depth is 1 for direct children of the
        body.

        Afterwards, paragraph levels with fewer than 25 elements and div
        levels deeper than 2 with fewer than 25 elements are dropped. For div
        levels shallower than 3 only the divs with at least 5 descendant
        paragraphs/divs are kept.
        '''

        def level_of(elem, body):
            # Number of steps needed to walk from elem up to body.
            ans = 1
            while elem.getparent() is not body:
                ans += 1
                elem = elem.getparent()
            return ans

        paras = XPath('descendant::h:p|descendant::h:div')

        for item in self.oeb.spine:
            body = XPath('//h:body')(item.data)
            if not body:
                continue
            body = body[0]

            for p in paras(body):
                level = level_of(p, body)
                level = '%s_%d' % (parse_utils.barename(p.tag), level)
                self.levels.setdefault(level, []).append(p)

        remove = set()
        for k, v in self.levels.items():
            num = len(v)
            self.log.debug('Found %d items of level:'%num, k)
            level = int(k.split('_')[-1])
            tag = k.split('_')[0]
            if tag == 'p' and num < 25:
                remove.add(k)
            if tag == 'div':
                if level > 2 and num < 25:
                    remove.add(k)
                elif level < 3:
                    # Check each level < 3 element and only keep those
                    # that have many child paras. The list is filtered in
                    # place so that self.levels keeps the same list object.
                    v[:] = [elem for elem in v if len(paras(elem)) >= 5]

        for k in remove:
            self.levels.pop(k)
            self.log.debug('Ignoring level', k)

    def analyze_stats(self, stats):
        '''
        Decide whether the most common margin in `stats` (a Counter mapping
        margin value to number of occurrences) should be removed.

        Returns False for empty stats and when the most common margin is
        empty or '0'. Otherwise returns True only if more than 95% of the
        counted elements share the most common margin.
        '''
        if not stats:
            return False
        most_common, most_common_count = stats.most_common(1)[0]
        if not most_common or most_common == '0':
            return False
        total = sum(stats.values())
        # True if greater than 95% of elements have the same margin
        return most_common_count/total > 0.95
|