mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-17 19:43:34 +02:00
Added docx writer related modules
This commit is contained in:
316
ebook_converter/ebooks/oeb/transforms/subset.py
Normal file
316
ebook_converter/ebooks/oeb/transforms/subset.py
Normal file
@@ -0,0 +1,316 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from calibre.ebooks.oeb.base import urlnormalize, css_text
|
||||
from calibre.utils.fonts.sfnt.subset import subset, NoGlyphs, UnsupportedFont
|
||||
from polyglot.builtins import iteritems, itervalues, unicode_type, range
|
||||
from tinycss.fonts3 import parse_font_family
|
||||
|
||||
|
||||
def get_font_properties(rule, default=None):
|
||||
'''
|
||||
Given a CSS rule, extract normalized font properties from
|
||||
it. Note that shorthand font property should already have been expanded
|
||||
by the CSS flattening code.
|
||||
'''
|
||||
props = {}
|
||||
s = rule.style
|
||||
for q in ('font-family', 'src', 'font-weight', 'font-stretch',
|
||||
'font-style'):
|
||||
g = 'uri' if q == 'src' else 'value'
|
||||
try:
|
||||
val = s.getProperty(q).propertyValue[0]
|
||||
val = getattr(val, g)
|
||||
if q == 'font-family':
|
||||
val = parse_font_family(css_text(s.getProperty(q).propertyValue))
|
||||
if val and val[0] == 'inherit':
|
||||
val = None
|
||||
except (IndexError, KeyError, AttributeError, TypeError, ValueError):
|
||||
val = None if q in {'src', 'font-family'} else default
|
||||
if q in {'font-weight', 'font-stretch', 'font-style'}:
|
||||
val = unicode_type(val).lower() if (val or val == 0) else val
|
||||
if val == 'inherit':
|
||||
val = default
|
||||
if q == 'font-weight':
|
||||
val = {'normal':'400', 'bold':'700'}.get(val, val)
|
||||
if val not in {'100', '200', '300', '400', '500', '600', '700',
|
||||
'800', '900', 'bolder', 'lighter'}:
|
||||
val = default
|
||||
if val == 'normal':
|
||||
val = '400'
|
||||
elif q == 'font-style':
|
||||
if val not in {'normal', 'italic', 'oblique'}:
|
||||
val = default
|
||||
elif q == 'font-stretch':
|
||||
if val not in {'normal', 'ultra-condensed', 'extra-condensed',
|
||||
'condensed', 'semi-condensed', 'semi-expanded',
|
||||
'expanded', 'extra-expanded', 'ultra-expanded'}:
|
||||
val = default
|
||||
props[q] = val
|
||||
return props
|
||||
|
||||
|
||||
def find_font_face_rules(sheet, oeb):
|
||||
'''
|
||||
Find all @font-face rules in the given sheet and extract the relevant info from them.
|
||||
sheet can be either a ManifestItem or a CSSStyleSheet.
|
||||
'''
|
||||
ans = []
|
||||
try:
|
||||
rules = sheet.data.cssRules
|
||||
except AttributeError:
|
||||
rules = sheet.cssRules
|
||||
|
||||
for i, rule in enumerate(rules):
|
||||
if rule.type != rule.FONT_FACE_RULE:
|
||||
continue
|
||||
props = get_font_properties(rule, default='normal')
|
||||
if not props['font-family'] or not props['src']:
|
||||
continue
|
||||
|
||||
try:
|
||||
path = sheet.abshref(props['src'])
|
||||
except AttributeError:
|
||||
path = props['src']
|
||||
ff = oeb.manifest.hrefs.get(urlnormalize(path), None)
|
||||
if not ff:
|
||||
continue
|
||||
props['item'] = ff
|
||||
if props['font-weight'] in {'bolder', 'lighter'}:
|
||||
props['font-weight'] = '400'
|
||||
props['weight'] = int(props['font-weight'])
|
||||
props['rule'] = rule
|
||||
props['chars'] = set()
|
||||
ans.append(props)
|
||||
|
||||
return ans
|
||||
|
||||
|
||||
def elem_style(style_rules, cls, inherited_style):
|
||||
'''
|
||||
Find the effective style for the given element.
|
||||
'''
|
||||
classes = cls.split()
|
||||
style = inherited_style.copy()
|
||||
for cls in classes:
|
||||
style.update(style_rules.get(cls, {}))
|
||||
wt = style.get('font-weight', None)
|
||||
pwt = inherited_style.get('font-weight', '400')
|
||||
if wt == 'bolder':
|
||||
style['font-weight'] = {
|
||||
'100':'400',
|
||||
'200':'400',
|
||||
'300':'400',
|
||||
'400':'700',
|
||||
'500':'700',
|
||||
}.get(pwt, '900')
|
||||
elif wt == 'lighter':
|
||||
style['font-weight'] = {
|
||||
'600':'400', '700':'400',
|
||||
'800':'700', '900':'700'}.get(pwt, '100')
|
||||
|
||||
return style
|
||||
|
||||
|
||||
class SubsetFonts(object):
|
||||
|
||||
'''
|
||||
Subset all embedded fonts. Must be run after CSS flattening, as it requires
|
||||
CSS normalization and flattening to work.
|
||||
'''
|
||||
|
||||
def __call__(self, oeb, log, opts):
|
||||
self.oeb, self.log, self.opts = oeb, log, opts
|
||||
|
||||
self.find_embedded_fonts()
|
||||
if not self.embedded_fonts:
|
||||
self.log.debug('No embedded fonts found')
|
||||
return
|
||||
self.find_style_rules()
|
||||
self.find_font_usage()
|
||||
|
||||
totals = [0, 0]
|
||||
|
||||
def remove(font):
|
||||
totals[1] += len(font['item'].data)
|
||||
self.oeb.manifest.remove(font['item'])
|
||||
font['rule'].parentStyleSheet.deleteRule(font['rule'])
|
||||
|
||||
fonts = {}
|
||||
for font in self.embedded_fonts:
|
||||
item, chars = font['item'], font['chars']
|
||||
if item.href in fonts:
|
||||
fonts[item.href]['chars'] |= chars
|
||||
else:
|
||||
fonts[item.href] = font
|
||||
|
||||
for font in itervalues(fonts):
|
||||
if not font['chars']:
|
||||
self.log('The font %s is unused. Removing it.'%font['src'])
|
||||
remove(font)
|
||||
continue
|
||||
try:
|
||||
raw, old_stats, new_stats = subset(font['item'].data, font['chars'])
|
||||
except NoGlyphs:
|
||||
self.log('The font %s has no used glyphs. Removing it.'%font['src'])
|
||||
remove(font)
|
||||
continue
|
||||
except UnsupportedFont as e:
|
||||
self.log.warn('The font %s is unsupported for subsetting. %s'%(
|
||||
font['src'], e))
|
||||
sz = len(font['item'].data)
|
||||
totals[0] += sz
|
||||
totals[1] += sz
|
||||
else:
|
||||
font['item'].data = raw
|
||||
nlen = sum(itervalues(new_stats))
|
||||
olen = sum(itervalues(old_stats))
|
||||
self.log('Decreased the font %s to %.1f%% of its original size'%
|
||||
(font['src'], nlen/olen *100))
|
||||
totals[0] += nlen
|
||||
totals[1] += olen
|
||||
|
||||
font['item'].unload_data_from_memory()
|
||||
|
||||
if totals[0]:
|
||||
self.log('Reduced total font size to %.1f%% of original'%
|
||||
(totals[0]/totals[1] * 100))
|
||||
|
||||
def find_embedded_fonts(self):
|
||||
'''
|
||||
Find all @font-face rules and extract the relevant info from them.
|
||||
'''
|
||||
self.embedded_fonts = []
|
||||
for item in self.oeb.manifest:
|
||||
if not hasattr(item.data, 'cssRules'):
|
||||
continue
|
||||
self.embedded_fonts.extend(find_font_face_rules(item, self.oeb))
|
||||
|
||||
def find_style_rules(self):
|
||||
'''
|
||||
Extract all font related style information from all stylesheets into a
|
||||
dict mapping classes to font properties specified by that class. All
|
||||
the heavy lifting has already been done by the CSS flattening code.
|
||||
'''
|
||||
rules = defaultdict(dict)
|
||||
for item in self.oeb.manifest:
|
||||
if not hasattr(item.data, 'cssRules'):
|
||||
continue
|
||||
for i, rule in enumerate(item.data.cssRules):
|
||||
if rule.type != rule.STYLE_RULE:
|
||||
continue
|
||||
props = {k:v for k,v in
|
||||
iteritems(get_font_properties(rule)) if v}
|
||||
if not props:
|
||||
continue
|
||||
for sel in rule.selectorList:
|
||||
sel = sel.selectorText
|
||||
if sel and sel.startswith('.'):
|
||||
# We dont care about pseudo-selectors as the worst that
|
||||
# can happen is some extra characters will remain in
|
||||
# the font
|
||||
sel = sel.partition(':')[0]
|
||||
rules[sel[1:]].update(props)
|
||||
|
||||
self.style_rules = dict(rules)
|
||||
|
||||
def find_font_usage(self):
|
||||
for item in self.oeb.manifest:
|
||||
if not hasattr(item.data, 'xpath'):
|
||||
continue
|
||||
for body in item.data.xpath('//*[local-name()="body"]'):
|
||||
base = {'font-family':['serif'], 'font-weight': '400',
|
||||
'font-style':'normal', 'font-stretch':'normal'}
|
||||
self.find_usage_in(body, base)
|
||||
|
||||
def used_font(self, style):
|
||||
'''
|
||||
Given a style find the embedded font that matches it. Returns None if
|
||||
no match is found (can happen if no family matches).
|
||||
'''
|
||||
ff = style.get('font-family', [])
|
||||
lnames = {unicode_type(x).lower() for x in ff}
|
||||
matching_set = []
|
||||
|
||||
# Filter on font-family
|
||||
for ef in self.embedded_fonts:
|
||||
flnames = {x.lower() for x in ef.get('font-family', [])}
|
||||
if not lnames.intersection(flnames):
|
||||
continue
|
||||
matching_set.append(ef)
|
||||
if not matching_set:
|
||||
return None
|
||||
|
||||
# Filter on font-stretch
|
||||
widths = {x:i for i, x in enumerate(('ultra-condensed',
|
||||
'extra-condensed', 'condensed', 'semi-condensed', 'normal',
|
||||
'semi-expanded', 'expanded', 'extra-expanded', 'ultra-expanded'
|
||||
))}
|
||||
|
||||
width = widths[style.get('font-stretch', 'normal')]
|
||||
for f in matching_set:
|
||||
f['width'] = widths[style.get('font-stretch', 'normal')]
|
||||
|
||||
min_dist = min(abs(width-f['width']) for f in matching_set)
|
||||
nearest = [f for f in matching_set if abs(width-f['width']) ==
|
||||
min_dist]
|
||||
if width <= 4:
|
||||
lmatches = [f for f in nearest if f['width'] <= width]
|
||||
else:
|
||||
lmatches = [f for f in nearest if f['width'] >= width]
|
||||
matching_set = (lmatches or nearest)
|
||||
|
||||
# Filter on font-style
|
||||
fs = style.get('font-style', 'normal')
|
||||
order = {
|
||||
'oblique':['oblique', 'italic', 'normal'],
|
||||
'normal':['normal', 'oblique', 'italic']
|
||||
}.get(fs, ['italic', 'oblique', 'normal'])
|
||||
for q in order:
|
||||
matches = [f for f in matching_set if f.get('font-style', 'normal') == q]
|
||||
if matches:
|
||||
matching_set = matches
|
||||
break
|
||||
|
||||
# Filter on font weight
|
||||
fw = int(style.get('font-weight', '400'))
|
||||
if fw == 400:
|
||||
q = [400, 500, 300, 200, 100, 600, 700, 800, 900]
|
||||
elif fw == 500:
|
||||
q = [500, 400, 300, 200, 100, 600, 700, 800, 900]
|
||||
elif fw < 400:
|
||||
q = [fw] + list(range(fw-100, -100, -100)) + list(range(fw+100,
|
||||
100, 1000))
|
||||
else:
|
||||
q = [fw] + list(range(fw+100, 100, 1000)) + list(range(fw-100,
|
||||
-100, -100))
|
||||
for wt in q:
|
||||
matches = [f for f in matching_set if f['weight'] == wt]
|
||||
if matches:
|
||||
return matches[0]
|
||||
|
||||
def find_chars(self, elem):
|
||||
ans = set()
|
||||
if elem.text:
|
||||
ans |= set(elem.text)
|
||||
for child in elem:
|
||||
if child.tail:
|
||||
ans |= set(child.tail)
|
||||
return ans
|
||||
|
||||
def find_usage_in(self, elem, inherited_style):
|
||||
style = elem_style(self.style_rules, elem.get('class', '') or '', inherited_style)
|
||||
for child in elem:
|
||||
self.find_usage_in(child, style)
|
||||
font = self.used_font(style)
|
||||
if font:
|
||||
chars = self.find_chars(elem)
|
||||
if chars:
|
||||
font['chars'] |= chars
|
||||
Reference in New Issue
Block a user