mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-21 01:05:52 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
389 lines
15 KiB
Python
389 lines
15 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=utf-8
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
import re, string
|
|
from collections import Counter, defaultdict
|
|
from functools import partial
|
|
|
|
from lxml.html.builder import OL, UL, SPAN
|
|
|
|
from ebook_converter.ebooks.docx.block_styles import ParagraphStyle
|
|
from ebook_converter.ebooks.docx.char_styles import RunStyle, inherit
|
|
from ebook_converter.ebooks.metadata import roman
|
|
from ebook_converter.polyglot.builtins import iteritems, unicode_type
|
|
|
|
# Translation table from the ``w:val`` of a DOCX ``w:numFmt`` element to the
# CSS ``list-style-type`` keyword used for it.  Formats that are not listed
# here are looked up with a 'decimal' fallback by Level.read_from_xml().
STYLE_MAP = dict(
    aiueo='hiragana',
    aiueoFullWidth='hiragana',
    hebrew1='hebrew',
    iroha='katakana-iroha',
    irohaFullWidth='katakana-iroha',
    lowerLetter='lower-alpha',
    lowerRoman='lower-roman',
    none='none',
    upperLetter='upper-alpha',
    upperRoman='upper-roman',
    chineseCounting='cjk-ideographic',
    decimalZero='decimal-leading-zero',
)
|
|
|
|
|
|
def alphabet(val, lower=True):
    """Return the single ASCII letter that labels list position ``val``.

    Position 1 maps to the first letter.  Positions beyond the end of the
    alphabet wrap around to the start again, and the distance from 1 is taken
    as an absolute value, so positions below 1 mirror upwards (0 -> second
    letter).

    :param val: integer counter value.
    :param lower: pick from lowercase letters when true, uppercase otherwise.
    """
    if lower:
        letters = string.ascii_lowercase
    else:
        letters = string.ascii_uppercase
    offset = abs(val - 1)
    return letters[offset % len(letters)]
|
|
|
|
|
|
# Formatters, keyed by CSS list-style-type, that turn an integer counter value
# into the text shown for it.  Level.format_template() falls back to plain
# '%d' formatting for any style that is not listed here.
alphabet_map = {
    'lower-alpha': alphabet,
    'upper-alpha': partial(alphabet, lower=False),
    'lower-roman': lambda x: roman(x).lower(),
    'upper-roman': roman,
    # Zero-pad to a width of two: 9 -> '09' but 10 -> '10'.  Blindly
    # prefixing '0' would render 10 as '010'.
    'decimal-leading-zero': lambda x: '%02d' % x,
}
|
|
|
|
|
|
class Level(object):
    """A single list level, populated from a ``w:lvl`` element.

    Holds the numbering format, start/restart values, optional custom
    templates for the item label and the paragraph/character styles attached
    to the level.
    """

    def __init__(self, namespace, lvl=None):
        """Create a level with default settings, then load ``lvl`` if given.

        :param namespace: object providing ``XPath`` and ``get`` helpers; it
            is only stored here and first used by read_from_xml().
        :param lvl: optional ``w:lvl`` element to read the settings from.
        """
        self.namespace = namespace
        # Numbering behaviour
        self.fmt = 'decimal'
        self.start = 0
        self.restart = None
        self.is_numbered = False
        # Custom label text (with %N placeholders) for numbers / bullets
        self.num_template = None
        self.bullet_template = None
        self.pic_id = None
        # Linked styles
        self.para_link = None
        self.paragraph_style = None
        self.character_style = None

        if lvl is not None:
            self.read_from_xml(lvl)

    def copy(self):
        """Return a new Level carrying the same settings.

        The attribute values themselves are shared with the original (style
        objects are not duplicated).
        """
        clone = Level(self.namespace)
        copied = (
            'restart', 'pic_id', 'start', 'fmt', 'para_link',
            'paragraph_style', 'character_style', 'is_numbered',
            'num_template', 'bullet_template',
        )
        for name in copied:
            setattr(clone, name, getattr(self, name))
        return clone

    def format_template(self, counter, ilvl, template):
        """Expand the ``%N`` placeholders of ``template`` into label text.

        :param counter: mapping of level index -> current counter value.
        :param ilvl: index of the level being rendered.
        :param template: label text in which ``%N`` refers to level ``N - 1``.
        :return: the expanded text, right-stripped, followed by a
            non-breaking space.
        """
        def expand(match):
            level = int(match.group(1)) - 1
            if level <= ilvl and level in counter:
                number = counter[level]
                if level != ilvl:
                    # Counters of shallower levels are one ahead of the value
                    # that was displayed for them.
                    number -= 1
                to_text = alphabet_map.get(self.fmt, lambda x: '%d' % x)
                return to_text(number)
            # Placeholders for deeper or unknown levels vanish
            return ''

        expanded = re.sub(r'%(\d+)', expand, template)
        return expanded.rstrip() + '\xa0'

    def read_from_xml(self, lvl, override=False):
        """Update this level from the children of the ``w:lvl`` element.

        Settings that are absent from ``lvl`` keep their current value.

        :param lvl: the ``w:lvl`` element to read.
        :param override: accepted for callers that pass it; this method does
            not consult it.
        """
        XPath, get = self.namespace.XPath, self.namespace.get

        def last_int(expr, fallback):
            # Value of the last matching element whose w:val parses as int
            result = fallback
            for node in XPath(expr)(lvl):
                try:
                    result = int(get(node, 'w:val'))
                except (TypeError, ValueError):
                    continue
            return result

        self.restart = last_int('./w:lvlRestart[@w:val]', self.restart)
        self.start = last_int('./w:start[@w:val]', self.start)

        for rpr in XPath('./w:rPr')(lvl):
            run_style = RunStyle(self.namespace, rpr)
            if self.character_style is not None:
                self.character_style.update(run_style)
            else:
                self.character_style = run_style

        # The label text is needed below to interpret the number format
        level_text = None
        for node in XPath('./w:lvlText[@w:val]')(lvl):
            level_text = get(node, 'w:val')

        for node in XPath('./w:numFmt[@w:val]')(lvl):
            num_format = get(node, 'w:val')
            if num_format != 'bullet':
                self.is_numbered = True
                self.fmt = STYLE_MAP.get(num_format, 'decimal')
                # A plain "%N." label needs no custom template
                if level_text and re.match(r'%\d+\.$', level_text) is None:
                    self.num_template = level_text
                continue

            self.is_numbered = False
            style = self.character_style
            known_glyph = level_text in {'\uf0a7', 'o'}
            if known_glyph or (
                    style is not None and
                    style.font_family is not inherit and
                    style.font_family.lower() in {'wingdings', 'symbol'}):
                # Bullets that correspond to a CSS list-style-type
                self.fmt = {'\uf0a7': 'square', 'o': 'circle'}.get(level_text, 'disc')
            else:
                # Anything else is kept as literal bullet text
                self.bullet_template = level_text
            for pic in XPath('./w:lvlPicBulletId[@w:val]')(lvl):
                self.pic_id = get(pic, 'w:val')

        for node in XPath('./w:pStyle[@w:val]')(lvl):
            self.para_link = get(node, 'w:val')

        for ppr in XPath('./w:pPr')(lvl):
            para_style = ParagraphStyle(self.namespace, ppr)
            if self.paragraph_style is not None:
                self.paragraph_style.update(para_style)
            else:
                self.paragraph_style = para_style

    def css(self, images, pic_map, rid_map):
        """Return the CSS declarations for the list wrapper of this level.

        :param images: object whose ``generate_filename`` yields the file name
            of the picture bullet.
        :param pic_map: mapping of picture bullet id -> relationship id.
        :param rid_map: passed through to ``images.generate_filename``.
        :return: dict that always has ``list-style-type`` and additionally
            ``list-style-image`` when a picture bullet could be resolved.
        """
        rules = {'list-style-type': self.fmt}
        if not self.pic_id:
            return rules
        rid = pic_map.get(self.pic_id, None)
        if not rid:
            return rules
        try:
            fname = images.generate_filename(rid, rid_map=rid_map, max_width=20, max_height=20)
        except Exception:
            # No usable image: fall back to the plain list style
            return rules
        rules['list-style-image'] = 'url("images/%s")' % fname
        return rules

    def char_css(self):
        """Return the character-style CSS of the label, minus ``font-family``.

        Yields an empty dict when no character style is available.  Note that
        ``font-family`` is popped in place from the dict that
        ``character_style.css`` returned.
        """
        try:
            declarations = self.character_style.css
        except AttributeError:
            return {}
        declarations.pop('font-family', None)
        return declarations
|
|
|
|
|
|
class NumberingDefinition(object):
    """The list levels of one numbering definition, keyed by level index."""

    def __init__(self, namespace, parent=None, an_id=None):
        """Collect a Level for every ``w:lvl`` child of ``parent``.

        :param namespace: object providing the ``XPath`` and ``get`` helpers.
        :param parent: element holding the ``w:lvl`` children; when omitted
            the definition starts out without levels.
        :param an_id: id of the abstract numbering definition this belongs to.
        """
        self.namespace = namespace
        XPath, get = self.namespace.XPath, self.namespace.get
        self.levels = {}
        self.abstract_numbering_definition_id = an_id
        if parent is None:
            return
        for node in XPath('./w:lvl')(parent):
            try:
                index = int(get(node, 'w:ilvl', 0))
            except (TypeError, ValueError):
                # Unparseable level index: treat as the top level
                index = 0
            self.levels[index] = Level(namespace, node)

    def copy(self):
        """Return a new definition with the same id and copies of all levels."""
        clone = NumberingDefinition(self.namespace, an_id=self.abstract_numbering_definition_id)
        clone.levels.update(
            (index, level.copy()) for index, level in iteritems(self.levels))
        return clone
|
|
|
|
|
|
class Numbering(object):
    """Reads the numbering definitions of a DOCX document and applies them.

    Calling the instance parses the numbering part; apply_markup() then turns
    the numbered paragraphs of the converted document into HTML lists.
    """

    def __init__(self, namespace):
        """
        :param namespace: object providing the ``XPath`` and ``get`` helpers.
        """
        self.namespace = namespace
        # abstractNumId -> NumberingDefinition
        self.definitions = {}
        # numId -> NumberingDefinition (definition copy with overrides applied)
        self.instances = {}
        # abstractNumId -> Counter of level index -> current value
        self.counters = defaultdict(Counter)
        # numId -> {level index: start value}
        self.starts = {}
        # numPicBulletId -> relationship id of the bullet image
        self.pic_map = {}

    def __call__(self, root, styles, rid_map):
        """Read all numbering style definitions.

        :param root: root element of the numbering part.
        :param styles: object whose ``numbering_style_links`` maps a style
            link name to a numId.
        :param rid_map: relationship map, stored for later image lookups.
        """
        XPath, get = self.namespace.XPath, self.namespace.get
        self.rid_map = rid_map
        for npb in XPath('./w:numPicBullet[@w:numPicBulletId]')(root):
            npbid = get(npb, 'w:numPicBulletId')
            for idata in XPath('descendant::v:imagedata[@r:id]')(npb):
                rid = get(idata, 'r:id')
                self.pic_map[npbid] = rid

        # Abstract definitions that merely link to a numbering style are
        # resolved once the instances they point at have been created.
        lazy_load = {}
        for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
            an_id = get(an, 'w:abstractNumId')
            nsl = XPath('./w:numStyleLink[@w:val]')(an)
            if nsl:
                lazy_load[an_id] = get(nsl[0], 'w:val')
            else:
                nd = NumberingDefinition(self.namespace, an, an_id=an_id)
                self.definitions[an_id] = nd

        def create_instance(n, definition):
            # Copy ``definition`` and apply the w:lvlOverride children of the
            # w:num element ``n`` to the copy.
            nd = definition.copy()
            start_overrides = {}
            for lo in XPath('./w:lvlOverride')(n):
                try:
                    ilvl = int(get(lo, 'w:ilvl'))
                except (ValueError, TypeError):
                    ilvl = None
                for so in XPath('./w:startOverride[@w:val]')(lo):
                    try:
                        start_override = int(get(so, 'w:val'))
                    except (TypeError, ValueError):
                        pass
                    else:
                        start_overrides[ilvl] = start_override
                for lvl in XPath('./w:lvl')(lo)[:1]:
                    if ilvl is None:
                        # Fall back to the index on the nested w:lvl.  Level
                        # keys are ints, so the attribute must be converted.
                        try:
                            ilvl = int(get(lvl, 'w:ilvl'))
                        except (TypeError, ValueError):
                            ilvl = None
                    alvl = nd.levels.get(ilvl, None)
                    if alvl is None:
                        alvl = Level(self.namespace)
                        if ilvl is not None:
                            # Register the new level, otherwise the override
                            # would be read and then thrown away.
                            nd.levels[ilvl] = alvl
                    alvl.read_from_xml(lvl, override=True)
            for ilvl, so in iteritems(start_overrides):
                try:
                    # Use the value recorded for *this* level, not whichever
                    # start override happened to be parsed last.
                    nd.levels[ilvl].start = so
                except KeyError:
                    pass
            return nd

        # w:num elements whose abstract definition is not available yet
        next_pass = {}
        for n in XPath('./w:num[@w:numId]')(root):
            an_id = None
            num_id = get(n, 'w:numId')
            for an in XPath('./w:abstractNumId[@w:val]')(n):
                an_id = get(an, 'w:val')
            d = self.definitions.get(an_id, None)
            if d is None:
                next_pass[num_id] = (an_id, n)
                continue
            self.instances[num_id] = create_instance(n, d)

        numbering_links = styles.numbering_style_links
        for an_id, style_link in iteritems(lazy_load):
            num_id = numbering_links[style_link]
            self.definitions[an_id] = self.instances[num_id].copy()

        for num_id, (an_id, n) in iteritems(next_pass):
            d = self.definitions.get(an_id, None)
            if d is not None:
                self.instances[num_id] = create_instance(n, d)

        for num_id, d in iteritems(self.instances):
            self.starts[num_id] = {lvl: d.levels[lvl].start for lvl in d.levels}

    def get_pstyle(self, num_id, style_id):
        """Return the index of the level of ``num_id`` that is linked to the
        paragraph style ``style_id``, or None when there is none."""
        d = self.instances.get(num_id, None)
        if d is not None:
            for ilvl, lvl in iteritems(d.levels):
                if lvl.para_link == style_id:
                    return ilvl

    def get_para_style(self, num_id, lvl):
        """Return the paragraph style of level ``lvl`` of instance ``num_id``,
        or None when the instance or level is unknown."""
        d = self.instances.get(num_id, None)
        if d is not None:
            lvl = d.levels.get(lvl, None)
            return getattr(lvl, 'paragraph_style', None)

    def update_counter(self, counter, levelnum, levels):
        """Advance ``counter`` after an item at level ``levelnum``.

        The counter of ``levelnum`` is incremented.  A level is reset to its
        start value when its ``restart`` equals ``levelnum + 1``, or when it
        has no ``restart`` and is the level directly below ``levelnum``.

        :param counter: mapping of level index -> current value (mutated).
        :param levelnum: index of the level that was just emitted.
        :param levels: mapping of level index -> Level.
        """
        counter[levelnum] += 1
        for ilvl, lvl in iteritems(levels):
            restart = lvl.restart
            if (restart is None and ilvl == levelnum + 1) or restart == levelnum + 1:
                counter[ilvl] = lvl.start

    def apply_markup(self, items, body, styles, object_map, images):
        """Convert numbered paragraphs into HTML list markup.

        :param items: iterable of ``(element, num_id, ilvl)`` for every
            paragraph that carries numbering.
        :param body: element whose descendants are scanned for list items.
        :param styles: provides ``register(css, prefix)`` for CSS classes and
            ``para_cache`` with the block style of each paragraph object.
        :param object_map: maps an output element to its source object, which
            is the key used for ``styles.para_cache``.
        :param images: passed to Level.css() to resolve picture bullets.
        """
        # Pass 1: tag every numbered paragraph as <li> and record its list
        # id, level, counter value and (if any) expanded label text.
        seen_instances = set()
        for p, num_id, ilvl in items:
            d = self.instances.get(num_id, None)
            if d is not None:
                lvl = d.levels.get(ilvl, None)
                if lvl is not None:
                    an_id = d.abstract_numbering_definition_id
                    counter = self.counters[an_id]
                    # First use of this level, or of this instance: begin at
                    # the start value of the instance.
                    if ilvl not in counter or num_id not in seen_instances:
                        counter[ilvl] = self.starts[num_id][ilvl]
                    seen_instances.add(num_id)
                    p.tag = 'li'
                    p.set('value', '%s' % counter[ilvl])
                    p.set('list-lvl', unicode_type(ilvl))
                    p.set('list-id', num_id)
                    if lvl.num_template is not None:
                        val = lvl.format_template(counter, ilvl, lvl.num_template)
                        p.set('list-template', val)
                    elif lvl.bullet_template is not None:
                        val = lvl.format_template(counter, ilvl, lvl.bullet_template)
                        p.set('list-template', val)
                    self.update_counter(counter, ilvl, d.levels)

        templates = {}

        def commit(current_run):
            # Wrap a run of adjacent <li> siblings of one list/level in an
            # <ol>/<ul> inserted where the run started, then empty the run.
            if not current_run:
                return
            start = current_run[0]
            parent = start.getparent()
            idx = parent.index(start)

            d = self.instances[start.get('list-id')]
            ilvl = int(start.get('list-lvl'))
            lvl = d.levels[ilvl]
            lvlid = start.get('list-id') + start.get('list-lvl')
            has_template = 'list-template' in start.attrib
            wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t')
            if has_template:
                # Marked so that it can be converted to a table further down
                wrap.set('lvlid', lvlid)
            else:
                wrap.set('class', styles.register(lvl.css(images, self.pic_map, self.rid_map), 'list'))
            ccss = lvl.char_css()
            if ccss:
                ccss = styles.register(ccss, 'bullet')
            parent.insert(idx, wrap)
            last_val = None
            for child in current_run:
                wrap.append(child)
                child.tail = '\n\t'
                if has_template:
                    # Move the item content into one span and put a second
                    # span holding the label text in front of it.
                    span = SPAN()
                    span.text = child.text
                    child.text = None
                    for gc in child:
                        span.append(gc)
                    child.append(span)
                    span = SPAN(child.get('list-template'))
                    if ccss:
                        span.set('class', ccss)
                    last = templates.get(lvlid, '')
                    if span.text and len(span.text) > len(last):
                        templates[lvlid] = span.text
                    child.insert(0, span)
                for attr in ('list-lvl', 'list-id', 'list-template'):
                    child.attrib.pop(attr, None)
                val = int(child.get('value'))
                # An explicit value is redundant when the item continues the
                # sequence, the list is unordered, or the list begins at 1.
                if last_val == val - 1 or wrap.tag == 'ul' or (last_val is None and val == 1):
                    child.attrib.pop('value')
                last_val = val
            current_run[-1].tail = '\n'
            del current_run[:]

        # Pass 2: group adjacent list items below every parent into lists
        parents = set()
        for child in body.iterdescendants('li'):
            parents.add(child.getparent())

        for parent in parents:
            current_run = []
            for child in parent:
                if child.tag == 'li':
                    if current_run:
                        last = current_run[-1]
                        if (last.get('list-id'), last.get('list-lvl')) != (child.get('list-id'), child.get('list-lvl')):
                            commit(current_run)
                    current_run.append(child)
                else:
                    commit(current_run)
            commit(current_run)

        # Convert the list items that use custom text for bullets into tables
        # so that they display correctly
        for wrap in body.xpath('//ol[@lvlid]'):
            wrap.attrib.pop('lvlid')
            wrap.tag = 'div'
            wrap.set('style', 'display:table')
            for i, li in enumerate(wrap.iterchildren('li')):
                li.tag = 'div'
                li.attrib.pop('value', None)
                li.set('style', 'display:table-row')
                obj = object_map[li]
                bs = styles.para_cache[obj]
                if i == 0:
                    # The left margin of the first item becomes the padding of
                    # the whole table
                    wrap.set('style', 'display:table; padding-left:%s' %
                             bs.css.get('margin-left', '0'))
                bs.css.pop('margin-left', None)
                for child in li:
                    child.set('style', 'display:table-cell')
|