mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-05 18:54:11 +01:00
386 lines
14 KiB
Python
386 lines
14 KiB
Python
import re, string
|
|
from collections import Counter, defaultdict
|
|
from functools import partial
|
|
|
|
from lxml.html.builder import OL, UL, SPAN
|
|
|
|
from ebook_converter.ebooks.docx.block_styles import ParagraphStyle
|
|
from ebook_converter.ebooks.docx.char_styles import RunStyle, inherit
|
|
from ebook_converter.ebooks.metadata import roman
|
|
from ebook_converter.polyglot.builtins import iteritems
|
|
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
# Translation table from the ``w:val`` of a ``w:numFmt`` element to the CSS
# ``list-style-type`` keyword used to render it. Formats that are not listed
# here are looked up with a 'decimal' fallback by Level.read_from_xml().
STYLE_MAP = {
    # Japanese / Hebrew
    'aiueo': 'hiragana',
    'aiueoFullWidth': 'hiragana',
    'hebrew1': 'hebrew',
    'iroha': 'katakana-iroha',
    'irohaFullWidth': 'katakana-iroha',
    # Latin letters and roman numerals
    'lowerLetter': 'lower-alpha',
    'lowerRoman': 'lower-roman',
    'none': 'none',
    'upperLetter': 'upper-alpha',
    'upperRoman': 'upper-roman',
    # Miscellaneous
    'chineseCounting': 'cjk-ideographic',
    'decimalZero': 'decimal-leading-zero',
}
|
|
|
|
|
|
def alphabet(val, lower=True):
    """Return the ASCII letter for the 1-based counter value *val*.

    The sequence wraps around after 26 (27 maps back to the first letter)
    and values below 1 are folded onto the positive range with ``abs``.
    *lower* selects lowercase (the default) or uppercase letters.
    """
    letters = string.ascii_uppercase
    if lower:
        letters = string.ascii_lowercase
    offset = abs(val - 1)
    return letters[offset % len(letters)]
|
|
|
|
|
|
# Formatters that turn an integer counter value into the text shown for a
# given CSS list-style-type. Styles that are missing here are rendered as a
# plain decimal number by Level.format_template().
alphabet_map = {
    'lower-alpha': alphabet,
    'upper-alpha': partial(alphabet, lower=False),
    'lower-roman': lambda x: roman(x).lower(),
    'upper-roman': roman,
    # Pad to two digits only: 1 -> "01", 9 -> "09", but 10 -> "10" (the
    # previous '0%d' formatting produced "010" for two-digit values).
    'decimal-leading-zero': lambda x: '%02d' % x,
}
|
|
|
|
|
|
class Level(object):
    """One level (``w:lvl``) of a numbering definition.

    Attributes populated from the XML:

    * ``restart`` -- integer value of ``w:lvlRestart`` or None
    * ``start`` -- integer value of ``w:start`` (0 when absent)
    * ``fmt`` -- CSS list-style-type for this level
    * ``para_link`` -- value of ``w:pStyle`` or None
    * ``paragraph_style`` / ``character_style`` -- style objects built from
      ``w:pPr`` / ``w:rPr`` or None
    * ``is_numbered`` -- False for bullets, True for every other format
    * ``num_template`` / ``bullet_template`` -- ``w:lvlText`` value when the
      marker has to be generated as text, else None
    * ``pic_id`` -- value of ``w:lvlPicBulletId`` or None
    """

    # Attributes transferred verbatim (by reference) when a level is copied.
    _COPIED_ATTRS = (
        'restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style',
        'character_style', 'is_numbered', 'num_template', 'bullet_template',
    )

    def __init__(self, namespace, lvl=None):
        """Create a level with default values, optionally read from *lvl*."""
        self.namespace = namespace
        self.restart = None
        self.start = 0
        self.fmt = 'decimal'
        self.para_link = None
        self.paragraph_style = None
        self.character_style = None
        self.is_numbered = False
        self.num_template = None
        self.bullet_template = None
        self.pic_id = None

        if lvl is not None:
            self.read_from_xml(lvl)

    def copy(self):
        """Return a new Level carrying the same attribute values.

        The copy is shallow: style objects are shared with the original.
        """
        duplicate = Level(self.namespace)
        for name in self._COPIED_ATTRS:
            setattr(duplicate, name, getattr(self, name))
        return duplicate

    def format_template(self, counter, ilvl, template):
        """Expand the ``%N`` placeholders of *template* for level *ilvl*.

        ``%N`` refers to the (N - 1)th level. Placeholders for levels deeper
        than *ilvl*, or for levels that have no entry in *counter*, expand to
        the empty string. Counters of levels other than *ilvl* are reported
        minus one. Trailing whitespace is stripped from the result and a
        no-break space is appended.
        """
        def expand(match):
            level_index = int(match.group(1)) - 1
            if level_index > ilvl or level_index not in counter:
                return ''
            number = counter[level_index]
            if level_index != ilvl:
                number -= 1
            render = alphabet_map.get(self.fmt, lambda x: '%d' % x)
            return render(number)

        expanded = re.sub(r'%(\d+)', expand, template)
        return expanded.rstrip() + '\xa0'

    def read_from_xml(self, lvl, override=False):
        """Update this level from the children of the ``w:lvl`` element *lvl*.

        Only the properties present in *lvl* are changed, so the method can
        be applied on top of already populated values. *override* is accepted
        but is not consulted by this method.
        """
        XPath, get = self.namespace.XPath, self.namespace.get

        def last_int(expr, current):
            # The last child whose w:val parses as an integer wins;
            # unparseable values leave the running value untouched.
            for node in XPath(expr)(lvl):
                try:
                    current = int(get(node, 'w:val'))
                except (TypeError, ValueError):
                    continue
            return current

        self.restart = last_int('./w:lvlRestart[@w:val]', self.restart)
        self.start = last_int('./w:start[@w:val]', self.start)

        for rpr in XPath('./w:rPr')(lvl):
            run_style = RunStyle(self.namespace, rpr)
            if self.character_style is not None:
                self.character_style.update(run_style)
            else:
                self.character_style = run_style

        lt = None
        for node in XPath('./w:lvlText[@w:val]')(lvl):
            lt = get(node, 'w:val')

        glyph_styles = {'\uf0a7': 'square', 'o': 'circle'}
        for node in XPath('./w:numFmt[@w:val]')(lvl):
            val = get(node, 'w:val')
            if val != 'bullet':
                self.is_numbered = True
                self.fmt = STYLE_MAP.get(val, 'decimal')
                # A template of the plain "%N." form needs no custom text
                if lt and re.match(r'%\d+\.$', lt) is None:
                    self.num_template = lt
                continue

            self.is_numbered = False
            cs = self.character_style
            if lt in glyph_styles:
                use_css_bullet = True
            else:
                # The font is only inspected when the text is not one of
                # the directly recognised glyphs
                use_css_bullet = (
                    cs is not None and
                    cs.font_family is not inherit and
                    cs.font_family.lower() in {'wingdings', 'symbol'}
                )
            if use_css_bullet:
                self.fmt = glyph_styles.get(lt, 'disc')
            else:
                self.bullet_template = lt
            for pic in XPath('./w:lvlPicBulletId[@w:val]')(lvl):
                self.pic_id = get(pic, 'w:val')

        for node in XPath('./w:pStyle[@w:val]')(lvl):
            self.para_link = get(node, 'w:val')

        for ppr in XPath('./w:pPr')(lvl):
            para_style = ParagraphStyle(self.namespace, ppr)
            if self.paragraph_style is not None:
                self.paragraph_style.update(para_style)
            else:
                self.paragraph_style = para_style

    def css(self, images, pic_map, rid_map):
        """Return the CSS properties for a list that uses this level.

        ``list-style-type`` is always present. ``list-style-image`` is added
        when the level has a picture bullet whose id is found in *pic_map*
        and *images* can generate a file name for it.
        """
        rules = {'list-style-type': self.fmt}
        if not self.pic_id:
            return rules
        rid = pic_map.get(self.pic_id, None)
        if not rid:
            return rules
        try:
            fname = images.generate_filename(
                rid, rid_map=rid_map, max_width=20, max_height=20)
        except Exception:
            # Best effort: keep the plain list style if the image fails
            return rules
        rules['list-style-image'] = 'url("images/%s")' % fname
        return rules

    def char_css(self):
        """Return the character CSS of the marker without ``font-family``.

        An empty dict is returned when the level has no character style.
        The dict obtained from the character style is modified in place.
        """
        try:
            marker_css = self.character_style.css
        except AttributeError:
            return {}
        marker_css.pop('font-family', None)
        return marker_css
|
|
|
|
|
|
class NumberingDefinition(object):
    """A set of Level objects keyed by their integer level index.

    ``abstract_numbering_definition_id`` records the *an_id* the definition
    was created with.
    """

    def __init__(self, namespace, parent=None, an_id=None):
        """Read every ``w:lvl`` child of *parent*, if a parent is given.

        A missing or unparseable ``w:ilvl`` attribute is treated as level 0.
        """
        self.namespace = namespace
        XPath, get = self.namespace.XPath, self.namespace.get
        self.levels = {}
        self.abstract_numbering_definition_id = an_id
        if parent is None:
            return
        for node in XPath('./w:lvl')(parent):
            try:
                index = int(get(node, 'w:ilvl', 0))
            except (TypeError, ValueError):
                index = 0
            self.levels[index] = Level(namespace, node)

    def copy(self):
        """Return a new definition with the same id and copied levels."""
        clone = NumberingDefinition(
            self.namespace, an_id=self.abstract_numbering_definition_id)
        clone.levels.update(
            (index, level.copy()) for index, level in iteritems(self.levels))
        return clone
|
|
|
|
|
|
class Numbering(object):
    """Numbering definitions and instances of a DOCX document.

    Calling the object with the root of the numbering XML reads the
    abstract definitions (``w:abstractNum``) and the concrete instances
    (``w:num``). ``apply_markup()`` then turns the numbered paragraphs of
    the generated HTML into list markup.
    """

    def __init__(self, namespace):
        self.namespace = namespace
        self.definitions = {}  # abstractNumId -> NumberingDefinition
        self.instances = {}  # numId -> NumberingDefinition
        # One Counter (level index -> current value) per abstract definition
        self.counters = defaultdict(Counter)
        self.starts = {}  # numId -> {level index: start value}
        self.pic_map = {}  # numPicBulletId -> relationship id of the image

    def __call__(self, root, styles, rid_map):
        """Read all numbering style definitions.

        :param root: root element of the numbering XML
        :param styles: object whose ``numbering_style_links`` maps a style
            link name to a numId
        :param rid_map: stored on the instance and handed to the image
            handling code when picture bullets are rendered
        """
        XPath, get = self.namespace.XPath, self.namespace.get
        self.rid_map = rid_map
        for npb in XPath('./w:numPicBullet[@w:numPicBulletId]')(root):
            npbid = get(npb, 'w:numPicBulletId')
            for idata in XPath('descendant::v:imagedata[@r:id]')(npb):
                self.pic_map[npbid] = get(idata, 'r:id')

        # Abstract definitions that only point at a numbering style are
        # resolved after the instances have been read
        lazy_load = {}
        for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
            an_id = get(an, 'w:abstractNumId')
            nsl = XPath('./w:numStyleLink[@w:val]')(an)
            if nsl:
                lazy_load[an_id] = get(nsl[0], 'w:val')
            else:
                self.definitions[an_id] = NumberingDefinition(
                    self.namespace, an, an_id=an_id)

        next_pass = {}
        for n in XPath('./w:num[@w:numId]')(root):
            an_id = None
            num_id = get(n, 'w:numId')
            for an in XPath('./w:abstractNumId[@w:val]')(n):
                an_id = get(an, 'w:val')
            d = self.definitions.get(an_id, None)
            if d is None:
                # Definition may become available once links are resolved
                next_pass[num_id] = (an_id, n)
                continue
            self.instances[num_id] = self._create_instance(n, d)

        numbering_links = styles.numbering_style_links
        for an_id, style_link in lazy_load.items():
            try:
                linked = self.instances[numbering_links[style_link]]
            except KeyError:
                # Dangling style link: leave the definition undefined rather
                # than aborting the whole read. Instances that depend on it
                # are simply not created below.
                continue
            self.definitions[an_id] = linked.copy()

        for num_id, (an_id, n) in next_pass.items():
            d = self.definitions.get(an_id, None)
            if d is not None:
                self.instances[num_id] = self._create_instance(n, d)

        for num_id, d in self.instances.items():
            self.starts[num_id] = {
                ilvl: level.start for ilvl, level in d.levels.items()}

    def _create_instance(self, n, definition):
        """Return a copy of *definition* with the overrides of ``w:num`` *n*."""
        XPath, get = self.namespace.XPath, self.namespace.get
        nd = definition.copy()
        start_overrides = {}
        for lo in XPath('./w:lvlOverride')(n):
            try:
                ilvl = int(get(lo, 'w:ilvl'))
            except (ValueError, TypeError):
                ilvl = None
            for so in XPath('./w:startOverride[@w:val]')(lo):
                try:
                    start_override = int(get(so, 'w:val'))
                except (TypeError, ValueError):
                    pass
                else:
                    start_overrides[ilvl] = start_override
            for lvl in XPath('./w:lvl')(lo)[:1]:
                if ilvl is None:
                    # Fall back to the index of the overriding level itself,
                    # as an int so that it matches the keys of nd.levels
                    try:
                        ilvl = int(get(lvl, 'w:ilvl'))
                    except (ValueError, TypeError):
                        ilvl = None
                alvl = nd.levels.get(ilvl, None)
                if alvl is None:
                    alvl = Level(self.namespace)
                    if ilvl is not None:
                        # Register the new level, otherwise the override
                        # would be read and then thrown away
                        nd.levels[ilvl] = alvl
                alvl.read_from_xml(lvl, override=True)
        # Applied last so that w:startOverride wins over a w:start inside an
        # overriding w:lvl. Each level gets its own value.
        for ilvl, start in start_overrides.items():
            try:
                nd.levels[ilvl].start = start
            except KeyError:
                pass
        return nd

    def get_pstyle(self, num_id, style_id):
        """Return the index of the level of instance *num_id* linked to the
        paragraph style *style_id*, or None when there is none."""
        d = self.instances.get(num_id, None)
        if d is not None:
            for ilvl, lvl in d.levels.items():
                if lvl.para_link == style_id:
                    return ilvl
        return None

    def get_para_style(self, num_id, lvl):
        """Return the paragraph style of level *lvl* of instance *num_id*,
        or None when the instance or the level is unknown."""
        d = self.instances.get(num_id, None)
        if d is None:
            return None
        return getattr(d.levels.get(lvl, None), 'paragraph_style', None)

    def update_counter(self, counter, levelnum, levels):
        """Advance *counter* after an item of level *levelnum* was emitted.

        The counter of *levelnum* is incremented. The counter of every level
        whose ``restart`` equals ``levelnum + 1``, and of the level directly
        below *levelnum* when that level has no ``restart`` value, is reset
        to the level's start value.
        """
        counter[levelnum] += 1
        for ilvl, lvl in levels.items():
            restart = lvl.restart
            if (restart is None and ilvl == levelnum + 1) or restart == levelnum + 1:
                counter[ilvl] = lvl.start

    def apply_markup(self, items, body, styles, object_map, images):
        """Convert numbered paragraphs into HTML list markup.

        :param items: iterable of ``(element, num_id, ilvl)`` tuples
        :param body: element below which the list items are located
        :param styles: used to register CSS classes and to look up the
            cached block style of a paragraph (``para_cache``)
        :param object_map: maps an HTML element to the object used as key
            into ``styles.para_cache``
        :param images: passed on to ``Level.css()`` for picture bullets
        """
        # Pass 1: tag the paragraphs as <li> and record their numbering data
        # in temporary attributes
        seen_instances = set()
        for p, num_id, ilvl in items:
            d = self.instances.get(num_id, None)
            if d is None:
                continue
            lvl = d.levels.get(ilvl, None)
            if lvl is None:
                continue
            counter = self.counters[d.abstract_numbering_definition_id]
            if ilvl not in counter or num_id not in seen_instances:
                counter[ilvl] = self.starts[num_id][ilvl]
            seen_instances.add(num_id)
            p.tag = 'li'
            p.set('value', '%s' % counter[ilvl])
            p.set('list-lvl', str(ilvl))
            p.set('list-id', num_id)
            template = lvl.num_template
            if template is None:
                template = lvl.bullet_template
            if template is not None:
                p.set('list-template',
                      lvl.format_template(counter, ilvl, template))
            self.update_counter(counter, ilvl, d.levels)

        def commit(current_run):
            # Wrap a run of adjacent <li> elements that belong to the same
            # list and level into a single <ol>/<ul>, then empty the run
            if not current_run:
                return
            start = current_run[0]
            parent = start.getparent()
            idx = parent.index(start)

            d = self.instances[start.get('list-id')]
            ilvl = int(start.get('list-lvl'))
            lvl = d.levels[ilvl]
            lvlid = start.get('list-id') + start.get('list-lvl')
            has_template = 'list-template' in start.attrib
            wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t')
            if has_template:
                # Marks the wrapper for the table conversion in pass 3
                wrap.set('lvlid', lvlid)
            else:
                wrap.set('class', styles.register(
                    lvl.css(images, self.pic_map, self.rid_map), 'list'))
            ccss = lvl.char_css()
            if ccss:
                ccss = styles.register(ccss, 'bullet')
            parent.insert(idx, wrap)
            last_val = None
            for child in current_run:
                wrap.append(child)
                child.tail = '\n\t'
                if has_template:
                    # Move the content of the item into one <span> and put
                    # the generated marker text into a <span> before it
                    span = SPAN()
                    span.text = child.text
                    child.text = None
                    for gc in child:
                        span.append(gc)
                    child.append(span)
                    span = SPAN(child.get('list-template'))
                    if ccss:
                        span.set('class', ccss)
                    child.insert(0, span)
                for attr in ('list-lvl', 'list-id', 'list-template'):
                    child.attrib.pop(attr, None)
                val = int(child.get('value'))
                # Only keep an explicit value where the sequence is broken
                if last_val == val - 1 or wrap.tag == 'ul' or (last_val is None and val == 1):
                    child.attrib.pop('value')
                last_val = val
            current_run[-1].tail = '\n'
            del current_run[:]

        # Pass 2: group adjacent list items below each parent
        parents = set()
        for child in body.iterdescendants('li'):
            parents.add(child.getparent())

        for parent in parents:
            current_run = []
            for child in parent:
                if child.tag == 'li':
                    if current_run:
                        last = current_run[-1]
                        if (last.get('list-id'), last.get('list-lvl')) != (child.get('list-id'), child.get('list-lvl')):
                            commit(current_run)
                    current_run.append(child)
                else:
                    commit(current_run)
            commit(current_run)

        # Pass 3: convert the list items that use custom text for bullets
        # into tables so that they display correctly
        for wrap in body.xpath('//ol[@lvlid]'):
            wrap.attrib.pop('lvlid')
            wrap.tag = 'div'
            wrap.set('style', 'display:table')
            for i, li in enumerate(wrap.iterchildren('li')):
                li.tag = 'div'
                li.attrib.pop('value', None)
                li.set('style', 'display:table-row')
                obj = object_map[li]
                bs = styles.para_cache[obj]
                if i == 0:
                    # The left margin of the first item becomes the padding
                    # of the whole table
                    wrap.set('style', 'display:table; padding-left:%s' %
                             bs.css.get('margin-left', '0'))
                bs.css.pop('margin-left', None)
                for child in li:
                    child.set('style', 'display:table-cell')
|