from operator import itemgetter

from lxml import etree

from ebook_converter.utils.icu import partition_by_first_letter, sort_key
from ebook_converter.polyglot.builtins import iteritems


__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'


def get_applicable_xe_fields(index, xe_fields, XPath, expand):
    '''
    Filter the XE (index entry) fields down to the ones that apply to this
    index: matching entry type, optionally restricted to a letter range and
    to entries contained in a named bookmark.
    '''
    iet = index.get('entry-type', None)
    xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]

    lr = index.get('letter-range', None)
    if lr is not None:
        sl, el = lr.partition('-')[0::2]
        sl, el = sl.strip(), el.strip()
        if sl and el:
            def inrange(text):
                return sl <= text[0] <= el
            xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))]

    bmark = index.get('bookmark', None)
    if bmark is None:
        return xe_fields
    attr = expand('w:name')
    bookmarks = {b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem'])
                 if b.get(attr, None) == bmark}
    ancestors = XPath('ancestor::w:bookmarkStart')

    def contained(xe):
        # Check if the xe field is contained inside a bookmark with the
        # specified name
        return bool(set(ancestors(xe['start_elem'])) & bookmarks)

    return [xe for xe in xe_fields if contained(xe)]


def make_block(expand, style, parent, pos):
    p = parent.makeelement(expand('w:p'))
    parent.insert(pos, p)
    if style is not None:
        ppr = p.makeelement(expand('w:pPr'))
        p.append(ppr)
        ps = ppr.makeelement(expand('w:pStyle'))
        ppr.append(ps)
        ps.set(expand('w:val'), style)
    r = p.makeelement(expand('w:r'))
    p.append(r)
    t = r.makeelement(expand('w:t'))
    t.set(expand('xml:space'), 'preserve')
    r.append(t)
    return p, t


def add_xe(xe, t, expand):
    run = t.getparent()
    idx = run.index(t)
    t.text = xe.get('text') or ' '
    pt = xe.get('page-number-text', None)

    if pt:
        p = t.getparent().getparent()
        r = p.makeelement(expand('w:r'))
        p.append(r)
        t2 = r.makeelement(expand('w:t'))
        t2.set(expand('xml:space'), 'preserve')
        t2.text = ' [%s]' % pt
        r.append(t2)
    # put separate entries on separate lines
    run.insert(idx + 1, run.makeelement(expand('w:br')))
    return xe['anchor'], run


def process_index(field, index, xe_fields, log, XPath, expand):
    '''
    We remove all the Word-generated index markup and replace it with our own
    markup that is more suitable for an ebook.
    '''
    styles = []
    heading_text = index.get('heading', None)
    heading_style = 'IndexHeading'
    start_pos = None
    for elem in field.contents:
        if elem.tag.endswith('}p'):
            s = XPath('descendant::pStyle/@w:val')(elem)
            if s:
                styles.append(s[0])
            p = elem.getparent()
            if start_pos is None:
                start_pos = (p, p.index(elem))
            p.remove(elem)

    xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand)
    if not xe_fields:
        return [], []
    if heading_text is not None:
        groups = partition_by_first_letter(xe_fields, key=itemgetter('text'))
        items = []
        for key, fields in iteritems(groups):
            items.append(key)
            items.extend(fields)
        if styles:
            heading_style = styles[0]
    else:
        items = sorted(xe_fields, key=lambda x: sort_key(x['text']))

    hyperlinks = []
    blocks = []
    for item in reversed(items):
        is_heading = not isinstance(item, dict)
        style = heading_style if is_heading else None
        p, t = make_block(expand, style, *start_pos)
        if is_heading:
            text = heading_text
            if text.lower().startswith('a'):
                text = item + text[1:]
            t.text = text
        else:
            hyperlinks.append(add_xe(item, t, expand))
        blocks.append(p)

    return hyperlinks, blocks


def split_up_block(block, a, text, parts, ldict):
    '''
    Split a multi-level index entry into nested spans, one per level with an
    increasing left margin, moving the link into the innermost span and
    recording each span's level in ldict.
    '''
    prefix = parts[:-1]
    a.text = parts[-1]
    parent = a.getparent()
    style = 'display:block; margin-left: %.3gem'
    for i, prefix_text in enumerate(prefix):
        m = 1.5 * i
        span = parent.makeelement('span', style=style % m)
        ldict[span] = i
        parent.append(span)
        span.text = prefix_text
    span = parent.makeelement('span', style=style % ((i + 1) * 1.5))
    parent.append(span)
    span.append(a)
    ldict[span] = len(prefix)


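# For illustration (a hypothetical entry, not taken from the code above): an
# entry whose link text is "Animals:Cats:Persian" is split into the parts
# ['Animals', 'Cats', 'Persian'] by polish_index_markup() and rendered by
# split_up_block() as three block-level spans, roughly
#
#   <span style="display:block; margin-left: 0em">Animals</span>
#   <span style="display:block; margin-left: 1.5em">Cats</span>
#   <span style="display:block; margin-left: 3em"><a ...>Persian</a></span>
#
# with ldict recording the levels 0, 1 and 2 for the three spans.

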
"""
|
|
The merge algorithm is a little tricky.
|
|
We start with a list of elementary blocks. Each is an HtmlElement, a p node
|
|
with a list of child nodes. The last child may be a link, and the earlier ones are
|
|
just text.
|
|
The list is in reverse order from what we want in the index.
|
|
There is a dictionary ldict which records the level of each child node.
|
|
|
|
Now we want to do a reduce-like operation, combining all blocks with the same
|
|
top level index entry into a single block representing the structure of all
|
|
references, subentries, etc. under that top entry.
|
|
Here's the algorithm.
|
|
|
|
Given a block p and the next block n, and the top level entries p1 and n1 in each
|
|
block, which we assume have the same text:
|
|
|
|
Start with (p, p1) and (n, n1).
|
|
|
|
Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge:
|
|
|
|
If there are no more levels in n, and we have a link in nk,
|
|
then add the link from nk to the links for pk.
|
|
This might be the first link for pk, or we might get a list of references.
|
|
|
|
Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have
|
|
the same text, it must follow pk, it must come before we find any other p entries at
|
|
the same level as pk, and it must have the same level as nk+1.
|
|
|
|
If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1).
|
|
|
|
If there is no matching entry, then because of the original reversed order we want
|
|
to insert nk+1 and all following entries from n into p immediately following pk.
|
|
"""
|
|
|
|
|
|
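# A rough illustration of the merge (hypothetical entries, not taken from the
# code): suppose the index has the entries "Animals:Cats" and "Animals:Dogs".
# After splitting, each becomes a block with a top-level "Animals" span and a
# linked sub-entry span one level deeper, and the blocks arrive here in
# reverse order:
#
#   prev_block: [Animals, Dogs]        next_block: [Animals, Cats]
#
# The top-level "Animals" entries match, "Cats" has no matching entry inside
# prev_block, so its span is inserted immediately after "Animals", giving a
# single block [Animals, Cats, Dogs]; the emptied next_block is then removed
# from the tree.

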
def find_match(prev_block, pind, nextent, ldict):
    '''
    Look in prev_block, after position pind, for a child with the same text as
    nextent, one level deeper than prev_block[pind] and occurring before any
    entry at the same or a shallower level. Return its index, or -1 if there
    is no such entry.
    '''
    curlevel = ldict.get(prev_block[pind], -1)
    if curlevel < 0:
        return -1
    for p in range(pind+1, len(prev_block)):
        trylev = ldict.get(prev_block[p], -1)
        if trylev <= curlevel:
            return -1
        if trylev > (curlevel+1):
            continue
        if prev_block[p].text_content() == nextent.text_content():
            return p
    return -1


def add_link(pent, nent, ldict):
    na = nent.xpath('descendant::a[1]')
    # If there is no link, leave it as text
    if not na or len(na) == 0:
        return
    na = na[0]
    pa = pent.xpath('descendant::a')
    if pa and len(pa) > 0:
        # Put on same line with a comma
        pa = pa[-1]
        pa.tail = ', '
        p = pa.getparent()
        p.insert(p.index(pa) + 1, na)
    else:
        # substitute link na for plain text in pent
        pent.text = ""
        pent.append(na)


def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict):
    # First elements match. Any more in next?
    if len(next_path) == (nind + 1):
        nextent = next_block[nind]
        add_link(prev_block[pind], nextent, ldict)
        return

    nind = nind + 1
    nextent = next_block[nind]
    prevent = find_match(prev_block, pind, nextent, ldict)
    if prevent > 0:
        merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict)
        return

    # Want to insert elements into previous block
    while nind < len(next_block):
        # insert takes it out of old
        pind = pind + 1
        prev_block.insert(pind, next_block[nind])

    next_block.getparent().remove(next_block)


def polish_index_markup(index, blocks):
    # Blocks are in reverse order at this point
    path_map = {}
    ldict = {}
    for block in blocks:
        cls = block.get('class', '') or ''
        block.set('class', (cls + ' index-entry').lstrip())
        a = block.xpath('descendant::a[1]')
        text = ''
        if a:
            text = etree.tostring(a[0], method='text', with_tail=False,
                                  encoding='unicode').strip()
        if ':' in text:
            path_map[block] = parts = list(
                filter(None, (x.strip() for x in text.split(':'))))
            if len(parts) > 1:
                split_up_block(block, a[0], text, parts, ldict)
        else:
            # try using a span all the time
            path_map[block] = [text]
            parent = a[0].getparent()
            span = parent.makeelement(
                'span', style='display:block; margin-left: 0em')
            parent.append(span)
            span.append(a[0])
            ldict[span] = 0

        for br in block.xpath('descendant::br'):
            br.tail = None

    # We want a single block for each main entry
    prev_block = blocks[0]
    for block in blocks[1:]:
        pp, pn = path_map[prev_block], path_map[block]
        if pp[0] == pn[0]:
            merge_blocks(prev_block, block, 0, 0, pn, ldict)
        else:
            prev_block = block