mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-23 14:41:30 +02:00
Initial import
This commit is contained in:
@@ -0,0 +1,273 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from operator import itemgetter
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.utils.icu import partition_by_first_letter, sort_key
|
||||
from polyglot.builtins import iteritems, filter
|
||||
|
||||
|
||||
def get_applicable_xe_fields(index, xe_fields, XPath, expand):
|
||||
iet = index.get('entry-type', None)
|
||||
xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]
|
||||
|
||||
lr = index.get('letter-range', None)
|
||||
if lr is not None:
|
||||
sl, el = lr.parition('-')[0::2]
|
||||
sl, el = sl.strip(), el.strip()
|
||||
if sl and el:
|
||||
def inrange(text):
|
||||
return sl <= text[0] <= el
|
||||
xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))]
|
||||
|
||||
bmark = index.get('bookmark', None)
|
||||
if bmark is None:
|
||||
return xe_fields
|
||||
attr = expand('w:name')
|
||||
bookmarks = {b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem']) if b.get(attr, None) == bmark}
|
||||
ancestors = XPath('ancestor::w:bookmarkStart')
|
||||
|
||||
def contained(xe):
|
||||
# Check if the xe field is contained inside a bookmark with the
|
||||
# specified name
|
||||
return bool(set(ancestors(xe['start_elem'])) & bookmarks)
|
||||
|
||||
return [xe for xe in xe_fields if contained(xe)]
|
||||
|
||||
|
||||
def make_block(expand, style, parent, pos):
|
||||
p = parent.makeelement(expand('w:p'))
|
||||
parent.insert(pos, p)
|
||||
if style is not None:
|
||||
ppr = p.makeelement(expand('w:pPr'))
|
||||
p.append(ppr)
|
||||
ps = ppr.makeelement(expand('w:pStyle'))
|
||||
ppr.append(ps)
|
||||
ps.set(expand('w:val'), style)
|
||||
r = p.makeelement(expand('w:r'))
|
||||
p.append(r)
|
||||
t = r.makeelement(expand('w:t'))
|
||||
t.set(expand('xml:space'), 'preserve')
|
||||
r.append(t)
|
||||
return p, t
|
||||
|
||||
|
||||
def add_xe(xe, t, expand):
|
||||
run = t.getparent()
|
||||
idx = run.index(t)
|
||||
t.text = xe.get('text') or ' '
|
||||
pt = xe.get('page-number-text', None)
|
||||
|
||||
if pt:
|
||||
p = t.getparent().getparent()
|
||||
r = p.makeelement(expand('w:r'))
|
||||
p.append(r)
|
||||
t2 = r.makeelement(expand('w:t'))
|
||||
t2.set(expand('xml:space'), 'preserve')
|
||||
t2.text = ' [%s]' % pt
|
||||
r.append(t2)
|
||||
# put separate entries on separate lines
|
||||
run.insert(idx + 1, run.makeelement(expand('w:br')))
|
||||
return xe['anchor'], run
|
||||
|
||||
|
||||
def process_index(field, index, xe_fields, log, XPath, expand):
|
||||
'''
|
||||
We remove all the word generated index markup and replace it with our own
|
||||
that is more suitable for an ebook.
|
||||
'''
|
||||
styles = []
|
||||
heading_text = index.get('heading', None)
|
||||
heading_style = 'IndexHeading'
|
||||
start_pos = None
|
||||
for elem in field.contents:
|
||||
if elem.tag.endswith('}p'):
|
||||
s = XPath('descendant::pStyle/@w:val')(elem)
|
||||
if s:
|
||||
styles.append(s[0])
|
||||
p = elem.getparent()
|
||||
if start_pos is None:
|
||||
start_pos = (p, p.index(elem))
|
||||
p.remove(elem)
|
||||
|
||||
xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand)
|
||||
if not xe_fields:
|
||||
return [], []
|
||||
if heading_text is not None:
|
||||
groups = partition_by_first_letter(xe_fields, key=itemgetter('text'))
|
||||
items = []
|
||||
for key, fields in iteritems(groups):
|
||||
items.append(key), items.extend(fields)
|
||||
if styles:
|
||||
heading_style = styles[0]
|
||||
else:
|
||||
items = sorted(xe_fields, key=lambda x:sort_key(x['text']))
|
||||
|
||||
hyperlinks = []
|
||||
blocks = []
|
||||
for item in reversed(items):
|
||||
is_heading = not isinstance(item, dict)
|
||||
style = heading_style if is_heading else None
|
||||
p, t = make_block(expand, style, *start_pos)
|
||||
if is_heading:
|
||||
text = heading_text
|
||||
if text.lower().startswith('a'):
|
||||
text = item + text[1:]
|
||||
t.text = text
|
||||
else:
|
||||
hyperlinks.append(add_xe(item, t, expand))
|
||||
blocks.append(p)
|
||||
|
||||
return hyperlinks, blocks
|
||||
|
||||
|
||||
def split_up_block(block, a, text, parts, ldict):
|
||||
prefix = parts[:-1]
|
||||
a.text = parts[-1]
|
||||
parent = a.getparent()
|
||||
style = 'display:block; margin-left: %.3gem'
|
||||
for i, prefix in enumerate(prefix):
|
||||
m = 1.5 * i
|
||||
span = parent.makeelement('span', style=style % m)
|
||||
ldict[span] = i
|
||||
parent.append(span)
|
||||
span.text = prefix
|
||||
span = parent.makeelement('span', style=style % ((i + 1) * 1.5))
|
||||
parent.append(span)
|
||||
span.append(a)
|
||||
ldict[span] = len(prefix)
|
||||
|
||||
|
||||
"""
|
||||
The merge algorithm is a little tricky.
|
||||
We start with a list of elementary blocks. Each is an HtmlElement, a p node
|
||||
with a list of child nodes. The last child may be a link, and the earlier ones are
|
||||
just text.
|
||||
The list is in reverse order from what we want in the index.
|
||||
There is a dictionary ldict which records the level of each child node.
|
||||
|
||||
Now we want to do a reduce-like operation, combining all blocks with the same
|
||||
top level index entry into a single block representing the structure of all
|
||||
references, subentries, etc. under that top entry.
|
||||
Here's the algorithm.
|
||||
|
||||
Given a block p and the next block n, and the top level entries p1 and n1 in each
|
||||
block, which we assume have the same text:
|
||||
|
||||
Start with (p, p1) and (n, n1).
|
||||
|
||||
Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge:
|
||||
|
||||
If there are no more levels in n, and we have a link in nk,
|
||||
then add the link from nk to the links for pk.
|
||||
This might be the first link for pk, or we might get a list of references.
|
||||
|
||||
Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have
|
||||
the same text, it must follow pk, it must come before we find any other p entries at
|
||||
the same level as pk, and it must have the same level as nk+1.
|
||||
|
||||
If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1).
|
||||
|
||||
If there is no matching entry, then because of the original reversed order we want
|
||||
to insert nk+1 and all following entries from n into p immediately following pk.
|
||||
"""
|
||||
|
||||
|
||||
def find_match(prev_block, pind, nextent, ldict):
|
||||
curlevel = ldict.get(prev_block[pind], -1)
|
||||
if curlevel < 0:
|
||||
return -1
|
||||
for p in range(pind+1, len(prev_block)):
|
||||
trylev = ldict.get(prev_block[p], -1)
|
||||
if trylev <= curlevel:
|
||||
return -1
|
||||
if trylev > (curlevel+1):
|
||||
continue
|
||||
if prev_block[p].text_content() == nextent.text_content():
|
||||
return p
|
||||
return -1
|
||||
|
||||
|
||||
def add_link(pent, nent, ldict):
|
||||
na = nent.xpath('descendant::a[1]')
|
||||
# If there is no link, leave it as text
|
||||
if not na or len(na) == 0:
|
||||
return
|
||||
na = na[0]
|
||||
pa = pent.xpath('descendant::a')
|
||||
if pa and len(pa) > 0:
|
||||
# Put on same line with a comma
|
||||
pa = pa[-1]
|
||||
pa.tail = ', '
|
||||
p = pa.getparent()
|
||||
p.insert(p.index(pa) + 1, na)
|
||||
else:
|
||||
# substitute link na for plain text in pent
|
||||
pent.text = ""
|
||||
pent.append(na)
|
||||
|
||||
|
||||
def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict):
|
||||
# First elements match. Any more in next?
|
||||
if len(next_path) == (nind + 1):
|
||||
nextent = next_block[nind]
|
||||
add_link(prev_block[pind], nextent, ldict)
|
||||
return
|
||||
|
||||
nind = nind + 1
|
||||
nextent = next_block[nind]
|
||||
prevent = find_match(prev_block, pind, nextent, ldict)
|
||||
if prevent > 0:
|
||||
merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict)
|
||||
return
|
||||
|
||||
# Want to insert elements into previous block
|
||||
while nind < len(next_block):
|
||||
# insert takes it out of old
|
||||
pind = pind + 1
|
||||
prev_block.insert(pind, next_block[nind])
|
||||
|
||||
next_block.getparent().remove(next_block)
|
||||
|
||||
|
||||
def polish_index_markup(index, blocks):
|
||||
# Blocks are in reverse order at this point
|
||||
path_map = {}
|
||||
ldict = {}
|
||||
for block in blocks:
|
||||
cls = block.get('class', '') or ''
|
||||
block.set('class', (cls + ' index-entry').lstrip())
|
||||
a = block.xpath('descendant::a[1]')
|
||||
text = ''
|
||||
if a:
|
||||
text = etree.tostring(a[0], method='text', with_tail=False, encoding='unicode').strip()
|
||||
if ':' in text:
|
||||
path_map[block] = parts = list(filter(None, (x.strip() for x in text.split(':'))))
|
||||
if len(parts) > 1:
|
||||
split_up_block(block, a[0], text, parts, ldict)
|
||||
else:
|
||||
# try using a span all the time
|
||||
path_map[block] = [text]
|
||||
parent = a[0].getparent()
|
||||
span = parent.makeelement('span', style='display:block; margin-left: 0em')
|
||||
parent.append(span)
|
||||
span.append(a[0])
|
||||
ldict[span] = 0
|
||||
|
||||
for br in block.xpath('descendant::br'):
|
||||
br.tail = None
|
||||
|
||||
# We want a single block for each main entry
|
||||
prev_block = blocks[0]
|
||||
for block in blocks[1:]:
|
||||
pp, pn = path_map[prev_block], path_map[block]
|
||||
if pp[0] == pn[0]:
|
||||
merge_blocks(prev_block, block, 0, 0, pn, ldict)
|
||||
else:
|
||||
prev_block = block
|
||||
Reference in New Issue
Block a user