mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-31 02:25:45 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
840 lines
34 KiB
Python
840 lines
34 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=utf-8
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
import sys, os, re, math, errno, uuid, numbers
|
|
from collections import OrderedDict, defaultdict
|
|
|
|
from lxml import html
|
|
from lxml.html.builder import (
|
|
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
|
|
|
|
from ebook_converter import guess_type
|
|
from ebook_converter.ebooks.docx.container import DOCX, fromstring
|
|
from ebook_converter.ebooks.docx.names import XML, generate_anchor
|
|
from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties
|
|
from ebook_converter.ebooks.docx.numbering import Numbering
|
|
from ebook_converter.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
|
|
from ebook_converter.ebooks.docx.images import Images
|
|
from ebook_converter.ebooks.docx.tables import Tables
|
|
from ebook_converter.ebooks.docx.footnotes import Footnotes
|
|
from ebook_converter.ebooks.docx.cleanup import cleanup_markup
|
|
from ebook_converter.ebooks.docx.theme import Theme
|
|
from ebook_converter.ebooks.docx.toc import create_toc
|
|
from ebook_converter.ebooks.docx.fields import Fields
|
|
from ebook_converter.ebooks.docx.settings import Settings
|
|
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
|
|
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
|
|
from ebook_converter.polyglot.builtins import iteritems, itervalues, filter, getcwd, map, unicode_type
|
|
|
|
|
|
NBSP = '\xa0'
|
|
|
|
|
|
class Text:
    """Accumulator for the text of a run while its child elements are built.

    Text fragments are collected in ``buf`` and flushed onto ``attr``
    (``'text'`` or ``'tail'``) of the current element ``elem`` whenever a new
    element is added. Iterating yields every element seen so far, starting
    with the one passed to the constructor.
    """

    def __init__(self, elem, attr, buf):
        self.elem = elem
        self.attr = attr
        self.buf = buf
        self.elems = [elem]

    def add_elem(self, elem):
        """Flush the buffered text and make *elem* the current element."""
        # Pending text belongs to the element that was current until now
        setattr(self.elem, self.attr, ''.join(self.buf))
        self.elems.append(elem)
        # Anything that follows goes into the tail of the new element
        self.elem = elem
        self.attr = 'tail'
        self.buf = []

    def __iter__(self):
        return iter(self.elems)
|
|
|
|
|
|
def html_lang(docx_lang):
    """Return the ISO 639-1 form of *docx_lang*, or None.

    None is returned when canonicalize_lang() yields a false value or
    ``'und'``, or when lang_as_iso639_1() yields a false value.
    """
    canonical = canonicalize_lang(docx_lang)
    if not canonical or canonical == 'und':
        return None
    short = lang_as_iso639_1(canonical)
    if not short:
        return None
    return short
|
|
|
|
|
|
class Convert(object):
|
|
|
|
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
    """Open the DOCX container and build the empty HTML skeleton.

    *dest_dir* defaults to the current working directory and *notes_text*
    to the translated string 'Notes'. The remaining options are stored
    unchanged for use later in the conversion.
    """
    self.docx = DOCX(path_or_stream, log=log)
    ns = self.namespace = self.docx.namespace
    # ms_pat: two or more consecutive whitespace characters
    # ws_pat: a newline, carriage return or tab
    self.ms_pat = re.compile(r'\s{2,}')
    self.ws_pat = re.compile(r'[\n\r\t]')
    self.log = self.docx.log
    self.detect_cover = detect_cover
    self.notes_text = notes_text or _('Notes')
    self.notes_nopb = notes_nopb
    self.nosupsub = nosupsub
    self.dest_dir = dest_dir or getcwd()
    self.mi = self.docx.metadata
    self.body = BODY()
    self.theme = Theme(ns)
    self.settings = Settings(ns)
    self.tables = Tables(ns)
    self.fields = Fields(ns)
    self.styles = Styles(ns, self.tables)
    self.images = Images(ns, self.log)
    # Maps generated HTML elements to the DOCX elements they came from
    self.object_map = OrderedDict()

    head = HEAD(
        META(charset='utf-8'),
        TITLE(self.mi.title or _('Unknown')),
        LINK(rel='stylesheet', type='text/css', href='docx.css'),
    )
    self.html = HTML(head, self.body)

    # Whitespace so that the serialized skeleton is indented
    self.html.text = '\n\t'
    head.text = '\n\t\t'
    head.tail = '\n'
    for child in head:
        child.tail = '\n\t\t'
    head[-1].tail = '\n\t'
    self.body.text = '\n'
    self.body.tail = '\n'

    lang = html_lang(self.mi.language)
    if lang:
        self.html.set('lang', lang)
    self.doc_lang = lang if lang else None
|
|
|
|
def __call__(self):
    """Run the whole conversion and return whatever self.write() returns.

    The steps are order-sensitive: paragraphs are converted first, then
    footnotes, then links, styles, tables, numbering and frames are applied
    to the generated HTML before it is cleaned up and written out.
    """
    doc = self.docx.document
    rels_by_id, rels_by_type = self.docx.document_relationships
    self.resolve_alternate_content(doc)
    self.fields(doc, self.log)
    self.read_styles(rels_by_type)
    self.images(rels_by_id)

    # State that is rebuilt for every conversion
    self.layers = OrderedDict()
    self.framed = [[]]
    self.frame_map = {}
    self.framed_map = {}
    self.anchor_map = {}
    self.link_map = defaultdict(list)
    self.link_source_map = {}
    self.toc_anchor = None
    self.block_runs = []

    self.log.debug('Converting Word markup to HTML')

    self.read_page_properties(doc)
    self.current_rels = rels_by_id
    body_paras = []
    for wp, page_properties in iteritems(self.page_map):
        self.current_page = page_properties
        if not wp.tag.endswith('}p'):
            continue
        self.body.append(self.convert_p(wp))
        body_paras.append(wp)

    self.read_block_anchors(doc)
    self.styles.apply_contextual_spacing(body_paras)
    self.mark_block_runs(body_paras)
    # Every section but the first gets a page break at its start; the
    # first section is the start of the file anyway
    self.styles.apply_section_page_breaks(self.section_starts[1:])

    notes_header = None
    saved_rid_map = self.images.rid_map
    if self.footnotes.has_notes:
        notes_header = H1(self.notes_text)
        self.body.append(notes_header)
        notes_header.set('class', 'notes-header')
        for anchor, text, note in self.footnotes:
            dl = DL(id=anchor)
            dl.set('class', 'footnote')
            self.body.append(dl)
            backlink = A('←' + text, href='#back_%s' % anchor, title=text)
            dl.append(DT('[', backlink))
            backlink.tail = ']'
            note_body = DD()
            dl.append(note_body)
            note_paras = []
            # Images and links inside a note resolve against the note's
            # own relationships
            self.images.rid_map = self.current_rels = note.rels[0]
            for wp in note:
                if wp.tag.endswith('}tbl'):
                    self.tables.register(wp, self.styles)
                    self.page_map[wp] = self.current_page
                else:
                    note_body.append(self.convert_p(wp))
                    note_paras.append(wp)
            self.styles.apply_contextual_spacing(note_paras)
            self.mark_block_runs(note_paras)

    for html_para, wp in iteritems(self.object_map):
        if len(html_para) == 0 or html_para.text:
            continue
        first_span = html_para[0]
        if len(first_span) == 0 or first_span.text:
            continue
        if first_span[0].get('class', None) != 'tab':
            continue
        # Paragraph uses tabs for indentation, convert to text-indent
        tabs = []
        for child in first_span:
            if child.get('class', None) != 'tab':
                break
            tabs.append(child)
            if child.tail:
                break
        indent = len(tabs) * self.settings.default_tab_stop
        style = self.styles.resolve(wp)
        current_indent = style.text_indent
        if current_indent is inherit:
            convertible = True
        else:
            convertible = hasattr(current_indent, 'endswith') and current_indent.endswith('pt')
        if not convertible:
            continue
        if current_indent is not inherit:
            indent = float(current_indent[:-2]) + indent
        style.text_indent = '%.3gpt' % indent
        first_span.text = tabs[-1].tail or ''
        for tab in tabs:
            first_span.remove(tab)

    self.images.rid_map = saved_rid_map

    self.resolve_links()

    self.styles.cascade(self.layers)

    self.tables.apply_markup(self.object_map, self.page_map)

    numbered = []
    for html_obj, obj in iteritems(self.object_map):
        raw = obj.get('calibre_num_id', None)
        if raw is None:
            continue
        # raw has the form "<level>:<num_id>"
        level_text, _sep, num_id = raw.partition(':')
        try:
            lvl = int(level_text)
        except (TypeError, ValueError):
            lvl = 0
        numbered.append((html_obj, num_id, lvl))
    self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
    self.apply_frames()

    if len(self.body) > 0:
        self.body.text = '\n\t'
        for child in self.body:
            child.tail = '\n\t'
        self.body[-1].tail = '\n'

    self.log.debug('Converting styles to CSS')
    self.styles.generate_classes()
    for html_obj, obj in iteritems(self.object_map):
        style = self.styles.resolve(obj)
        if style is None:
            continue
        css = style.css
        if not css:
            continue
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set('class', cls)
    for html_obj, css in iteritems(self.framed_map):
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set('class', cls)

    if notes_header is not None:
        # Make the notes header look like the first heading found among
        # the body's h1/h2/h3 children
        first_heading = next(iter(self.namespace.children(self.body, 'h1', 'h2', 'h3')), None)
        if first_heading is not None:
            notes_header.tag = first_heading.tag
            cls = first_heading.get('class', None)
            if cls and cls != 'notes-header':
                notes_header.set('class', '%s notes-header' % cls)

    self.fields.polish_markup(self.object_map)

    self.log.debug('Cleaning up redundant markup generated by Word')
    self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)

    return self.write(doc)
|
|
|
|
def read_page_properties(self, doc):
    """Build self.page_map and self.section_starts from the document.

    Every <w:p> and <w:tbl> is mapped to the PageProperties of its
    section. A paragraph containing a <w:sectPr> closes the section it
    belongs to; whatever is left over at the end uses the <w:sectPr> that
    is a direct child of <w:body>. Tables are also registered with
    self.tables.
    """
    self.page_map = OrderedDict()
    self.section_starts = []
    pending = []

    for node in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
        if node.tag.endswith('}tbl'):
            self.tables.register(node, self.styles)
            pending.append(node)
            continue
        sect = tuple(self.namespace.descendants(node, 'w:sectPr'))
        if not sect:
            pending.append(node)
            continue
        # This paragraph ends a section: everything pending plus the
        # paragraph itself shares the section's page properties
        props = PageProperties(self.namespace, sect)
        section = pending + [node]
        for member in section:
            self.page_map[member] = props
        self.section_starts.append(section[0])
        pending = []

    if not pending:
        return
    self.section_starts.append(pending[0])
    last = self.namespace.XPath('./w:body/w:sectPr')(doc)
    props = PageProperties(self.namespace, last)
    for member in pending:
        self.page_map[member] = props
|
|
|
|
def resolve_alternate_content(self, doc):
    """Drop <mc:Choice> children wherever an <mc:Fallback> is available.

    For proprietary extensions in Word documents the fallback, spec
    compliant form is used.
    See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
    """
    for alternate in self.namespace.descendants(doc, 'mc:AlternateContent'):
        choices = self.namespace.XPath('./mc:Choice')(alternate)
        fallbacks = self.namespace.XPath('./mc:Fallback')(alternate)
        if not fallbacks:
            # Nothing to fall back to, keep the choices
            continue
        for choice in choices:
            alternate.remove(choice)
|
|
|
|
def read_styles(self, relationships_by_type):
    """Load settings, notes, fonts, theme, styles and numbering.

    *relationships_by_type* maps relationship types to part names. For
    each kind of part the name is looked up there; if it is absent, the
    conventional file name next to the main document is used when that
    part exists in the container. A part that cannot be read is logged as
    a warning and skipped.

    Sets self.numbering, self.footnotes and self.fonts, and initialises
    self.settings, self.theme and self.styles.
    """

    def get_name(rtype, defname):
        # Name of the part for relationship type rtype, or None
        name = relationships_by_type.get(rtype, None)
        if name is None:
            # No relationship declared: fall back to the default file
            # name in the same directory as the main document
            cname = self.docx.document_name.split('/')
            cname[-1] = defname
            candidate = '/'.join(cname)
            if self.docx.exists(candidate):
                name = candidate
        if name and name.startswith('word/word') and not self.docx.exists(name):
            # Doubled 'word/' prefix that does not exist: drop the first
            # path component
            name = name.partition('/')[2]
        return name

    names = self.namespace.names
    nname = get_name(names['NUMBERING'], 'numbering.xml')
    sname = get_name(names['STYLES'], 'styles.xml')
    sename = get_name(names['SETTINGS'], 'settings.xml')
    fname = get_name(names['FONTS'], 'fontTable.xml')
    tname = get_name(names['THEMES'], 'theme1.xml')
    foname = get_name(names['FOOTNOTES'], 'footnotes.xml')
    enname = get_name(names['ENDNOTES'], 'endnotes.xml')
    numbering = self.numbering = Numbering(self.namespace)
    footnotes = self.footnotes = Footnotes(self.namespace)
    fonts = self.fonts = Fonts(self.namespace)

    foraw = enraw = None
    forel, enrel = ({}, {}), ({}, {})
    if sename is not None:
        try:
            seraw = self.docx.read(sename)
        except KeyError:
            self.log.warn('Settings %s do not exist' % sename)
        except EnvironmentError as e:
            if e.errno != errno.ENOENT:
                raise
            self.log.warn('Settings %s file missing' % sename)
        else:
            self.settings(fromstring(seraw))

    if foname is not None:
        try:
            foraw = self.docx.read(foname)
        except KeyError:
            self.log.warn('Footnotes %s do not exist' % foname)
        else:
            forel = self.docx.get_relationships(foname)
    if enname is not None:
        try:
            enraw = self.docx.read(enname)
        except KeyError:
            self.log.warn('Endnotes %s do not exist' % enname)
        else:
            enrel = self.docx.get_relationships(enname)
    footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)

    if fname is not None:
        embed_relationships = self.docx.get_relationships(fname)[0]
        try:
            raw = self.docx.read(fname)
        except KeyError:
            self.log.warn('Fonts table %s does not exist' % fname)
        else:
            fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

    if tname is not None:
        try:
            raw = self.docx.read(tname)
        except KeyError:
            # Report the theme part, not the styles part
            self.log.warn('Theme %s does not exist' % tname)
        else:
            self.theme(fromstring(raw))

    # Styles are always initialised, with None when no styles part could
    # be read
    styles_loaded = False
    if sname is not None:
        try:
            raw = self.docx.read(sname)
        except KeyError:
            self.log.warn('Styles %s do not exist' % sname)
        else:
            self.styles(fromstring(raw), fonts, self.theme)
            styles_loaded = True
    if not styles_loaded:
        self.styles(None, fonts, self.theme)

    if nname is not None:
        try:
            raw = self.docx.read(nname)
        except KeyError:
            self.log.warn('Numbering styles %s do not exist' % nname)
        else:
            numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])

    self.styles.resolve_numbering(numbering)
|
|
|
|
def write(self, doc):
    """Write the converted book into self.dest_dir.

    Produces index.html, docx.css (only when CSS was generated),
    metadata.opf and toc.ncx (removed again if it ends up empty).
    Returns the path of metadata.opf.
    """
    out_dir = self.dest_dir
    toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)

    markup = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
    with lopen(os.path.join(out_dir, 'index.html'), 'wb') as f:
        f.write(markup)

    css = self.styles.generate_css(out_dir, self.docx, self.notes_nopb, self.nosupsub)
    if css:
        with lopen(os.path.join(out_dir, 'docx.css'), 'wb') as f:
            f.write(css.encode('utf-8'))

    opf = OPFCreator(out_dir, self.mi)
    opf.toc = toc
    opf.create_manifest_from_files_in([out_dir])
    for item in opf.manifest:
        # HTML items are given the media type guessed for an .xhtml file
        if item.media_type == 'text/html':
            item.media_type = guess_type('a.xhtml')[0]
    opf.create_spine(['index.html'])
    if self.cover_image is not None:
        opf.guide.set_cover(self.cover_image)

    def process_guide(E, guide):
        # Add a guide reference to the table of contents, if one was found
        if self.toc_anchor is None:
            return
        guide.append(E.reference(
            href='index.html#' + self.toc_anchor, title=_('Table of Contents'), type='toc'))

    toc_file = os.path.join(out_dir, 'toc.ncx')
    with lopen(os.path.join(out_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
        opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
    if os.path.getsize(toc_file) == 0:
        os.remove(toc_file)
    return os.path.join(out_dir, 'metadata.opf')
|
|
|
|
def read_block_anchors(self, doc):
    """Attach bookmarks that sit directly under <w:body> to paragraphs.

    Named <w:bookmarkStart> elements that are direct children of the body
    are collected as they are met; the next paragraph that has an HTML
    counterpart in self.object_map receives them all. That paragraph gets
    an id if it has none, and every collected name is pointed at that id
    in self.anchor_map.
    """
    top_level = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
    if not top_level:
        return

    pending = set()
    html_for = {v: k for k, v in iteritems(self.object_map)}
    for node in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
        if node.tag.endswith('}p'):
            if not pending or node not in html_for:
                continue
            para = html_for[node]
            if 'id' not in para.attrib:
                seed = next(iter(pending))
                taken = frozenset(itervalues(self.anchor_map))
                para.set('id', generate_anchor(seed, taken))
            para_id = para.get('id')
            for name in pending:
                self.anchor_map[name] = para_id
            pending = set()
        elif node in top_level:
            name = self.namespace.get(node, 'w:name')
            if name:
                pending.add(name)
|
|
|
|
def convert_p(self, p):
    """Convert the <w:p> element *p* and return the new HTML element.

    The result is registered in self.object_map, self.layers and
    self.frame_map. Runs become child spans (see convert_run), bookmarks
    become ids recorded in self.anchor_map, and runs inside a hyperlink
    are collected in self.link_map for resolve_links().
    """
    dest = P()
    self.object_map[dest] = p
    para_style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.frame_map[p] = para_style.frame
    self.add_frame(dest, para_style.frame)

    pending_anchor = None
    open_hyperlink = None
    nearest_hyperlink = self.namespace.XPath('ancestor::w:hyperlink[1]')

    def owning_paragraph(node):
        # Nearest <w:p> ancestor, or None. Needed because <w:p> tags can
        # be nested, for example when a textbox sits inside a paragraph.
        while True:
            node = node.getparent()
            try:
                is_para = node.tag.endswith('}p')
            except AttributeError:
                return None
            if is_para:
                return node

    def new_anchor(key, base):
        # Register a fresh anchor id under key. If the previous anchor is
        # still pending (it was not applied to any element), every name
        # that pointed at it is re-pointed at the fresh one.
        fresh = generate_anchor(base, frozenset(itervalues(self.anchor_map)))
        self.anchor_map[key] = fresh
        if pending_anchor is not None:
            for name, target in tuple(iteritems(self.anchor_map)):
                if target == pending_anchor:
                    self.anchor_map[name] = fresh
        return fresh

    for node in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
        if owning_paragraph(node) is not p:
            continue
        tag = node.tag
        if tag.endswith('}r'):
            span = self.convert_run(node)
            if pending_anchor is not None:
                # The paragraph itself takes the id when this is its
                # first run, otherwise the run does
                target = dest if len(dest) == 0 else span
                target.set('id', pending_anchor)
                pending_anchor = None
            if open_hyperlink is not None:
                try:
                    hl = nearest_hyperlink(node)[0]
                    self.link_map[hl].append(span)
                    self.link_source_map[hl] = self.current_rels
                    node.set('is-link', '1')
                except IndexError:
                    # This run has no hyperlink ancestor
                    open_hyperlink = None
            dest.append(span)
            self.layers[p].append(node)
        elif tag.endswith('}bookmarkStart'):
            name = self.namespace.get(node, 'w:name')
            # _GoBack is a special bookmark inserted by Word 2010 for the
            # return to previous edit feature, we ignore it
            if name and name not in self.anchor_map and name != '_GoBack':
                pending_anchor = new_anchor(name, name)
        elif tag.endswith('}hyperlink'):
            open_hyperlink = node
        elif tag.endswith('}instrText') and node.text and node.text.strip().startswith('TOC '):
            pending_anchor = new_anchor(unicode_type(uuid.uuid4()), 'toc')
            self.toc_anchor = pending_anchor

    if pending_anchor is not None:
        # No run came after the last anchor, so the paragraph takes it
        dest.set('id', pending_anchor)
        pending_anchor = None

    heading = re.match(r'heading\s+(\d+)$', para_style.style_name or '', re.IGNORECASE)
    if heading is not None:
        level = min(6, max(1, int(heading.group(1))))
        dest.tag = 'h%d' % level
        dest.set('data-heading-level', unicode_type(level))

    if para_style.bidi is True:
        dest.set('dir', 'rtl')

    # Group consecutive spans whose styles report the same border.
    # NOTE(review): the span that breaks a group is not used to start the
    # next one, and a group still open when the loop ends is never added
    # to `groups` -- confirm whether that is intended.
    open_group = []
    groups = []
    for span in dest:
        run_style = self.styles.resolve_run(self.object_map[span])
        if not open_group or open_group[-1][1].same_border(run_style):
            open_group.append((span, run_style))
        elif open_group:
            if len(open_group) > 1:
                groups.append(open_group)
            open_group = []

    for group in groups:
        members = []
        border_css = {}
        for span, run_style in group:
            run_style.get_border_css(border_css)
            run_style.clear_border_css()
            members.append(span)
        if border_css:
            cls = self.styles.register(border_css, 'text_border')
            wrapper = self.wrap_elems(members, SPAN())
            wrapper.set('class', cls)

    # The border test is only reached when dest has no children, in which
    # case the loops above did not run and the paragraph style applies.
    if not dest.text and len(dest) == 0 and not para_style.has_visible_border():
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = NBSP

    # If the last element in a block is a <br> the <br> is not rendered in
    # HTML, unless it is followed by a trailing space. Word, on the other
    # hand inserts a blank line for trailing <br>s.
    if len(dest) > 0 and not dest[-1].tail:
        last = dest[-1]
        if last.tag == 'br':
            last.tail = NBSP
        elif len(last) > 0 and last[-1].tag == 'br' and not last[-1].tail:
            last[-1].tail = NBSP

    return dest
|
|
|
|
def wrap_elems(self, elems, wrapper):
    """Move *elems* into *wrapper*, placed where the first of them was.

    The tail of the last element is transferred to the wrapper. Returns
    the wrapper.
    """
    first, last = elems[0], elems[-1]
    container = first.getparent()
    container.insert(container.index(first), wrapper)
    wrapper.tail = last.tail
    last.tail = None
    for child in elems:
        try:
            container.remove(child)
        except ValueError:
            # Probably a hyperlink that spans multiple paragraphs. In
            # theory it should be broken up into multiple hyperlinks;
            # instead the element is taken from whatever parent it has.
            child.getparent().remove(child)
        wrapper.append(child)
    return wrapper
|
|
|
|
def resolve_links(self):
    """Turn collected hyperlinks into <a> elements with an href.

    Three sources are handled in turn: <w:hyperlink> elements gathered in
    self.link_map, hyperlink fields from self.fields.hyperlink_fields and
    image links from self.images.links. self.resolved_link_map is filled
    for the first kind only.
    """
    self.resolved_link_map = {}
    for hyperlink, spans in iteritems(self.link_map):
        rels = self.link_source_map[hyperlink]
        link = spans[0] if len(spans) <= 1 else self.wrap_elems(spans, SPAN())
        link.tag = 'a'
        self.resolved_link_map[hyperlink] = link
        target_frame = self.namespace.get(hyperlink, 'w:tgtFrame')
        if target_frame:
            link.set('target', target_frame)
        tooltip = self.namespace.get(hyperlink, 'w:tooltip')
        if tooltip:
            link.set('title', tooltip)
        rid = self.namespace.get(hyperlink, 'r:id')
        if rid and rid in rels:
            link.set('href', rels[rid])
            continue
        anchor = self.namespace.get(hyperlink, 'w:anchor')
        if anchor and anchor in self.anchor_map:
            link.set('href', '#' + self.anchor_map[anchor])
            continue
        self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                      (rid, anchor))
        # No href is set here: hrefs that point nowhere upset epubcheck,
        # and the element should be styled explicitly by Word anyway.

    html_for = {v: k for k, v in iteritems(self.object_map)}
    for field, runs in self.fields.hyperlink_fields:
        spans = [html_for[r] for r in runs if r in html_for]
        if not spans:
            continue
        link = spans[0] if len(spans) <= 1 else self.wrap_elems(spans, SPAN())
        link.tag = 'a'
        target_frame = field.get('target', None)
        if target_frame:
            link.set('target', target_frame)
        tooltip = field.get('title', None)
        if tooltip:
            link.set('title', tooltip)
        url = field.get('url', None)
        if url is None:
            anchor = field.get('anchor', None)
            if anchor in self.anchor_map:
                link.set('href', '#' + self.anchor_map[anchor])
            else:
                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
        elif url in self.anchor_map:
            # The url is the name of a known anchor
            link.set('href', '#' + self.anchor_map[url])
        else:
            link.set('href', url)

    for img, link_info, rels in self.images.links:
        container = img.getparent()
        position = container.index(img)
        a = A(img)
        a.tail, img.tail = img.tail, None
        container.insert(position, a)
        target_frame = link_info.get('target', None)
        if target_frame:
            a.set('target', target_frame)
        tooltip = link_info.get('title', None)
        if tooltip:
            a.set('title', tooltip)
        rid = link_info['id']
        if rid not in rels:
            continue
        dest = rels[rid]
        if not dest.startswith('#'):
            a.set('href', dest)
        elif dest[1:] in self.anchor_map:
            a.set('href', '#' + self.anchor_map[dest[1:]])
|
|
|
|
def convert_run(self, run):
    """Convert the <w:r> element *run* into a <span> and return it.

    Text is gathered through a Text accumulator; breaks, images, note
    references and tabs become child elements. The span is registered in
    self.object_map and gets vertical-alignment, language and direction
    attributes from the resolved run style.
    """
    ans = SPAN()
    self.object_map[ans] = run
    text = Text(ans, 'text', [])
    is_tag = self.namespace.is_tag

    def emit(elem):
        # Flush pending text, then add elem as the next child of the span
        text.add_elem(elem)
        ans.append(text.elem)

    for child in run:
        if is_tag(child, 'w:t'):
            if not child.text:
                continue
            ctext = child.text
            if child.get(XML('space'), None) != 'preserve':
                # Remove leading and trailing whitespace. Word ignores
                # leading and trailing whitespace without preserve
                ctext = ctext.strip(' \n\r\t')
            # Only use a <span> with white-space:pre-wrap if this element
            # actually needs it, i.e. if it has more than one
            # consecutive space or it has newlines or tabs.
            needs_pre = self.ms_pat.search(ctext) is not None or self.ws_pat.search(ctext) is not None
            if needs_pre:
                emit(SPAN(ctext, style="white-space:pre-wrap"))
            else:
                text.buf.append(ctext)
        elif is_tag(child, 'w:cr'):
            emit(BR())
        elif is_tag(child, 'w:br'):
            if self.namespace.get(child, 'w:type') in {'column', 'page'}:
                br = BR(style='page-break-after:always')
            else:
                clear = child.get('clear', None)
                if clear in {'all', 'left', 'right'}:
                    br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                else:
                    br = BR()
            emit(br)
        elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
            for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                emit(img)
        elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
            anchor, name = self.footnotes.get_ref(child)
            if anchor and name:
                ref = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
                ref.set('class', 'noteref')
                emit(ref)
        elif is_tag(child, 'w:tab'):
            # A tab is rendered as a run of non-breaking spaces
            spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
            emit(SPAN(NBSP * spaces))
            ans[-1].set('class', 'tab')
        elif is_tag(child, 'w:noBreakHyphen'):
            text.buf.append('\u2011')
        elif is_tag(child, 'w:softHyphen'):
            text.buf.append('\u00ad')

    # Flush whatever text is still buffered
    if text.buf:
        setattr(text.elem, text.attr, ''.join(text.buf))

    style = self.styles.resolve_run(run)
    if style.vert_align in {'superscript', 'subscript'} and (ans.text or len(ans)):
        ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
    if style.lang is not inherit:
        lang = html_lang(style.lang)
        if lang is not None and lang != self.doc_lang:
            ans.set('lang', lang)
    if style.rtl is True:
        ans.set('dir', 'rtl')
    if is_symbol_font(style.font_family):
        # Remap the text and tail of every element of this run, then
        # switch the run to a generic font family
        family = style.font_family
        for elem in text:
            if elem.text:
                elem.text = map_symbol_text(elem.text, family)
            if elem.tail:
                elem.tail = map_symbol_text(elem.tail, family)
        style.font_family = 'sans-serif'
    return ans
|
|
|
|
def add_frame(self, html_obj, style):
    """Record *html_obj* in the current run of framed elements.

    self.framed is a list of runs; each run is a list of
    ``(html_obj, style)`` pairs. A *style* of ``inherit`` closes the
    current run if it is not empty and records nothing.
    """
    current = self.framed[-1]
    if style is inherit:
        if current:
            self.framed.append([])
        return

    entry = (html_obj, style)
    if not current:
        current.append(entry)
    elif current[-1][1] == style:
        current.append(entry)
    else:
        # NOTE(review): self.framed[-1] is the same list as `current`, so
        # an element with a different frame style still joins the current
        # run. Presumably a new run was intended here -- confirm before
        # changing, since apply_frames() uses the first style of each run.
        self.framed[-1].append(entry)
|
|
|
|
def apply_frames(self):
    """Wrap framed paragraphs and bordered block runs in <div> elements.

    Each non-empty run in self.framed is wrapped in one <div> whose CSS
    comes from the style of the run's first entry. Each entry of
    self.block_runs is then wrapped in a <div> carrying its border style.
    Every wrapper is recorded in self.framed_map and its CSS registered
    with self.styles.
    """
    for run in self.framed:
        if not run:
            continue
        frame_style = run[0][1]
        members = tuple(entry[0] for entry in run)
        first = members[0]
        container = first.getparent()
        position = container.index(first)
        frame = DIV(*members)
        container.insert(position, frame)
        css = frame_style.css(self.page_map[self.object_map[first]])
        self.framed_map[frame] = css
        self.styles.register(css, 'frame')

    if not self.block_runs:
        return

    def topmost_below(node, stop):
        # Climb from node until its parent is stop (or there is none)
        while True:
            up = node.getparent()
            if up is stop or up is None:
                return node
            node = up

    html_for = {v: k for k, v in iteritems(self.object_map)}
    for border_style, blocks in self.block_runs:
        members = tuple(html_for[block] for block in blocks)
        has_li = any(m.tag == 'li' for m in members)
        container = members[0].getparent()
        if container.tag in ('ul', 'ol'):
            # The run lives inside a list: wrap the whole list
            whole_list = container
            container = whole_list.getparent()
            position = container.index(whole_list)
            frame = DIV(whole_list)
        else:
            if has_li:
                # Wrap the ancestors that are direct children of the
                # container rather than the list items themselves
                members = tuple(topmost_below(m, container) for m in members)
            position = container.index(members[0])
            frame = DIV(*members)
        container.insert(position, frame)
        css = border_style.css
        self.framed_map[frame] = css
        self.styles.register(css, 'frame')
|
|
|
|
def mark_block_runs(self, paras):
    """Find runs of adjacent paragraphs that share frame and borders.

    Consecutive paragraphs with equal self.frame_map entries and identical
    borders form a run. For each run of more than one paragraph, borders
    and some margins are moved off the paragraphs onto a cloned border
    style; if the first paragraph has a visible border, the pair
    ``(border_style, run)`` is appended to self.block_runs.
    """

    def process_run(run):
        first, last = run[0], run[-1]
        max_left = max_right = 0
        # Decided once, from the first paragraph of the run
        visible = None
        for para in run:
            style = self.styles.resolve_paragraph(para)
            if visible is None:
                visible = style.has_visible_border()
            if isinstance(style.margin_left, numbers.Number):
                max_left = max(style.margin_left, max_left)
            if isinstance(style.margin_right, numbers.Number):
                max_right = max(style.margin_right, max_right)
            if visible:
                style.margin_left = style.margin_right = inherit
            is_first = para is first
            is_last = para is last
            if is_first:
                border_style = style.clone_border_styles()
                if visible:
                    # The top margin moves to the wrapper's style
                    border_style.margin_top = style.margin_top
                    style.margin_top = inherit
            else:
                style.padding_top = 0
            if is_last:
                if visible:
                    # The bottom margin moves to the wrapper's style
                    border_style.margin_bottom = style.margin_bottom
                    style.margin_bottom = inherit
            else:
                style.padding_bottom = 0
            style.clear_borders()
            if not is_last:
                style.apply_between_border()
        if visible:
            border_style.margin_left = max_left
            border_style.margin_right = max_right
            self.block_runs.append((border_style, run))

    current = []
    for para in paras:
        if current and self.frame_map.get(para) == self.frame_map.get(current[-1]):
            style = self.styles.resolve_paragraph(para)
            previous_style = self.styles.resolve_paragraph(current[-1])
            if style.has_identical_borders(previous_style):
                current.append(para)
                continue
        # para starts a new run; finish the one collected so far
        if len(current) > 1:
            process_run(current)
        current = [para]
    if len(current) > 1:
        process_run(current)
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual test driver: convert the file named by the last command line
    # argument into a freshly created ./docx_input directory, logging at
    # debug level.
    import shutil
    from ebook_converter.utils.logging import default_log

    default_log.filter_level = default_log.DEBUG
    dest_dir = os.path.join(getcwd(), 'docx_input')
    # Start from an empty output directory
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.mkdir(dest_dir)
    Convert(sys.argv[-1], dest_dir=dest_dir, log=default_log)()
|