import collections import re from ebook_converter.ebooks.docx.writer.container import create_skeleton, page_size, page_effective_area from ebook_converter.ebooks.docx.writer.fonts import FontsManager from ebook_converter.ebooks.docx.writer.images import ImagesManager from ebook_converter.ebooks.docx.writer.links import LinksManager from ebook_converter.ebooks.docx.writer.lists import ListsManager from ebook_converter.ebooks.docx.writer.styles import StylesManager, FloatSpec from ebook_converter.ebooks.docx.writer.tables import Table from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.ebooks.oeb.stylizer import Stylizer as Sz, Style as St from ebook_converter.utils.localization import lang_as_iso639_1 def lang_for_tag(tag): for attr in ('lang', '{http://www.w3.org/XML/1998/namespace}lang'): val = lang_as_iso639_1(tag.get(attr)) if val: return val class Style(St): def __init__(self, *args, **kwargs): St.__init__(self, *args, **kwargs) self._letterSpacing = None @property def letterSpacing(self): if self._letterSpacing is not None: val = self._get('letter-spacing') if val == 'normal': self._letterSpacing = val else: self._letterSpacing = self._unit_convert(val) return self._letterSpacing class Stylizer(Sz): def style(self, element): try: return self._styles[element] except KeyError: return Style(element, self) class TextRun(object): ws_pat = None def __init__(self, namespace, style, first_html_parent, lang=None): self.first_html_parent = first_html_parent if self.ws_pat is None: TextRun.ws_pat = self.ws_pat = re.compile(r'\s+') self.style = style self.texts = [] self.link = None self.lang = lang self.parent_style = None self.makeelement = namespace.makeelement self.descendant_style = None def add_text(self, text, preserve_whitespace, bookmark=None, link=None): if not preserve_whitespace: text = self.ws_pat.sub(' ', text) if text.strip() != text: # If preserve_whitespace is False, Word ignores leading and # trailing whitespace preserve_whitespace = True self.texts.append((text, preserve_whitespace, bookmark)) self.link = link def add_break(self, clear='none', bookmark=None): self.texts.append((None, clear, bookmark)) def add_image(self, drawing, bookmark=None): self.texts.append((drawing, None, bookmark)) def serialize(self, p, links_manager): makeelement = self.makeelement parent = p if self.link is None else links_manager.serialize_hyperlink(p, self.link) r = makeelement(parent, 'w:r') rpr = makeelement(r, 'w:rPr', append=False) if getattr(self.descendant_style, 'id', None) is not None: makeelement(rpr, 'w:rStyle', w_val=self.descendant_style.id) if self.lang: makeelement(rpr, 'w:lang', w_bidi=self.lang, w_val=self.lang, w_eastAsia=self.lang) if len(rpr) > 0: r.append(rpr) for text, preserve_whitespace, bookmark in self.texts: if bookmark is not None: bid = links_manager.bookmark_id makeelement(r, 'w:bookmarkStart', w_id=str(bid), w_name=bookmark) if text is None: makeelement(r, 'w:br', w_clear=preserve_whitespace) elif hasattr(text, 'xpath'): r.append(text) else: t = makeelement(r, 'w:t') t.text = text or '' if preserve_whitespace: t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve') if bookmark is not None: makeelement(r, 'w:bookmarkEnd', w_id=str(bid)) def __repr__(self): return repr(self.texts) def is_empty(self): if not self.texts: return True if len(self.texts) == 1 and self.texts[0][:2] == ('', False): return True return False @property def style_weight(self): ans = 0 for text, preserve_whitespace, bookmark in self.texts: if isinstance(text, str): ans += len(text) return ans class Block(object): def __init__(self, namespace, styles_manager, links_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False, parent_bg=None): self.force_not_empty = False self.namespace = namespace self.bookmarks = set() self.list_tag = (html_block, style) if is_list_item else None self.is_first_block = False self.numbering_id = None self.parent_items = None self.html_block = html_block self.html_tag = parse_utils.barename(html_block.tag) self.float_spec = float_spec if float_spec is not None: float_spec.blocks.append(self) self.html_style = style self.style = styles_manager.create_block_style(style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg) self.styles_manager, self.links_manager = styles_manager, links_manager self.keep_next = False self.runs = [] self.skipped = False self.linked_style = None self.page_break_before = style['page-break-before'] == 'always' self.keep_lines = style['page-break-inside'] == 'avoid' self.page_break_after = False self.block_lang = None def resolve_skipped(self, next_block): if not self.is_empty(): return if len(self.html_block) > 0 and self.html_block[0] is next_block.html_block: self.skipped = True if self.list_tag is not None: next_block.list_tag = self.list_tag def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None, is_parent_style=False, bookmark=None, link=None, lang=None): ws = style['white-space'] preserve_whitespace = ws in {'pre', 'pre-wrap', '-o-pre-wrap'} ts = self.styles_manager.create_text_style(style, is_parent_style=is_parent_style) if self.runs and ts == self.runs[-1].style and link == self.runs[-1].link and lang == self.runs[-1].lang: run = self.runs[-1] else: run = TextRun(self.namespace, ts, self.html_block if html_parent is None else html_parent, lang=lang) self.runs.append(run) if ignore_leading_whitespace and not preserve_whitespace: text = text.lstrip() if preserve_whitespace or ws == 'pre-line': for text in text.splitlines(): run.add_text(text, preserve_whitespace, bookmark=bookmark, link=link) bookmark = None run.add_break() else: run.add_text(text, preserve_whitespace, bookmark=bookmark, link=link) def add_break(self, clear='none', bookmark=None): if self.runs: run = self.runs[-1] else: run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block) self.runs.append(run) run.add_break(clear=clear, bookmark=bookmark) def add_image(self, drawing, bookmark=None): if self.runs: run = self.runs[-1] else: run = TextRun(self.namespace, self.styles_manager.create_text_style(self.html_style), self.html_block) self.runs.append(run) run.add_image(drawing, bookmark=bookmark) def serialize(self, body): makeelement = self.namespace.makeelement p = makeelement(body, 'w:p') end_bookmarks = [] for bmark in self.bookmarks: end_bookmarks.append(str(self.links_manager.bookmark_id)) makeelement(p, 'w:bookmarkStart', w_id=end_bookmarks[-1], w_name=bmark) if self.block_lang: rpr = makeelement(p, 'w:rPr') makeelement(rpr, 'w:lang', w_val=self.block_lang, w_bidi=self.block_lang, w_eastAsia=self.block_lang) ppr = makeelement(p, 'w:pPr') if self.keep_next: makeelement(ppr, 'w:keepNext') if self.float_spec is not None: self.float_spec.serialize(self, ppr) if self.numbering_id is not None: numpr = makeelement(ppr, 'w:numPr') makeelement(numpr, 'w:ilvl', w_val=str(self.numbering_id[1])) makeelement(numpr, 'w:numId', w_val=str(self.numbering_id[0])) if self.linked_style is not None: makeelement(ppr, 'w:pStyle', w_val=self.linked_style.id) elif self.style.id: makeelement(ppr, 'w:pStyle', w_val=self.style.id) if self.is_first_block: makeelement(ppr, 'w:pageBreakBefore', w_val='off') elif self.page_break_before: makeelement(ppr, 'w:pageBreakBefore', w_val='on') if self.keep_lines: makeelement(ppr, 'w:keepLines', w_val='on') for run in self.runs: run.serialize(p, self.links_manager) for bmark in end_bookmarks: makeelement(p, 'w:bookmarkEnd', w_id=bmark) def __repr__(self): return 'Block(%r)' % self.runs __str__ = __repr__ def is_empty(self): if self.force_not_empty: return False for run in self.runs: if not run.is_empty(): return False return True class Blocks(object): def __init__(self, namespace, styles_manager, links_manager): self.top_bookmark = None self.namespace = namespace self.styles_manager = styles_manager self.links_manager = links_manager self.all_blocks = [] self.pos = 0 self.current_block = None self.items = [] self.tables = [] self.current_table = None self.open_html_blocks = set() self.html_tag_start_blocks = {} def current_or_new_block(self, html_tag, tag_style): return self.current_block or self.start_new_block(html_tag, tag_style) def end_current_block(self): if self.current_block is not None: self.all_blocks.append(self.current_block) if self.current_table is not None and self.current_table.current_row is not None: self.current_table.add_block(self.current_block) else: self.block_map[self.current_block] = len(self.items) self.items.append(self.current_block) self.current_block.parent_items = self.items self.current_block = None def start_new_block(self, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False): parent_bg = None if html_block is not None: p = html_block.getparent() b = self.html_tag_start_blocks.get(p) if b is not None: ps = self.styles_manager.styles_for_html_blocks.get(p) if ps is not None and ps.background_color is not None: parent_bg = ps.background_color self.end_current_block() self.current_block = Block( self.namespace, self.styles_manager, self.links_manager, html_block, style, is_table_cell=is_table_cell, float_spec=float_spec, is_list_item=is_list_item, parent_bg=parent_bg) self.html_tag_start_blocks[html_block] = self.current_block self.open_html_blocks.add(html_block) return self.current_block def start_new_table(self, html_tag, tag_style=None): self.current_table = Table(self.namespace, html_tag, tag_style) self.tables.append(self.current_table) def start_new_row(self, html_tag, tag_style): if self.current_table is None: self.start_new_table(html_tag) self.current_table.start_new_row(html_tag, tag_style) def start_new_cell(self, html_tag, tag_style): if self.current_table is None: self.start_new_table(html_tag) self.current_table.start_new_cell(html_tag, tag_style) def finish_tag(self, html_tag): if self.current_block is not None and html_tag in self.open_html_blocks: start_block = self.html_tag_start_blocks.get(html_tag) if start_block is not None and start_block.html_style['page-break-after'] == 'always': self.current_block.page_break_after = True self.end_current_block() self.open_html_blocks.discard(html_tag) if self.current_table is not None: table_finished = self.current_table.finish_tag(html_tag) if table_finished: table = self.tables[-1] del self.tables[-1] if self.tables: self.current_table = self.tables[-1] self.current_table.add_table(table) else: self.current_table = None self.block_map[table] = len(self.items) self.items.append(table) def serialize(self, body): for item in self.items: item.serialize(body) def delete_block_at(self, pos=None): pos = self.pos if pos is None else pos block = self.all_blocks[pos] del self.all_blocks[pos] bpos = self.block_map.pop(block, None) if bpos is not None: del self.items[bpos] else: items = self.items if block.parent_items is None else block.parent_items items.remove(block) block.parent_items = None if block.float_spec is not None: block.float_spec.blocks.remove(block) try: next_block = self.all_blocks[pos] next_block.bookmarks.update(block.bookmarks) for attr in 'page_break_after page_break_before'.split(): setattr(next_block, attr, getattr(block, attr)) except (IndexError, KeyError): pass def __enter__(self): self.pos = len(self.all_blocks) self.block_map = {} def __exit__(self, etype, value, traceback): if value is not None: return # Since there was an exception, the data structures are not in a consistent state if self.current_block is not None: self.all_blocks.append(self.current_block) self.current_block = None if len(self.all_blocks) > self.pos and self.all_blocks[self.pos].is_empty(): # Delete the empty block corresponding to the
tag when the # body tag has no inline content before its first sub-block self.delete_block_at(self.pos) if self.pos > 0 and self.pos < len(self.all_blocks): # Insert a page break corresponding to the start of the html file self.all_blocks[self.pos].page_break_before = True if self.top_bookmark is not None: self.all_blocks[self.pos].bookmarks.add(self.top_bookmark) self.top_bookmark = None self.block_map = {} def apply_page_break_after(self): for i, block in enumerate(self.all_blocks): if block.page_break_after and i < len(self.all_blocks) - 1: next_block = self.all_blocks[i + 1] if next_block.parent_items is block.parent_items and block.parent_items is self.items: next_block.page_break_before = True def resolve_language(self): default_lang = self.styles_manager.document_lang for block in self.all_blocks: count = collections.Counter() for run in block.runs: count[run.lang] += 1 if count: block.block_lang = bl = count.most_common(1)[0][0] for run in block.runs: if run.lang == bl: run.lang = None if bl == default_lang: block.block_lang = None def __repr__(self): return 'Block(%r)' % self.runs class Convert(object): # Word does not apply default styling to hyperlinks, so we ensure they get # default styling (the conversion pipeline does not apply any styling to # them). base_css = ''' a[href] { text-decoration: underline; color: blue } ''' def __init__(self, oeb, docx, mi, add_cover, add_toc): self.oeb, self.docx, self.add_cover, self.add_toc = oeb, docx, add_cover, add_toc self.log, self.opts = docx.log, docx.opts self.mi = mi self.cover_img = None p = self.opts.output_profile p.width_pts, p.height_pts = page_effective_area(self.opts) def __call__(self): from ebook_converter.ebooks.oeb.transforms.rasterize import SVGRasterizer self.svg_rasterizer = SVGRasterizer(base_css=self.base_css) self.svg_rasterizer(self.oeb, self.opts) self.styles_manager = StylesManager(self.docx.namespace, self.log, self.mi.language) self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships, self.log) self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts) self.lists_manager = ListsManager(self.docx) self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts) self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager) self.current_link = self.current_lang = None for item in self.oeb.spine: self.log.debug('Processing', item.href) self.process_item(item) if self.add_toc: self.links_manager.process_toc_links(self.oeb) if self.add_cover and self.oeb.metadata.cover and str(self.oeb.metadata.cover[0]) in self.oeb.manifest.ids: cover_id = str(self.oeb.metadata.cover[0]) item = self.oeb.manifest.ids[cover_id] self.cover_img = self.images_manager.read_image(item.href) all_blocks = self.blocks.all_blocks remove_blocks = [] for i, block in enumerate(all_blocks): try: nb = all_blocks[i+1] except IndexError: break block.resolve_skipped(nb) if block.skipped: remove_blocks.append((i, block)) for pos, block in reversed(remove_blocks): self.blocks.delete_block_at(pos) self.blocks.all_blocks[0].is_first_block = True self.blocks.apply_page_break_after() self.blocks.resolve_language() if self.cover_img is not None: self.cover_img = self.images_manager.create_cover_markup(self.cover_img, self.opts.preserve_cover_aspect_ratio, *page_size(self.opts)) self.lists_manager.finalize(all_blocks) self.styles_manager.finalize(all_blocks) self.write() def process_item(self, item): self.current_item = item stylizer = self.svg_rasterizer.stylizer_cache.get(item) if stylizer is None: stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, profile=self.opts.output_profile, base_css=self.base_css) self.abshref = self.images_manager.abshref = item.abshref self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang for i, body in enumerate(base.XPath('//h:body')(item.data)): with self.blocks: self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(self.links_manager.top_anchor, self.current_item, body) self.process_tag(body, stylizer, is_first_tag=i == 0) def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None): tagname = parse_utils.barename(html_tag.tag) tag_style = stylizer.style(html_tag) ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden display = tag_style._get('display') is_block = False if not ignore_tag_contents: previous_link = self.current_link if tagname == 'a' and html_tag.get('href'): self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title')) previous_lang = self.current_lang tag_lang = lang_for_tag(html_tag) if tag_lang: self.current_lang = tag_lang is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag if float_spec is None and is_float: float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style) if display in {'inline', 'inline-block'} or tagname == 'br': #