"""
Transform OEB content into FB2 markup
"""
from datetime import datetime
import re
import textwrap
import urllib.parse
import uuid

from lxml import etree

from ebook_converter import constants as const
from ebook_converter import prepare_string_for_xml
from ebook_converter.constants_old import __appname__, __version__
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.polyglot.binary import as_base64_unicode
from ebook_converter.utils.img import save_cover_data_to
from ebook_converter.utils.localization import lang_as_iso639_1


__license__ = 'GPL 3'
__copyright__ = '2009, John Schember '
__docformat__ = 'restructuredtext en'


class FB2MLizer(object):
    '''
    Todo: * Include more FB2 specific tags in the conversion.
          * Handle notes and anchor links.
    '''

    def __init__(self, log):
        """Store the logger and initialise the per-conversion state."""
        self.log = log
        self.reset_state()

    def reset_state(self):
        """Reset everything that is tracked while a book is converted."""
        # Used to ensure text and tags are always within <p> and </p>
        self.in_p = False
        # Mapping of image names. OEB allows for images to have the same name
        # but be stored in different directories. FB2 images are all in a flat
        # layout so we rename all images into a sequential numbering system to
        # ensure there are no collisions between image names.
        self.image_hrefs = {}
        # Mapping of page href -> {anchor id (or None for the page itself):
        # TOC depth (or 'page')}, filled in by create_flat_toc().
        self.toc = {}
        # Used to see whether a new <section> needs to be opened
        self.section_level = 0

    def extract_content(self, oeb_book, opts):
        """
        Convert oeb_book to an FB2 document and return it as a string.

        @param oeb_book: The OEB book to convert.
        @param opts: Conversion options. This class reads sectionize,
        pretty_print, insert_blank_line, fb2_genre and output_profile.
        """
        self.log.info('Converting XHTML to FB2 markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        self.reset_state()

        # Used for adding <section>s and <title>s to allow readers
        # to generate toc from the document.
        if self.opts.sectionize == 'toc':
            self.create_flat_toc(self.oeb_book.toc, 1)

        return self.fb2mlize_spine()

    def fb2mlize_spine(self):
        """Assemble header, body, images and footer into the final XML."""
        # The header is built first: get_cover() (called from fb2_header)
        # registers the cover in self.image_hrefs before the images are
        # serialised.
        output = (
            self.fb2_header(),
            self.get_text(),
            self.fb2mlize_images(),
            self.fb2_footer(),
        )
        output = self.clean_text('\n'.join(output))

        if self.opts.pretty_print:
            output = etree.tostring(etree.fromstring(output),
                                    encoding='unicode', pretty_print=True)

        return '<?xml version="1.0" encoding="UTF-8"?>\n' + output

    def clean_text(self, text):
        """Tidy the generated FB2 markup and return the cleaned string."""
        # Remove pointless tags, but keep their contents.
        text = re.sub(r'(?mu)<(strong|emphasis|strikethrough|sub|sup)>'
                      r'(\s*)</\1>', r'\2', text)

        # Clean up paragraphs endings.
        text = re.sub(r'(?mu)\s+</p>', '</p>', text)
        # Condense empty paragraphs into a line break.
        text = re.sub(r'(?mu)(?:<p></p>\s*){3,}', '<empty-line/>', text)
        # Remove empty paragraphs.
        text = re.sub(r'(?mu)<p></p>\s*', '', text)
        # Put the paragraph following a paragraph on a separate line.
        text = re.sub(r'(?mu)</p>\s*<p>', '</p>\n<p>', text)

        if self.opts.insert_blank_line:
            text = re.sub(r'(?mu)</p>', '</p><empty-line/>', text)

        # Clean up title endings.
        text = re.sub(r'(?mu)\s+</title>', '</title>', text)
        # Remove empty title elements.
        text = re.sub(r'(?mu)<title></title>\s*', '', text)
        # Put the paragraph following a title on a separate line.
        text = re.sub(r'(?mu)</title>\s*<p>', '</title>\n<p>', text)

        # Put line breaks between paragraphs on a separate line.
        text = re.sub(r'(?mu)</(p|title)>\s*<empty-line/>',
                      r'</\1>\n<empty-line/>', text)
        text = re.sub(r'(?mu)<empty-line/>\s*<p>', '<empty-line/>\n<p>', text)

        # Remove empty sections.
        text = re.sub(r'(?mu)<section>\s*</section>', '', text)
        # Clean up sections starts and ends.
        text = re.sub(r'(?mu)\s*<section>', '\n<section>', text)
        text = re.sub(r'(?mu)<section>\s*', '<section>\n', text)
        text = re.sub(r'(?mu)\s*</section>', '\n</section>', text)
        text = re.sub(r'(?mu)</section>\s*', '</section>\n', text)

        return text

    def fb2_header(self):
        """
        Build the opening <FictionBook> tag and the <description> block from
        the book metadata and return them as a string.
        """
        book_meta = self.oeb_book.metadata
        # Sample the clock once so the three date fields cannot disagree.
        now = datetime.now()

        metadata = {}
        metadata['title'] = book_meta.title[0].value
        metadata['appname'] = __appname__
        metadata['version'] = __version__
        metadata['date'] = '%i.%i.%i' % (now.day, now.month, now.year)
        if book_meta.language:
            lc = lang_as_iso639_1(book_meta.language[0].value)
            if not lc:
                lc = book_meta.language[0].value
            metadata['lang'] = lc or 'en'
        else:
            metadata['lang'] = u'en'
        metadata['id'] = None
        metadata['cover'] = self.get_cover()
        metadata['genre'] = self.opts.fb2_genre

        authors = []
        for auth in book_meta.creator:
            author_first = ''
            author_middle = ''
            author_last = ''
            author_parts = auth.value.split(' ')
            if len(author_parts) == 1:
                author_last = author_parts[0]
            elif len(author_parts) == 2:
                author_first = author_parts[0]
                author_last = author_parts[1]
            else:
                author_first = author_parts[0]
                author_middle = ' '.join(author_parts[1:-1])
                author_last = author_parts[-1]
            authors.append('<author>')
            authors.append('<first-name>%s</first-name>' %
                           prepare_string_for_xml(author_first))
            if author_middle:
                authors.append('<middle-name>%s</middle-name>' %
                               prepare_string_for_xml(author_middle))
            authors.append('<last-name>%s</last-name>' %
                           prepare_string_for_xml(author_last))
            authors.append('</author>')
        metadata['author'] = ''.join(authors)
        if not metadata['author']:
            # Always emit an <author> element, even without creators.
            metadata['author'] = ('<author><first-name></first-name>'
                                  '<last-name></last-name></author>')

        metadata['keywords'] = ''
        tags = list(map(str, book_meta.subject))
        if tags:
            tags = ', '.join(prepare_string_for_xml(x) for x in tags)
            metadata['keywords'] = '<keywords>%s</keywords>' % tags

        metadata['sequence'] = ''
        if book_meta.series:
            index = '1'
            if book_meta.series_index:
                index = book_meta.series_index[0]
            seq = prepare_string_for_xml(str(book_meta.series[0]))
            metadata['sequence'] = ('<sequence name="%s" number="%s"/>' %
                                    (seq, index))

        scheme_attr = base.tag('opf', 'scheme')

        def scheme_of(identifier):
            # Identifiers without an opf:scheme attribute yield None here;
            # treat that as "no scheme" instead of failing on .lower().
            return (identifier.get(scheme_attr, None) or '').lower()

        year = publisher = isbn = ''
        identifiers = self.oeb_book.metadata['identifier']
        for x in identifiers:
            if scheme_of(x) == 'uuid' or str(x).startswith('urn:uuid:'):
                metadata['id'] = str(x).split(':')[-1]
                break
        if metadata['id'] is None:
            self.log.warn('No UUID identifier found')
            metadata['id'] = str(uuid.uuid4())

        try:
            date = self.oeb_book.metadata['date'][0]
        except IndexError:
            pass
        else:
            year = ('<year>%s</year>' %
                    prepare_string_for_xml(date.value.partition('-')[0]))

        try:
            publisher = self.oeb_book.metadata['publisher'][0]
        except IndexError:
            pass
        else:
            publisher = ('<publisher>%s</publisher>' %
                         prepare_string_for_xml(publisher.value))

        for x in identifiers:
            if scheme_of(x) == 'isbn':
                isbn = '<isbn>%s</isbn>' % prepare_string_for_xml(x.value)

        metadata['year'] = year
        metadata['isbn'] = isbn
        metadata['publisher'] = publisher
        # Escape the plain-text values; the keys listed here already hold
        # ready-made (and already escaped) markup.
        for key, value in metadata.items():
            if key not in ('author', 'cover', 'sequence', 'keywords', 'year',
                           'publisher', 'isbn'):
                metadata[key] = prepare_string_for_xml(value)

        try:
            comments = self.oeb_book.metadata['description'][0]
        except Exception:
            metadata['comments'] = ''
        else:
            from ebook_converter.utils.html2text import html2text
            annot = prepare_string_for_xml(html2text(comments.value).strip())
            metadata['comments'] = f'<annotation><p>{annot}</p></annotation>'

        # Keep the indentation level of the description the same as the body.
        header = textwrap.dedent('''\
            <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">
            <description>
                <title-info>
                    <genre>%(genre)s</genre>
                    %(author)s
                    <book-title>%(title)s</book-title>
                    %(cover)s
                    <lang>%(lang)s</lang>
                    %(keywords)s
                    %(sequence)s
                    %(comments)s
                </title-info>
                <document-info>
                    %(author)s
                    <program-used>%(appname)s %(version)s</program-used>
                    <date>%(date)s</date>
                    <id>%(id)s</id>
                    <version>1.0</version>
                </document-info>
                <publish-info>
                    %(publisher)s
                    %(year)s
                    %(isbn)s
                </publish-info>
            </description>''') % metadata

        # Remove empty lines.
        return '\n'.join(filter(str.strip, header.splitlines()))

    def fb2_footer(self):
        """Return the closing tag of the document."""
        return '</FictionBook>'

    def get_cover(self):
        """
        Return the <coverpage> markup for the book cover, or '' when no
        usable cover image is found. Registers the cover in
        self.image_hrefs.
        """
        cover_href = None

        # Get the raster cover if it's available.
        if (self.oeb_book.metadata.cover and
                str(self.oeb_book.metadata.cover[0]) in
                self.oeb_book.manifest.ids):
            cover_id = str(self.oeb_book.metadata.cover[0])
            cover_item = self.oeb_book.manifest.ids[cover_id]
            if cover_item.media_type in base.OEB_RASTER_IMAGES:
                cover_href = cover_item.href
        else:
            # Figure out if we have a title page or a cover page
            page_name = ''
            if 'titlepage' in self.oeb_book.guide:
                page_name = 'titlepage'
            elif 'cover' in self.oeb_book.guide:
                page_name = 'cover'

            if page_name:
                key = self.oeb_book.guide[page_name].href
                cover_item = self.oeb_book.manifest.hrefs[key]
                # Get the first image in the page
                for img in cover_item.xpath('//img'):
                    cover_href = cover_item.abshref(img.get('src'))
                    break

        if cover_href:
            # Only write the image tag if it is in the manifest.
            if (cover_href in self.oeb_book.manifest.hrefs and
                    cover_href not in self.image_hrefs):
                self.image_hrefs[cover_href] = ('img_%s' %
                                                len(self.image_hrefs))
            # A cover that is missing from the manifest was never registered;
            # fall through to '' instead of raising KeyError.
            if cover_href in self.image_hrefs:
                return ('<coverpage><image l:href="#%s"/></coverpage>' %
                        self.image_hrefs[cover_href])

        return ''

    def get_text(self):
        """Convert every spine item and return the <body> element markup."""
        from ebook_converter.ebooks.oeb.stylizer import Stylizer
        text = ['<body>']

        # Create main section if there are no others to create
        if self.opts.sectionize == 'nothing':
            text.append('<section>')
            self.section_level += 1

        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to FictionBook2 XML' % item.href)
            stylizer = Stylizer(item.data, item.href, self.oeb_book,
                                self.opts, self.opts.output_profile)

            # Start a <section> if we must sectionize each file or if the TOC
            # references this page
            page_section_open = False
            # dump_text() resets a consumed page entry to None, hence the
            # "or ()" guard before the membership test.
            if (self.opts.sectionize == 'files' or
                    None in (self.toc.get(item.href) or ())):
                text.append('<section>')
                page_section_open = True
                self.section_level += 1

            text += self.dump_text(item.data.find(base.tag('xhtml', 'body')),
                                   stylizer, item)

            if page_section_open:
                text.append('</section>')
                self.section_level -= 1

        # Close any open sections
        while self.section_level > 0:
            text.append('</section>')
            self.section_level -= 1

        text.append('</body>')
        return ''.join(text)

    def fb2mlize_images(self):
        """
        This function uses the self.image_hrefs dictionary mapping. It is
        populated by the dump_text function (and by get_cover for the cover).

        @return: The <binary> elements for all referenced raster images,
        joined with newlines.
        """
        images = []
        for item in self.oeb_book.manifest:
            # Don't write the image if it's not referenced in the document's
            # text.
            if item.href not in self.image_hrefs:
                continue
            if item.media_type in base.OEB_RASTER_IMAGES:
                try:
                    if item.media_type not in ('image/jpeg', 'image/png'):
                        # Anything that is not JPEG or PNG is re-encoded and
                        # declared as JPEG.
                        imdata = save_cover_data_to(item.data,
                                                    compression_quality=70)
                        raw_data = as_base64_unicode(imdata)
                        content_type = 'image/jpeg'
                    else:
                        raw_data = as_base64_unicode(item.data)
                        content_type = item.media_type
                    # Don't put the encoded image on a single line.
                    step = 72
                    data = '\n'.join(raw_data[i:i+step]
                                     for i in range(0, len(raw_data), step))
                    images.append('<binary id="%s" content-type="%s">%s'
                                  '</binary>' % (self.image_hrefs[item.href],
                                                 content_type, data))
                except Exception as e:
                    # Best effort: a broken image must not abort the whole
                    # conversion.
                    self.log.error('Error: Could not include file %s because '
                                   '%s.' % (item.href, e))
        return '\n'.join(images)

    def create_flat_toc(self, nodes, level):
        """
        Recursively flatten the TOC tree into self.toc.

        @param nodes: Iterable of TOC nodes; each has href and nodes.
        @param level: Nesting depth of nodes, starting from 1.
        """
        for item in nodes:
            href, _sep, anchor = item.href.partition('#')
            if not anchor:
                # The entry points at the page as a whole.
                self.toc[href] = {None: 'page'}
            else:
                if not self.toc.get(href, None):
                    self.toc[href] = {}
                self.toc[href][anchor] = level
                self.create_flat_toc(item.nodes, level + 1)

    def ensure_p(self):
        """
        Open a paragraph unless one is already open.

        @return: Tuple (markup to emit, tags that were opened).
        """
        if self.in_p:
            return [], []
        else:
            self.in_p = True
            return ['<p>'], ['p']

    def close_open_p(self, tags):
        """
        Start a fresh paragraph.

        If a paragraph is open, close every tag up to and including the
        innermost <p> and reopen them in the original order; otherwise open
        a new <p>.

        @param tags: Open FB2 tags, outermost first. Not modified.
        @return: Tuple (markup to emit, True if a new <p> was opened that the
        caller has to close).
        """
        text = ['']
        added_p = False

        if self.in_p:
            # Close all up to p. Close p. Reopen all closed tags including p.
            closed_tags = []
            for t in reversed(tags):
                text.append('</%s>' % t)
                closed_tags.append(t)
                if t == 'p':
                    break
            closed_tags.reverse()
            for t in closed_tags:
                text.append('<%s>' % t)
        else:
            text.append('<p>')
            added_p = True
            self.in_p = True

        return text, added_p

    def handle_simple_tag(self, tag, tags):
        """
        Open an inline FB2 tag (making sure it lives inside a <p>) unless it
        is already open.

        @param tag: FB2 tag name, e.g. 'strong'.
        @param tags: Currently open FB2 tags.
        @return: Tuple (markup to emit, tags that were opened).
        """
        s_out = []
        s_tags = []
        if tag not in tags:
            p_out, p_tags = self.ensure_p()
            s_out += p_out
            s_tags += p_tags
            s_out.append('<%s>' % tag)
            s_tags.append(tag)
        return s_out, s_tags

    def dump_text(self, elem_tree, stylizer, page, tag_stack=None):
        """
        This function is intended to be used in a recursive manner. dump_text
        will run though all elements in the elem_tree and call itself on each
        element.

        self.image_hrefs will be populated by calling this function.

        @param elem_tree: etree representation of XHTML content to be
        transformed.
        @param stylizer: Used to track the style of elements within the tree.
        @param page: OEB page used to determine absolute urls.
        @param tag_stack: List of open FB2 tags to take into account.

        @return: List of string representing the XHTML converted to FB2
        markup.
        """
        if tag_stack is None:
            tag_stack = []
        elem = elem_tree

        # Ensure what we are converting is not a string and that the first tag
        # is part of the XHTML namespace.
        if (not isinstance(elem_tree.tag, (str, bytes)) or
                parse_utils.namespace(elem_tree.tag) != const.XHTML_NS):
            p = elem.getparent()
            if (p is not None and isinstance(p.tag, (str, bytes)) and
                    parse_utils.namespace(p.tag) == const.XHTML_NS and
                    elem.tail):
                # The tail is parsed (unescaped) text, so escape it before it
                # goes back into XML.
                return [prepare_string_for_xml(elem.tail)]
            return []

        style = stylizer.style(elem_tree)
        if (style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or
                style['visibility'] == 'hidden'):
            # The element itself is dropped, the text following it is kept.
            if hasattr(elem, 'tail') and elem.tail:
                return [prepare_string_for_xml(elem.tail)]
            return []

        # FB2 generated output.
        fb2_out = []
        # FB2 tags in the order they are opened. This will be used to close
        # the tags.
        tags = []
        # First tag in tree
        tag = parse_utils.barename(elem_tree.tag)
        # Number of blank lines above tag
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems < 0:
                ems = 0
        except Exception:
            ems = 0

        # Convert TOC entries to <title>s and add <section>s
        if self.opts.sectionize == 'toc':
            # A section cannot be a child of any other element than another
            # section, so leave the tag alone if there are parents
            if not tag_stack:
                # There are two reasons to start a new section here: the TOC
                # pointed to this page (then we use the first non-<body> on
                # the page as a <title>), or the TOC pointed to a specific
                # element
                newlevel = 0
                toc_entry = self.toc.get(page.href, None)
                if toc_entry is not None:
                    if None in toc_entry:
                        if (tag != 'body' and hasattr(elem_tree, 'text') and
                                elem_tree.text):
                            newlevel = 1
                            # The page-level entry is consumed now.
                            self.toc[page.href] = None
                    if (not newlevel and
                            elem_tree.attrib.get('id', None) is not None):
                        newlevel = toc_entry.get(
                            elem_tree.attrib.get('id', None), None)

                # Start a new section if necessary
                if newlevel:
                    while newlevel <= self.section_level:
                        fb2_out.append('</section>')
                        self.section_level -= 1
                    fb2_out.append('<section>')
                    self.section_level += 1
                    fb2_out.append('<title>')
                    tags.append('title')
            if self.section_level == 0:
                # If none of the prior processing made a section, make one now
                # to be FB2 spec compliant
                fb2_out.append('<section>')
                self.section_level += 1

        # Process the XHTML tag and styles. Converted to an FB2 tag.
        # Use individual if statement not if else. There can be only one XHTML
        # tag but it can have multiple styles.
        if tag == 'img' and elem_tree.attrib.get('src', None):
            # Only write the image tag if it is in the manifest.
            ihref = base.urlnormalize(page.abshref(elem_tree.attrib['src']))
            if ihref in self.oeb_book.manifest.hrefs:
                if ihref not in self.image_hrefs:
                    self.image_hrefs[ihref] = ('img_%s' %
                                               len(self.image_hrefs))
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<image l:href="#%s"/>' %
                               self.image_hrefs[ihref])
            else:
                self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
        if tag in ('br', 'hr') or ems >= 1:
            if ems < 1:
                multiplier = 1
            else:
                multiplier = ems
            if self.in_p:
                # <empty-line/> is emitted outside the paragraph: close
                # everything up to the <p>, emit it, then reopen.
                closed_tags = []
                for t in reversed(tag_stack + tags):
                    fb2_out.append('</%s>' % t)
                    closed_tags.append(t)
                    if t == 'p':
                        break
                fb2_out.append('<empty-line/>' * multiplier)
                closed_tags.reverse()
                for t in closed_tags:
                    fb2_out.append('<%s>' % t)
            else:
                fb2_out.append('<empty-line/>' * multiplier)
        if tag in ('div', 'li', 'p'):
            p_text, added_p = self.close_open_p(tag_stack + tags)
            fb2_out += p_text
            if added_p:
                tags.append('p')
        if tag == 'a' and elem_tree.attrib.get('href', None):
            # Handle only external links for now
            if urllib.parse.urlparse(elem_tree.attrib['href']).netloc:
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<a l:href="%s">' %
                               base.urlnormalize(elem_tree.attrib['href']))
                tags.append('a')

        # Inline formatting: the element name and the computed style can both
        # request the same FB2 tag.
        inline_tags = (
            ('strong',
             tag == 'b' or style['font-weight'] in ('bold', 'bolder')),
            ('emphasis',
             tag == 'i' or style['font-style'] == 'italic'),
            ('strikethrough',
             tag in ('del', 'strike') or
             style['text-decoration'] == 'line-through'),
            ('sub', tag == 'sub'),
            ('sup', tag == 'sup'),
        )
        for fb2_tag, wanted in inline_tags:
            if wanted:
                s_out, s_tags = self.handle_simple_tag(fb2_tag,
                                                       tag_stack + tags)
                fb2_out += s_out
                tags += s_tags

        # Process element text.
        if hasattr(elem_tree, 'text') and elem_tree.text:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.text))
            if not self.in_p:
                fb2_out.append('</p>')

        # Process sub-elements.
        for item in elem_tree:
            fb2_out += self.dump_text(item, stylizer, page, tag_stack + tags)

        # Close open FB2 tags.
        tags.reverse()
        fb2_out += self.close_tags(tags)

        # Process element text that comes after the close of the XHTML tag but
        # before the next XHTML tag.
        if hasattr(elem_tree, 'tail') and elem_tree.tail:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.tail))
            if not self.in_p:
                fb2_out.append('</p>')

        return fb2_out

    def close_tags(self, tags):
        """
        Return the closing markup for tags, in the given order, and clear
        self.in_p when a <p> is closed.
        """
        text = []
        for tag in tags:
            text.append('</%s>' % tag)
            if tag == 'p':
                self.in_p = False

        return text