""" Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. License: GPLv3 Copyright: 2008, Kovid Goyal """ import copy import glob import os import re import sys import tempfile import urllib.parse import collections import functools import itertools import math import bs4 from ebook_converter import __appname__, entity_to_unicode, fit_image, \ force_unicode from ebook_converter.constants_old import filesystem_encoding, \ preferred_encoding from ebook_converter.devices.interface import DevicePlugin as Device from ebook_converter.ebooks import ConversionError from ebook_converter.ebooks.BeautifulSoup import html5_parser from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.lrf import Book from ebook_converter.ebooks.lrf.html.color_map import lrs_color from ebook_converter.ebooks.lrf.html.table import Table from ebook_converter.ebooks.lrf.pylrs.pylrs import ( CR, BlockSpace, BookSetting, Canvas, CharButton, DropCaps, EmpLine, Image, ImageBlock, ImageStream, Italic, JumpButton, LrsError, Paragraph, Plot, RuledLine, Span, Sub, Sup, TextBlock ) from ebook_converter.ptempfile import PersistentTemporaryFile from PIL import Image as PILImage def strip_style_comments(match): src = match.group() while True: lindex = src.find('/*') if lindex < 0: break rindex = src.find('*/', lindex) if rindex < 0: src = src[:lindex] break src = src[:lindex] + src[rindex+2:] return src SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+" r"[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (bs4.Comment, bs4.Declaration, bs4.ProcessingInstruction) MARKUP_MASSAGE = [ # Close tags (re.compile(r']*)?/>', re.IGNORECASE), lambda match: ''), # Strip comments from )', re.IGNORECASE | re.DOTALL), strip_style_comments), # Remove self closing script tags as they also mess up # BeautifulSoup (re.compile(r'(?i)]+?/>'), lambda match: ''), # BeautifulSoup treats self closing

tags as open

# tags (re.compile(r'(?i)<\s*div([^>]*)/\s*>'), lambda match: '

' % match.group(1))] # Fix Baen markup BAEN = [(re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), lambda match: match.group(1)), (re.compile(r'

\s*(\s*)\s*

', re.IGNORECASE), lambda match: match.group(1)), (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), lambda match: '')] # Fix pdftohtml markup PDFTOHTML = [(re.compile(r'', re.IGNORECASE), lambda match: '
'), # Remove page numbers (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''), # Remove
and replace

with

(re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), (re.compile(r'(.*)', re.IGNORECASE), lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 else match.group(1)), # Remove hyphenation (re.compile(r'-\n\r?'), lambda match: '')] # Fix Book Designer markup BOOK_DESIGNER = [(re.compile('

', re.IGNORECASE), lambda match: '' ' '), (re.compile(r'<]*?id=BookTitle[^><]*?(align=)*(?(1)' r'(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match: '

%s

' % (match.group(2) if match.group(2) else 'center', match.group(3))), (re.compile(r'<]*?id=BookAuthor[^><]*?(align=)*(?(1)' r'(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match: '

%s

' % (match.group(2) if match.group(2) else 'center', match.group(3))), (re.compile(r'<]*?id=title[^><]*?>(.*?)', re.IGNORECASE | re.DOTALL), lambda match: '

%s

' % match.group(1)), (re.compile(r'<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE | re.DOTALL), lambda match: '

%s

' % match.group(1)), (re.compile(r'<]*?>( ){4}

', re.IGNORECASE), lambda match: '

')] def update_css(ncss, ocss): for key in ncss.keys(): if key in ocss: ocss[key].update(ncss[key]) else: ocss[key] = ncss[key] def munge_paths(basepath, url): purl = urllib.parse.urlparse(urllib.parse.unquote(url),) path, fragment = purl[2], purl[5] if path: path = path.replace('/', os.sep) if not path: path = basepath elif not os.path.isabs(path): dn = os.path.dirname(basepath) path = os.path.join(dn, path) return os.path.normpath(path), fragment def tag_regex(tagname): """ Return non-grouping regular expressions that match the opening and closing tags for tagname """ return dict(open=r'(?:<\s*%(t)s\s+[^<>]*?>|<\s*%(t)s\s*>)' % dict(t=tagname), close=r'' % dict(t=tagname)) class HTMLConverter(object): def __hasattr__(self, attr): if hasattr(self.options, attr): return True return object.__hasattr__(self, attr) def __getattr__(self, attr): if hasattr(self.options, attr): return getattr(self.options, attr) return object.__getattribute__(self, attr) def __setattr__(self, attr, val): if hasattr(self.options, attr): setattr(self.options, attr, val) else: object.__setattr__(self, attr, val) CSS = {'h1': {"font-size": "xx-large", "font-weight": "bold", 'text-indent': '0pt'}, 'h2': {"font-size": "x-large", "font-weight": "bold", 'text-indent': '0pt'}, 'h3': {"font-size": "large", "font-weight": "bold", 'text-indent': '0pt'}, 'h4': {"font-size": "large", 'text-indent': '0pt'}, 'h5': {"font-weight": "bold", 'text-indent': '0pt'}, 'b': {"font-weight": "bold"}, 'strong': {"font-weight": "bold"}, 'i': {"font-style": "italic"}, 'cite': {'font-style': 'italic'}, 'em': {"font-style": "italic"}, 'small': {'font-size': 'small'}, 'pre': {'font-family': 'monospace', 'white-space': 'pre'}, 'code': {'font-family': 'monospace'}, 'tt': {'font-family': 'monospace'}, 'center': {'text-align': 'center'}, 'th': {'font-size': 'large', 'font-weight': 'bold'}, 'big': {'font-size': 'large', 'font-weight': 'bold'}, '.libprs500_dropcaps': {'font-size': 'xx-large'}, 'u': {'text-decoration': 'underline'}, 'sup': {'vertical-align': 'super', 'font-size': '60%'}, 'sub': {'vertical-align': 'sub', 'font-size': '60%'}, } def __init__(self, book, fonts, options, logger, paths): """ Convert HTML files at C{paths} and add to C{book}. After creating the object, you must call L{self.writeto} to output the LRF/S file. @param book: The LRF book @type book: L{lrf.pylrs.Book} @param fonts: dict specifying the font families to use """ # Defaults for various formatting tags object.__setattr__(self, 'options', options) self.log = logger self.fonts = fonts # : dict specifying font families to use # Memory # Temporary files with scaled version of images self.scaled_images = {} # Temporary files with rotated version of images self.rotated_images = {} # Keep track of already used textstyles self.text_styles = [] # Keep track of already used blockstyles self.block_styles = [] # Images referenced in the HTML document self.images = {} # and id elements self.targets = {} # elements self.links = collections.deque() self.processed_files = [] # TOC entries gleaned from semantic information self.extra_toc_entries = [] self.image_memory = [] self.id_counter = 0 # Used to remove extra TextBlocks self.unused_target_blocks = [] # Current link level self.link_level = 0 # Used to ensure that duplicate CSS unhandled erros are not reported self.memory = [] # element representing the top of each HTML file in the LRF file self.tops = {} # Used to figure out when to lstrip self.previous_text = '' self.stripped_space = '' # Used so that

tags in

elements are handled properly self.preserve_block_style = False self.avoid_page_break = False self.current_page = book.create_page() # Styles self.blockquote_style = book.create_block_style(sidemargin=60, topskip=20, footskip=20) self.unindented_style = book.create_text_style(parindent=0) self.in_table = False # List processing self.list_level = 0 self.list_indent = 20 self.list_counter = 1 # The Book object representing a BBeB book self.book = book self.override_css = {} self.override_pcss = {} if self._override_css is not None: if os.access(self._override_css, os.R_OK): with open(self._override_css, 'rb') as f: src = f.read() else: src = self._override_css if isinstance(src, bytes): src = src.decode('utf-8', 'replace') match = PAGE_BREAK_PAT.search(src) if match and not re.match('avoid', match.group(1), re.IGNORECASE): self.page_break_found = True ncss, npcss = self.parse_css(src) if ncss: update_css(ncss, self.override_css) if npcss: update_css(npcss, self.override_pcss) paths = [os.path.abspath(path) for path in paths] paths = [path.decode(sys.getfilesystemencoding()) if not isinstance(path, str) else path for path in paths] while len(paths) > 0 and self.link_level <= self.link_levels: for path in paths: if path in self.processed_files: continue try: self.add_file(path) except KeyboardInterrupt: raise except Exception: # Die on errors in the first level if self.link_level == 0: raise for link in self.links: if link['path'] == path: self.links.remove(link) break self.log.warn('Could not process '+path) if self.verbose: self.log.exception(' ') self.links = self.process_links() self.link_level += 1 paths = [link['path'] for link in self.links] if self.current_page is not None and self.current_page.has_text(): self.book.append(self.current_page) for text, tb in self.extra_toc_entries: self.book.addTocEntry(text, tb) if self.base_font_size > 0: self.log.info('\tRationalizing font sizes...') self.book.rationalize_font_sizes(self.base_font_size) def is_baen(self, soup): return bool(soup.find('meta', attrs={'name': 'Publisher', 'content': re.compile('Baen', re.IGNORECASE)})) def is_book_designer(self, raw): return bool(re.search('<]*id=BookTitle', raw)) def preprocess(self, raw): nmassage = MARKUP_MASSAGE[:] if not self.book_designer and self.is_book_designer(raw): self.book_designer = True self.log.info('\tBook Designer file detected.') self.log.info('\tParsing HTML...') if self.baen: nmassage.extend(BAEN) if self.pdftohtml: nmassage.extend(PDFTOHTML) if self.book_designer: nmassage.extend(BOOK_DESIGNER) if isinstance(raw, bytes): raw = xml_to_unicode(raw, replace_entities=True)[0] for pat, repl in nmassage: raw = pat.sub(repl, raw) soup = html5_parser(raw) if not self.baen and self.is_baen(soup): self.baen = True self.log.info('\tBaen file detected. Re-parsing...') return self.preprocess(raw) if self.book_designer: t = soup.find(id='BookTitle') if t: self.book.set_title(self.get_text(t)) a = soup.find(id='BookAuthor') if a: self.book.set_author(self.get_text(a)) if self.verbose: tdir = tempfile.gettempdir() if not os.path.exists(tdir): os.makedirs(tdir) try: with open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb') as f: f.write(str(soup).encode('utf-8')) self.log.info('Written preprocessed HTML to '+f.name) except Exception: pass return soup def add_file(self, path): self.css = HTMLConverter.CSS.copy() self.pseudo_css = self.override_pcss.copy() for selector in self.override_css: if selector in self.css: self.css[selector].update(self.override_css[selector]) else: self.css[selector] = self.override_css[selector] self.file_name = os.path.basename(path) self.log.info('Processing %s' % (path if self.verbose else self.file_name)) if not os.path.exists(path): # convertlit replaces & with %26 in file names path = path.replace('&', '%26') with open(path, 'rb') as f: raw = f.read() # Bug in pdftohtml that causes it to output invalid UTF-8 files if self.pdftohtml: raw = raw.decode('utf-8', 'ignore') elif self.encoding is not None: raw = raw.decode(self.encoding, 'ignore') else: raw = xml_to_unicode(raw, self.verbose)[0] soup = self.preprocess(raw) self.log.info('\tConverting to BBeB...') self.current_style = {} self.page_break_found = False if not isinstance(path, str): path = path.decode(sys.getfilesystemencoding()) self.target_prefix = path self.previous_text = '\n' self.tops[path] = self.parse_file(soup) self.processed_files.append(path) def parse_css(self, style): """ Parse the contents of a