""" Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. License: GPLv3 Copyright: 2008, Kovid Goyal """ import copy import glob import os import re import sys import tempfile import urllib.parse import collections import functools import itertools import math import bs4 from PIL import Image as PILImage from ebook_converter.constants_old import __appname__, filesystem_encoding, \ preferred_encoding from ebook_converter.devices.interface import DevicePlugin as Device from ebook_converter.ebooks import ConversionError from ebook_converter.ebooks.BeautifulSoup import html5_parser from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.lrf import Book from ebook_converter.ebooks.lrf.html.color_map import lrs_color from ebook_converter.ebooks.lrf.html.table import Table from ebook_converter.ebooks.lrf.pylrs.pylrs import ( CR, BlockSpace, BookSetting, Canvas, CharButton, DropCaps, EmpLine, Image, ImageBlock, ImageStream, Italic, JumpButton, LrsError, Paragraph, Plot, RuledLine, Span, Sub, Sup, TextBlock ) from ebook_converter.ptempfile import PersistentTemporaryFile from ebook_converter.utils import encoding as uenc from ebook_converter.utils import img as uimg from ebook_converter.utils import entities def strip_style_comments(match): src = match.group() while True: lindex = src.find('/*') if lindex < 0: break rindex = src.find('*/', lindex) if rindex < 0: src = src[:lindex] break src = src[:lindex] + src[rindex+2:] return src SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+" r"[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (bs4.Comment, bs4.Declaration, bs4.ProcessingInstruction) MARKUP_MASSAGE = [ # Close tags (re.compile(r']*)?/>', re.IGNORECASE), lambda match: ''), # Strip comments from )', re.IGNORECASE | re.DOTALL), strip_style_comments), # 
# NOTE(review): this region looks like the remainder of the MARKUP_MASSAGE
# table followed by the BAEN, PDFTOHTML and BOOK_DESIGNER tables, but several
# pattern/replacement literals appear to have lost text: the BookTitle and
# BookAuthor replacements read match.group(3) while their visible patterns
# define only two groups, and some replacement strings apply `%` without any
# placeholder. Presumably markup inside the literals was stripped - restore
# from a pristine copy before relying on these tables; TODO confirm.
Remove self closing script tags as they also mess up # BeautifulSoup (re.compile(r'(?i)]+?/>'), lambda match: ''), # BeautifulSoup treats self closing

tags as open

# tags (re.compile(r'(?i)<\s*div([^>]*)/\s*>'), lambda match: '

' % match.group(1))] # Fix Baen markup BAEN = [(re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), lambda match: match.group(1)), (re.compile(r'

\s*(\s*)\s*

', re.IGNORECASE), lambda match: match.group(1)), (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), lambda match: '')] # Fix pdftohtml markup PDFTOHTML = [(re.compile(r'', re.IGNORECASE), lambda match: '
'), # Remove page numbers (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''), # Remove
and replace

with

(re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), (re.compile(r'(.*)', re.IGNORECASE), lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 else match.group(1)), # Remove hyphenation (re.compile(r'-\n\r?'), lambda match: '')] # Fix Book Designer markup BOOK_DESIGNER = [(re.compile('

', re.IGNORECASE), lambda match: '' ' '), (re.compile(r'<]*?id=BookTitle[^><]*?(align=)*(?(1)' r'(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match: '

%s

' % (match.group(2) if match.group(2) else 'center', match.group(3))), (re.compile(r'<]*?id=BookAuthor[^><]*?(align=)*(?(1)' r'(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match: '

%s

' % (match.group(2) if match.group(2) else 'center', match.group(3))), (re.compile(r'<]*?id=title[^><]*?>(.*?)', re.IGNORECASE | re.DOTALL), lambda match: '

%s

' % match.group(1)), (re.compile(r'<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE | re.DOTALL), lambda match: '

%s

' % match.group(1)), (re.compile(r'<]*?>( ){4}

', re.IGNORECASE), lambda match: '

')] def update_css(ncss, ocss): for key in ncss.keys(): if key in ocss: ocss[key].update(ncss[key]) else: ocss[key] = ncss[key] def munge_paths(basepath, url): purl = urllib.parse.urlparse(urllib.parse.unquote(url),) path, fragment = purl[2], purl[5] if path: path = path.replace('/', os.sep) if not path: path = basepath elif not os.path.isabs(path): dn = os.path.dirname(basepath) path = os.path.join(dn, path) return os.path.normpath(path), fragment def tag_regex(tagname): """ Return non-grouping regular expressions that match the opening and closing tags for tagname """ return dict(open=r'(?:<\s*%(t)s\s+[^<>]*?>|<\s*%(t)s\s*>)' % dict(t=tagname), close=r'' % dict(t=tagname)) class HTMLConverter(object): def __hasattr__(self, attr): if hasattr(self.options, attr): return True return object.__hasattr__(self, attr) def __getattr__(self, attr): if hasattr(self.options, attr): return getattr(self.options, attr) return object.__getattribute__(self, attr) def __setattr__(self, attr, val): if hasattr(self.options, attr): setattr(self.options, attr, val) else: object.__setattr__(self, attr, val) CSS = {'h1': {"font-size": "xx-large", "font-weight": "bold", 'text-indent': '0pt'}, 'h2': {"font-size": "x-large", "font-weight": "bold", 'text-indent': '0pt'}, 'h3': {"font-size": "large", "font-weight": "bold", 'text-indent': '0pt'}, 'h4': {"font-size": "large", 'text-indent': '0pt'}, 'h5': {"font-weight": "bold", 'text-indent': '0pt'}, 'b': {"font-weight": "bold"}, 'strong': {"font-weight": "bold"}, 'i': {"font-style": "italic"}, 'cite': {'font-style': 'italic'}, 'em': {"font-style": "italic"}, 'small': {'font-size': 'small'}, 'pre': {'font-family': 'monospace', 'white-space': 'pre'}, 'code': {'font-family': 'monospace'}, 'tt': {'font-family': 'monospace'}, 'center': {'text-align': 'center'}, 'th': {'font-size': 'large', 'font-weight': 'bold'}, 'big': {'font-size': 'large', 'font-weight': 'bold'}, '.libprs500_dropcaps': {'font-size': 'xx-large'}, 'u': {'text-decoration': 
'underline'}, 'sup': {'vertical-align': 'super', 'font-size': '60%'}, 'sub': {'vertical-align': 'sub', 'font-size': '60%'}, } def __init__(self, book, fonts, options, logger, paths): """ Convert HTML files at C{paths} and add to C{book}. After creating the object, you must call L{self.writeto} to output the LRF/S file. @param book: The LRF book @type book: L{lrf.pylrs.Book} @param fonts: dict specifying the font families to use """ # Defaults for various formatting tags object.__setattr__(self, 'options', options) self.log = logger self.fonts = fonts # : dict specifying font families to use # Memory # Temporary files with scaled version of images self.scaled_images = {} # Temporary files with rotated version of images self.rotated_images = {} # Keep track of already used textstyles self.text_styles = [] # Keep track of already used blockstyles self.block_styles = [] # Images referenced in the HTML document self.images = {} # and id elements self.targets = {} # elements self.links = collections.deque() self.processed_files = [] # TOC entries gleaned from semantic information self.extra_toc_entries = [] self.image_memory = [] self.id_counter = 0 # Used to remove extra TextBlocks self.unused_target_blocks = [] # Current link level self.link_level = 0 # Used to ensure that duplicate CSS unhandled erros are not reported self.memory = [] # element representing the top of each HTML file in the LRF file self.tops = {} # Used to figure out when to lstrip self.previous_text = '' self.stripped_space = '' # Used so that

tags in

elements are handled properly self.preserve_block_style = False self.avoid_page_break = False self.current_page = book.create_page() # Styles self.blockquote_style = book.create_block_style(sidemargin=60, topskip=20, footskip=20) self.unindented_style = book.create_text_style(parindent=0) self.in_table = False # List processing self.list_level = 0 self.list_indent = 20 self.list_counter = 1 # The Book object representing a BBeB book self.book = book self.override_css = {} self.override_pcss = {} if self._override_css is not None: if os.access(self._override_css, os.R_OK): with open(self._override_css, 'rb') as f: src = f.read() else: src = self._override_css if isinstance(src, bytes): src = src.decode('utf-8', 'replace') match = PAGE_BREAK_PAT.search(src) if match and not re.match('avoid', match.group(1), re.IGNORECASE): self.page_break_found = True ncss, npcss = self.parse_css(src) if ncss: update_css(ncss, self.override_css) if npcss: update_css(npcss, self.override_pcss) paths = [os.path.abspath(path) for path in paths] paths = [path.decode(sys.getfilesystemencoding()) if not isinstance(path, str) else path for path in paths] while len(paths) > 0 and self.link_level <= self.link_levels: for path in paths: if path in self.processed_files: continue try: self.add_file(path) except KeyboardInterrupt: raise except Exception: # Die on errors in the first level if self.link_level == 0: raise for link in self.links: if link['path'] == path: self.links.remove(link) break self.log.warning('Could not process %s', path) if self.verbose: self.log.exception(' ') # WAT self.links = self.process_links() self.link_level += 1 paths = [link['path'] for link in self.links] if self.current_page is not None and self.current_page.has_text(): self.book.append(self.current_page) for text, tb in self.extra_toc_entries: self.book.addTocEntry(text, tb) if self.base_font_size > 0: self.log.info('\tRationalizing font sizes...') self.book.rationalize_font_sizes(self.base_font_size) def 
is_baen(self, soup): return bool(soup.find('meta', attrs={'name': 'Publisher', 'content': re.compile('Baen', re.IGNORECASE)})) def is_book_designer(self, raw): return bool(re.search('<]*id=BookTitle', raw)) def preprocess(self, raw): nmassage = MARKUP_MASSAGE[:] if not self.book_designer and self.is_book_designer(raw): self.book_designer = True self.log.info('\tBook Designer file detected.') self.log.info('\tParsing HTML...') if self.baen: nmassage.extend(BAEN) if self.pdftohtml: nmassage.extend(PDFTOHTML) if self.book_designer: nmassage.extend(BOOK_DESIGNER) if isinstance(raw, bytes): raw = xml_to_unicode(raw, replace_entities=True)[0] for pat, repl in nmassage: raw = pat.sub(repl, raw) soup = html5_parser(raw) if not self.baen and self.is_baen(soup): self.baen = True self.log.info('\tBaen file detected. Re-parsing...') return self.preprocess(raw) if self.book_designer: t = soup.find(id='BookTitle') if t: self.book.set_title(self.get_text(t)) a = soup.find(id='BookAuthor') if a: self.book.set_author(self.get_text(a)) if self.verbose: tdir = tempfile.gettempdir() if not os.path.exists(tdir): os.makedirs(tdir) try: with open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb') as f: f.write(str(soup).encode('utf-8')) self.log.info('Written preprocessed HTML to %s', f.name) except Exception: pass return soup def add_file(self, path): self.css = HTMLConverter.CSS.copy() self.pseudo_css = self.override_pcss.copy() for selector in self.override_css: if selector in self.css: self.css[selector].update(self.override_css[selector]) else: self.css[selector] = self.override_css[selector] self.file_name = os.path.basename(path) self.log.info('Processing %s', path if self.verbose else self.file_name) if not os.path.exists(path): # convertlit replaces & with %26 in file names path = path.replace('&', '%26') with open(path, 'rb') as f: raw = f.read() # Bug in pdftohtml that causes it to output invalid UTF-8 files if self.pdftohtml: raw = raw.decode('utf-8', 'ignore') elif 
self.encoding is not None: raw = raw.decode(self.encoding, 'ignore') else: raw = xml_to_unicode(raw, self.verbose)[0] soup = self.preprocess(raw) self.log.info('\tConverting to BBeB...') self.current_style = {} self.page_break_found = False if not isinstance(path, str): path = path.decode(sys.getfilesystemencoding()) self.target_prefix = path self.previous_text = '\n' self.tops[path] = self.parse_file(soup) self.processed_files.append(path) def parse_css(self, style): """ Parse the contents of a