""" Code to convert HTML ebooks into LRF ebooks. I am indebted to esperanc for the initial CSS->Xylog Style conversion code and to Falstaff for pylrs. License: GPLv3 Copyright: 2008, Kovid Goyal """ import copy import glob import os import re import sys import tempfile import urllib.parse from collections import deque from functools import partial from itertools import chain from math import ceil, floor import bs4 from ebook_converter import __appname__, entity_to_unicode, fit_image, \ force_unicode from ebook_converter.constants_old import filesystem_encoding, \ preferred_encoding from ebook_converter.devices.interface import DevicePlugin as Device from ebook_converter.ebooks import ConversionError from ebook_converter.ebooks.BeautifulSoup import html5_parser from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.lrf import Book from ebook_converter.ebooks.lrf.html.color_map import lrs_color from ebook_converter.ebooks.lrf.html.table import Table from ebook_converter.ebooks.lrf.pylrs.pylrs import ( CR, BlockSpace, BookSetting, Canvas, CharButton, DropCaps, EmpLine, Image, ImageBlock, ImageStream, Italic, JumpButton, LrsError, Paragraph, Plot, RuledLine, Span, Sub, Sup, TextBlock ) from ebook_converter.ptempfile import PersistentTemporaryFile from ebook_converter.polyglot.urllib import unquote from PIL import Image as PILImage def update_css(ncss, ocss): for key in ncss.keys(): if key in ocss: ocss[key].update(ncss[key]) else: ocss[key] = ncss[key] def munge_paths(basepath, url): purl = urllib.parse.urlparse(unquote(url),) path, fragment = purl[2], purl[5] if path: path = path.replace('/', os.sep) if not path: path = basepath elif not os.path.isabs(path): dn = os.path.dirname(basepath) path = os.path.join(dn, path) return os.path.normpath(path), fragment def strip_style_comments(match): src = match.group() while True: lindex = src.find('/*') if lindex < 0: break rindex = src.find('*/', lindex) if rindex < 0: src = src[:lindex] break src = src[:lindex] + src[rindex+2:] return src def tag_regex(tagname): '''Return non-grouping regular expressions that match the opening and closing tags for tagname''' return dict(open=r'(?:<\s*%(t)s\s+[^<>]*?>|<\s*%(t)s\s*>)'%dict(t=tagname), close=r''%dict(t=tagname)) class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) IGNORED_TAGS = (bs4.Comment, bs4.Declaration, bs4.ProcessingInstruction) MARKUP_MASSAGE = [ # Close tags (re.compile(r']*)?/>', re.IGNORECASE), lambda match: ''), # Strip comments from )', re.IGNORECASE|re.DOTALL), strip_style_comments), # Remove self closing script tags as they also mess up BeautifulSoup (re.compile(r'(?i)]+?/>'), lambda match: ''), # BeautifulSoup treats self closing
tags as open
tags (re.compile(r'(?i)<\s*div([^>]*)/\s*>'), lambda match: '
'%match.group(1)) ] # Fix Baen markup BAEN = [ (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE), lambda match: match.group(1)), (re.compile(r'

\s*(\s*)\s*

', re.IGNORECASE), lambda match: match.group(1)), (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*', re.IGNORECASE), lambda match: ''), ] # Fix pdftohtml markup PDFTOHTML = [ # Remove
tags (re.compile(r'', re.IGNORECASE), lambda match: '
'), # Remove page numbers (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''), # Remove
and replace

with

(re.compile(r'\s*', re.IGNORECASE), lambda match: '

'), (re.compile(r'(.*)', re.IGNORECASE), lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40 else match.group(1)), # Remove hyphenation (re.compile(r'-\n\r?'), lambda match: ''), ] # Fix Book Designer markup BOOK_DESIGNER = [ # HR (re.compile('


', re.IGNORECASE), lambda match : ' '), # Create header tags (re.compile(r'<]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), (re.compile(r'<]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?', re.IGNORECASE), lambda match : '

%s

'%(match.group(2) if match.group(2) else 'center', match.group(3))), (re.compile(r'<]*?id=title[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

%s

'%(match.group(1),)), (re.compile(r'<]*?id=subtitle[^><]*?>(.*?)', re.IGNORECASE|re.DOTALL), lambda match : '

%s

'%(match.group(1),)), # Blank lines (re.compile(r'<]*?>( ){4}
', re.IGNORECASE), lambda match : '

'), ] def __hasattr__(self, attr): if hasattr(self.options, attr): return True return object.__hasattr__(self, attr) def __getattr__(self, attr): if hasattr(self.options, attr): return getattr(self.options, attr) return object.__getattribute__(self, attr) def __setattr__(self, attr, val): if hasattr(self.options, attr): setattr(self.options, attr, val) else: object.__setattr__(self, attr, val) CSS = { 'h1' : {"font-size" : "xx-large", "font-weight":"bold", 'text-indent':'0pt'}, 'h2' : {"font-size" : "x-large", "font-weight":"bold", 'text-indent':'0pt'}, 'h3' : {"font-size" : "large", "font-weight":"bold", 'text-indent':'0pt'}, 'h4' : {"font-size" : "large", 'text-indent':'0pt'}, 'h5' : {"font-weight" : "bold", 'text-indent':'0pt'}, 'b' : {"font-weight" : "bold"}, 'strong' : {"font-weight" : "bold"}, 'i' : {"font-style" : "italic"}, 'cite' : {'font-style' : 'italic'}, 'em' : {"font-style" : "italic"}, 'small' : {'font-size' : 'small'}, 'pre' : {'font-family' : 'monospace', 'white-space': 'pre'}, 'code' : {'font-family' : 'monospace'}, 'tt' : {'font-family' : 'monospace'}, 'center' : {'text-align' : 'center'}, 'th' : {'font-size' : 'large', 'font-weight':'bold'}, 'big' : {'font-size' : 'large', 'font-weight':'bold'}, '.libprs500_dropcaps' : {'font-size': 'xx-large'}, 'u' : {'text-decoration': 'underline'}, 'sup' : {'vertical-align': 'super', 'font-size': '60%'}, 'sub' : {'vertical-align': 'sub', 'font-size': '60%'}, } def __init__(self, book, fonts, options, logger, paths): ''' Convert HTML files at C{paths} and add to C{book}. After creating the object, you must call L{self.writeto} to output the LRF/S file. @param book: The LRF book @type book: L{lrf.pylrs.Book} @param fonts: dict specifying the font families to use ''' # Defaults for various formatting tags object.__setattr__(self, 'options', options) self.log = logger self.fonts = fonts # : dict specifying font families to use # Memory self.scaled_images = {} #: Temporary files with scaled version of images self.rotated_images = {} #: Temporary files with rotated version of images self.text_styles = [] #: Keep track of already used textstyles self.block_styles = [] #: Keep track of already used blockstyles self.images = {} #: Images referenced in the HTML document self.targets = {} #: and id elements self.links = deque() # : elements self.processed_files = [] self.extra_toc_entries = [] # : TOC entries gleaned from semantic information self.image_memory = [] self.id_counter = 0 self.unused_target_blocks = [] # : Used to remove extra TextBlocks self.link_level = 0 #: Current link level self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported self.tops = {} #: element representing the top of each HTML file in the LRF file self.previous_text = '' # : Used to figure out when to lstrip self.stripped_space = '' self.preserve_block_style = False # : Used so that

tags in

elements are handled properly self.avoid_page_break = False self.current_page = book.create_page() # Styles self.blockquote_style = book.create_block_style(sidemargin=60, topskip=20, footskip=20) self.unindented_style = book.create_text_style(parindent=0) self.in_table = False # List processing self.list_level = 0 self.list_indent = 20 self.list_counter = 1 self.book = book #: The Book object representing a BBeB book self.override_css = {} self.override_pcss = {} if self._override_css is not None: if os.access(self._override_css, os.R_OK): with open(self._override_css, 'rb') as f: src = f.read() else: src = self._override_css if isinstance(src, bytes): src = src.decode('utf-8', 'replace') match = self.PAGE_BREAK_PAT.search(src) if match and not re.match('avoid', match.group(1), re.IGNORECASE): self.page_break_found = True ncss, npcss = self.parse_css(src) if ncss: update_css(ncss, self.override_css) if npcss: update_css(npcss, self.override_pcss) paths = [os.path.abspath(path) for path in paths] paths = [path.decode(sys.getfilesystemencoding()) if not isinstance(path, str) else path for path in paths] while len(paths) > 0 and self.link_level <= self.link_levels: for path in paths: if path in self.processed_files: continue try: self.add_file(path) except KeyboardInterrupt: raise except: if self.link_level == 0: # Die on errors in the first level raise for link in self.links: if link['path'] == path: self.links.remove(link) break self.log.warn('Could not process '+path) if self.verbose: self.log.exception(' ') self.links = self.process_links() self.link_level += 1 paths = [link['path'] for link in self.links] if self.current_page is not None and self.current_page.has_text(): self.book.append(self.current_page) for text, tb in self.extra_toc_entries: self.book.addTocEntry(text, tb) if self.base_font_size > 0: self.log.info('\tRationalizing font sizes...') self.book.rationalize_font_sizes(self.base_font_size) def is_baen(self, soup): return bool(soup.find('meta', attrs={'name':'Publisher', 'content':re.compile('Baen', re.IGNORECASE)})) def is_book_designer(self, raw): return bool(re.search('<]*id=BookTitle', raw)) def preprocess(self, raw): nmassage = [] nmassage.extend(HTMLConverter.MARKUP_MASSAGE) if not self.book_designer and self.is_book_designer(raw): self.book_designer = True self.log.info('\tBook Designer file detected.') self.log.info('\tParsing HTML...') if self.baen: nmassage.extend(HTMLConverter.BAEN) if self.pdftohtml: nmassage.extend(HTMLConverter.PDFTOHTML) if self.book_designer: nmassage.extend(HTMLConverter.BOOK_DESIGNER) if isinstance(raw, bytes): raw = xml_to_unicode(raw, replace_entities=True)[0] for pat, repl in nmassage: raw = pat.sub(repl, raw) soup = html5_parser(raw) if not self.baen and self.is_baen(soup): self.baen = True self.log.info('\tBaen file detected. Re-parsing...') return self.preprocess(raw) if self.book_designer: t = soup.find(id='BookTitle') if t: self.book.set_title(self.get_text(t)) a = soup.find(id='BookAuthor') if a: self.book.set_author(self.get_text(a)) if self.verbose: tdir = tempfile.gettempdir() if not os.path.exists(tdir): os.makedirs(tdir) try: with open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb') as f: f.write(str(soup).encode('utf-8')) self.log.info('Written preprocessed HTML to '+f.name) except: pass return soup def add_file(self, path): self.css = HTMLConverter.CSS.copy() self.pseudo_css = self.override_pcss.copy() for selector in self.override_css: if selector in self.css: self.css[selector].update(self.override_css[selector]) else: self.css[selector] = self.override_css[selector] self.file_name = os.path.basename(path) self.log.info('Processing %s' % (path if self.verbose else self.file_name)) if not os.path.exists(path): path = path.replace('&', '%26') # convertlit replaces & with %26 in file names with open(path, 'rb') as f: raw = f.read() if self.pdftohtml: # Bug in pdftohtml that causes it to output invalid UTF-8 files raw = raw.decode('utf-8', 'ignore') elif self.encoding is not None: raw = raw.decode(self.encoding, 'ignore') else: raw = xml_to_unicode(raw, self.verbose)[0] soup = self.preprocess(raw) self.log.info('\tConverting to BBeB...') self.current_style = {} self.page_break_found = False if not isinstance(path, str): path = path.decode(sys.getfilesystemencoding()) self.target_prefix = path self.previous_text = '\n' self.tops[path] = self.parse_file(soup) self.processed_files.append(path) def parse_css(self, style): """ Parse the contents of a