"""
Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs.
License: GPLv3 Copyright: 2008, Kovid Goyal
"""
import copy
import glob
import os
import re
import sys
import tempfile
import urllib.parse
from collections import deque
from functools import partial
from itertools import chain
from math import ceil, floor
import bs4
from ebook_converter import __appname__, entity_to_unicode, fit_image, \
force_unicode
from ebook_converter.constants_old import filesystem_encoding, \
preferred_encoding
from ebook_converter.devices.interface import DevicePlugin as Device
from ebook_converter.ebooks import ConversionError
from ebook_converter.ebooks.BeautifulSoup import html5_parser
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ebooks.lrf import Book
from ebook_converter.ebooks.lrf.html.color_map import lrs_color
from ebook_converter.ebooks.lrf.html.table import Table
from ebook_converter.ebooks.lrf.pylrs.pylrs import (
CR, BlockSpace, BookSetting, Canvas, CharButton, DropCaps, EmpLine, Image,
ImageBlock, ImageStream, Italic, JumpButton, LrsError, Paragraph, Plot,
RuledLine, Span, Sub, Sup, TextBlock
)
from ebook_converter.ptempfile import PersistentTemporaryFile
from ebook_converter.polyglot.urllib import unquote
from PIL import Image as PILImage
def update_css(ncss, ocss):
    """Merge the style-rule mapping *ncss* into *ocss* in place.

    Both arguments map selector keys to dicts of CSS properties. For keys
    present in both, the property dicts are merged with *ncss* winning on
    conflicts. Keys only in *ncss* are inserted into *ocss*; note the
    inserted value is the same dict object (aliased), matching the
    original behavior.
    """
    for key, props in ncss.items():
        if key in ocss:
            ocss[key].update(props)
        else:
            # Intentionally no copy: new selectors alias the ncss dicts.
            ocss[key] = props
def munge_paths(basepath, url):
    """Resolve *url* against *basepath* into a filesystem path.

    The URL is unquoted and parsed; its path component (with ``/``
    translated to :data:`os.sep`) is resolved relative to the directory
    of *basepath* unless it is absolute. An empty path means *basepath*
    itself. Returns ``(normalized_path, fragment)``.
    """
    parsed = urllib.parse.urlparse(unquote(url))
    path, fragment = parsed[2], parsed[5]
    if not path:
        # No path component: the URL points back at the base document.
        path = basepath
    else:
        path = path.replace('/', os.sep)
        if not os.path.isabs(path):
            path = os.path.join(os.path.dirname(basepath), path)
    return os.path.normpath(path), fragment
def strip_style_comments(match):
    """Return the matched text with CSS block comments (``/* ... */``)
    removed.

    Intended as a ``re.sub`` replacement callback. An unterminated
    comment causes everything from its opener onward to be dropped.
    """
    text = match.group()
    while True:
        start = text.find('/*')
        if start < 0:
            return text
        # Search for the terminator from the opener itself (not opener+2),
        # preserving the original overlap behavior of str.find.
        end = text.find('*/', start)
        if end < 0:
            # Unterminated comment: truncate at the opener.
            return text[:start]
        text = text[:start] + text[end + 2:]
def tag_regex(tagname):
    """Return non-grouping regular expressions that match the opening and
    closing tags for *tagname*.

    Returns a dict with keys ``open`` and ``close``. The patterns allow
    whitespace after ``<`` / ``</`` and, for the open tag, an optional
    attribute section.
    """
    # Bug fix: the close pattern was missing the '</' prefix, so it matched
    # any whitespace + tagname + '>' (e.g. inside attribute text) instead of
    # an actual closing tag.
    return dict(open=r'(?:<\s*%(t)s\s+[^<>]*?>|<\s*%(t)s\s*>)'%dict(t=tagname),
                close=r'</\s*%(t)s\s*>'%dict(t=tagname))
class HTMLConverter(object):
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
IGNORED_TAGS = (bs4.Comment, bs4.Declaration, bs4.ProcessingInstruction)
MARKUP_MASSAGE = [
# Close tags
(re.compile(r']*)?/>', re.IGNORECASE),
lambda match: ''),
# Strip comments from )', re.IGNORECASE|re.DOTALL),
strip_style_comments),
# Remove self closing script tags as they also mess up BeautifulSoup
(re.compile(r'(?i)