"""
Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs.
License: GPLv3 Copyright: 2008, Kovid Goyal
"""
import copy
import glob
import os
import re
import sys
import tempfile
import urllib.parse
import collections
import functools
import itertools
import math
import bs4
from ebook_converter import __appname__, entity_to_unicode, fit_image, \
force_unicode
from ebook_converter.constants_old import filesystem_encoding, \
preferred_encoding
from ebook_converter.devices.interface import DevicePlugin as Device
from ebook_converter.ebooks import ConversionError
from ebook_converter.ebooks.BeautifulSoup import html5_parser
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ebooks.lrf import Book
from ebook_converter.ebooks.lrf.html.color_map import lrs_color
from ebook_converter.ebooks.lrf.html.table import Table
from ebook_converter.ebooks.lrf.pylrs.pylrs import (
CR, BlockSpace, BookSetting, Canvas, CharButton, DropCaps, EmpLine, Image,
ImageBlock, ImageStream, Italic, JumpButton, LrsError, Paragraph, Plot,
RuledLine, Span, Sub, Sup, TextBlock
)
from ebook_converter.ptempfile import PersistentTemporaryFile
from PIL import Image as PILImage
def strip_style_comments(match):
    """Return the text of ``match`` with ``/* ... */`` comments removed.

    ``match`` is a regex match object; the whole matched text
    (``match.group()``) is processed, which makes this function usable as
    the replacement callable of a substitution.

    Comments are removed one at a time, always taking the first ``/*`` in
    the current text and the first ``*/`` found at or after it. The search
    restarts from the beginning after every removal, so a ``/*`` that only
    comes into existence when the text around a removed comment is joined
    is treated as a comment opener as well. If an opener has no matching
    ``*/``, everything from that opener to the end of the text is dropped.
    """
    text = match.group()
    opener = text.find('/*')
    while opener >= 0:
        closer = text.find('*/', opener)
        if closer < 0:
            # Unterminated comment: discard the rest of the text.
            return text[:opener]
        # Cut out the comment, including the two-character terminator.
        text = text[:opener] + text[closer + 2:]
        opener = text.find('/*')
    return text
# Matches one CSS-style rule of the form ``selector(s) { declarations }``.
# Group 1 is the selector text: it must start with a letter, digit, '-', '_',
# ':' or '.', and may continue with those characters plus whitespace and
# commas (so a comma separated selector list is captured as a single string,
# possibly with trailing whitespace). Group 2 is everything between the
# braces; the body may not itself contain a '}'.
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+"
r"[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
# Matches a ``page-break-after`` or ``page-break-before`` declaration, case
# insensitively. Group 1 is the word given as the property's value.
PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)',
re.IGNORECASE)
# bs4 node classes for comments, declarations and processing instructions.
# NOTE(review): presumably used in isinstance() checks to skip such nodes
# while walking the parse tree -- the usage is not visible in this part of
# the file, confirm against the callers.
IGNORED_TAGS = (bs4.Comment, bs4.Declaration, bs4.ProcessingInstruction)
MARKUP_MASSAGE = [ # Close tags
(re.compile(r']*)?/>', re.IGNORECASE),
lambda match: ''),
# Strip comments from )',
re.IGNORECASE | re.DOTALL),
strip_style_comments),
# Remove self closing script tags as they also mess up
# BeautifulSoup
(re.compile(r'(?i)