diff --git a/ebook_converter/ebooks/lrf/html/convert_from.py b/ebook_converter/ebooks/lrf/html/convert_from.py
index 20d0d93..dacd773 100644
--- a/ebook_converter/ebooks/lrf/html/convert_from.py
+++ b/ebook_converter/ebooks/lrf/html/convert_from.py
@@ -13,10 +13,10 @@ import re
import sys
import tempfile
import urllib.parse
-from collections import deque
-from functools import partial
-from itertools import chain
-from math import ceil, floor
+import collections
+import functools
+import itertools
+import math
import bs4
@@ -37,32 +37,10 @@ from ebook_converter.ebooks.lrf.pylrs.pylrs import (
RuledLine, Span, Sub, Sup, TextBlock
)
from ebook_converter.ptempfile import PersistentTemporaryFile
-from ebook_converter.polyglot.urllib import unquote
from PIL import Image as PILImage
-def update_css(ncss, ocss):
- for key in ncss.keys():
- if key in ocss:
- ocss[key].update(ncss[key])
- else:
- ocss[key] = ncss[key]
-
-
-def munge_paths(basepath, url):
- purl = urllib.parse.urlparse(unquote(url),)
- path, fragment = purl[2], purl[5]
- if path:
- path = path.replace('/', os.sep)
- if not path:
- path = basepath
- elif not os.path.isabs(path):
- dn = os.path.dirname(basepath)
- path = os.path.join(dn, path)
- return os.path.normpath(path), fragment
-
-
def strip_style_comments(match):
src = match.group()
while True:
@@ -77,94 +55,135 @@ def strip_style_comments(match):
return src
+SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+"  # group 1: selector text
+ r"[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")  # group 2: text between { and }
+PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)',
+ re.IGNORECASE)  # group 1: the value word after the colon
+IGNORED_TAGS = (bs4.Comment, bs4.Declaration, bs4.ProcessingInstruction)  # bs4 node classes; presumably skipped when walking the tree - confirm at use site
+
+MARKUP_MASSAGE = [ # Close tags
+ (re.compile(r']*)?/>', re.IGNORECASE),
+ lambda match: ''),
+
+ # Strip comments from )',
+ re.IGNORECASE | re.DOTALL),
+ strip_style_comments),
+
+ # Remove self closing script tags as they also mess up
+ # BeautifulSoup
+ (re.compile(r'(?i)