mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-27 21:05:51 +01:00
37 lines
1.1 KiB
Python
37 lines
1.1 KiB
Python
# License: GPLv3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
|
|
import bs4
|
|
from bs4 import ( # noqa
|
|
CData, Comment, Declaration, NavigableString, ProcessingInstruction,
|
|
SoupStrainer, Tag, __version__
|
|
)
|
|
|
|
from ebook_converter.polyglot.builtins import unicode_type
|
|
|
|
|
|
def parse_html(markup):
    """Normalise *markup* to clean text and parse it with html5_parser.

    Text input (``unicode_type``) has its encoding declarations stripped and
    its entities substituted.  Any other input is handed to
    ``xml_to_unicode`` with ``strip_encoding_pats=True`` and
    ``resolve_entities=True``; the first element of its result is used.
    The text is then passed through ``clean_xml_chars`` before parsing.

    :param markup: The HTML to parse, either as text or as an undecoded
        object accepted by ``xml_to_unicode`` (presumably bytes -- confirm
        against that helper).
    :return: Whatever ``html5_parser.soup.parse`` returns when called with
        ``return_root=False`` (presumably the soup object rather than the
        root tag -- confirm against the html5_parser documentation).
    """
    # Imports are kept function-local, as in the original, so that merely
    # importing this module does not pull in these dependencies.
    from ebook_converter.ebooks.chardet import (
        strip_encoding_declarations, xml_to_unicode, substitute_entites
    )
    from ebook_converter.utils.cleantext import clean_xml_chars

    if isinstance(markup, unicode_type):
        markup = strip_encoding_declarations(markup)
        markup = substitute_entites(markup)
    else:
        markup = xml_to_unicode(
            markup, strip_encoding_pats=True, resolve_entities=True)[0]
    markup = clean_xml_chars(markup)

    from html5_parser.soup import parse
    return parse(markup, return_root=False)
|
|
|
|
|
|
def prettify(soup):
    """Return ``soup.prettify()`` as text.

    :param soup: Any object exposing a ``prettify()`` method.
    :return: The prettified markup; a ``bytes`` result is decoded as UTF-8,
        anything else is returned unchanged.
    """
    ans = soup.prettify()
    if isinstance(ans, bytes):
        # Callers always get text, whatever the underlying prettify() yields.
        ans = ans.decode('utf-8')
    return ans
|
|
|
|
|
|
def BeautifulSoup(markup='', *a, **kw):
    """Parse *markup* via :func:`parse_html`.

    Extra positional and keyword arguments are accepted but ignored; only
    *markup* is forwarded.

    :param markup: The HTML to parse; defaults to the empty string.
    :return: The result of ``parse_html(markup)``.
    """
    return parse_html(markup)
|
|
|
|
|
|
def BeautifulStoneSoup(markup='', *a, **kw):
    """Parse *markup* with ``bs4.BeautifulSoup`` using the ``'xml'`` feature.

    Extra positional and keyword arguments are accepted but ignored; only
    *markup* is forwarded.

    :param markup: The XML to parse; defaults to the empty string.
    :return: The object built by ``bs4.BeautifulSoup(markup, 'xml')``.
    """
    return bs4.BeautifulSoup(markup, 'xml')
|