mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-18 15:35:48 +01:00
Initial import
This commit is contained in:
42
ebook_converter/utils/html2text.py
Normal file
42
ebook_converter/utils/html2text.py
Normal file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
# License: GPLv3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
def html2text(html):
    """Convert an HTML fragment or document to Markdown-flavoured text.

    :param html: The markup to convert. ``bytes`` input is first decoded
        with ``xml_to_unicode`` (encoding declarations stripped, entities
        resolved); text input is used as given.
    :return: Whatever ``HTML2Text.handle()`` produces for the markup.

    The converter is configured with no body width limit
    (``body_width = 0``), single line breaks between blocks and ``*`` as
    the emphasis mark.
    """
    # Imports are kept local so the dependencies are only loaded on use.
    from html2text import HTML2Text
    import re

    if isinstance(html, bytes):
        from calibre.ebooks.chardet import xml_to_unicode
        decoded = xml_to_unicode(
            html, strip_encoding_pats=True, resolve_entities=True)
        html = decoded[0]

    # html2text renders <u> as emphasis, so opening and closing <u> tags
    # (either case, attributes preserved) are rewritten as <span> first.
    underline_tag = re.compile(
        r'<\s*(?P<solidus>/?)\s*[uU]\b(?P<rest>[^>]*)>')
    html = underline_tag.sub(r'<\g<solidus>span\g<rest>>', html)

    converter = HTML2Text()
    # NOTE(review): `_` is not defined in this block; presumably the
    # translation function installed globally by the application.
    converter.default_image_alt = _('Unnamed image')
    converter.body_width = 0
    converter.single_line_break = True
    converter.emphasis_mark = '*'
    return converter.handle(html)
|
||||
|
||||
|
||||
def find_tests():
    """Build and return the unittest suite for :func:`html2text`.

    The test case class is defined inside this function so that
    ``unittest`` is only imported when the tests are actually requested.

    :return: A test suite loaded from the ``TestH2T`` test case.
    """
    import unittest

    # (source markup, expected converted text) pairs.
    cases = (
        ('<u>test</U>', 'test\n'),
        ('<i>test</i>', '*test*\n'),
        ('<a href="http://else.where/other">other</a>',
         '[other](http://else.where/other)\n'),
        ('<img src="test.jpeg">', '\n'),
        ('<a href="#t">test</a> <span id="t">dest</span>', 'test dest\n'),
        ('<>a', '<>a\n'),
        ('<p>a<p>b', 'a\nb\n'),
    )

    class TestH2T(unittest.TestCase):

        def test_html2text_behavior(self):
            # Every sample must convert to exactly the expected text.
            for markup, wanted in cases:
                self.assertEqual(html2text(markup), wanted)

    loader = unittest.defaultTestLoader
    return loader.loadTestsFromTestCase(TestH2T)
|
||||
Reference in New Issue
Block a user