import re

import bs4

from ebook_converter import prepare_string_for_xml
from ebook_converter.constants import preferred_encoding
from ebook_converter.ebooks.BeautifulSoup import html5_parser
from ebook_converter.utils.html2text import html2text

# Hackish - ignoring sentences ending or beginning in numbers to avoid
# confusion with decimal points.
lost_cr_pat = re.compile('([a-z])([\\.\\?!])([A-Z])')
lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])')
# Markup that we do not want to pass through verbatim; its presence triggers
# a full sanitize via markdown round-trip in comments_to_html().
sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe',
                          re.IGNORECASE)


def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s

    'plain text' returns as
    <p>plain text</p>

    'plain text with minimal markup' returns as
    <p>plain text with minimal markup</p>

    '<p>pre-formatted text</p>' returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()
    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, str):
        # bytes input: decode with the platform's preferred encoding,
        # replacing undecodable bytes rather than raising.
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        # Pure plain text: escape for XML, split paragraphs on blank lines
        # and convert single newlines to <br />.
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>' % x.replace(u'\n', u'<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        # Unwanted markup present: round-trip through text/markdown to strip
        # it. Best effort - on any failure fall back to an empty paragraph.
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n. First protect abbreviations such as
    # "Ph.D"/"Mr." so their periods are not treated as sentence breaks.
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'</p><p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    # Re-parse and normalize: wrap runs of bare text/inline tags into <p>
    # elements, leave block-level tags alone.
    soup = html5_parser('<div>' + comments + '</div>').find('div')
    result = html5_parser('<div></div>')
    container = result.find('div')
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    inline_tags = ('br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a', 'hr')
    for token in all_tokens:
        if isinstance(token, (bs4.CData, bs4.Comment, bs4.Declaration,
                              bs4.ProcessingInstruction)):
            # Drop non-content nodes entirely.
            continue
        if isinstance(token, bs4.NavigableString):
            # Bare text: accumulate into the currently open <p>.
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        elif token.name in inline_tags:
            # Inline markup also goes into the currently open <p>.
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            # Block-level element: close any open <p>, then append the
            # element itself unchanged.
            if open_pTag:
                container.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            container.insert(rtc, token)
            rtc += 1

    if open_pTag:
        container.insert(rtc, pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    return container.decode_contents()


def markdown(val):
    '''Convert markdown text *val* to HTML, caching the Markdown converter
    instance as an attribute on this function after first use.'''
    try:
        md = markdown.Markdown
    except AttributeError:
        from ebook_converter.ebooks.markdown import Markdown
        md = markdown.Markdown = Markdown()
    return md.convert(val)


def merge_comments(one, two):
    '''Return the HTML-normalized concatenation of two comment blobs.'''
    return comments_to_html(one) + '\n\n' + comments_to_html(two)


def sanitize_comments_html(html):
    '''Strip unwanted markup from *html* by converting it to plain text and
    re-rendering that text through markdown.'''
    from ebook_converter.ebooks.markdown import Markdown
    text = html2text(html)
    md = Markdown()
    html = md.convert(text)
    return html


def find_tests():
    '''Return a unittest suite exercising comments_to_html().'''
    import unittest

    class Test(unittest.TestCase):

        def test_comments_to_html(self):
            for pat, val in [
                (b'lineone\n\nlinetwo',
                 '<p class="description">lineone</p>\n'
                 '<p class="description">linetwo</p>'),
                ('a <b>b&c</b>\nf',
                 '<p class="description">a <b>b&amp;c</b><br/>f</p>'),
                ('a <?xml asd> b\n\ncd',
                 '<p class="description">a  b</p>\n'
                 '<p class="description">cd</p>'),
            ]:
                cval = comments_to_html(pat)
                self.assertEqual(cval, val)

    return unittest.defaultTestLoader.loadTestsFromTestCase(Test)