diff --git a/rtv/terminal.py b/rtv/terminal.py index 746ee6d..6fb0fae 100644 --- a/rtv/terminal.py +++ b/rtv/terminal.py @@ -19,6 +19,13 @@ from kitchen.text.display import textual_width_chop from . import exceptions from .objects import LoadScreen, Color +try: + # Added in python 3.4+ + from html import unescape +except ImportError: + from six.moves import html_parser + unescape = html_parser.HTMLParser().unescape + class Terminal(object): @@ -173,11 +180,22 @@ class Terminal(object): curses will treat each code point as one character and will not account for wide characters. If utf-8 is passed in, addnstr will treat each 'byte' as a single character. + + Reddit's api sometimes chokes and double-encodes some html characters + Praw handles the initial decoding, but we need to do a second pass + just to make sure. See https://github.com/michael-lazar/rtv/issues/96 + + Example: + &amp;amp; -> returned directly from reddit's api + &amp; -> returned after PRAW decodes the html characters + & -> returned after our second pass, this is the true value """ if n_cols is not None and n_cols <= 0: return '' + string = unescape(string) + if self.ascii: if isinstance(string, six.binary_type): string = string.decode('utf-8') diff --git a/tests/test_terminal.py b/tests/test_terminal.py index 52f81af..8b5acb5 100644 --- a/tests/test_terminal.py +++ b/tests/test_terminal.py @@ -145,6 +145,16 @@ def test_terminal_clean_ncols(terminal): assert text.decode('utf-8') == 'hell' +@pytest.mark.parametrize('ascii', [True, False]) +def test_terminal_clean_unescape_html(terminal, ascii): + + # HTML characters get decoded + terminal.ascii = ascii + text = terminal.clean('&lt;') + assert isinstance(text, six.binary_type) + assert text.decode('ascii' if ascii else 'utf-8') == '<' + + @pytest.mark.parametrize('ascii', [True, False]) def test_terminal_add_line(terminal, stdscr, ascii):