Run a second pass on escaped html characters.

2015-12-09 01:23:22 -08:00
parent 52dfbe786c
commit 9de6056b04
2 changed files with 28 additions and 0 deletions
--- a/rtv/terminal.py
+++ b/rtv/terminal.py
@@ -19,6 +19,13 @@ from kitchen.text.display import textual_width_chop
 from . import exceptions
 from .objects import LoadScreen, Color

+try:
+    # html.unescape is available in Python 3.4+
+    from html import unescape
+except ImportError:
+    from six.moves import html_parser
+    unescape = html_parser.HTMLParser().unescape
+

 class Terminal(object):

@@ -173,11 +180,22 @@ class Terminal(object):
        curses will treat each code point as one character and will not account
        for wide characters. If utf-8 is passed in, addnstr will treat each
        'byte' as a single character.
+
+        Reddit's API sometimes chokes and double-encodes some HTML characters.
+        PRAW handles the initial decoding, but we need to do a second pass
+        just to make sure. See https://github.com/michael-lazar/rtv/issues/96
+
+        Example:
+            &amp;amp; -> returned directly from reddit's api
+            &amp;     -> returned after PRAW decodes the html characters
+            &         -> returned after our second pass, this is the true value
        """

        if n_cols is not None and n_cols <= 0:
            return ''

+        string = unescape(string)
+
        if self.ascii:
            if isinstance(string, six.binary_type):
                string = string.decode('utf-8')
--- a/tests/test_terminal.py
+++ b/tests/test_terminal.py
@@ -145,6 +145,16 @@ def test_terminal_clean_ncols(terminal):
    assert text.decode('utf-8') == 'ｈｅｌｌ'


+@pytest.mark.parametrize('ascii', [True, False])
+def test_terminal_clean_unescape_html(terminal, ascii):
+
+    # clean() must decode HTML entities ('&lt;' -> '<') with ascii on and off
+    terminal.ascii = ascii
+    text = terminal.clean('&lt;')
+    assert isinstance(text, six.binary_type)
+    assert text.decode('ascii' if ascii else 'utf-8') == '<'
+
+
@pytest.mark.parametrize('ascii', [True, False])
 def test_terminal_add_line(terminal, stdscr, ascii):