From 738a46e6ddac90b4e56c3e4ca3b48840c3fe43b8 Mon Sep 17 00:00:00 2001
From: Michael Lazar
Date: Thu, 21 Jul 2016 00:25:55 -0700
Subject: [PATCH] Added tests for mime parsers.

---
 rtv/{mime_handlers.py => mime_parsers.py} | 113 ++++++----------------
 rtv/terminal.py                           |   4 +-
 setup.py                                  |   3 +-
 tests/test_mime_parsers.py                |  44 +++++++++
 4 files changed, 80 insertions(+), 84 deletions(-)
 rename rtv/{mime_handlers.py => mime_parsers.py} (72%)
 create mode 100644 tests/test_mime_parsers.py

diff --git a/rtv/mime_handlers.py b/rtv/mime_parsers.py
similarity index 72%
rename from rtv/mime_handlers.py
rename to rtv/mime_parsers.py
index 7015133..4b50567 100644
--- a/rtv/mime_handlers.py
+++ b/rtv/mime_parsers.py
@@ -3,71 +3,10 @@ import logging
 import mimetypes
 
 import requests
-from six.moves.html_parser import HTMLParser
+from bs4 import BeautifulSoup
 
 _logger = logging.getLogger(__name__)
 
-# HTML Parsers
-
-
-class HTMLParsed(Exception):
-    def __init__(self, data):
-        self.data = data
-
-# TODO: open temp file, close after 60 seconds with thread.timer()
-# TODO: switch to bs4 with "html.parser"
-# TODO: Add media_readme.rst
-# TODO: Add environment variables to config
-
-class ImgurHTMLParser(HTMLParser):
-    """
-    Scrape the actual image url from an imgur landing page. Imgur intentionally
-    obscures this on most reddit links in order to draw more traffic for their
-    advertisements.
-
-    There are a couple of tags that supply the relevant info:
-        <meta name="twitter:image" content="https://i.imgur.com/yW0kbMi.jpg">
-        <link rel="image_src" href="https://i.imgur.com/yW0kbMi.jpg">
-
-    Note:
-        BeautifulSoup or lxml would be faster here but I wanted to skip adding
-        an extra dependency for something this trivial.
-    """
-    def handle_starttag(self, tag, attr):
-        if tag == 'meta' and attr[0] == ('name', 'twitter:image'):
-            raise HTMLParsed(attr[1][1])
-
-
-class ImgurAlbumHTMLParser(HTMLParser):
-    """
-    Scrape the complete list of images from an imgur album. The HTML parser is
-    very limited, so this assumes the following html structure:
-
-        <div class="post-image">
-            <a href="//i.imgur.com/L3D2QTd.jpg" class="zoom">
-                <img class="post-image-placeholder" src="//i.imgur.com/L3D2QTd.jpg" alt="Close up">
-                <img class="js-post-image-thumb" src="//i.imgur.com/L3D2QTdh.jpg" alt="Close up">
-            </a>
-        </div>
-    """
-    def reset(self):
-        super(ImgurAlbumHTMLParser, self).reset()
-        self.primed = False
-        self.hrefs = []
-
-    def handle_starttag(self, tag, attr):
-        if tag == 'div' and ('class', 'post-image') in attr:
-            self.primed = True
-        elif self.primed:
-            self.primed = False
-            if tag == 'a' and attr[0][0] == 'href':
-                self.hrefs.append(attr[0][1])
-
-
-# MIME Parsers
 
 class BaseMIMEParser(object):
     """
@@ -103,7 +42,8 @@ class GfycatMIMEParser(BaseMIMEParser):
     downloaded as either gif, webm, or mjpg. Webm was selected because it's
     fast and works with VLC.
 
-        https://gfycat.com/api
+    https://gfycat.com/api
+
     https://gfycat.com/UntidyAcidicIberianemeraldlizard -->
     https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm
     """
@@ -166,43 +106,54 @@ class ImgurMIMEParser(BaseMIMEParser):
     """
     The majority of imgur links don't point directly to the image, so we need
     to open the provided url and scrape the page for the link.
+
+    Scrape the actual image url from an imgur landing page. Imgur intentionally
+    obscures this on most reddit links in order to draw more traffic for their
+    advertisements.
+
+    There are a couple of tags that supply the relevant info:
+        <meta name="twitter:image" content="https://i.imgur.com/yW0kbMi.jpg">
+        <link rel="image_src" href="https://i.imgur.com/yW0kbMi.jpg">
+
     """
     pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
 
     @staticmethod
    def get_mimetype(url):
-        imgur_page = requests.get(url)
-        try:
-            # convert_charrefs will be true by default in python 3.5
-            ImgurHTMLParser(convert_charrefs=True).feed(imgur_page.text)
-        except HTMLParsed as data:
-            # We found a link
-            url = data.data
+        page = requests.get(url)
+        soup = BeautifulSoup(page.content, 'html.parser')
+        tag = soup.find('meta', attrs={'name': 'twitter:image'})
+        if tag:
+            url = tag.get('content')
         if GifvMIMEParser.pattern.match(url):
             return GifvMIMEParser.get_mimetype(url)
-
         return BaseMIMEParser.get_mimetype(url)
 
 
 class ImgurAlbumMIMEParser(BaseMIMEParser):
     """
     Imgur albums can contain several images, which need to be scraped from the
-    landing page.
+    landing page. Assumes the following html structure:
+
+        <div class="post-image">
+            <a href="//i.imgur.com/L3D2QTd.jpg" class="zoom">
+                <img class="post-image-placeholder" src="//i.imgur.com/L3D2QTd.jpg" alt="Close up">
+                <img class="js-post-image-thumb" src="//i.imgur.com/L3D2QTdh.jpg" alt="Close up">
+            </a>
+        </div>
""" pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$') @staticmethod def get_mimetype(url): - imgur_page = requests.get(url) - parser = ImgurAlbumHTMLParser(convert_charrefs=True) + page = requests.get(url) + soup = BeautifulSoup(page.content, 'html.parser') - try: - parser.feed(imgur_page.text) - except Exception as e: - _logger.warning(e) - urls = [] - else: - urls = ['http:' + href for href in parser.hrefs] + urls = [] + for div in soup.find_all('div', class_='post-image'): + urls.append('http:' + div.find('img').get('src')) if urls: return "' '".join(urls), 'image/x-imgur-album' diff --git a/rtv/terminal.py b/rtv/terminal.py index b7e25d3..4d4b5dc 100644 --- a/rtv/terminal.py +++ b/rtv/terminal.py @@ -20,7 +20,7 @@ from kitchen.text.display import textual_width_chop from mailcap_fix import mailcap from . import exceptions -from . import mime_handlers +from . import mime_parsers from .objects import LoadScreen, Color @@ -401,7 +401,7 @@ class Terminal(object): entry (dict): The full mailcap entry for the corresponding command """ - for parser in mime_handlers.parsers: + for parser in mime_parsers.parsers: if parser.pattern.match(url): # modified_url may be the same as the original url, but it # could also be updated to point to a different page, or it diff --git a/setup.py b/setup.py index 6e2f2c3..30a8047 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,8 @@ import setuptools from version import __version__ as version -requirements = ['tornado', 'praw==3.5.0', 'six', 'requests', 'kitchen'] +requirements = ['tornado', 'praw==3.5.0', 'six', 'requests', 'kitchen', + 'beautifulsoup4', 'mailcap-fix'] # Python 2: add required concurrent.futures backport from Python 3.2 if sys.version_info.major <= 2: diff --git a/tests/test_mime_parsers.py b/tests/test_mime_parsers.py new file mode 100644 index 0000000..47bd04e --- /dev/null +++ b/tests/test_mime_parsers.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import pytest + +from rtv.mime_parsers import parsers + +URLS = [ + ('http://www.example.com/i/image.png', + 'http://www.example.com/i/image.png', 'image/png'), + ('http://www.example.com/v/video.mpeg', + 'http://www.example.com/v/video.mpeg', 'video/mpeg'), + ('http://www.example.com/i/image', + 'http://www.example.com/i/image', None), + ('https://gfycat.com/DeliciousUnfortunateAdouri', + 'https://giant.gfycat.com/DeliciousUnfortunateAdouri.webm', 'video/webm'), + ('https://www.youtube.com/watch?v=FjNdYp2gXRY', + 'https://www.youtube.com/watch?v=FjNdYp2gXRY', 'video/x-youtube'), + ('http://i.imgur.com/i/image.gifv', + 'http://i.imgur.com/i/image.mp4', 'video/mp4'), + ('https://i.reddituploads.com/a065472e47a4405da159189ee48bff46?fit=max&h=' + '1536&w=1536&s=5639918a0c696b9bb3ec694dc3cf59ac', + 'https://i.reddituploads.com/a065472e47a4405da159189ee48bff46?fit=max&h=' + '1536&w=1536&s=5639918a0c696b9bb3ec694dc3cf59ac', 'image/jpeg'), + ('http://imgur.com/yW0kbMi', + 'https://i.imgur.com/yW0kbMi.jpg', 'image/jpeg'), + ('http://imgur.com/yjP1v4B', + 'https://i.imgur.com/yjP1v4Bh.jpg', 'image/jpeg'), + ('http://imgur.com/a/qx9t5', + 'http://i.imgur.com/uEt0YLI.jpg', 'image/x-imgur-album'), +] + + +@pytest.mark.parametrize('url,modified_url,mime_type', URLS) +def test_parser(url, modified_url, mime_type, reddit): + # Add the reddit fixture so the cassettes get generated + + for parser in parsers: + if parser.pattern.match(url): + assert parser.get_mimetype(url) == (modified_url, mime_type) + break + else: + # The base parser should catch 
+        # The base parser should catch all urls before this point
+        assert False
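
Notes on the approach (not part of the diff):

The scraping that replaces the hand-rolled HTMLParser subclasses reduces to a
single find() on the parsed document. A minimal, self-contained sketch of the
same technique; the inlined HTML is an illustrative stand-in, not captured
from a live imgur page:

    # -*- coding: utf-8 -*-
    from bs4 import BeautifulSoup

    html = '''<html><head>
    <meta name="twitter:image" content="https://i.imgur.com/yW0kbMi.jpg">
    </head><body></body></html>'''

    # Same calls as ImgurMIMEParser.get_mimetype, minus the network fetch
    soup = BeautifulSoup(html, 'html.parser')
    tag = soup.find('meta', attrs={'name': 'twitter:image'})
    if tag:
        print(tag.get('content'))  # https://i.imgur.com/yW0kbMi.jpg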
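The new test leans on the ordering of rtv.mime_parsers.parsers: the first
parser whose pattern matches wins, and BaseMIMEParser sits last as the
catch-all, which is why the for/else's assert False should be unreachable.
A sketch of that dispatch, assuming the fallback guesses from the file
extension with mimetypes (BaseMIMEParser's body is outside this diff, so
that detail is an assumption):

    import re
    import mimetypes

    class BaseMIMEParser(object):
        # Catch-all: matches any url
        pattern = re.compile(r'.*$')

        @staticmethod
        def get_mimetype(url):
            # Assumed fallback behavior: guess from the file extension
            mime_type, _ = mimetypes.guess_type(url.split('?')[0])
            return url, mime_type

    def resolve(url, parsers):
        # First matching parser wins; the catch-all goes last
        for parser in parsers:
            if parser.pattern.match(url):
                return parser.get_mimetype(url)

    print(resolve('http://www.example.com/i/image.png', [BaseMIMEParser]))
    # ('http://www.example.com/i/image.png', 'image/png')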
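One subtlety in ImgurAlbumMIMEParser: it packs every image into a single
string joined with "' '". Presumably this is so a mailcap command that wraps
%s in single quotes expands an album into one quoted shell argument per
image; that rationale is inferred here, not stated in the patch:

    urls = ['http://i.imgur.com/uEt0YLI.jpg', 'http://i.imgur.com/L3D2QTd.jpg']
    print("'" + "' '".join(urls) + "'")
    # 'http://i.imgur.com/uEt0YLI.jpg' 'http://i.imgur.com/L3D2QTd.jpg'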