Added tests for mime parsers.

This commit is contained in:
Michael Lazar
2016-07-21 00:25:55 -07:00
parent 40732fb90c
commit 738a46e6dd
4 changed files with 80 additions and 84 deletions

View File

@@ -3,71 +3,10 @@ import logging
import mimetypes import mimetypes
import requests import requests
from six.moves.html_parser import HTMLParser from bs4 import BeautifulSoup
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
# HTML Parsers
class HTMLParsed(Exception):
    """Control-flow exception used to abort HTML parsing early.

    Raised by a parser's ``handle_starttag`` as soon as the target value has
    been found; the caller catches it and reads the scraped value from the
    ``data`` attribute.
    """

    def __init__(self, data):
        # The scraped payload (e.g. a direct image url) found in the page.
        self.data = data
# TODO: open temp file, close after 60 seconds with thread.timer()
# TODO: switch to bs4 with "html.parser"
# TODO: Add media_readme.rst
# TODO: Add environment variables to config
class ImgurHTMLParser(HTMLParser):
    """
    Scrape the actual image url from an imgur landing page. Imgur intentionally
    obscures this on most reddit links in order to draw more traffic for their
    advertisements.

    There are a couple of <meta> tags that supply the relevant info:
        <meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
        <meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
        <link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">

    Note:
        BeautifulSoup or lxml would be faster here but I wanted to skip adding
        an extra dependency for something this trivial.
    """

    def handle_starttag(self, tag, attr):
        """Raise HTMLParsed carrying the image url when the meta tag appears.

        Build a dict from the attribute pairs instead of indexing ``attr[0]``
        / ``attr[1]`` directly, so the check no longer depends on the order in
        which imgur happens to emit the tag's attributes.
        """
        if tag == 'meta':
            attrs = dict(attr)
            if attrs.get('name') == 'twitter:image' and attrs.get('content'):
                raise HTMLParsed(attrs['content'])
class ImgurAlbumHTMLParser(HTMLParser):
    """
    Scrape the complete list of images from an imgur album. The HTML parser is
    very limited, so this assumes the following html structure:

        <div class="post-image">
            <a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
                <img class="post-image-placeholder"
                    src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
                <img class="js-post-image-thumb"
                    src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
            </a>
        </div>
    """

    def reset(self):
        """Clear scraping state; called by HTMLParser.__init__ and feed()."""
        super(ImgurAlbumHTMLParser, self).reset()
        # True immediately after a <div class="post-image"> was seen, meaning
        # the next start tag is expected to be the <a href="..."> link.
        self.primed = False
        # Protocol-relative urls ("//i.imgur.com/...") collected so far.
        self.hrefs = []

    def handle_starttag(self, tag, attr):
        """Collect the href of the <a> tag that follows each post-image div."""
        if tag == 'div' and ('class', 'post-image') in attr:
            self.primed = True
        elif self.primed:
            self.primed = False
            # Guard against an attribute-less tag: the original indexed
            # attr[0] unconditionally, which raised IndexError on e.g. <a>.
            if tag == 'a' and attr and attr[0][0] == 'href':
                self.hrefs.append(attr[0][1])
# MIME Parsers
class BaseMIMEParser(object): class BaseMIMEParser(object):
""" """
@@ -103,7 +42,8 @@ class GfycatMIMEParser(BaseMIMEParser):
downloaded as either gif, webm, or mjpg. Webm was selected because it's downloaded as either gif, webm, or mjpg. Webm was selected because it's
fast and works with VLC. fast and works with VLC.
https://gfycat.com/api https://gfycat.com/api
https://gfycat.com/UntidyAcidicIberianemeraldlizard --> https://gfycat.com/UntidyAcidicIberianemeraldlizard -->
https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm
""" """
@@ -166,43 +106,54 @@ class ImgurMIMEParser(BaseMIMEParser):
""" """
The majority of imgur links don't point directly to the image, so we need The majority of imgur links don't point directly to the image, so we need
to open the provided url and scrape the page for the link. to open the provided url and scrape the page for the link.
Scrape the actual image url from an imgur landing page. Imgur intentionally
obscures this on most reddit links in order to draw more traffic for their
advertisements.
There are a couple of <meta> tags that supply the relevant info:
<meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
<meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
<link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">
""" """
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$') pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
@staticmethod @staticmethod
def get_mimetype(url): def get_mimetype(url):
imgur_page = requests.get(url) page = requests.get(url)
try: soup = BeautifulSoup(page.content, 'html.parser')
# convert_charrefs will be true by default in python 3.5 tag = soup.find('meta', attrs={'name': 'twitter:image'})
ImgurHTMLParser(convert_charrefs=True).feed(imgur_page.text) if tag:
except HTMLParsed as data: url = tag.get('content')
# We found a link
url = data.data
if GifvMIMEParser.pattern.match(url): if GifvMIMEParser.pattern.match(url):
return GifvMIMEParser.get_mimetype(url) return GifvMIMEParser.get_mimetype(url)
return BaseMIMEParser.get_mimetype(url) return BaseMIMEParser.get_mimetype(url)
class ImgurAlbumMIMEParser(BaseMIMEParser): class ImgurAlbumMIMEParser(BaseMIMEParser):
""" """
Imgur albums can contain several images, which need to be scraped from the Imgur albums can contain several images, which need to be scraped from the
landing page. landing page. Assumes the following html structure:
<div class="post-image">
<a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
<img class="post-image-placeholder"
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
<img class="js-post-image-thumb"
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
</a>
</div>
""" """
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$') pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')
@staticmethod @staticmethod
def get_mimetype(url): def get_mimetype(url):
imgur_page = requests.get(url) page = requests.get(url)
parser = ImgurAlbumHTMLParser(convert_charrefs=True) soup = BeautifulSoup(page.content, 'html.parser')
try: urls = []
parser.feed(imgur_page.text) for div in soup.find_all('div', class_='post-image'):
except Exception as e: urls.append('http:' + div.find('img').get('src'))
_logger.warning(e)
urls = []
else:
urls = ['http:' + href for href in parser.hrefs]
if urls: if urls:
return "' '".join(urls), 'image/x-imgur-album' return "' '".join(urls), 'image/x-imgur-album'

View File

@@ -20,7 +20,7 @@ from kitchen.text.display import textual_width_chop
from mailcap_fix import mailcap from mailcap_fix import mailcap
from . import exceptions from . import exceptions
from . import mime_handlers from . import mime_parsers
from .objects import LoadScreen, Color from .objects import LoadScreen, Color
@@ -401,7 +401,7 @@ class Terminal(object):
entry (dict): The full mailcap entry for the corresponding command entry (dict): The full mailcap entry for the corresponding command
""" """
for parser in mime_handlers.parsers: for parser in mime_parsers.parsers:
if parser.pattern.match(url): if parser.pattern.match(url):
# modified_url may be the same as the original url, but it # modified_url may be the same as the original url, but it
# could also be updated to point to a different page, or it # could also be updated to point to a different page, or it

View File

@@ -3,7 +3,8 @@ import setuptools
from version import __version__ as version from version import __version__ as version
requirements = ['tornado', 'praw==3.5.0', 'six', 'requests', 'kitchen'] requirements = ['tornado', 'praw==3.5.0', 'six', 'requests', 'kitchen',
'beautifulsoup4', 'mailcap-fix']
# Python 2: add required concurrent.futures backport from Python 3.2 # Python 2: add required concurrent.futures backport from Python 3.2
if sys.version_info.major <= 2: if sys.version_info.major <= 2:

View File

@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import pytest
from rtv.mime_parsers import parsers
# Test fixtures: (input url, expected modified url, expected mime type).
# The modified url differs from the input whenever the parser has to scrape
# a landing page (gfycat, imgur, imgur albums) to find the direct media link;
# for direct links the url passes through unchanged.
URLS = [
    ('http://www.example.com/i/image.png',
     'http://www.example.com/i/image.png', 'image/png'),
    ('http://www.example.com/v/video.mpeg',
     'http://www.example.com/v/video.mpeg', 'video/mpeg'),
    # Extension-less url: mimetype cannot be guessed, so None is expected.
    ('http://www.example.com/i/image',
     'http://www.example.com/i/image', None),
    ('https://gfycat.com/DeliciousUnfortunateAdouri',
     'https://giant.gfycat.com/DeliciousUnfortunateAdouri.webm', 'video/webm'),
    ('https://www.youtube.com/watch?v=FjNdYp2gXRY',
     'https://www.youtube.com/watch?v=FjNdYp2gXRY', 'video/x-youtube'),
    ('http://i.imgur.com/i/image.gifv',
     'http://i.imgur.com/i/image.mp4', 'video/mp4'),
    ('https://i.reddituploads.com/a065472e47a4405da159189ee48bff46?fit=max&h='
     '1536&w=1536&s=5639918a0c696b9bb3ec694dc3cf59ac',
     'https://i.reddituploads.com/a065472e47a4405da159189ee48bff46?fit=max&h='
     '1536&w=1536&s=5639918a0c696b9bb3ec694dc3cf59ac', 'image/jpeg'),
    # Imgur landing pages resolve to the direct i.imgur.com image.
    ('http://imgur.com/yW0kbMi',
     'https://i.imgur.com/yW0kbMi.jpg', 'image/jpeg'),
    ('http://imgur.com/yjP1v4B',
     'https://i.imgur.com/yjP1v4Bh.jpg', 'image/jpeg'),
    # Imgur albums use a synthetic mime type handled by the album parser.
    ('http://imgur.com/a/qx9t5',
     'http://i.imgur.com/uEt0YLI.jpg', 'image/x-imgur-album'),
]
@pytest.mark.parametrize('url,modified_url,mime_type', URLS)
def test_parser(url, modified_url, mime_type, reddit):
    """Each url must be claimed by a parser that resolves it correctly."""
    # The reddit fixture is requested only so the VCR cassettes get generated.
    matched = next((p for p in parsers if p.pattern.match(url)), None)
    # The base parser should catch all urls before this point.
    assert matched is not None
    assert matched.get_mimetype(url) == (modified_url, mime_type)