From 738a46e6ddac90b4e56c3e4ca3b48840c3fe43b8 Mon Sep 17 00:00:00 2001
From: Michael Lazar
Date: Thu, 21 Jul 2016 00:25:55 -0700
Subject: [PATCH] Added tests for mime parsers.

---
 rtv/{mime_handlers.py => mime_parsers.py} | 113 ++++++----------------
 rtv/terminal.py                           |   4 +-
 setup.py                                  |   3 +-
 tests/test_mime_parsers.py                |  44 +++++++++
 4 files changed, 80 insertions(+), 84 deletions(-)
 rename rtv/{mime_handlers.py => mime_parsers.py} (72%)
 create mode 100644 tests/test_mime_parsers.py

diff --git a/rtv/mime_handlers.py b/rtv/mime_parsers.py
similarity index 72%
rename from rtv/mime_handlers.py
rename to rtv/mime_parsers.py
index 7015133..4b50567 100644
--- a/rtv/mime_handlers.py
+++ b/rtv/mime_parsers.py
@@ -3,71 +3,10 @@ import logging
 import mimetypes
 
 import requests
-from six.moves.html_parser import HTMLParser
+from bs4 import BeautifulSoup
 
 _logger = logging.getLogger(__name__)
 
-# HTML Parsers
-
-
-class HTMLParsed(Exception):
-    def __init__(self, data):
-        self.data = data
-
-# TODO: open temp file, close after 60 seconds with thread.timer()
-# TODO: switch to bs4 with "html.parser"
-# TODO: Add media_readme.rst
-# TODO: Add environment variables to config
-
-class ImgurHTMLParser(HTMLParser):
-    """
-    Scrape the actual image url from an imgur landing page. Imgur intentionally
-    obscures this on most reddit links in order to draw more traffic for their
-    advertisements.
-
-    There are a couple of tags that supply the relevant info:
-        <meta name="twitter:image" content="https://i.imgur.com/yW0kbMi.jpg">
-        <link rel="image_src" href="https://i.imgur.com/yW0kbMi.jpg">
-
-    Note:
-        BeautifulSoup or lxml would be faster here but I wanted to skip adding
-        an extra dependency for something this trivial.
-    """
-    def handle_starttag(self, tag, attr):
-        if tag == 'meta' and attr[0] == ('name', 'twitter:image'):
-            raise HTMLParsed(attr[1][1])
-
-
-class ImgurAlbumHTMLParser(HTMLParser):
-    """
-    Scrape the complete list of images from an imgur album. The HTML parser is
-    very limited, so this assumes the following html structure:
-
-        <div class="post-image">
-            <a href="//i.imgur.com/L3D2QTd.jpg" class="zoom">
-                <img class="post-image-placeholder" src="//i.imgur.com/L3D2QTd.jpg" alt="Close up">
-                <img class="js-post-image-thumb" src="//i.imgur.com/L3D2QTdh.jpg" alt="Close up">
-            </a>
-        </div>
-    """
-    def reset(self):
-        super(ImgurAlbumHTMLParser, self).reset()
-        self.primed = False
-        self.hrefs = []
-
-    def handle_starttag(self, tag, attr):
-        if tag == 'div' and ('class', 'post-image') in attr:
-            self.primed = True
-        elif self.primed:
-            self.primed = False
-            if tag == 'a' and attr[0][0] == 'href':
-                self.hrefs.append(attr[0][1])
-
-
-# MIME Parsers
 
 class BaseMIMEParser(object):
     """
@@ -103,7 +42,8 @@ class GfycatMIMEParser(BaseMIMEParser):
     downloaded as either gif, webm, or mjpg. Webm was selected because it's
     fast and works with VLC.
 
-        https://gfycat.com/api
+    https://gfycat.com/api
+
     https://gfycat.com/UntidyAcidicIberianemeraldlizard -->
     https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm
     """
@@ -166,43 +106,54 @@ class ImgurMIMEParser(BaseMIMEParser):
     """
     The majority of imgur links don't point directly to the image, so we need
     to open the provided url and scrape the page for the link.
+
+    Scrape the actual image url from an imgur landing page. Imgur intentionally
+    obscures this on most reddit links in order to draw more traffic for their
+    advertisements.
+
+    There are a couple of tags that supply the relevant info:
+        <meta name="twitter:image" content="https://i.imgur.com/yW0kbMi.jpg">
+        <link rel="image_src" href="https://i.imgur.com/yW0kbMi.jpg">
+
     """
     pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
 
     @staticmethod
    def get_mimetype(url):
-        imgur_page = requests.get(url)
-        try:
-            # convert_charrefs will be true by default in python 3.5
-            ImgurHTMLParser(convert_charrefs=True).feed(imgur_page.text)
-        except HTMLParsed as data:
-            # We found a link
-            url = data.data
+        page = requests.get(url)
+        soup = BeautifulSoup(page.content, 'html.parser')
+        tag = soup.find('meta', attrs={'name': 'twitter:image'})
+        if tag:
+            url = tag.get('content')
         if GifvMIMEParser.pattern.match(url):
             return GifvMIMEParser.get_mimetype(url)
-
         return BaseMIMEParser.get_mimetype(url)
 
 
 class ImgurAlbumMIMEParser(BaseMIMEParser):
     """
     Imgur albums can contain several images, which need to be scraped from the
-    landing page.
+    landing page. Assumes the following html structure:
+
+        <div class="post-image">
+            <a href="//i.imgur.com/L3D2QTd.jpg" class="zoom">
+                <img class="post-image-placeholder" src="//i.imgur.com/L3D2QTd.jpg" alt="Close up">
+                <img class="js-post-image-thumb" src="//i.imgur.com/L3D2QTdh.jpg" alt="Close up">
+            </a>
+        </div>
""" pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$') @staticmethod def get_mimetype(url): - imgur_page = requests.get(url) - parser = ImgurAlbumHTMLParser(convert_charrefs=True) + page = requests.get(url) + soup = BeautifulSoup(page.content, 'html.parser') - try: - parser.feed(imgur_page.text) - except Exception as e: - _logger.warning(e) - urls = [] - else: - urls = ['http:' + href for href in parser.hrefs] + urls = [] + for div in soup.find_all('div', class_='post-image'): + urls.append('http:' + div.find('img').get('src')) if urls: return "' '".join(urls), 'image/x-imgur-album' diff --git a/rtv/terminal.py b/rtv/terminal.py index b7e25d3..4d4b5dc 100644 --- a/rtv/terminal.py +++ b/rtv/terminal.py @@ -20,7 +20,7 @@ from kitchen.text.display import textual_width_chop from mailcap_fix import mailcap from . import exceptions -from . import mime_handlers +from . import mime_parsers from .objects import LoadScreen, Color @@ -401,7 +401,7 @@ class Terminal(object): entry (dict): The full mailcap entry for the corresponding command """ - for parser in mime_handlers.parsers: + for parser in mime_parsers.parsers: if parser.pattern.match(url): # modified_url may be the same as the original url, but it # could also be updated to point to a different page, or it diff --git a/setup.py b/setup.py index 6e2f2c3..30a8047 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,8 @@ import setuptools from version import __version__ as version -requirements = ['tornado', 'praw==3.5.0', 'six', 'requests', 'kitchen'] +requirements = ['tornado', 'praw==3.5.0', 'six', 'requests', 'kitchen', + 'beautifulsoup4', 'mailcap-fix'] # Python 2: add required concurrent.futures backport from Python 3.2 if sys.version_info.major <= 2: diff --git a/tests/test_mime_parsers.py b/tests/test_mime_parsers.py new file mode 100644 index 0000000..47bd04e --- /dev/null +++ b/tests/test_mime_parsers.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import pytest + +from rtv.mime_parsers import parsers + +URLS = [ + ('http://www.example.com/i/image.png', + 'http://www.example.com/i/image.png', 'image/png'), + ('http://www.example.com/v/video.mpeg', + 'http://www.example.com/v/video.mpeg', 'video/mpeg'), + ('http://www.example.com/i/image', + 'http://www.example.com/i/image', None), + ('https://gfycat.com/DeliciousUnfortunateAdouri', + 'https://giant.gfycat.com/DeliciousUnfortunateAdouri.webm', 'video/webm'), + ('https://www.youtube.com/watch?v=FjNdYp2gXRY', + 'https://www.youtube.com/watch?v=FjNdYp2gXRY', 'video/x-youtube'), + ('http://i.imgur.com/i/image.gifv', + 'http://i.imgur.com/i/image.mp4', 'video/mp4'), + ('https://i.reddituploads.com/a065472e47a4405da159189ee48bff46?fit=max&h=' + '1536&w=1536&s=5639918a0c696b9bb3ec694dc3cf59ac', + 'https://i.reddituploads.com/a065472e47a4405da159189ee48bff46?fit=max&h=' + '1536&w=1536&s=5639918a0c696b9bb3ec694dc3cf59ac', 'image/jpeg'), + ('http://imgur.com/yW0kbMi', + 'https://i.imgur.com/yW0kbMi.jpg', 'image/jpeg'), + ('http://imgur.com/yjP1v4B', + 'https://i.imgur.com/yjP1v4Bh.jpg', 'image/jpeg'), + ('http://imgur.com/a/qx9t5', + 'http://i.imgur.com/uEt0YLI.jpg', 'image/x-imgur-album'), +] + + +@pytest.mark.parametrize('url,modified_url,mime_type', URLS) +def test_parser(url, modified_url, mime_type, reddit): + # Add the reddit fixture so the cassettes get generated + + for parser in parsers: + if parser.pattern.match(url): + assert parser.get_mimetype(url) == (modified_url, mime_type) + break + else: + # The base parser should catch 
+        # The base parser should catch all urls before this point
+        assert False
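
Notes on the approach (not part of the diff):

The scraping that replaces the hand-rolled HTMLParser subclasses reduces to a
single find() on the parsed document. A minimal, self-contained sketch of the
same technique; the inlined HTML is an illustrative stand-in, not captured
from a live imgur page:

    # -*- coding: utf-8 -*-
    from bs4 import BeautifulSoup

    html = '''<html><head>
    <meta name="twitter:image" content="https://i.imgur.com/yW0kbMi.jpg">
    </head><body></body></html>'''

    # Same calls as ImgurMIMEParser.get_mimetype, minus the network fetch
    soup = BeautifulSoup(html, 'html.parser')
    tag = soup.find('meta', attrs={'name': 'twitter:image'})
    if tag:
        print(tag.get('content'))  # https://i.imgur.com/yW0kbMi.jpg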
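The new test leans on the ordering of rtv.mime_parsers.parsers: the first
parser whose pattern matches wins, and BaseMIMEParser sits last as the
catch-all, which is why the for/else's assert False should be unreachable.
A sketch of that dispatch, assuming the fallback guesses from the file
extension with mimetypes (BaseMIMEParser's body is outside this diff, so
that detail is an assumption):

    import re
    import mimetypes

    class BaseMIMEParser(object):
        # Catch-all: matches any url
        pattern = re.compile(r'.*$')

        @staticmethod
        def get_mimetype(url):
            # Assumed fallback behavior: guess from the file extension
            mime_type, _ = mimetypes.guess_type(url.split('?')[0])
            return url, mime_type

    def resolve(url, parsers):
        # First matching parser wins; the catch-all goes last
        for parser in parsers:
            if parser.pattern.match(url):
                return parser.get_mimetype(url)

    print(resolve('http://www.example.com/i/image.png', [BaseMIMEParser]))
    # ('http://www.example.com/i/image.png', 'image/png')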
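One subtlety in ImgurAlbumMIMEParser: it packs every image into a single
string joined with "' '". Presumably this is so a mailcap command that wraps
%s in single quotes expands an album into one quoted shell argument per
image; that rationale is inferred here, not stated in the patch:

    urls = ['http://i.imgur.com/uEt0YLI.jpg', 'http://i.imgur.com/L3D2QTd.jpg']
    print("'" + "' '".join(urls) + "'")
    # 'http://i.imgur.com/uEt0YLI.jpg' 'http://i.imgur.com/L3D2QTd.jpg'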