Added tests for mime parsers.

This commit is contained in:
Michael Lazar
2016-07-21 00:25:55 -07:00
parent 40732fb90c
commit 738a46e6dd
4 changed files with 80 additions and 84 deletions

View File

@@ -3,71 +3,10 @@ import logging
import mimetypes import mimetypes
import requests import requests
from six.moves.html_parser import HTMLParser from bs4 import BeautifulSoup
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
# HTML Parsers
class HTMLParsed(Exception):
    """Control-flow exception used to abort HTML parsing early.

    Raised by a parser's ``handle_starttag`` as soon as the target value has
    been found; the caller catches it and reads the scraped value from the
    ``data`` attribute.
    """

    def __init__(self, data):
        # The scraped payload (e.g. a direct image url) found in the page.
        self.data = data
# TODO: open temp file, close after 60 seconds with thread.timer()
# TODO: switch to bs4 with "html.parser"
# TODO: Add media_readme.rst
# TODO: Add environment variables to config
class ImgurHTMLParser(HTMLParser):
    """
    Scrape the actual image url from an imgur landing page. Imgur intentionally
    obscures this on most reddit links in order to draw more traffic for their
    advertisements.

    There are a couple of <meta> tags that supply the relevant info:
        <meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
        <meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
        <link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">

    Note:
        BeautifulSoup or lxml would be faster here but I wanted to skip adding
        an extra dependency for something this trivial.
    """

    def handle_starttag(self, tag, attr):
        """Raise HTMLParsed carrying the image url when the meta tag appears.

        Build a dict from the attribute pairs instead of indexing ``attr[0]``
        / ``attr[1]`` directly, so the check no longer depends on the order in
        which imgur happens to emit the tag's attributes.
        """
        if tag == 'meta':
            attrs = dict(attr)
            if attrs.get('name') == 'twitter:image' and attrs.get('content'):
                raise HTMLParsed(attrs['content'])
class ImgurAlbumHTMLParser(HTMLParser):
    """
    Scrape the complete list of images from an imgur album. The HTML parser is
    very limited, so this assumes the following html structure:

        <div class="post-image">
            <a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
                <img class="post-image-placeholder"
                    src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
                <img class="js-post-image-thumb"
                    src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
            </a>
        </div>
    """

    def reset(self):
        """Clear scraping state; called by HTMLParser.__init__ and feed()."""
        super(ImgurAlbumHTMLParser, self).reset()
        # True immediately after a <div class="post-image"> was seen, meaning
        # the next start tag is expected to be the <a href="..."> link.
        self.primed = False
        # Protocol-relative urls ("//i.imgur.com/...") collected so far.
        self.hrefs = []

    def handle_starttag(self, tag, attr):
        """Collect the href of the <a> tag that follows each post-image div."""
        if tag == 'div' and ('class', 'post-image') in attr:
            self.primed = True
        elif self.primed:
            self.primed = False
            # Guard against an attribute-less tag: the original indexed
            # attr[0] unconditionally, which raised IndexError on e.g. <a>.
            if tag == 'a' and attr and attr[0][0] == 'href':
                self.hrefs.append(attr[0][1])
# MIME Parsers
class BaseMIMEParser(object): class BaseMIMEParser(object):
""" """
@@ -103,7 +42,8 @@ class GfycatMIMEParser(BaseMIMEParser):
downloaded as either gif, webm, or mjpg. Webm was selected because it's downloaded as either gif, webm, or mjpg. Webm was selected because it's
fast and works with VLC. fast and works with VLC.
https://gfycat.com/api https://gfycat.com/api
https://gfycat.com/UntidyAcidicIberianemeraldlizard --> https://gfycat.com/UntidyAcidicIberianemeraldlizard -->
https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm
""" """
@@ -166,43 +106,54 @@ class ImgurMIMEParser(BaseMIMEParser):
""" """
The majority of imgur links don't point directly to the image, so we need The majority of imgur links don't point directly to the image, so we need
to open the provided url and scrape the page for the link. to open the provided url and scrape the page for the link.
Scrape the actual image url from an imgur landing page. Imgur intentionally
obscures this on most reddit links in order to draw more traffic for their
advertisements.
There are a couple of <meta> tags that supply the relevant info:
<meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
<meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
<link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">
""" """
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$') pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
@staticmethod @staticmethod
def get_mimetype(url): def get_mimetype(url):
imgur_page = requests.get(url) page = requests.get(url)
try: soup = BeautifulSoup(page.content, 'html.parser')
# convert_charrefs will be true by default in python 3.5 tag = soup.find('meta', attrs={'name': 'twitter:image'})
ImgurHTMLParser(convert_charrefs=True).feed(imgur_page.text) if tag:
except HTMLParsed as data: url = tag.get('content')
# We found a link
url = data.data
if GifvMIMEParser.pattern.match(url): if GifvMIMEParser.pattern.match(url):
return GifvMIMEParser.get_mimetype(url) return GifvMIMEParser.get_mimetype(url)
return BaseMIMEParser.get_mimetype(url) return BaseMIMEParser.get_mimetype(url)
class ImgurAlbumMIMEParser(BaseMIMEParser): class ImgurAlbumMIMEParser(BaseMIMEParser):
""" """
Imgur albums can contain several images, which need to be scraped from the Imgur albums can contain several images, which need to be scraped from the
landing page. landing page. Assumes the following html structure:
<div class="post-image">
<a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
<img class="post-image-placeholder"
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
<img class="js-post-image-thumb"
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
</a>
</div>
""" """
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$') pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')
@staticmethod @staticmethod
def get_mimetype(url): def get_mimetype(url):
imgur_page = requests.get(url) page = requests.get(url)
parser = ImgurAlbumHTMLParser(convert_charrefs=True) soup = BeautifulSoup(page.content, 'html.parser')
try: urls = []
parser.feed(imgur_page.text) for div in soup.find_all('div', class_='post-image'):
except Exception as e: urls.append('http:' + div.find('img').get('src'))
_logger.warning(e)
urls = []
else:
urls = ['http:' + href for href in parser.hrefs]
if urls: if urls:
return "' '".join(urls), 'image/x-imgur-album' return "' '".join(urls), 'image/x-imgur-album'

View File

@@ -20,7 +20,7 @@ from kitchen.text.display import textual_width_chop
from mailcap_fix import mailcap from mailcap_fix import mailcap
from . import exceptions from . import exceptions
from . import mime_handlers from . import mime_parsers
from .objects import LoadScreen, Color from .objects import LoadScreen, Color
@@ -401,7 +401,7 @@ class Terminal(object):
entry (dict): The full mailcap entry for the corresponding command entry (dict): The full mailcap entry for the corresponding command
""" """
for parser in mime_handlers.parsers: for parser in mime_parsers.parsers:
if parser.pattern.match(url): if parser.pattern.match(url):
# modified_url may be the same as the original url, but it # modified_url may be the same as the original url, but it
# could also be updated to point to a different page, or it # could also be updated to point to a different page, or it

View File

@@ -3,7 +3,8 @@ import setuptools
from version import __version__ as version from version import __version__ as version
requirements = ['tornado', 'praw==3.5.0', 'six', 'requests', 'kitchen'] requirements = ['tornado', 'praw==3.5.0', 'six', 'requests', 'kitchen',
'beautifulsoup4', 'mailcap-fix']
# Python 2: add required concurrent.futures backport from Python 3.2 # Python 2: add required concurrent.futures backport from Python 3.2
if sys.version_info.major <= 2: if sys.version_info.major <= 2:

View File

@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import pytest
from rtv.mime_parsers import parsers
# Test fixtures: (input url, expected modified url, expected mime type).
# The modified url differs from the input whenever the parser has to scrape
# a landing page (gfycat, imgur, imgur albums) to find the direct media link;
# for direct links the url passes through unchanged.
URLS = [
    ('http://www.example.com/i/image.png',
     'http://www.example.com/i/image.png', 'image/png'),
    ('http://www.example.com/v/video.mpeg',
     'http://www.example.com/v/video.mpeg', 'video/mpeg'),
    # Extension-less url: mimetype cannot be guessed, so None is expected.
    ('http://www.example.com/i/image',
     'http://www.example.com/i/image', None),
    ('https://gfycat.com/DeliciousUnfortunateAdouri',
     'https://giant.gfycat.com/DeliciousUnfortunateAdouri.webm', 'video/webm'),
    ('https://www.youtube.com/watch?v=FjNdYp2gXRY',
     'https://www.youtube.com/watch?v=FjNdYp2gXRY', 'video/x-youtube'),
    ('http://i.imgur.com/i/image.gifv',
     'http://i.imgur.com/i/image.mp4', 'video/mp4'),
    ('https://i.reddituploads.com/a065472e47a4405da159189ee48bff46?fit=max&h='
     '1536&w=1536&s=5639918a0c696b9bb3ec694dc3cf59ac',
     'https://i.reddituploads.com/a065472e47a4405da159189ee48bff46?fit=max&h='
     '1536&w=1536&s=5639918a0c696b9bb3ec694dc3cf59ac', 'image/jpeg'),
    # Imgur landing pages resolve to the direct i.imgur.com image.
    ('http://imgur.com/yW0kbMi',
     'https://i.imgur.com/yW0kbMi.jpg', 'image/jpeg'),
    ('http://imgur.com/yjP1v4B',
     'https://i.imgur.com/yjP1v4Bh.jpg', 'image/jpeg'),
    # Imgur albums use a synthetic mime type handled by the album parser.
    ('http://imgur.com/a/qx9t5',
     'http://i.imgur.com/uEt0YLI.jpg', 'image/x-imgur-album'),
]
@pytest.mark.parametrize('url,modified_url,mime_type', URLS)
def test_parser(url, modified_url, mime_type, reddit):
    """Each url must be claimed by a parser that resolves it correctly."""
    # The reddit fixture is requested only so the VCR cassettes get generated.
    matched = next((p for p in parsers if p.pattern.match(url)), None)
    # The base parser should catch all urls before this point.
    assert matched is not None
    assert matched.get_mimetype(url) == (modified_url, mime_type)