Added tests for mime parsers.
This commit is contained in:
@@ -3,71 +3,10 @@ import logging
|
|||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from six.moves.html_parser import HTMLParser
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
_logger = logging.getLogger(__name__)
|
_logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# HTML Parsers
|
|
||||||
|
|
||||||
|
|
||||||
class HTMLParsed(Exception):
|
|
||||||
def __init__(self, data):
|
|
||||||
self.data = data
|
|
||||||
|
|
||||||
# TODO: open temp file, close after 60 seconds with thread.timer()
|
|
||||||
# TODO: switch to bs4 with "html.parser"
|
|
||||||
# TODO: Add media_readme.rst
|
|
||||||
# TODO: Add environment variables to config
|
|
||||||
|
|
||||||
class ImgurHTMLParser(HTMLParser):
|
|
||||||
"""
|
|
||||||
Scrape the actual image url from an imgur landing page. Imgur intentionally
|
|
||||||
obscures this on most reddit links in order to draw more traffic for their
|
|
||||||
advertisements.
|
|
||||||
|
|
||||||
There are a couple of <meta> tags that supply the relevant info:
|
|
||||||
<meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
|
|
||||||
<meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
|
|
||||||
<link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">
|
|
||||||
|
|
||||||
Note:
|
|
||||||
BeautifulSoup or lxml would be faster here but I wanted to skip adding
|
|
||||||
an extra dependency for something this trivial.
|
|
||||||
"""
|
|
||||||
def handle_starttag(self, tag, attr):
|
|
||||||
if tag == 'meta' and attr[0] == ('name', 'twitter:image'):
|
|
||||||
raise HTMLParsed(attr[1][1])
|
|
||||||
|
|
||||||
|
|
||||||
class ImgurAlbumHTMLParser(HTMLParser):
|
|
||||||
"""
|
|
||||||
Scrape the complete list of images from an imgur album. The HTML parser is
|
|
||||||
very limited, so this assumes the following html structure:
|
|
||||||
|
|
||||||
<div class="post-image">
|
|
||||||
<a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
|
|
||||||
<img class="post-image-placeholder"
|
|
||||||
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
|
|
||||||
<img class="js-post-image-thumb"
|
|
||||||
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
|
|
||||||
</a>
|
|
||||||
</div>
|
|
||||||
"""
|
|
||||||
def reset(self):
|
|
||||||
super(ImgurAlbumHTMLParser, self).reset()
|
|
||||||
self.primed = False
|
|
||||||
self.hrefs = []
|
|
||||||
|
|
||||||
def handle_starttag(self, tag, attr):
|
|
||||||
if tag == 'div' and ('class', 'post-image') in attr:
|
|
||||||
self.primed = True
|
|
||||||
elif self.primed:
|
|
||||||
self.primed = False
|
|
||||||
if tag == 'a' and attr[0][0] == 'href':
|
|
||||||
self.hrefs.append(attr[0][1])
|
|
||||||
|
|
||||||
|
|
||||||
# MIME Parsers
|
|
||||||
|
|
||||||
class BaseMIMEParser(object):
|
class BaseMIMEParser(object):
|
||||||
"""
|
"""
|
||||||
@@ -103,7 +42,8 @@ class GfycatMIMEParser(BaseMIMEParser):
|
|||||||
downloaded as either gif, webm, or mjpg. Webm was selected because it's
|
downloaded as either gif, webm, or mjpg. Webm was selected because it's
|
||||||
fast and works with VLC.
|
fast and works with VLC.
|
||||||
|
|
||||||
https://gfycat.com/api
|
https://gfycat.com/api
|
||||||
|
|
||||||
https://gfycat.com/UntidyAcidicIberianemeraldlizard -->
|
https://gfycat.com/UntidyAcidicIberianemeraldlizard -->
|
||||||
https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm
|
https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm
|
||||||
"""
|
"""
|
||||||
@@ -166,43 +106,54 @@ class ImgurMIMEParser(BaseMIMEParser):
|
|||||||
"""
|
"""
|
||||||
The majority of imgur links don't point directly to the image, so we need
|
The majority of imgur links don't point directly to the image, so we need
|
||||||
to open the provided url and scrape the page for the link.
|
to open the provided url and scrape the page for the link.
|
||||||
|
|
||||||
|
Scrape the actual image url from an imgur landing page. Imgur intentionally
|
||||||
|
obscures this on most reddit links in order to draw more traffic for their
|
||||||
|
advertisements.
|
||||||
|
|
||||||
|
A couple of <meta> tags and a <link> tag supply the relevant info:
|
||||||
|
<meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
|
||||||
|
<meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
|
||||||
|
<link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">
|
||||||
"""
|
"""
|
||||||
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
|
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_mimetype(url):
|
def get_mimetype(url):
|
||||||
imgur_page = requests.get(url)
|
page = requests.get(url)
|
||||||
try:
|
soup = BeautifulSoup(page.content, 'html.parser')
|
||||||
# convert_charrefs will be true by default in python 3.5
|
tag = soup.find('meta', attrs={'name': 'twitter:image'})
|
||||||
ImgurHTMLParser(convert_charrefs=True).feed(imgur_page.text)
|
if tag:
|
||||||
except HTMLParsed as data:
|
url = tag.get('content')
|
||||||
# We found a link
|
|
||||||
url = data.data
|
|
||||||
if GifvMIMEParser.pattern.match(url):
|
if GifvMIMEParser.pattern.match(url):
|
||||||
return GifvMIMEParser.get_mimetype(url)
|
return GifvMIMEParser.get_mimetype(url)
|
||||||
|
|
||||||
return BaseMIMEParser.get_mimetype(url)
|
return BaseMIMEParser.get_mimetype(url)
|
||||||
|
|
||||||
|
|
||||||
class ImgurAlbumMIMEParser(BaseMIMEParser):
|
class ImgurAlbumMIMEParser(BaseMIMEParser):
|
||||||
"""
|
"""
|
||||||
Imgur albums can contain several images, which need to be scraped from the
|
Imgur albums can contain several images, which need to be scraped from the
|
||||||
landing page.
|
landing page. Assumes the following HTML structure:
|
||||||
|
|
||||||
|
<div class="post-image">
|
||||||
|
<a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
|
||||||
|
<img class="post-image-placeholder"
|
||||||
|
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
|
||||||
|
<img class="js-post-image-thumb"
|
||||||
|
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
"""
|
"""
|
||||||
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')
|
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_mimetype(url):
|
def get_mimetype(url):
|
||||||
imgur_page = requests.get(url)
|
page = requests.get(url)
|
||||||
parser = ImgurAlbumHTMLParser(convert_charrefs=True)
|
soup = BeautifulSoup(page.content, 'html.parser')
|
||||||
|
|
||||||
try:
|
urls = []
|
||||||
parser.feed(imgur_page.text)
|
for div in soup.find_all('div', class_='post-image'):
|
||||||
except Exception as e:
|
urls.append('http:' + div.find('img').get('src'))
|
||||||
_logger.warning(e)
|
|
||||||
urls = []
|
|
||||||
else:
|
|
||||||
urls = ['http:' + href for href in parser.hrefs]
|
|
||||||
|
|
||||||
if urls:
|
if urls:
|
||||||
return "' '".join(urls), 'image/x-imgur-album'
|
return "' '".join(urls), 'image/x-imgur-album'
|
||||||
@@ -20,7 +20,7 @@ from kitchen.text.display import textual_width_chop
|
|||||||
from mailcap_fix import mailcap
|
from mailcap_fix import mailcap
|
||||||
|
|
||||||
from . import exceptions
|
from . import exceptions
|
||||||
from . import mime_handlers
|
from . import mime_parsers
|
||||||
from .objects import LoadScreen, Color
|
from .objects import LoadScreen, Color
|
||||||
|
|
||||||
|
|
||||||
@@ -401,7 +401,7 @@ class Terminal(object):
|
|||||||
entry (dict): The full mailcap entry for the corresponding command
|
entry (dict): The full mailcap entry for the corresponding command
|
||||||
"""
|
"""
|
||||||
|
|
||||||
for parser in mime_handlers.parsers:
|
for parser in mime_parsers.parsers:
|
||||||
if parser.pattern.match(url):
|
if parser.pattern.match(url):
|
||||||
# modified_url may be the same as the original url, but it
|
# modified_url may be the same as the original url, but it
|
||||||
# could also be updated to point to a different page, or it
|
# could also be updated to point to a different page, or it
|
||||||
|
|||||||
3
setup.py
3
setup.py
@@ -3,7 +3,8 @@ import setuptools
|
|||||||
|
|
||||||
from version import __version__ as version
|
from version import __version__ as version
|
||||||
|
|
||||||
requirements = ['tornado', 'praw==3.5.0', 'six', 'requests', 'kitchen']
|
requirements = ['tornado', 'praw==3.5.0', 'six', 'requests', 'kitchen',
|
||||||
|
'beautifulsoup4', 'mailcap-fix']
|
||||||
|
|
||||||
# Python 2: add required concurrent.futures backport from Python 3.2
|
# Python 2: add required concurrent.futures backport from Python 3.2
|
||||||
if sys.version_info.major <= 2:
|
if sys.version_info.major <= 2:
|
||||||
|
|||||||
44
tests/test_mime_parsers.py
Normal file
44
tests/test_mime_parsers.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from rtv.mime_parsers import parsers
|
||||||
|
|
||||||
|
URLS = [
|
||||||
|
('http://www.example.com/i/image.png',
|
||||||
|
'http://www.example.com/i/image.png', 'image/png'),
|
||||||
|
('http://www.example.com/v/video.mpeg',
|
||||||
|
'http://www.example.com/v/video.mpeg', 'video/mpeg'),
|
||||||
|
('http://www.example.com/i/image',
|
||||||
|
'http://www.example.com/i/image', None),
|
||||||
|
('https://gfycat.com/DeliciousUnfortunateAdouri',
|
||||||
|
'https://giant.gfycat.com/DeliciousUnfortunateAdouri.webm', 'video/webm'),
|
||||||
|
('https://www.youtube.com/watch?v=FjNdYp2gXRY',
|
||||||
|
'https://www.youtube.com/watch?v=FjNdYp2gXRY', 'video/x-youtube'),
|
||||||
|
('http://i.imgur.com/i/image.gifv',
|
||||||
|
'http://i.imgur.com/i/image.mp4', 'video/mp4'),
|
||||||
|
('https://i.reddituploads.com/a065472e47a4405da159189ee48bff46?fit=max&h='
|
||||||
|
'1536&w=1536&s=5639918a0c696b9bb3ec694dc3cf59ac',
|
||||||
|
'https://i.reddituploads.com/a065472e47a4405da159189ee48bff46?fit=max&h='
|
||||||
|
'1536&w=1536&s=5639918a0c696b9bb3ec694dc3cf59ac', 'image/jpeg'),
|
||||||
|
('http://imgur.com/yW0kbMi',
|
||||||
|
'https://i.imgur.com/yW0kbMi.jpg', 'image/jpeg'),
|
||||||
|
('http://imgur.com/yjP1v4B',
|
||||||
|
'https://i.imgur.com/yjP1v4Bh.jpg', 'image/jpeg'),
|
||||||
|
('http://imgur.com/a/qx9t5',
|
||||||
|
'http://i.imgur.com/uEt0YLI.jpg', 'image/x-imgur-album'),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('url,modified_url,mime_type', URLS)
|
||||||
|
def test_parser(url, modified_url, mime_type, reddit):
|
||||||
|
# The reddit fixture is not used directly; it is requested only so that the cassettes get generated
|
||||||
|
|
||||||
|
for parser in parsers:
|
||||||
|
if parser.pattern.match(url):
|
||||||
|
assert parser.get_mimetype(url) == (modified_url, mime_type)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
# The base parser should catch all urls before this point
|
||||||
|
assert False
|
||||||
Reference in New Issue
Block a user