Files
tuir/rtv/mime_handlers.py
2016-07-15 18:33:07 -07:00

222 lines
6.9 KiB
Python

import re
import logging
import mimetypes
import requests
from six.moves.html_parser import HTMLParser
# Module-level logger, named after this module's import path.
_logger = logging.getLogger(__name__)
# HTML Parsers
class HTMLParsed(Exception):
    """
    Raised from inside an HTML parser callback to stop parsing as soon as the
    value being searched for has been found.

    Args:
        data: The value extracted from the page (e.g. an image url). It is
            exposed as the ``data`` attribute.
    """

    def __init__(self, data):
        # Forward the payload to Exception so that ``args``, ``str()`` and
        # ``repr()`` carry it instead of being empty.
        super(HTMLParsed, self).__init__(data)
        self.data = data
# TODO: open temp file, close after 60 seconds with thread.timer()
# TODO: switch to bs4 with "html.parser"
# TODO: Add media_readme.rst
# TODO: Add environment variables to config
class ImgurHTMLParser(HTMLParser):
    """
    Scrape the actual image url from an imgur landing page. Imgur intentionally
    obscures this on most reddit links in order to draw more traffic for their
    advertisements.

    There are a couple of <meta> tags that supply the relevant info:
        <meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
        <meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
        <link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">

    Only the ``twitter:image`` tag is used. When it is found, parsing is
    aborted by raising ``HTMLParsed`` with the image url as its payload.

    Note:
        BeautifulSoup or lxml would be faster here but I wanted to skip adding
        an extra dependency for something this trivial.
    """

    def handle_starttag(self, tag, attr):
        """
        Inspect each start tag and raise once the image url is located.

        Args:
            tag (text): Lower-cased tag name.
            attr (list): ``(name, value)`` pairs for the tag's attributes.

        Raises:
            HTMLParsed: Carrying the ``content`` value of the
                ``twitter:image`` meta tag.
        """
        if tag != 'meta':
            return

        # Look attributes up by name rather than by position, so that tags
        # with no attributes, a single attribute, or a different attribute
        # order do not raise IndexError or yield the wrong value.
        attributes = dict(attr)
        if attributes.get('name') != 'twitter:image':
            return

        content = attributes.get('content')
        if content:
            raise HTMLParsed(content)
class ImgurAlbumHTMLParser(HTMLParser):
    """
    Scrape the complete list of images from an imgur album. The HTML parser is
    very limited, so this assumes the following html structure:

        <div class="post-image">
            <a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
                <img class="post-image-placeholder"
                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
                <img class="js-post-image-thumb"
                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
            </a>
        </div>

    After feeding a page, the collected link targets are available in
    ``hrefs`` in document order.
    """

    def reset(self):
        """
        Reset the parser state, including the collected hrefs.

        HTMLParser calls this implicitly when an instance is created, which is
        how ``primed`` and ``hrefs`` get their initial values.
        """
        super(ImgurAlbumHTMLParser, self).reset()
        # True when the previous start tag was <div class="post-image">
        self.primed = False
        self.hrefs = []

    def handle_starttag(self, tag, attr):
        """
        Record the href of an <a> tag that directly follows a post-image div.

        Args:
            tag (text): Lower-cased tag name.
            attr (list): ``(name, value)`` pairs for the tag's attributes.
        """
        if tag == 'div' and ('class', 'post-image') in attr:
            self.primed = True
            return

        if not self.primed:
            return

        # Only the tag immediately after the div is considered.
        self.primed = False
        if tag == 'a':
            # Look up href by name so an attribute-less <a>, or one where
            # href is not the first attribute, cannot raise IndexError or be
            # missed. Skip empty/valueless hrefs.
            href = dict(attr).get('href')
            if href:
                self.hrefs.append(href)
# MIME Parsers
class BaseMIMEParser(object):
    """
    BaseMIMEParser can be sub-classed to define custom handlers for determining
    the MIME type of external urls.

    Attributes:
        pattern: Compiled regex that a url must match for this parser to be
            applicable. The base pattern matches every url.
    """
    pattern = re.compile(r'.*$')

    @staticmethod
    def get_mimetype(url):
        """
        Guess based on the file extension.

        Args:
            url (text): Web url that was linked to by a reddit submission.

        Returns:
            modified_url (text): The url (or filename) that will be used when
                constructing the command to run.
            content_type (text): The mime-type that will be used when
                constructing the command to run. If the mime-type is unknown,
                return None and the program will fallback to using the web
                browser.
        """
        # Drop both the query string and the fragment before guessing, so
        # that neither "a.jpg?x=1" nor "a.jpg#top" hides the extension. Only
        # the guess uses the trimmed name; the url is returned untouched.
        filename = url.split('?')[0].split('#')[0]
        content_type, _ = mimetypes.guess_type(filename)
        return url, content_type
class GfycatMIMEParser(BaseMIMEParser):
    """
    Gfycat provides a primitive json api to generate image links. URLs can be
    downloaded as either gif, webm, or mjpg. Webm was selected because it's
    fast and works with VLC.

        https://gfycat.com/api

        https://gfycat.com/UntidyAcidicIberianemeraldlizard -->
        https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm
    """
    pattern = re.compile(r'https?://(www\.)?gfycat\.com/[^.]+$')

    @staticmethod
    def get_mimetype(url):
        """
        Ask the gfycat json api for the webm url of the linked item.

        Args:
            url (text): Gfycat landing page url.

        Returns:
            The ``webmUrl`` reported by the api, and 'video/webm'.
        """
        # Insert "cajax/get" just before the final path segment to form the
        # api endpoint for this item.
        segments = url.split('/')
        segments[-1:-1] = ['cajax', 'get']
        response = requests.get('/'.join(segments))
        payload = response.json()
        return payload['gfyItem']['webmUrl'], 'video/webm'
class YoutubeMIMEParser(BaseMIMEParser):
    """
    Youtube videos can be streamed with vlc or downloaded with youtube-dl.
    Assign a custom mime-type so they can be referenced in mailcap.
    """
    pattern = re.compile(
        r'(?:https?://)?(m\.)?(?:youtu\.be/|(?:www\.)?youtube\.com/watch'
        r'(?:\.php)?\'?.*v=)([a-zA-Z0-9\-_]+)')

    @staticmethod
    def get_mimetype(url):
        """
        Tag the url with the custom youtube mime-type.

        Args:
            url (text): Youtube video url.

        Returns:
            The url, unchanged, and 'video/x-youtube'.
        """
        content_type = 'video/x-youtube'
        return url, content_type
class GifvMIMEParser(BaseMIMEParser):
    """
    Special case for .gifv, a custom video format that imgur serves as an
    html page with a special <video> frame. Note that attempting to download
    it as .webm also returns this html page. However, .mp4 appears to return
    the raw video file.
    """
    pattern = re.compile(r'.*[.]gifv$')

    @staticmethod
    def get_mimetype(url):
        """
        Swap the trailing "gifv" extension for "mp4".

        Args:
            url (text): Url ending in ".gifv".

        Returns:
            The url with its last four characters replaced by "mp4", and
            'video/mp4'.
        """
        stem = url[:-len('gifv')]
        return stem + 'mp4', 'video/mp4'
class RedditUploadsMIMEParser(BaseMIMEParser):
    """
    Reddit uploads do not have a file extension, but we can grab the mime-type
    from the page header.
    """
    pattern = re.compile(r'https://i\.reddituploads\.com/.+$')

    @staticmethod
    def get_mimetype(url):
        """
        Read the mime-type from the Content-Type header of a HEAD request.

        Args:
            url (text): Reddit uploads url.

        Returns:
            The url, unchanged, and the header's media type with any
            parameters (e.g. the charset) removed. The type is None when the
            header is missing or empty, which matches the "unknown mime-type"
            convention documented on BaseMIMEParser.get_mimetype.
        """
        page = requests.head(url)
        header = page.headers.get('Content-Type', '')
        # Keep only the media type: drop ";charset=..." style parameters and
        # any whitespace that surrounded the separator.
        content_type = header.split(';')[0].strip()
        # Normalise an absent/empty header to None instead of ''.
        return url, content_type or None
class ImgurMIMEParser(BaseMIMEParser):
    """
    The majority of imgur links don't point directly to the image, so we need
    to open the provided url and scrape the page for the link.
    """
    pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')

    @staticmethod
    def get_mimetype(url):
        """
        Fetch the landing page and resolve the direct media url from it.

        Args:
            url (text): Imgur landing page url.

        Returns:
            The result of GifvMIMEParser.get_mimetype when the resolved url
            matches the gifv pattern, otherwise the result of
            BaseMIMEParser.get_mimetype. If nothing is scraped from the page,
            the original url is the one that gets resolved.
        """
        response = requests.get(url)
        try:
            # convert_charrefs will be true by default in python 3.5
            scraper = ImgurHTMLParser(convert_charrefs=True)
            scraper.feed(response.text)
        except HTMLParsed as found:
            # The scraper signals a hit by raising; its payload is the link.
            url = found.data

        if GifvMIMEParser.pattern.match(url):
            handler = GifvMIMEParser
        else:
            handler = BaseMIMEParser
        return handler.get_mimetype(url)
class ImgurAlbumMIMEParser(BaseMIMEParser):
    """
    Imgur albums can contain several images, which need to be scraped from the
    landing page.
    """
    pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')

    @staticmethod
    def get_mimetype(url):
        """
        Fetch the album page and collect the urls of all of its images.

        Args:
            url (text): Imgur album url.

        Returns:
            On success, the image urls joined with "' '" and the custom
            mime-type 'image/x-imgur-album'. If the page cannot be parsed or
            contains no images, the original url and None.
        """
        imgur_page = requests.get(url)
        parser = ImgurAlbumHTMLParser(convert_charrefs=True)
        try:
            parser.feed(imgur_page.text)
        except Exception as e:
            # Deliberately best-effort: a page that cannot be parsed is
            # logged and treated the same as an album with no images.
            _logger.warning(e)
            return url, None

        # Album links are expected to be protocol-relative ("//i.imgur.com/…")
        # and need a scheme. Hrefs that are not protocol-relative are passed
        # through unchanged rather than being mangled into "http:https://…",
        # and empty/None hrefs are dropped instead of raising TypeError.
        urls = [
            'http:' + href if href.startswith('//') else href
            for href in parser.hrefs if href]
        if not urls:
            return url, None

        # NOTE(review): the "' '" separator presumably relies on the caller
        # wrapping the whole string in single quotes for a shell command;
        # confirm that scraped urls containing a quote are handled safely.
        return "' '".join(urls), 'image/x-imgur-album'
# Parsers should be listed in the order they will be checked
parsers = [
    GfycatMIMEParser,
    # The album parser precedes the generic imgur parser because the generic
    # imgur pattern also matches album urls.
    ImgurAlbumMIMEParser,
    ImgurMIMEParser,
    RedditUploadsMIMEParser,
    YoutubeMIMEParser,
    GifvMIMEParser,
    # The base pattern matches every url, so it is listed last.
    BaseMIMEParser,
]