# tuir/rtv/mime_handlers.py

import re
import logging
import mimetypes

import requests
from six.moves.html_parser import HTMLParser

_logger = logging.getLogger(__name__)

# HTML Parsers


class HTMLParsed(Exception):
    """
    Signal raised from inside an HTML parser callback to stop parsing early
    and hand a result back to whoever called ``feed()``.

    Attributes:
        data: The payload carried by the exception (whatever value the
            raising parser passed in).
    """

    def __init__(self, data):
        # The payload is read back by the code that catches the exception
        self.data = data

# TODO: open temp file, close after 60 seconds with threading.Timer
# TODO: switch to bs4 with "html.parser"
# TODO: Add media_readme.rst
# TODO: Add environment variables to config

class ImgurHTMLParser(HTMLParser):
    """
    Scrape the actual image url from an imgur landing page. Imgur intentionally
    obscures this on most reddit links in order to draw more traffic for their
    advertisements.

    There are a couple of <meta> tags that supply the relevant info:
        <meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
        <meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
        <link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">

    Only the "twitter:image" tag is used. As soon as it is found, parsing is
    aborted by raising HTMLParsed with the image url as its payload.

    Note:
        BeautifulSoup or lxml would be faster here but I wanted to skip adding
        an extra dependency for something this trivial.
    """
    def handle_starttag(self, tag, attr):
        """
        Inspect each opening tag, looking for the twitter:image <meta> tag.

        Args:
            tag (text): Lower-cased tag name.
            attr (list): List of (name, value) pairs for the tag's attributes.

        Raises:
            HTMLParsed: When a <meta name="twitter:image"> tag with a
                non-empty "content" attribute is found. The exception's
                ``data`` is the content value.
        """
        if tag != 'meta':
            return

        # Look attributes up by name rather than by position so that a bare
        # <meta> tag doesn't raise IndexError and so that the attribute order
        # in the page's html doesn't matter.
        attributes = dict(attr)
        if attributes.get('name') != 'twitter:image':
            return

        content = attributes.get('content')
        if content:
            raise HTMLParsed(content)


class ImgurAlbumHTMLParser(HTMLParser):
    """
    Scrape the complete list of images from an imgur album. The HTML parser is
    very limited, so this assumes the following html structure:

        <div class="post-image">
            <a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
                <img class="post-image-placeholder"
                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
                <img class="js-post-image-thumb"
                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
            </a>
        </div>

    After feeding a page, the collected links are available in ``hrefs`` in
    the order that they appeared.
    """
    def reset(self):
        """
        Reset the parser state. This is also invoked by the base class
        constructor, which is why the attributes are initialized here.
        """
        super(ImgurAlbumHTMLParser, self).reset()
        # True when the previous opening tag was <div class="post-image">
        self.primed = False
        # href values collected so far
        self.hrefs = []

    def handle_starttag(self, tag, attr):
        """
        Record the href of an <a> tag that directly follows a
        <div class="post-image"> opening tag.

        Args:
            tag (text): Lower-cased tag name.
            attr (list): List of (name, value) pairs for the tag's attributes.
        """
        if tag == 'div' and ('class', 'post-image') in attr:
            self.primed = True
        elif self.primed:
            # Only the tag immediately after the div is considered
            self.primed = False
            if tag == 'a':
                # Look the href up by name so that attribute order doesn't
                # matter and an attribute-less <a> doesn't raise IndexError.
                # Valueless hrefs (None) are skipped.
                href = dict(attr).get('href')
                if href:
                    self.hrefs.append(href)


# MIME Parsers

class BaseMIMEParser(object):
    """
    BaseMIMEParser can be sub-classed to define custom handlers for determining
    the MIME type of external urls.

    Sub-classes override ``pattern`` with a regex describing the urls that
    they handle. The pattern defined here matches any url.
    """
    pattern = re.compile(r'.*$')

    @staticmethod
    def get_mimetype(url):
        """
        Guess based on the file extension.

        Args:
            url (text): Web url that was linked to by a reddit submission.

        Returns:
            modified_url (text): The url (or filename) that will be used when
                constructing the command to run.
            content_type (text): The mime-type that will be used when
                constructing the command to run. If the mime-type is unknown,
                return None and the program will fallback to using the web
                browser.
        """
        # Drop the query string and the fragment so that neither can hide
        # the file extension (e.g. "pic.jpg?1" or "pic.jpg#anchor")
        filename = url.split('?')[0].split('#')[0]
        content_type, _ = mimetypes.guess_type(filename)
        # The url itself is returned untouched
        return url, content_type


class GfycatMIMEParser(BaseMIMEParser):
    """
    Resolve gfycat landing pages to a direct video link using gfycat's
    primitive json api. The api can supply gif, webm, or mjpg urls; webm was
    selected because it's fast and works with VLC.

    https://gfycat.com/api
        https://gfycat.com/UntidyAcidicIberianemeraldlizard -->
        https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm
    """
    pattern = re.compile(r'https?://(www\.)?gfycat\.com/[^.]+$')

    @staticmethod
    def get_mimetype(url):
        """
        Query the gfycat api for the webm url of the linked item.

        Args:
            url (text): Gfycat url that was linked to by a reddit submission.

        Returns:
            modified_url (text): The "webmUrl" reported by the api.
            content_type (text): Always 'video/webm'.
        """
        # Insert "cajax/get" just in front of the last path segment
        segments = url.split('/')
        segments[-1:-1] = ['cajax', 'get']
        api_url = '/'.join(segments)

        response = requests.get(api_url)
        gfy_item = response.json()['gfyItem']
        return gfy_item['webmUrl'], 'video/webm'


class YoutubeMIMEParser(BaseMIMEParser):
    """
    Tag youtube links with a custom mime-type so that they can be referenced
    in mailcap. The videos can then be streamed with vlc or downloaded with
    youtube-dl.
    """
    pattern = re.compile(
        r'(?:https?://)?(m\.)?(?:youtu\.be/|(?:www\.)?youtube\.com/watch'
        r'(?:\.php)?\'?.*v=)([a-zA-Z0-9\-_]+)'
    )

    @staticmethod
    def get_mimetype(url):
        """
        Args:
            url (text): Youtube url that was linked to by a reddit submission.

        Returns:
            modified_url (text): The url, unchanged.
            content_type (text): Always the custom type 'video/x-youtube'.
        """
        content_type = 'video/x-youtube'
        return url, content_type


class GifvMIMEParser(BaseMIMEParser):
    """
    Special case for .gifv, which is a custom video format that imgur serves
    as html with a special <video> frame. Note that attempting to download it
    as .webm also returns this html page. However, .mp4 appears to return the
    raw video file.
    """
    pattern = re.compile(r'.*[.]gifv$')

    @staticmethod
    def get_mimetype(url):
        """
        Args:
            url (text): Url that is expected to end in ".gifv".

        Returns:
            modified_url (text): The url with its last four characters
                replaced by "mp4".
            content_type (text): Always 'video/mp4'.
        """
        # Chop off the trailing "gifv" but keep the dot in front of it
        stem = url[:-len('gifv')]
        return stem + 'mp4', 'video/mp4'


class RedditUploadsMIMEParser(BaseMIMEParser):
    """
    Reddit uploads do not have a file extension, so the mime-type is read
    from the Content-Type header of the response instead.
    """
    pattern = re.compile(r'https://i\.reddituploads\.com/.+$')

    @staticmethod
    def get_mimetype(url):
        """
        Issue a HEAD request and report the advertised content type.

        Args:
            url (text): Reddit uploads url linked to by a submission.

        Returns:
            modified_url (text): The url, unchanged.
            content_type (text): The Content-Type header with everything
                from the first ";" onwards removed. This is an empty string
                if the header is missing.
        """
        response = requests.head(url)
        header = response.headers.get('Content-Type', '')
        # Keep only the part before the first ";" (strips out the encoding)
        content_type = header.partition(';')[0]
        return url, content_type


class ImgurMIMEParser(BaseMIMEParser):
    """
    Most imgur links point to a landing page instead of the image itself, so
    the page has to be downloaded and scraped for the real link.
    """
    pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')

    @staticmethod
    def get_mimetype(url):
        """
        Download the landing page and guess the mime-type of the image url
        that it advertises.

        Args:
            url (text): Imgur url that was linked to by a reddit submission.

        Returns:
            modified_url (text): The scraped image url (converted to .mp4 for
                .gifv links), or the original url if nothing was scraped.
            content_type (text): The mime-type guessed for that url, or None
                if it could not be determined.
        """
        response = requests.get(url)
        # convert_charrefs will be true by default in python 3.5
        scraper = ImgurHTMLParser(convert_charrefs=True)
        try:
            scraper.feed(response.text)
        except HTMLParsed as found:
            # The scraper aborts with the image link as its payload
            url = found.data
            if GifvMIMEParser.pattern.match(url):
                return GifvMIMEParser.get_mimetype(url)

        # Either no link was found or it was a regular (non-gifv) link
        return BaseMIMEParser.get_mimetype(url)


class ImgurAlbumMIMEParser(BaseMIMEParser):
    """
    An imgur album can hold several images. Their links need to be scraped
    from the album's landing page.
    """
    pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')

    @staticmethod
    def get_mimetype(url):
        """
        Download the album page and collect the links to all of its images.

        Args:
            url (text): Imgur album url linked to by a reddit submission.

        Returns:
            modified_url (text): The image urls joined with "' '", or the
                original url if no images were found.
            content_type (text): 'image/x-imgur-album' when at least one
                image was found, otherwise None.
        """
        response = requests.get(url)
        album_parser = ImgurAlbumHTMLParser(convert_charrefs=True)

        try:
            album_parser.feed(response.text)
        except Exception as err:
            # Scraping is best-effort, a failure is logged and treated the
            # same as an album without any images
            _logger.warning(err)
            return url, None

        # The scraped hrefs are prefixed with a scheme to form full urls
        image_urls = ['http:' + href for href in album_parser.hrefs]
        if not image_urls:
            return url, None

        return "' '".join(image_urls), 'image/x-imgur-album'


# Parsers should be listed in the order they will be checked. The order
# matters because some patterns overlap: the album parser must come before
# the generic imgur parser (whose pattern also matches album urls), and
# BaseMIMEParser must come last because its pattern matches everything.
parsers = [
    GfycatMIMEParser,
    ImgurAlbumMIMEParser,
    ImgurMIMEParser,
    RedditUploadsMIMEParser,
    YoutubeMIMEParser,
    GifvMIMEParser,
    BaseMIMEParser,
]