Merge branch 'imgur_api' of https://github.com/woorst/rtv into woorst-imgur_api

2017-06-02 23:55:04 -04:00
parent 332ad463f7 cb4e56e0c6
commit fd6e6cc8ec
2 changed files with 29 additions and 54 deletions
--- a/rtv/mime_parsers.py
+++ b/rtv/mime_parsers.py
@@ -130,65 +130,41 @@ class RedditUploadsMIMEParser(BaseMIMEParser):
 class ImgurMIMEParser(BaseMIMEParser):
    """
-    The majority of imgur links don't point directly to the image, so we need
+    Imgur provides a json api exposing its entire infrastructure. Each imgur
-    to open the provided url and scrape the page for the link.
+    page has an associated hash and can contain either an album, a gallery, or a single image.
-    Scrape the actual image url from an imgur landing page. Imgur intentionally
+    See https://apidocs.imgur.com for details.
    obscures this on most reddit links in order to draw more traffic for their
    advertisements.
    There are a couple of <meta> tags that supply the relevant info:
        <meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
        <meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
        <link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">
    """
    pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
    @staticmethod
    def get_mimetype(url):
-        page = requests.get(url)
+        endpoint = 'https://api.imgur.com/3/{domain}/{page_hash}'
-        soup = BeautifulSoup(page.content, 'html.parser')
+        header = {'authorization': 'Client-ID {}'.format('d8842d573e8b9dd')}
        tag = soup.find('meta', attrs={'name': 'twitter:image'})
        if tag:
            url = tag.get('content')
            if GifvMIMEParser.pattern.match(url):
                return GifvMIMEParser.get_mimetype(url)
        return BaseMIMEParser.get_mimetype(url)
        pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/((?P<domain>a|album|gallery)/)?(?P<hash>.+)$')
        m = pattern.match(url)
        page_hash = m.group('hash')
        domain = 'album' if m.group('domain') in ['a', 'album'] else 'gallery'
-class ImgurAlbumMIMEParser(BaseMIMEParser):
+        r = requests.get(endpoint.format(domain=domain, page_hash=page_hash),
-    """
+                                         headers=header)
-    Imgur albums can contain several images, which need to be scraped from the
+        if r.status_code != 200:
-    landing page. Assumes the following html structure:
+            r = requests.get(endpoint.format(domain='image',
-
+                                page_hash=page_hash), headers=header)
-        <div class="post-image">
+            if r.status_code != 200:
            <a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
                <img class="post-image-placeholder"
                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
                <img class="js-post-image-thumb"
                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
            </a>
        </div>
    """
    pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')
    @staticmethod
    def get_mimetype(url):
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        urls = []
        for div in soup.find_all('div', class_='post-image'):
            img = div.find('img')
            src = img.get('src') if img else None
            if src:
                urls.append('http:{0}'.format(src))
        if urls:
            return " ".join(urls), 'image/x-imgur-album'
        else:
                return url, None
        data = r.json()['data']
        if 'images' in data:
            # TODO: handle imgur albums with mixed content, i.e. jpeg and gifv
            links = ' '.join([d['link'] for d in data['images'] if not d['animated']])
            return links.replace('http://', 'https://'), 'image/x-imgur-album'
        else :
            link = data['mp4'] if data['animated'] else data['link']
            mime = 'video/mp4' if data['animated'] else data['type']
            return link.replace('http://', 'https://'), mime
 class InstagramMIMEParser(OpenGraphMIMEParser):
    """
@@ -227,7 +203,6 @@ parsers = [
    VidmeMIMEParser,
    InstagramMIMEParser,
    GfycatMIMEParser,
    ImgurAlbumMIMEParser,
    ImgurMIMEParser,
    RedditUploadsMIMEParser,
    YoutubeMIMEParser,
--- a/tests/test_mime_parsers.py
+++ b/tests/test_mime_parsers.py
@@ -46,12 +46,12 @@ URLS = OrderedDict([
        'https://i.imgur.com/yW0kbMi.jpg',
        'image/jpeg')),
    ('imgur_2', (
-        'http://imgur.com/yjP1v4B',
+        'http://imgur.com/gallery/yjP1v4B',
-        'https://i.imgur.com/yjP1v4Bh.jpg',
+        'https://i.imgur.com/yjP1v4B.mp4',
-        'image/jpeg')),
+        'video/mp4')),
    ('imgur_album', (
        'http://imgur.com/a/qx9t5',
-        'http://i.imgur.com/uEt0YLI.jpg',
+        'https://i.imgur.com/uEt0YLI.jpg',
        'image/x-imgur-album')),
    ('instagram_image', (
        'https://www.instagram.com/p/BIxQ0vrBN2Y/?taken-by=kimchi_chic',