From 5da32c258a8e98eb0dde00cc1dd7e4a18a7d707c Mon Sep 17 00:00:00 2001
From: woorst <woorst@github.com>
Date: Sun, 28 May 2017 00:25:00 -0500
Subject: [PATCH 1/4] Use the Imgur API to retrieve direct links to images

---
 rtv/mime_parsers.py | 72 +++++++++++++++------------------------------
 1 file changed, 23 insertions(+), 49 deletions(-)
diff --git a/rtv/mime_parsers.py b/rtv/mime_parsers.py
index 59b978d..b8450de 100644
--- a/rtv/mime_parsers.py
+++ b/rtv/mime_parsers.py
@@ -4,6 +4,7 @@ import mimetypes
 
 import requests
 from bs4 import BeautifulSoup
+import json
 
 _logger = logging.getLogger(__name__)
 
@@ -104,64 +105,38 @@ class RedditUploadsMIMEParser(BaseMIMEParser):
 
 class ImgurMIMEParser(BaseMIMEParser):
     """
-    The majority of imgur links don't point directly to the image, so we need
-    to open the provided url and scrape the page for the link.
+    Imgur provides a JSON API exposing its entire infrastructure. Each imgur
+    page has an associated hash and can contain an album, a gallery, or a single image.
 
-    Scrape the actual image url from an imgur landing page. Imgur intentionally
-    obscures this on most reddit links in order to draw more traffic for their
-    advertisements.
-
-    There are a couple of <meta> tags that supply the relevant info:
-        <meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
-        <meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
-        <link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">
+    see https://apidocs.imgur.com
     """
     pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
 
     @staticmethod
     def get_mimetype(url):
-        page = requests.get(url)
-        soup = BeautifulSoup(page.content, 'html.parser')
-        tag = soup.find('meta', attrs={'name': 'twitter:image'})
-        if tag:
-            url = tag.get('content')
-            if GifvMIMEParser.pattern.match(url):
-                return GifvMIMEParser.get_mimetype(url)
-        return BaseMIMEParser.get_mimetype(url)
+        endpoint = 'https://api.imgur.com/3/{domain}/{page_hash}'
+        header = {'authorization': 'Client-ID {}'.format('d8842d573e8b9dd')}
 
+        pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/((?P<domain>a|album|gallery)/)?(?P<hash>.+)$')
+        m = pattern.match(url)
+        page_hash = m.group('hash')
+        domain = 'album' if m.group('domain') in ['a', 'album'] else 'gallery'
 
-class ImgurAlbumMIMEParser(BaseMIMEParser):
-    """
-    Imgur albums can contain several images, which need to be scraped from the
-    landing page. Assumes the following html structure:
+        r = requests.get(endpoint.format(domain=domain, page_hash=page_hash),
+                                         headers=header)
+        if r.status_code == 404:
+            r = requests.get(endpoint.format(domain='image',
+                                page_hash=page_hash), headers=header)
 
-        <div class="post-image">
-            <a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
-                <img class="post-image-placeholder"
-                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
-                <img class="js-post-image-thumb"
-                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
-            </a>
-        </div>
-    """
-    pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')
-
-    @staticmethod
-    def get_mimetype(url):
-        page = requests.get(url)
-        soup = BeautifulSoup(page.content, 'html.parser')
-
-        urls = []
-        for div in soup.find_all('div', class_='post-image'):
-            img = div.find('img')
-            src = img.get('src') if img else None
-            if src:
-                urls.append('http:{0}'.format(src))
-
-        if urls:
-            return " ".join(urls), 'image/x-imgur-album'
+        data = json.loads(r.text)['data']
+        if 'images' in data:
+            # TODO: handle imgur albums with mixed content, i.e. jpeg and gifv
+            urls = ' '.join([d['link'] for d in data['images'] if not d['animated']])
+            return urls, 'image/x-imgur-album'
         else:
-            return url, None
+            return (data['mp4'], 'video/mp4') if data['animated'] else (data['link'], data['type'])
+
+        return url, None
 
 
 class InstagramMIMEParser(BaseMIMEParser):
@@ -192,7 +167,6 @@ class InstagramMIMEParser(BaseMIMEParser):
 parsers = [
     InstagramMIMEParser,
     GfycatMIMEParser,
-    ImgurAlbumMIMEParser,
     ImgurMIMEParser,
     RedditUploadsMIMEParser,
     YoutubeMIMEParser,

From d123986d4f1db4de111f1068a6a6c0aebb4dbaf6 Mon Sep 17 00:00:00 2001
From: woorst <woorst@github.com>
Date: Sun, 28 May 2017 02:13:39 -0500
Subject: [PATCH 2/4] Pass unhandled imgur pages to the browser

---
 rtv/mime_parsers.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rtv/mime_parsers.py b/rtv/mime_parsers.py
index b8450de..43ef191 100644
--- a/rtv/mime_parsers.py
+++ b/rtv/mime_parsers.py
@@ -124,20 +124,20 @@ class ImgurMIMEParser(BaseMIMEParser):
 
         r = requests.get(endpoint.format(domain=domain, page_hash=page_hash),
                                          headers=header)
-        if r.status_code == 404:
+        if r.status_code != 200:
             r = requests.get(endpoint.format(domain='image',
                                 page_hash=page_hash), headers=header)
+            if r.status_code != 200:
+                return url, None
 
         data = json.loads(r.text)['data']
         if 'images' in data:
             # TODO: handle imgur albums with mixed content, i.e. jpeg and gifv
             urls = ' '.join([d['link'] for d in data['images'] if not d['animated']])
             return urls, 'image/x-imgur-album'
-        else:
+        else :
             return (data['mp4'], 'video/mp4') if data['animated'] else (data['link'], data['type'])
 
-        return url, None
-
 
 class InstagramMIMEParser(BaseMIMEParser):
     """

From 371f8db06aa61373cb2920b9d6f5737ac2021e9b Mon Sep 17 00:00:00 2001
From: woorst <woorst@github.com>
Date: Sun, 28 May 2017 09:25:28 -0500
Subject: [PATCH 3/4] Fix test for imgur gif link

---
 rtv/mime_parsers.py        | 8 +++++---
 tests/test_mime_parsers.py | 8 ++++----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/rtv/mime_parsers.py b/rtv/mime_parsers.py
index 43ef191..469f6ec 100644
--- a/rtv/mime_parsers.py
+++ b/rtv/mime_parsers.py
@@ -133,10 +133,12 @@ class ImgurMIMEParser(BaseMIMEParser):
         data = json.loads(r.text)['data']
         if 'images' in data:
             # TODO: handle imgur albums with mixed content, i.e. jpeg and gifv
-            urls = ' '.join([d['link'] for d in data['images'] if not d['animated']])
-            return urls, 'image/x-imgur-album'
+            links = ' '.join([d['link'] for d in data['images'] if not d['animated']])
+            return links.replace('http://', 'https://'), 'image/x-imgur-album'
         else :
-            return (data['mp4'], 'video/mp4') if data['animated'] else (data['link'], data['type'])
+            link = data['mp4'] if data['animated'] else data['link']
+            mime = 'video/mp4' if data['animated'] else data['type']
+            return link.replace('http://', 'https://'), mime
 
 
 class InstagramMIMEParser(BaseMIMEParser):
diff --git a/tests/test_mime_parsers.py b/tests/test_mime_parsers.py
index f281215..d7a5d9a 100644
--- a/tests/test_mime_parsers.py
+++ b/tests/test_mime_parsers.py
@@ -41,12 +41,12 @@ URLS = OrderedDict([
         'https://i.imgur.com/yW0kbMi.jpg',
         'image/jpeg')),
     ('imgur_2', (
-        'http://imgur.com/yjP1v4B',
-        'https://i.imgur.com/yjP1v4Bh.jpg',
-        'image/jpeg')),
+        'http://imgur.com/gallery/yjP1v4B',
+        'https://i.imgur.com/yjP1v4B.mp4',
+        'video/mp4')),
     ('imgur_album', (
         'http://imgur.com/a/qx9t5',
-        'http://i.imgur.com/uEt0YLI.jpg',
+        'https://i.imgur.com/uEt0YLI.jpg',
         'image/x-imgur-album')),
     ('instagram_image', (
         'https://www.instagram.com/p/BIxQ0vrBN2Y/?taken-by=kimchi_chic',

From cb4e56e0c6fa805301b590b390f8238bdc9e2a42 Mon Sep 17 00:00:00 2001
From: woorst <woorst@github.com>
Date: Mon, 29 May 2017 11:46:18 -0500
Subject: [PATCH 4/4] Use the requests Response.json() method

---
 rtv/mime_parsers.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/rtv/mime_parsers.py b/rtv/mime_parsers.py
index 469f6ec..e358082 100644
--- a/rtv/mime_parsers.py
+++ b/rtv/mime_parsers.py
@@ -4,7 +4,6 @@ import mimetypes
 
 import requests
 from bs4 import BeautifulSoup
-import json
 
 _logger = logging.getLogger(__name__)
 
@@ -130,7 +129,7 @@ class ImgurMIMEParser(BaseMIMEParser):
             if r.status_code != 200:
                 return url, None
 
-        data = json.loads(r.text)['data']
+        data = r.json()['data']
         if 'images' in data:
             # TODO: handle imgur albums with mixed content, i.e. jpeg and gifv
             links = ' '.join([d['link'] for d in data['images'] if not d['animated']])