Use imgur api to retrieve direct links to images.

This commit is contained in:
woorst
2017-05-28 00:25:00 -05:00
parent 84824f0352
commit 5da32c258a

View File

@@ -4,6 +4,7 @@ import mimetypes
import requests
from bs4 import BeautifulSoup
import json
_logger = logging.getLogger(__name__)
@@ -104,63 +105,37 @@ class RedditUploadsMIMEParser(BaseMIMEParser):
class ImgurMIMEParser(BaseMIMEParser):
"""
The majority of imgur links don't point directly to the image, so we need
to open the provided url and scrape the page for the link.
Imgur provides a json api exposing its entire infrastructure. Each imgur
page has an associated hash and can either contain an album, a gallery, or single image.
Scrape the actual image url from an imgur landing page. Imgur intentionally
obscures this on most reddit links in order to draw more traffic for their
advertisements.
There are a couple of <meta> tags that supply the relevant info:
<meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
<meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
<link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">
see https://apidocs.imgur.com
"""
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
@staticmethod
def get_mimetype(url):
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
tag = soup.find('meta', attrs={'name': 'twitter:image'})
if tag:
url = tag.get('content')
if GifvMIMEParser.pattern.match(url):
return GifvMIMEParser.get_mimetype(url)
return BaseMIMEParser.get_mimetype(url)
endpoint = 'https://api.imgur.com/3/{domain}/{page_hash}'
header = {'authorization': 'Client-ID {}'.format('d8842d573e8b9dd')}
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/((?P<domain>a|album|gallery)/)?(?P<hash>.+)$')
m = pattern.match(url)
page_hash = m.group('hash')
domain = 'album' if m.group('domain') in ['a', 'album'] else 'gallery'
class ImgurAlbumMIMEParser(BaseMIMEParser):
"""
Imgur albums can contain several images, which need to be scraped from the
landing page. Assumes the following html structure:
r = requests.get(endpoint.format(domain=domain, page_hash=page_hash),
headers=header)
if r.status_code == 404:
r = requests.get(endpoint.format(domain='image',
page_hash=page_hash), headers=header)
<div class="post-image">
<a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
<img class="post-image-placeholder"
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
<img class="js-post-image-thumb"
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
</a>
</div>
"""
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')
@staticmethod
def get_mimetype(url):
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
urls = []
for div in soup.find_all('div', class_='post-image'):
img = div.find('img')
src = img.get('src') if img else None
if src:
urls.append('http:{0}'.format(src))
if urls:
return " ".join(urls), 'image/x-imgur-album'
data = json.loads(r.text)['data']
if 'images' in data:
# TODO: handle imgur albums with mixed content, i.e. jpeg and gifv
urls = ' '.join([d['link'] for d in data['images'] if not d['animated']])
return urls, 'image/x-imgur-album'
else:
return (data['mp4'], 'video/mp4') if data['animated'] else (data['link'], data['type'])
return url, None
@@ -192,7 +167,6 @@ class InstagramMIMEParser(BaseMIMEParser):
parsers = [
InstagramMIMEParser,
GfycatMIMEParser,
ImgurAlbumMIMEParser,
ImgurMIMEParser,
RedditUploadsMIMEParser,
YoutubeMIMEParser,