Merge branch 'imgur_api' of https://github.com/woorst/rtv into woorst-imgur_api

Michael Lazar committed on 2017-06-02 23:55:04 -04:00
2 changed files with 29 additions and 54 deletions

View File

@@ -130,64 +130,40 @@ class RedditUploadsMIMEParser(BaseMIMEParser):
 class ImgurMIMEParser(BaseMIMEParser):
     """
-    The majority of imgur links don't point directly to the image, so we need
-    to open the provided url and scrape the page for the link.
-
-    Scrape the actual image url from an imgur landing page. Imgur intentionally
-    obscures this on most reddit links in order to draw more traffic for their
-    advertisements.
-
-    There are a couple of <meta> tags that supply the relevant info:
-        <meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
-        <meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
-        <link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">
+    Imgur provides a json api exposing its entire infrastructure. Each
+    imgur page has an associated hash and can contain an album, a gallery,
+    or a single image.
+
+    See https://apidocs.imgur.com
     """
     pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
 
     @staticmethod
     def get_mimetype(url):
-        page = requests.get(url)
-        soup = BeautifulSoup(page.content, 'html.parser')
-        tag = soup.find('meta', attrs={'name': 'twitter:image'})
-        if tag:
-            url = tag.get('content')
-            if GifvMIMEParser.pattern.match(url):
-                return GifvMIMEParser.get_mimetype(url)
-        return BaseMIMEParser.get_mimetype(url)
+        endpoint = 'https://api.imgur.com/3/{domain}/{page_hash}'
+        header = {'authorization': 'Client-ID {}'.format('d8842d573e8b9dd')}
+        pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/((?P<domain>a|album|gallery)/)?(?P<hash>.+)$')
+        m = pattern.match(url)
+        page_hash = m.group('hash')
+        domain = 'album' if m.group('domain') in ['a', 'album'] else 'gallery'
+        r = requests.get(endpoint.format(domain=domain, page_hash=page_hash),
+                         headers=header)
+        if r.status_code != 200:
+            r = requests.get(endpoint.format(domain='image',
+                                             page_hash=page_hash), headers=header)
+        if r.status_code != 200:
+            return url, None
+        data = r.json()['data']
+        if 'images' in data:
+            # TODO: handle imgur albums with mixed content, i.e. jpeg and gifv
+            links = ' '.join([d['link'] for d in data['images'] if not d['animated']])
+            return links.replace('http://', 'https://'), 'image/x-imgur-album'
+        else:
+            link = data['mp4'] if data['animated'] else data['link']
+            mime = 'video/mp4' if data['animated'] else data['type']
+            return link.replace('http://', 'https://'), mime
 
 
-class ImgurAlbumMIMEParser(BaseMIMEParser):
-    """
-    Imgur albums can contain several images, which need to be scraped from the
-    landing page. Assumes the following html structure:
-
-        <div class="post-image">
-            <a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
-                <img class="post-image-placeholder"
-                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
-                <img class="js-post-image-thumb"
-                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
-            </a>
-        </div>
-    """
-    pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')
-
-    @staticmethod
-    def get_mimetype(url):
-        page = requests.get(url)
-        soup = BeautifulSoup(page.content, 'html.parser')
-        urls = []
-        for div in soup.find_all('div', class_='post-image'):
-            img = div.find('img')
-            src = img.get('src') if img else None
-            if src:
-                urls.append('http:{0}'.format(src))
-        if urls:
-            return " ".join(urls), 'image/x-imgur-album'
-        else:
-            return url, None
-
-
 class InstagramMIMEParser(OpenGraphMIMEParser):
@@ -227,7 +203,6 @@ parsers = [
     VidmeMIMEParser,
     InstagramMIMEParser,
     GfycatMIMEParser,
-    ImgurAlbumMIMEParser,
     ImgurMIMEParser,
     RedditUploadsMIMEParser,
     YoutubeMIMEParser,
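
The rewritten get_mimetype does three things: extract the page hash from the url, query the imgur API for it (the album or gallery endpoint first, then the image endpoint as a fallback), and map the JSON payload to the (url, mimetype) tuple rtv expects. That last mapping step is easiest to see in isolation. The sketch below runs offline; resolve() is an illustrative stand-in for the tail of the diff, and the sample payloads are modeled on the updated test fixtures, not captured API responses.

def resolve(data):
    """Map an imgur API 'data' payload to (link, mimetype).

    `data` stands in for requests.get(...).json()['data'] in the diff;
    the payloads passed below are hypothetical, not real API output.
    """
    if 'images' in data:
        # Album: join the direct links of every non-animated image into
        # one space-separated string under the custom album mimetype.
        links = ' '.join(d['link'] for d in data['images'] if not d['animated'])
        return links.replace('http://', 'https://'), 'image/x-imgur-album'
    # Gallery post or single image: animated items resolve to their mp4.
    link = data['mp4'] if data['animated'] else data['link']
    mime = 'video/mp4' if data['animated'] else data['type']
    return link.replace('http://', 'https://'), mime

# An animated gallery post, shaped like the updated 'imgur_2' fixture.
print(resolve({'animated': True,
               'mp4': 'http://i.imgur.com/yjP1v4B.mp4',
               'link': 'http://i.imgur.com/yjP1v4B.gif',
               'type': 'image/gif'}))
# -> ('https://i.imgur.com/yjP1v4B.mp4', 'video/mp4')

# An album with one static image, shaped like the 'imgur_album' fixture.
print(resolve({'images': [{'link': 'http://i.imgur.com/uEt0YLI.jpg',
                           'animated': False}]}))
# -> ('https://i.imgur.com/uEt0YLI.jpg', 'image/x-imgur-album')

Note the TODO in the diff: an album mixing static and animated content silently drops the animated items, since only non-animated links are joined.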

View File

@@ -46,12 +46,12 @@ URLS = OrderedDict([
         'https://i.imgur.com/yW0kbMi.jpg',
         'image/jpeg')),
     ('imgur_2', (
-        'http://imgur.com/yjP1v4B',
-        'https://i.imgur.com/yjP1v4Bh.jpg',
-        'image/jpeg')),
+        'http://imgur.com/gallery/yjP1v4B',
+        'https://i.imgur.com/yjP1v4B.mp4',
+        'video/mp4')),
     ('imgur_album', (
         'http://imgur.com/a/qx9t5',
-        'http://i.imgur.com/uEt0YLI.jpg',
+        'https://i.imgur.com/uEt0YLI.jpg',
         'image/x-imgur-album')),
     ('instagram_image', (
         'https://www.instagram.com/p/BIxQ0vrBN2Y/?taken-by=kimchi_chic',
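
The imgur_2 fixture changes because the hash is now resolved through the API instead of scraped from the landing page: the gallery post is animated, so the parser returns its mp4 rather than the 'h'-suffixed jpeg thumbnail that the old twitter:image scraper produced. Below is a runnable sketch of the routing step, with the pattern copied from the diff and the endpoint choice mirroring get_mimetype:

import re

pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/'
                     r'((?P<domain>a|album|gallery)/)?(?P<hash>.+)$')

for url in ('http://imgur.com/gallery/yjP1v4B', 'http://imgur.com/a/qx9t5'):
    m = pattern.match(url)
    # 'a' and 'album' urls hit /3/album; everything else tries /3/gallery
    # first, retrying /3/image on a non-200 response.
    domain = 'album' if m.group('domain') in ['a', 'album'] else 'gallery'
    print('https://api.imgur.com/3/{}/{}'.format(domain, m.group('hash')))

# Output:
# https://api.imgur.com/3/gallery/yjP1v4B
# https://api.imgur.com/3/album/qx9t5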