mime parser for reddit hosted video
This commit is contained in:
@@ -128,6 +128,28 @@ class RedditUploadsMIMEParser(BaseMIMEParser):
|
|||||||
return url, content_type
|
return url, content_type
|
||||||
|
|
||||||
|
|
||||||
|
class RedditVideoMIMEParser(BaseMIMEParser):
    """
    Reddit hosted videos/gifs.

    Media uses MPEG-DASH format (.mpd). The playlist is fetched from
    ``<url>/DASHPlaylist.mpd`` and inspected:

    * If it has no audio track, the media is treated as a silent "gif" and
      the direct URL of the highest-bandwidth mp4 rendition is returned
      with the ``video/mp4`` type.
    * Otherwise the playlist URL itself is returned with the
      ``video/x-youtube`` type.
    """

    pattern = re.compile(r'https://v\.redd\.it/.+$')

    @staticmethod
    def get_mimetype(url):
        """
        Resolve a v.redd.it link to a playable URL and its mime type.

        Args:
            url: A URL matching ``pattern``, e.g. ``https://v.redd.it/<id>``.

        Returns:
            A ``(url, content_type)`` tuple.
        """
        request_url = url + '/DASHPlaylist.mpd'
        page = requests.get(request_url)

        # NOTE(review): the lookups below use lower-case tag and attribute
        # names although the MPD document is XML; this relies on
        # 'html.parser' normalising names to lower case - confirm against
        # the Beautiful Soup documentation before switching parsers.
        soup = BeautifulSoup(page.content, 'html.parser')

        audio = soup.find('representation', attrs={'mimetype': 'audio/mp4'})
        if audio is not None:
            return request_url, 'video/x-youtube'

        # Only keep renditions that actually carry a BaseURL, so that a
        # malformed entry cannot trigger an AttributeError below.
        candidates = [
            rep for rep in soup.find_all('representation',
                                         attrs={'mimetype': 'video/mp4'})
            if rep.find('baseurl') is not None]

        if not candidates:
            # Empty or unexpected playlist: fall back to the playlist URL
            # instead of raising IndexError on an empty list.
            return request_url, 'video/x-youtube'

        # max() returns the first of several equally-high entries, which is
        # the same element a stable descending sort would place at index 0.
        # A missing bandwidth attribute is ranked lowest rather than crashing.
        best = max(candidates, key=lambda rep: int(rep.get('bandwidth') or 0))
        return url + '/' + best.find('baseurl').text, 'video/mp4'
|
||||||
|
|
||||||
|
|
||||||
class ImgurApiMIMEParser(BaseMIMEParser):
|
class ImgurApiMIMEParser(BaseMIMEParser):
|
||||||
"""
|
"""
|
||||||
Imgur now provides a json API exposing its entire infrastructure. Each Imgur
|
Imgur now provides a json API exposing its entire infrastructure. Each Imgur
|
||||||
@@ -335,6 +357,7 @@ parsers = [
|
|||||||
GfycatMIMEParser,
|
GfycatMIMEParser,
|
||||||
ImgurApiMIMEParser,
|
ImgurApiMIMEParser,
|
||||||
RedditUploadsMIMEParser,
|
RedditUploadsMIMEParser,
|
||||||
|
RedditVideoMIMEParser,
|
||||||
YoutubeMIMEParser,
|
YoutubeMIMEParser,
|
||||||
LiveleakMIMEParser,
|
LiveleakMIMEParser,
|
||||||
GifvMIMEParser,
|
GifvMIMEParser,
|
||||||
|
|||||||
40
tests/cassettes/test_parser[reddit_gif].yaml
Normal file
40
tests/cassettes/test_parser[reddit_gif].yaml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
interactions:
|
||||||
|
- request:
|
||||||
|
body: null
|
||||||
|
headers:
|
||||||
|
Accept: ['*/*']
|
||||||
|
Accept-Encoding: ['gzip, deflate']
|
||||||
|
Connection: [keep-alive]
|
||||||
|
User-Agent: [python-requests/2.12.4]
|
||||||
|
method: GET
|
||||||
|
uri: https://v.redd.it/wkm9zol8c6fz/DASHPlaylist.mpd
|
||||||
|
response:
|
||||||
|
body:
|
||||||
|
string: !!binary |
|
||||||
|
H4sIAAAAAAAAA4VSXY+bMBB8z6+w/E4whKYJCjnllKt6aquikGsfTz68gCVskG3SXH99F8ippDmp
|
||||||
|
vAD7MTM7u5u7s6rJCYyVjU5oMGeUgM4bIXWZ0KfjJ29FiXVcC143GhL6CpbebWebb+meKBCSpwYs
|
||||||
|
aMcdAuw7M7wTmh6DaB4uoowSJfV9VxRgjlLBkJl/YAwTrWkKWYNNaGd0rFooY8FtFV/isbRNUXiN
|
||||||
|
9gQoFBCHLAgoca8totieMKcE1esbAJtX2IGBS892RvDZpGBkI4h4R+RYMVTtBG/HaTJwxEKpcLpd
|
||||||
|
LUvdfyTUmQ7Qku7l/6nMcePsT+mqbJeiuROageoA7cQ88oJD/pLCVQldspCtF5TgIiDH+fgpD+aR
|
||||||
|
iFiABIXhCg7coQ0LXFcFsqyQPYzwR4qE/njcP3z3gt55BcfBrpMU0PiqjYZlGjfRRC6Uffu1vkHj
|
||||||
|
PbfwdPi63e+yz89Lxp6/bPy32G11Ng7eFxCpBZwPXJfIv1p/9AIWIv3f6MOZ52+m3UINcI9aOslr
|
||||||
|
+Xs0yIxgzFutl9R/h96f8P/jtX9t9mTh/tXGL7fij8eCh+7jpW9nfwDvbQVhKAMAAA==
|
||||||
|
headers:
|
||||||
|
Accept-Ranges: [bytes]
|
||||||
|
Age: ['1814']
|
||||||
|
Connection: [keep-alive]
|
||||||
|
Content-Encoding: [gzip]
|
||||||
|
Content-Length: ['448']
|
||||||
|
Content-Type: [application/dash+xml]
|
||||||
|
Date: ['Sat, 12 Aug 2017 06:02:01 GMT']
|
||||||
|
ETag: ['"981cd6c498c30ab3c4f1c743be7e6f60"']
|
||||||
|
Last-Modified: ['Fri, 11 Aug 2017 21:10:49 GMT']
|
||||||
|
Server: [snooserv]
|
||||||
|
Vary: ['Accept-Encoding,Origin']
|
||||||
|
Via: [1.1 varnish]
|
||||||
|
X-Cache: [HIT]
|
||||||
|
X-Cache-Hits: ['2']
|
||||||
|
X-Served-By: [cache-mdw17337-MDW]
|
||||||
|
X-Timer: ['S1502517721.219943,VS0,VE0']
|
||||||
|
status: {code: 200, message: OK}
|
||||||
|
version: 1
|
||||||
44
tests/cassettes/test_parser[reddit_video].yaml
Normal file
44
tests/cassettes/test_parser[reddit_video].yaml
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
interactions:
|
||||||
|
- request:
|
||||||
|
body: null
|
||||||
|
headers:
|
||||||
|
Accept: ['*/*']
|
||||||
|
Accept-Encoding: ['gzip, deflate']
|
||||||
|
Connection: [keep-alive]
|
||||||
|
User-Agent: [python-requests/2.12.4]
|
||||||
|
method: GET
|
||||||
|
uri: https://v.redd.it/zv89llsvexdz/DASHPlaylist.mpd
|
||||||
|
response:
|
||||||
|
body:
|
||||||
|
string: !!binary |
|
||||||
|
H4sIAAAAAAAAA+WWXW+bMBSG7/srLN8TbKAUUEiVNp0WbdWikGyXkYsNWAKD+Mja/fodIJFImq5L
|
||||||
|
lV0tN0ns4/PxPuYcxrfPWYq2oqxkrnxMRwQjocKcSxX7eL36pDkYVTVTnKW5Ej5+ERW+nVyNHxcz
|
||||||
|
lAku2aIUlVA1q8HBrCm7bx8vVpSOblw7wCiT6q6JIlGuZCa6ndE1IbBRlHkkU1H5uCmVlxUi9jir
|
||||||
|
Em+37skqjyItVxoXGSTgGYRSjOqXArxUbcAQI8hevXJQhQmcgIXdmckVgs94IUqZc8RPJNlbdFZT
|
||||||
|
zoq+mkDUqBJxBtVNUxmr9oeP67IRIEnz9P5WULOyrn7IOgmmCxB3EKYLtRTFQDz0BEX+lLxOfGxY
|
||||||
|
1KaOiRGQECEUyLYhHVncIjTCKCpZJpasBh1M4JUIGScQ3nLgj+Q+/j6fPXzTaCt9JladXlvJRa5n
|
||||||
|
hdXRLOtBUmgf07aOEuySvGOVWC+/TmbT4PPG2Fibx7G+X3ttHfSVtwZIKi6el0zFEN8lpkYJsfFg
|
||||||
|
9eGZhXvVXrvq3M2VrCVL5a9eobJ3RjSXGFg/EV4fxD8SWz9U+69RUIM4N8bNKRTiTRSmfYDCOBMF
|
||||||
|
dZ13UVCA8Z+hsInr0uvzSBjWAQnzXBKm8S4Jm5DNl4+RMFoS1xcjQS9GYqwf9MG3+uOf0bGGyzxg
|
||||||
|
WZHCNOnRQI8iAGT4fJmUtn19DxVwsJFFRkaPbbqezY96Wef2NLYT1U9b6/uEKSXS+1xFMt71f9SN
|
||||||
|
CTHn61IeTxDDJMT0TK8LtQn705tweHw3jrYsbSCn08/A/kp0bj5yQxzT0FzzUhfEMf/xBRnr/YyF
|
||||||
|
9wMdXhAmV78BYdjQZV8IAAA=
|
||||||
|
headers:
|
||||||
|
Accept-Ranges: [bytes]
|
||||||
|
Age: ['3435']
|
||||||
|
Connection: [keep-alive]
|
||||||
|
Content-Encoding: [gzip]
|
||||||
|
Content-Length: ['644']
|
||||||
|
Content-Type: [application/dash+xml]
|
||||||
|
Date: ['Sat, 12 Aug 2017 06:02:01 GMT']
|
||||||
|
ETag: ['"c8a71f31b45dbcdb30f8c03bc08e444b"']
|
||||||
|
Last-Modified: ['Sat, 05 Aug 2017 14:05:39 GMT']
|
||||||
|
Server: [snooserv]
|
||||||
|
Vary: ['Accept-Encoding,Origin']
|
||||||
|
Via: [1.1 varnish]
|
||||||
|
X-Cache: [HIT]
|
||||||
|
X-Cache-Hits: ['2']
|
||||||
|
X-Served-By: [cache-mdw17335-MDW]
|
||||||
|
X-Timer: ['S1502517721.351252,VS0,VE0']
|
||||||
|
status: {code: 200, message: OK}
|
||||||
|
version: 1
|
||||||
@@ -73,6 +73,14 @@ URLS = OrderedDict([
|
|||||||
'https://www.liveleak.com/view?i=08b_1499296574',
|
'https://www.liveleak.com/view?i=08b_1499296574',
|
||||||
re.compile('https://cdn.liveleak.com/80281E/ll_a_s/2017/Jul/5/LiveLeak-dot-com-08b_1499296574-NMHH8690_1499296571.mov.h264_720p.mp4(.*)'),
|
re.compile('https://cdn.liveleak.com/80281E/ll_a_s/2017/Jul/5/LiveLeak-dot-com-08b_1499296574-NMHH8690_1499296571.mov.h264_720p.mp4(.*)'),
|
||||||
'video/mp4')),
|
'video/mp4')),
|
||||||
|
('reddit_gif', (
|
||||||
|
'https://v.redd.it/wkm9zol8c6fz',
|
||||||
|
'https://v.redd.it/wkm9zol8c6fz/DASH_600_K',
|
||||||
|
'video/mp4')),
|
||||||
|
('reddit_video', (
|
||||||
|
'https://v.redd.it/zv89llsvexdz',
|
||||||
|
'https://v.redd.it/zv89llsvexdz/DASHPlaylist.mpd',
|
||||||
|
'video/x-youtube')),
|
||||||
])
|
])
|
||||||
|
|
||||||
|
|
||||||
@@ -121,4 +129,4 @@ def test_imgur_fallback(reddit):
|
|||||||
parsed_url, parsed_type = ImgurApiMIMEParser.get_mimetype(url)
|
parsed_url, parsed_type = ImgurApiMIMEParser.get_mimetype(url)
|
||||||
# Not sure why, but http://imgur.com/gallery/yjP1v4B (a .gif)
|
# Not sure why, but http://imgur.com/gallery/yjP1v4B (a .gif)
|
||||||
# appears to incorrectly return as a JPG type from the scraper
|
# appears to incorrectly return as a JPG type from the scraper
|
||||||
assert parsed_type is not None
|
assert parsed_type is not None
|
||||||
|
|||||||
Reference in New Issue
Block a user