Bundling praw v3 with rtv
481 rtv/packages/praw/helpers.py Normal file
@@ -0,0 +1,481 @@
# This file is part of PRAW.
#
# PRAW is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# PRAW is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# PRAW. If not, see <http://www.gnu.org/licenses/>.

"""
Helper functions.

The functions here provide functionality that is often needed by programs using
PRAW, but which isn't part of reddit's API.
"""

from __future__ import unicode_literals

import sys
import time
from collections import deque
from functools import partial
from operator import attrgetter
from timeit import default_timer as timer

import six

from praw.errors import HTTPException, PRAWException

BACKOFF_START = 4  # Minimum number of seconds to sleep during errors
KEEP_ITEMS = 128  # On each iteration only remember the first # items

# for conversion between broken reddit timestamps and unix timestamps
REDDIT_TIMESTAMP_OFFSET = 28800


def comment_stream(reddit_session, subreddit, limit=None, verbosity=1):
    """Indefinitely yield new comments from the provided subreddit.

    Comments are yielded from oldest to newest.

    :param reddit_session: The reddit_session to make requests from. In all
        the examples this is assigned to the variable ``r``.
    :param subreddit: Either a subreddit object, or the name of a
        subreddit. Use `all` to get the comment stream for all comments made
        to reddit.
    :param limit: The maximum number of comments to fetch in a single
        iteration. When None, fetch all available comments (reddit limits
        this to 1000, or a multiple of 1000 for multi-subreddits). If this
        number is too small, comments may be missed.
    :param verbosity: A number that controls the amount of output produced to
        stderr. <= 0: no output; >= 1: output the total number of comments
        processed and provide the short-term number of comments processed per
        second; >= 2: output when additional delays are added in order to
        avoid subsequent unexpected http errors; >= 3: output debugging
        information regarding the comment stream. (Default: 1)

    """
    get_function = partial(reddit_session.get_comments,
                           six.text_type(subreddit))
    return _stream_generator(get_function, limit, verbosity)
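
# Usage sketch (editor's illustration, not part of PRAW): assumes an
# authenticated ``praw.Reddit`` session; the subreddit name is arbitrary.
#
#     import praw
#     r = praw.Reddit(user_agent='example-agent/0.1')
#     for comment in comment_stream(r, 'askreddit', verbosity=0):
#         print(comment.body)

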
def submission_stream(reddit_session, subreddit, limit=None, verbosity=1):
    """Indefinitely yield new submissions from the provided subreddit.

    Submissions are yielded from oldest to newest.

    :param reddit_session: The reddit_session to make requests from. In all
        the examples this is assigned to the variable ``r``.
    :param subreddit: Either a subreddit object, or the name of a
        subreddit. Use `all` to get the submission stream for all submissions
        made to reddit.
    :param limit: The maximum number of submissions to fetch in a single
        iteration. When None, fetch all available submissions (reddit limits
        this to 1000, or a multiple of 1000 for multi-subreddits). If this
        number is too small, submissions may be missed. Since there isn't a
        limit to the number of submissions that can be retrieved from r/all,
        the limit will be set to 1000 when limit is None.
    :param verbosity: A number that controls the amount of output produced to
        stderr. <= 0: no output; >= 1: output the total number of submissions
        processed and provide the short-term number of submissions processed
        per second; >= 2: output when additional delays are added in order to
        avoid subsequent unexpected http errors; >= 3: output debugging
        information regarding the submission stream. (Default: 1)

    """
    if six.text_type(subreddit).lower() == "all":
        if limit is None:
            limit = 1000
    if not hasattr(subreddit, 'reddit_session'):
        subreddit = reddit_session.get_subreddit(subreddit)
    return _stream_generator(subreddit.get_new, limit, verbosity)
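
# Usage sketch (editor's illustration): for 'all' with limit=None the
# function caps each fetch at 1000 internally, per the docstring above.
# ``r`` is again an assumed authenticated session.
#
#     for submission in submission_stream(r, 'all', limit=None, verbosity=1):
#         print(submission.title)

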
def valid_redditors(redditors, sub):
    """Return a verified list of valid Redditor instances.

    :param redditors: A list comprised of Redditor instances and/or strings
        that are to be verified as actual redditor accounts.
    :param sub: A Subreddit instance that the authenticated account has
        flair changing permission on.

    Note: Flair will be unset for all valid redditors in `redditors` on the
        subreddit `sub`. A valid redditor is defined as a redditor that is
        registered on reddit.

    """
    simplified = list(set(six.text_type(x).lower() for x in redditors))
    return [sub.reddit_session.get_redditor(simplified[i], fetch=False)
            for (i, resp) in enumerate(sub.set_flair_csv(
                ({'user': x, 'flair_text': x} for x in simplified)))
            if resp['ok']]
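
# Usage sketch (editor's illustration): ``r`` is an assumed authenticated
# session with flair permissions on the hypothetical subreddit below. Note
# the documented side effect: flair is unset for every valid name.
#
#     sub = r.get_subreddit('my_subreddit')
#     known_good = valid_redditors(['spez', 'not_a_real_account'], sub)

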
def submissions_between(reddit_session,
                        subreddit,
                        lowest_timestamp=None,
                        highest_timestamp=None,
                        newest_first=True,
                        extra_cloudsearch_fields=None,
                        verbosity=1):
    """Yield submissions between two timestamps.

    If both ``highest_timestamp`` and ``lowest_timestamp`` are unspecified,
    yields all submissions in the ``subreddit``.

    Submissions are yielded from newest to oldest (like in the "new" queue).

    :param reddit_session: The reddit_session to make requests from. In all
        the examples this is assigned to the variable ``r``.
    :param subreddit: Either a subreddit object, or the name of a
        subreddit. Use `all` to get the submission stream for all submissions
        made to reddit.
    :param lowest_timestamp: The lower bound for the ``created_utc``
        attribute of submissions.
        (Default: subreddit's created_utc, or 0 when subreddit == "all".)
    :param highest_timestamp: The upper bound for the ``created_utc``
        attribute of submissions. (Default: current unix time.)
        NOTE: both highest_timestamp and lowest_timestamp are proper unix
        timestamps (just like ``created_utc`` attributes).
    :param newest_first: If set to true, yields submissions from newest to
        oldest. Otherwise yields submissions from oldest to newest.
    :param extra_cloudsearch_fields: Allows extra filtering of results by
        parameters like author, self. The full list is available here:
        https://www.reddit.com/wiki/search
    :param verbosity: A number that controls the amount of output produced to
        stderr. <= 0: no output; >= 1: output the total number of submissions
        processed; >= 2: output debugging information regarding the search
        queries. (Default: 1)

    """
    def debug(msg, level):
        if verbosity >= level:
            sys.stderr.write(msg + '\n')

    def format_query_field(k, v):
        if k in ["nsfw", "self"]:
            # even though the documentation lists "no" and "yes" as possible
            # values, in reality they don't work
            if v not in [0, 1, "0", "1"]:
                raise PRAWException("Invalid value for the extra "
                                    "field {}. Only '0' and '1' are "
                                    "valid values.".format(k))
            return "{}:{}".format(k, v)
        return "{}:'{}'".format(k, v)

    if extra_cloudsearch_fields is None:
        extra_cloudsearch_fields = {}

    extra_query_part = " ".join(
        [format_query_field(k, v) for (k, v)
         in sorted(extra_cloudsearch_fields.items())]
    )

    if highest_timestamp is None:
        highest_timestamp = int(time.time()) + REDDIT_TIMESTAMP_OFFSET
    else:
        highest_timestamp = int(highest_timestamp) + REDDIT_TIMESTAMP_OFFSET

    if lowest_timestamp is not None:
        lowest_timestamp = int(lowest_timestamp) + REDDIT_TIMESTAMP_OFFSET
    elif not isinstance(subreddit, six.string_types):
        lowest_timestamp = int(subreddit.created)
    elif subreddit not in ("all", "contrib", "mod", "friend"):
        lowest_timestamp = int(reddit_session.get_subreddit(subreddit).created)
    else:
        lowest_timestamp = 0

    original_highest_timestamp = highest_timestamp
    original_lowest_timestamp = lowest_timestamp

    # When making timestamp:X..Y queries, reddit misses submissions inside
    # the X..Y range, but they can be found inside the Y..Z range. It is not
    # clear what the value of Z should be, but the difference seems to
    # usually be about ~1 hour or less. To be sure, set the workaround offset
    # to 2 hours.
    out_of_order_submissions_workaround_offset = 7200
    highest_timestamp += out_of_order_submissions_workaround_offset
    lowest_timestamp -= out_of_order_submissions_workaround_offset

    # These parameters work OK, but there may be a better set of parameters
    window_size = 60 * 60
    search_limit = 100
    min_search_results_in_window = 50
    window_adjustment_ratio = 1.25
    backoff = BACKOFF_START

    processed_submissions = 0
    prev_win_increased = False
    prev_win_decreased = False

    while highest_timestamp >= lowest_timestamp:
        try:
            if newest_first:
                t1 = max(highest_timestamp - window_size, lowest_timestamp)
                t2 = highest_timestamp
            else:
                t1 = lowest_timestamp
                t2 = min(lowest_timestamp + window_size, highest_timestamp)

            search_query = 'timestamp:{}..{}'.format(t1, t2)
            if extra_query_part:
                search_query = "(and {} {})".format(search_query,
                                                    extra_query_part)

            debug(search_query, 3)
            search_results = list(reddit_session.search(search_query,
                                                        subreddit=subreddit,
                                                        limit=search_limit,
                                                        syntax='cloudsearch',
                                                        sort='new'))

            debug("Received {0} search results for query {1}"
                  .format(len(search_results), search_query),
                  2)

            backoff = BACKOFF_START
        except HTTPException as exc:
            debug("{0}. Sleeping for {1} seconds".format(exc, backoff), 2)
            time.sleep(backoff)
            backoff *= 2
            continue

        if len(search_results) >= search_limit:
            power = 2 if prev_win_decreased else 1
            window_size = int(window_size / window_adjustment_ratio**power)
            prev_win_decreased = True
            debug("Decreasing window size to {0} seconds".format(window_size),
                  2)
            # Since it is possible that there are more submissions in the
            # current window, we have to redo the request with the reduced
            # window
            continue
        else:
            prev_win_decreased = False

        search_results = [s for s in search_results
                          if original_lowest_timestamp <= s.created and
                          s.created <= original_highest_timestamp]

        for submission in sorted(search_results,
                                 key=attrgetter('created_utc', 'id'),
                                 reverse=newest_first):
            yield submission

        processed_submissions += len(search_results)
        debug('Total processed submissions: {}'
              .format(processed_submissions), 1)

        if newest_first:
            highest_timestamp -= (window_size + 1)
        else:
            lowest_timestamp += (window_size + 1)

        if len(search_results) < min_search_results_in_window:
            power = 2 if prev_win_increased else 1
            window_size = int(window_size * window_adjustment_ratio**power)
            prev_win_increased = True
            debug("Increasing window size to {0} seconds"
                  .format(window_size), 2)
        else:
            prev_win_increased = False
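
# Usage sketch (editor's illustration): pull roughly one day of posts from
# an arbitrary subreddit. ``r`` is an assumed authenticated session; the
# bounds are plain unix timestamps, as the docstring above specifies.
#
#     now = time.time()
#     for submission in submissions_between(r, 'redditdev',
#                                           lowest_timestamp=now - 86400,
#                                           highest_timestamp=now,
#                                           verbosity=0):
#         print(submission.created_utc, submission.title)

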
def _stream_generator(get_function, limit=None, verbosity=1):
    """Indefinitely yield new items, oldest first, fetched via ``get_function``."""
    def debug(msg, level):
        if verbosity >= level:
            sys.stderr.write(msg + '\n')

    def b36_id(item):
        return int(item.id, 36)

    seen = BoundedSet(KEEP_ITEMS * 16)
    before = None
    count = 0  # Count is incremented to bypass the cache
    processed = 0
    backoff = BACKOFF_START
    while True:
        items = []
        sleep = None
        start = timer()
        try:
            i = None
            params = {'uniq': count}
            count = (count + 1) % 100
            if before:
                params['before'] = before
            gen = enumerate(get_function(limit=limit, params=params))
            for i, item in gen:
                if b36_id(item) in seen:
                    if i == 0:
                        if before is not None:
                            # reddit sent us out of order data -- log it
                            debug('(INFO) {0} already seen with before of {1}'
                                  .format(item.fullname, before), 3)
                            before = None
                    break
                if i == 0:  # Always the first item in the generator
                    before = item.fullname
                if b36_id(item) not in seen:
                    items.append(item)
                    processed += 1
                    if verbosity >= 1 and processed % 100 == 0:
                        sys.stderr.write(' Items: {0} \r'
                                         .format(processed))
                        sys.stderr.flush()
                if i < KEEP_ITEMS:
                    seen.add(b36_id(item))
            else:  # Generator exhausted
                if i is None:  # Generator yielded no items
                    assert before is not None
                    # Try again without before as the before item may be too
                    # old or no longer exist.
                    before = None
            backoff = BACKOFF_START
        except HTTPException as exc:
            sleep = (backoff, '{0}. Sleeping for {{0}} seconds.'.format(exc),
                     2)
            backoff *= 2
        # Provide rate limit
        if verbosity >= 1:
            rate = len(items) / (timer() - start)
            sys.stderr.write(' Items: {0} ({1:.2f} ips) \r'
                             .format(processed, rate))
            sys.stderr.flush()
        # Yield items from oldest to newest
        for item in items[::-1]:
            yield item
        # Sleep if necessary
        if sleep:
            sleep_time, msg, msg_level = sleep  # pylint: disable=W0633
            debug(msg.format(sleep_time), msg_level)
            time.sleep(sleep_time)


def chunk_sequence(sequence, chunk_length, allow_incomplete=True):
    """Given a sequence, divide it into sequences of length `chunk_length`.

    :param allow_incomplete: If True, allow the final chunk to be shorter if
        the given sequence is not an exact multiple of `chunk_length`.
        If False, the incomplete chunk will be discarded.

    """
    (complete, leftover) = divmod(len(sequence), chunk_length)
    if not allow_incomplete:
        leftover = 0

    chunk_count = complete + min(leftover, 1)

    chunks = []
    for x in range(chunk_count):
        left = chunk_length * x
        right = left + chunk_length
        chunks.append(sequence[left:right])

    return chunks
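
# Worked example (editor's illustration):
#
#     chunk_sequence([1, 2, 3, 4, 5], 2)
#     # -> [[1, 2], [3, 4], [5]]
#     chunk_sequence([1, 2, 3, 4, 5], 2, allow_incomplete=False)
#     # -> [[1, 2], [3, 4]]

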
def convert_id36_to_numeric_id(id36):
    """Convert a string representing a base36 number into an integer."""
    if not isinstance(id36, six.string_types) or id36.count("_") > 0:
        raise ValueError("must supply base36 string, not fullname (e.g. use "
                         "xxxxx, not t3_xxxxx)")
    return int(id36, 36)


def convert_numeric_id_to_id36(numeric_id):
    """Convert an integer into its base36 string representation.

    This method has been cleaned up slightly to improve readability. For more
    info see:

    https://github.com/reddit/reddit/blob/master/r2/r2/lib/utils/_utils.pyx

    https://www.reddit.com/r/redditdev/comments/n624n/submission_ids_question/

    https://en.wikipedia.org/wiki/Base36
    """
    # base36 allows negative numbers, but reddit does not
    if not isinstance(numeric_id, six.integer_types) or numeric_id < 0:
        raise ValueError("must supply a positive int/long")

    # Alphabet used for base 36 conversion
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz'
    alphabet_len = len(alphabet)

    # Temp assign
    current_number = numeric_id
    base36 = []

    # Numbers below the alphabet length map directly to a single digit;
    # the while/divmod loop below is only needed for larger values
    if 0 <= current_number < alphabet_len:
        return alphabet[current_number]

    # Break up into chunks
    while current_number != 0:
        current_number, rem = divmod(current_number, alphabet_len)
        base36.append(alphabet[rem])

    # String is built in reverse order
    return ''.join(reversed(base36))
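
# Worked round-trip example (editor's illustration):
#
#     convert_numeric_id_to_id36(1277)     # -> 'zh'
#     convert_id36_to_numeric_id('zh')     # -> 1277
#     convert_id36_to_numeric_id('t3_zh')  # raises ValueError (fullname)

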
def flatten_tree(tree, nested_attr='replies', depth_first=False):
    """Return a flattened version of the passed in tree.

    :param nested_attr: The attribute name that contains the nested items.
        Defaults to ``replies`` which is suitable for comments.
    :param depth_first: When true, add to the list in a depth-first manner
        rather than the default breadth-first manner.

    """
    stack = deque(tree)
    extend = stack.extend if depth_first else stack.extendleft
    retval = []
    while stack:
        item = stack.popleft()
        nested = getattr(item, nested_attr, None)
        if nested:
            extend(nested)
        retval.append(item)
    return retval
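
# Usage sketch (editor's illustration) with a hypothetical Node type whose
# children live in a ``replies`` attribute, mirroring comment objects:
#
#     class Node(object):
#         def __init__(self, name, replies=()):
#             self.name = name
#             self.replies = list(replies)
#
#     chain = [Node('a', [Node('b', [Node('c')])])]
#     [n.name for n in flatten_tree(chain)]  # -> ['a', 'b', 'c']

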
def normalize_url(url):
    """Return url after stripping trailing .json and trailing slashes."""
    if url.endswith('.json'):
        url = url[:-5]
    if url.endswith('/'):
        url = url[:-1]
    return url
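
# Worked example (editor's illustration):
#
#     normalize_url('https://www.reddit.com/r/python/.json')
#     # -> 'https://www.reddit.com/r/python'

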
class BoundedSet(object):
    """A set with a maximum size that evicts the oldest items when necessary.

    This class does not implement the complete set interface.

    """

    def __init__(self, max_items):
        """Construct an instance of the BoundedSet."""
        self.max_items = max_items
        self._fifo = []
        self._set = set()

    def __contains__(self, item):
        """Test if the BoundedSet contains item."""
        return item in self._set

    def add(self, item):
        """Add an item to the set discarding the oldest item if necessary."""
        if item in self._set:
            self._fifo.remove(item)
        elif len(self._set) == self.max_items:
            self._set.remove(self._fifo.pop(0))
        self._fifo.append(item)
        self._set.add(item)
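
# Usage sketch (editor's illustration): the oldest entry is evicted once
# max_items is exceeded, and re-adding an item refreshes its position.
#
#     s = BoundedSet(2)
#     for x in (1, 2, 3):
#         s.add(x)
#     1 in s  # -> False, evicted when 3 arrived
#     2 in s  # -> True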