# Source code for perceval.backends.core.stackexchange

# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#     Alberto Martín <alberto.martin@bitergia.com>
#     Santiago Dueñas <sduenas@bitergia.com>
#     Stephan Barth <stephan.barth@gmail.com>
#     Valerio Cosentino <valcos@bitergia.com>
#     Jesus M. Gonzalez-Barahona <jgb@gsyc.es>
#     Harshal Mittal <harshalmittal4@gmail.com>
#

import json
import logging
import time

from grimoirelab_toolkit.datetime import datetime_to_utc
from grimoirelab_toolkit.uris import urijoin

from ...backend import (Backend,
                        BackendCommand,
                        BackendCommandArgumentParser)
from ...client import HttpClient
from ...errors import BackendError
from ...utils import DEFAULT_DATETIME

# Only category of items produced by the StackExchange backend
CATEGORY_QUESTION = "question"

MAX_QUESTIONS = 100  # Default number of questions requested per query (page size)

logger = logging.getLogger(__name__)


class StackExchange(Backend):
    """Perceval backend for StackExchange sites.

    Retrieves the questions stored in any of the StackExchange
    sites. The site to query must be provided to build an instance.

    :param site: StackExchange site
    :param tagged: filter items by question Tag
    :param api_token: StackExchange application key for the API
    :param access_token: StackExchange user access_token for the API
    :param max_questions: max of questions per page retrieved
    :param tag: label used to mark the data
    :param archive: archive to store/retrieve items
    :param ssl_verify: enable/disable SSL verification

    :raises BackendError: when `access_token` is given without `api_token`
    """
    version = '0.12.1'

    CATEGORIES = [CATEGORY_QUESTION]
    EXTRA_SEARCH_FIELDS = {
        'tags': ['tags']
    }

    def __init__(self, site, tagged=None, api_token=None, access_token=None,
                 max_questions=MAX_QUESTIONS, tag=None, archive=None, ssl_verify=True):
        # Reject an access token supplied without an application key
        if access_token and not api_token:
            raise BackendError(cause="access_token is defined but api_token is not")

        # The site doubles as the origin of the fetched items
        super().__init__(site, tag=tag, archive=archive, ssl_verify=ssl_verify)

        self.site = site
        self.tagged = tagged
        self.api_token = api_token
        self.access_token = access_token
        self.max_questions = max_questions

        self.client = None

    def fetch(self, category=CATEGORY_QUESTION, from_date=DEFAULT_DATETIME):
        """Fetch the questions from the site.

        Retrieves, from a StackExchange site, the questions updated
        since the given date. A falsy `from_date` falls back to
        `DEFAULT_DATETIME`; the date is converted to UTC before use.

        :param category: the category of items to fetch
        :param from_date: obtain questions updated since this date

        :returns: a generator of questions
        """
        since = datetime_to_utc(from_date if from_date else DEFAULT_DATETIME)

        return super().fetch(category, from_date=since)

    def fetch_items(self, category, **kwargs):
        """Fetch the questions.

        :param category: the category of items to fetch
        :param kwargs: backend arguments; `from_date` is required

        :returns: a generator of items
        """
        since = kwargs['from_date']

        logger.info("Looking for questions at site '%s', with tag '%s' and updated from '%s'",
                    self.site, self.tagged, str(since))

        # Each raw page holds several questions; flatten them into one stream
        for raw_page in self.client.get_questions(since):
            yield from self.parse_questions(raw_page)

    @classmethod
    def has_archiving(cls):
        """Tell whether items can be archived during the fetch process.

        :returns: `True`, this backend supports items archive
        """
        return True

    @classmethod
    def has_resuming(cls):
        """Tell whether the fetch process can be resumed.

        :returns: `True`, this backend supports items resuming
        """
        return True

    @staticmethod
    def metadata_id(item):
        """Extract the identifier from a StackExchange item.

        :param item: item generated by the backend

        :returns: the 'question_id' field as a string
        """
        question_id = item['question_id']

        return str(question_id)

    @staticmethod
    def metadata_updated_on(item):
        """Extract the update time from a StackExchange item.

        The timestamp is taken from the 'last_activity_date' field
        and converted to a float value.

        :param item: item generated by the backend

        :returns: a UNIX timestamp
        """
        last_activity = item['last_activity_date']

        return float(last_activity)

    @staticmethod
    def metadata_category(item):
        """Extract the category from a StackExchange item.

        This backend only generates one type of item, which is
        'question', so the item itself is not inspected.

        :param item: item generated by the backend

        :returns: the question category
        """
        return CATEGORY_QUESTION

    @staticmethod
    def parse_questions(raw_page):
        """Parse a StackExchange API raw response.

        Decodes the JSON text of a response and yields every entry
        found under its 'items' key.

        :param raw_page: JSON text of an API response

        :returns: a generator of questions
        """
        yield from json.loads(raw_page)['items']

    def _init_client(self, from_archive=False):
        """Build the client used to talk to the StackExchange API.

        :param from_archive: it tells whether to write/read the archive

        :returns: a `StackExchangeClient` set up with this backend's options
        """
        return StackExchangeClient(
            self.site,
            self.tagged,
            self.api_token,
            self.access_token,
            self.max_questions,
            self.archive,
            from_archive,
            self.ssl_verify,
        )


class StackExchangeClient(HttpClient):
    """StackExchange API client.

    This class implements a simple client to retrieve questions from
    any StackExchange site.

    :param site: StackExchange site to retrieve the questions from
    :param tagged: filter items by question Tag
    :param token: StackExchange application key for the API
    :param access_token: StackExchange user access token for the API
    :param max_questions: max number of questions per query
    :param archive: an archive to store/read fetched data
    :param from_archive: it tells whether to write/read the archive
    :param ssl_verify: enable/disable SSL verification

    :raises HTTPError: when an error occurs doing the request
    """
    STACKEXCHANGE_API_URL = 'https://api.stackexchange.com'
    VERSION_API = '2.2'

    # API resources
    RQUESTIONS = 'questions'

    # Resource parameters
    PPAGE = 'page'
    PPAGESIZE = 'pagesize'
    PORDER = 'order'
    PSORT = 'sort'
    PTAGGED = 'tagged'
    PSITE = 'site'
    PKEY = 'key'
    PFILTER = 'filter'
    PMIN = 'min'
    PACCESSTOKEN = 'access_token'

    # Predefined values
    #
    # Filters are immutable and non-expiring. This filter allows to retrieve
    # all the information regarding each question. To know more, visit
    # https://api.stackexchange.com/docs/questions and paste the filter in the
    # filter box. It will display a list of checkboxes with the selected
    # values for the filter provided.
    VQUESTIONS_FILTER = 'Bf*y*ByQD_upZqozgU6lXL_62USGOoV3)MFNgiHqHpmO_Y-jHR'

    def __init__(self, site, tagged, token, access_token=None, max_questions=MAX_QUESTIONS,
                 archive=None, from_archive=False, ssl_verify=True):
        super().__init__(self.STACKEXCHANGE_API_URL, archive=archive,
                         from_archive=from_archive, ssl_verify=ssl_verify)
        self.site = site
        self.tagged = tagged
        self.token = token
        self.access_token = access_token
        self.max_questions = max_questions

    def get_questions(self, from_date):
        """Retrieve all the questions from a given date.

        Pages are requested one at a time, sorted by activity in
        ascending order, until the API reports that there are no more.
        When a response carries a `backoff` value, the client sleeps
        that many seconds before sending the next request.

        :param from_date: obtain questions updated since this date

        :returns: a generator of raw pages (response text), one per request
        """
        url = urijoin(self.base_url, self.VERSION_API, self.RQUESTIONS)
        page = 1
        total = 0
        fetched = 0

        while True:
            response = self.fetch(url, payload=self.__build_payload(page, from_date))
            raw_page = response.text
            data = response.json()

            # The total is taken from the first page only; later pages
            # just add to the running count used for progress logging.
            if page == 1:
                total = data['total']
            fetched += data['page_size']

            self.__log_status(data['quota_remaining'],
                              data['quota_max'],
                              fetched,
                              total)

            # An empty body carries nothing to parse; stop instead of yielding it
            if not raw_page:
                return

            yield raw_page

            if not data['has_more']:
                return

            backoff = data.get('backoff')
            if backoff:
                logger.debug("Expensive query. Wait %s secs to send a new request",
                             backoff)
                time.sleep(float(backoff))

            page += 1

    @staticmethod
    def sanitize_for_archive(url, headers, payload):
        """Sanitize the payload of a HTTP request by removing the token
        information before storing/retrieving archived items.

        The application key and the access token are removed from the
        given payload in place. An empty or `None` payload is returned
        untouched.

        :param url: HTTP url request
        :param headers: HTTP headers request
        :param payload: HTTP payload request

        :returns: url, headers and the sanitized payload
        """
        if payload:
            payload.pop(StackExchangeClient.PKEY, None)
            payload.pop(StackExchangeClient.PACCESSTOKEN, None)

        return url, headers, payload

    def __build_payload(self, page, from_date, order='asc', sort='activity'):
        """Build the query parameters for a request to the questions resource.

        :param page: number of the page to request
        :param from_date: when set, its UNIX timestamp is sent as `min`
        :param order: value of the `order` parameter
        :param sort: value of the `sort` parameter

        :returns: a dict with the parameters of the request
        """
        # `tagged` and `key` are always included, even when they are None.
        # NOTE(review): presumably the HTTP layer omits None-valued
        # parameters from the query string - confirm.
        payload = {
            self.PPAGE: page,
            self.PPAGESIZE: self.max_questions,
            self.PORDER: order,
            self.PSORT: sort,
            self.PTAGGED: self.tagged,
            self.PSITE: self.site,
            self.PKEY: self.token,
            self.PFILTER: self.VQUESTIONS_FILTER,
        }

        if from_date:
            payload[self.PMIN] = int(from_date.timestamp())

        if self.access_token:
            payload[self.PACCESSTOKEN] = self.access_token

        return payload

    def __log_status(self, quota_remaining, quota_max, page_size, total):
        """Log the rate limit status and the progress of the fetch process.

        :param quota_remaining: requests left in the current quota
        :param quota_max: size of the quota
        :param page_size: number of questions counted so far
        :param total: total number of questions reported by the API
        """
        # Arguments are handed to the logger so the message is only
        # formatted when the record is actually emitted.
        logger.debug("Rate limit: %s/%s", quota_remaining, quota_max)

        if total == 0:
            logger.info("No questions were found.")
            return

        # The accumulated page sizes may exceed the total on the last page
        logger.info("Fetching questions: %s/%s", min(page_size, total), total)


class StackExchangeCommand(BackendCommand):
    """Command-line runner for the StackExchange backend."""

    BACKEND = StackExchange

    @classmethod
    def setup_cmd_parser(cls):
        """Build the argument parser of the StackExchange command.

        On top of the from-date, token authentication, archive and SSL
        verification options, it adds a group with the
        StackExchange-specific arguments.

        :returns: a `BackendCommandArgumentParser` instance
        """
        cmd_parser = BackendCommandArgumentParser(
            cls.BACKEND,
            from_date=True,
            token_auth=True,
            archive=True,
            ssl_verify=True,
        )

        # StackExchange options
        se_options = cmd_parser.parser.add_argument_group('StackExchange arguments')
        se_options.add_argument(
            '--site',
            dest='site',
            required=True,
            help="StackExchange site",
        )
        se_options.add_argument(
            '--tagged',
            dest='tagged',
            help="filter items by question Tag",
        )
        se_options.add_argument(
            '--max-questions',
            dest='max_questions',
            type=int,
            default=MAX_QUESTIONS,
            help="Maximum number of questions requested in the same query",
        )
        se_options.add_argument(
            '--access-token',
            dest='access_token',
            default=None,
            help="Token obtained via authenticating an user",
        )

        return cmd_parser