Source code for perceval.backends.core.askbot

# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#     Alberto Martín <alberto.martin@bitergia.com>
#     Santiago Dueñas <sduenas@bitergia.com>
#     Stephan Barth <stephan.barth@gmail.com>
#     Valerio Cosentino <valcos@bitergia.com>
#     Jesus M. Gonzalez-Barahona <jgb@gsyc.es>
#     Harshal Mittal <harshalmittal4@gmail.com>
#     animesh <animuz111@gmail.com>
#

import json
import logging
import re

import bs4
import requests

from grimoirelab_toolkit.datetime import datetime_to_utc, str_to_datetime
from grimoirelab_toolkit.uris import urijoin

from ...backend import (Backend,
                        BackendCommand,
                        BackendCommandArgumentParser)
from ...client import HttpClient
from ...utils import DEFAULT_DATETIME

CATEGORY_QUESTION = 'question'

logger = logging.getLogger(__name__)


[docs]class Askbot(Backend):
    """Askbot backend.

    This class retrieves the questions posted on an Askbot site.
    To initialize this class the URL must be provided. The `url`
    will be set as the origin of the data.

    :param url: Askbot site URL
    :param tag: label used to mark the data
    :param archive: archive to store/retrieve items
    :param ssl_verify: enable/disable SSL verification
    """
    version = '0.8.0'

    CATEGORIES = [CATEGORY_QUESTION]
    EXTRA_SEARCH_FIELDS = {
        'tags': ['tags']
    }

    def __init__(self, url, tag=None, archive=None, ssl_verify=True):
        origin = url

        super().__init__(origin, tag=tag, archive=archive, ssl_verify=ssl_verify)
        self.url = url
        self.client = None
        self.ab_parser = AskbotParser()

[docs]    def fetch(self, category=CATEGORY_QUESTION, from_date=DEFAULT_DATETIME):
        """Fetch the questions/answers from the repository.

        The method retrieves, from an Askbot site, the questions and answers
        updated since the given date.

        :param category: the category of items to fetch
        :param from_date: obtain questions/answers updated since this date

        :returns: a generator of items
        """
        if not from_date:
            from_date = DEFAULT_DATETIME
        logger.info(f"Pulling Askbot data from {from_date}")
        kwargs = {'from_date': from_date}
        items = super().fetch(category, **kwargs)

        return items

[docs]    def fetch_items(self, category, **kwargs):
        """Fetch the questions

        :param category: the category of items to fetch
        :param kwargs: backend arguments

        :returns: a generator of items
        """

        from_date = datetime_to_utc(kwargs['from_date']).timestamp()

        questions_groups = self.client.get_api_questions(AskbotClient.API_QUESTIONS)
        for questions in questions_groups:

            for question in questions['questions']:
                updated_at = int(question['last_activity_at'])
                if updated_at > from_date:
                    html_question = self.__fetch_question(question)
                    if not html_question:
                        continue

                    logger.debug("Fetching HTML question %s", question['id'])
                    comments = self.__fetch_comments(question)
                    question_obj = self.__build_question(html_question, question, comments)
                    question.update(question_obj)
                    yield question

[docs]    @classmethod
    def has_resuming(cls):
        """Returns whether it supports to resume the fetch process.

        :returns: this backend supports items resuming
        """
        return True

[docs]    @classmethod
    def has_archiving(cls):
        """Returns whether it supports archiving items on the fetch process.

        :returns: this backend supports items archive
        """
        return True

[docs]    @staticmethod
    def metadata_category(item):
        """Extracts the category from an Askbot item.

        This backend only generates one type of item which is
        'question'.
        """
        return CATEGORY_QUESTION

[docs]    @staticmethod
    def metadata_id(item):
        """Extracts the identifier from an Askbot question item."""

        return str(item['id'])

[docs]    @staticmethod
    def metadata_updated_on(item):
        """Extracts the update time from an Askbot item.

        The timestamp is extracted from 'last_activity_at' field.
        This date is a UNIX timestamp but needs to be converted to
        a float value.

        :param item: item generated by the backend

        :returns: a UNIX timestamp
        """
        return float(item['last_activity_at'])

    def _init_client(self, from_archive=False):
        """Init client"""

        return AskbotClient(self.url, self.archive, from_archive, self.ssl_verify)

    def __fetch_question(self, question):
        """Fetch an Askbot HTML question body.

        The method fetches the HTML question retrieving the
        question body of the item question received

        :param question: item with the question itself

        :returns: a list of HTML page/s for the question
        """
        html_question_items = []

        npages = 1
        next_request = True

        while next_request:
            try:
                html_question = self.client.get_html_question(question['id'], npages)
                html_question_items.append(html_question)
                tpages = self.ab_parser.parse_number_of_html_pages(html_question)
                logger.info(f"{tpages} of questions found")
                if npages == tpages:
                    next_request = False

                npages = npages + 1
            except requests.exceptions.TooManyRedirects as e:
                logger.warning("%s, data not retrieved for question %s", e, question['id'])
                next_request = False

        return html_question_items

    def __fetch_comments(self, question):
        """Fetch all the comments of an Askbot question and answers.

        The method fetches the list of every comment existing in a question and
        its answers.

        :param question: item with the question itself

        :returns: a list of comments with the ids as hashes
        """
        comments = {}
        comments[question['id']] = json.loads(self.client.get_comments(question['id']))
        for object_id in question['answer_ids']:
            comments[object_id] = json.loads(self.client.get_comments(object_id))
        logger.debug(f"{len(comments)} comments found")
        return comments

    @staticmethod
    def __build_question(html_question, question, comments):
        """Build an Askbot HTML response.

        The method puts together all the information regarding a question

        :param html_question: array of HTML raw pages
        :param question: question object from the API
        :param comments: list of comments to add

        :returns: a dict item with the parsed question information
        """
        question_object = {}
        # Parse the user info from the soup container
        question_container = AskbotParser.parse_question_container(html_question[0])
        # Add the info to the question object
        question_object.update(question_container)
        # Add the comments of the question (if any)
        if comments[int(question['id'])]:
            question_object['comments'] = comments[int(question['id'])]

        answers = []

        for page in html_question:
            answers.extend(AskbotParser.parse_answers(page))

        if len(answers) != 0:
            question_object['answers'] = answers
            for answer in question_object['answers']:
                if comments[int(answer['id'])]:
                    answer['comments'] = comments[int(answer['id'])]

        return question_object


[docs]class AskbotClient(HttpClient):
    """Askbot client.

    This class implements a simple client to retrieve distinct
    kind of data from an Askbot site.

    :param base_url: URL of the Askbot site
    :param archive: an archive to store/read fetched data
    :param from_archive: it tells whether to write/read the archive
    :param ssl_verify: enable/disable SSL verification

    :raises HTTPError: when an error occurs doing the request
    """

    API_QUESTIONS = 'api/v1/questions/'

    # API resources
    RHTML_QUESTION = 'question/'
    RCOMMENTS = 's/post_comments'
    RCOMMENTS_OLD = 'post_comments'

    # API header
    HREQUEST_WITH = 'X-Requested-With'

    # Resource parameters
    PPAGE = 'page'
    PSORT = 'sort'
    PPOST_ID = 'post_id'
    PPOST_TYPE = 'post_type'
    PAVATAR_SIZE = 'avatar_size'

    # Predefined values
    VORDER_API = 'activity-asc'
    VORDER_HTML = 'votes'
    VANSWER = 'answer'
    VAVATAR_SIZE = 0
    VHTTP_REQUEST = 'XMLHttpRequest'

    def __init__(self, base_url, archive=None, from_archive=False, ssl_verify=True):
        super().__init__(base_url, archive=archive, from_archive=from_archive, ssl_verify=ssl_verify)
        self._use_new_urls = True

[docs]    def get_api_questions(self, path):
        """Retrieve a question page using the API.

        :param page: page to retrieve
        """
        npages = 1
        next_request = True
        logger.debug("Retrieving question pages")
        path = urijoin(self.base_url, path)
        while next_request:

            try:
                params = {
                    self.PPAGE: npages,
                    self.PSORT: self.VORDER_API
                }

                response = self.fetch(path, payload=params)

                whole_page = response.text

                raw_questions = json.loads(whole_page)
                tpages = raw_questions['pages']

                logger.debug("Fetching questions from '%s': page %s/%s",
                             self.base_url, npages, tpages)

                if npages == tpages:
                    next_request = False

                npages = npages + 1
                yield raw_questions

            except requests.exceptions.TooManyRedirects as e:
                logger.warning("%s, data not retrieved for resource %s", e, path)
                next_request = False

[docs]    def get_html_question(self, question_id, page=1):
        """Retrieve a raw HTML question and all it's information.

        :param question_id: question identifier
        :param page: page to retrieve
        """
        path = urijoin(self.base_url, self.RHTML_QUESTION, question_id)
        logger.debug(f"Raw html retrieved: {path}")
        params = {
            self.PPAGE: page,
            self.PSORT: self.VORDER_HTML
        }

        response = self.fetch(path, payload=params)
        return response.text

[docs]    def get_comments(self, post_id):
        """Retrieve a list of comments by a given id.

        :param object_id: object identifiere
        """
        path = urijoin(self.base_url, self.RCOMMENTS if self._use_new_urls else self.RCOMMENTS_OLD)
        params = {
            self.PPOST_ID: post_id,
            self.PPOST_TYPE: self.VANSWER,
            self.PAVATAR_SIZE: self.VAVATAR_SIZE
        }
        headers = {self.HREQUEST_WITH: self.VHTTP_REQUEST}

        try:
            response = self.fetch(path, payload=params, headers=headers)
            raw = response.text
        except requests.exceptions.HTTPError as ex:
            if ex.response.status_code == 404:
                logger.debug("Comments URL did not work. Using old URL schema.")
                self._use_new_urls = False
                path = urijoin(self.base_url, self.RCOMMENTS_OLD)
                response = self.fetch(path, payload=params, headers=headers)
                raw = response.text
            elif ex.response.status_code == 500:
                logger.warning("Comments not retrieved due to %s", ex)
                raw = '[]'
            else:
                raise ex

        return raw


[docs]class AskbotParser:
    """Askbot HTML parser.

    This class parses a plain HTML document, converting questions, answers,
    comments and user information into dict items.
    """

[docs]    @staticmethod
    def parse_question_container(html_question):
        """Parse the question info container of a given HTML question.

        The method parses the information available in the question information
        container. The container can have up to 2 elements: the first one
        contains the information related to the user who generated the question
        and the date (if any). The second one contains the date of the update
        and the user who updated it (if not the same who generated the question).

        :param html_question: raw HTML question element

        :returns: an object with the parsed information
        """
        container_info = {}
        bs_question = bs4.BeautifulSoup(html_question, "html.parser")
        question = AskbotParser._find_question_container(bs_question)
        container = question.select("div.post-update-info")
        created = container[0]
        container_info['author'] = AskbotParser.parse_user_info(created)
        try:
            container[1]
        except IndexError:
            pass
        else:
            updated = container[1]
            if AskbotParser.parse_user_info(updated):
                container_info['updated_by'] = AskbotParser.parse_user_info(updated)
        logger.debug("Container info parsed")
        return container_info

[docs]    @staticmethod
    def parse_answers(html_question):
        """Parse the answers of a given HTML question.

        The method parses the answers related with a given HTML question,
        as well as all the comments related to the answer.

        :param html_question: raw HTML question element

        :returns: a list with the answers
        """

        def parse_answer_container(update_info):
            """Parse the answer info container of a given HTML question.

            The method parses the information available in the answer information
            container. The container can have up to 2 elements: the first one
            contains the information related to the user who generated the question
            and the date (if any). The second one contains the date of the update
            and the user who updated it (if not the same who generated the question).

            :param update_info: beautiful soup update_info container element

            :returns: an object with the parsed information
            """
            container_info = {}
            created = update_info[0]
            answered_at = created.abbr.attrs["title"]
            # Convert date to UNIX timestamp
            container_info['added_at'] = str(str_to_datetime(answered_at).timestamp())
            container_info['answered_by'] = AskbotParser.parse_user_info(created)
            try:
                update_info[1]
            except IndexError:
                pass
            else:
                updated = update_info[1]
                updated_at = updated.abbr.attrs["title"]
                # Convert date to UNIX timestamp
                container_info['updated_at'] = str(str_to_datetime(updated_at).timestamp())
                if AskbotParser.parse_user_info(updated):
                    container_info['updated_by'] = AskbotParser.parse_user_info(updated)
            return container_info

        answer_list = []
        # Select all the answers
        bs_question = bs4.BeautifulSoup(html_question, "html.parser")
        bs_answers = bs_question.select("div.answer")
        logger.debug(f"{str(len(bs_answers))} answers found")
        for bs_answer in bs_answers:
            answer_id = bs_answer.attrs["data-post-id"]
            votes_element = bs_answer.select("div.vote-number")[0].text
            accepted_answer = bs_answer.select("div.answer-img-accept")[0].get('title').endswith("correct")
            # Select the body of the answer
            body = bs_answer.select("div.post-body")
            # Get the user information container and parse it
            update_info = body[0].select("div.post-update-info")
            answer_container = parse_answer_container(update_info)
            # Remove the update-info-container div to be able to get the body
            body[0].div.extract().select("div.post-update-info-container")
            # Override the body with a clean one
            body = body[0].get_text(strip=True)
            # Generate the answer object
            answer = {'id': answer_id,
                      'score': votes_element,
                      'summary': body,
                      'accepted': accepted_answer
                      }
            # Update the object with the information in the answer container
            answer.update(answer_container)
            answer_list.append(answer)
        logger.debug("Answers parsed")
        return answer_list

[docs]    @staticmethod
    def parse_number_of_html_pages(html_question):
        """Parse number of answer pages to paginate over them.

        :param html_question: raw HTML question element

        :returns: an integer with the number of pages
        """
        bs_question = bs4.BeautifulSoup(html_question, "html.parser")
        try:
            bs_question.select('div.paginator')[0]
        except IndexError:
            return 1
        else:
            return int(bs_question.select('div.paginator')[0].attrs['data-num-pages'])

[docs]    @staticmethod
    def parse_user_info(update_info):
        """Parse the user information of a given HTML container.

        The method parses all the available user information in the container.
        If the class "user-info" exists, the method will get all the available
        information in the container. If not, if a class "tip" exists, it will be
        a wiki post with no user associated. Else, it can be an empty container.

        :param update_info: beautiful soup answer container element

        :returns: an object with the parsed information
        """
        user_info = {}
        if update_info.select("div.user-info"):
            # Get all the <a> elements in the container. First <a> contains the user
            # information, second one (if exists), the website of the user.
            elements = update_info.select("div.user-info")[0].find_all("a")
            href = elements[0].attrs["href"]
            user_info['id'] = re.search(r'\d+', href).group(0)
            user_info['username'] = elements[0].text
            user_info['reputation'] = update_info.select('span.reputation-score')[0].text
            user_info['badges'] = update_info.select("span.badges")[0].attrs["title"]
            try:
                elements[1]
            except IndexError:
                pass
            else:
                user_info['website'] = elements[1].attrs["href"]
            if update_info.select("img.flag"):
                flag = update_info.select("img.flag")[0].attrs["alt"]
                user_info['country'] = re.sub("flag of ", "", flag)
        logger.debug("User info parsed")
        return user_info

    @staticmethod
    def _find_question_container(bs_question):
        questions = bs_question.find_all("div",
                                         attrs={'class': re.compile(".*question")})
        for question in questions:
            if 'post' in question.attrs['class']:
                return question


[docs]class AskbotCommand(BackendCommand):
    """Class to run Askbot backend from the command line."""

    BACKEND = Askbot

[docs]    @classmethod
    def setup_cmd_parser(cls):
        """Returns the Askbot argument parser."""

        parser = BackendCommandArgumentParser(cls.BACKEND,
                                              from_date=True,
                                              archive=True,
                                              ssl_verify=True)

        # Required arguments
        parser.parser.add_argument('url',
                                   help="URL of the Askbot server")

        return parser