# Source code for perceval.backends.core.googlehits

# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#     Valerio Cosentino <valcos@bitergia.com>
#     Harshal Mittal <harshalmittal4@gmail.com>
#

import bs4
import logging
import re

from grimoirelab_toolkit.datetime import datetime_utcnow

from ...backend import (Backend,
                        BackendCommand,
                        BackendCommandArgumentParser,
                        uuid)
from ...client import HttpClient
from ...errors import BackendError

# Default sleep time (in seconds) and maximum number of retries used
# to deal with connection/server problems. Both are the defaults of the
# backend, the client and the command line options defined below.
DEFAULT_SLEEP_TIME = 1
MAX_RETRIES = 5

# Only category of items generated by this backend
CATEGORY_HITS = "google_hits"
# URL queried by the client; it is also the first argument handed to
# the Backend constructor
GOOGLE_SEARCH_URL = 'https://www.google.com/search'


# Module-level logger
logger = logging.getLogger(__name__)


class GoogleHits(Backend):
    """GoogleHits backend for Perceval.

    This class retrieves the number of hits for a given list of
    keywords by querying the Google search page and parsing the
    HTML it returns. To initialize this class a list of keywords
    is needed.

    :param keywords: a list of keywords
    :param tag: label used to mark the data
    :param archive: archive to store/retrieve items
    :param max_retries: number of max retries to a data source
        before raising a RetryError exception
    :param sleep_time: time (in seconds) to sleep in case
        of connection problems
    :param ssl_verify: enable/disable SSL verification

    :raises BackendError: when the only keyword given is blank
    """
    version = '0.4.0'

    CATEGORIES = [CATEGORY_HITS]
    EXTRA_SEARCH_FIELDS = {
        'keywords': ['keywords']
    }

    def __init__(self, keywords, tag=None, archive=None,
                 max_retries=MAX_RETRIES, sleep_time=DEFAULT_SLEEP_TIME, ssl_verify=True):

        # A single blank keyword is what an empty command line argument
        # looks like; there is nothing to search for in that case
        if len(keywords) == 1 and keywords[0].strip() == "":
            cause = "No keywords provided"
            raise BackendError(cause=cause)

        self.keywords = keywords
        super().__init__(GOOGLE_SEARCH_URL, tag=tag, archive=archive, ssl_verify=ssl_verify)

        self.max_retries = max_retries
        self.sleep_time = sleep_time

        self.client = None

    def fetch(self, category=CATEGORY_HITS):
        """Fetch data from Google.

        The method retrieves the hits for the keywords given
        when the backend was initialized.

        :param category: the category of items to fetch

        :returns: a generator of data
        """
        # This backend takes no extra fetching arguments
        return super().fetch(category)

    def fetch_items(self, category, **kwargs):
        """Fetch Google hit items

        A single item, summarizing the hits of all the keywords,
        is generated.

        :param category: the category of items to fetch
        :param kwargs: backend arguments

        :returns: a generator of items
        """
        logger.info("Fetching data for '%s'", self.keywords)

        hits_raw = self.client.hits(self.keywords)
        hits = self.__parse_hits(hits_raw)

        yield hits

        logger.info("Fetch process completed")

    @classmethod
    def has_archiving(cls):
        """Returns whether it supports archiving items on the fetch process.

        :returns: this backend supports items archive
        """
        return True

    @classmethod
    def has_resuming(cls):
        """Returns whether it supports to resume the fetch process.

        :returns: this backend supports items resuming
        """
        return True

    @staticmethod
    def metadata_id(item):
        """Extracts the identifier from a GoogleHit item."""

        return item['id']

    @staticmethod
    def metadata_updated_on(item):
        """Extracts the update time from a GoogleHit item.

        The timestamp is based on the current time when the hit was extracted.
        This field is not part of the data provided by Google. It is added
        by this backend.

        :param item: item generated by the backend

        :returns: a UNIX timestamp
        """
        return item['fetched_on']

    @staticmethod
    def metadata_category(item):
        """Extracts the category from a GoogleHits item.

        This backend only generates one type of item which is
        'google_hits'.
        """
        return CATEGORY_HITS

    def _init_client(self, from_archive=False):
        """Init client

        :param from_archive: it tells whether to write/read the archive

        :returns: a `GoogleHitsClient` configured like this backend
        """
        # Forward the SSL setting chosen by the user instead of forcing
        # verification on. NOTE(review): this relies on the Backend base
        # class keeping the `ssl_verify` value received in `__init__` as
        # an attribute of the same name - confirm against Backend.
        return GoogleHitsClient(self.sleep_time, self.max_retries,
                                archive=self.archive, from_archive=from_archive,
                                ssl_verify=self.ssl_verify)

    def __parse_hits(self, hit_raw):
        """Parse the hits returned by the Google search page

        The number of hits is read from the `div` element whose id
        is `resultStats`. When that element is missing, is empty or
        holds no number, a warning is logged and zero hits are
        reported.

        :param hit_raw: HTML text of the search result page

        :returns: a dict with the keys `fetched_on`, `id`,
            `keywords`, `type` and `hits`
        """
        # Create the soup and get the desired div; `find` returns None
        # when the page does not contain it
        bs_result = bs4.BeautifulSoup(hit_raw, 'html.parser')
        stats_div = bs_result.find("div", id="resultStats")

        fetched_on = datetime_utcnow().timestamp()

        # The identifier combines the keywords and the fetching time;
        # `list()` copies the keywords and also accepts tuples
        id_args = list(self.keywords)
        id_args.append(str(fetched_on))

        hits_json = {
            'fetched_on': fetched_on,
            'id': uuid(*id_args),
            'keywords': self.keywords,
            'type': 'googleSearchHits',
            'hits': 0
        }

        if stats_div is None:
            logger.warning("Result stats not found for %s; assuming no hits",
                           self.keywords)
            return hits_json

        # Remove commas or dots, used as thousands separators
        hit_string = stats_div.text.replace(',', '').replace('.', '')

        if not hit_string:
            logger.warning("No hits for %s", self.keywords)
            return hits_json

        # The first run of digits is the number of hits
        found = re.search(r'\d+', hit_string)

        if found is None:
            logger.warning("No number of hits found in '%s' for %s; assuming no hits",
                           hit_string, self.keywords)
            return hits_json

        hits_json['hits'] = int(found.group(0))

        return hits_json


class GoogleHitsClient(HttpClient):
    """GoogleHits client.

    Client for fetching hits data from the Google search page.

    :param sleep_time: time (in seconds) to sleep in case
        of connection problems
    :param max_retries: number of max retries to a data source
        before raising a RetryError exception
    :param archive: an archive to store/read fetched data
    :param from_archive: it tells whether to write/read the archive
    :param ssl_verify: enable/disable SSL verification
    """
    # HTTP status codes, besides the ones the base client already
    # handles, handed to HttpClient as `extra_status_forcelist`
    EXTRA_STATUS_FORCELIST = [429]

    # Resource parameters
    PQUERY = 'q'

    def __init__(self, sleep_time=DEFAULT_SLEEP_TIME, max_retries=MAX_RETRIES,
                 archive=None, from_archive=False, ssl_verify=True):
        super().__init__(GOOGLE_SEARCH_URL, extra_status_forcelist=self.EXTRA_STATUS_FORCELIST,
                         sleep_time=sleep_time, max_retries=max_retries,
                         archive=archive, from_archive=from_archive, ssl_verify=ssl_verify)

    def hits(self, keywords):
        """Fetch information about a list of keywords.

        The keywords are joined with blank spaces to build a
        single search query.

        :param keywords: a list of keywords (strings)

        :returns: the text of the response to the search request
        """
        # `join` already returns the keyword itself when there is only one
        query_str = ' '.join(keywords)

        logger.info("Fetching hits for '%s'", query_str)
        params = {self.PQUERY: query_str}

        # Make the request
        response = self.fetch(GOOGLE_SEARCH_URL, payload=params)

        return response.text


class GoogleHitsCommand(BackendCommand):
    """Class to run GoogleHits backend from the command line."""

    BACKEND = GoogleHits

    @classmethod
    def setup_cmd_parser(cls):
        """Returns the GoogleHits argument parser.

        Besides the options enabled on the generic parser (archive
        and SSL verification), it defines the `--max-retries` and
        `--sleep-time` options and the `keywords` positional
        argument, which takes one or more values.

        :returns: a `BackendCommandArgumentParser` object
        """
        parser = BackendCommandArgumentParser(cls.BACKEND,
                                              archive=True,
                                              ssl_verify=True)

        group = parser.parser.add_argument_group('GoogleHits arguments')
        # Generic client options
        group.add_argument('--max-retries', dest='max_retries',
                           default=MAX_RETRIES, type=int,
                           help="number of API call retries")
        group.add_argument('--sleep-time', dest='sleep_time',
                           default=DEFAULT_SLEEP_TIME, type=int,
                           help="sleeping time between API call retries")

        # Required arguments
        parser.parser.add_argument('keywords', nargs='+',
                                   help="Keywords to search as Google hits")

        return parser