# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Valerio Cosentino <valcos@bitergia.com>
# Harshal Mittal <harshalmittal4@gmail.com>
#
import bs4
import logging
import re
from grimoirelab_toolkit.datetime import datetime_utcnow
from ...backend import (Backend,
BackendCommand,
BackendCommandArgumentParser,
uuid)
from ...client import HttpClient
from ...errors import BackendError
# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# URL every search request is sent to
GOOGLE_SEARCH_URL = "https://www.google.com/search"

# Category name of the items generated by this backend
CATEGORY_HITS = "google_hits"

# Default retries and sleep time to deal with connection/server problems
MAX_RETRIES = 5
DEFAULT_SLEEP_TIME = 1
class GoogleHits(Backend):
    """GoogleHits backend for Perceval.

    This class retrieves the number of hits for a given list of
    keywords. The number is scraped from the HTML page returned by
    the Google search URL. To initialize this class a list of
    keywords is needed.

    :param keywords: a list of keywords
    :param tag: label used to mark the data
    :param archive: archive to store/retrieve items
    :param max_retries: number of max retries to a data source
        before raising a RetryError exception
    :param sleep_time: time (in seconds) to sleep in case
        of connection problems
    :param ssl_verify: enable/disable SSL verification

    :raises BackendError: when no keywords are provided
    """
    version = '0.4.0'

    CATEGORIES = [CATEGORY_HITS]
    EXTRA_SEARCH_FIELDS = {
        'keywords': ['keywords']
    }

    def __init__(self, keywords, tag=None, archive=None,
                 max_retries=MAX_RETRIES, sleep_time=DEFAULT_SLEEP_TIME, ssl_verify=True):
        # An empty list and a single blank keyword both lead to an
        # empty search query, so they are rejected the same way.
        if not keywords or (len(keywords) == 1 and keywords[0].strip() == ""):
            cause = "No keywords provided"
            raise BackendError(cause=cause)

        self.keywords = keywords

        super().__init__(GOOGLE_SEARCH_URL, tag=tag, archive=archive, ssl_verify=ssl_verify)

        self.max_retries = max_retries
        self.sleep_time = sleep_time

        self.client = None

    def fetch(self, category=CATEGORY_HITS):
        """Fetch the number of hits for the keywords.

        The method delegates to the parent `fetch`, passing no
        extra backend arguments.

        :param category: the category of items to fetch

        :returns: a generator of data
        """
        kwargs = {}
        items = super().fetch(category, **kwargs)

        return items

    def fetch_items(self, category, **kwargs):
        """Fetch Google hit items.

        A single item is generated per call: the parsed result of
        one search for `self.keywords`.

        :param category: the category of items to fetch
        :param kwargs: backend arguments

        :returns: a generator of items
        """
        logger.info("Fetching data for '%s'", self.keywords)

        hits_raw = self.client.hits(self.keywords)
        hits = self.__parse_hits(hits_raw)

        yield hits

        logger.info("Fetch process completed")

    @classmethod
    def has_archiving(cls):
        """Returns whether it supports archiving items on the fetch process.

        :returns: this backend supports items archive
        """
        return True

    @classmethod
    def has_resuming(cls):
        """Returns whether it supports to resume the fetch process.

        :returns: this backend supports items resuming
        """
        return True

    def _init_client(self, from_archive=False):
        """Init client.

        :param from_archive: it tells whether to write/read the archive

        :returns: a `GoogleHitsClient` configured with this backend's
            sleep time, retries, archive and SSL verification settings
        """
        # Forward the SSL setting chosen by the user instead of forcing
        # verification on.
        # NOTE(review): relies on the `Backend` base class keeping the
        # constructor's `ssl_verify` value as `self.ssl_verify` - confirm.
        return GoogleHitsClient(self.sleep_time, self.max_retries,
                                archive=self.archive, from_archive=from_archive,
                                ssl_verify=self.ssl_verify)

    def __parse_hits(self, hit_raw):
        """Parse the hits from the HTML page returned by the search.

        The number is read from the text of the `div` element with
        id `resultStats`. When that element is missing, is empty or
        holds no digits, a warning is logged and 0 hits are reported.

        :param hit_raw: HTML text of the search results page

        :returns: a dict with the keys `fetched_on`, `id`, `keywords`,
            `type` and `hits`
        """
        # Create the soup and get the desired div
        bs_result = bs4.BeautifulSoup(hit_raw, 'html.parser')
        stats_div = bs_result.find("div", id="resultStats")

        # `find` returns None when the div is not in the page
        hit_string = stats_div.text if stats_div is not None else ''

        # Remove commas or dots, used as thousands separators
        hit_string = hit_string.replace(',', '')
        hit_string = hit_string.replace('.', '')

        fetched_on = datetime_utcnow().timestamp()

        # Copy the keywords (any sequence) so the timestamp can be
        # appended without touching `self.keywords`
        id_args = list(self.keywords)
        id_args.append(str(fetched_on))

        hits_json = {
            'fetched_on': fetched_on,
            'id': uuid(*id_args),
            'keywords': self.keywords,
            'type': 'googleSearchHits'
        }

        # The first run of digits is the number of results
        found = re.search(r'\d+', hit_string)

        if found is None:
            logger.warning("No hits for %s", self.keywords)
            hits_json['hits'] = 0

            return hits_json

        hits_json['hits'] = int(found.group(0))

        return hits_json
class GoogleHitsClient(HttpClient):
    """Client to retrieve the search results page for some keywords.

    Every request is sent to `GOOGLE_SEARCH_URL` through the
    `fetch` method of the parent HTTP client.

    :param sleep_time: time (in seconds) to sleep in case
        of connection problems
    :param max_retries: number of max retries to a data source
        before raising a RetryError exception
    :param archive: an archive to store/read fetched data
    :param from_archive: it tells whether to write/read the archive
    :param ssl_verify: enable/disable SSL verification
    """
    # Status codes handed to the parent client as `extra_status_forcelist`
    EXTRA_STATUS_FORCELIST = [429]

    # Resource parameters
    PQUERY = 'q'

    def __init__(self, sleep_time=DEFAULT_SLEEP_TIME, max_retries=MAX_RETRIES,
                 archive=None, from_archive=False, ssl_verify=True):
        super().__init__(
            GOOGLE_SEARCH_URL,
            sleep_time=sleep_time,
            max_retries=max_retries,
            extra_status_forcelist=self.EXTRA_STATUS_FORCELIST,
            archive=archive,
            from_archive=from_archive,
            ssl_verify=ssl_verify
        )

    def hits(self, keywords):
        """Fetch the search results page for a list of keywords.

        Several keywords are joined with single spaces to build
        the query; a lone keyword is used as it is.

        :param keywords: a list of keywords

        :returns: the text of the response
        """
        search_terms = keywords[0] if len(keywords) == 1 else ' '.join(keywords)

        logger.info("Fetching hits for '%s'", search_terms)

        # Make the request
        response = self.fetch(GOOGLE_SEARCH_URL,
                              payload={self.PQUERY: search_terms})

        return response.text
class GoogleHitsCommand(BackendCommand):
    """Class to run GoogleHits backend from the command line."""

    BACKEND = GoogleHits

    @classmethod
    def setup_cmd_parser(cls):
        """Build the argument parser for the GoogleHits command.

        On top of the options set by `BackendCommandArgumentParser`
        (created with archive and SSL verification enabled), it adds
        `--max-retries`, `--sleep-time` and the positional `keywords`.

        :returns: the GoogleHits argument parser
        """
        parser = BackendCommandArgumentParser(cls.BACKEND,
                                              archive=True,
                                              ssl_verify=True)
        arg_parser = parser.parser

        # Generic client options: (flag, dest, default, help)
        client_options = (
            ('--max-retries', 'max_retries', MAX_RETRIES,
             "number of API call retries"),
            ('--sleep-time', 'sleep_time', DEFAULT_SLEEP_TIME,
             "sleeping time between API call retries"),
        )

        group = arg_parser.add_argument_group('GoogleHits arguments')

        for flag, dest, default, help_text in client_options:
            group.add_argument(flag, dest=dest,
                               default=default, type=int,
                               help=help_text)

        # Required arguments
        arg_parser.add_argument('keywords', nargs='+',
                                help="Keywords to search as Google hits")

        return parser