# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Valerio Cosentino <valcos@bitergia.com>
# Harshal Mittal <harshalmittal4@gmail.com>
# Santiago Dueñas <sduenas@bitergia.com>
#
import logging
import os
import requests
from grimoirelab_toolkit.datetime import datetime_to_utc
from grimoirelab_toolkit.uris import urijoin
from .mbox import MBox, MailingList, CATEGORY_MESSAGE
from ...backend import (BackendCommand,
BackendCommandArgumentParser,
DEFAULT_SEARCH_FIELD)
from ...errors import BackendError
from ...utils import DEFAULT_DATETIME
# File name, inside the mboxes directory, under which the downloaded
# archive is stored
MBOX_FILE = 'messages.zip'
# Base URL of the Groups.io site; group URLs are built as <base>/g/<group_name>
GROUPSIO_URL = 'https://groups.io/'
# Base URL of the Groups.io API; resource names are appended to it
GROUPSIO_API_URL = 'https://groups.io/api/v1'
# Default number of subscriptions requested per page
PER_PAGE = 100
# Module-level logger
logger = logging.getLogger(__name__)
class Groupsio(MBox):
    """Backend for Groups.io mailing lists.

    Retrieves the messages of a Groups.io group. An instance is built
    from the name of the group, the directory where the mbox files are
    downloaded and kept, and the credentials (email and password) of a
    Groups.io user.

    The origin of the data is the URL of the group on Groups.io.

    To find out the names of the groups you are subscribed to, the
    following script can be used:
    https://gist.github.com/valeriocos/2e2231e17fd3052800303bf99bd0c7c4

    :param group_name: Name of the group
    :param dirpath: directory path where the mboxes are stored
    :param email: Groupsio user email
    :param password: Groupsio user password
    :param tag: label used to mark the data
    :param archive: archive to store/retrieve items
    :param ssl_verify: enable/disable SSL verification
    """
    version = '0.4.2'

    CATEGORIES = [CATEGORY_MESSAGE]

    def __init__(self, group_name, dirpath, email, password, tag=None, archive=None, ssl_verify=True):
        # The group URL is what the parent backend receives as its first argument
        group_url = urijoin(GROUPSIO_URL, 'g', group_name)
        super().__init__(group_url, dirpath, tag=tag, archive=archive, ssl_verify=ssl_verify)

        self.group_name = group_name
        self.email = email
        self.password = password

    def search_fields(self, item):
        """Build the search fields of an item.

        The result holds the `metadata_id` of the item and the
        `group_name`, taken from the last segment of the origin.

        :param item: the item to extract the search fields values

        :returns: a dict of search fields
        """
        item_id = self.metadata_id(item)
        group = self.origin.split('/')[-1]

        return {
            DEFAULT_SEARCH_FIELD: item_id,
            'group_name': group
        }

    def fetch(self, category=CATEGORY_MESSAGE, from_date=DEFAULT_DATETIME):
        """Fetch the messages from a Groups.io group.

        Delegates to the parent `fetch` with the given category and date.

        :param category: the category of items to fetch
        :param from_date: obtain messages since this date

        :returns: a generator of messages
        """
        return super().fetch(category, from_date)

    def fetch_items(self, category, **kwargs):
        """Download the group archive and yield its messages.

        :param category: the category of items to fetch
        :param kwargs: backend arguments; `from_date` is required

        :returns: a generator of items
        """
        since = kwargs['from_date']

        logger.info("Looking for messages from '%s' since %s",
                    self.uri, str(since))

        # Building the client logs the user in; fetch() downloads the archive
        client = GroupsioClient(self.group_name, self.dirpath,
                                self.email, self.password, self.ssl_verify)
        client.fetch(since)

        for parsed in self._fetch_and_parse_messages(client, since):
            yield parsed

        logger.info("Fetch process completed")

    @classmethod
    def has_archiving(cls):
        """Tell whether items can be archived during the fetch process.

        :returns: `False`, this backend does not support items archive
        """
        return False

    @classmethod
    def has_resuming(cls):
        """Tell whether the fetch process can be resumed.

        :returns: `True`, this backend supports items resuming
        """
        return True
class GroupsioClient(MailingList):
    """Manage mailing list archives stored by Groups.io.

    This class gives access to remote and local mboxes archives
    from a mailing list stored by Groups.io. This class also allows
    to keep them in sync.

    Creating an instance opens an HTTP session and logs the user in.

    :param group_name: Name of the group
    :param dirpath: directory path where the mboxes are stored
    :param email: Groupsio user email
    :param password: Groupsio user password
    :param ssl_verify: enable/disable SSL verification
    """
    # API resources
    RDOWNLOAD_ARCHIVES = 'downloadarchives'
    RGET_SUBSCRIPTIONS = 'getsubs'
    RLOGIN = 'login'

    # Resource parameters
    PGROUP_ID = 'group_id'
    PSTART_TIME = 'start_time'
    PLIMIT = 'limit'
    PPAGE_TOKEN = 'page_token'
    PEMAIL = 'email'
    PPASSWORD = 'password'

    def __init__(self, group_name, dirpath, email, password, ssl_verify=True):
        url = urijoin(GROUPSIO_URL, 'g', group_name)
        super().__init__(url, dirpath)

        self.session = requests.Session()
        self.group_name = group_name
        # Must be set before logging in: the login request honours it too
        self.ssl_verify = ssl_verify

        self.__login(email, password)

    def fetch(self, from_date=None):
        """Fetch the mbox archive from the remote archiver.

        Stores the archive in the path given during the initialization
        of this object, creating the directory when it is missing.

        Groups.io archives are returned as a .zip file, which contains
        one file in mbox format.

        :param from_date: datetime from which messages are requested;
            it is converted to UTC and sent in ISO format. When it is
            not set, no start time is sent

        :returns: `True` when the archive was downloaded and stored,
            `False` when it could not be written

        :raises BackendError: when the group is not found among the
            subscriptions or its archive cannot be downloaded
        :raises requests.exceptions.HTTPError: when the API replies
            with an error status
        """
        logger.info("Downloading mboxes from '%s'", self.uri)
        logger.debug("Storing mboxes in '%s'", self.dirpath)

        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(self.dirpath, exist_ok=True)

        group_id, group_download_archive = self.__find_group_info()

        if not group_download_archive:
            msg = "Download archive permission disabled for the group %s" % self.group_name
            logger.error(msg)
            raise BackendError(cause=msg)

        url = urijoin(GROUPSIO_API_URL, self.RDOWNLOAD_ARCHIVES)
        payload = {
            self.PGROUP_ID: group_id
        }

        if from_date:
            payload[self.PSTART_TIME] = datetime_to_utc(from_date).isoformat()

        filepath = os.path.join(self.dirpath, MBOX_FILE)
        return self._download_archive(url, payload, filepath)

    def subscriptions(self, per_page=PER_PAGE):
        """Fetch the paginated subscriptions of the logged-in session.

        :param per_page: number of subscriptions per page

        :returns: a generator yielding the `data` field of each page

        :raises requests.exceptions.HTTPError: when the API replies
            with an error status
        """
        url = urijoin(GROUPSIO_API_URL, self.RGET_SUBSCRIPTIONS)
        logger.debug("Get groupsio paginated subscriptions from %s", url)

        keep_fetching = True
        payload = {
            self.PLIMIT: per_page
        }

        while keep_fetching:
            r = self.__fetch(url, payload)
            response_raw = r.json()
            subscriptions = response_raw['data']
            yield subscriptions

            total_subscriptions = response_raw['total_count']
            logger.debug("Subscriptions: %i/%i",
                         response_raw['end_item'], total_subscriptions)

            # The token returned by this page selects the next one
            payload[self.PPAGE_TOKEN] = response_raw['next_page_token']
            keep_fetching = response_raw['has_more']

    def _download_archive(self, url, payload, filepath):
        """Download the archive at `url` and store it in `filepath`.

        :param url: URL of the download resource
        :param payload: query parameters of the request
        :param filepath: path of the file where the archive is written

        :returns: `True` when the archive was stored, `False` when an
            `OSError` prevented it

        :raises requests.exceptions.HTTPError: when the server replies
            with an error status
        """
        r = self.session.get(url, params=payload, stream=True, verify=self.ssl_verify)

        try:
            r.raise_for_status()
            self._write_archive(r, filepath)
        except requests.exceptions.HTTPError as e:
            # Keep this handler first: requests exceptions derive from
            # IOError, so the OSError clause below would catch them too
            logger.error("Impossible to download archives from %s. Error info: %s",
                         self.uri, str(e.response.text))
            raise
        except OSError as e:
            logger.warning("Ignoring %s archive due to: %s", self.uri, str(e))
            return False
        finally:
            # A streamed response keeps its connection until it is closed
            r.close()

        logger.debug("%s archive downloaded and stored in %s", self.uri, filepath)

        return True

    @staticmethod
    def _write_archive(r, filepath):
        """Write the raw body of the response `r` to `filepath`.

        The body is copied in fixed-size chunks so the whole archive
        is never held in memory at once.

        :param r: streamed response whose `raw` body is read
        :param filepath: path of the file to write
        """
        chunk_size = 1024 * 1024

        with open(filepath, 'wb') as fd:
            while True:
                chunk = r.raw.read(chunk_size)
                if not chunk:
                    break
                fd.write(chunk)

    def __find_group_info(self):
        """Find the id and download archive permission of a group given
        its name by iterating on the list of subscriptions.

        :returns: a tuple with the group id and the value of its
            `download_archives` permission

        :raises BackendError: when no subscription matches the group name
        """
        group_subscriptions = self.subscriptions()

        for subscriptions in group_subscriptions:
            for sub in subscriptions:
                if sub['group_name'] == self.group_name:
                    return sub['group_id'], sub['perms']['download_archives']

        msg = "Group id not found for group name %s" % self.group_name
        raise BackendError(cause=msg)

    def __fetch(self, url, payload):
        """Fetch requests from groupsio API.

        :param url: URL of the resource
        :param payload: query parameters of the request

        :returns: the response of the server

        :raises requests.exceptions.HTTPError: when the server replies
            with an error status
        """
        r = self.session.get(url, params=payload, verify=self.ssl_verify)
        r.raise_for_status()

        return r

    def __login(self, email, password):
        """Login a user to the server based on email and password.

        :param email: Groupsio user email
        :param password: Groupsio user password

        :raises requests.exceptions.HTTPError: when the server rejects
            the login request
        """
        url = urijoin(GROUPSIO_API_URL, self.RLOGIN)
        payload = {
            self.PEMAIL: email,
            self.PPASSWORD: password
        }

        # NOTE(review): the credentials travel as URL query parameters;
        # confirm whether the API also accepts them in the request body
        r = self.session.post(url, params=payload, verify=self.ssl_verify)
        # Fail here instead of reporting a login that did not succeed
        r.raise_for_status()

        logger.debug("Groupsio email %s authenticated in %s",
                     email, GROUPSIO_API_URL)
class GroupsioCommand(BackendCommand):
    """Command-line runner for the Groupsio backend."""

    BACKEND = Groupsio

    def _pre_init(self):
        """Set the `dirpath` argument where the mboxes are stored.

        The path given with `--mboxes-path` is used when present;
        otherwise a path under `~/.perceval/mailinglists/` is built
        from the Groups.io URL and the group name.
        """
        custom_path = self.parsed_args.mboxes_path

        if custom_path:
            dirpath = custom_path
        else:
            root = os.path.expanduser('~/.perceval/mailinglists/')
            dirpath = os.path.join(root, GROUPSIO_URL, 'g', self.parsed_args.group_name)

        self.parsed_args.dirpath = dirpath

    @classmethod
    def setup_cmd_parser(cls):
        """Build and return the argument parser of the Groupsio command."""

        cmd_parser = BackendCommandArgumentParser(cls.BACKEND,
                                                  from_date=True,
                                                  ssl_verify=True)
        arg_parser = cmd_parser.parser

        # Backend specific options
        groupsio_args = arg_parser.add_argument_group('Groupsio arguments')
        groupsio_args.add_argument('--mboxes-path', dest='mboxes_path',
                                   help="Path where mbox files will be stored")

        # Group name (positional) and user credentials
        arg_parser.add_argument('group_name', help="Name of the group on Groups.io")
        arg_parser.add_argument('-e', '--email', dest='email', help="Groupsio user email")
        arg_parser.add_argument('-p', '--password', dest='password', help="Groupsio user password")

        return cmd_parser