# Source code for perceval.backends.core.rss
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Alvaro del Castillo <acs@bitergia.com>
# Santiago Dueñas <sduenas@bitergia.com>
# Stephan Barth <stephan.barth@gmail.com>
# Valerio Cosentino <valcos@bitergia.com>
# Jesus M. Gonzalez-Barahona <jgb@gsyc.es>
# Harshal Mittal <harshalmittal4@gmail.com>
#
import logging
import feedparser
from grimoirelab_toolkit.datetime import str_to_datetime
from ...backend import (Backend,
BackendCommand,
BackendCommandArgumentParser)
from ...client import HttpClient
# Name of the only category of items produced by the RSS backend
CATEGORY_ENTRY = "entry"
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
class RSS(Backend):
    """RSS backend for Perceval.

    This class retrieves the entries from a RSS feed.
    To initialize this class the URL must be provided.
    The `url` will be set as the origin of the data.

    :param url: RSS url
    :param tag: label used to mark the data
    :param archive: archive to store/retrieve items
    :param ssl_verify: enable/disable SSL verification
    """
    version = '0.7.0'

    CATEGORIES = [CATEGORY_ENTRY]

    def __init__(self, url, tag=None, archive=None, ssl_verify=True):
        origin = url

        super().__init__(origin, tag=tag, archive=archive, ssl_verify=ssl_verify)
        self.url = url
        # Placeholder only. NOTE(review): presumably the base class assigns
        # the result of `_init_client` here before `fetch_items` runs --
        # confirm against `Backend`.
        self.client = None

    def fetch(self, category=CATEGORY_ENTRY):
        """Fetch the entries from the url.

        The method retrieves all entries from a RSS url

        :param category: the category of items to fetch

        :returns: a generator of entries
        """
        kwargs = {}
        items = super().fetch(category, **kwargs)

        return items

    def fetch_items(self, category, **kwargs):
        """Fetch the entries

        The raw feed is downloaded through `self.client`, parsed with
        `parse_feed` and the elements found under its 'entries' key are
        yielded one by one.

        :param category: the category of items to fetch
        :param kwargs: backend arguments

        :returns: a generator of items
        """
        logger.info("Looking for rss entries at feed '%s'", self.url)

        nentries = 0  # number of entries

        raw_entries = self.client.get_entries()
        entries = self.parse_feed(raw_entries)['entries']

        for item in entries:
            yield item
            # Counted after the yield, so only consumed entries are counted
            nentries += 1

        # Reached only when the generator is fully consumed
        logger.info("Total number of entries: %i", nentries)

    @classmethod
    def has_archiving(cls):
        """Returns whether it supports archiving entries on the fetch process.

        :returns: this backend supports entries archive
        """
        return True

    @classmethod
    def has_resuming(cls):
        """Returns whether it supports to resume the fetch process.

        :returns: this backend does not support entries resuming
        """
        return False

    @staticmethod
    def metadata_id(item):
        """Extracts the identifier from an entry item.

        The value of the 'link' field, converted to a string, is used
        as the identifier.

        :param item: item generated by the backend

        :returns: the identifier as a string
        """
        return str(item['link'])

    @staticmethod
    def metadata_updated_on(item):
        """Extracts the update time from a RSS item.

        The timestamp is extracted from 'published' field.
        This date is a datetime string that needs to be converted to
        a UNIX timestamp float value.

        :param item: item generated by the backend

        :returns: a UNIX timestamp
        """
        ts = str_to_datetime(item['published'])

        return ts.timestamp()

    @staticmethod
    def metadata_category(item):
        """Extracts the category from a RSS item.

        This backend only generates one type of item which is
        'entry'.
        """
        return CATEGORY_ENTRY

    @staticmethod
    def parse_feed(raw_entries):
        """Parse the raw contents of a feed.

        `fetch_items` relies on this method; it delegates the parsing
        to the `feedparser` library imported by this module.

        :param raw_entries: feed document, as returned by the
            client's `get_entries` method

        :returns: the result of `feedparser.parse`; `fetch_items`
            reads its 'entries' key
        """
        return feedparser.parse(raw_entries)

    def _init_client(self, from_archive=False):
        """Init client

        :param from_archive: forwarded to `RSSClient`, which documents
            it as telling whether to write/read the archive

        :returns: a new `RSSClient` for `self.url`
        """
        return RSSClient(self.url, self.archive, from_archive, self.ssl_verify)
class RSSClient(HttpClient):
    """Minimal HTTP client for reading a RSS feed.

    The client requests the feed document and hands it back as
    text; parsing that text is left to the caller.

    :param url: URL of the RSS feed
    :param archive: an archive to store/read fetched data
    :param from_archive: it tells whether to write/read the archive
    :param ssl_verify: enable/disable SSL verification

    :raises HTTPError: when an error occurs doing the request
    """

    def __init__(self, url, archive=None, from_archive=False, ssl_verify=True):
        # Everything but the URL is handed to the parent by keyword
        options = {
            'archive': archive,
            'from_archive': from_archive,
            'ssl_verify': ssl_verify,
        }
        super().__init__(url, **options)

    def get_entries(self):
        """Retrieve all entries from a RSS feed

        :returns: the `text` attribute of the response obtained by
            fetching `self.base_url`
        """
        # NOTE(review): `base_url` is presumably set by `HttpClient`
        # from the `url` given to the constructor -- confirm.
        return self.fetch(self.base_url).text
class RSSCommand(BackendCommand):
    """Command-line runner for the RSS backend."""

    BACKEND = RSS

    @classmethod
    def setup_cmd_parser(cls):
        """Build the argument parser of the RSS command.

        :returns: a `BackendCommandArgumentParser` for `cls.BACKEND`,
            created with `archive=True` and `ssl_verify=True`, that
            also takes a positional `url` argument
        """
        cmd_parser = BackendCommandArgumentParser(
            cls.BACKEND,
            archive=True,
            ssl_verify=True,
        )

        # Required arguments
        arguments = cmd_parser.parser
        arguments.add_argument('url', help="URL of the RSS feed")

        return cmd_parser