Source code for perceval.backends.core.confluence

# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#     Santiago Dueñas <sduenas@bitergia.com>
#     Stephan Barth <stephan.barth@gmail.com>
#     Valerio Cosentino <valcos@bitergia.com>
#     Jesus M. Gonzalez-Barahona <jgb@gsyc.es>
#     Maurizio Pillitu <maoo@apache.org>
#     Harshal Mittal <harshalmittal4@gmail.com>
#

import logging
import json

import requests

from grimoirelab_toolkit.datetime import datetime_to_utc, str_to_datetime
from grimoirelab_toolkit.uris import urijoin

from ...backend import (Backend,
                        BackendCommand,
                        BackendCommandArgumentParser,
                        DEFAULT_SEARCH_FIELD)
from ...client import HttpClient
from ...utils import DEFAULT_DATETIME

CATEGORY_HISTORICAL_CONTENT = "historical content"
MAX_CONTENTS = 200
SEARCH_ANCESTOR_IDS = 'ancestor_ids'
SEARCH_CONTENT_ID = 'content_id'
SEARCH_CONTENT_VERSION_NUMBER = 'version_number'


logger = logging.getLogger(__name__)


[docs]class Confluence(Backend):
    """Confluence backend.

    This class allows the fetch the historical contents (content
    versions) stored on a Confluence server. Initialize this class
    passing the URL os this server. The `url` will be set as the
    origin of the data.

    :param url: URL of the server
    :param tag: label used to mark the data
    :param archive: archive to store/retrieve items
    :param ssl_verify: enable/disable SSL verification
    """
    version = '0.12.0'

    CATEGORIES = [CATEGORY_HISTORICAL_CONTENT]

    def __init__(self, url, tag=None, archive=None, ssl_verify=True):
        origin = url

        super().__init__(origin, tag=tag, archive=archive, ssl_verify=ssl_verify)
        self.url = url
        self.client = None

[docs]    def search_fields(self, item):
        """Add search fields to an item.

        It adds the values of `metadata_id` plus the page ancestor IDs,
        the content ID and the content version number.

        :param item: the item to extract the search fields values

        :returns: a dict of search fields
        """
        search_fields = {
            DEFAULT_SEARCH_FIELD: self.metadata_id(item),
            SEARCH_ANCESTOR_IDS: None,
            SEARCH_CONTENT_ID: None,
            SEARCH_CONTENT_VERSION_NUMBER: None
        }

        ancestors_ids = []

        ancestors = item.get('ancestors', None)
        if ancestors:
            for ancestor in ancestors:
                if 'id' in ancestor:
                    ancestors_ids.append(ancestor['id'])

        search_fields[SEARCH_ANCESTOR_IDS] = ancestors_ids
        search_fields[SEARCH_CONTENT_ID] = item['id']
        search_fields[SEARCH_CONTENT_VERSION_NUMBER] = item['version']['number']

        return search_fields

[docs]    def fetch(self, category=CATEGORY_HISTORICAL_CONTENT, from_date=DEFAULT_DATETIME):
        """Fetch the contents by version from the server.

        This method fetches the different historical versions (or
        snapshots) of the contents stored in the server that were
        updated since the given date. Only those snapshots created
        or updated after `from_date` will be returned.

        Take into account that the seconds of `from_date` parameter will
        be ignored because the Confluence REST API only accepts the date
        and hours and minutes for timestamps values.

        :param category: the category of items to fetch
        :param from_date: obtain historical versions of contents updated
            since this date

        :returns: a generator of historical versions
        """
        if not from_date:
            from_date = DEFAULT_DATETIME

        from_date = datetime_to_utc(from_date)

        kwargs = {
            'from_date': from_date
        }

        items = super().fetch(category, **kwargs)

        return items

[docs]    def fetch_items(self, category, **kwargs):
        """Fetch the contents

        :param category: the category of items to fetch
        :param kwargs: backend arguments

        :returns: a generator of items
        """

        from_date = kwargs['from_date']

        logger.info("Fetching historical contents of '%s' from %s",
                    self.url, str(from_date))

        nhcs = 0

        contents = self.__fetch_contents_summary(from_date)
        contents = [content for content in contents]

        for content in contents:
            cid = content['id']
            content_url = urijoin(self.origin, content['_links']['webui'])

            hcs = self.__fetch_historical_contents(cid, from_date)

            for hc in hcs:
                hc['content_url'] = content_url
                hc['ancestors'] = content.get('ancestors', [])

                yield hc
                nhcs += 1

        logger.info("Fetch process completed: %s historical contents fetched",
                    nhcs)

[docs]    @classmethod
    def has_archiving(cls):
        """Returns whether it supports archiving items on the fetch process.

        :returns: this backend supports items archive
        """
        return True

[docs]    @classmethod
    def has_resuming(cls):
        """Returns whether it supports to resume the fetch process.

        :returns: this backend supports items resuming
        """
        return True

[docs]    @staticmethod
    def metadata_id(item):
        """Extracts the identifier from a Confluence item.

        This identifier will be the mix of two fields because a
        historical content does not have any unique identifier.
        In this case, 'id' and 'version' values are combined because
        it should not be possible to have two equal version numbers
        for the same content. The value to return will follow the
        pattern: <content>#v<version> (i.e 28979#v10).
        """
        cid = item['id']
        cversion = item['version']['number']

        return str(cid) + '#v' + str(cversion)

[docs]    @staticmethod
    def metadata_updated_on(item):
        """Extracts and coverts the update time from a Confluence item.

        The timestamp is extracted from 'when' field on 'version' section.
        This date is converted to UNIX timestamp format.

        :param item: item generated by the backend

        :returns: a UNIX timestamp
        """
        ts = item['version']['when']
        ts = str_to_datetime(ts)

        return ts.timestamp()

[docs]    @staticmethod
    def metadata_category(item):
        """Extracts the category from a Confluence item.

        This backend only generates one type of item which is
        'historical content'.
        """
        return CATEGORY_HISTORICAL_CONTENT

[docs]    @staticmethod
    def parse_contents_summary(raw_json):
        """Parse a Confluence summary JSON list.

        The method parses a JSON stream and returns an iterator
        of diccionaries. Each dictionary is a content summary.

        :param raw_json: JSON string to parse

        :returns: a generator of parsed content summaries.
        """
        summary = json.loads(raw_json)

        contents = summary['results']
        for c in contents:
            yield c

[docs]    @staticmethod
    def parse_historical_content(raw_json):
        """Parse a Confluence historical content JSON stream.

        This method parses a JSON stream and returns a dictionary
        that contains the data of a historical content.

        :param raw_json: JSON string to parse

        :returns: a dict with historical content
        """
        hc = json.loads(raw_json)
        return hc

    def _init_client(self, from_archive=False):
        """Init client"""

        return ConfluenceClient(self.url, archive=self.archive, from_archive=from_archive, ssl_verify=self.ssl_verify)

    def __fetch_contents_summary(self, from_date):
        logger.debug("Fetching contents summary from %s", str(from_date))
        for page in self.client.contents(from_date=from_date):
            for cs in self.parse_contents_summary(page):
                yield cs

    def __fetch_historical_contents(self, cid, from_date):
        logger.debug("Fetching historical contents of %s content", cid)

        fetching = True
        version = 1

        while fetching:
            logger.debug("Fetching and parsing historical content #%s for %s ",
                         str(version), cid)

            try:
                raw_hc = self.client.historical_content(cid, version)
            except requests.exceptions.HTTPError as e:
                code = e.response.status_code

                # Common problems found: removed and private contents
                if code not in (404, 500):
                    raise e

                logger.warning("Error retrieving content %s v#%s; skipping",
                               cid, version)
                logger.warning("Exception: %s", str(e))
                break

            hc = self.parse_historical_content(raw_hc)

            # if 'when' attribute is not present, the historical content is skipped
            if 'when' not in hc['version']:
                logger.debug("Content %s v%s skipped due to missing 'when' attribute",
                             hc['id'], str(hc['version']['number']))

                fetching = not hc['history']['latest']
                version += 1
                continue

            # Return those versions that were created after 'from_date'
            when = str_to_datetime(hc['version']['when'])
            if when >= from_date:
                yield hc
            else:
                logger.debug("Content %s v%s updated before %s; skipped",
                             hc['id'], str(hc['version']['number']), str(from_date))

            # Check whether it retrieved the latest version
            fetching = not hc['history']['latest']
            version += 1


[docs]class ConfluenceCommand(BackendCommand):
    """Class to run Confluence backend from the command line."""

    BACKEND = Confluence

[docs]    @classmethod
    def setup_cmd_parser(cls):
        """Returns the Bugzilla argument parser."""

        parser = BackendCommandArgumentParser(cls.BACKEND,
                                              from_date=True,
                                              archive=True,
                                              ssl_verify=True)

        # Required arguments
        parser.parser.add_argument('url',
                                   help="URL of the Confluence server")

        return parser


[docs]class ConfluenceClient(HttpClient):
    """Confluence REST API client.

    This class implements a client to retrieve contents from a
    Confluence server using its REST API.

    :param base_url: URL of the Confluence server
    :param archive: an archive to store/read fetched data
    :param from_archive: it tells whether to write/read the archive
    :param ssl_verify: enable/disable SSL verification
    """
    URL = "%(base)s/rest/api/%(resource)s"

    # API resources
    RCONTENTS = 'content'
    RHISTORY = 'history'
    RSPACE = 'space'

    # API methods
    MSEARCH = 'search'

    # API parameters
    PCQL = 'cql'
    PEXPAND = 'expand'
    PLIMIT = 'limit'
    PSTART = 'start'
    PSTATUS = 'status'
    PVERSION = 'version'
    PANCESTORS = 'ancestors'

    # Common values
    VCQL = "lastModified>='%(date)s' order by lastModified"
    VEXPAND = ['body.storage', 'history', 'version']
    VHISTORICAL = 'historical'

    def __init__(self, base_url, archive=None, from_archive=False, ssl_verify=True):
        super().__init__(base_url.rstrip('/'), archive=archive, from_archive=from_archive, ssl_verify=ssl_verify)

[docs]    def contents(self, from_date=DEFAULT_DATETIME,
                 offset=None, max_contents=MAX_CONTENTS):
        """Get the contents of a repository.

        This method returns an iterator that manages the pagination
        over contents. Take into account that the seconds of `from_date`
        parameter will be ignored because the API only works with
        hours and minutes.

        :param from_date: fetch the contents updated since this date
        :param offset: fetch the contents starting from this offset
        :param limit: maximum number of contents to fetch per request
        """
        resource = self.RCONTENTS + '/' + self.MSEARCH

        # Set confluence query parameter (cql)
        date = from_date.strftime("%Y-%m-%d %H:%M")
        cql = self.VCQL % {'date': date}

        # Set parameters
        params = {
            self.PCQL: cql,
            self.PLIMIT: max_contents,
            self.PEXPAND: self.PANCESTORS
        }

        if offset:
            params[self.PSTART] = offset

        for response in self._call(resource, params):
            yield response

[docs]    def historical_content(self, content_id, version):
        """Get the snapshot of a content for the given version.

        :param content_id: fetch the snapshot of this content
        :param version: snapshot version of the content
        """
        resource = self.RCONTENTS + '/' + str(content_id)

        params = {
            self.PVERSION: version,
            self.PSTATUS: self.VHISTORICAL,
            self.PEXPAND: ','.join(self.VEXPAND)
        }

        # Only one item is returned
        response = [response for response in self._call(resource, params)]
        return response[0]

    def _call(self, resource, params):
        """Retrive the given resource.

        :param resource: resource to retrieve
        :param params: dict with the HTTP parameters needed to retrieve
            the given resource
        """
        url = self.URL % {'base': self.base_url, 'resource': resource}

        logger.debug("Confluence client requests: %s params: %s",
                     resource, str(params))

        while True:
            r = self.fetch(url, payload=params)
            yield r.text

            # Pagination is available when 'next' link exists
            j = r.json()
            if '_links' not in j:
                break
            if 'next' not in j['_links']:
                break

            url = urijoin(self.base_url, j['_links']['next'])
            params = {}