Source code for perceval.backends.core.mediawiki

# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#     Santiago DueƱas <sduenas@bitergia.com>
#     Stephan Barth <stephan.barth@gmail.com>
#     Alvaro del Castillo <acs@bitergia.com>
#     Valerio Cosentino <valcos@bitergia.com>
#     Jesus M. Gonzalez-Barahona <jgb@gsyc.es>
#     Harshal Mittal <harshalmittal4@gmail.com>
#

import json
import logging

import dateutil

from grimoirelab_toolkit.datetime import (datetime_to_utc,
                                          datetime_utcnow,
                                          str_to_datetime)
from grimoirelab_toolkit.uris import urijoin

from ...backend import (Backend,
                        BackendCommand,
                        BackendCommandArgumentParser)
from ...client import HttpClient
from ...errors import BackendError
from ...utils import DEFAULT_DATETIME

CATEGORY_PAGE = 'page'

logger = logging.getLogger(__name__)

MAX_RECENT_DAYS = 30  # max number of days included in MediaWiki recent changes


[docs]class MediaWiki(Backend): """MediaWiki backend for Perceval. This class retrieves the wiki pages and edits from a MediaWiki site. To initialize this class the URL must be provided. The origin of the data will be set to this URL. It uses different APIs to support pre and post 1.27 MediaWiki versions. The pre 1.27 approach performance is better but it needs different logic for full an incremental retrieval. In pre 1.27 the incremental approach uses the recent changes API which just covers MAX_RECENT_DAYS. If the from_date used is older, all the pages must be retrieved and the consumer of the items must filter itself. Both approach return a common format: a page with all its revisions. It is different how the pages list is generated. The page and revisions data downloaded are the standard. More data could be gathered using additional properties. Deleted pages are not analyzed. :param url: MediaWiki url :param tag: label used to mark the data :param archive: archive to store/retrieve items :param ssl_verify: enable/disable SSL verification """ version = '0.11.0' CATEGORIES = [CATEGORY_PAGE] def __init__(self, url, tag=None, archive=None, ssl_verify=True): origin = url super().__init__(origin, tag=tag, archive=archive, ssl_verify=ssl_verify) self.url = url self.client = None
[docs] def fetch(self, category=CATEGORY_PAGE, from_date=DEFAULT_DATETIME, reviews_api=False): """Fetch the pages from the backend url. The method retrieves, from a MediaWiki url, the wiki pages. :param category: the category of items to fetch :param from_date: obtain pages updated since this date :param reviews_api: use the reviews API available in MediaWiki >= 1.27 :returns: a generator of pages """ if from_date == DEFAULT_DATETIME: from_date = None else: from_date = datetime_to_utc(from_date) kwargs = {"from_date": from_date, "reviews_api": reviews_api} items = super().fetch(category, **kwargs) return items
[docs] def fetch_items(self, category, **kwargs): """Fetch the pages :param category: the category of items to fetch :param kwargs: backend arguments :returns: a generator of items """ from_date = kwargs['from_date'] reviews_api = kwargs['reviews_api'] mediawiki_version = self.client.get_version() logger.info("MediaWiki version: %s", mediawiki_version) if reviews_api: if (mediawiki_version[0] == 1 and mediawiki_version[1] >= 27) or mediawiki_version[0] > 1: fetcher = self.__fetch_1_27(from_date) else: logger.warning("Reviews API only available in MediaWiki >= 1.27") logger.warning("Using the Pages API instead") fetcher = self.__fetch_pre1_27(from_date) else: fetcher = self.__fetch_pre1_27(from_date) for page_reviews in fetcher: yield page_reviews
[docs] @classmethod def has_archiving(cls): """Returns whether it supports archiving items on the fetch process. :returns: this backend supports items archive """ return True
[docs] @classmethod def has_resuming(cls): """Returns whether it supports to resume the fetch process. :returns: this backend does not support items resuming """ return False
[docs] @staticmethod def metadata_id(item): """Extracts the identifier from a MediaWiki page.""" return str(item['pageid'])
[docs] @staticmethod def metadata_updated_on(item): """Extracts the update field from a MediaWiki item. The timestamp is extracted from 'update' field. This date is a UNIX timestamp but needs to be converted to a float value. :param item: item generated by the backend :returns: a UNIX timestamp """ return float(item['update'])
[docs] @staticmethod def metadata_category(item): """Extracts the category from a MediaWiki item. This backend only generates one type of item which is 'page'. """ return CATEGORY_PAGE
def _init_client(self, from_archive=False): """Init client""" return MediaWikiClient(self.url, self.archive, from_archive, self.ssl_verify) def __get_max_date(self, reviews): """"Get the max date in unixtime format from reviews.""" max_ts = 0 for review in reviews: ts = str_to_datetime(review['timestamp']) ts = datetime_to_utc(ts) if ts.timestamp() > max_ts: max_ts = ts.timestamp() return max_ts def __get_namespaces_contents(self): # The pages are organized in namespaces of different types # Only contents namespaces are analyzed in this backend raw_namespaces = self.client.get_namespaces() namespaces = json.loads(raw_namespaces)["query"]["namespaces"] namespaces_contents = [ns for ns in namespaces if 'content' in namespaces[ns].keys()] return namespaces_contents def __fetch_1_27(self, from_date=None): """Fetch the pages from the backend url for MediaWiki >=1.27 The method retrieves, from a MediaWiki url, the wiki pages. :returns: a generator of pages """ logger.info("Looking for pages at url '%s'", self.url) npages = 0 # number of pages processed tpages = 0 # number of total pages pages_done = [] # pages already retrieved in reviews API namespaces_contents = self.__get_namespaces_contents() arvcontinue = '' # pagination for getting revisions and their pages while arvcontinue is not None: raw_pages = self.client.get_pages_from_allrevisions(namespaces_contents, from_date, arvcontinue) data_json = json.loads(raw_pages) arvcontinue = data_json['continue']['arvcontinue'] if 'continue' in data_json else None pages_json = data_json['query']['allrevisions'] for page in pages_json: if page['pageid'] in pages_done: logger.debug("Page %s already processed; skipped", page['pageid']) continue tpages += 1 pages_done.append(page['pageid']) page_reviews = self.__get_page_reviews(page) if not page_reviews: logger.warning("Revisions not found in %s [page id: %s], page skipped", page['title'], page['pageid']) continue yield page_reviews npages += 1 logger.info("Total number of pages: %i, skipped %i", tpages, tpages - npages) def __get_page_reviews(self, page): revisions_raw = self.client.get_revisions(page['pageid']) page_reviews = self.__build_page_reviews(page, json.loads(revisions_raw)) return page_reviews def __fetch_pre1_27(self, from_date=None): """Fetch the pages from the backend url. The method retrieves, from a MediaWiki url, the wiki pages. :returns: a generator of pages """ def fetch_incremental_changes(namespaces_contents): # Use recent changes API to get the pages from date npages = 0 # number of pages processed tpages = 0 # number of total pages pages_done = [] # pages already retrieved in reviews API rccontinue = '' hole_created = True # To detect that incremental is not complete while rccontinue is not None: raw_pages = self.client.get_recent_pages(namespaces_contents, rccontinue) data_json = json.loads(raw_pages) if 'query-continue' in data_json: # < 1.27 rccontinue = data_json['query-continue']['recentchanges']['rccontinue'] elif 'continue' in data_json: # >= 1.27 rccontinue = data_json['continue']['rccontinue'] else: rccontinue = None pages_json = data_json['query']['recentchanges'] for page in pages_json: page_ts = dateutil.parser.parse(page['timestamp']) if from_date >= page_ts: # The rest of recent changes are older than from_date logger.debug("All recent changes newer than %s processed.", from_date) rccontinue = None hole_created = False break if 'pageid' not in page: logger.warning("Missing pageid in page %s; skipped", page) continue if page['pageid'] in pages_done: logger.debug("Page %s already processed; skipped", page['pageid']) continue tpages += 1 pages_done.append(page['pageid']) page_reviews = self.__get_page_reviews(page) if not page_reviews: logger.warning("Revisions not found in %s [page id: %s], page skipped", page['title'], page['pageid']) continue yield page_reviews npages += 1 if hole_created: logger.error("Incremental update NOT completed. Hole in history created.") logger.info("Total number of pages: %i, skipped %i", tpages, tpages - npages) def fetch_all_pages(namespaces_contents): # Use get all pages API to get pages npages = 0 # number of pages processed tpages = 0 # number of total pages pages_done = [] # pages already retrieved in reviews API for ns in namespaces_contents: apcontinue = '' # pagination for getting pages logger.debug("Getting pages for namespace: %s", ns) while apcontinue is not None: raw_pages = self.client.get_pages(ns, apcontinue) data_json = json.loads(raw_pages) if 'query-continue' in data_json: # < 1.27 apcontinue = data_json['query-continue']['allpages']['apcontinue'] elif 'continue' in data_json: # >= 1.27 apcontinue = data_json['continue']['apcontinue'] else: apcontinue = None pages_json = data_json['query']['allpages'] for page in pages_json: if page['pageid'] in pages_done: logger.debug("Page %s already processed; skipped", page['pageid']) continue tpages += 1 pages_done.append(page['pageid']) page_reviews = self.__get_page_reviews(page) if not page_reviews: logger.warning("Revisions not found in %s [page id: %s], page skipped", page['title'], page['pageid']) continue yield page_reviews npages += 1 logger.info("Total number of pages: %i, skipped %i", tpages, tpages - npages) logger.info("Looking for pages at url '%s'", self.url) # from_date can not be older than MAX_RECENT_DAYS days ago if from_date: if (datetime_utcnow() - from_date).days >= MAX_RECENT_DAYS: cause = "Can't get incremental pages older than %i days." % MAX_RECENT_DAYS cause += " Do a complete analysis without from_date for older changes." raise BackendError(cause=cause) namespaces_contents = self.__get_namespaces_contents() if not from_date: return fetch_all_pages(namespaces_contents) else: return fetch_incremental_changes(namespaces_contents) def __build_page_reviews(self, page, reviews): page['revisions'] = None page['update'] = None if str(page["pageid"]) in reviews["query"]["pages"]: reviews_json = reviews["query"]["pages"][str(page["pageid"])] if 'revisions' in reviews_json: page["revisions"] = reviews_json['revisions'] page['update'] = self.__get_max_date(page['revisions']) else: page = None else: logger.warning("Revisions not found in %s [page id: %s], page skipped", page['title'], page['pageid']) page = None return page
[docs]class MediaWikiClient(HttpClient): """MediaWiki API client. This class implements a simple client to retrieve pages from projects in a MediaWiki node. :param url: URL of mediawiki site: https://wiki.mozilla.org :param archive: an archive to store/retrieved the fetched data :param from_archive: define whether the archive is used to store/read data :param ssl_verify: enable/disable SSL verification :raises HTTPError: when an error occurs doing the request """ # Resource parameters PACTION = "action" PMETA = "meta" PSIPROP = "siprop" PFORMAT = "format" PLIST = "list" PAP_LIMIT = "aplimit" PAP_NAMESPACE = "apnamespace" PAP_CONTINUE = "apcontinue" PRC_LIMIT = "rclimit" PRC_NAMESPACE = "rcnamespace" PRC_PROP = "rcprop" PRC_CONTINUE = "rccontinue" PPROP = "prop" PPAGE_IDS = "pageids" PRV_DIR = "rvdir" PRV_LIMIT = "rvlimit" PRV_START = "rvstart" PARV_NAMESPACE = "arvnamespace" PARV_DIR = "arvdir" PARV_LIMIT = "arvlimit" PARV_PROP = "arvprop" PARV_CONTINUE = "arvcontinue" PARV_START = "arvstart" # Predefined values VQUERY = "query" VSITE_INFO = "siteinfo" VNAMESPACES = "namespaces" VJSON = "json" VALL_PAGES = "allpages" VRECENT_CHANGES = "recentchanges" VRC_PROP = "title|timestamp|ids" VREVISIONS = "revisions" VNEWER = "newer" VALL_REVISIONS = "allrevisions" VIDS = "ids" def __init__(self, url, archive=None, from_archive=False, ssl_verify=True): super().__init__(urijoin(url, "api.php"), archive=archive, from_archive=from_archive, ssl_verify=ssl_verify) self.limit = "max" # Always get the max number of items
[docs] def call(self, params): """Run an API command. :param cgi: cgi command to run on the server :param params: dict with the HTTP parameters needed to run the given command """ logger.debug("MediaWiki client calls API: %s params: %s", self.base_url, str(params)) req = self.fetch(self.base_url, payload=params) return req.text
[docs] def get_namespaces(self): """ Retrieve all contents namespaces.""" params = { self.PACTION: self.VQUERY, self.PMETA: self.VSITE_INFO, self.PSIPROP: self.VNAMESPACES, self.PFORMAT: self.VJSON } return self.call(params)
[docs] def get_version(self): params = { self.PACTION: self.VQUERY, self.PMETA: self.VSITE_INFO, self.PFORMAT: self.VJSON } try: res = self.call(params) siteinfo = json.loads(res) siteinfo = siteinfo["query"]["general"] except Exception as ex: logger.error(ex) cause = "Wrong MediaWiki API: " + self.base_url raise BackendError(cause=cause) version = siteinfo['generator'] # MediaWiki 1.28.0-wmf.7, MediaWiki 1.19alpha version = version.split(" ")[1] # Removes MediaWiki version_major = int(version.split(".")[0]) version_minor = int(version.split(".")[1][0:2]) return [version_major, version_minor]
[docs] def get_pages(self, namespace, apcontinue=''): """Retrieve all pages from a namespace starting from apcontinue.""" params = { self.PACTION: self.VQUERY, self.PLIST: self.VALL_PAGES, self.PAP_LIMIT: self.limit, self.PAP_NAMESPACE: namespace, self.PFORMAT: self.VJSON } if apcontinue: params[self.PAP_CONTINUE] = apcontinue return self.call(params)
[docs] def get_recent_pages(self, namespaces, rccontinue=''): """Retrieve recent pages from all namespaces starting from rccontinue.""" namespaces.sort() params = { self.PACTION: self.VQUERY, self.PLIST: self.VRECENT_CHANGES, self.PRC_LIMIT: self.limit, self.PRC_NAMESPACE: "|".join(namespaces), self.PRC_PROP: self.VRC_PROP, self.PFORMAT: self.VJSON } if rccontinue: params[self.PRC_CONTINUE] = rccontinue return self.call(params)
[docs] def get_revisions(self, pageid, last_date=None): # TODO: Iterate if more than self.max reviews (500) if last_date: last_date_str = last_date.isoformat() params = { self.PACTION: self.VQUERY, self.PPROP: self.VREVISIONS, self.PPAGE_IDS: pageid, self.PRV_DIR: self.VNEWER, self.PRV_LIMIT: self.limit, self.PFORMAT: self.VJSON } if last_date: params[self.PRV_START] = last_date_str return self.call(params)
[docs] def get_pages_from_allrevisions(self, namespaces, from_date=None, arvcontinue=None): if from_date: if from_date.tzinfo != dateutil.tz.tzutc(): raise ValueError("Datetime is not in UTC timezone") from_date_str = from_date.strftime("%Y-%m-%dT%H:%M:%SZ") namespaces.sort() params = { self.PACTION: self.VQUERY, self.PLIST: self.VALL_REVISIONS, self.PARV_NAMESPACE: "|".join(namespaces), self.PARV_DIR: self.VNEWER, self.PARV_LIMIT: self.limit, self.PARV_PROP: self.VIDS, self.PFORMAT: self.VJSON } if arvcontinue: params[self.PARV_CONTINUE] = arvcontinue else: if from_date: params[self.PARV_START] = from_date_str return self.call(params)
[docs]class MediaWikiCommand(BackendCommand): """Class to run MediaWiki backend from the command line.""" BACKEND = MediaWiki
[docs] @classmethod def setup_cmd_parser(cls): """Returns the MediaWiki argument parser.""" parser = BackendCommandArgumentParser(cls.BACKEND, from_date=True, archive=True, ssl_verify=True) # MediaWiki options group = parser.parser.add_argument_group('MediaWiki arguments') group.add_argument('--reviews-api', action='store_true', help="Use the experimental Reviews API in MediaWiki >= 1.27") # Required arguments parser.parser.add_argument('url', help="URL of the MediaWiki server") return parser