# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Santiago DueƱas <sduenas@bitergia.com>
# Valerio Cosentino <valcos@bitergia.com>
# Israel Herraiz <israel.herraiz@bbvadata.com>
# anveshc05 <anveshc10047@gmail.com>
# Jesus M. Gonzalez-Barahona <jgb@gsyc.es>
# Harshal Mittal <harshalmittal4@gmail.com>
# Victor Morales <victor.morales@intel.com>
# animesh <animuz111@gmail.com>
#
import collections
import io
import logging
import os
import re
import subprocess
import threading
import dulwich.client
import dulwich.repo
from grimoirelab_toolkit.datetime import datetime_to_utc, str_to_datetime
from ...backend import (Backend,
BackendCommand,
BackendCommandArgumentParser)
from ...errors import RepositoryError, ParseError
from ...utils import DEFAULT_DATETIME, DEFAULT_LAST_DATETIME
CATEGORY_COMMIT = 'commit'
logger = logging.getLogger(__name__)
[docs]class Git(Backend):
"""Git backend.
This class allows the fetch the commits from a Git repository
(local or remote) or from a log file. To initialize this class,
you have to provide the URI repository and a value for `gitpath`.
This `uri` will be set as the origin of the data.
When `gitpath` is a directory or does not exist, it will be
considered as the place where the repository is/will be cloned;
when `gitpath` is a file it will be considered as a Git log file.
:param uri: URI of the Git repository
:param gitpath: path to the repository or to the log file
:param tag: label used to mark the data
:param archive: archive to store/retrieve items
:raises RepositoryError: raised when there was an error cloning or
updating the repository.
"""
version = '0.12.1'
CATEGORIES = [CATEGORY_COMMIT]
def __init__(self, uri, gitpath, tag=None, archive=None):
origin = uri
super().__init__(origin, tag=tag, archive=archive)
self.uri = uri
self.gitpath = gitpath
[docs] def fetch(self, category=CATEGORY_COMMIT, from_date=DEFAULT_DATETIME, to_date=DEFAULT_LAST_DATETIME,
branches=None, latest_items=False, no_update=False):
"""Fetch commits.
The method retrieves from a Git repository or a log file
a list of commits. Commits are returned in the same order
they were obtained.
When `from_date` parameter is given it returns items committed
since the given date.
The list of `branches` is a list of strings, with the names of
the branches to fetch. If the list of branches is empty, no
commit is fetched. If the list of branches is None, all commits
for all branches will be fetched.
The parameter `latest_items` returns only those commits which
are new since the last time this method was called.
The parameter `no_update` returns all commits without performing
an update of the repository before.
Take into account that `from_date` and `branches` are ignored
when the commits are fetched from a Git log file or when
`latest_items` flag is set.
The class raises a `RepositoryError` exception when an error
occurs accessing the repository.
:param category: the category of items to fetch
:param from_date: obtain commits newer than a specific date
(inclusive)
:param to_date: obtain commits older than a specific date
:param branches: names of branches to fetch from (default: None)
:param latest_items: sync with the repository to fetch only the
newest commits
:param no_update: if enabled, don't update the repo with the latest changes
:returns: a generator of commits
"""
if not from_date:
from_date = DEFAULT_DATETIME
if not to_date:
to_date = DEFAULT_LAST_DATETIME
kwargs = {
'from_date': from_date,
'to_date': to_date,
'branches': branches,
'latest_items': latest_items,
'no_update': no_update
}
items = super().fetch(category, **kwargs)
return items
[docs] def fetch_items(self, category, **kwargs):
"""Fetch the commits
:param category: the category of items to fetch
:param kwargs: backend arguments
:returns: a generator of items
"""
from_date = kwargs['from_date']
to_date = kwargs['to_date']
branches = kwargs['branches']
latest_items = kwargs['latest_items']
no_update = kwargs['no_update']
ncommits = 0
try:
if os.path.isfile(self.gitpath):
commits = self.__fetch_from_log()
else:
commits = self.__fetch_from_repo(from_date, to_date, branches,
latest_items, no_update)
for commit in commits:
yield commit
ncommits += 1
except EmptyRepositoryError:
pass
logger.info("Fetch process completed: %s commits fetched",
ncommits)
[docs] @classmethod
def has_archiving(cls):
"""Returns whether it supports archiving items on the fetch process.
:returns: this backend does not support items archive
"""
return False
[docs] @classmethod
def has_resuming(cls):
"""Returns whether it supports to resume the fetch process.
:returns: this backend supports items resuming
"""
return True
[docs] @staticmethod
def parse_git_log_from_file(filepath):
"""Parse a Git log file.
The method parses the Git log file and returns an iterator of
dictionaries. Each one of this, contains a commit.
:param filepath: path to the log file
:returns: a generator of parsed commits
:raises ParseError: raised when the format of the Git log file
is invalid
:raises OSError: raised when an error occurs reading the
given file
"""
with open(filepath, 'r', errors='surrogateescape',
newline=os.linesep) as f:
parser = GitParser(f)
for commit in parser.parse():
yield commit
[docs] @staticmethod
def parse_git_log_from_iter(iterator):
"""Parse a Git log obtained from an iterator.
The method parses the Git log fetched from an iterator, where
each item is a line of the log. It returns and iterator of
dictionaries. Each dictionary contains a commit.
:param iterator: iterator of Git log lines
:raises ParseError: raised when the format of the Git log
is invalid
"""
parser = GitParser(iterator)
for commit in parser.parse():
yield commit
def _init_client(self, from_archive=False):
pass
def __fetch_from_log(self):
logger.info("Fetching commits: '%s' git repository from log file %s",
self.uri, self.gitpath)
return self.parse_git_log_from_file(self.gitpath)
def __fetch_from_repo(self, from_date, to_date, branches, latest_items=False, no_update=False):
# When no latest items are set or the repository has not
# been cloned use the default mode
default_mode = not latest_items or not os.path.exists(self.gitpath)
repo = self.__create_git_repository()
if default_mode:
commits = self.__fetch_commits_from_repo(repo, from_date, to_date, branches, no_update)
else:
commits = self.__fetch_newest_commits_from_repo(repo)
return commits
def __fetch_commits_from_repo(self, repo, from_date, to_date, branches, no_update):
if branches is None:
branches_text = "all"
elif len(branches) == 0:
branches_text = "no"
else:
branches_text = ", ".join(branches)
logger.info("Fetching commits: '%s' git repository from %s to %s; %s branches",
self.uri, str(from_date), str(to_date), branches_text)
# Ignore default datetime to avoid problems with git
# or convert to UTC
if to_date == DEFAULT_LAST_DATETIME:
to_date = None
else:
to_date = datetime_to_utc(to_date)
if from_date == DEFAULT_DATETIME:
from_date = None
else:
from_date = datetime_to_utc(from_date)
if not no_update:
repo.update()
gitlog = repo.log(from_date, to_date, branches)
return self.parse_git_log_from_iter(gitlog)
def __fetch_newest_commits_from_repo(self, repo):
logger.info("Fetching latest commits: '%s' git repository",
self.uri)
hashes = repo.sync()
if not hashes:
return []
gitshow = repo.show(hashes)
return self.parse_git_log_from_iter(gitshow)
def __create_git_repository(self):
if not os.path.exists(self.gitpath):
repo = GitRepository.clone(self.uri, self.gitpath)
elif os.path.isdir(self.gitpath):
repo = GitRepository(self.uri, self.gitpath)
return repo
[docs]class GitCommand(BackendCommand):
"""Class to run Git backend from the command line."""
BACKEND = Git
def _pre_init(self):
"""Initialize repositories directory path"""
if self.parsed_args.git_log:
git_path = self.parsed_args.git_log
elif self.parsed_args.git_path:
git_path = self.parsed_args.git_path
else:
if self.parsed_args.base_path:
base_path = self.parsed_args.base_path
else:
base_path = os.path.expanduser('~/.perceval/repositories/')
processed_uri = self.parsed_args.uri.lstrip('/')
git_path = os.path.join(base_path, processed_uri) + '-git'
setattr(self.parsed_args, 'gitpath', git_path)
[docs] @classmethod
def setup_cmd_parser(cls):
"""Returns the Git argument parser."""
parser = BackendCommandArgumentParser(cls.BACKEND,
from_date=True,
to_date=True)
# Optional arguments
group = parser.parser.add_argument_group('Git arguments')
group.add_argument('--branches', dest='branches',
nargs='+', type=str, default=None,
help="Fetch commits only from these branches")
# Mutual exclusive parameters
exgroup = group.add_mutually_exclusive_group()
exgroup.add_argument('--base-path', dest='base_path',
help="Base path where the Git repositories will be cloned")
exgroup.add_argument('--git-path', dest='git_path',
help="Path where the Git repository will be cloned")
exgroup.add_argument('--git-log', dest='git_log',
help="Path to the Git log file")
exgroup_fetch = group.add_mutually_exclusive_group()
exgroup_fetch.add_argument('--latest-items', dest='latest_items',
action='store_true',
help="Fetch latest commits added to the repository")
exgroup_fetch.add_argument('--no-update', dest='no_update',
action='store_true',
help="Fetch all commits without updating the repository")
# Required arguments
parser.parser.add_argument('uri',
help="URI of the Git log repository")
return parser
[docs]class GitParser:
"""Git log parser.
This class parses a plain Git log stream, converting plain commits
into dict items.
Not every Git log output is valid to be parsed. The Git log stream
must have a specific structure. It must contain raw commits data and
stats about modified files. The next excerpt shows an example of a
valid log:
commit aaa7a9209f096aaaadccaaa7089aaaa3f758a703
Author: John Smith <jsmith@example.com>
AuthorDate: Tue Aug 14 14:30:13 2012 -0300
Commit: John Smith <jsmith@example.com>
CommitDate: Tue Aug 14 14:30:13 2012 -0300
Commit for testing
:000000 100644 0000000... aaaaaaa... A aaa/otherthing
:000000 100644 0000000... aaaaaaa... A aaa/something
:000000 100644 0000000... aaaaaaa... A bbb/bthing
0 0 aaa/otherthing
0 0 aaa/something
0 0 bbb/bthing
Each commit starts with the 'commit' tag that is followed by the
SHA-1 of the commit, its parents (two or more parents in the case
of a merge) and a list of refs, if any.
commit 456a68ee1407a77f3e804a30dff245bb6c6b872f
ce8e0b86a1e9877f42fe9453ede418519115f367
51a3b654f252210572297f47597b31527c475fb8
(HEAD -> refs/heads/master)
The commit line is followed by one or more headers. Each header
has a key and a value:
Author: John Smith <jsmith@example.com>
AuthorDate: Tue Aug 14 14:30:13 2012 -0300
Commit: John Smith <jsmith@example.com>
CommitDate: Tue Aug 14 14:30:13 2012 -0300
Then, an empty line divides the headers from the commit message.
First line of the commit
Commit message splitted into one or several lines.
Each line of the message stars with 4 spaces.
Commit messages can contain a list of 'trailers'. These trailers
have the same format of headers but their meaning is project
dependent. This is an example of a commit message with trailers:
Commit message with trailers
This is the body of the message where trailers are included.
Trailers are part of the body so each line of the message
stars with 4 spaces.
Signed-off-by: John Doe <jdoe@example.com>
Signed-off-by: Jane Rae <jrae@example.com>
After a new empty line, actions and stats over files can be found.
A action line starts with one or more ':' chars and contain data
about the old and new permissions of a file, its old and new indexes,
the action code and the filepath to the file. In the case of a copied,
renamed or moved file, the new filepath to that file is included.
:100644 100644 e69de29... e69de29... R100 aaa/otherthing aaa/otherthing.renamed
Stats lines include the number of lines added and removed, and the
name of the file. The new name is also included for moved or renamed
files.
10 0 aaa/{otherthing => otherthing.renamed}
The commit ends with an empty line.
Take into account that one empty line is valid at the beginning
of the log. This allows to parse empty logs without raising
exceptions.
This example was generated using the next command:
git log --raw --numstat --pretty=fuller --decorate=full \
--parents -M -C -c --remotes=origin --all
:param stream: a file object which stores the log
"""
COMMIT_PATTERN = r"""^commit[ \t](?P<commit>[a-f0-9]{40})
(?:[ \t](?P<parents>[a-f0-9][a-f0-9 \t]+))?
(?:[ \t]\((?P<refs>.+)\))?$
"""
HEADER_TRAILER_PATTERN = r"^(?P<name>[a-zA-z0-9\-]+)\:[ \t]+(?P<value>.+)$"
MESSAGE_LINE_PATTERN = r"^[\s]{4}(?P<msg>.*)$"
ACTION_PATTERN = r"""^(?P<sc>\:+)
(?P<modes>(?:\d{6}[ \t])+)
(?P<indexes>(?:[a-f0-9]+\.{,3}[ \t])+)
(?P<action>[^\t]+)\t+
(?P<file>[^\t]+)
(?:\t+(?P<newfile>.+))?$"""
STATS_PATTERN = r"^(?P<added>\d+|-)[ \t]+(?P<removed>\d+|-)[ \t]+(?P<file>.+)$"
EMPTY_LINE_PATTERN = r"^$"
# Compiled patterns
GIT_COMMIT_REGEXP = re.compile(COMMIT_PATTERN, re.VERBOSE)
GIT_HEADER_TRAILER_REGEXP = re.compile(HEADER_TRAILER_PATTERN, re.VERBOSE)
GIT_MESSAGE_REGEXP = re.compile(MESSAGE_LINE_PATTERN, re.VERBOSE)
GIT_ACTION_REGEXP = re.compile(ACTION_PATTERN, re.VERBOSE)
GIT_STATS_REGEXP = re.compile(STATS_PATTERN, re.VERBOSE)
GIT_NEXT_STATE_REGEXP = re.compile(EMPTY_LINE_PATTERN, re.VERBOSE)
# Git parser status
(INIT,
COMMIT,
HEADER,
MESSAGE,
FILE) = range(5)
# Git trailers
TRAILERS = ['Signed-off-by']
def __init__(self, stream):
self.stream = stream
self.nline = 0
self.state = self.INIT
# Aux vars to store the commit that is being parsed
self.commit = None
self.commit_files = {}
self.handlers = {
self.INIT: self._handle_init,
self.COMMIT: self._handle_commit,
self.HEADER: self._handle_header,
self.MESSAGE: self._handle_message,
self.FILE: self._handle_file
}
[docs] def parse(self):
"""Parse the Git log stream."""
for line in self.stream:
line = line.rstrip('\n')
parsed = False
self.nline += 1
while not parsed:
parsed = self.handlers[self.state](line)
if self.state == self.COMMIT and self.commit:
commit = self._build_commit()
logger.debug("Commit %s parsed", commit['commit'])
yield commit
# Return the last commit, if any
if self.commit:
commit = self._build_commit()
logger.debug("Commit %s parsed", commit['commit'])
yield commit
def _build_commit(self):
def remove_none_values(d):
return {k: v for k, v in d.items() if v is not None}
commit = self.commit
commit = remove_none_values(commit)
commit['files'] = [remove_none_values(item)
for _, item in sorted(self.commit_files.items())]
self.commit = None
self.commit_files = {}
return commit
def _handle_init(self, line):
m = self.GIT_NEXT_STATE_REGEXP.match(line)
# In both cases, the parser advances to the next state.
# It only has to check whether the line has to be parsed
# again or not
self.state = self.COMMIT
parsed = m is not None
return parsed
def _handle_commit(self, line):
m = self.GIT_COMMIT_REGEXP.match(line)
if not m:
msg = "commit expected on line %s" % (str(self.nline))
raise ParseError(cause=msg)
parents = self.__parse_data_list(m.group('parents'))
refs = self.__parse_data_list(m.group('refs'), sep=',')
# Initialize a new commit
self.commit = {}
self.commit['commit'] = m.group('commit')
self.commit['parents'] = parents
self.commit['refs'] = refs
self.state = self.HEADER
return True
def _handle_header(self, line):
m = self.GIT_NEXT_STATE_REGEXP.match(line)
if m:
self.state = self.MESSAGE
return True
m = self.GIT_HEADER_TRAILER_REGEXP.match(line)
if not m:
msg = "invalid header format on line %s" % (str(self.nline))
raise ParseError(cause=msg)
header = m.group('name')
value = m.group('value')
self.commit[header] = value
return True
def _handle_message(self, line):
m = self.GIT_NEXT_STATE_REGEXP.match(line)
if m:
self.state = self.FILE
return True
m = self.GIT_MESSAGE_REGEXP.match(line)
if not m:
logger.debug("Invalid message format on line %s. Skipping.",
str(self.nline))
self.state = self.FILE
return False
msg_line = m.group('msg')
# Concatenate message lines
if 'message' not in self.commit:
self.commit['message'] = ''
else:
self.commit['message'] += '\n'
self.commit['message'] += msg_line
# Check trailers
self._handle_trailer(msg_line)
return True
def _handle_file(self, line):
m = self.GIT_NEXT_STATE_REGEXP.match(line)
if m:
self.state = self.COMMIT
return True
m = self.GIT_ACTION_REGEXP.match(line)
if m:
data = m.groupdict()
self._handle_action_data(data)
return True
m = self.GIT_STATS_REGEXP.match(line)
if m:
data = m.groupdict()
self._handle_stats_data(data)
return True
# No match case
logger.debug("Invalid action format on line %s. Skipping.",
str(self.nline))
self.state = self.COMMIT
return False
def _handle_trailer(self, line):
m = self.GIT_HEADER_TRAILER_REGEXP.match(line)
if not m:
return
trailer = m.group('name')
value = m.group('value')
if trailer not in self.TRAILERS:
logger.debug("Trailer %s found on line %s but is not a core trailer. Skipping.",
trailer, str(self.nline))
return
self.commit.setdefault(trailer, []).append(value)
def _handle_action_data(self, data):
modes = self.__parse_data_list(data['modes'])
indexes = self.__parse_data_list(data['indexes'])
filename = data['file']
if filename not in self.commit_files:
self.commit_files[filename] = {}
self.commit_files[filename]['modes'] = modes
self.commit_files[filename]['indexes'] = indexes
self.commit_files[filename]['action'] = data['action']
self.commit_files[filename]['file'] = filename
self.commit_files[filename]['newfile'] = data['newfile']
def _handle_stats_data(self, data):
filename = self.__get_old_filepath(data['file'])
if filename not in self.commit_files:
self.commit_files[filename] = {'file': filename}
self.commit_files[filename]['added'] = data['added']
self.commit_files[filename]['removed'] = data['removed']
def __parse_data_list(self, data, sep=' '):
if data:
lst = data.strip().split(sep)
return [e.strip() for e in lst]
else:
return []
def __get_old_filepath(self, f):
"""Get the old filepath of a moved/renamed file.
Moved or renamed files can be found in the log with any of the
next patterns:
'old_name => new_name'
'{old_prefix => new_prefix}/name'
'name/{old_suffix => new_suffix}'
This method returns the filepath before the file was moved or
renamed.
"""
i = f.find('{')
j = f.find('}')
if i > -1 and j > -1:
prefix = f[0:i]
inner = f[i + 1:f.find(' => ', i)]
suffix = f[j + 1:]
return prefix + inner + suffix
elif ' => ' in f:
return f.split(' => ')[0]
else:
return f
[docs]class EmptyRepositoryError(RepositoryError):
"""Exception raised when a repository is empty"""
message = "%(repository)s is empty"
GitRef = collections.namedtuple('GitRef', ['hash', 'refname'])
class _GraphWalker:
"""Commit walker needed by fetch_pack"""
def __init__(self, local_refs):
self.heads = [
ref.hash.encode('utf-8') for ref in local_refs
if ref.refname.startswith('refs/heads/')
]
def ack(self, sha):
pass
def next(self):
if self.heads:
ret = self.heads.pop()
return ret
return None
__next__ = next
[docs]class GitRepository:
"""Manage a Git repository.
This class provides access to a Git repository running some
common commands such as `clone`, `pull` or `log`.
To create an instance from a remote repository, use `clone()`
class method.
:param uri: URI of the repository
:param dirpath: local directory where the repository is stored
"""
GIT_PRETTY_OUTPUT_OPTS = [
'--raw', # show data in raw format
'--numstat', # show added/deleted lines per file
'--pretty=fuller', # pretty output
'--decorate=full', # show full refs
'--parents', # show parents information
'-M', # detect and report renames
'-C', # detect and report copies
'-c', # show merge info
]
def __init__(self, uri, dirpath):
gitdir = os.path.join(dirpath, 'HEAD')
if not os.path.exists(dirpath):
cause = "directory '%s' for Git repository '%s' does not exist" % (dirpath, uri)
raise RepositoryError(cause=cause)
elif not os.path.exists(gitdir):
warning = "Working directories for Git repositories no longer supported." \
"Please remove it or clone it using --mirror option."
logger.warning(warning)
cause = "directory '%s' is not a Git mirror of repository '%s'" % (dirpath, uri)
raise RepositoryError(cause=cause)
self.uri = uri
self.dirpath = dirpath
self.gitenv = {
'LANG': 'C',
'PAGER': '',
'HTTP_PROXY': os.getenv('HTTP_PROXY', ''),
'HTTPS_PROXY': os.getenv('HTTPS_PROXY', ''),
'NO_PROXY': os.getenv('NO_PROXY', ''),
'HOME': os.getenv('HOME', '')
}
[docs] @classmethod
def clone(cls, uri, dirpath):
"""Clone a Git repository.
Make a bare copy of the repository stored in `uri` into `dirpath`.
The repository would be either local or remote.
:param uri: URI of the repository
:param dirtpath: directory where the repository will be cloned
:returns: a `GitRepository` class having cloned the repository
:raises RepositoryError: when an error occurs cloning the given
repository
"""
cmd = ['git', 'clone', '--bare', uri, dirpath]
env = {
'LANG': 'C',
'HOME': os.getenv('HOME', '')
}
cls._exec(cmd, env=env)
logger.debug("Git %s repository cloned into %s",
uri, dirpath)
return cls(uri, dirpath)
[docs] def count_objects(self):
"""Count the objects of a repository.
The method returns the total number of objects (packed and unpacked)
available on the repository.
:raises RepositoryError: when an error occurs counting the objects
of a repository
"""
cmd_count = ['git', 'count-objects', '-v']
outs = self._exec(cmd_count, cwd=self.dirpath, env=self.gitenv)
outs = outs.decode('utf-8', errors='surrogateescape').rstrip()
try:
cobjs = {k: v for k, v in (x.split(': ') for x in outs.split('\n'))}
nobjs = int(cobjs['count']) + int(cobjs['in-pack'])
except KeyError as e:
error = "unable to parse 'count-objects' output; reason: '%s' entry not found" \
% e.args[0]
raise RepositoryError(cause=error)
except ValueError as e:
error = "unable to parse 'count-objects' output; reason: %s" % str(e)
raise RepositoryError(cause=error)
logger.debug("Git %s repository has %s objects",
self.uri, str(nobjs))
return nobjs
[docs] def is_detached(self):
"""Check if the repo is in a detached state.
The repository is in a detached state when HEAD is not a symbolic
reference.
:returns: whether the repository is detached or not
:raises RepositoryError: when an error occurs checking the state
of the repository
"""
cmd_sym = ['git', 'symbolic-ref', 'HEAD']
try:
self._exec(cmd_sym, cwd=self.dirpath, env=self.gitenv)
except RepositoryError as e:
if e.msg.find("ref HEAD is not a symbolic ref") == -1:
raise e
return True
else:
return False
[docs] def is_empty(self):
"""Determines whether the repository is empty or not.
Returns `True` when the repository is empty. Under the hood,
it checks the number of objects on the repository. When
this number is 0, the repositoy is empty.
:raises RepositoryError: when an error occurs accessing the
repository
"""
return self.count_objects() == 0
[docs] def update(self):
"""Update repository from its remote.
Calling this method, the repository will be synchronized with
the remote repository using 'fetch' command for 'heads' refs.
Any commit stored in the local copy will be removed; refs
will be overwritten.
:raises RepositoryError: when an error occurs updating the
repository
"""
cmd_update = ['git', 'fetch', 'origin', '+refs/heads/*:refs/heads/*', '--prune']
self._exec(cmd_update, cwd=self.dirpath, env=self.gitenv)
logger.debug("Git %s repository updated into %s",
self.uri, self.dirpath)
[docs] def sync(self):
"""Keep the repository in sync.
This method will synchronize the repository with its 'origin',
fetching newest objects and updating references. It uses low
level commands which allow to keep track of which things
have changed in the repository.
The method also returns a list of hashes related to the new
commits fetched during the process.
:returns: list of new commits
:raises RepositoryError: when an error occurs synchronizing
the repository
"""
pack_name, refs = self._fetch_pack()
if pack_name:
commits = self._read_commits_from_pack(pack_name)
else:
commits = []
logger.debug("Git repository %s (%s) does not have any new object",
self.uri, self.dirpath)
self._update_references(refs)
logger.debug("Git repository %s (%s) is synced",
self.uri, self.dirpath)
return commits
[docs] def rev_list(self, branches=None):
"""Read the list commits from the repository
The list of branches is a list of strings, with the names of the
branches to fetch. If the list of branches is empty, no commit
is fetched. If the list of branches is None, all commits
for all branches will be fetched.
The method returns the Git rev-list of the repository using the
following options:
git rev-list --topo-order
:param branches: names of branches to fetch from (default: None)
:raises EmptyRepositoryError: when the repository is empty and
the action cannot be performed
:raises RepositoryError: when an error occurs executing the command
"""
if self.is_empty():
logger.warning("Git %s repository is empty; unable to get the rev-list",
self.uri)
raise EmptyRepositoryError(repository=self.uri)
cmd_rev_list = ['git', 'rev-list', '--topo-order']
if branches is None:
cmd_rev_list.extend(['--branches', '--tags', '--remotes=origin'])
elif len(branches) == 0:
cmd_rev_list.extend(['--branches', '--tags', '--max-count=0'])
else:
branches = ['refs/heads/' + branch for branch in branches]
cmd_rev_list.extend(branches)
for line in self._exec_nb(cmd_rev_list, cwd=self.dirpath, env=self.gitenv):
yield line.rstrip('\n')
logger.debug("Git rev-list fetched from %s repository (%s)",
self.uri, self.dirpath)
[docs] def log(self, from_date=None, to_date=None, branches=None, encoding='utf-8'):
"""Read the commit log from the repository.
The method returns the Git log of the repository using the
following options:
git log --raw --numstat --pretty=fuller --decorate=full
--all --reverse --topo-order --parents -M -C -c
--remotes=origin
When `from_date` is given, it gets the commits equal or older
than that date. This date is given in a datetime object.
The list of branches is a list of strings, with the names of the
branches to fetch. If the list of branches is empty, no commit
is fetched. If the list of branches is None, all commits
for all branches will be fetched.
:param from_date: fetch commits newer than a specific
date (inclusive)
:param branches: names of branches to fetch from (default: None)
:param encoding: encode the log using this format
:returns: a generator where each item is a line from the log
:raises EmptyRepositoryError: when the repository is empty and
the action cannot be performed
:raises RepositoryError: when an error occurs fetching the log
"""
if self.is_empty():
logger.warning("Git %s repository is empty; unable to get the log",
self.uri)
raise EmptyRepositoryError(repository=self.uri)
cmd_log = ['git', 'log', '--reverse', '--topo-order']
cmd_log.extend(self.GIT_PRETTY_OUTPUT_OPTS)
if from_date:
dt = from_date.strftime("%Y-%m-%d %H:%M:%S %z")
cmd_log.append('--since=' + dt)
if to_date:
dt = to_date.strftime("%Y-%m-%d %H:%M:%S %z")
cmd_log.append('--until=' + dt)
if branches is None:
cmd_log.extend(['--branches', '--tags', '--remotes=origin'])
elif len(branches) == 0:
cmd_log.append('--max-count=0')
else:
branches = ['refs/heads/' + branch for branch in branches]
cmd_log.extend(branches)
for line in self._exec_nb(cmd_log, cwd=self.dirpath, env=self.gitenv):
yield line
logger.debug("Git log fetched from %s repository (%s)",
self.uri, self.dirpath)
[docs] def show(self, commits=None, encoding='utf-8'):
"""Show the data of a set of commits.
The method returns the output of Git show command for a
set of commits using the following options:
git show --raw --numstat --pretty=fuller --decorate=full
--parents -M -C -c [<commit>...<commit>]
When the list of commits is empty, the command will return
data about the last commit, like the default behaviour of
`git show`.
:param commits: list of commits to show data
:param encoding: encode the output using this format
:returns: a generator where each item is a line from the show output
:raises EmptyRepositoryError: when the repository is empty and
the action cannot be performed
:raises RepositoryError: when an error occurs fetching the show output
"""
if self.is_empty():
logger.warning("Git %s repository is empty; unable to run show",
self.uri)
raise EmptyRepositoryError(repository=self.uri)
if commits is None:
commits = []
cmd_show = ['git', 'show']
cmd_show.extend(self.GIT_PRETTY_OUTPUT_OPTS)
cmd_show.extend(commits)
for line in self._exec_nb(cmd_show, cwd=self.dirpath, env=self.gitenv):
yield line
logger.debug("Git show fetched from %s repository (%s)",
self.uri, self.dirpath)
def _fetch_pack(self):
"""Fetch changes and store them in a pack."""
def prepare_refs(refs):
return [ref.hash.encode('utf-8') for ref in refs
if not ref.refname.endswith('^{}')]
def determine_wants(refs):
remote_refs = prepare_refs(self._discover_refs(remote=True))
local_refs = prepare_refs(self._discover_refs())
wants = [ref for ref in remote_refs if ref not in local_refs]
return wants
client, repo_path = dulwich.client.get_transport_and_path(self.uri)
repo = dulwich.repo.Repo(self.dirpath)
fd = io.BytesIO()
local_refs = self._discover_refs()
graph_walker = _GraphWalker(local_refs)
result = client.fetch_pack(repo_path,
determine_wants,
graph_walker,
fd.write)
refs = [GitRef(ref_hash.decode('utf-8'), ref_name.decode('utf-8'))
for ref_name, ref_hash in result.refs.items()]
if len(fd.getvalue()) > 0:
fd.seek(0)
pack = repo.object_store.add_thin_pack(fd.read, None)
pack_name = pack.name().decode('utf-8')
else:
pack_name = None
return (pack_name, refs)
def _read_commits_from_pack(self, packet_name):
"""Read the commits of a pack."""
filepath = 'objects/pack/pack-' + packet_name
cmd_verify_pack = ['git', 'verify-pack', '-v', filepath]
outs = self._exec(cmd_verify_pack, cwd=self.dirpath, env=self.gitenv)
outs = outs.decode('utf-8', errors='surrogateescape').rstrip()
lines = [line.split(' ') for line in outs.split('\n')]
# Commits usually come in the pack ordered from newest to oldest
commits = [parts[0] for parts in lines if parts[1] == 'commit']
commits.reverse()
return commits
def _update_references(self, refs):
"""Update references removing old ones."""
new_refs = [ref.refname for ref in refs]
# Delete old references
for old_ref in self._discover_refs():
if not old_ref.refname.startswith('refs/heads/'):
continue
if old_ref.refname in new_refs:
continue
self._update_ref(old_ref, delete=True)
# Update new references
for new_ref in refs:
refname = new_ref.refname
if refname.endswith('^{}'):
logger.debug("Annotated tag %s ignored for updating in sync process",
refname)
continue
elif not refname.startswith('refs/heads/') and not refname.startswith('refs/tags/'):
logger.debug("Reference %s not needed; ignored for updating in sync process",
refname)
continue
else:
self._update_ref(new_ref)
# Prune repository to remove old branches
cmd = ['git', 'remote', 'prune', 'origin']
self._exec(cmd, cwd=self.dirpath, env=self.gitenv)
def _discover_refs(self, remote=False):
"""Get the current list of local or remote refs."""
if remote:
cmd_refs = ['git', 'ls-remote', '-h', '-t', '--exit-code', 'origin']
sep = '\t'
ignored_error_codes = [2]
else:
# Check first whether the local repo is empty;
# Running 'show-ref' in empty repos gives an error
if self.is_empty():
raise EmptyRepositoryError(repository=self.uri)
cmd_refs = ['git', 'show-ref', '--heads', '--tags']
sep = ' '
ignored_error_codes = [1]
# Error codes returned when no matching refs (i.e, no heads
# or tags) are found in a repository will be ignored. Otherwise,
# the full process would fail for those situations.
outs = self._exec(cmd_refs, cwd=self.dirpath,
env=self.gitenv,
ignored_error_codes=ignored_error_codes)
outs = outs.decode('utf-8', errors='surrogateescape').rstrip()
outs = outs.split('\n') if outs else []
refs = []
for line in outs:
data = line.split(sep)
ref = GitRef(data[0], data[1])
refs.append(ref)
return refs
def _update_ref(self, ref, delete=False):
"""Update a reference."""
cmd = ['git', 'update-ref']
if delete:
cmd.extend(['-d', ref.refname])
action = 'deleted'
else:
cmd.extend([ref.refname, ref.hash])
action = 'updated to %s' % ref.hash
try:
self._exec(cmd, cwd=self.dirpath, env=self.gitenv)
except RepositoryError as e:
logger.warning("Git %s ref could not be %s during sync process in %s (%s); skipped",
ref.refname, action, self.uri, self.dirpath)
else:
logger.debug("Git %s ref %s in %s (%s)",
ref.refname, action, self.uri, self.dirpath)
def _exec_nb(self, cmd, cwd=None, env=None, encoding='utf-8'):
"""Run a command with a non blocking call.
Execute `cmd` command with a non blocking call. The command will
be run in the directory set by `cwd`. Enviroment variables can be
set using the `env` dictionary. The output data is returned
as encoded bytes in an iterator. Each item will be a line of the
output.
:returns: an iterator with the output of the command as encoded bytes
:raises RepositoryError: when an error occurs running the command
"""
self.failed_message = None
logger.debug("Running command %s (cwd: %s, env: %s)",
' '.join(cmd), cwd, str(env))
try:
self.proc = subprocess.Popen(cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
cwd=cwd,
env=env)
err_thread = threading.Thread(target=self._read_stderr,
kwargs={'encoding': encoding},
daemon=True)
err_thread.start()
for line in self.proc.stdout:
yield line.decode(encoding, errors='surrogateescape')
err_thread.join()
self.proc.communicate()
self.proc.stdout.close()
self.proc.stderr.close()
except OSError as e:
err_thread.join()
raise RepositoryError(cause=str(e))
if self.proc.returncode != 0:
cause = "git command - %s (return code: %d)" % \
(self.failed_message, self.proc.returncode)
raise RepositoryError(cause=cause)
def _read_stderr(self, encoding='utf-8'):
"""Reads self.proc.stderr.
Usually, this should be read in a thread, to prevent blocking
the read from stdout of the stderr buffer is filled, and this
function is not called becuase the program is busy in the
stderr reading loop.
Reads self.proc.stderr (self.proc is the subprocess running
the git command), and reads / writes self.failed_message
(the message sent to stderr when git fails, usually one line).
"""
for line in self.proc.stderr:
err_line = line.decode(encoding, errors='surrogateescape')
if self.proc.returncode != 0:
# If the subprocess didn't finish successfully, we expect
# the last line in stderr to provide the cause
if self.failed_message is not None:
# We had a message, there is a newer line, print it
logger.debug("Git log stderr: " + self.failed_message)
self.failed_message = err_line
else:
# The subprocess is successfully up to now, print the line
logger.debug("Git log stderr: " + err_line)
@staticmethod
def _exec(cmd, cwd=None, env=None, ignored_error_codes=None,
encoding='utf-8'):
"""Run a command.
Execute `cmd` command in the directory set by `cwd`. Environment
variables can be set using the `env` dictionary. The output
data is returned as encoded bytes.
Commands which their returning status codes are non-zero will
be treated as failed. Error codes considered as valid can be
ignored giving them in the `ignored_error_codes` list.
:returns: the output of the command as encoded bytes
:raises RepositoryError: when an error occurs running the command
"""
if ignored_error_codes is None:
ignored_error_codes = []
logger.debug("Running command %s (cwd: %s, env: %s)",
' '.join(cmd), cwd, str(env))
try:
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
cwd=cwd, env=env)
(outs, errs) = proc.communicate()
except OSError as e:
raise RepositoryError(cause=str(e))
if proc.returncode != 0 and proc.returncode not in ignored_error_codes:
err = errs.decode(encoding, errors='surrogateescape')
cause = "git command - %s" % err
raise RepositoryError(cause=cause)
else:
logger.debug(errs.decode(encoding, errors='surrogateescape'))
return outs