#!/usr/bin/env python

from __future__ import print_function

import argparse
import email.mime.multipart
import email.mime.text
import logging
import os.path
import pickle
import re
import smtplib
import subprocess
import sys
from datetime import datetime, timedelta

from phabricator import Phabricator

# Setting up a virtualenv to run this script can be done by running the
# following commands:
# $ virtualenv venv
# $ . ./venv/bin/activate
# $ pip install Phabricator

GIT_REPO_METADATA = (("llvm-monorepo", "https://github.com/llvm/llvm-project"),)

# The below PhabXXX classes represent objects as modelled by Phabricator.
# The classes can be serialized to disk, to try and make sure that we don't
# needlessly have to re-fetch lots of data from Phabricator, as that would
# make this script unusably slow.


class PhabObject:
    OBJECT_KIND = None

    def __init__(self, id):
        self.id = id


class PhabObjectCache:
    def __init__(self, PhabObjectClass):
        self.PhabObjectClass = PhabObjectClass
        self.most_recent_info = None
        self.oldest_info = None
        self.id2PhabObjects = {}

    def get_name(self):
        return self.PhabObjectClass.OBJECT_KIND + "sCache"

    def get(self, id):
        if id not in self.id2PhabObjects:
            self.id2PhabObjects[id] = self.PhabObjectClass(id)
        return self.id2PhabObjects[id]

    def get_ids_in_cache(self):
        return list(self.id2PhabObjects.keys())

    def get_objects(self):
        return list(self.id2PhabObjects.values())

    DEFAULT_DIRECTORY = "PhabObjectCache"

    def _get_pickle_name(self, directory):
        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
        return os.path.join(directory, file_name)

    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
        """
        FIXME: consider if serializing to JSON would bring interoperability
        advantages over serializing to pickle.
        """
        try:
            f = open(self._get_pickle_name(directory), "rb")
        except IOError as err:
            print(
                "Could not find cache. Error message: {0}. Continuing...".format(err)
            )
        else:
            with f:
                try:
                    d = pickle.load(f)
                    self.__dict__.update(d)
                except EOFError as err:
                    print(
                        "Cache seems to be corrupt. "
                        + "Not using cache. Error message: {0}".format(err)
                    )

    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(self._get_pickle_name(directory), "wb") as f:
            pickle.dump(self.__dict__, f)
        print(
            "wrote cache to disk, most_recent_info= {0}".format(
                datetime.fromtimestamp(self.most_recent_info)
                if self.most_recent_info is not None
                else None
            )
        )


class PhabReview(PhabObject):
    OBJECT_KIND = "Review"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, title, dateCreated, dateModified, author):
        self.title = title
        self.dateCreated = dateCreated
        self.dateModified = dateModified
        self.author = author

    def setPhabDiffs(self, phabDiffs):
        self.phabDiffs = phabDiffs


class PhabUser(PhabObject):
    OBJECT_KIND = "User"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, phid, realName):
        self.phid = phid
        self.realName = realName


class PhabHunk:
    def __init__(self, rest_api_hunk):
        self.oldOffset = int(rest_api_hunk["oldOffset"])
        self.oldLength = int(rest_api_hunk["oldLength"])
        # self.actual_lines_changed_offset will contain the offsets of the
        # lines that were changed in this hunk.
        self.actual_lines_changed_offset = []
        offset = self.oldOffset
        inHunk = False
        hunkStart = -1
        contextLines = 3
        for line in rest_api_hunk["corpus"].split("\n"):
            if line.startswith("+"):
                # line is a new line that got introduced in this patch.
                # Do not record it as a changed line.
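                # ('+' lines exist only in the new version of the file, so
                # the old-file offset is deliberately not advanced here.)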
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                continue
            if line.startswith("-"):
                # line was changed or removed from the older version of the
                # code. Record it as a changed line.
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                offset += 1
                continue
            # line is a context line.
            if inHunk is True:
                inHunk = False
                hunkEnd = offset + contextLines
                self.actual_lines_changed_offset.append((hunkStart, hunkEnd))
            offset += 1
        if inHunk is True:
            hunkEnd = offset + contextLines
            self.actual_lines_changed_offset.append((hunkStart, hunkEnd))

        # The above algorithm could result in adjacent or overlapping ranges
        # being recorded into self.actual_lines_changed_offset.
        # Merge the adjacent and overlapping ranges in there, e.g.
        #   [(1, 10), (8, 15), (20, 25)] becomes [(1, 15), (20, 25)]:
        t = []
        lastRange = None
        for start, end in self.actual_lines_changed_offset + [
            (sys.maxsize, sys.maxsize)
        ]:
            if lastRange is None:
                lastRange = (start, end)
            else:
                if lastRange[1] >= start:
                    lastRange = (lastRange[0], end)
                else:
                    t.append(lastRange)
                    lastRange = (start, end)
        self.actual_lines_changed_offset = t


class PhabChange:
    def __init__(self, rest_api_change):
        self.oldPath = rest_api_change["oldPath"]
        self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]]


class PhabDiff(PhabObject):
    OBJECT_KIND = "Diff"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, rest_api_results):
        self.revisionID = rest_api_results["revisionID"]
        self.dateModified = int(rest_api_results["dateModified"])
        self.dateCreated = int(rest_api_results["dateCreated"])
        self.changes = [PhabChange(c) for c in rest_api_results["changes"]]


class ReviewsCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabReview)


class UsersCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabUser)


reviews_cache = ReviewsCache()
users_cache = UsersCache()


def init_phab_connection():
    phab = Phabricator()
    phab.update_interfaces()
    return phab


def update_cached_info(
    phab,
    cache,
    phab_query,
    order,
    record_results,
    max_nr_entries_per_fetch,
    max_nr_days_to_cache,
):
    q = phab
    LIMIT = max_nr_entries_per_fetch
    for query_step in phab_query:
        q = getattr(q, query_step)
    results = q(order=order, limit=LIMIT)
    most_recent_info, oldest_info = record_results(cache, results, phab)
    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - timedelta(
        days=max_nr_days_to_cache
    )
    most_recent_info_overall = most_recent_info
    cache.write_cache_to_disk()
    after = results["cursor"]["after"]
    print("after: {0!r}".format(after))
    print("most_recent_info: {0}".format(datetime.fromtimestamp(most_recent_info)))
    while (
        after is not None
        and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch
    ):
        need_more_older_data = (
            cache.oldest_info is None
            or datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch
        )
        print(
            (
                "need_more_older_data={0} cache.oldest_info={1} "
                + "oldest_info_to_fetch={2}"
            ).format(
                need_more_older_data,
                datetime.fromtimestamp(cache.oldest_info)
                if cache.oldest_info is not None
                else None,
                oldest_info_to_fetch,
            )
        )
        need_more_newer_data = (
            cache.most_recent_info is None
            or cache.most_recent_info < most_recent_info
        )
        print(
            (
                "need_more_newer_data={0} cache.most_recent_info={1} "
                + "most_recent_info={2}"
            ).format(need_more_newer_data, cache.most_recent_info, most_recent_info)
        )
        if not need_more_older_data and not need_more_newer_data:
            break
        results = q(order=order, after=after, limit=LIMIT)
        most_recent_info, oldest_info = record_results(cache, results, phab)
        after = results["cursor"]["after"]
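        # "after" is Phabricator's pagination cursor: it marks where the next
        # fetch should resume and becomes None once the last page of results
        # has been fetched.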
print("after: {0!r}".format(after)) print("most_recent_info: {0}".format(datetime.fromtimestamp(most_recent_info))) cache.write_cache_to_disk() cache.most_recent_info = most_recent_info_overall if after is None: # We did fetch all records. Mark the cache to contain all info since # the start of time. oldest_info = 0 cache.oldest_info = oldest_info cache.write_cache_to_disk() def record_reviews(cache, reviews, phab): most_recent_info = None oldest_info = None for reviewInfo in reviews["data"]: if reviewInfo["type"] != "DREV": continue id = reviewInfo["id"] # phid = reviewInfo["phid"] dateModified = int(reviewInfo["fields"]["dateModified"]) dateCreated = int(reviewInfo["fields"]["dateCreated"]) title = reviewInfo["fields"]["title"] author = reviewInfo["fields"]["authorPHID"] phabReview = cache.get(id) if ( "dateModified" not in phabReview.__dict__ or dateModified > phabReview.dateModified ): diff_results = phab.differential.querydiffs(revisionIDs=[id]) diff_ids = sorted(diff_results.keys()) phabDiffs = [] for diff_id in diff_ids: diffInfo = diff_results[diff_id] d = PhabDiff(diff_id) d.update(diffInfo) phabDiffs.append(d) phabReview.update(title, dateCreated, dateModified, author) phabReview.setPhabDiffs(phabDiffs) print( "Updated D{0} modified on {1} ({2} diffs)".format( id, datetime.fromtimestamp(dateModified), len(phabDiffs) ) ) if most_recent_info is None: most_recent_info = dateModified elif most_recent_info < dateModified: most_recent_info = dateModified if oldest_info is None: oldest_info = dateModified elif oldest_info > dateModified: oldest_info = dateModified return most_recent_info, oldest_info def record_users(cache, users, phab): most_recent_info = None oldest_info = None for info in users["data"]: if info["type"] != "USER": continue id = info["id"] phid = info["phid"] dateModified = int(info["fields"]["dateModified"]) # dateCreated = int(info["fields"]["dateCreated"]) realName = info["fields"]["realName"] phabUser = cache.get(id) phabUser.update(phid, realName) if most_recent_info is None: most_recent_info = dateModified elif most_recent_info < dateModified: most_recent_info = dateModified if oldest_info is None: oldest_info = dateModified elif oldest_info > dateModified: oldest_info = dateModified return most_recent_info, oldest_info PHABCACHESINFO = ( ( reviews_cache, ("differential", "revision", "search"), "updated", record_reviews, 5, 7, ), (users_cache, ("user", "search"), "newest", record_users, 100, 1000), ) def load_cache(): for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO: cache.populate_cache_from_disk() print( "Loaded {0} nr entries: {1}".format( cache.get_name(), len(cache.get_ids_in_cache()) ) ) print( "Loaded {0} has most recent info: {1}".format( cache.get_name(), datetime.fromtimestamp(cache.most_recent_info) if cache.most_recent_info is not None else None, ) ) def update_cache(phab): load_cache() for ( cache, phab_query, order, record_results, max_nr_entries_per_fetch, max_nr_days_to_cache, ) in PHABCACHESINFO: update_cached_info( phab, cache, phab_query, order, record_results, max_nr_entries_per_fetch, max_nr_days_to_cache, ) ids_in_cache = cache.get_ids_in_cache() print("{0} objects in {1}".format(len(ids_in_cache), cache.get_name())) cache.write_cache_to_disk() def get_most_recent_reviews(days): newest_reviews = sorted(reviews_cache.get_objects(), key=lambda r: -r.dateModified) if len(newest_reviews) == 0: return newest_reviews most_recent_review_time = datetime.fromtimestamp(newest_reviews[0].dateModified) cut_off_date = 
    result = []
    for review in newest_reviews:
        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
            return result
        result.append(review)
    return result


# All of the above code is about fetching data from Phabricator and caching it
# on local disk. The below code contains the actual "business logic" for this
# script.

_userphid2realname = None


def get_real_name_from_author(user_phid):
    global _userphid2realname
    if _userphid2realname is None:
        _userphid2realname = {}
        for user in users_cache.get_objects():
            _userphid2realname[user.phid] = user.realName
    return _userphid2realname.get(user_phid, "unknown")


def print_most_recent_reviews(phab, days, filter_reviewers):
    msgs = []

    def add_msg(msg):
        msgs.append(msg)
        print(msg)

    newest_reviews = get_most_recent_reviews(days)
    add_msg(
        "These are the reviews that look interesting to review. "
        + "The report below has 2 sections. The first "
        + "section is organized per review; the second section is organized "
        + "per potential reviewer.\n"
    )
    oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
    oldest_datetime = (
        datetime.fromtimestamp(oldest_review.dateModified) if oldest_review else None
    )
    add_msg(
        (
            "The report below is based on analyzing the reviews that got "
            + "touched in the past {0} days (since {1}). "
            + "The script found {2} such reviews.\n"
        ).format(days, oldest_datetime, len(newest_reviews))
    )
    reviewer2reviews_and_scores = {}
    for i, review in enumerate(newest_reviews):
        matched_reviewers = find_reviewers_for_review(review)
        matched_reviewers = filter_reviewers(matched_reviewers)
        if len(matched_reviewers) == 0:
            continue
        add_msg(
            (
                "{0:>3}. https://reviews.llvm.org/D{1} by {2}\n     {3}\n"
                + "     Last updated on {4}"
            ).format(
                i,
                review.id,
                get_real_name_from_author(review.author),
                review.title,
                datetime.fromtimestamp(review.dateModified),
            )
        )
        for reviewer, scores in matched_reviewers:
            add_msg(
                "    potential reviewer {0}, score {1}".format(
                    reviewer,
                    "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")",
                )
            )
            if reviewer not in reviewer2reviews_and_scores:
                reviewer2reviews_and_scores[reviewer] = []
            reviewer2reviews_and_scores[reviewer].append((review, scores))

    # Print out a summary per reviewer.
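    # Within each reviewer's summary, reviews are sorted on the score tuple:
    # primarily on the lines-touched percentage, secondarily on the
    # files-touched percentage (tuples compare lexicographically).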
    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
        add_msg(
            "\n\nSUMMARY FOR {0} (found {1} reviews):".format(
                reviewer, len(reviews_and_scores)
            )
        )
        for review, scores in reviews_and_scores:
            add_msg(
                "[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                    "/".join(["{0:.1f}%".format(s) for s in scores]),
                    review.id,
                    review.title,
                    get_real_name_from_author(review.author),
                )
            )
    return "\n".join(msgs)


def get_git_cmd_output(cmd):
    output = None
    try:
        logging.debug(cmd)
        output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logging.debug(str(e))
    if output is None:
        return None
    return output.decode("utf-8", errors="ignore")


reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")


def parse_blame_output_line_porcelain(blame_output_lines):
    email2nr_occurences = {}
    if blame_output_lines is None:
        return email2nr_occurences
    for line in blame_output_lines:
        m = reAuthorMail.match(line)
        if m:
            author_email_address = m.group(1)
            if author_email_address not in email2nr_occurences:
                email2nr_occurences[author_email_address] = 1
            else:
                email2nr_occurences[author_email_address] += 1
    return email2nr_occurences


class BlameOutputCache:
    def __init__(self):
        self.cache = {}

    def _populate_cache_for(self, cache_key):
        assert cache_key not in self.cache
        git_repo, base_revision, path = cache_key
        cmd = (
            "git -C {0} blame --encoding=utf-8 --date iso -f -e -w "
            + "--line-porcelain {1} -- {2}"
        ).format(git_repo, base_revision, path)
        blame_output = get_git_cmd_output(cmd)
        self.cache[cache_key] = (
            blame_output.split("\n") if blame_output is not None else None
        )

    # FIXME: the blame cache could probably be made more effective still if,
    # instead of storing the requested base_revision in the cache, the last
    # revision before the base revision in which this file/path got changed
    # were stored. That way multiple project revisions for which this
    # specific file/path hasn't changed would get cache hits (instead of
    # misses in the current implementation).
    def get_blame_output_for(
        self, git_repo, base_revision, path, start_line=-1, end_line=-1
    ):
        cache_key = (git_repo, base_revision, path)
        if cache_key not in self.cache:
            self._populate_cache_for(cache_key)
        assert cache_key in self.cache
        all_blame_lines = self.cache[cache_key]
        if all_blame_lines is None:
            return None
        if start_line == -1 and end_line == -1:
            return all_blame_lines
        assert start_line >= 0
        assert end_line >= 0
        assert end_line <= len(all_blame_lines)
        assert start_line <= len(all_blame_lines)
        assert start_line <= end_line
        return all_blame_lines[start_line:end_line]

    def get_parsed_git_blame_for(
        self, git_repo, base_revision, path, start_line=-1, end_line=-1
    ):
        return parse_blame_output_line_porcelain(
            self.get_blame_output_for(
                git_repo, base_revision, path, start_line, end_line
            )
        )


blameOutputCache = BlameOutputCache()


def find_reviewers_for_diff_heuristic(diff):
    # Heuristic 1: assume good reviewers are the ones that touched the same
    # lines before as this patch is touching.
    # Heuristic 2: assume good reviewers are the ones that touched the same
    # files before as this patch is touching.
    reviewers2nr_lines_touched = {}
    reviewers2nr_files_touched = {}
    # Assume the last revision before the diff was modified is the revision
    # the diff applies to.
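    # This is an approximation: git blame is run against the newest commit on
    # main whose timestamp precedes the diff's dateModified, since the exact
    # base commit of the diff is not tracked in the cached data.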
    assert len(GIT_REPO_METADATA) == 1
    git_repo = os.path.join("git_repos", GIT_REPO_METADATA[0][0])
    cmd = 'git -C {0} rev-list -n 1 --before="{1}" main'.format(
        git_repo,
        datetime.fromtimestamp(diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"),
    )
    base_revision = get_git_cmd_output(cmd).strip()
    logging.debug("Base revision={0}".format(base_revision))
    for change in diff.changes:
        path = change.oldPath
        # Compute heuristic 1: look at context of patch lines.
        for hunk in change.hunks:
            for start_line, end_line in hunk.actual_lines_changed_offset:
                # Collect git blame results for authors in those ranges.
                for (
                    reviewer,
                    nr_occurences,
                ) in blameOutputCache.get_parsed_git_blame_for(
                    git_repo, base_revision, path, start_line, end_line
                ).items():
                    if reviewer not in reviewers2nr_lines_touched:
                        reviewers2nr_lines_touched[reviewer] = 0
                    reviewers2nr_lines_touched[reviewer] += nr_occurences
        # Compute heuristic 2: don't look at the context, just at the files
        # touched. Collect git blame results for authors of the whole file.
        for reviewer, nr_occurences in blameOutputCache.get_parsed_git_blame_for(
            git_repo, base_revision, path
        ).items():
            if reviewer not in reviewers2nr_files_touched:
                reviewers2nr_files_touched[reviewer] = 0
            reviewers2nr_files_touched[reviewer] += 1

    # Compute "match scores": the percentage of touched lines and the
    # percentage of touched files that each potential reviewer authored.
    total_nr_lines = sum(reviewers2nr_lines_touched.values())
    total_nr_files = len(diff.changes)
    reviewers_matchscores = [
        (
            reviewer,
            (
                reviewers2nr_lines_touched.get(reviewer, 0) * 100.0 / total_nr_lines
                if total_nr_lines != 0
                else 0,
                reviewers2nr_files_touched[reviewer] * 100.0 / total_nr_files
                if total_nr_files != 0
                else 0,
            ),
        )
        for reviewer, nr_files in reviewers2nr_files_touched.items()
    ]
    reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
    return reviewers_matchscores


def find_reviewers_for_review(review):
    # Process the newest diff first.
    diffs = sorted(review.phabDiffs, key=lambda d: d.dateModified, reverse=True)
    if len(diffs) == 0:
        # Return an empty list (rather than None) so that callers can filter
        # and iterate over the result unconditionally.
        return []
    diff = diffs[0]
    matched_reviewers = find_reviewers_for_diff_heuristic(diff)
    # Show progress, as this is a slow operation:
    sys.stdout.write(".")
    sys.stdout.flush()
    logging.debug("matched_reviewers: {0}".format(matched_reviewers))
    return matched_reviewers


def update_git_repos():
    git_repos_directory = "git_repos"
    for name, url in GIT_REPO_METADATA:
        dirname = os.path.join(git_repos_directory, name)
        if not os.path.exists(dirname):
            cmd = "git clone {0} {1}".format(url, dirname)
            output = get_git_cmd_output(cmd)
        cmd = "git -C {0} pull --rebase".format(dirname)
        output = get_git_cmd_output(cmd)


def send_emails(email_addresses, sender, msg):
    s = smtplib.SMTP()
    s.connect()
    for email_address in email_addresses:
        email_msg = email.mime.multipart.MIMEMultipart()
        email_msg["From"] = sender
        email_msg["To"] = email_address
        email_msg["Subject"] = "LLVM patches you may be able to review."
        email_msg.attach(email.mime.text.MIMEText(msg, "plain", "utf-8"))
        s.send_message(email_msg)
    s.quit()


def filter_reviewers_to_report_for(people_to_look_for):
    # The below is just an example filter, to only report potential reviews
    # to do for the people that will receive the report email.
    return lambda potential_reviewers: [
        r for r in potential_reviewers if r[0] in people_to_look_for
    ]
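# For illustration (hypothetical data): with
# people_to_look_for = ["alice@example.com"], the returned filter reduces
# [("alice@example.com", (60.0, 40.0)), ("bob@example.com", (10.0, 5.0))]
# to [("alice@example.com", (60.0, 40.0))].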
def main():
    parser = argparse.ArgumentParser(
        description="Match open reviews to potential reviewers."
    )
    parser.add_argument(
        "--no-update-cache",
        dest="update_cache",
        action="store_false",
        default=True,
        help="Do not update cached Phabricator objects",
    )
    parser.add_argument(
        "--email-report",
        dest="email_report",
        nargs="*",
        default=[],
        help="The email addresses to send the report to.",
    )
    parser.add_argument(
        "--sender",
        dest="sender",
        default="",
        help="The email address to use in 'From' on messages emailed out.",
    )
    parser.add_argument(
        "--email-addresses",
        dest="email_addresses",
        nargs="*",
        default=[],
        help="The email addresses (as known by LLVM git) of "
        + "the people to look for reviews for.",
    )
    parser.add_argument("--verbose", "-v", action="count", default=0)
    args = parser.parse_args()
    if args.verbose >= 1:
        logging.basicConfig(level=logging.DEBUG)
    people_to_look_for = args.email_addresses
    logging.debug(
        "Will look for reviews that the following contributors could "
        + "review: {}".format(people_to_look_for)
    )
    logging.debug("Will email a report to: {}".format(args.email_report))
    phab = init_phab_connection()
    if args.update_cache:
        update_cache(phab)
    load_cache()
    update_git_repos()
    msg = print_most_recent_reviews(
        phab,
        days=1,
        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for),
    )
    if args.email_report:
        send_emails(args.email_report, args.sender, msg)


if __name__ == "__main__":
    main()
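# Example invocation (a sketch: the addresses and the script file name are
# placeholders, and the `phabricator` package is assumed to pick up its
# credentials from ~/.arcrc):
# $ python find_interesting_reviews.py \
#       --email-addresses alice@example.com bob@example.com \
#       --email-report alice@example.com \
#       --sender review-bot@example.com -v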