# Python interface to RCS.  Builds some data structures on top of
# rcsparse to simplify coding up interfaces to large bodies of RCS
# files.
# 
# Nick Barnes, Ravenbrook Limited, 2015-01-13.
#
# $Id: //info.ravenbrook.com/project/mlworks/import/2013-04-25/xanalys-tarball/rcs.py#3 $

import os
import bisect
import datetime
import functools
from operator import attrgetter

import rcsparse


class ID(object):
    """An RCS identifier, either a branch or a revision.

    We intern all identifiers, so that (say) RCS_ID('1.3.1') is a
    unique object.
    """

    @functools.total_ordering
    class _ID(object):
        """An interned RCS identifier, either a branch or a revision."""

        def __init__(self, id):

            if not isinstance(id,tuple) or any(not isinstance(m,int) for m in id):
                raise TypeError("ID must be a tuple of integers.")
            # A tuple of integers representing the RCS ID.
            self.id = id

            # The RCS ID as a string.
            self.name = '.'.join(str(i) for i in id)

            # The parent of a revision is the branch it's on.
            # The parent of a branch is the revision it's from.
            # The parent of a trunk branch is itself.
            if len(id) > 1:
                self.parent = ID(id[:-1])
            else:
                self.parent = self

            # A boolean attribute: is this ID on the trunk?
            # The GNU RCS 5.9.4 manual says
            # "A revision number with one dot ... is said to be on the trunk".
            self.trunk = len(id) < 3

        def __repr__(self):
            return "%s(%s)" % (type(self).__name__, self.name)

        # total_ordering uses this < operator, and identity.
        def __lt__(self, other):
            return self.id < other.id

    class BranchID(_ID):
        """An RCS Branch ID."""

        def __init__(self, id):
            super(BranchID, self).__init__(id)
            # The branch_point of branch A.B.C.D.E is revision A.B.C.D.
            self.branch_point = self.parent

        def first_rev(self,i):
            """Return the ID of the first revision on this branch."""
            return ID(self._id+(1,))

    class RevisionID(_ID):
        """An RCS Revision ID."""

        def __init__(self, id):
            super(RevisionID, self).__init__(id)
            # A boolean 
            self.is_branch_head = (self.id[-1] == 1)
            # The branch ID of the branch on which this revision exists.
            self.branch = self.parent

    # Dictionary of integer tuples to IDs, used for interning.
    _known={}

    def __new__(cls, id):
        """Create a new RCS ID, or return an identical existing one."""
        if isinstance(id, str):
            id = tuple(int(i) for i in id.split('.'))
        # Uniquify: we should have only one object for each ID.
        if id not in ID._known:
            C = ID.BranchID if len(id) % 2 == 1 else ID.RevisionID
            ID._known[id] = C(id)
        return ID._known[id]

# Promote these two classes to the top-level namespace.
    
RevisionID = ID.RevisionID
BranchID = ID.BranchID

class FileRevision(object):
    """A single revision of an RCS file."""
    
    def __init__(self, id, rcsfile, rcsparse, names):
        # rcsparse gives us tuples:
        #   (revision, UTC(?) timestamp, author, state, branches, next, commitID)
        _, timestamp, author, state, bs, n, c = rcsparse.revs[id.name]

        # A RevisionID
        self.id = id

        # a boolean: is this revision on the trunk?
        self.trunk = id.trunk

        # a boolean: is this revision the first on a branch?
        self.is_branch_head = id.is_branch_head

        # the File object
        self.rcsfile = rcsfile

        # the timestamp, a Unix timestamp, probably in UTC
        self.timestamp = timestamp

        # the author, a string identifying the user (originally a Unix user name)
        self.author = author

        # the RCS "state", a string.
        self.state = state

        # When we init, the File object doesn't yet have complete
        # rev[] and branch[] dicts, so we postpone lookups to
        # methods (self.branches() and self,next()).
        self._branches = [ID(b) for b in bs]
        self._next = ID(n) if n is not None else None

        # The commit ID.  No idea what this is; usually None in the files I see.
        self.commitID = c
        # The log string.
        self.log = rcsparse.getlog(id.name)

        # A list of RCS symbolic names for this revision.
        self.names = names

        # See self.contents().  Defer extracting the file contents until we are
        # asked, then cache them.
        self._contents = None

        # key for combining revisions of different files.
        # FileRevisions from different files, with matching keys and
        # similar timestamps, can be combined into a RevisionSet (see below).
        # 
        # We define this as a property so that we can usefully
        # sub-class for RCS-based SCM systems which version metadata,
        # for which we might get more information about file revisions
        # which we can use to group them.
        self.key = (self.author, self.log, self.is_branch_head)

    def contents(self):
        """Return the contents of the file revision."""
        if self._contents is None:
            self._contents = self.rcsfile._rcsparse().checkout(self.id.name)
        return self._contents

    def branches(self):
        """Return the set of branches of the file revision."""
        return (self.rcsfile.branch[b] for b in self._branches)

    def branch(self):
        """Return the FileBranch of which this is a revision."""
        return self.rcsfile.branch[self.id.branch]
    
    def next(self):
        """Return the revision appearing next in this RCS file."""
        return self.rcsfile.rev[self._next] if self._next is not None else None

    def __repr__(self):
        return "%s(%s %s)" % (type(self).__name__, self.rcsfile.filename, self.id.name)
        

class FileBranch(object):
    """A single branch of an RCS file."""

    def __init__(self, id, rcsfile, revs, names):

        # A BranchID.
        self.id = id

        # The File of which this is a branch.
        self.rcsfile = rcsfile

        # A list of revisions on this branch, sorted into order.
        self.revs = sorted(revs, key=attrgetter('id'))

        # The timestamps of the revisions.
        self.timestamps = [r.timestamp for r in self.revs]

        # The timestamp of the first revision on the branch.
        self.at = self.timestamps[0]

        # A boolean: is this branch part of the trunk of the file?
        self.trunk = id.trunk

        # For a non-trunk branch: the FileRevision from which we branched.
        self.branch_point = self.rcsfile.rev[self.id.branch_point] if not self.trunk else None

        # check that RCS timestamps are in order.
        # (possible for this to fail if RCS files created on a network
        # filesystem; it has bad consequences for our 'slide' algorithm).
        assert(self.timestamps == sorted(self.timestamps))
        assert(self.trunk or self.at >= self.branch_point.timestamp)

        # A list of RCS symbolic names for this branch.
        self.names = names

    def __repr__(self):
        return "%s(%s %s)" % (type(self).__name__, self.rcsfile.filename, self.id.name)
        
class File(object):
    """An RCS file."""

    def __init__(self, filename, revisionClass=None, branchClass=None):

        """Create an RCS File object.

        `revisionClass` and `branchClass` are optional keyword
        arguments which allow you to subclass FileRevision and
        FileBranch.
        """
        assert(filename.endswith(',v'))

        # revision class and branch class to use for branch
        self._revisionClass = revisionClass if revisionClass else FileRevision
        self._branchClass = branchClass if branchClass else FileBranch

        # The name of the RCS file (for example, "some/repo/path/RCS/foo.v").
        self.rcs_filename = filename

        # The working filename of the RCS file (for example, "foo")
        self.filename = os.path.basename(filename)[:-2]

        # The rcsparse.rcsfile object.  The File object doesn't hang
        # onto this because there's no way to close it except for the
        # automatic destructor, and if you are processing a large repo
        # you will run out of inodes.
        f = self._rcsparse()

        # map of symbols to branch/revision ID.
        self.symbols = {s:ID(r) for s,r in f.symbols.items()}

        # Invert the symbol table so we can find the symbols for a given
        # branch or revision.
        _invert_symbols = {}
        for s,r in self.symbols.items():
            _invert_symbols.setdefault(r,[]).append(s)
        
        # set of all revision IDs.
        self.rev_ids = set(ID(r) for r in f.revs.keys())

        # We don't like empty RCS files.
        assert(self.rev_ids)
        
        # map from revision ID to FileRevision.
        self.rev = {r: self._revisionClass(r, self, f,
                                           _invert_symbols.get(r, set()))
                    for r in self.rev_ids}

        # head FileRevision.
        self.head = self.rev[ID(f.head)]

        # set of all branch IDs.
        self.branch_ids = set(r.parent for r in self.rev_ids)

        # We don't like empty RCS files.
        assert(self.branch_ids)

        # map from branch ID to FileBranch.
        self.branch = {b: self._branchClass(b, self,
                                            (self.rev[r] for r in self.rev_ids if r.parent == b),
                                            _invert_symbols.get(b,[]))
                       for b in self.branch_ids}

        # branch of most recent check-in.
        self.current_branch = self.branch[ID(f.branch)] if f.branch is not None else None

        # sorted list of FileBranch objects on the trunk (usually
        # these will have IDs 1, 2, ...)
        self.trunk = sorted((self.branch[b] for b in self.branch_ids if b.trunk),
                            key = attrgetter('id'))

        # We don't like empty RCS files.
        assert(self.trunk)

        # map from symbol to revision
        self.rev_named = {s:self.rev[r]
                          for s,r in self.symbols.items()
                          if isinstance(r, RevisionID)}

        # map from symbol to branch. We have seen branch symbols which
        # are not in the branch dictionary (don't know why): we drop these.
        self.branch_named = {s:self.branch[b]
                             for s,b in self.symbols.items()
                             if isinstance(b, BranchID) and b in self.branch}

        # For completeness, a few other things that rcsparse gives us.
        # (although these may not be useful).

        # Dictionary from symbol to either revision or branch tip revision
        self.symrevs = {s:self.rev[ID(f.sym2rev(s))] for s in self.symbols}

        self.locks = dict(f.locks)
        self.strict = f.strict # strict locking
        self.expand = f.expand # expand keywords?
        self.comment = f.comment # string preceding keyword expansions.
        self.access = f.access # access control policy
        self.desc = f.desc # Description

    def _rcsparse(self):
        """Return the rcsparse.rcsfile object used for this File."""
        return rcsparse.rcsfile(self.rcs_filename)

    def revision(self, rev=None):
        """Return the FileRevision with the given name or ID, or the trunk tip.

        `rev` can be a symbolic name, or a revision ID (either as a
        string or a RevisionID), or None or not given (meaning the
        trunk tip revision).
        """

        if rev is None:
            return self.trunk[-1].revs[-1]
        elif isinstance(rev, str):
            if rev in self.rev_named:
                return self.rev_named[rev]
            else:
                return self.rev[ID(rev)]
        else:
            assert(isinstance(rev, RevisionID))
            return self.rev.get(rev, None)

    def revision_at(self, branch=None, at=None, or_earliest=False):
        """Return the FileRevision on `branch` at time `at`.

        `branch` can be a symbolic name, or a branch ID (either as a
        string or a BranchID), or a FileBranch, or None or not
        given (meaning the trunk).

        `at` can be None, meaning the latest revision.  If `at` is too
        early, return None, unless `or_earliest` is true in which case
        return the first revision on that branch.
        """

        if branch is None:
            # find branch on the trunk
            if (at is None or
                at >= self.trunk[-1].timestamps[-1]):
                # last trunk revision
                return self.trunk[-1].revs[-1]
            else:
                branch = (b for b in self.trunk if b.timestamps[-1] > at).next()
                # this must find b because of the previous condition on 'at'.
        if isinstance(branch, BranchID):
            branch = self.branch[branch]
        elif isinstance(branch, str):
            if branch in self.branch_named:
                branch = self.branch_named[branch]
            else:
                # RCS Branch ID such as "1.2.3"
                branch = self.branch[ID(branch)]
        assert(isinstance(branch, FileBranch))
        assert(branch.rcsfile == self)
        if at is None: # latest revision
            return branch.revs[-1]
        if branch.timestamps[0] > at:
            return branch.revs[0] if or_earliest else None
        # use bisect to find the index of the revision with
        # greatest timestamp less than at.
        i = bisect.bisect_right(branch.timestamps, at)
        return branch.revs[i-1]

    def __repr__(self):
        return "%s(%s)" % (type(self).__name__, self.filename)
        
def show_timestamp(t):
    """Convert a Unix timestamp to human-readable form."""
    return datetime.datetime.utcfromtimestamp(t).isoformat()

class RevisionSet(object):
    """A set of FileRevisions which share an author and a log, and have
    similar timestamps, and which are all either branch head revisions
    or not branch head revisions.
    """

    def __init__(self, rev):
        self.key = rev.key
        self.author = self.key[0]
        self.log = self.key[1]
        self.is_branch_head = self.key[2]
        self.revs = set((rev,))
        self.first = rev.timestamp
        self.last = self.first

    def add(self, rev):
        assert(self.key == rev.key)
        self.revs.add(rev)
        self.last=max(self.last, rev.timestamp)
        self.first=min(self.first, rev.timestamp)

    def describe(self, noisy, last_commit=None, indent=2, reason=None):
        lines = ["%*sset of %4d: +%8d: %s+%04d %sby %s: %s" %
                 (indent+11, '', len(self.revs),
                  self.first - last_commit if last_commit is not None else 0,
                  show_timestamp(self.first),
                  self.last - self.first,
                  '(%s) ' % reason if reason is not None else '',
                  self.author,
                  repr(self.log)[:20])]
        if noisy:
            for rev in sorted(self.revs, key=attrgetter('timestamp')):
                lines.append("%*s%s %s %s" %
                             (indent+2, '', show_timestamp(rev.timestamp),
                              rev.rcsfile, rev.id.name))
        return lines

class Namespace(object):
    """A work-around for Python 2's problems with outer-variable scope."""

    pass
                    
def slide(files, limit=None, slack=600, noisy=0):
    """Generate RevisionSets from a set of Files.

    `files` is an iterable of File objects.

    If set, `limit` is the maximum number of RevisionSets to generate.
    
    `slack` is the amount of time, in seconds, to allow between
    consecutive FileRevisions of a single RevisionSet.  The
    default is 600 (10 minutes), allowing for slow machines and large
    clock skew on a networked file system.

    `noisy` is a verbosity level.  0 produces no output, 1 reports progress
    and the produced revision sets, 2 lists all the FileRevisions in
    every revision set.
    """

    revisions={} # map timestamp to list of FileRevisions
    named_revs={} # map symbol to list of FileRevisions

    if noisy:
        print("Collecting all revisions.")
    for f in files:
        for rev in f.rev.values():
            revisions.setdefault(rev.timestamp,[]).append(rev)
        for sym,r in f.rev_named.items():
            named_revs.setdefault(sym, []).append(r)
    if noisy:
        print("%d timestamps, %d revisions" %
              (len(revisions), sum(len(rl) for rl in revisions.values())))
        print("%d named revisions" % (len(named_revs),))

    pending={} # the pending RevisionSets, indexed by rev.key
    pending_rcs={} # File -> RevisionSet
    forced_commits = 0
    ns = Namespace() # work-around for Python 2 outer variable scoping
    ns.commits = 0
    ns.last_commit = None
    if limit is None:
        limit=sum(len(l) for l in revisions.values())

    def commit(revset, reason=None):
        """Commit a revision set.

        Update our general data structures to reflect the fact that we
        are now done with this revision set.  Returns True if we have
        produced enough revision sets and should now stop.
        """

        if noisy:
            for line in revset.describe(noisy>1, last_commit=ns.last_commit, reason=reason):
                print(line)
        ns.last_commit = revset.first
        del pending[revset.key]
        for k in set(k for k,rs in pending_rcs.items() if rs == revset):
            del pending_rcs[k]
        ns.commits += 1
        return (ns.commits >= limit)
        
    if noisy:
        print("Starting from the beginning of time (%s)."
              % show_timestamp(min(sorted(revisions))))

    for t in sorted(revisions):
        # Commit any revsets which are old enough
        to_commit = sorted((rs for rs in pending.values()
                            if rs.last < t - slack),
                           key=attrgetter('first'))
        if to_commit and noisy>1:
            print("%s: Committing %d aged revision sets." %
                  (show_timestamp(t), len(to_commit)))
        for rs in to_commit:
            yield(rs)
            if commit(rs, "age"): # returns true if we're done
                return

        # Now iterate through all revs with this timestamp.
        # We sometimes see the base rev of a branch and the initial rev of that
        # branch with the same timestamp; sort by ID so the base rev comes first.
        for rev in sorted(revisions[t], key=attrgetter('id')):
            if noisy > 1:
                print("%s: %s %s" %
                      (show_timestamp(rev.timestamp),
                       rev.rcsfile, rev.id.name))

            # If there is a pending revset of the same file, we
            # should commit it.
            oldrevset = pending_rcs.get(rev.rcsfile, None)
            if oldrevset is not None:
                if noisy>1:
                    print("  forcing commit of revset %s" %
                          show_timestamp(oldrevset.first))
                forced_commits += 1
                yield(oldrevset)
                if commit(oldrevset, 'forced'):
                    return

            # Find or make a RevisionSet for this revision
            k = rev.key
            if k in pending:
                rs = pending[k]
                pending[k].add(rev)
            else:
                pending[k] = RevisionSet(rev)
            pending_rcs[rev.rcsfile] = pending[k]
    if noisy:
        print("Reached the end of time.")
    for rs in sorted(pending.values(),
                     key=attrgetter('first')):
        yield(rs)
        if commit(rs):
            return
    assert(len(pending) == 0)
    assert(len(pending_rcs) == 0)
    if noisy:
        print("%d Commits" % ns.commits)
        print("%d Forced commits" % forced_commits)