# Copyright (C) 2006 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""The functionality for converting a CVS source into bzr."""

from cStringIO import StringIO
import errno
import os
import re
import stat
import subprocess
import sys
import time

from bzrlib import (
    branch as _mod_branch,
    bzrdir,
    errors,
    generate_ids,
    inventory,
    lazy_regex,
    memorytree,
    osutils,
    revision as _mod_revision,
    trace,
    transport,
    ui,
    workingtree,
    )

import cvsps_errors
import lru_cache
from parser import Parser
# from cvsps.parser import Parser


class MapFile(object):
    """This maintains the mapping from CVS revisions to bzr revisions.

    The on-disk format is two header lines (``HEADER`` followed by
    ``cvs module: <module>``) and then one ``<patchset> <revision-id>``
    record per line, with the revision id stored as UTF-8.
    """

    HEADER = '# cvsps patchset => bzr revision id map file v1'

    def __init__(self, path, cvs_module):
        """Associate the map file with the given location.

        This will also read in the mapping if the file already exists.

        :param path: Location of the map file on disk.
        :param cvs_module: The CVS module the map belongs to. It is compared
            with the module recorded in an existing file.
        """
        self._path = path
        self._cvs_module = cvs_module
        self._map = {}
        # True once add() has changed the map since the last write().
        self._dirty = False
        self.read()

    def _read_header(self, fp):
        """Read the header information from the file.

        This will raise exceptions if any of the header info is not correct.

        :raises cvsps_errors.InvalidMapFile: If the first line is not HEADER
            or the second line does not name a cvs module.
        :raises cvsps_errors.InvalidMapModule: If the recorded module is not
            the one this object was created for.
        """
        first = fp.readline()
        if not first:
            return # Just an empty file

        first = first.rstrip()
        if first != self.HEADER:
            raise cvsps_errors.InvalidMapFile(self._path,
                'Invalid header. Expected %r, got %r'
                % (self.HEADER, first))

        second = fp.readline().rstrip()
        if not second.startswith('cvs module: '):
            raise cvsps_errors.InvalidMapFile(self._path,
                'The second line of the map file should be'
                ' the cvs module, not: %r'
                % (second,))

        module = second[len('cvs module: '):]
        if module != self._cvs_module:
            raise cvsps_errors.InvalidMapModule(self._path,
                    exp_module=self._cvs_module,
                    act_module=module)

    def _read_mapping(self, fp):
        """Read in the rest of the data mapping.

        This will fill self._map with the mapping. Blank lines are ignored.

        :param fp: An iterable of the remaining lines of the map file.
        :raises cvsps_errors.InvalidMapFile: If a line is not of the form
            '<patchset number> <utf-8 revision id>'.
        """
        for line in fp:
            line = line.rstrip()
            if not line:
                # A stray blank line carries no data; skip it instead of
                # failing to unpack it below.
                continue
            try:
                # The file is opened in binary mode, so split on a byte
                # string separator.
                cvs_patchset, bzr_revision_id = line.split(b' ')
                patch_num = int(cvs_patchset)
                revision_id = bzr_revision_id.decode('utf-8')
            except ValueError:
                # Wrong number of fields, a non-numeric patchset, or an
                # undecodable id (UnicodeDecodeError is a ValueError).
                raise cvsps_errors.InvalidMapFile(self._path,
                    'Malformed mapping line: %r' % (line,))
            self._map[patch_num] = revision_id

    def read(self):
        """Read in the data if the file exists."""
        try:
            fp = open(self._path, 'rb')
        except (IOError, OSError) as e:
            if e.errno == errno.ENOENT:
                return # No data to read
            raise
        try:
            self._read_header(fp)
            self._read_mapping(fp)
        finally:
            fp.close()

    def _write_mapping(self, fp):
        """Write out the actual mapping, sorted by patchset number."""
        for patch_num in sorted(self._map):
            utf_8_id = self._map[patch_num].encode('utf-8')
            fp.write('%d %s\n' % (patch_num, utf_8_id))

    def write(self):
        """Write out the mapping to the associated map file.

        The file is rewritten from scratch and the dirty flag is cleared.
        """
        fp = open(self._path, 'wb')
        try:
            fp.write(self.HEADER + '\n')
            fp.write('cvs module: %s\n' % self._cvs_module)
            self._write_mapping(fp)
        finally:
            fp.close()
        self._dirty = False

    def write_if_dirty(self):
        """Write out if the data has changed."""
        if self._dirty:
            self.write()

    def add(self, patch_num, bzr_revision_id):
        """Add a new mapping and mark the map as needing to be written."""
        self._map[patch_num] = bzr_revision_id
        self._dirty = True

    def get(self, patch_num):
        """Get the mapping from patch_num => revision_id.

        If the Patchset is not found, return None.
        """
        return self._map.get(patch_num, None)


class FileIdMapFile(MapFile):
    """Keeps track of cvs path => file_id mappings.

    This makes sure that separate branches get the same file id for the same
    path. Since that is how cvs works (it doesn't track renames).

    The structure is almost identical to the Patchset map file: the same two
    header lines, followed by one '<utf-8 path>\\0<file id>' record per line.
    """

    HEADER = '# cvs path => bzr file id map file v1'

    def _read_mapping(self, fp):
        """Read in the rest of the data mapping.

        This will fill self._map with the mapping, keyed by the decoded
        (unicode) cvs path. Blank lines are ignored.

        :param fp: An iterable of the remaining lines of the map file.
        :raises cvsps_errors.InvalidMapFile: If a line is not of the form
            '<utf-8 path>\\0<file id>'.
        """
        for line in fp:
            line = line.rstrip()
            if not line:
                # Nothing to parse on a blank line; skip it rather than
                # failing to unpack it below.
                continue
            try:
                # The file is opened in binary mode, so split on a byte
                # string separator.
                cvs_filename, bzr_file_id = line.split(b'\0')
                path = cvs_filename.decode('utf-8')
            except ValueError:
                # Wrong number of fields or an undecodable path
                # (UnicodeDecodeError is a ValueError).
                raise cvsps_errors.InvalidMapFile(self._path,
                    'Malformed mapping line: %r' % (line,))
            self._map[path] = bzr_file_id

    def _write_mapping(self, fp):
        """Write one NUL-separated 'path, file id' record per line.

        Records are written in sorted path order, with the path encoded as
        UTF-8.
        """
        for path in sorted(self._map):
            fp.write('%s\0%s\n' % (path.encode('utf-8'), self._map[path]))


class MinimalTree(object):
    """This class tracks an inventory, and keeps the content of modified files.

    Basically, it is designed to only save the minimal amount of tree
    information to be able to commit data to a repository.

    This interface is very minimal, only supporting things that CVS can do.
    """

    # Matches the runs of characters that _escape_commit_message() replaces
    # with their 'unicode_escape' form.
    _message_escape_re = lazy_regex.lazy_compile(
        u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+'
        )

    def __init__(self, base_inventory, file_id_generator=None,
                 revision_id=None):
        """Create a tree on top of the given inventory.

        :param base_inventory: The inventory of the basis revision.
        :param file_id_generator: A callable taking (path, basename) and
            returning the file id for a newly added path. Defaults to
            generate_ids.gen_file_id(basename).
        :param revision_id: The revision id that base_inventory belongs to.
        """
        self._base_inventory = base_inventory
        if file_id_generator is None:
            file_id_generator = self._fallback_generate_id
        self._file_id_generator = file_id_generator
        # _reset() grabs a copy of the base inventory, because the working
        # inventory is modified as changes are staged.
        self._reset(revision_id)

    @staticmethod
    def _fallback_generate_id(path, basename):
        """Default file id generator: ignores path, uses only basename."""
        return generate_ids.gen_file_id(basename)

    def _copy_base_inventory(self):
        """Create a copy of an inventory.

        It turns that bzr's Inventory.copy() is broken, so do it differently.
        The copy is stored in self._inventory.
        """
        entries = self._base_inventory.iter_entries_by_dir()
        try:
            root_path, root = next(entries)
        except StopIteration:
            # Our basis tree is the EmptyTree (null:), which means this is
            # actually an 'init' of a new tree.
            new_inv = inventory.Inventory()
        else:
            # The first entry is the root; re-create it by id and then add
            # copies of everything below it.
            new_inv = inventory.Inventory(root.file_id)
            for path, entry in entries:
                new_inv.add(entry.copy())
        self._inventory = new_inv

    def _reset(self, revision_id):
        """Reset self to be ready for new work.

        :param revision_id: The revision the tree is now based on.
        """
        self._copy_base_inventory()
        self._revision_id = revision_id

        # path => (action, kind); action is 'add', 'update' or 'remove'.
        self._modified_paths = {}
        self._removed_paths = set()
        # path => text / executable flag, as given to set_text().
        self._texts = {}
        self._executable = {}

    @staticmethod
    def create_from_branch(branch):
        """Create a MinimalTree based on the last revision of branch."""
        branch_rev = branch.last_revision()
        rev_tree = branch.repository.revision_tree(branch_rev)
        return MinimalTree(rev_tree.inventory,
                           revision_id=branch_rev)

    def set_text(self, path, txt, executable=None):
        """Set the text for a given path to the supplied value.

        If the path doesn't exist yet, this will show up as an add. Otherwise
        it will show up as a modified path.

        :param path: The '/' separated path of the file.
        :param txt: The new content of the file.
        :param executable: If not None, the executable flag to set on the
            inventory entry.
        """
        file_id = self._inventory.path2id(path)
        if file_id is None:
            parent = self._create_leading_dirs(path)
            basename = osutils.basename(path)
            file_id = self._file_id_generator(path, basename)
            entry = inventory.make_entry('file', basename,
                                         parent_id=parent.file_id,
                                         file_id=file_id)
            self._inventory.add(entry)
            self._modified_paths[path] = ('add', 'file')
        else:
            # This is just a modify step
            self._modified_paths[path] = ('update', 'file')
            entry = self._inventory[file_id]
        if executable is not None:
            entry.executable = executable
        self._texts[path] = txt
        self._executable[path] = executable

    def get_text(self, path):
        """Get the text for a given path.

        The text must have been set by set_text first.
        """
        return self._texts[path]

    # This is required by builder.record_entry_contents, because it uses
    # get_file(file_id).readlines() to determine the text. Honestly this should
    # be passing the path if it has it.
    def get_file(self, file_id, path=None):
        """Return a file-like object over the text set for the entry.

        :param file_id: The id of the file; only used when path is None.
        :param path: The path of the file, if already known.
        """
        if path is None:
            path = self._inventory.id2path(file_id)
        return StringIO(self._texts[path])

    # This is required by builder.record_entry_contents, because it now seems
    # to use get_file_with_stat instead of get_file as above, however, I've
    # left get_file intact for backwards compatibility and wrapped it below.
    def get_file_with_stat(self, file_id, path=None):
        """Return (file-like object, None); there is no stat value to give."""
        return (self.get_file(file_id, path), None)

    # InventoryEntry.snapshot() calls _read_tree_state, which requires these
    # functions to exist on the "working tree". Though honestly, we really
    # shouldn't need them to be implemented, because it generally doesn't need
    # the values given here.
    def get_file_sha1(self, file_id, path=None):
        """Return the sha1 of the text set for the entry."""
        if path is None:
            path = self._inventory.id2path(file_id)
        return osutils.sha_string(self._texts[path])

    def is_executable(self, file_id, path=None):
        """Return the executable value that was passed to set_text()."""
        if path is None:
            path = self._inventory.id2path(file_id)
        return self._executable[path]

    def last_revision(self):
        """Get the revision that this tree is based on."""
        return self._revision_id

    def _create_leading_dirs(self, path):
        """Create any directories we need to get to path.

        :return: The inventory entry of the directory that will contain path.
        """
        # This is an add, look for a parent that does exist
        cur = self._inventory.root
        sections = path.split('/')
        sections.pop() # Drop the basename, only directories are left

        count = 0
        for section in sections:
            if section not in cur.children:
                break
            cur = cur.children[section]
            count += 1

        # We need to add directory entries for everything leading up to
        # the current entry
        base = '/'.join(sections[:count])
        for section in sections[count:]:
            base = osutils.pathjoin(base, section)
            self._modified_paths[base] = ('add', 'directory')
            # TODO: Use a custom file id generator to handle the current
            # timestamp
            file_id = self._file_id_generator(base, section)
            entry = inventory.make_entry('directory', section,
                                         parent_id=cur.file_id,
                                         file_id=file_id)
            self._inventory.add(entry)
            cur = entry
        return cur

    def path_content_summary(self, path, file_id=None):
        """Return a (kind, size, executable, sha1) summary for path.

        Unmodified paths are described from the basis inventory. Modified
        paths are described from the data given to set_text(), with a sha1
        of None.
        """
        if path not in self._modified_paths:
            # Return the content summary from the basis inventory
            # kind, size, executable, sha
            if file_id is None:
                file_id = self._inventory.path2id(path)
            if file_id not in self._base_inventory:
                # Should only happen for the root
                ie = self._inventory[file_id]
            else:
                ie = self._base_inventory[file_id]
            if ie.kind == 'file':
                return (ie.kind, ie.text_size, ie.executable, ie.text_sha1)
            return (ie.kind, None, None, None)

        action, kind = self._modified_paths[path]
        assert action != 'remove'
        if kind == 'file':
            txt = self._texts[path]
            executable = self._executable[path]
            return (kind, len(txt), executable or False, None)
        assert kind == 'directory'
        return (kind, None, None, None)

    def remove_file(self, path):
        """Remove a given file from the inventory.

        :raises errors.NoSuchFile: If path is not in the inventory.
        """
        file_id = self._inventory.path2id(path)
        if file_id is None:
            raise errors.NoSuchFile(path)
        entry = self._inventory[file_id]
        assert entry.kind == 'file'
        self._inventory.remove_recursive_id(file_id)
        self._modified_paths[path] = ('remove', 'file')
        self._removed_paths.add(path)

    def resolve_removed(self):
        """After processing, remove any empty directories."""
        # Work through the removed paths from the end of the sorted list.
        # A parent directory that becomes empty is appended, so it is the
        # next path examined and can in turn empty its own parent.
        to_check = sorted(self._removed_paths)

        while to_check:
            removed_path = to_check.pop()
            parent_path = osutils.dirname(removed_path)
            if not parent_path or parent_path in self._removed_paths:
                # Either the parent is the root, or it was already removed.
                continue
            parent_id = self._inventory.path2id(parent_path)
            if parent_id is None:
                continue
            parent_ie = self._inventory[parent_id]
            # Are there any children left?
            if not parent_ie.children:
                self._inventory.remove_recursive_id(parent_id)
                self._modified_paths[parent_path] = ('remove', 'directory')
                self._removed_paths.add(parent_path)
                to_check.append(parent_path)

    def commit(self, branch, branch_name, message, timestamp, timezone,
               committer, revision_id):
        """Commit the current tree to the supplied branch.

        This commits the current inventory and file texts, so that they will be
        properly processed by the commit. This goes around bzrlib/commit.py
        because we don't need to do all of the integrity checking, serializing,
        etc that it does. Everything is all ready in memory, and just needs to
        be saved.

        :param branch: The branch to commit to. Its last revision must be the
            revision this tree is based on.
        :param branch_name: Recorded as the 'branch-nick' revision property.
        :param message: The commit message.
        :param timestamp: Passed through to the commit builder.
        :param timezone: Passed through to the commit builder.
        :param committer: Passed through to the commit builder.
        :param revision_id: Passed through to the commit builder.
        :return: The revision id returned by the commit builder.
        """
        self.resolve_removed()
        branch_revno, branch_rev = branch.last_revision_info()
        assert self._revision_id == branch_rev

        # CVS doesn't support recording merges, so the parent list is always
        # only 1 entry long
        if _mod_revision.is_null(self._revision_id):
            parents = []
            parent_invs = []
        else:
            parents = [self._revision_id]
            parent_invs = [self._base_inventory]

        # We explicitly use a config of None, because we want to make sure to
        # supply everything manually, and not have it guessed by the commit
        # builder.
        config = None

        revprops = {'branch-nick': branch_name}

        # We go directly to the branch.repository.get_commit_builder, because
        # Branch.get_commit_builder() always overrides the fact that 'config'
        # is None, by building its own Config object. Which turns out to take
        # quite a bit of time.
        builder = branch.repository.get_commit_builder(branch,
                    parents=parents,
                    config=config,
                    timestamp=timestamp, timezone=timezone,
                    committer=committer, revprops=revprops,
                    revision_id=revision_id)
        # XXX: Bazaar 0.92 broke api compatibility, the only way to detect it
        #       (other than inspecting version_info) is to count the number of
        #       arguments
        num_args = builder.record_entry_contents.func_code.co_argcount
        use_path_content_summary = (num_args == 6)
        try:
            for path, ie in self._inventory.iter_entries_by_dir():
                new_ie = ie.copy()

                if path in self._modified_paths:
                    # This entry needs to be marked as modified, so that the
                    # commit will record it.
                    new_ie.revision = None
                elif ie.parent_id is not None and ie.revision is None:
                    # If this one isn't in _modified_paths it should have the
                    # original last-modified revision. The only exception to
                    # this is the root entry. Fail loudly (the handler below
                    # aborts the builder) instead of dropping into an
                    # interactive debugger in the middle of a conversion.
                    raise AssertionError(
                        'Unmodified entry %r (file id %r) has no'
                        ' last-modified revision' % (path, ie.file_id))

                if use_path_content_summary:
                    builder.record_entry_contents(new_ie, parent_invs, path,
                        self, self.path_content_summary(path, new_ie.file_id))
                else:
                    builder.record_entry_contents(new_ie, parent_invs, path,
                                                  self)

            builder.finish_inventory()
            message = self._escape_commit_message(message)
            try:
                new_revision_id = builder.commit(message)
            except ValueError:
                # The message could not be committed; commit with a
                # placeholder and log the original so it is not silently lost.
                lost_message = 'Lost CVS commit message during bzr conversion'
                new_revision_id = builder.commit(lost_message)
                trace.warning('WARNING The following cvs commit message was lost during the bzr import:')
                trace.warning('==============================')
                trace.warning(message)
                trace.warning('==============================')
                trace.warning('Using "{0}" instead and continuing'.format(lost_message))
        except:
            # Deliberately catch everything: the builder must be aborted on
            # any failure, and the error is always re-raised.
            builder.abort()
            raise

        branch.set_last_revision_info(branch_revno+1, new_revision_id)

        # The revision just committed becomes the basis for further work.
        self._base_inventory = builder.revision_tree().inventory
        self._reset(new_revision_id)

        return new_revision_id

    def _escape_commit_message(self, message):
        """see bzrlib/commit.py Commit._escape_commit_message

        Runs of characters matched by _message_escape_re are replaced with
        their 'unicode_escape' encoding, and '\\r\\n' and '\\r' line endings
        are converted to '\\n'.
        """
        message = self._message_escape_re.subn(
                lambda match: match.group(0).encode('unicode_escape'),
                message)[0]
        return message.replace('\r\n', '\n').replace('\r', '\n')


class CVSUpdater(object):
    """Update the CVS tree as patchsets come in."""

    def __init__(self, cvs_root, cvs_module, use_cvs_for_text=True):
        """Remember where the CVS repository and module live.

        :param cvs_root: The CVS root; joined with cvs_module to find the
            ',v' files and passed to 'cvs -d'.
        :param cvs_module: The module inside cvs_root. '.' is treated as the
            empty module name.
        :param use_cvs_for_text: If True, get_text() runs 'cvs co -p',
            otherwise it runs the RCS 'co' command.
        """
        self._cvs_root = cvs_root
        if cvs_module == '.':
            cvs_module = ''
        self._cvs_module = cvs_module
        self._cvs_modpath = osutils.pathjoin(cvs_root, cvs_module) \
                            .encode(sys.getfilesystemencoding() or 'utf-8')
        self._use_cvs_for_text = use_cvs_for_text # Do we use 'cvs' or 'co'?

    def update_file(self, target_base, filename, revision):
        """Update the specific file to the given revision.

        If 'DEAD' is in the cvs revision id, then the file will be removed.

        By using 'co', and directly writing to the file, we are able to avoid
        creating CVS directories which would need to be ignored.  Also 'co'
        doesn't pause for 1s after updating a file, so it should be a lot
        faster. Also, when deleting files/dirs, we don't have to worry about
        leftover CVS dirs.

        :param target_base: The root directory we are working in.
        :param filename: The path to the file we are updating
        :param revision: The CVS revision id (like 1.1.1.1)
        :return: (created, removed) This lists the directories that were
            created and removed in order to add the file or cleanup after it
            was deleted.

            The last entry in created is the filename itself; in removed the
            filename comes first, followed by the directories that were
            cleaned up. The file may have only been updated rather than
            actually added.
        """
        if 'DEAD' in revision:
            return [], self._remove_file(target_base, filename)

        target_path, created = self._prepare_target(target_base, filename)
        rcs_file = self._get_rcs_filename(filename)

        executable = self._is_executable(rcs_file)
        txt = self._run_co(rcs_file, revision)
        with open(target_path, 'wb') as out_f:
            out_f.write(txt)

        if executable:
            # We might need to chmod non executable files but since cvs can't
            # change the executable bit and the default should be
            # non-executable, it shouldn't be a problem. And it means fewer
            # chmods.
            os.chmod(target_path, 0o770)

        created.append(filename)

        return created, []

    def _prepare_target(self, target_base, filename):
        """Prepare directories for handling the target file.

        :return: (target_path, created)
            target_path: The final path to the target filename
            created: Intermediate dirs that needed to be created, relative
                to target_base, outermost first
        """
        target_path = osutils.pathjoin(target_base, filename)
        target_dir_path = osutils.dirname(target_path)
        target_dir = osutils.dirname(filename)

        # Walk up from the file's directory, collecting every directory that
        # is missing (innermost first).
        created = []
        create_path = []
        while target_dir and not os.path.isdir(target_dir_path):
            created.append(target_dir)
            create_path.append(target_dir_path)
            target_dir = osutils.dirname(target_dir)
            target_dir_path = osutils.dirname(target_dir_path)

        # Directories have to be created from the outermost one inwards.
        created.reverse()
        create_path.reverse()
        for path in create_path:
            os.mkdir(path)

        return target_path, created

    def _remove_file(self, target_base, filename):
        """Remove the target file, and any directories which are now empty.

        A file that is already missing is not an error.

        :return: The list of files and directories that were removed.
                 This should return their relative paths, not absolute paths.
        """
        removed = []
        target_path = osutils.pathjoin(target_base, filename)
        try:
            os.unlink(target_path)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
        removed.append(filename)

        # XXX: Should we be catching an exception here
        target_dir = osutils.dirname(filename)
        target_dir_path = osutils.dirname(target_path)
        while target_dir:
            if (not os.path.isdir(target_dir_path)
                or os.listdir(target_dir_path)):
                # Stop at the first directory that is missing or still has
                # content.
                break
            os.rmdir(target_dir_path)
            removed.append(target_dir)
            target_dir = osutils.dirname(target_dir)
            target_dir_path = osutils.dirname(target_dir_path)
        return removed

    def _get_rcs_filename(self, filename):
        """Get the ',v' file associated with this filename.

        This just looks in the matching path, or in Attic if it isn't found.

        :return: cvs_path for cvs control file.
        :raises cvsps_errors.NoRCSFile: If neither location has the file.
        """
        rcs_file = osutils.pathjoin(self._cvs_modpath, filename + ',v')
        if os.path.isfile(rcs_file):
            return rcs_file

        # Look in the Attic
        cvs_dir, cvs_basename = os.path.split(rcs_file)
        attic_file = osutils.pathjoin(cvs_dir, 'Attic', cvs_basename)
        if not os.path.isfile(attic_file):
            raise cvsps_errors.NoRCSFile(filename, rcs_file, attic_file)
        return attic_file

    def _run_command(self, cmd, error_class, **missing_program_kwargs):
        """Run cmd with empty stdin and return what it wrote to stdout.

        :param cmd: The argument list to execute.
        :param error_class: Exception class raised as
            error_class(returncode, cmd, stderr) on a non-zero exit status.
        :param missing_program_kwargs: Extra keyword arguments for
            cvsps_errors.MissingProgram.
        :raises cvsps_errors.MissingProgram: If the executable is not found.
        """
        trace.mutter('running %s', cmd)

        try:
            p = subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                )
        except OSError as e:
            if e.errno == errno.ENOENT:
                raise cvsps_errors.MissingProgram(cmd, e,
                                                  **missing_program_kwargs)
            raise
        out, err = p.communicate(b'')
        if p.returncode != 0:
            raise error_class(p.returncode, cmd, err)
        return out

    def _run_co(self, rcs_file, revision):
        """Run 'co' the RCS checkout command.

        :return: The text 'co' printed for the given revision of rcs_file.
        :raises cvsps_errors.COError: If 'co' exits with a non-zero status.
        """
        cmd = ['co', '-M', '-p', '-r'+revision, rcs_file]
        return self._run_command(cmd, cvsps_errors.COError, program='rcs')

    def _run_cvs(self, filename, cvs_revision):
        """Checkout the text for a given filename.

        :return: The text 'cvs co -p' printed for the given revision.
        :raises cvsps_errors.CVSError: If 'cvs' exits with a non-zero status.
        """
        module_path = os.path.join(self._cvs_module, filename)
        cmd = ['cvs', '-d', self._cvs_root,
               'co', '-p', '-r'+cvs_revision, module_path,
              ]
        return self._run_command(cmd, cvsps_errors.CVSError)

    def get_text(self, filename, cvs_revision):
        """Get the text of the file at a given revision.

        :return: (text, executable), or (None, None) if 'DEAD' is in the cvs
            revision id.
        """
        if 'DEAD' in cvs_revision:
            return None, None

        rcs_file = self._get_rcs_filename(filename)
        executable = self._is_executable(rcs_file)
        if self._use_cvs_for_text:
            txt = self._run_cvs(filename, cvs_revision)
        else:
            txt = self._run_co(rcs_file, cvs_revision)
        return txt, executable

    @staticmethod
    def _is_executable(rcs_file):
        """Check if the cvs control file is executable.

        :return: True if any of the user, group or other execute bits is set.
        """
        st = os.stat(rcs_file)
        return bool(st.st_mode & (stat.S_IXGRP | stat.S_IXOTH | stat.S_IXUSR))


class CVSToBzr(object):
    """Maintain the connection between cvs output and bzr.

    This class maintains the state between the CVS tree and the bzr tree:
    which bzr branch is current, the set of revisions on it, the in-memory
    tree used for building commits, and a cache of open (write-locked)
    branches.
    """

    def __init__(self, bzr_repo, cvs_root, cvs_module, map_file,
                 verify=True, use_cvs_for_text=True,
                 file_id_map_file=None, only_branches=None,
                 cvs_fs_encoding=None, tag_style='tag'):
        """Set up the conversion state.

        :param bzr_repo: The bzr repository that every branch must share.
        :param cvs_root: The CVS repository root, handed to CVSUpdater.
        :param cvs_module: The CVS module name, handed to CVSUpdater.
        :param map_file: Mapping of patchset number => revision id; it is
            queried with get() and extended with add().
        :param verify: If False, patchsets already present in map_file are
            skipped without opening their branch.
        :param use_cvs_for_text: Passed through to CVSUpdater.
        :param file_id_map_file: Mapping of path => file id; queried with
            get() and extended with add() when file ids are generated.
        :param only_branches: Optional regular expression; when given, only
            'HEAD' and branches matching it are processed.
        :param cvs_fs_encoding: Encoding used to decode member paths in
            _extract_changes().
        :param tag_style: 'tag' records tags with branch.tags.set_tag();
            any other value records each tag as a branch under 'tags/'.
        """
        self._bzr_repo = bzr_repo
        self._map_file = map_file
        self._cvs_root = cvs_root
        self._cvs_module = cvs_module
        self._tag_style = tag_style
        self._file_id_map_file = file_id_map_file
        self._only_branches = only_branches
        if self._only_branches is not None:
            self._only_branches_regex = re.compile(only_branches)
        else:
            self._only_branches_regex = None
        self._cvs_fs_encoding = cvs_fs_encoding

        self._working_path = osutils.pathjoin(
            self._bzr_repo.bzrdir.root_transport.local_abspath('.'),
            'working')

        self._cvs_updater = CVSUpdater(cvs_root, cvs_module,
                                       use_cvs_for_text=use_cvs_for_text)

        self._verify = verify

        # Branches in this cache are write-locked; they are unlocked by the
        # cleanup callback registered in _cache_branch().
        self._open_branches = lru_cache.LRUCache(max_cache=100)

        self._cur_branch_name = None
        self._cur_history = None
        self._cur_tree = None
        self._cur_bzr_branch = None

        self._n_patches = 0
        self._n_existing_patches = 0
        self._n_tags = 0

    def handle_patchset(self, patchset, pb):
        """Handle one of the patchsets from cvs to bzr

        :param patchset: The patchset to process. Its num, branch and tag
            attributes are read here.
        :param pb: A progress bar, passed on to the tag handler.
        :return: A tuple ``(revision_id, action)``. action is 'skipped',
            'unmatched' (with 'ignored' in place of the revision id),
            'verified' or 'created', the last two optionally followed by
            '+tag'.
        """

        revision_id = self._map_file.get(patchset.num)
        if not self._verify:
            if revision_id is not None:
                return revision_id, 'skipped'

        if (self._only_branches_regex is not None
            and patchset.branch != 'HEAD'
            and not self._only_branches_regex.match(patchset.branch)):
            return 'ignored', 'unmatched'

        if self._cur_branch_name != patchset.branch:
            # We need to open a different branch
            self._cur_bzr_branch = self._open_or_create_branch(patchset)
            self._cur_history = set(self._cur_bzr_branch.revision_history())
            self._cur_branch_name = patchset.branch

        if revision_id is not None:
            # We have processed this patch in the past, it should exist on this
            # branch's mainline
            assert revision_id in self._cur_history
            action = 'verified'
            # print '=> existing %s %s' % (self._cur_branch_name, revision_id,)
            self._n_existing_patches += 1
        else:
            # We want to process a new patchset
            revision_id = self._extract_changes(patchset)
            self._map_file.add(patchset.num, revision_id)
            # print '=> created %s %s' % (self._cur_branch_name, revision_id,)
            action = 'created'
            self._n_patches += 1

        if patchset.tag is not None:
            if self._tag_style == 'tag':
                self._handle_tag_tag(patchset, revision_id, pb)
            else:
                self._handle_tag_branch(patchset, revision_id)

            action += '+tag'

        return revision_id, action

    def cleanup(self):
        """This should be called after all processing.

        It clears the cache of open branches, which is what triggers the
        unlock callback for each cached branch.

        :return: A tuple, in this order:
            ``(n_patches, n_existing_patches, n_tags, n_branches)``.
        """
        # TODO: Now that we use an LRU cache, n_branches is no longer accurate
        n_branches = len(self._open_branches)
        self._open_branches.clear()

        return self._n_patches, self._n_existing_patches, self._n_tags, n_branches

    def _generate_file_id(self, path, basename):
        """Return the file id for path, creating and recording one if needed.

        :param path: Key used to look the id up in the file id map.
        :param basename: Name handed to generate_ids.gen_file_id() when a new
            id has to be created.
        """
        file_id = self._file_id_map_file.get(path)
        if file_id is None:
            file_id = generate_ids.gen_file_id(basename)
            self._file_id_map_file.add(path, file_id)
        return file_id

    def _update_file_id_generator(self, patchset):
        """For each new patchset, we update the fileid generator.

        This way file ids have reasonable date portions, as well as giving a
        little bit more variety rather than just having a huge number of serial
        numbers.
        """
        # We have also customized the file-ids a little bit reducing the number
        # of random characters so that we can save a little bit of space in
        # inventories, etc. It is only 4 bytes, so it isn't a huge difference,
        # but it should help. (4bytes per id * 2ids per line * 55k lines ~ .5MB
        # per fulltext inventory)
        generate_ids._gen_file_id_serial = 0
        generate_ids._gen_file_id_suffix = '-%s-%s-' % (
                    osutils.compact_date(patchset.timestamp),
                    osutils.rand_chars(12)
                    )

    def _ensure_tree(self):
        """Ensure that the working tree is pointed at the right revision."""
        branch_rev = self._cur_bzr_branch.last_revision()

        # Only rebuild the in-memory tree when it is missing or based on a
        # revision other than the tip of the current branch.
        if (self._cur_tree is None
            or self._cur_tree.last_revision() != branch_rev):
            rev_tree = self._bzr_repo.revision_tree(branch_rev)
            self._cur_tree = MinimalTree(rev_tree.inventory,
                                    revision_id=branch_rev,
                                    file_id_generator=self._generate_file_id)

    def _get_branch_path(self, branch_name):
        """Get the path on disk where we would like to put the branch."""
        root = self._bzr_repo.bzrdir.root_transport.local_abspath('.')
        full_branch_name = 'branches/' + branch_name
        return osutils.pathjoin(root, full_branch_name)

    def _get_tag_branch_path(self, tag_name):
        """Get the path on disk for a given tag name."""
        root = self._bzr_repo.bzrdir.root_transport.local_abspath('.')
        full_tag_branch_name = 'tags/' + tag_name
        return osutils.pathjoin(root, full_tag_branch_name)

    def _assert_same_repo(self, repo_a, repo_b):
        """Assert that the repositories point to the same place"""
        assert repo_a.bzrdir.transport.base == repo_b.bzrdir.transport.base

    def _set_repo(self, a_branch):
        """Make sure the tree is sharing the repository."""
        # XXX: Ugly hack to make sure there is only one repository instance
        self._assert_same_repo(self._bzr_repo, a_branch.repository)
        a_branch.repository = self._bzr_repo

    def _open_branch(self, branch_path):
        """Open a specific branch, it must already exist.

        This is unfortunately a little invasive. But it has to be, because we
        want to make sure that we share the repository instance, rather than
        opening it for every branch that we process.

        :param branch_path: Location handed to Branch.open().
        :return: The branch, write-locked. The caller is responsible for
            unlocking it (directly, or by putting it in the branch cache).
        """
        a_branch = _mod_branch.Branch.open(branch_path)
        self._set_repo(a_branch)
        # A KeyboardInterrupt raised while taking the lock simply propagates;
        # no debugger is started from library code.
        a_branch.lock_write()
        return a_branch

    def _open_existing_branch(self, branch_name):
        """Return an open branch, or open a new one.

        This expects the branch to already exist, and it may or may not already
        be open.
        """
        if branch_name in self._open_branches:
            return self._open_branches[branch_name]
        else:
            branch_path = self._get_branch_path(branch_name)
            a_branch = self._open_branch(branch_path)
            self._cache_branch(branch_name, a_branch)
            return a_branch

    def _cleanup_a_branch(self, branch_name, a_branch):
        """Callback when branches are cleaned up."""
        trace.mutter('unlocking branch: %s', branch_name)
        a_branch.unlock()

    def _cache_branch(self, branch_name, a_branch):
        """Add this branch to the cache.

        Written as a helper because we want to include a callback for cleanup.
        """
        # if we want to allow this, we could check if a_branch is the same
        # object as the cached one, and if so, turn this into a no-op.
        assert branch_name not in self._open_branches, \
            'Branch %s already cached' % (branch_name,)
        self._open_branches.add(branch_name, a_branch,
                                cleanup=self._cleanup_a_branch)
        # This should occur after the caching, because if we push something out
        # of the cache, it might unlock the branch.
        assert a_branch.is_locked(), 'Branch %s was unlocked' % (branch_name,)

    def _sprout_new_branch(self, patchset):
        """We have a new branch being created.

        This is generally indicated by patchset having an ancestor_branch,
        which is the branch that we should branch from.

        If the ancestor branch does not exist, HEAD is tried next, then the
        current branch; when there is no current branch either, a brand new
        branch is created instead.

        :return: The new branch, write-locked and cached.
        """
        assert patchset.ancestor_branch is not None

        if patchset.ancestor_branch == self._cur_branch_name:
            source_branch = self._cur_bzr_branch
        else:
            try:
                source_branch = self._open_existing_branch(patchset.ancestor_branch)
            except errors.NotBranchError:
                trace.warning('%s claims an ancestor branch %s which does'
                              ' not exist yet. Falling back to HEAD',
                              patchset, patchset.ancestor_branch)
                try:
                    source_branch = self._open_existing_branch('HEAD')
                except errors.NotBranchError:
                    if self._cur_bzr_branch is None:
                        trace.warning('%s claims an ancestor branch but not even'
                                      ' HEAD exists. Creating a new branch.',
                                      patchset)
                        return self._create_branch(patchset)
                    else:
                        trace.warning('%s claims an ancestor branch but not even'
                                      ' HEAD exists. Sprouting from current',
                                      patchset)
                        source_branch = self._cur_bzr_branch

        target_path = self._get_branch_path(patchset.branch)

        # TODO: jam 20061121 We should catch an error if the target already
        #       exists. That would also indicate something weird going on.
        os.makedirs(target_path)

        # XXX: Cannot use source_branch.bzrdir.sprout() because it creates a new
        # Repository object (self.open_repository()), and then uses .fetch(),
        # which requires a write lock. Which conflicts with the fact that we
        # already have a write-lock open.
        # target_bzrdir = source_branch.bzrdir.sprout(target_path)
        # target_tree = target_bzrdir.open_workingtree()
        target_bzrdir_format = source_branch.bzrdir.cloning_metadir()
        target_bzrdir = target_bzrdir_format.initialize(target_path)

        # Make sure that a new branch will get the same repository, so we don't
        # have to worry about doing a fetch.
        result_repo = target_bzrdir.find_repository()
        self._assert_same_repo(self._bzr_repo, result_repo)

        # XXX: We can't use source_branch.branch.sprout(target_bzrdir)
        #   Because it also ends up creating a Repository object of its own,
        #   and then it tries to lock it during 'set_revision_history()'
        # source_branch.branch.sprout(target_bzrdir)
        target_branch = source_branch._format.initialize(target_bzrdir)
        # XXX: Ugly hack to make sure there is only one repository instance
        self._set_repo(target_branch)
        target_branch.lock_write()

        source_branch.copy_content_into(target_branch)
        target_branch.set_parent(source_branch.bzrdir.root_transport.base)

        self._cache_branch(patchset.branch, target_branch)
        return target_branch

    def _open_or_create_branch(self, patchset):
        """Open or create a branch referenced by the patchset."""
        try:
            return self._open_existing_branch(patchset.branch)
        except errors.NotBranchError:
            pass

        # No branch exists yet, create one
        if patchset.ancestor_branch is not None:
            a_branch = self._sprout_new_branch(patchset)
            return a_branch

        return self._create_branch(patchset)

    def _create_branch(self, patchset):
        """Create an empty branch for patchset.branch.

        :return: The new branch, write-locked and cached.
        """
        branch_path = self._get_branch_path(patchset.branch)
        # TODO: We need this in case a module is deeper than the top level, it
        #       seems like there should be a better way, though.
        os.makedirs(branch_path)

        # Create a new one
        format = bzrdir.BzrDirFormat.get_default_format()
        target_branch = bzrdir.BzrDir.create_branch_convenience(branch_path,
                                                         force_new_tree=False,
                                                         format=format)
        self._set_repo(target_branch)
        target_branch.lock_write()
        self._cache_branch(patchset.branch, target_branch)
        return target_branch

    def _extract_changes(self, patchset):
        """Extract the changes for a patchset and commit them.

        Every member of the patchset is either removed from, or has its text
        set in, the current in-memory tree, which is then committed to the
        current branch.

        :return: The revision id returned by the commit.
        """
        # TODO: jam 20061121 Handle patchset.tag. We have the tags, we just
        #       need to figure out what we want to do with them.
        #       We could create a versioned file, or we could create an
        #       external file that just tracks them, since they shouldn't be
        #       changing.
        self._ensure_tree()

        self._update_file_id_generator(patchset)

        for member, revision in patchset.members:
            txt, executable = self._cvs_updater.get_text(member, revision)
            member = member.decode(self._cvs_fs_encoding)
            if txt is None:
                # No text means the updater reported a dead revision.
                try:
                    self._cur_tree.remove_file(member)
                except errors.NoSuchFile:
                    trace.warning("%s requested to delete an unversioned file:"
                                  " %s. Ignoring.",
                                  patchset, member)
            else:
                self._cur_tree.set_text(member, txt, executable=executable)

        cvs_revision_id = 'cvs-1:' + generate_ids.gen_revision_id(
                            username=patchset.author,
                            timestamp=patchset.timestamp,
                            )
        revision_id = self._cur_tree.commit(
            branch=self._cur_bzr_branch,
            branch_name=patchset.branch,
            message=u''.join(patchset.log),
            timestamp=patchset.timestamp,
            timezone=patchset.time_offset,
            committer=patchset.author,
            revision_id=cvs_revision_id,
            )
        self._cur_history.add(revision_id)

        return revision_id

    def _handle_tag_tag(self, patchset, revision_id, pb):
        """Create a tag with the given revision id.

        The tag is stored on the current branch. pb is accepted but not used.
        """
        self._cur_bzr_branch.tags.set_tag(patchset.tag, revision_id)
        self._n_tags += 1

    def _handle_tag_branch(self, patchset, revision_id):
        """Record a tag as a branch under 'tags/' whose tip is revision_id.

        The tag branch is opened if it exists, otherwise it is created next to
        the current branch in the shared repository. Either way it is
        write-locked while it is updated and always unlocked afterwards.
        """
        tag_branch_path = self._get_tag_branch_path(patchset.tag)
        is_new_branch = False
        try:
            tag_branch = self._open_branch(tag_branch_path)
        except errors.NotBranchError:
            if not os.path.isdir(tag_branch_path):
                os.makedirs(tag_branch_path)
            tag_bzrdir_format = self._cur_bzr_branch.bzrdir.cloning_metadir()
            tag_bzrdir = tag_bzrdir_format.initialize(tag_branch_path)

            # Make sure that a new branch will get the same repository, so we
            # don't have to worry about doing a fetch.
            result_repo = tag_bzrdir.find_repository()
            self._assert_same_repo(self._bzr_repo, result_repo)

            # XXX: We can't use source_branch.branch.sprout(tag_bzrdir)
            #   Because it also ends up creating a Repository object of its own,
            #   and then it tries to lock it during 'set_revision_history()'
            # source_branch.branch.sprout(tag_bzrdir)
            tag_branch = self._cur_bzr_branch._format.initialize(tag_bzrdir)
            # XXX: Ugly hack to make sure there is only one repository instance
            self._set_repo(tag_branch)
            tag_branch.lock_write()
            is_new_branch = True
        # tag_branch is write-locked on both paths from here on, so all the
        # remaining work sits inside the try/finally that releases the lock.
        try:
            if is_new_branch:
                self._cur_bzr_branch.copy_content_into(tag_branch,
                                                       revision_id=revision_id)
            if tag_branch.last_revision() != revision_id:
                tag_branch.generate_revision_history(revision_id)
                if tag_branch.get_parent():
                    tag_branch.set_parent(None)
        finally:
            tag_branch.unlock()
        self._n_tags += 1


class CVSPSController(object):
    """This class handles controlling cvsps.

    That includes parsing the output.
    """

    def __init__(self, cvs_root, cvs_module):
        """Create a new controller to handle the given module."""
        # cvs_root has to be well defined, or cvsps gets confused
        self.cvs_root = osutils.normpath(osutils.abspath(cvs_root).rstrip('/'))
        self.cvs_module = cvs_module

    def create_cvsps_dump(self, dump_filename):
        """Ask cvsps to generate the patchset information."""
        module = self.cvs_module

        # either cvs or cvsps has a bug if you try to supply '.' as the module.
        # So instead, we use the repository root in that situation.
        if module == '.':
            module = self.cvs_root

        # by supplying --root and module cvsps shouldn't care what directory it
        # is being run in.
        cmd = ['cvsps', '--cvs-direct', '-A', '-u', '-q',
               '--root', self.cvs_root,
               module,
              ]

        cvsps_dump_file = open(dump_filename, 'wb')
        try:
            self._run_cvsps(cmd, out_file=cvsps_dump_file)
        finally:
            cvsps_dump_file.close()

    def _run_cvsps(self, cmd, out_file):
        """Run the cvsps command, and put the result in the specified file.

        :raise errors.CVSPSError: if the return code is not 0
        """
        trace.mutter('Running command: %s', cmd)
        env = os.environ.copy()
        env['TZ'] = 'UTC'
        try:
            p = subprocess.Popen(cmd, stdout=out_file, env=env)
        except OSError, e:
            if e.errno in (errno.ENOENT,):
                raise cvsps_errors.MissingProgram(cmd, e)
            raise

        returncode = p.wait()
        if returncode != 0:
            raise cvsps_errors.CVSPSError(returncode, cmd)


class Importer(object):
    """Import a CVS project into bzr."""

    def __init__(self, cvsroot, cvs_module, output_base, cvsps_dump=None,
                 encoding=None, verify=True, use_cvs_for_text=True,
                 only_branches=None, tag_style='tag'):
        """Record the import settings and derive the output layout.

        :param cvsroot: Path of the CVS repository root; stored absolute.
        :param cvs_module: Name of the CVS module to import.
        :param output_base: Directory that holds the 'bzr/<module>' repository
            (with its 'branches' and 'tags' subdirectories) and 'staging'.
        :param cvsps_dump: Optional path of an existing cvsps dump file. When
            it is not given a dump is generated in the staging directory.
        :param encoding: Encoding handed to the cvsps parser and to CVSToBzr;
            'iso-8859-1' is used when none is given.
        :param verify: Passed through to CVSToBzr.
        :param use_cvs_for_text: Passed through to CVSToBzr.
        :param only_branches: Passed through to CVSToBzr.
        :param tag_style: Passed through to CVSToBzr, and also consulted when
            opening an existing repository.
        """
        self._cvs_root = osutils.abspath(cvsroot)
        self._cvs_module = cvs_module
        self._use_cvs_for_text = use_cvs_for_text
        self._tag_style = tag_style
        self._verify = verify
        self._only_branches = only_branches

        self.output_base = output_base

        self._encoding = encoding
        if not self._encoding:
            self._encoding = 'iso-8859-1'

        self._paths_created = False

        self._repo_path = osutils.pathjoin(output_base, 'bzr',
                                           self._cvs_module)
        self._branches_path = osutils.pathjoin(self._repo_path, 'branches')
        self._tags_path = osutils.pathjoin(self._repo_path, 'tags')
        self._staging_path = osutils.pathjoin(output_base, 'staging')

        # This is a filename matching the module name, but safe for a specific
        # filename
        self._module_fname = self.sanitize_module(cvs_module)

        self._map_filename = osutils.pathjoin(self._staging_path,
                                              self._module_fname + '.map')
        self._fmap_filename = osutils.pathjoin(self._staging_path,
                                               self._module_fname + '.fmap')
        # An existing dump file
        self._cvsps_dump_filename = cvsps_dump

    @staticmethod
    def sanitize_module(module):
        """Change a module name to something safe to use as a filename.

        '.' becomes 'ROOT'; otherwise every '/' is replaced with '_' and
        leading '.' characters are stripped.
        """
        if module == '.':
            return 'ROOT'
        return module.replace('/', '_').lstrip('.')

    def setup_directories(self):
        """Make sure that the output directories are created.

        The work is only done once per Importer; later calls return at once.
        """
        if self._paths_created:
            return
        for path in (self.output_base, self._branches_path,
                     self._tags_path,
                     self._staging_path):
            if not os.path.isdir(path):
                os.makedirs(path)
        self._paths_created = True

    def open_or_create_bzr_repo(self):
        """Open the bzr repository, creating it if needed.

        :return: The repository found at, or created at, the repository path.
        """
        self.setup_directories()
        bzr_repo_transport = transport.get_transport(self._repo_path)
        is_branch = True
        try:
            a_bzrdir = bzrdir.BzrDir.open_from_transport(bzr_repo_transport)
        except errors.NotBranchError:
            is_branch = False
            return self._create_bzr_repo(bzr_repo_transport)
        # NOTE(review): is_branch is only set to False immediately before the
        # early return above, so 'not is_branch' is always False here and the
        # format upgrade below never runs as written. Left unchanged because
        # the intended condition is not clear from this code -- TODO confirm.
        if self._tag_style == 'tag' and not is_branch and (
                not a_bzrdir.find_branch_format().supports_tags()):
            newFormat = bzrdir.format_registry.get_default()
            converter = a_bzrdir._format.get_converter(newFormat)
            pb = ui.ui_factory.nested_progress_bar()
            try:
                a_bzrdir = converter.convert(a_bzrdir)
            finally:
                pb.finished()
        return a_bzrdir.open_repository()

    def _create_bzr_repo(self, a_transport):
        """Create the target bzr repository. It is assumed it doesn't exist.

        The repository is created shared, with working trees turned off.
        """
        try:
            a_transport.mkdir('.')
        except errors.FileExists:
            pass
        fmt = bzrdir.BzrDirFormat.get_default_format()
        control = fmt.initialize_on_transport(a_transport)
        repo = control.create_repository(shared=True)
        repo.set_make_working_trees(False)
        return repo

    def _parse_cvsps_dump(self, pb=None):
        """Parse the cvsps dump, generating the dump file first if needed.

        :param pb: Optional progress bar to update before parsing starts.
        :return: Whatever Parser.parse() returns for the dump file.
        """
        # The path to use if we are creating one
        if self._cvsps_dump_filename:
            cvsps_dump_file = open(self._cvsps_dump_filename, 'rb')
        else:
            cvsps_dump_path = osutils.pathjoin(self._staging_path,
                                               self._module_fname + '.dump')
            trace.note('Creating cvsps dump file: %s', cvsps_dump_path)
            # Create a dump file
            cvsps_controller = CVSPSController(cvs_root=self._cvs_root,
                                               cvs_module=self._cvs_module)
            cvsps_controller.create_cvsps_dump(cvsps_dump_path)
            cvsps_dump_file = open(cvsps_dump_path, 'rb')
        # Everything after the open() is guarded so that the dump file is
        # closed even if the progress update or the parse fails.
        try:
            if pb is not None:
                pb.update('Parsing cvsps dump file.')
            parser = Parser(cvsps_dump_file, encoding=self._encoding)
            patchsets = parser.parse()
        finally:
            cvsps_dump_file.close()
        return patchsets

    def _process_patchsets(self, cvs_to_bzr, patchsets, pb=None):
        """Feed every patchset to cvs_to_bzr, reporting progress on pb.

        Any exception is re-raised after the progress bar has been cleared
        and the offending patchset has been logged.
        """
        n_patchsets = len(patchsets)
        for i, patchset in enumerate(patchsets):
            try:
                rev_id, action = cvs_to_bzr.handle_patchset(patchset, pb)
            except KeyboardInterrupt:
                if pb is not None:
                    pb.clear()
                trace.warning('Stopped while processing: %s', patchset)
                raise
            except:
                if pb is not None:
                    pb.clear()
                trace.warning('Failed while processing: %s', patchset)
                raise
            if pb is not None:
                pb.update('%-15.15s %s %-35.35s'
                          % (cvs_to_bzr._cur_branch_name,
                             action, rev_id),
                          i, n_patchsets)

    def _cleanup_and_report(self, cvs_to_bzr, start_time):
        """We are stopping, so cleanup and report results.

        :param cvs_to_bzr: The CVSToBzr whose cleanup() supplies the counts.
        :param start_time: time.time() value the elapsed time is measured
            from.
        """
        # CVSToBzr.cleanup() returns the tag count before the branch count.
        (n_patches, n_existing_patches,
         n_tags, n_branches) = cvs_to_bzr.cleanup()

        total_time = time.time() - start_time
        # Avoid dividing by (nearly) zero for very short runs.
        if total_time < 0.1:
            total_time = 0.1

        patches_per_second = n_patches / total_time
        trace.note('Processed %s patches (%s new, %s existing)'
                   ' on %s branches'
                   ' (%s tags) in %.1fs (%.2f patch/s)',
                    n_patches + n_existing_patches,
                    n_patches, n_existing_patches, n_branches,
                    n_tags, total_time, patches_per_second)

    def process(self):
        """Start converting the repository.

        The repository is write-locked for the whole run. Both map files are
        written back (if dirty) and the repository is unlocked even when the
        conversion fails part way through.
        """
        pb = ui.ui_factory.nested_progress_bar()
        try:
            repo = self.open_or_create_bzr_repo()
        finally:
            pb.finished()
        # Maintain a repository wide lock for the whole transaction
        # that should help cache stuff.
        # TODO: jam 20061121 This may actually cache *too* much. Consider
        #       unlocking from time to time. Like perhaps every 100 revisions
        #       or maybe after N file changes.
        repo.lock_write()
        try:
            # TODO: jam 20061121 We should write new entries to the file as they
            #       are created. Rather than waiting till everything is done
            map_file = MapFile(self._map_filename, self._cvs_module)
            file_id_map_file = FileIdMapFile(self._fmap_filename,
                                            self._cvs_module)
            try:
                # tag_style is forwarded so that the converter honours the
                # style this importer was configured with.
                cvs_to_bzr = CVSToBzr(repo, self._cvs_root, self._cvs_module,
                                      map_file, verify=self._verify,
                                      use_cvs_for_text=self._use_cvs_for_text,
                                      file_id_map_file=file_id_map_file,
                                      only_branches=self._only_branches,
                                      cvs_fs_encoding=self._encoding,
                                      tag_style=self._tag_style)
                # Set before parsing so the report in the finally block has a
                # start time even if parsing fails.
                start_time = time.time()
                pb = ui.ui_factory.nested_progress_bar()
                try:
                    patchsets = self._parse_cvsps_dump(pb=pb)

                    # Restart the clock so only patchset processing is timed.
                    start_time = time.time()
                    self._process_patchsets(cvs_to_bzr, patchsets, pb=pb)
                finally:
                    pb.finished()
                    self._cleanup_and_report(cvs_to_bzr, start_time)
            finally:
                # TODO: Install a ^C handler here...
                map_file.write_if_dirty()
                file_id_map_file.write_if_dirty()
        finally:
            repo.unlock()
