From 89912789aac6d42dd9b716c7ffb7d961a41bd28f Mon Sep 17 00:00:00 2001
From: Michael Boniface <m.j.boniface@soton.ac.uk>
Date: Fri, 7 Mar 2025 09:04:27 +0000
Subject: [PATCH] (feature) Added forking existing remote and publishing to a
 new repo. Needed for the SAIL to CPRD scenario. Closes #47

---
 acmc/main.py       | 58 ++++++++++++++++++++++++----
 acmc/phen.py       | 96 ++++++++++++++++++++++++++++++++++++++++++----
 docs/usage.md      | 33 +++++++++++-----
 tests/test_acmc.py | 15 ++++++++
 4 files changed, 177 insertions(+), 25 deletions(-)

diff --git a/acmc/main.py b/acmc/main.py
index 64d8ed5..72ce394 100644
--- a/acmc/main.py
+++ b/acmc/main.py
@@ -36,6 +36,16 @@ def phen_init(args):
     phen.init(args.phen_dir, args.remote_url)
 
 
+def phen_fork(args):
+    """Handle the `phen fork` command."""
+    phen.fork(
+        args.phen_dir,
+        args.upstream_url,
+        args.upstream_version,
+        new_origin_url=args.remote_url,
+    )
+
+
 def phen_validate(args):
     """Handle the `phen validate` command."""
     phen.validate(args.phen_dir)
@@ -135,13 +145,45 @@ def main():
         "--phen-dir",
         type=str,
         default=str(phen.DEFAULT_PHEN_PATH.resolve()),
-        help="Phenotype workspace directory",
+        help="(Optional) Local phenotype workspace directory (default is ./workspace/phen).",
     )
     phen_init_parser.add_argument(
-        "-r", "--remote_url", help="URL to remote git repository"
+        "-r",
+        "--remote_url",
+        help="(Optional) URL to repository where the phenotype will be published.",
     )
     phen_init_parser.set_defaults(func=phen_init)
 
+    # phen fork
+    phen_fork_parser = phen_subparsers.add_parser(
+        "fork", help="Fork an existing phenotype"
+    )
+    phen_fork_parser.add_argument(
+        "-d",
+        "--phen-dir",
+        type=str,
+        default=str(phen.DEFAULT_PHEN_PATH.resolve()),
+        help="(Optional) Local phenotype workspace directory (default is ./workspace/phen).",
+    )
+    phen_fork_parser.add_argument(
+        "-r",
+        "--remote_url",
+        help="(Optional) URL to repository where the forked phenotype will be published.",
+    )
+    phen_fork_parser.add_argument(
+        "-u",
+        "--upstream-url",
+        required=True,
+        help="(Required) URL to the phenotype repository to fork.",
+    )
+    phen_fork_parser.add_argument(
+        "-v",
+        "--upstream-version",
+        required=True,
+        help="(Required) Phenotype version to fork.",
+    )
+    phen_fork_parser.set_defaults(func=phen_fork)
+
     # phen validate
     phen_validate_parser = phen_subparsers.add_parser(
         "validate", help="Validate phenotype configuration"
@@ -151,7 +193,7 @@ def main():
         "--phen-dir",
         type=str,
         default=str(phen.DEFAULT_PHEN_PATH.resolve()),
-        help="Phenotype workspace directory",
+        help="(Optional) Local phenotype workspace directory (default is ./workspace/phen).",
     )
     phen_validate_parser.set_defaults(func=phen_validate)
 
@@ -162,7 +204,7 @@ def main():
         "--phen-dir",
         type=str,
         default=str(phen.DEFAULT_PHEN_PATH.resolve()),
-        help="Phenotype workspace directory",
+        help="(Optional) Local phenotype workspace directory (default is ./workspace/phen).",
     )
     phen_map_parser.add_argument(
         "-t",
@@ -181,7 +223,7 @@ def main():
         "--phen-dir",
         type=str,
         default=str(phen.DEFAULT_PHEN_PATH.resolve()),
-        help="Phenotype workspace directory",
+        help="(Optional) Local phenotype workspace directory (default is ./workspace/phen).",
     )
     phen_export_parser.add_argument(
         "-v",
@@ -201,7 +243,7 @@ def main():
         "--phen-dir",
         type=str,
         default=str(phen.DEFAULT_PHEN_PATH.resolve()),
-        help="Phenotype workspace directory",
+        help="(Optional) Local phenotype workspace directory (default is ./workspace/phen).",
     )
     phen_publish_parser.add_argument(
         "-i",
@@ -228,7 +270,7 @@ def main():
         "--phen-dir",
         type=str,
         default=str(phen.DEFAULT_PHEN_PATH.resolve()),
-        help="Phenotype workspace directory",
+        help="(Optional) Local phenotype workspace directory (default is ./workspace/phen).",
     )
     phen_copy_parser.add_argument(
         "-td",
@@ -255,7 +297,7 @@ def main():
         "--phen-dir",
         type=str,
         default=str(phen.DEFAULT_PHEN_PATH.resolve()),
-        help="Directory for the changed phenotype version, defaults to workspace directory",
+        help="(Optional) Local phenotype workspace directory (default is ./workspace/phen).",
     )
     phen_diff_parser.add_argument(
         "-v",
diff --git a/acmc/phen.py b/acmc/phen.py
index 03df2d4..39568d5 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -182,6 +182,84 @@ def check_delete_dir(path, msg):
     return deleted
 
 
+def fork(phen_dir, upstream_url, upstream_version, new_origin_url=None):
+    logger.info(
+        f"Forking upstream repo {upstream_url} {upstream_version} into directory: {phen_dir}"
+    )
+
+    phen_path = Path(phen_dir)
+    # check if directory already exists and ask user if they want to recreate it
+    if (
+        phen_path.exists() and phen_path.is_dir()
+    ):  # Check if it exists and is a directory
+        configure = check_delete_dir(
+            phen_path,
+            f"The phen directory already exists. Do you want to reinitialise? (yes/no): ",
+        )
+    else:
+        configure = True
+
+    if not configure:
+        logger.info(f"Exiting, phenotype not initiatised")
+        return
+
+    try:
+        # Clone repo
+        git_url = construct_git_url(upstream_url)
+        repo = git.Repo.clone_from(git_url, phen_path)
+
+        # Fetch all branches and tags
+        repo.remotes.origin.fetch()
+
+        # Check if the version exists
+        available_refs = [ref.name.split("/")[-1] for ref in repo.references]
+        if upstream_version not in available_refs:
+            raise ValueError(
+                f"Version '{upstream_version}' not found in the repository: {upstream_url}."
+            )
+
+        # Checkout the specified version
+        repo.git.checkout(upstream_version)
+        main_branch = repo.heads[DEFAULT_GIT_BRANCH]
+        main_branch.checkout()
+
+        # Check if 'config.yaml' exists in the root directory
+        config_path = phen_path / "config.yaml"
+        if not os.path.isfile(config_path):
+            raise ValueError(
+                f"The forked repository is not a valid ACMC repo because 'config.yaml' is missing in the root directory."
+            )
+
+        # Validate the phenotype is compatible with the acmc tool
+        validate(phen_path)
+
+        # Delete each tag locally
+        tags = repo.tags
+        for tag in tags:
+            repo.delete_tag(tag)
+            logger.debug(f"Deleted tags from forked repo: {tag}")
+
+        # Add upstream remote
+        repo.create_remote("upstream", upstream_url)
+        repo.delete_remote("origin")  # Remove existing origin
+
+        # Optionally set a new origin remote
+        if new_origin_url:
+            git_url = construct_git_url(new_origin_url)
+            repo.create_remote("origin", git_url)
+            repo.git.push("--set-upstream", "origin", "main")
+
+        logger.info(f"Repository forked successfully at {phen_path}")
+        logger.info(f"Upstream set to {upstream_url}")
+        if new_origin_url:
+            logger.info(f"Origin set to {new_origin_url}")
+
+    except Exception as e:
+        if phen_path.exists():
+            shutil.rmtree(phen_path)
+        raise ValueError(f"Error occurred during repository fork: {str(e)}")
+
+
 def init(phen_dir, remote_url):
     """Initial phenotype directory as git repo with standard structure"""
     logger.info(f"Initialising Phenotype in directory: {phen_dir}")
@@ -886,8 +964,11 @@ def publish(phen_dir, msg, remote_url, increment=DEFAULT_VERSION_INC):
 
     # check if any changes to publish
     if not repo.is_dirty() and not repo.untracked_files:
-        logger.info("Nothing to publish, no changes to the repo")
-        return
+        if remote_url is not None and "origin" not in repo.remotes:
+            logger.info(f"First publish to remote url {remote_url}")
+        else:
+            logger.info("Nothing to publish, no changes to the repo")
+            return
 
     # get next version
     new_version_str = generate_version_tag(repo, increment)
@@ -919,15 +1000,16 @@ def publish(phen_dir, msg, remote_url, increment=DEFAULT_VERSION_INC):
 
     # push to origin if a remote repo
     if remote_url is not None and "origin" not in repo.remotes:
-        repo.create_remote("origin", remote_url)
+        git_url = construct_git_url(remote_url)
+        repo.create_remote("origin", git_url)
 
     try:
         if "origin" in repo.remotes:
-            logger.debug(f"Remote 'origin' is already set {repo.remotes.origin.url}")
+            logger.debug(f"Remote 'origin' is set {repo.remotes.origin.url}")
             origin = repo.remotes.origin
-            logger.info(f"Pushing main branch to {repo.remotes.origin.url}")
-            origin.push("main")
-            logger.info(f"Pushing tags to {repo.remotes.origin.url}")
+            logger.info(f"Pushing main branch to remote repo")
+            repo.git.push("--set-upstream", "origin", "main")
+            logger.info(f"Pushing version tags to remote git repo")
             origin.push(tags=True)
             logger.debug("Changes pushed to 'origin'")
         else:
diff --git a/docs/usage.md b/docs/usage.md
index bbc7ab3..168ab9c 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -67,16 +67,29 @@ The `omop` command is used for installing OMOP vocabularies.
 
 The `phen` command is used phenotype-related operations.
 
-- **Initialize Phenotype**
+- **Initialise Phenotype**
 
-  Initialize a phenotype directory locally or from a remote git repository:
+  Initialise a phenotype directory locally or with a remote git repository:
 
   ```bash
   acmc phen init -d <PHENOTYPE_DIRECTORY> -r <REMOTE_URL>
   ```
 
-  - `-d`, `--phen-dir`: (Optional) Directory to write phenotype configuration (the default is ./build/phen).
-  - `-r`, `--remote_url`: (Optional) URL to a remote git repository.
+  - `-d`, `--phen-dir`: (Optional) Directory to write phenotype configuration (the default is ./workspace/phen).
+  - `-r`, `--remote_url`: (Optional) URL to a remote git repository where the phenotype will be published, only supports an empty repo without existing commits.
+ 
+- **Fork Existing Phenotype**
+
+  Initialise a phenotype by forking an existing phenotype published in a git repository:
+
+  ```bash
+  acmc phen fork -d <PHENOTYPE_DIRECTORY> -r <REMOTE_URL> -u <UPSTREAM_URL> -v <UPSTREAM_VERSION>
+  ```
+
+  - `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen).
+  - `-r`, `--remote_url`: (Optional) URL to a remote git repository where the phenotype will be published, only supports an empty repo without existing commits.
+  - `-u`, `--upstream-url`: (Required) URL to the phenotype repository to fork.
+  - `-v`, `--upstream-version`: (Required) Phenotype version to fork. 
 
 - **Validate Phenotype**
 
@@ -86,7 +99,7 @@ The `phen` command is used phenotype-related operations.
   acmc phen validate -d <PHENOTYPE_DIRECTORY>
   ```
 
-  - `-d`, `--phen-dir`: (Optional) Directory of phenotype configuration (the default is ./build/phen).
+  - `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen).
 
 - **Map Phenotype**
 
@@ -97,7 +110,7 @@ The `phen` command is used phenotype-related operations.
   ```
 
   - `-t`, `--target-coding`: (Optional) Specify the target coding (e.g., `read2`, `read3`, `icd10`, `snomed`, `opcs4`).
-  - `-d`, `--phen-dir`: (Optional) Directory of phenotype configuration (the default is ./build/phen).
+  - `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen).
 
 - **Publish Phenotype Configuration**
 
@@ -107,10 +120,10 @@ The `phen` command is used phenotype-related operations.
   acmc phen publish -d <PHENOTYPE_DIRECTORY>
   ```
 
-  - `-d`, `--phen-dir`: (Optional) Directory of phenotype configuration (the default is ./build/phen).
+  - `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen).
   - `-i`, `--increment`: (Optional) Version increment: `major`, `minor`, or `patch`, default is `patch` increment 
   - `-m`, `--msg`: (Optional) Message to include with the published version
-  - `-r`, `--remote_url`: (Optional) URL to a remote git repository, only supports an empty repo without existing commits.
+  - `-r`, `--remote_url`: (Optional) URL to a remote git repository where the phenotype will be published, only supports an empty repo without existing commits.
 
 - **Copy Phenotype Configuration**
 
@@ -120,7 +133,7 @@ The `phen` command is used phenotype-related operations.
   acmc phen copy -d <PHENOTYPE_DIRECTORY> -td <TARGET_DIRECTORY> -v <PHENOTYPE_VERSION>
   ```
 
-  - `-d`, `--phen-dir`: (Optional) Directory of phenotype configuration (the default is ./build/phen).
+  - `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen).
   - `-td`, `--target-dir`: (Optional) Directory to copy the phenotype configuration to, (the default is ./build).
   - `-v`, `--version`: The phenotype version to copy.
 
@@ -132,7 +145,7 @@ The `phen` command is used phenotype-related operations.
   acmc phen diff -d <NEW_PHENOTYPE_DIRECTORY> -old <OLD_PHENOTYPE_DIRECTORY>
   ```
 
-  - `-d`, `--phen-dir`: (Optional) Directory of changed phenotype, default is `./workspace/phen`.
+  - `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen).
   - `-v`, `--version`: (Optional) Directory of changed phenotype version, default is `latest` which is the current files in the changed phen directory. 
   - `-od`, `--old-phen-dir`: (Optional) Directory of old phenotype, default is `./workspace/phen`.
   - `-ov`, `--old-version`: (Required) Old phenotype version to compare with the chnaged version, default is `latest` which are the current files in the old phen directory.  
diff --git a/tests/test_acmc.py b/tests/test_acmc.py
index c8ece57..7ebdaa4 100644
--- a/tests/test_acmc.py
+++ b/tests/test_acmc.py
@@ -330,3 +330,18 @@ def test_diff(tmp_dir, monkeypatch, caplog):
     assert "Removed concepts ['ABDO_PAIN']" in content
     assert "Added concepts ['DEPRESSION', 'DID_NOT_ATTEND', 'HYPERTENSION']" in content
     assert "Added outputs: ['read3.csv', 'snomed.csv']" in content
+
+
+# TEST REPO NEEDS TO BE AUTOMATED
+
+# Create remote repo acmc-test1 (https://git.soton.ac.uk/mjbonifa/acmc-test1.git) and acmc-test2 (https://git.soton.ac.uk/mjbonifa/acmc-test2.git)
+
+# Init repo from the remote acmc-test1
+
+# Copy example and run map
+
+# Publish creating a version on the remote repo
+
+# Fork repo from acmc-test1 with remote acmc-test2
+
+# Publish repo creating a version on the new repo
-- 
GitLab