From 9df8d5d286de18026eafdd4ea9520b83d3cd74cd Mon Sep 17 00:00:00 2001
From: Michael Boniface <m.j.boniface@soton.ac.uk>
Date: Thu, 6 Mar 2025 09:43:09 +0000
Subject: [PATCH] (fix) Changed the versioning of phenotypes to semantic
 versioning using the last tag in the repo rather than the commit count. The
 user can specify if they want to increment major, minor or patch when they
 publish which keeps the versioning simple. Previously we used the commit
 count but that was problematic due to always incrementing the patch versions
 and if using an existing repo the commit history could be large. This would
 especially be the case when forking a repo where the commit history is
 retained. Closes #51

---
 acmc/main.py          | 10 +++++-
 acmc/phen.py          | 74 ++++++++++++++++++++++++++-----------------
 docs/usage.md         |  3 +-
 examples/config1.yaml |  2 +-
 examples/config2.yaml |  2 +-
 examples/config3.yaml |  2 +-
 pyproject.toml        |  3 +-
 tests/test_acmc.py    | 18 +++++------
 8 files changed, 70 insertions(+), 44 deletions(-)

diff --git a/acmc/main.py b/acmc/main.py
index ec29b5d..5c0b30a 100644
--- a/acmc/main.py
+++ b/acmc/main.py
@@ -53,7 +53,7 @@ def phen_export(args):
 
 def phen_publish(args):
     """Handle the `phen publish` command."""
-    phen.publish(args.phen_dir, args.msg, args.remote_url)
+    phen.publish(args.phen_dir, args.msg, args.remote_url, args.increment)
 
 
 def phen_copy(args):
@@ -203,6 +203,14 @@ def main():
         default=str(phen.DEFAULT_PHEN_PATH.resolve()),
         help="Phenotype workspace directory",
     )
+    phen_publish_parser.add_argument(
+        "-i",
+        "--increment",
+        type=str,
+        default=phen.DEFAULT_VERSION_INC,
+        choices=phen.SEMANTIC_VERSION_TYPES,
+        help=f"Version increment: {phen.SEMANTIC_VERSION_TYPES}, default is {phen.DEFAULT_VERSION_INC} increment",
+    )
     phen_publish_parser.add_argument(
         "-m", "--msg", help="Message to include with the published version"
     )
diff --git a/acmc/phen.py b/acmc/phen.py
index 86675bf..d4a95d0 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -11,6 +11,7 @@ import re
 import logging
 import requests
 import yaml
+import semver
 from cerberus import Validator
 from deepdiff import DeepDiff
 from pathlib import Path
@@ -37,6 +38,8 @@ OMOP_PATH = Path(CONCEPT_SET_DIR) / "omop"
 DEFAULT_PHEN_DIR_LIST = [CONCEPTS_DIR, MAP_DIR, CONCEPT_SET_DIR]
 CONFIG_FILE = "config.yaml"
 VOCAB_VERSION_FILE = "vocab_version.yaml"
+SEMANTIC_VERSION_TYPES = ["major", "minor", "patch"]
+DEFAULT_VERSION_INC = "patch"
 
 DEFAULT_GIT_BRANCH = "main"
 
@@ -58,7 +61,7 @@ CONFIG_SCHEMA = {
             "version": {
                 "type": "string",
                 "required": True,
-                "regex": r"^v\d+\.\d+\.\d+$",  # Enforces 'vN.N.N' format
+                "regex": r"^\d+\.\d+\.\d+$",  # Enforces 'N.N.N' format (no 'v' prefix)
             },
             "omop": {
                 "type": "dict",
@@ -258,15 +261,10 @@ def init(phen_dir, remote_url):
     for d in DEFAULT_PHEN_DIR_LIST:
         create_empty_git_dir(phen_path / d)
 
-    # set initial version based on the number of commits in the repo, depending on how the repo was created
-    # e.g., with a README.md, then there will be some initial commits before the phen config is added
-    next_commit_count = commit_count + 1
-    initial_version = f"v1.0.{next_commit_count}"
-
     # create empty phen config file
     config = {
         "phenotype": {
-            "version": initial_version,
+            "version": "0.0.0",
             "omop": {
                 "vocabulary_id": "",
                 "vocabulary_name": "",
@@ -365,7 +363,7 @@ def validate(phen_dir):
     code_types = parse.CodeTypeParser().code_types
 
     # check the version number is of the format vn.n.n
-    match = re.match(r"v(\d+\.\d+\.\d+)", phenotype["version"])
+    match = re.match(r"(\d+\.\d+\.\d+)", phenotype["version"])
     if not match:
         validation_errors.append(
             f"Invalid version format in configuration file: {phenotype['version']}"
@@ -840,7 +838,35 @@ def map_target_code_type(phen_path, phenotype, target_code_type):
     logger.info(f"Phenotype processed target code type {target_code_type}")
 
 
-def publish(phen_dir, msg, remote_url):
+def generate_version_tag(repo, increment=DEFAULT_VERSION_INC, use_v_prefix=False):
+    # Get all valid semantic version tags
+    versions = []
+    for tag in repo.tags:
+        tag_name = (
+            tag.name.lstrip("v") if use_v_prefix else tag.name
+        )  # Remove 'v' if needed
+        if semver.Version.is_valid(tag_name):
+            versions.append(semver.Version.parse(tag_name))
+
+    # Determine the next version
+    if not versions:
+        new_version = semver.Version(0, 0, 1)
+    else:
+        latest_version = max(versions)
+        if increment == "major":
+            new_version = latest_version.bump_major()
+        elif increment == "minor":
+            new_version = latest_version.bump_minor()
+        else:
+            new_version = latest_version.bump_patch()
+
+    # Format the new version string (the caller creates the git tag)
+    new_version_str = f"v{new_version}" if use_v_prefix else str(new_version)
+
+    return new_version_str
+
+
+def publish(phen_dir, msg, remote_url, increment=DEFAULT_VERSION_INC):
     """Publishes updates to the phenotype by commiting all changes to the repo directory"""
 
     # Validate config
@@ -862,21 +888,16 @@ def publish(phen_dir, msg, remote_url):
         logger.info("Nothing to publish, no changes to the repo")
         return
 
-    # get major version from configuration file
+    # get next version
+    new_version_str = generate_version_tag(repo, increment)
+    logger.info(f"New version: {new_version_str}")
+
+    # Write version in configuration file
     config_path = phen_path / CONFIG_FILE
     with config_path.open("r") as file:
         config = yaml.safe_load(file)
-    match = re.match(r"v(\d+\.\d+)", config["phenotype"]["version"])
-    major_version = match.group(1)
-
-    # get latest minor version from git commit count
-    commit_count = len(list(repo.iter_commits("HEAD")))
 
-    # set version and write to config file so consistent with repo version
-    next_minor_version = commit_count + 1
-    version = f"v{major_version}.{next_minor_version}"
-    logger.debug(f"New version: {version}")
-    config["phenotype"]["version"] = version
+    config["phenotype"]["version"] = new_version_str
     with open(config_path, "w") as file:
         yaml.dump(
             config,
@@ -887,18 +908,13 @@ def publish(phen_dir, msg, remote_url):
             default_style='"',
         )
 
-    # Add and commit changes to repo
+    # Add and commit changes to repo including version updates
     commit_message = f"Committing updates to phenotype {phen_path}"
     repo.git.add("--all")
     repo.index.commit(commit_message)
 
-    # Create and push the tag
-    if version in repo.tags:
-        raise Exception(f"Tag {version} already exists in repo {phen_path}")
-    if msg is None:
-        msg = f"Release {version}"
-    repo.create_tag(version, message=msg)
-    logger.info(f"New version: {version}")
+    # Add tag to the repo
+    repo.create_tag(new_version_str)
 
     # push to origin if a remote repo
     if remote_url is not None and "origin" not in repo.remotes:
@@ -916,7 +932,7 @@ def publish(phen_dir, msg, remote_url):
         else:
             logger.debug("Remote 'origin' is not set")
     except Exception as e:
-        repo.delete_tag(version)
+        repo.delete_tag(new_version_str)
         repo.git.reset("--soft", "HEAD~1")
         raise e
 
diff --git a/docs/usage.md b/docs/usage.md
index b364849..e042826 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -108,7 +108,8 @@ The `phen` command is used phenotype-related operations.
   ```
 
   - `-d`, `--phen-dir`: (Optional) Directory of phenotype configuration (the default is ./build/phen).
-  - `-m`, `--msg`: (Optional) Message to include with the published version- 
+  - `-i`, `--increment`: (Optional) Version increment: `major`, `minor`, or `patch` (the default is `patch`).
+  - `-m`, `--msg`: (Optional) Message to include with the published version
   - `-r`, `--remote_url`: (Optional) URL to a remote git repository, only supports an empty repo without existing commits.
 
 - **Copy Phenotype Configuration**
diff --git a/examples/config1.yaml b/examples/config1.yaml
index 09d0e80..2709d3f 100644
--- a/examples/config1.yaml
+++ b/examples/config1.yaml
@@ -1,5 +1,5 @@
 phenotype:
-  version: "v1.0.1"
+  version: "0.0.0"
   omop:
     vocabulary_id: "ACMC_Example_1"
     vocabulary_name: "ACMC example 1 phenotype"
diff --git a/examples/config2.yaml b/examples/config2.yaml
index 4c6252e..4a9ad79 100644
--- a/examples/config2.yaml
+++ b/examples/config2.yaml
@@ -1,5 +1,5 @@
 phenotype:
-  version: "v1.0.1"
+  version: "0.0.0"
   omop:
     vocabulary_id: "ACMC_Example_2"
     vocabulary_name: "ACMC example 2 phenotype"
diff --git a/examples/config3.yaml b/examples/config3.yaml
index 764d7d8..2e07427 100644
--- a/examples/config3.yaml
+++ b/examples/config3.yaml
@@ -1,5 +1,5 @@
 phenotype:
-  version: "v1.0.1"
+  version: "0.0.0"
   omop:
     vocabulary_id: "ACMC_Example_3"
     vocabulary_name: "ACMC example 3 phenotype"
diff --git a/pyproject.toml b/pyproject.toml
index c340e83..39f046a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,8 @@ dependencies = [
     "tables", 	
     "pytest",
 	"pyyaml",	
-    "requests",	
+    "requests",
+    "semver", 
     "simpledbf",
     "smmap",
     "sqlalchemy",
diff --git a/tests/test_acmc.py b/tests/test_acmc.py
index a9bda40..ee5dcf1 100644
--- a/tests/test_acmc.py
+++ b/tests/test_acmc.py
@@ -141,7 +141,7 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):
                 "-td",
                 str(tmp_dir.resolve()),
                 "-v",
-                "v1.0.3",
+                "0.0.1",
             ],
         )
         main.main()
@@ -149,7 +149,7 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):
 
     # diff phenotype
     with caplog.at_level(logging.DEBUG):
-        old_path = tmp_dir / "v1.0.3"
+        old_path = tmp_dir / "0.0.1"
         monkeypatch.setattr(
             sys,
             "argv",
@@ -234,7 +234,7 @@ def test_diff(tmp_dir, monkeypatch, caplog):
                 "-td",
                 str(tmp_dir.resolve()),
                 "-v",
-                "v1.0.3",
+                "0.0.1",
             ],
         )
         main.main()
@@ -260,9 +260,9 @@ def test_diff(tmp_dir, monkeypatch, caplog):
             main.main()
     assert "Phenotype processed successfully" in caplog.text
 
-    # diff phenotype with v1.0.3
+    # diff phenotype with 0.0.1
     with caplog.at_level(logging.DEBUG):
-        old_path = tmp_dir / "v1.0.3"
+        old_path = tmp_dir / "0.0.1"
         monkeypatch.setattr(
             sys,
             "argv",
@@ -280,7 +280,7 @@ def test_diff(tmp_dir, monkeypatch, caplog):
     assert "Phenotypes diff'd successfully" in caplog.text
 
     # check changes
-    with open(phen_path / "v1.0.3_diff.md", "r") as file:
+    with open(phen_path / "0.0.1_diff.md", "r") as file:
         content = file.read()
     assert "Removed concepts ['ABDO_PAIN']" in content
     assert "Added concepts ['DID_NOT_ATTEND']" in content
@@ -306,9 +306,9 @@ def test_diff(tmp_dir, monkeypatch, caplog):
             main.main()
     assert "Phenotype processed successfully" in caplog.text
 
-    # diff phenotype with v1.0.3
+    # diff phenotype with 0.0.1
     with caplog.at_level(logging.DEBUG):
-        old_path = tmp_dir / "v1.0.3"
+        old_path = tmp_dir / "0.0.1"
         monkeypatch.setattr(
             sys,
             "argv",
@@ -325,7 +325,7 @@ def test_diff(tmp_dir, monkeypatch, caplog):
         main.main()
     assert "Phenotypes diff'd successfully" in caplog.text
 
-    with open(phen_path / "v1.0.3_diff.md", "r") as file:
+    with open(phen_path / "0.0.1_diff.md", "r") as file:
         content = file.read()
     assert "Removed concepts ['ABDO_PAIN']" in content
     assert "Added concepts ['DEPRESSION', 'DID_NOT_ATTEND', 'HYPERTENSION']" in content
-- 
GitLab