From c5bbb8ac8f8271b631331f2546111f273ed2e57c Mon Sep 17 00:00:00 2001
From: Jakub Dylag <jjd1c23@soton.ac.uk>
Date: Wed, 23 Apr 2025 09:37:22 +0100
Subject: [PATCH] (feat) flag do not translate in phen map

---
 acmc/main.py |  8 +++++++-
 acmc/phen.py | 14 ++++++++------
 docs/cli.md  |  2 ++
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/acmc/main.py b/acmc/main.py
index 5f796b0..91a40bc 100644
--- a/acmc/main.py
+++ b/acmc/main.py
@@ -58,7 +58,7 @@ def _phen_validate(args: argparse.Namespace):
 
 def _phen_map(args: argparse.Namespace):
     """Handle the `phen map` command."""
-    phen.map(args.phen_dir, args.target_coding)
+    phen.map(args.phen_dir, args.target_coding, args.not_translate)
 
 
 def _phen_export(args: argparse.Namespace):
@@ -217,6 +217,12 @@ def main():
         choices=parse.SUPPORTED_CODE_TYPES,
         help=f"Specify the target coding {parse.SUPPORTED_CODE_TYPES}",
     )
+    phen_map_parser.add_argument(
+        "--not-translate",
+        action='store_true',
+        default=False,
+        help="(Optional) Prevent any phenotype translation using NHS TRUD vocabularies.",
+    ) 
     phen_map_parser.set_defaults(func=_phen_map)
 
     # phen export
diff --git a/acmc/phen.py b/acmc/phen.py
index 052e1a0..8d6cb82 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -663,7 +663,7 @@ def _preprocess_source_concepts(
 
 # Translate Df with multiple codes into single code type Series
 def translate_codes(
-    source_df: pd.DataFrame, target_code_type: str, concept_name: str
+    source_df: pd.DataFrame, target_code_type: str, concept_name: str, not_translate:bool
 ) -> pd.DataFrame:
     """Translates each source code type the source coding list into a target type and returns all conversions as a concept set"""
 
@@ -688,7 +688,7 @@ def translate_codes(
             _logger.debug(
                 f"Target code type {target_code_type} is the same as source code type {len(source_df)}, copying codes rather than translating"
             )
-        else:
+        elif not not_translate:
             # get the translation filename using source to target code types
             filename = f"{source_code_type}_to_{target_code_type}.parquet"
             map_path = trud.PROCESSED_PATH / filename
@@ -783,7 +783,7 @@ def write_vocab_version(phen_path: Path):
         )
 
 
-def map(phen_dir: str, target_code_type: str):
+def map(phen_dir: str, target_code_type: str, not_translate:bool):
     _logger.info(f"Processing phenotype: {phen_dir}")
 
     # Validate configuration
@@ -807,15 +807,15 @@ def map(phen_dir: str, target_code_type: str):
         )
 
     if target_code_type is not None:
-        _map_target_code_type(phen_path, phenotype, target_code_type)
+        _map_target_code_type(phen_path, phenotype, target_code_type, not_translate)
     else:
         for t in phenotype["map"]:
-            _map_target_code_type(phen_path, phenotype, t)
+            _map_target_code_type(phen_path, phenotype, t, not_translate)
 
     _logger.info(f"Phenotype processed successfully")
 
 
-def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: str):
+def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: str, not_translate:bool):
     _logger.debug(f"Target coding format: {target_code_type}")
     concepts_path = phen_path / CONCEPTS_DIR
     # Create output dataframe
@@ -882,6 +882,7 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
                             source_df,
                             target_code_type=target_code_type,
                             concept_name=concept_set_name,
+                            not_translate=not_translate,
                         )
                         trans_out = add_metadata(
                             codes=trans_out,
@@ -894,6 +895,7 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
                     source_df,
                     target_code_type=target_code_type,
                     concept_name=concept_set_name,
+                    not_translate=not_translate,
                 )
                 trans_out = add_metadata(
                     codes=trans_out,
diff --git a/docs/cli.md b/docs/cli.md
index 4b756dc..0a1cf39 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -111,6 +111,8 @@ The `phen` command is used phenotype-related operations.
 
   - `-t`, `--target-coding`: (Optional) Specify the target coding (e.g., `read2`, `read3`, `icd10`, `snomed`, `opcs4`).
   - `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen).
+  - `--not-translate`: (Optional) Prevent any phenotype translation using NHS TRUD vocabularies. Only concepts that are already in the target coding will be mapped.
+
 
 - **Publish Phenotype Configuration**
 
-- 
GitLab