From c5bbb8ac8f8271b631331f2546111f273ed2e57c Mon Sep 17 00:00:00 2001 From: Jakub Dylag <jjd1c23@soton.ac.uk> Date: Wed, 23 Apr 2025 09:37:22 +0100 Subject: [PATCH] (feat) flag do not translate in phen map --- acmc/main.py | 8 +++++++- acmc/phen.py | 14 ++++++++------ docs/cli.md | 2 ++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/acmc/main.py b/acmc/main.py index 5f796b0..91a40bc 100644 --- a/acmc/main.py +++ b/acmc/main.py @@ -58,7 +58,7 @@ def _phen_validate(args: argparse.Namespace): def _phen_map(args: argparse.Namespace): """Handle the `phen map` command.""" - phen.map(args.phen_dir, args.target_coding) + phen.map(args.phen_dir, args.target_coding, args.not_translate) def _phen_export(args: argparse.Namespace): @@ -217,6 +217,12 @@ def main(): choices=parse.SUPPORTED_CODE_TYPES, help=f"Specify the target coding {parse.SUPPORTED_CODE_TYPES}", ) + phen_map_parser.add_argument( + "--not-translate", + action='store_true', + default=False, + help="(Optional) Prevent any phenotype translation using NHS TRUD vocabularies.", + ) phen_map_parser.set_defaults(func=_phen_map) # phen export diff --git a/acmc/phen.py b/acmc/phen.py index 052e1a0..8d6cb82 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -663,7 +663,7 @@ def _preprocess_source_concepts( # Translate Df with multiple codes into single code type Series def translate_codes( - source_df: pd.DataFrame, target_code_type: str, concept_name: str + source_df: pd.DataFrame, target_code_type: str, concept_name: str, not_translate:bool ) -> pd.DataFrame: """Translates each source code type the source coding list into a target type and returns all conversions as a concept set""" @@ -688,7 +688,7 @@ def translate_codes( _logger.debug( f"Target code type {target_code_type} is the same as source code type {len(source_df)}, copying codes rather than translating" ) - else: + elif not not_translate: # get the translation filename using source to target code types filename = 
f"{source_code_type}_to_{target_code_type}.parquet" map_path = trud.PROCESSED_PATH / filename @@ -783,7 +783,7 @@ def write_vocab_version(phen_path: Path): ) -def map(phen_dir: str, target_code_type: str): +def map(phen_dir: str, target_code_type: str, not_translate:bool): _logger.info(f"Processing phenotype: {phen_dir}") # Validate configuration @@ -807,15 +807,15 @@ def map(phen_dir: str, target_code_type: str): ) if target_code_type is not None: - _map_target_code_type(phen_path, phenotype, target_code_type) + _map_target_code_type(phen_path, phenotype, target_code_type, not_translate) else: for t in phenotype["map"]: - _map_target_code_type(phen_path, phenotype, t) + _map_target_code_type(phen_path, phenotype, t, not_translate) _logger.info(f"Phenotype processed successfully") -def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: str): +def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: str, not_translate:bool): _logger.debug(f"Target coding format: {target_code_type}") concepts_path = phen_path / CONCEPTS_DIR # Create output dataframe @@ -882,6 +882,7 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st source_df, target_code_type=target_code_type, concept_name=concept_set_name, + not_translate=not_translate, ) trans_out = add_metadata( codes=trans_out, @@ -894,6 +895,7 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st source_df, target_code_type=target_code_type, concept_name=concept_set_name, + not_translate=not_translate, ) trans_out = add_metadata( codes=trans_out, diff --git a/docs/cli.md b/docs/cli.md index 4b756dc..0a1cf39 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -111,6 +111,8 @@ The `phen` command is used phenotype-related operations. - `-t`, `--target-coding`: (Optional) Specify the target coding (e.g., `read2`, `read3`, `icd10`, `snomed`, `opcs4`). 
- `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen). + - `--not-translate`: (Optional) Prevent any phenotype translation using NHS TRUD vocabularies. Therefore, only concepts already in the target coding will be mapped. + - **Publish Phenotype Configuration** -- GitLab