diff --git a/acmc/main.py b/acmc/main.py index 29f5aea00f8c1c5384ed48970cce9204cdda45b1..02ad902c9a1de0dcaa50797b710b3b7828109bb5 100644 --- a/acmc/main.py +++ b/acmc/main.py @@ -58,7 +58,13 @@ def _phen_validate(args: argparse.Namespace): def _phen_map(args: argparse.Namespace): """Handle the `phen map` command.""" - phen.map(args.phen_dir, args.target_coding, args.not_translate, args.no_metadata) + phen.map( + args.phen_dir, + args.target_coding, + args.not_translate, + args.no_metadata, + args.do_reverse_translate, + ) def _phen_export(args: argparse.Namespace): @@ -235,6 +241,12 @@ def main(): default=False, help="(Optional) Prevent copying of metadata columns to output.", ) + phen_map_parser.add_argument( + "--do-reverse-translate", + action="store_true", + default=False, + help="(Optional) Enable reversing one directional mappings. WARNING goes against NHS TRUD guidelines.", + ) phen_map_parser.set_defaults(func=_phen_map) # phen export diff --git a/acmc/phen.py b/acmc/phen.py index 01f417faa03504b523423bad0982faa14a264973..64f6b4983db7a362a9e0f225686877fb4f9b41a8 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -666,6 +666,7 @@ def translate_codes( target_code_type: str, concept_name: str, not_translate: bool, + do_reverse_translate: bool, ) -> pd.DataFrame: """Translates each source code type the source coding list into a target type and returns all conversions as a concept set""" @@ -695,23 +696,15 @@ def translate_codes( filename = f"{source_code_type}_to_{target_code_type}.parquet" map_path = trud.PROCESSED_PATH / filename + filename_reversed = f"{target_code_type}_to_{source_code_type}.parquet" + map_path_reversed = trud.PROCESSED_PATH / filename_reversed + # do the mapping if it exists if map_path.exists(): - # get mapping - df_map = pd.read_parquet(map_path) - - # do mapping - translated_df = pd.merge( - source_df[source_code_type], df_map, how="left" - ) - - # normalise the output - translated_df.columns = pd.Index(["SOURCE_CONCEPT", "CONCEPT"]) - 
translated_df["SOURCE_CONCEPT_TYPE"] = source_code_type - - # add to list of codes - codes = pd.concat([codes, translated_df]) - + codes = _translate_codes(map_path, source_df, source_code_type, codes) + # otherwise do reverse mapping if enabled and it exists + elif do_reverse_translate and map_path_reversed.exists(): + codes = _translate_codes(map_path_reversed, source_df, source_code_type, codes) else: _logger.warning( f"No mapping from {source_code_type} to {target_code_type}, file {str(map_path.resolve())} does not exist" @@ -728,6 +721,23 @@ def translate_codes( return codes +def _translate_codes(map_path, source_df, source_code_type, codes) -> pd.DataFrame: + # get mapping + df_map = pd.read_parquet(map_path) + + # do mapping + translated_df = pd.merge(source_df[source_code_type], df_map, how="left") + + # normalise the output + translated_df.columns = pd.Index(["SOURCE_CONCEPT", "CONCEPT"]) + translated_df["SOURCE_CONCEPT_TYPE"] = source_code_type + + # add to list of codes + codes = pd.concat([codes, translated_df]) + + return codes + + def _write_code_errors(code_errors: list, code_errors_path: Path): err_df = pd.DataFrame( [ @@ -786,7 +796,13 @@ def write_vocab_version(phen_path: Path): ) -def map(phen_dir: str, target_code_type: str, not_translate: bool, no_metadata: bool): +def map( + phen_dir: str, + target_code_type: str, + not_translate: bool, + no_metadata: bool, + do_reverse_translate: bool, +): _logger.info(f"Processing phenotype: {phen_dir}") # Validate configuration @@ -811,11 +827,23 @@ def map(phen_dir: str, target_code_type: str, not_translate: bool, no_metadata: if target_code_type is not None: _map_target_code_type( - phen_path, phenotype, target_code_type, not_translate, no_metadata + phen_path, + phenotype, + target_code_type, + not_translate, + no_metadata, + do_reverse_translate, ) else: for t in phenotype["map"]: - _map_target_code_type(phen_path, phenotype, t, not_translate, no_metadata) + _map_target_code_type( + phen_path, + 
phenotype, + t, + not_translate, + no_metadata, + do_reverse_translate, + ) _logger.info(f"Phenotype processed successfully") @@ -826,6 +854,7 @@ def _map_target_code_type( target_code_type: str, not_translate: bool, no_metadata: bool, + do_reverse_translate: bool, ): _logger.debug(f"Target coding format: {target_code_type}") concepts_path = phen_path / CONCEPTS_DIR @@ -894,6 +923,7 @@ def _map_target_code_type( target_code_type=target_code_type, concept_name=concept_set_name, not_translate=not_translate, + do_reverse_translate=do_reverse_translate, ) trans_out = add_metadata( codes=trans_out, @@ -908,6 +938,7 @@ def _map_target_code_type( target_code_type=target_code_type, concept_name=concept_set_name, not_translate=not_translate, + do_reverse_translate=do_reverse_translate, ) trans_out = add_metadata( codes=trans_out, diff --git a/docs/cli.md b/docs/cli.md index 396f5291183990337ed7d5ce98192673da93ca4c..6001ec7dc2a805c170f070c951c9583bf4cb4184 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -112,7 +112,8 @@ The `phen` command is used phenotype-related operations. - `-t`, `--target-coding`: (Optional) Specify the target coding (e.g., `read2`, `read3`, `icd10`, `snomed`, `opcs4`). - `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen). - `--not-translate`: (Optional) Prevent any phenotype translation using NHS TRUD vocabularies. Therefore only concepts in already in the traget coding will be mapped. - - `--no-metadata`: (Optional) Prevent copying of metadata columns to output. + - `--no-metadata`: (Optional) Prevent copying of metadata columns to output. + - `--do-reverse-translate`: (Optional) Enable reversing one-directional mappings. **WARNING**: this goes against NHS TRUD guidelines. - **Publish Phenotype Configuration**