
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: meldb/concepts-processing
Commits on Source (7)
@@ -58,7 +58,13 @@ def _phen_validate(args: argparse.Namespace):
 def _phen_map(args: argparse.Namespace):
     """Handle the `phen map` command."""
-    phen.map(args.phen_dir, args.target_coding, args.not_translate, args.no_metadata)
+    phen.map(
+        args.phen_dir,
+        args.target_coding,
+        args.not_translate,
+        args.no_metadata,
+        args.do_reverse_translate,
+    )
 
 
 def _phen_export(args: argparse.Namespace):
@@ -235,6 +241,12 @@ def main():
         default=False,
         help="(Optional) Prevent copying of metadata columns to output.",
     )
+    phen_map_parser.add_argument(
+        "--do-reverse-translate",
+        action="store_true",
+        default=False,
+        help="(Optional) Enable reversing one directional mappings. WARNING goes against NHS TRUD guidelines.",
+    )
     phen_map_parser.set_defaults(func=_phen_map)
 
     # phen export
......
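For reference, a minimal standalone argparse sketch (not the project's actual parser setup) of how the new flag behaves: as a store_true option with default=False, reverse translation stays off unless `--do-reverse-translate` is passed explicitly. The parser name below is a hypothetical stand-in for phen_map_parser.

    import argparse

    # Hypothetical stand-in parser; only the new flag is wired up here.
    parser = argparse.ArgumentParser(prog="phen-map-sketch")
    parser.add_argument("--do-reverse-translate", action="store_true", default=False)

    print(parser.parse_args([]).do_reverse_translate)                          # False
    print(parser.parse_args(["--do-reverse-translate"]).do_reverse_translate)  # True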
@@ -99,6 +99,8 @@ class Proto:
                     _logger.debug(f"Check: Fixed")
                 except InvalidCodesException as ex:
                     errors.append(ex.error)
+                    codes = codes[cond(codes)]  # remove codes that cannot be fixed
+                    _logger.debug(f"Check: Invalid Codes Removed, no fix available")
             else:
                 _logger.debug(f"Check: passed")
@@ -231,7 +233,7 @@ class Read3(Proto):
                     f"QA Alphanumeric Dot",
                     codes=codes,
                     codes_file=codes_file,
-                    mask=None,
+                    mask=codes.str.match(r"^[a-zA-Z0-9.]+$"),
                     code_type=self.name,
                 )
             )
@@ -246,7 +248,7 @@ class Read3(Proto):
                     f"QA In Database",
                     codes=codes,
                     codes_file=codes_file,
-                    mask=None,
+                    mask=self.in_database(codes, self.db, self.name),
                     code_type=self.name,
                 )
             )
@@ -385,6 +387,13 @@ class Snomed(Proto):
                     )
                 ),
             ),
+            (
+                "Is Integer",
+                lambda codes: ~codes.str.contains("."),
+                lambda codes, codes_file: codes.str.split(".")
+                .str[0]
+                .astype(str),  # Convert from float to integer and back to string
+            ),
             (
                 "Numeric",
                 lambda codes: codes.str.match(r"[0-9]+$"),
@@ -400,11 +409,6 @@ class Snomed(Proto):
                     )
                 ),
             ),
-            # (
-            #     "Is Integer",
-            #     lambda codes : codes.dtype == int,
-            #     lambda codes : codes.astype(int) #Convert to integer
-            # ),
             (
                 "In Database",
                 lambda codes: self.in_database(codes, self.db, self.name),
......
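The new "Is Integer" entry replaces the commented-out version and targets SNOMED codes that were read in as floats. A sketch of the fix in isolation, with made-up code values (pandas only; the codes_file argument is omitted because the lambda does not use it):

    import pandas as pd

    codes = pd.Series(["431314004.0", "73211009"])   # e.g. values parsed from a spreadsheet as floats
    fixed = codes.str.split(".").str[0].astype(str)  # strip the ".0" fractional part
    print(fixed.tolist())                            # ['431314004', '73211009']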
@@ -666,6 +666,7 @@ def translate_codes(
     target_code_type: str,
     concept_name: str,
     not_translate: bool,
+    do_reverse_translate: bool,
 ) -> pd.DataFrame:
     """Translates each source code type the source coding list into a target type and returns all conversions as a concept set"""
@@ -695,23 +696,17 @@
         filename = f"{source_code_type}_to_{target_code_type}.parquet"
         map_path = trud.PROCESSED_PATH / filename
+        filename_reversed = f"{target_code_type}_to_{source_code_type}.parquet"
+        map_path_reversed = trud.PROCESSED_PATH / filename_reversed
 
         # do the mapping if it exists
         if map_path.exists():
-            # get mapping
-            df_map = pd.read_parquet(map_path)
-
-            # do mapping
-            translated_df = pd.merge(
-                source_df[source_code_type], df_map, how="left"
-            )
-
-            # normalise the output
-            translated_df.columns = pd.Index(["SOURCE_CONCEPT", "CONCEPT"])
-            translated_df["SOURCE_CONCEPT_TYPE"] = source_code_type
-
-            # add to list of codes
-            codes = pd.concat([codes, translated_df])
+            codes = _translate_codes(map_path, source_df, source_code_type, codes)
+        # otherwise do reverse mapping if enabled and it exists
+        elif do_reverse_translate and map_path_reversed.exists():
+            codes = _translate_codes(
+                map_path_reversed, source_df, source_code_type, codes, reverse=True
+            )
         else:
             _logger.warning(
                 f"No mapping from {source_code_type} to {target_code_type}, file {str(map_path.resolve())} does not exist"
@@ -728,6 +723,30 @@
     return codes
 
 
+def _translate_codes(
+    map_path, source_df, source_code_type, codes, reverse=False
+) -> pd.DataFrame:
+    # get mapping
+    df_map = pd.read_parquet(map_path)
+
+    # do mapping
+    if not (reverse):
+        translated_df = pd.merge(source_df[source_code_type], df_map, how="left")
+    else:
+        translated_df = pd.merge(
+            source_df[source_code_type], df_map, how="left"
+        )  # output codes from target as reversed
+
+    # normalise the output
+    translated_df.columns = pd.Index(["SOURCE_CONCEPT", "CONCEPT"])
+    translated_df["SOURCE_CONCEPT_TYPE"] = source_code_type
+
+    # add to list of codes
+    codes = pd.concat([codes, translated_df])
+
+    return codes
+
+
 def _write_code_errors(code_errors: list, code_errors_path: Path):
     err_df = pd.DataFrame(
         [
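To illustrate what _translate_codes does with a mapping file, here is a sketch of the same merge and column normalisation using small in-memory frames in place of a TRUD parquet file; the read2/snomed column names and code values are assumptions for illustration only.

    import pandas as pd

    source_df = pd.DataFrame({"read2": ["C10..", "G30.."]})
    df_map = pd.DataFrame(
        {"read2": ["C10..", "G30.."], "snomed": ["73211009", "38341003"]}
    )  # stand-in for pd.read_parquet(map_path)

    translated_df = pd.merge(source_df["read2"], df_map, how="left")  # join on the source code column
    translated_df.columns = pd.Index(["SOURCE_CONCEPT", "CONCEPT"])   # normalise the output
    translated_df["SOURCE_CONCEPT_TYPE"] = "read2"
    print(translated_df)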
@@ -786,7 +805,13 @@ def write_vocab_version(phen_path: Path):
     )
 
 
-def map(phen_dir: str, target_code_type: str, not_translate: bool, no_metadata: bool):
+def map(
+    phen_dir: str,
+    target_code_type: str,
+    not_translate: bool,
+    no_metadata: bool,
+    do_reverse_translate: bool,
+):
     _logger.info(f"Processing phenotype: {phen_dir}")
 
     # Validate configuration
@@ -811,11 +836,23 @@ def map(phen_dir: str, target_code_type: str, not_translate: bool, no_metadata:
     if target_code_type is not None:
         _map_target_code_type(
-            phen_path, phenotype, target_code_type, not_translate, no_metadata
+            phen_path,
+            phenotype,
+            target_code_type,
+            not_translate,
+            no_metadata,
+            do_reverse_translate,
         )
     else:
         for t in phenotype["map"]:
-            _map_target_code_type(phen_path, phenotype, t, not_translate, no_metadata)
+            _map_target_code_type(
+                phen_path,
+                phenotype,
+                t,
+                not_translate,
+                no_metadata,
+                do_reverse_translate,
+            )
 
     _logger.info(f"Phenotype processed successfully")
@@ -826,6 +863,7 @@ def _map_target_code_type(
     target_code_type: str,
     not_translate: bool,
     no_metadata: bool,
+    do_reverse_translate: bool,
 ):
     _logger.debug(f"Target coding format: {target_code_type}")
     concepts_path = phen_path / CONCEPTS_DIR
@@ -894,6 +932,7 @@ def _map_target_code_type(
                 target_code_type=target_code_type,
                 concept_name=concept_set_name,
                 not_translate=not_translate,
+                do_reverse_translate=do_reverse_translate,
             )
             trans_out = add_metadata(
                 codes=trans_out,
@@ -908,6 +947,7 @@ def _map_target_code_type(
                 target_code_type=target_code_type,
                 concept_name=concept_set_name,
                 not_translate=not_translate,
+                do_reverse_translate=do_reverse_translate,
             )
             trans_out = add_metadata(
                 codes=trans_out,
......
Source diff could not be displayed: it is too large.
@@ -760,7 +760,7 @@
 <section id="SUPPORTED_CODE_TYPES">
     <div class="attr variable">
         <span class="name">SUPPORTED_CODE_TYPES</span> =
-        <span class="default_value">{&#39;read3&#39;, &#39;read2&#39;, &#39;snomed&#39;, &#39;opcs4&#39;, &#39;icd10&#39;, &#39;atc&#39;}</span>
+        <span class="default_value">{&#39;opcs4&#39;, &#39;atc&#39;, &#39;snomed&#39;, &#39;icd10&#39;, &#39;read2&#39;, &#39;read3&#39;}</span>
     </div>
......
This diff is collapsed.
Source diff could not be displayed: it is too large.
@@ -112,7 +112,8 @@ The `phen` command is used phenotype-related operations.
 - `-t`, `--target-coding`: (Optional) Specify the target coding (e.g., `read2`, `read3`, `icd10`, `snomed`, `opcs4`).
 - `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen).
 - `--not-translate`: (Optional) Prevent any phenotype translation using NHS TRUD vocabularies. Therefore only concepts in already in the traget coding will be mapped.
 - `--no-metadata`: (Optional) Prevent copying of metadata columns to output.
+- `--do-reverse-translate`: (Optional) Enable reversing one directional mappings. **WARNING** goes against NHS TRUD guidelines.
 
 - **Publish Phenotype Configuration**
......