Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision

Target

Select target project
  • meldb/concepts-processing
1 result
Select Git revision
Show changes
Commits on Source (7)
......@@ -58,7 +58,13 @@ def _phen_validate(args: argparse.Namespace):
def _phen_map(args: argparse.Namespace):
    """Handle the `phen map` command.

    Forwards the parsed command-line options to `phen.map` exactly once.

    Args:
        args: Parsed CLI namespace; reads `phen_dir`, `target_coding`,
            `not_translate`, `no_metadata` and `do_reverse_translate`.
    """
    # Single call with all five options; `do_reverse_translate` comes from the
    # `--do-reverse-translate` flag.
    phen.map(
        args.phen_dir,
        args.target_coding,
        args.not_translate,
        args.no_metadata,
        args.do_reverse_translate,
    )
def _phen_export(args: argparse.Namespace):
......@@ -235,6 +241,12 @@ def main():
default=False,
help="(Optional) Prevent copying of metadata columns to output.",
)
phen_map_parser.add_argument(
"--do-reverse-translate",
action="store_true",
default=False,
help="(Optional) Enable reversing one directional mappings. WARNING goes against NHS TRUD guidelines.",
)
phen_map_parser.set_defaults(func=_phen_map)
# phen export
......
......@@ -99,6 +99,8 @@ class Proto:
_logger.debug(f"Check: Fixed")
except InvalidCodesException as ex:
errors.append(ex.error)
codes = codes[cond(codes)] # remove codes that cannot be fixed
_logger.debug(f"Check: Invalid Codes Removed, no fix available")
else:
_logger.debug(f"Check: passed")
......@@ -231,7 +233,7 @@ class Read3(Proto):
f"QA Alphanumeric Dot",
codes=codes,
codes_file=codes_file,
mask=None,
mask=codes.str.match(r"^[a-zA-Z0-9.]+$"),
code_type=self.name,
)
)
......@@ -246,7 +248,7 @@ class Read3(Proto):
f"QA In Database",
codes=codes,
codes_file=codes_file,
mask=None,
mask=self.in_database(codes, self.db, self.name),
code_type=self.name,
)
)
......@@ -385,6 +387,13 @@ class Snomed(Proto):
)
),
),
(
    "Is Integer",
    # regex=False so "." is a literal dot; as a regex it matches any
    # character, which made this check fail for every non-empty code.
    lambda codes: ~codes.str.contains(".", regex=False),
    # Keep only the text before the first "." so a float-formatted code
    # becomes its integer part, still as a string.
    lambda codes, codes_file: codes.str.split(".")
    .str[0]
    .astype(str),  # Convert from float to integer and back to string
),
(
"Numeric",
lambda codes: codes.str.match(r"[0-9]+$"),
......@@ -400,11 +409,6 @@ class Snomed(Proto):
)
),
),
# (
# "Is Integer",
# lambda codes : codes.dtype == int,
# lambda codes : codes.astype(int) #Convert to integer
# ),
(
"In Database",
lambda codes: self.in_database(codes, self.db, self.name),
......
......@@ -666,6 +666,7 @@ def translate_codes(
target_code_type: str,
concept_name: str,
not_translate: bool,
do_reverse_translate: bool,
) -> pd.DataFrame:
"""Translates each source code type the source coding list into a target type and returns all conversions as a concept set"""
......@@ -695,23 +696,17 @@ def translate_codes(
filename = f"{source_code_type}_to_{target_code_type}.parquet"
map_path = trud.PROCESSED_PATH / filename
filename_reversed = f"{target_code_type}_to_{source_code_type}.parquet"
map_path_reversed = trud.PROCESSED_PATH / filename_reversed
# do the mapping if it exists
if map_path.exists():
# get mapping
df_map = pd.read_parquet(map_path)
# do mapping
translated_df = pd.merge(
source_df[source_code_type], df_map, how="left"
codes = _translate_codes(map_path, source_df, source_code_type, codes)
# otherwise do reverse mapping if enabled and it exists
elif do_reverse_translate and map_path_reversed.exists():
codes = _translate_codes(
map_path_reversed, source_df, source_code_type, codes, reverse=True
)
# normalise the output
translated_df.columns = pd.Index(["SOURCE_CONCEPT", "CONCEPT"])
translated_df["SOURCE_CONCEPT_TYPE"] = source_code_type
# add to list of codes
codes = pd.concat([codes, translated_df])
else:
_logger.warning(
f"No mapping from {source_code_type} to {target_code_type}, file {str(map_path.resolve())} does not exist"
......@@ -728,6 +723,30 @@ def translate_codes(
return codes
def _translate_codes(
map_path, source_df, source_code_type, codes, reverse=False
) -> pd.DataFrame:
# get mapping
df_map = pd.read_parquet(map_path)
# do mapping
if not (reverse):
translated_df = pd.merge(source_df[source_code_type], df_map, how="left")
else:
translated_df = pd.merge(
source_df[source_code_type], df_map, how="left"
) # output codes from target as reversed
# normalise the output
translated_df.columns = pd.Index(["SOURCE_CONCEPT", "CONCEPT"])
translated_df["SOURCE_CONCEPT_TYPE"] = source_code_type
# add to list of codes
codes = pd.concat([codes, translated_df])
return codes
def _write_code_errors(code_errors: list, code_errors_path: Path):
err_df = pd.DataFrame(
[
......@@ -786,7 +805,13 @@ def write_vocab_version(phen_path: Path):
)
def map(phen_dir: str, target_code_type: str, not_translate: bool, no_metadata: bool):
def map(
phen_dir: str,
target_code_type: str,
not_translate: bool,
no_metadata: bool,
do_reverse_translate: bool,
):
_logger.info(f"Processing phenotype: {phen_dir}")
# Validate configuration
......@@ -811,11 +836,23 @@ def map(phen_dir: str, target_code_type: str, not_translate: bool, no_metadata:
if target_code_type is not None:
_map_target_code_type(
phen_path, phenotype, target_code_type, not_translate, no_metadata
phen_path,
phenotype,
target_code_type,
not_translate,
no_metadata,
do_reverse_translate,
)
else:
for t in phenotype["map"]:
_map_target_code_type(phen_path, phenotype, t, not_translate, no_metadata)
_map_target_code_type(
phen_path,
phenotype,
t,
not_translate,
no_metadata,
do_reverse_translate,
)
_logger.info(f"Phenotype processed successfully")
......@@ -826,6 +863,7 @@ def _map_target_code_type(
target_code_type: str,
not_translate: bool,
no_metadata: bool,
do_reverse_translate: bool,
):
_logger.debug(f"Target coding format: {target_code_type}")
concepts_path = phen_path / CONCEPTS_DIR
......@@ -894,6 +932,7 @@ def _map_target_code_type(
target_code_type=target_code_type,
concept_name=concept_set_name,
not_translate=not_translate,
do_reverse_translate=do_reverse_translate,
)
trans_out = add_metadata(
codes=trans_out,
......@@ -908,6 +947,7 @@ def _map_target_code_type(
target_code_type=target_code_type,
concept_name=concept_set_name,
not_translate=not_translate,
do_reverse_translate=do_reverse_translate,
)
trans_out = add_metadata(
codes=trans_out,
......
Source diff could not be displayed: it is too large. Options to address this: view the blob.
......@@ -760,7 +760,7 @@
<section id="SUPPORTED_CODE_TYPES">
<div class="attr variable">
<span class="name">SUPPORTED_CODE_TYPES</span> =
<span class="default_value">{&#39;read3&#39;, &#39;read2&#39;, &#39;snomed&#39;, &#39;opcs4&#39;, &#39;icd10&#39;, &#39;atc&#39;}</span>
<span class="default_value">{&#39;opcs4&#39;, &#39;atc&#39;, &#39;snomed&#39;, &#39;icd10&#39;, &#39;read2&#39;, &#39;read3&#39;}</span>
</div>
......
This diff is collapsed.
Source diff could not be displayed: it is too large. Options to address this: view the blob.
......@@ -112,7 +112,8 @@ The `phen` command is used phenotype-related operations.
- `-t`, `--target-coding`: (Optional) Specify the target coding (e.g., `read2`, `read3`, `icd10`, `snomed`, `opcs4`).
- `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen).
- `--not-translate`: (Optional) Prevent any phenotype translation using NHS TRUD vocabularies. Therefore only concepts already in the target coding will be mapped.
- `--no-metadata`: (Optional) Prevent copying of metadata columns to output.
- `--no-metadata`: (Optional) Prevent copying of metadata columns to output.
- `--do-reverse-translate`: (Optional) Enable reversing one directional mappings. **WARNING** goes against NHS TRUD guidelines.
- **Publish Phenotype Configuration**
......