
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: meldb/concepts-processing
Commits on Source (7)
@@ -58,7 +58,13 @@ def _phen_validate(args: argparse.Namespace):
 def _phen_map(args: argparse.Namespace):
     """Handle the `phen map` command."""
-    phen.map(args.phen_dir, args.target_coding, args.not_translate, args.no_metadata)
+    phen.map(
+        args.phen_dir,
+        args.target_coding,
+        args.not_translate,
+        args.no_metadata,
+        args.do_reverse_translate,
+    )
 
 
 def _phen_export(args: argparse.Namespace):
@@ -235,6 +241,12 @@ def main():
         default=False,
         help="(Optional) Prevent copying of metadata columns to output.",
     )
+    phen_map_parser.add_argument(
+        "--do-reverse-translate",
+        action="store_true",
+        default=False,
+        help="(Optional) Enable reversing one directional mappings. WARNING goes against NHS TRUD guidelines.",
+    )
     phen_map_parser.set_defaults(func=_phen_map)
 
     # phen export
......
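For reference, a minimal standalone argparse sketch (not the project's actual parser setup) of how the new flag behaves: as a store_true option with default=False, reverse translation stays off unless `--do-reverse-translate` is passed explicitly. The parser name below is a hypothetical stand-in for phen_map_parser.

    import argparse

    # Hypothetical stand-in parser; only the new flag is wired up here.
    parser = argparse.ArgumentParser(prog="phen-map-sketch")
    parser.add_argument("--do-reverse-translate", action="store_true", default=False)

    print(parser.parse_args([]).do_reverse_translate)                          # False
    print(parser.parse_args(["--do-reverse-translate"]).do_reverse_translate)  # True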
@@ -99,6 +99,8 @@ class Proto:
                     _logger.debug(f"Check: Fixed")
                 except InvalidCodesException as ex:
                     errors.append(ex.error)
+                    codes = codes[cond(codes)]  # remove codes that cannot be fixed
+                    _logger.debug(f"Check: Invalid Codes Removed, no fix available")
             else:
                 _logger.debug(f"Check: passed")
@@ -231,7 +233,7 @@ class Read3(Proto):
                     f"QA Alphanumeric Dot",
                     codes=codes,
                     codes_file=codes_file,
-                    mask=None,
+                    mask=codes.str.match(r"^[a-zA-Z0-9.]+$"),
                     code_type=self.name,
                 )
             )
@@ -246,7 +248,7 @@ class Read3(Proto):
                     f"QA In Database",
                     codes=codes,
                     codes_file=codes_file,
-                    mask=None,
+                    mask=self.in_database(codes, self.db, self.name),
                     code_type=self.name,
                 )
             )
@@ -385,6 +387,13 @@ class Snomed(Proto):
                     )
                 ),
             ),
+            (
+                "Is Integer",
+                lambda codes: ~codes.str.contains("."),
+                lambda codes, codes_file: codes.str.split(".")
+                .str[0]
+                .astype(str),  # Convert from float to integer and back to string
+            ),
             (
                 "Numeric",
                 lambda codes: codes.str.match(r"[0-9]+$"),
@@ -400,11 +409,6 @@ class Snomed(Proto):
                     )
                 ),
             ),
-            # (
-            #     "Is Integer",
-            #     lambda codes : codes.dtype == int,
-            #     lambda codes : codes.astype(int) #Convert to integer
-            # ),
             (
                 "In Database",
                 lambda codes: self.in_database(codes, self.db, self.name),
......
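The new "Is Integer" entry replaces the commented-out version and targets SNOMED codes that were read in as floats. A sketch of the fix in isolation, with made-up code values (pandas only; the codes_file argument is omitted because the lambda does not use it):

    import pandas as pd

    codes = pd.Series(["431314004.0", "73211009"])   # e.g. values parsed from a spreadsheet as floats
    fixed = codes.str.split(".").str[0].astype(str)  # strip the ".0" fractional part
    print(fixed.tolist())                            # ['431314004', '73211009']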
@@ -666,6 +666,7 @@ def translate_codes(
     target_code_type: str,
     concept_name: str,
     not_translate: bool,
+    do_reverse_translate: bool,
 ) -> pd.DataFrame:
     """Translates each source code type the source coding list into a target type and returns all conversions as a concept set"""
@@ -695,23 +696,17 @@
         filename = f"{source_code_type}_to_{target_code_type}.parquet"
         map_path = trud.PROCESSED_PATH / filename
+        filename_reversed = f"{target_code_type}_to_{source_code_type}.parquet"
+        map_path_reversed = trud.PROCESSED_PATH / filename_reversed
 
         # do the mapping if it exists
         if map_path.exists():
-            # get mapping
-            df_map = pd.read_parquet(map_path)
-
-            # do mapping
-            translated_df = pd.merge(
-                source_df[source_code_type], df_map, how="left"
-            )
-
-            # normalise the output
-            translated_df.columns = pd.Index(["SOURCE_CONCEPT", "CONCEPT"])
-            translated_df["SOURCE_CONCEPT_TYPE"] = source_code_type
-
-            # add to list of codes
-            codes = pd.concat([codes, translated_df])
+            codes = _translate_codes(map_path, source_df, source_code_type, codes)
+        # otherwise do reverse mapping if enabled and it exists
+        elif do_reverse_translate and map_path_reversed.exists():
+            codes = _translate_codes(
+                map_path_reversed, source_df, source_code_type, codes, reverse=True
+            )
         else:
             _logger.warning(
                 f"No mapping from {source_code_type} to {target_code_type}, file {str(map_path.resolve())} does not exist"
@@ -728,6 +723,30 @@
     return codes
 
 
+def _translate_codes(
+    map_path, source_df, source_code_type, codes, reverse=False
+) -> pd.DataFrame:
+    # get mapping
+    df_map = pd.read_parquet(map_path)
+
+    # do mapping
+    if not (reverse):
+        translated_df = pd.merge(source_df[source_code_type], df_map, how="left")
+    else:
+        translated_df = pd.merge(
+            source_df[source_code_type], df_map, how="left"
+        )  # output codes from target as reversed
+
+    # normalise the output
+    translated_df.columns = pd.Index(["SOURCE_CONCEPT", "CONCEPT"])
+    translated_df["SOURCE_CONCEPT_TYPE"] = source_code_type
+
+    # add to list of codes
+    codes = pd.concat([codes, translated_df])
+
+    return codes
+
+
 def _write_code_errors(code_errors: list, code_errors_path: Path):
     err_df = pd.DataFrame(
         [
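To illustrate what _translate_codes does with a mapping file, here is a sketch of the same merge and column normalisation using small in-memory frames in place of a TRUD parquet file; the read2/snomed column names and code values are assumptions for illustration only.

    import pandas as pd

    source_df = pd.DataFrame({"read2": ["C10..", "G30.."]})
    df_map = pd.DataFrame(
        {"read2": ["C10..", "G30.."], "snomed": ["73211009", "38341003"]}
    )  # stand-in for pd.read_parquet(map_path)

    translated_df = pd.merge(source_df["read2"], df_map, how="left")  # join on the source code column
    translated_df.columns = pd.Index(["SOURCE_CONCEPT", "CONCEPT"])   # normalise the output
    translated_df["SOURCE_CONCEPT_TYPE"] = "read2"
    print(translated_df)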
@@ -786,7 +805,13 @@ def write_vocab_version(phen_path: Path):
     )
 
 
-def map(phen_dir: str, target_code_type: str, not_translate: bool, no_metadata: bool):
+def map(
+    phen_dir: str,
+    target_code_type: str,
+    not_translate: bool,
+    no_metadata: bool,
+    do_reverse_translate: bool,
+):
     _logger.info(f"Processing phenotype: {phen_dir}")
 
     # Validate configuration
@@ -811,11 +836,23 @@ def map(phen_dir: str, target_code_type: str, not_translate: bool, no_metadata:
     if target_code_type is not None:
         _map_target_code_type(
-            phen_path, phenotype, target_code_type, not_translate, no_metadata
+            phen_path,
+            phenotype,
+            target_code_type,
+            not_translate,
+            no_metadata,
+            do_reverse_translate,
         )
     else:
         for t in phenotype["map"]:
-            _map_target_code_type(phen_path, phenotype, t, not_translate, no_metadata)
+            _map_target_code_type(
+                phen_path,
+                phenotype,
+                t,
+                not_translate,
+                no_metadata,
+                do_reverse_translate,
+            )
 
     _logger.info(f"Phenotype processed successfully")
@@ -826,6 +863,7 @@ def _map_target_code_type(
     target_code_type: str,
     not_translate: bool,
     no_metadata: bool,
+    do_reverse_translate: bool,
 ):
     _logger.debug(f"Target coding format: {target_code_type}")
     concepts_path = phen_path / CONCEPTS_DIR
@@ -894,6 +932,7 @@ def _map_target_code_type(
                 target_code_type=target_code_type,
                 concept_name=concept_set_name,
                 not_translate=not_translate,
+                do_reverse_translate=do_reverse_translate,
             )
             trans_out = add_metadata(
                 codes=trans_out,
@@ -908,6 +947,7 @@ def _map_target_code_type(
                 target_code_type=target_code_type,
                 concept_name=concept_set_name,
                 not_translate=not_translate,
+                do_reverse_translate=do_reverse_translate,
             )
             trans_out = add_metadata(
                 codes=trans_out,
......
Source diff could not be displayed: it is too large.
@@ -760,7 +760,7 @@
 <section id="SUPPORTED_CODE_TYPES">
     <div class="attr variable">
         <span class="name">SUPPORTED_CODE_TYPES</span> =
-        <span class="default_value">{&#39;read3&#39;, &#39;read2&#39;, &#39;snomed&#39;, &#39;opcs4&#39;, &#39;icd10&#39;, &#39;atc&#39;}</span>
+        <span class="default_value">{&#39;opcs4&#39;, &#39;atc&#39;, &#39;snomed&#39;, &#39;icd10&#39;, &#39;read2&#39;, &#39;read3&#39;}</span>
     </div>
......
This diff is collapsed.
Source diff could not be displayed: it is too large.
@@ -112,7 +112,8 @@ The `phen` command is used phenotype-related operations.
 - `-t`, `--target-coding`: (Optional) Specify the target coding (e.g., `read2`, `read3`, `icd10`, `snomed`, `opcs4`).
 - `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen).
 - `--not-translate`: (Optional) Prevent any phenotype translation using NHS TRUD vocabularies. Therefore only concepts in already in the traget coding will be mapped.
 - `--no-metadata`: (Optional) Prevent copying of metadata columns to output.
+- `--do-reverse-translate`: (Optional) Enable reversing one directional mappings. **WARNING** goes against NHS TRUD guidelines.
 
 - **Publish Phenotype Configuration**
......