Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: meldb/concepts-processing
Commits on Source (18)
@@ -58,7 +58,7 @@ def _phen_validate(args: argparse.Namespace):

 def _phen_map(args: argparse.Namespace):
     """Handle the `phen map` command."""
-    phen.map(args.phen_dir, args.target_coding)
+    phen.map(args.phen_dir, args.target_coding, args.not_translate, args.no_metadata)

 def _phen_export(args: argparse.Namespace):
@@ -78,7 +78,7 @@ def _phen_copy(args: argparse.Namespace):

 def _phen_diff(args: argparse.Namespace):
     """Handle the `phen diff` command."""
-    phen.diff(args.phen_dir, args.version, args.old_phen_dir, args.old_version)
+    phen.diff(args.phen_dir, args.version, args.old_phen_dir, args.old_version, args.not_check_config)

 def main():
@@ -217,6 +217,18 @@ def main():
         choices=parse.SUPPORTED_CODE_TYPES,
         help=f"Specify the target coding {parse.SUPPORTED_CODE_TYPES}",
     )
+    phen_map_parser.add_argument(
+        "--not-translate",
+        action='store_true',
+        default=False,
+        help="(Optional) Prevent any phenotype translation using NHS TRUD vocabularies.",
+    )
+    phen_map_parser.add_argument(
+        "--no-metadata",
+        action='store_true',
+        default=False,
+        help="(Optional) Prevent copying of metadata columns to output.",
+    )
     phen_map_parser.set_defaults(func=_phen_map)

     # phen export
@@ -323,6 +335,12 @@ def main():
         required=True,
         help="Old phenotype version to compare with the changed version",
     )
+    phen_diff_parser.add_argument(
+        "--not-check-config",
+        action='store_true',
+        default=False,
+        help="(Optional) Prevent loading and comparing config file, in the case where one does not exist",
+    )
     phen_diff_parser.set_defaults(func=_phen_diff)

     # Parse arguments
...
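The two new `phen map` flags use argparse's standard `store_true` pattern, so `--not-translate` surfaces as `args.not_translate` (argparse converts hyphens to underscores), which is exactly what `_phen_map` forwards to `phen.map`. A minimal standalone sketch of that behaviour:

import argparse

p = argparse.ArgumentParser()
p.add_argument("--not-translate", action="store_true", default=False)
p.add_argument("--no-metadata", action="store_true", default=False)

args = p.parse_args(["--not-translate"])
print(args.not_translate, args.no_metadata)  # True False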
@@ -192,7 +192,7 @@ class Read2(Proto):

 class Read3(Proto):
     def __init__(self):
-        super().__init__("Read3", trud.PROCESSED_PATH / "read3.parquet")
+        super().__init__("read3", trud.PROCESSED_PATH / "read3.parquet")

         self.checks = [
             (
@@ -231,7 +231,7 @@ class Read3(Proto):
                     f"QA Alphanumeric Dot",
                     codes=codes,
                     codes_file=codes_file,
-                    check_regex=codes.str.match(r"^[a-zA-Z0-9.]+$"),
+                    mask=None,
                     code_type=self.name,
                 )
             )
@@ -246,7 +246,7 @@ class Read3(Proto):
                     f"QA In Database",
                     codes=codes,
                     codes_file=codes_file,
-                    check_regex=self.in_database(codes, self.db, self.name),
+                    mask=None,
                     code_type=self.name,
                 )
             )
...
@@ -127,9 +127,11 @@ CONFIG_SCHEMA = {
                 "type": "dict",
                 "schema": {
                     "name": {"type": "string", "required": True},
-                    "file": {
-                        "type": "dict",
-                        "required": False,
-                        "schema": {
-                            "path": {"type": "string", "required": True},
-                            "columns": {"type": "dict", "required": True},
+                    "files": {
+                        "type": "list",
+                        "required": True,
+                        "schema": {
+                            "type": "dict",
+                            "schema": {
+                                "path": {"type": "string", "required": True},
+                                "columns": {"type": "dict", "required": True},
@@ -138,12 +140,19 @@ CONFIG_SCHEMA = {
                             }, # Optional but must be string if present
                             "actions": {
                                 "type": "dict",
-                                "schema": {"divide_col": {"type": "string"}},
+                                "schema": {
+                                    "divide_col": {"type": "string"},
+                                    "split_col": {"type": "string"},
+                                    "codes_col": {"type": "string"}
+                                },
                             },
                         },
                     },
                 },
+                "metadata": {"type": "dict", "required": False},
+            },
+        },
     },
 },
}
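The schema above uses nested `type`/`schema`/`required` keys in the style of the Cerberus validation library; assuming Cerberus is the validator in use, a minimal sketch of a concept set entry validating against the new `files` list layout:

from cerberus import Validator

concept_set_schema = {
    "name": {"type": "string", "required": True},
    "files": {
        "type": "list",
        "required": True,
        "schema": {  # rule set applied to each list element
            "type": "dict",
            "schema": {
                "path": {"type": "string", "required": True},
                "columns": {"type": "dict", "required": True},
            },
        },
    },
    "metadata": {"type": "dict", "required": False},
}

entry = {
    "name": "CVD_EVENTS",
    "files": [{"path": "res52.csv", "columns": {"icd10": "code"}}],
    "metadata": {"source": "clinical-codes-org"},
}

v = Validator(concept_set_schema)
print(v.validate(entry))  # True
print(v.errors)           # {}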
@@ -518,9 +527,10 @@ def validate(phen_dir: str):
         concept_set_names.append(item["name"])

     # check codes definition
-    for item in phenotype["concept_sets"]:
-        # check concept code file exists
-        concept_code_file_path = concepts_path / item["file"]["path"]
+    for files in phenotype["concept_sets"]:
+        for item in files["files"]:
+            # check concept code file exists
+            concept_code_file_path = concepts_path / item["path"]
             if not concept_code_file_path.exists():
                 validation_errors.append(
                     f"Coding file {str(concept_code_file_path.resolve())} does not exist"
@@ -539,15 +549,15 @@ def validate(phen_dir: str):
             )

             # check columns specified are a supported medical coding type
-            for column in item["file"]["columns"]:
+            for column in item["columns"]:
                 if column not in code_types:
                     validation_errors.append(
                         f"Column type {column} for file {concept_code_file_path} is not supported"
                     )

             # check the actions are supported
-            if "actions" in item["file"]:
-                for action in item["file"]["actions"]:
+            if "actions" in item:
+                for action in item["actions"]:
                     if action not in COL_ACTIONS:
                         validation_errors.append(f"Action {action} is not supported")
@@ -588,12 +598,12 @@ def _process_actions(df: pd.DataFrame, concept_set: dict) -> pd.DataFrame:
     # Perform Structural Changes to file before preprocessing
     _logger.debug("Processing file structural actions")
     if (
-        "actions" in concept_set["file"]
-        and "split_col" in concept_set["file"]["actions"]
-        and "codes_col" in concept_set["file"]["actions"]
+        "actions" in concept_set
+        and "split_col" in concept_set["actions"]
+        and "codes_col" in concept_set["actions"]
     ):
-        split_col = concept_set["file"]["actions"]["split_col"]
-        codes_col = concept_set["file"]["actions"]["codes_col"]
+        split_col = concept_set["actions"]["split_col"]
+        codes_col = concept_set["actions"]["codes_col"]
         _logger.debug(
             "Action: Splitting",
             split_col,
@@ -621,12 +631,12 @@ def _preprocess_source_concepts(
     # Preprocess codes
     code_types = parse.CodeTypeParser().code_types
-    for code_type in concept_set["file"]["columns"]:
+    for code_type in concept_set["columns"]:
         parser = code_types[code_type]
         _logger.info(f"Processing {code_type} codes for {code_file_path}")

         # get codes by column name
-        source_col_name = concept_set["file"]["columns"][code_type]
+        source_col_name = concept_set["columns"][code_type]
         codes = df[source_col_name].dropna()
         codes = codes.astype(str)  # convert to string
         codes = codes.str.strip()  # remove excess spaces
@@ -653,7 +663,7 @@ def _preprocess_source_concepts(

 # Translate Df with multiple codes into single code type Series
 def translate_codes(
-    source_df: pd.DataFrame, target_code_type: str, concept_name: str
+    source_df: pd.DataFrame, target_code_type: str, concept_name: str, not_translate: bool
 ) -> pd.DataFrame:
     """Translates each source code type in the source coding list into a target type and returns all conversions as a concept set"""
@@ -678,7 +688,7 @@ def translate_codes(
             _logger.debug(
                 f"Target code type {target_code_type} is the same as source code type {len(source_df)}, copying codes rather than translating"
             )
-        else:
+        elif not not_translate:
             # get the translation filename using source to target code types
             filename = f"{source_code_type}_to_{target_code_type}.parquet"
             map_path = trud.PROCESSED_PATH / filename
@@ -725,7 +735,7 @@ def _write_code_errors(code_errors: list, code_errors_path: Path):
                 "SOURCE": err.codes_file,
                 "CAUSE": err.message,
             }
-            for err in code_errors
+            for err in code_errors if err.mask is not None
         ]
     )
@@ -773,7 +783,7 @@ def write_vocab_version(phen_path: Path):
     )

-def map(phen_dir: str, target_code_type: str):
+def map(phen_dir: str, target_code_type: str, not_translate: bool, no_metadata: bool):
     _logger.info(f"Processing phenotype: {phen_dir}")

     # Validate configuration
@@ -797,15 +807,15 @@ def map(phen_dir: str, target_code_type: str):
     )

     if target_code_type is not None:
-        _map_target_code_type(phen_path, phenotype, target_code_type)
+        _map_target_code_type(phen_path, phenotype, target_code_type, not_translate, no_metadata)
     else:
         for t in phenotype["map"]:
-            _map_target_code_type(phen_path, phenotype, t)
+            _map_target_code_type(phen_path, phenotype, t, not_translate, no_metadata)

     _logger.info(f"Phenotype processed successfully")

-def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: str):
+def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: str, not_translate: bool, no_metadata: bool):
     _logger.debug(f"Target coding format: {target_code_type}")
     concepts_path = phen_path / CONCEPTS_DIR
     # Create output dataframe
@@ -813,11 +823,17 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
     code_errors = []

     # Process each folder in codes section
-    for concept_set in phenotype["concept_sets"]:
-        _logger.debug(f"--- {concept_set['file']} ---")
+    for files in phenotype["concept_sets"]:
+        concept_set_name = files["name"]
+        if "metadata" in files:
+            concept_set_metadata = files["metadata"]
+        else:
+            concept_set_metadata = {}
+        for concept_set in files["files"]:
+            _logger.debug(f"--- {concept_set} ---")

             # Load code file
-        codes_file_path = Path(concepts_path / concept_set["file"]["path"])
+            codes_file_path = Path(concepts_path / concept_set["path"])
             df = _read_table_file(codes_file_path)

             # process structural actions
@@ -832,7 +848,7 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
             )

             # create df with just the source code columns
-            source_column_names = list(concept_set["file"]["columns"].keys())
+            source_column_names = list(concept_set["columns"].keys())
             source_df = df[source_column_names]
             _logger.debug(source_df.columns)
@@ -848,22 +864,30 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
             # Map source concepts codes to target codes
             # if processing a source coding list with categorical data
             if (
-                "actions" in concept_set["file"]
-                and "divide_col" in concept_set["file"]["actions"]
+                "actions" in concept_set
+                and "divide_col" in concept_set["actions"]
                 and len(df) > 0
             ):
-                divide_col = concept_set["file"]["actions"]["divide_col"]
+                divide_col = concept_set["actions"]["divide_col"]
                 _logger.debug(f"Action: Dividing Table by {divide_col}")
                 _logger.debug(f"column into: {df[divide_col].unique()}")
                 df_grp = df.groupby(divide_col)
                 for cat, grp in df_grp:
-                    if cat == concept_set["file"]["category"]:
-                        grp = grp.drop(columns=[divide_col])  # delete categorical column
+                    if cat == concept_set["category"]:
+                        grp = grp.drop(
+                            columns=[divide_col]
+                        )  # delete categorical column
                         source_df = grp[source_column_names]
                         trans_out = translate_codes(
                             source_df,
                             target_code_type=target_code_type,
-                            concept_name=concept_set["name"],
+                            concept_name=concept_set_name,
+                            not_translate=not_translate,
+                        )
+                        trans_out = add_metadata(
+                            codes=trans_out,
+                            metadata=concept_set_metadata,
+                            no_metadata=no_metadata,
                         )
                         out = pd.concat([out, trans_out])
             else:
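For readers unfamiliar with the `divide_col` action, here is a minimal pandas sketch of the groupby-and-filter above, using illustrative values taken from the example configs (`divide_col: "MMCode"`, `category: "2"`):

import pandas as pd

df = pd.DataFrame({
    "Read Code": ["G30..", "H33..", "E112."],
    "MMCode": ["2", "2", "3"],  # categorical column used to divide the table
})

for cat, grp in df.groupby("MMCode"):
    if cat == "2":  # keep only the category configured for this concept set
        grp = grp.drop(columns=["MMCode"])
        print(grp)  # two rows: G30.. and H33..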
@@ -871,7 +895,13 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
                 trans_out = translate_codes(
                     source_df,
                     target_code_type=target_code_type,
-                    concept_name=concept_set["name"],
+                    concept_name=concept_set_name,
+                    not_translate=not_translate,
+                )
+                trans_out = add_metadata(
+                    codes=trans_out,
+                    metadata=concept_set_metadata,
+                    no_metadata=no_metadata,
                 )
                 out = pd.concat([out, trans_out])
@@ -894,48 +924,51 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
     out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
     out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])

-    out_count = len(out.index)
+    # out_count = len(out.index)
     # added metadata
     # Loop over each source_concept_type and perform the left join on all columns apart from source code columns
-    result_list = []
-    source_column_names = list(concept_set["file"]["columns"].keys())
-    for source_concept_type in source_column_names:
-        # Filter output based on the current source_concept_type
-        out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
-        filtered_count = len(out_filtered_df.index)
-
-        # Remove the source type columns except the current type will leave the metadata and the join
-        remove_types = [
-            type for type in source_column_names if type != source_concept_type
-        ]
-        metadata_df = df.drop(columns=remove_types)
-        metadata_df = metadata_df.rename(
-            columns={source_concept_type: "SOURCE_CONCEPT"}
-        )
-        metadata_df_count = len(metadata_df.index)
-
-        # Perform the left join with df2 on SOURCE_CONCEPT to add the metadata
-        result = pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
-        result_count = len(result.index)
-
-        _logger.debug(
-            f"Adding metadata for {source_concept_type}: out_count {out_count}, filtered_count {filtered_count}, metadata_df_count {metadata_df_count}, result_count {result_count}"
-        )
-        # Append the result to the result_list
-        result_list.append(result)
-
-    # Concatenate all the results into a single DataFrame
-    final_out = pd.concat(result_list, ignore_index=True)
-    final_out = final_out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
-    _logger.debug(
-        f"Check metadata processing counts: before {len(out.index)} : after {len(final_out.index)}"
-    )
+    # result_list = []
+    # for files in phenotype["concept_sets"]:
+    #     concept_set_name = files["name"]
+    #     for concept_set in files["files"]:
+    #         source_column_names = list(concept_set["columns"].keys())
+    #         for source_concept_type in source_column_names:
+    #             # Filter output based on the current source_concept_type
+    #             out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
+    #             filtered_count = len(out_filtered_df.index)
+
+    #             # Remove the source type columns except the current type will leave the metadata and the join
+    #             remove_types = [
+    #                 type for type in source_column_names if type != source_concept_type
+    #             ]
+    #             metadata_df = df.drop(columns=remove_types)
+    #             metadata_df = metadata_df.rename(
+    #                 columns={source_concept_type: "SOURCE_CONCEPT"}
+    #             )
+    #             metadata_df_count = len(metadata_df.index)
+
+    #             # Perform the left join with df2 on SOURCE_CONCEPT to add the metadata
+    #             result = pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
+    #             result_count = len(result.index)
+
+    #             _logger.debug(
+    #                 f"Adding metadata for {source_concept_type}: out_count {out_count}, filtered_count {filtered_count}, metadata_df_count {metadata_df_count}, result_count {result_count}"
+    #             )
+    #             # Append the result to the result_list
+    #             result_list.append(result)
+
+    # Concatenate all the results into a single DataFrame
+    # final_out = pd.concat(result_list, ignore_index=True)
+    # final_out = final_out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
+    # _logger.debug(
+    #     f"Check metadata processing counts: before {len(out.index)} : after {len(final_out.index)}"
+    # )

     # Save output to map directory
     output_filename = target_code_type + ".csv"
     map_path = phen_path / MAP_DIR / output_filename
-    final_out.to_csv(map_path, index=False)
+    out.to_csv(map_path, index=False)
     _logger.info(f"Saved mapped concepts to {str(map_path.resolve())}")

     # save concept sets as separate files
@@ -950,7 +983,7 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
     concept_set_path.mkdir(parents=True, exist_ok=True)

     # write each concept as a separate file
-    for name, concept in final_out.groupby("CONCEPT_SET"):
+    for name, concept in out.groupby("CONCEPT_SET"):
         concept = concept.sort_values(by="CONCEPT")  # sort rows
         concept = concept.dropna(how="all", axis=1)  # remove empty cols
         concept = concept.reindex(
@@ -965,6 +998,22 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
     _logger.info(f"Phenotype processed target code type {target_code_type}")

+# Add metadata dict to each row of Df codes
+def add_metadata(
+    codes: pd.DataFrame, metadata: dict, no_metadata: bool,
+) -> pd.DataFrame:
+    """Add concept set metadata, stored as a dictionary, to each concept row"""
+    if not no_metadata:
+        for meta_name, meta_value in metadata.items():
+            codes[meta_name] = meta_value
+            _logger.debug(
+                f"Adding metadata for concept set: metadata name {meta_name}, metadata value {meta_value}"
+            )
+    return codes
+
 def _generate_version_tag(
     repo: git.Repo, increment: str = DEFAULT_VERSION_INC, use_v_prefix: bool = False
 ) -> str:
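A short usage sketch of the new `add_metadata` helper with illustrative values (only pandas is assumed): each metadata key becomes a constant column on every row of the concept set unless `no_metadata` is set.

import pandas as pd

codes = pd.DataFrame({"CONCEPT": ["I21", "I22"], "CONCEPT_SET": ["CVD_EVENTS", "CVD_EVENTS"]})
meta = {"source": "clinical-codes-org", "version": "res52"}

# no_metadata=True leaves the frame untouched
print(list(add_metadata(codes=codes.copy(), metadata=meta, no_metadata=True).columns))
# ['CONCEPT', 'CONCEPT_SET']

# no_metadata=False stamps each metadata key onto every row
print(list(add_metadata(codes=codes.copy(), metadata=meta, no_metadata=False).columns))
# ['CONCEPT', 'CONCEPT_SET', 'source', 'version']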
@@ -1165,7 +1214,7 @@ def copy(phen_dir: str, target_dir: str, version: str):
 def extract_concepts(config_data: dict) -> Tuple[dict, Set[str]]:
     """Extracts concepts as {name: file_path} dictionary and a name set."""
     concepts_dict = {
-        item["name"]: item["file"]["path"]
+        item["name"]: [file["path"] for file in item["files"]]
     for item in config_data["phenotype"]["concept_sets"]
     }
     name_set = set(concepts_dict.keys())
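A worked example of the updated `extract_concepts` against the new config shape (hypothetical, shortened paths), showing that each name now maps to a list of paths; the returned tuple follows the `Tuple[dict, Set[str]]` signature above:

config_data = {
    "phenotype": {
        "concept_sets": [
            {"name": "ABDO_PAIN", "files": [{"path": "res176.csv"}]},
            {"name": "CVD_EVENTS", "files": [{"path": "res52.csv"}, {"path": "extra.csv"}]},
        ]
    }
}

concepts_dict, name_set = extract_concepts(config_data)
# concepts_dict == {"ABDO_PAIN": ["res176.csv"],
#                   "CVD_EVENTS": ["res52.csv", "extra.csv"]}
# name_set == {"ABDO_PAIN", "CVD_EVENTS"}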
@@ -1190,7 +1239,7 @@ def diff_config(old_config: dict, new_config: dict) -> str:
     old_concepts, old_names = extract_concepts(old_config)
     new_concepts, new_names = extract_concepts(new_config)

-    # Check added and removed names
+    # Check added and removed concept set names
     added_names = new_names - old_names  # Names that appear in new but not in old
     removed_names = old_names - new_names  # Names that were in old but not in new
@@ -1331,9 +1380,15 @@ def diff_phen(
     old_phen_path: Path,
     old_version: str,
     report_path: Path,
+    not_check_config: bool,
 ):
     """Compare the differences between two versions of a phenotype"""

+    # write report heading
+    report = f"# Phenotype Comparison Report\n"
+
+    # Step 1: check differences in configuration files
+    if not not_check_config:
         # validate phenotypes
         _logger.debug(f"Validating for diff old path: {str(old_phen_path.resolve())}")
         validate(str(old_phen_path.resolve()))
@@ -1348,8 +1403,7 @@ def diff_phen(
         with new_config_path.open("r") as file:
             new_config = yaml.safe_load(file)

-        # write report heading
-        report = f"# Phenotype Comparison Report\n"
+        # write report
         report += f"## Original phenotype\n"
         report += f" - {old_config['phenotype']['omop']['vocabulary_id']}\n"
         report += f" - {old_version}\n"
@@ -1359,7 +1413,7 @@ def diff_phen(
         report += f" - {new_version}\n"
         report += f" - {str(new_phen_path.resolve())}\n"

-    # Step 1: check differences configuration files
         # Convert list of dicts into a dict: {name: file}
         report += diff_config(old_config, new_config)
@@ -1378,7 +1432,7 @@ def diff_phen(
     _logger.info(f"Phenotypes diff'd successfully")

-def diff(phen_dir: str, version: str, old_phen_dir: str, old_version: str):
+def diff(phen_dir: str, version: str, old_phen_dir: str, old_version: str, not_check_config: bool):
     # make tmp directory .acmc
     timestamp = time.strftime("%Y%m%d_%H%M%S")
     temp_dir = Path(f".acmc/diff_{timestamp}")
@@ -1439,7 +1493,7 @@ def diff(phen_dir: str, version: str, old_phen_dir: str, old_version: str):
         report_filename = f"{version}_{old_version}_diff.md"
         report_path = changed_phen_path / report_filename

         # diff old with new
-        diff_phen(changed_path, version, old_path, old_version, report_path)
+        diff_phen(changed_path, version, old_path, old_version, report_path, not_check_config)
     finally:
         # clean up tmp directory
...
@@ -760,7 +760,7 @@
 <section id="SUPPORTED_CODE_TYPES">
     <div class="attr variable">
         <span class="name">SUPPORTED_CODE_TYPES</span> =
-        <span class="default_value">{&#39;opcs4&#39;, &#39;icd10&#39;, &#39;atc&#39;, &#39;snomed&#39;, &#39;read2&#39;, &#39;read3&#39;}</span>
+        <span class="default_value">{&#39;atc&#39;, &#39;read2&#39;, &#39;read3&#39;, &#39;opcs4&#39;, &#39;snomed&#39;, &#39;icd10&#39;}</span>
     </div>
...
(Two source diffs could not be displayed: too large to render.)
@@ -111,6 +111,9 @@ The `phen` command is used for phenotype-related operations.
 - `-t`, `--target-coding`: (Optional) Specify the target coding (e.g., `read2`, `read3`, `icd10`, `snomed`, `opcs4`).
 - `-d`, `--phen-dir`: (Optional) Local phenotype workspace directory (default is ./workspace/phen).
+- `--not-translate`: (Optional) Prevent any phenotype translation using NHS TRUD vocabularies; only concepts already in the target coding will be mapped.
+- `--no-metadata`: (Optional) Prevent copying of metadata columns to output.

 - **Publish Phenotype Configuration**
...
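A usage sketch of the new flags, driving the CLI the same way the project's test suite does — by setting `sys.argv` before calling `main.main()` (the import path of the entry module is an assumption):

import sys
import main  # the project's CLI entry module (import path assumed)

sys.argv = [
    "main.py", "phen", "map",
    "-d", "./workspace/phen",
    "-t", "read2",
    "--not-translate",
    "--no-metadata",
]
main.main()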
@@ -9,8 +9,7 @@ phenotype:
     - "read3"
   concept_sets:
     - name: "ABDO_PAIN"
-      file:
-        path: "clinical-codes-org/Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
-        columns:
-          read2: "code"
+      files:
+        - path: "clinical-codes-org/Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
+          columns:
+            read2: "code"
@@ -9,12 +9,12 @@ phenotype:
     - "read3"
   concept_sets:
     - name: "CVD_EVENTS"
-      file:
-        path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
-        columns:
-          icd10: "code"
+      files:
+        - path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
+          columns:
+            icd10: "code"
     - name: "DID_NOT_ATTEND"
-      file:
-        path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
-        columns:
-          read2: "code"
+      files:
+        - path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
+          columns:
+            read2: "code"
\ No newline at end of file
@@ -10,26 +10,26 @@ phenotype:
     - "snomed"
   concept_sets:
     - name: "CVD_EVENTS"
-      file:
-        path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
-        columns:
-          icd10: "code"
+      files:
+        - path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
+          columns:
+            icd10: "code"
     - name: "DID_NOT_ATTEND"
-      file:
-        path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
-        columns:
-          read2: "code"
+      files:
+        - path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
+          columns:
+            read2: "code"
     - name: "HYPERTENSION"
-      file:
-        path: "hanlon/Read_codes_for_diagnoses.csv"
-        columns:
-          read2: "Read Code"
-        category: "2"
-        actions:
-          divide_col: "MMCode"
+      files:
+        - path: "hanlon/Read_codes_for_diagnoses.csv"
+          columns:
+            read2: "Read Code"
+          category: "2"
+          actions:
+            divide_col: "MMCode"
     - name: "DEPRESSION"
-      file:
-        path: "hanlon/Read_codes_for_diagnoses.csv"
-        columns:
-          read2: "Read Code"
-        category: "3"
+      files:
+        - path: "hanlon/Read_codes_for_diagnoses.csv"
+          columns:
+            read2: "Read Code"
+          category: "3"
...
@@ -128,26 +128,6 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog, config_file):
         main.main()
     assert "Phenotype published successfully" in caplog.text

-    # copy phenotype
-    with caplog.at_level(logging.DEBUG):
-        monkeypatch.setattr(
-            sys,
-            "argv",
-            [
-                "main.py",
-                "phen",
-                "copy",
-                "-d",
-                str(phen_path.resolve()),
-                "-td",
-                str(tmp_dir.resolve()),
-                "-v",
-                "0.0.1",
-            ],
-        )
-        main.main()
-    assert "Phenotype copied successfully" in caplog.text

     # diff phenotype
     with caplog.at_level(logging.DEBUG):
         old_path = tmp_dir / "0.0.1"
...