Commit ca41792b authored by mjbonifa

refactor: completed refactoring of config to combine codes within concept sets

refactor: completed refactoring of config to combine codes within concept sets. This optimisation will shorten the file overall, even though for cases such as the Hanlon code lists it will be longer. It also makes the file easier to understand, as there are no longer separate links between concept sets and the code definitions. Closes #3.
parent 32cf82d6
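For context: the old configuration declared concept sets in a "concept_sets" summary section and attached codes to them in a separate "codes" section, so each code file had to link back to its concept set by name. The new layout nests the file definition directly under each concept set in a single phenotype.concept_sets list. A minimal sketch of loading the new shape (the YAML fragment is adapted from the ABDO_PAIN example changed below):

import yaml

# Minimal sketch of the new combined config shape; the fragment is
# adapted from the ABDO_PAIN example config in this commit.
config = yaml.safe_load("""
phenotype:
  version: "v1.0.1"
  concept_sets:
  - name: "ABDO_PAIN"
    file:
      path: "clinical-codes-org/Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
      columns:
        read2: "code"
    metadata: {}
""")

# Each concept set now carries its own file definition.
for concept_set in config["phenotype"]["concept_sets"]:
    print(concept_set["name"], "->", concept_set["file"]["path"])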
@@ -42,6 +42,7 @@ CODES_COL_ACTION = "codes_col"
DIVIDE_COL_ACTION = "divide_col"
COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION]
CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]
class PhenValidationException(Exception):
"""Custom exception class raised when validation errors in phenotype configuration file"""
@@ -190,16 +191,15 @@ def init(phen_dir, remote_url):
# create empty phen config file
config = {
"concept_sets": {
"phenotype": {
"version": initial_version,
"omop": {
"vocabulary_id": "",
"vocabulary_name": "",
"vocabulary_reference": "",
},
"concept_set": [],
},
"codes": [],
"concept_sets": [],
}
}
with open(phen_path / CONFIG_FILE, "w") as file:
@@ -257,7 +257,7 @@ def validate(phen_dir):
# Load configuration File
if config_path.suffix == ".yaml":
with config_path.open("r") as file:
mapping = yaml.safe_load(file)
phenotype = yaml.safe_load(file)
else:
raise Exception(
f"Unsupported configuration filetype: {str(config_path.resolve())}"
@@ -265,103 +265,69 @@ def validate(phen_dir):
# initialise
validation_errors = []
concept_sets = mapping["concept_sets"]
concept_codes = mapping["codes"]
phenotype = phenotype["phenotype"]
code_types = parse.CodeTypeParser().code_types
# check the version number is of the format vn.n.n
match = re.match(r"v(\d+\.\d+\.\d+)", concept_sets["version"])
match = re.match(r"v(\d+\.\d+\.\d+)", phenotype["version"])
if not match:
validation_errors.append(
f"Invalid version format in configuration file: {concept_sets['version']}"
f"Invalid version format in configuration file: {phenotype['version']}"
)
# create a list of all the concept set names defined in the concept set configuration
concept_set_names = []
for item in concept_sets["concept_set"]:
if item["concept_set_name"] in concept_set_names:
for item in phenotype["concept_sets"]:
if item["name"] in concept_set_names:
validation_errors.append(
f"Duplicate concept set defined in concept sets {item['concept_set_name'] }"
f"Duplicate concept set defined in concept sets {item['name'] }"
)
else:
concept_set_names.append(item["concept_set_name"])
concept_set_names.append(item["name"])
# TODO: change this to some sort of yaml schema validation
required_keys = {"name", "file", "metadata"}
# check codes definition
concept_set_mapping_names = []
for item in concept_codes:
for item in phenotype["concept_sets"]:
required_keys = {"folder", "files"}
if required_keys.issubset(item.keys()):
# check concept codes path is a directory
concept_code_dir_path = codes_path / item["folder"]
if not concept_code_dir_path.is_dir():
validation_errors.append(
f"Folder directory {str(concept_code_dir_path.resolve())} is not a directory"
)
for file in item["files"]:
# check concept code file exists
concept_code_file_path = concept_code_dir_path / file["file"]
concept_code_file_path = codes_path / item["file"]["path"]
if not concept_code_file_path.exists():
validation_errors.append(
f"Coding file {str(concept_code_file_path.resolve())} does not exist"
)
# check concept code file is not empty
concept_code_file_path = concept_code_dir_path / file["file"]
if concept_code_file_path.stat().st_size == 0:
validation_errors.append(
f"Coding file {str(concept_code_file_path.resolve())} is an empty file"
)
# check columns section exists
if "columns" not in file:
validation_errors.append(
f"Columns not defined for {concept_code_file_path}"
)
# check code file type is supported
if concept_code_file_path.suffix not in CODE_FILE_TYPES:
raise ValueError(f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types")
# check columns specified are a supported medical coding type
for column in file["columns"]:
if column not in code_types and column != "metadata":
for column in item["file"]["columns"]:
if column not in code_types:
validation_errors.append(
f"Column type {column} for file {concept_code_file_path} is not supported"
)
# check the actions are supported
if "actions" in file:
for action in file["actions"]:
if "actions" in item["file"]:
for action in item["file"]["actions"]:
if action not in COL_ACTIONS:
validation_errors.append(
f"Action {action} is not supported"
)
# check concept_set defined for the mapping
logger.debug(f"file {file}")
for concept_set_mapping in file["concept_set"]:
# store the concept set names found for later set operations
logger.debug(f"mapping {concept_set_mapping}")
if concept_set_mapping['name'] not in concept_set_mapping_names:
concept_set_mapping_names.append(concept_set_mapping['name'])
else:
validation_errors.append(
f"Missing required elements {required_keys} in codes {item}"
)
# create sets to perform set operations on the lists of concept set names
concept_set_names_set = set(concept_set_names)
concept_set_mapping_names_set = set(concept_set_mapping_names)
# check all concept sets in the summary section have at least one code mapping
concept_set_no_codes = list(concept_set_names_set - concept_set_mapping_names_set)
if len(concept_set_no_codes) > 0:
validation_errors.append(
f"Concept sets do not exist in codes {concept_set_no_codes}"
)
# check all concept sets included in the code mapping are defined in the summary concept_set section
codes_no_concept_set = list(concept_set_mapping_names_set - concept_set_names_set)
if len(codes_no_concept_set) > 0:
validation_errors.append(
f"Concept sets mapped in codes do not exist in the concept sets: {codes_no_concept_set}"
f"Missing required elements {required_keys} in concept set {item}"
)
if len(validation_errors) > 0:
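The TODO above asks for proper YAML schema validation in place of these hand-rolled key checks. A minimal sketch of what that could look like with the third-party jsonschema package (an assumption, not a dependency of this code; the schema only covers the keys checked above, and phenotype_config stands for the dict loaded from the YAML file):

import jsonschema  # assumed dependency, not currently used by this module

# Assumed schema, limited to the keys the hand-rolled checks inspect.
PHENOTYPE_SCHEMA = {
    "type": "object",
    "required": ["phenotype"],
    "properties": {
        "phenotype": {
            "type": "object",
            "required": ["version", "concept_sets"],
            "properties": {
                "version": {"type": "string", "pattern": r"^v\d+\.\d+\.\d+$"},
                "concept_sets": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "required": ["name", "file", "metadata"],
                    },
                },
            },
        }
    },
}

try:
    jsonschema.validate(instance=phenotype_config, schema=PHENOTYPE_SCHEMA)
except jsonschema.ValidationError as e:
    validation_errors.append(e.message)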
@@ -378,9 +344,11 @@ def read_table_file(path, excel_sheet=None):
"""
Load Code List File
"""
path = path.resolve()
if path.suffix == ".csv":
df = pd.read_csv(path, dtype=str)
elif path.suffix == ".xlsx":
elif path.suffix == ".xlsx" or path.suffix == ".xls":
if excel_sheet:
df = pd.read_excel(path, sheet_name=excel_sheet, dtype=str)
else:
@@ -388,21 +356,21 @@ def read_table_file(path, excel_sheet=None):
elif path.suffix == ".dta":
df = pd.read_stata(path, dtype=str)
else:
raise Exception(f"Unsupported filetype provided for source file {path.suffix}")
raise ValueError(f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types")
return df
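A brief usage sketch of read_table_file (the path is illustrative; any .csv, .xlsx, .xls or .dta file works). Reading with dtype=str matters for clinical codes, since a code such as "0123" would otherwise lose its leading zero:

from pathlib import Path

# Illustrative path; every column comes back as a string column.
df = read_table_file(Path("codes/clinical-codes-org/example.csv"))
print(df.dtypes)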
def process_actions(df, file):
def process_actions(df, concept_set):
# Perform Structural Changes to file before preprocessing
logger.debug("Processing file structural actions")
if (
"actions" in file
and "split_col" in file["actions"]
and "codes_col" in file["actions"]
"actions" in concept_set["file"]
and "split_col" in concept_set["file"]["actions"]
and "codes_col" in concept_set["file"]["actions"]
):
split_col = file["actions"]["split_col"]
codes_col = file["actions"]["codes_col"]
split_col = concept_set["file"]["actions"]["split_col"]
codes_col = concept_set["file"]["actions"]["codes_col"]
logger.debug(
"Action: Splitting",
split_col,
@@ -419,33 +387,33 @@ def process_actions(df, file):
# Perform QA Checks on columns individually and append to df
def preprocess_codes(df, file, target_code_type=None, codes_file=None):
def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
"""Parses each column individually - Order and length will not be preserved!"""
out = pd.DataFrame([]) # create output df to append to
code_errors = [] # list of errors from processing
metadata_df = pd.DataFrame()
meta_columns = [] # meta columns to keep with codes
if "actions" in file and "divide_col" in file["actions"]:
meta_columns += [file["actions"]["divide_col"]]
# TODO: enable metacolumns to be outputted - problem with map_file appending
if "metadata" in file["columns"]:
meta_columns += file["columns"]["metadata"]
if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]:
meta_columns += [concept_set["file"]["actions"]["divide_col"]]
metadata_df = df[meta_columns]
# TODO: enable metacolumns to be outputted - problem with map_file appending
# if "metadata" in file["columns"]:
# meta_columns += file["columns"]["metadata"]
# Preprocess codes
code_types = parse.CodeTypeParser().code_types
for code_type_name, code_type_parser in code_types.items():
if code_type_name in file["columns"]:
if code_type_name in concept_set["file"]["columns"]:
logger.info(f"Processing {code_type_name} codes...")
# get code types
codes = df[file["columns"][code_type_name]].dropna()
codes = df[concept_set["file"]["columns"][code_type_name]].dropna()
codes = codes.astype(str) # convert to string
codes = codes.str.strip() # remove excess spaces
# process codes, validating them using parser and returning the errors
codes, errors = code_type_parser.process(codes, codes_file)
codes, errors = code_type_parser.process(codes, code_file_path)
if len(errors) > 0:
code_errors.extend(errors)
logger.warning(f"Codes validation failed with {len(errors)} errors")
@@ -491,10 +459,7 @@ def translate_codes(df, target_code_type):
# Append file's codes to output Df with concept
def map_file(df, target_code_type, out, concept_names, meta_columns=[]):
# separate out meta_columns
metadata_df = df[meta_columns]
df = df.drop(columns=meta_columns)
def map_file(df, target_code_type, out, concept_name):
# translate codes
codes = translate_codes(df, target_code_type)
@@ -503,9 +468,7 @@ def map_file(df, target_code_type, out, concept_names, meta_columns=[]):
# Append to output if translated
if len(codes) > 0:
codes = pd.DataFrame({"CONCEPT": codes})
codes = codes.join(metadata_df)
for concept in concept_names:
codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
codes["CONCEPT_SET"] = np.repeat(concept_name.strip(), len(codes))
out = pd.concat([out, codes])
else:
logger.debug(f"No codes converted with target code type {target_code_type}")
@@ -588,37 +551,29 @@ def map(phen_dir, target_code_type):
# load configuration
with config_path.open("r") as file:
config = yaml.safe_load(file)
concept_sets = config["concept_sets"]
codes = config["codes"]
phenotype = config["phenotype"]
# Create output dataframe
out = pd.DataFrame([])
code_errors = []
# Process each folder in codes section
for folder in codes:
for file in folder["files"]:
logger.debug(f"--- {file['file']} ---")
codes_file_path = codes_path / folder["folder"] / file["file"]
for concept_set in phenotype["concept_sets"]:
logger.debug(f"--- {concept_set['file']} ---")
# Load Code File
if "excel_sheet" in file:
df = read_table_file(
path=codes_file_path, excel_sheet=file["excel_sheet"]
)
else:
df = read_table_file(path=codes_file_path)
codes_file_path = Path(codes_path / concept_set["file"]["path"])
df = read_table_file(codes_file_path)
# process structural actions
df = process_actions(df, file)
df = process_actions(df, concept_set)
# Preprocessing & Validation Checks
logger.debug("Processing and validating code formats")
df, meta_columns, errors = preprocess_codes(
df,
file,
codes_file=str(codes_file_path.resolve()),
concept_set,
codes_file_path,
target_code_type=target_code_type,
)
@@ -628,40 +583,35 @@ def map(phen_dir, target_code_type):
logger.debug(f" Length of code_errors {len(code_errors)}")
# partition table by categorical column
if "actions" in file and "divide_col" in file["actions"] and len(df) > 0:
divide_col = file["actions"]["divide_col"]
logger.debug(f"Action: Dividing Table by {divide_col} column into: {df[divide_col].unique()}")
if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0:
divide_col = concept_set["file"]["actions"]["divide_col"]
logger.debug(f"Action: Dividing Table by {divide_col}")
logger.debug(df.head())
logger.debug(f"column into: {df[divide_col].unique()}")
df = df.groupby(divide_col)
# Map to Concept/Phenotype
# TODO: This code needs refactoring; handling of the concept_set_categories should happen elsewhere
logger.debug(f"instance of df before if: {type(df)}")
if isinstance(df, pd.core.frame.DataFrame):
concept_names = [concept['name'] for concept in file["concept_set"]]
out = map_file(
df,
target_code_type,
out,
concept_names=concept_names,
meta_columns=meta_columns,
concept_name=concept_set['name']
)
elif isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
meta_columns.remove(divide_col) # delete categorical column
for cat, grp in df:
for concept_set in file['concept_set']:
# if there's no matching category, this will raise an error
if cat == concept_set["category"]:
if cat == concept_set["file"]["category"]:
grp = grp.drop(
columns=[divide_col]
) # delete categorical column
logger.debug(f"Mapping category: {cat}")
concept_names = [concept_set["name"]]
out = map_file(
grp,
target_code_type,
out,
concept_names=concept_names,
meta_columns=meta_columns,
concept_name=concept_set['name']
)
else:
logger.debug(f"instance of df: {type(df)}")
@@ -682,26 +632,12 @@ def map(phen_dir, target_code_type):
raise Exception(
f"No output after map processing, check config {str(config_path.resolve())}"
)
# Final processing
out = out.reset_index(drop=True)
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
# Add concept set definition metadata
concept_sets_df = pd.DataFrame(
concept_sets["concept_set"]
) # transform to dataframe
if "metadata" in concept_sets_df.columns:
concept_sets_df = concept_sets_df.join(
pd.json_normalize(concept_sets_df["metadata"])
) # metadata to columns
concept_sets_df = concept_sets_df.drop(columns=["metadata"])
concept_sets_df = concept_sets_df.rename(
columns={"concept_set_name": "CONCEPT_SET"}
)
concept_sets_df = concept_sets_df.drop_duplicates() # remove duplicates
out = out.merge(concept_sets_df, how="left", on="CONCEPT_SET") # merge with output
# Save output to map directory
output_filename = target_code_type + ".csv"
map_path = phen_path / MAP_DIR / output_filename
@@ -762,7 +698,7 @@ def publish(phen_dir):
config_path = phen_path / CONFIG_FILE
with config_path.open("r") as file:
config = yaml.safe_load(file)
match = re.match(r"v(\d+\.\d+)", config["concept_sets"]["version"])
match = re.match(r"v(\d+\.\d+)", config["phenotype"]["version"])
major_version = match.group(1)
# get latest minor version from git commit count
@@ -772,7 +708,7 @@
next_minor_version = commit_count + 1
version = f"v{major_version}.{next_minor_version}"
logger.debug(f"New version: {version}")
config["concept_sets"]["version"] = version
config["phenotype"]["version"] = version
with open(config_path, "w") as file:
yaml.dump(config, file, default_flow_style=False, sort_keys=False)
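Concretely, publish keeps the major version from the config and derives the next minor version from the commit count; a worked example (values are illustrative):

# e.g. a config at major version 1.0 with 7 commits recorded
major_version = "1.0"  # captured by re.match(r"v(\d+\.\d+)", ...)
commit_count = 7       # from the git commit count (assumed source)
version = f"v{major_version}.{commit_count + 1}"  # -> "v1.0.8"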
@@ -826,8 +762,8 @@ def export(phen_dir, version):
export_db_path = omop.export(
map_path,
export_path,
config["concept_sets"]["version"],
config["concept_sets"]["omop"],
config["phenotype"]["version"],
config["phenotype"]["omop"],
)
# write to tables
@@ -925,7 +861,7 @@ def diff(phen_dir, phen_old_dir):
with new_config.open("r") as file:
new_config = yaml.safe_load(file)
report.write(
f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n"
f"\n\n# Report for version {new_config['phenotype']['version']}\n\n"
)
report.write(f"- Removed outputs: {list(removed_outputs)}\n")
report.write(f"- Added outputs: {list(added_outputs)}\n")

-concept_sets:
+phenotype:
   version: "v1.0.1"
   omop:
     vocabulary_id: "ACMC_Example"
     vocabulary_name: "ACMC example phenotype"
     vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
-  concept_set:
-  - concept_set_name: "ABDO_PAIN"
-    metadata: {}
-codes:
-- folder: "clinical-codes-org"
-  description: "Downloaded 16/11/23"
-  files:
-  - file: "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
-    columns:
-      read2: "code"
-      metadata:
-      - "description"
-    concept_set:
-    - name: "ABDO_PAIN"
+  concept_sets:
+  - name: "ABDO_PAIN"
+    file:
+      path: "clinical-codes-org/Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
+      columns:
+        read2: "code"
+    metadata: {}

-concept_sets:
+phenotype:
   version: "v1.0.4"
   omop:
     vocabulary_id: "ACMC_Example"
     vocabulary_name: "ACMC example phenotype"
     vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
-  concept_set:
-  - concept_set_name: "CVD_EVENTS"
-    metadata: {}
-  - concept_set_name: "DID_NOT_ATTEND"
-    metadata: {}
-codes:
-- folder: "clinical-codes-org"
-  description: "Downloaded 16/11/23"
-  files:
-  - file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
-    columns:
-      icd10: "code"
-      metadata: []
-    concept_set:
-    - name: "CVD_EVENTS"
-  - file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
-    columns:
-      read2: "code"
-      metadata: []
-    concept_set:
-    - name: "DID_NOT_ATTEND"
+  concept_sets:
+  - name: "CVD_EVENTS"
+    file:
+      path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
+      columns:
+        icd10: "code"
+    metadata: {}
+  - name: "DID_NOT_ATTEND"
+    file:
+      path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
+      columns:
+        read2: "code"
+    metadata: {}
\ No newline at end of file

-concept_sets:
+phenotype:
   version: "v1.0.4"
   omop:
     vocabulary_id: "ACMC_Example"
     vocabulary_name: "ACMC example phenotype"
     vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
-  concept_set:
-  - concept_set_name: "CVD_EVENTS"
-    metadata: {}
-  - concept_set_name: "DID_NOT_ATTEND"
-    metadata: {}
-  - concept_set_name: "HYPERTENSION"
-    metadata: {}
-  - concept_set_name: "DEPRESSION"
-    metadata: {}
-codes:
-- folder: "clinical-codes-org"
-  description: "Downloaded 16/11/23"
-  files:
-  - file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
-    columns:
-      icd10: "code"
-      metadata: []
-    concept_set:
-    - name: "CVD_EVENTS"
-  - file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
-    columns:
-      read2: "code"
-      metadata: []
-    concept_set:
-    - name: "DID_NOT_ATTEND"
-- folder: hanlon
-  description: Hanlon Paper Code Lists
-  files:
-  - file: Read_codes_for_diagnoses.csv
-    columns:
-      read2: Read Code
-    actions:
-      divide_col: MMCode
-    concept_set:
-    - name: HYPERTENSION
-      category: "2"
-    - name: DEPRESSION
-      category: "3"
+  concept_sets:
+  - name: "CVD_EVENTS"
+    file:
+      path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
+      columns:
+        icd10: "code"
+    metadata: {}
+  - name: "DID_NOT_ATTEND"
+    file:
+      path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
+      columns:
+        read2: "code"
+    metadata: {}
+  - name: "HYPERTENSION"
+    file:
+      path: "hanlon/Read_codes_for_diagnoses.csv"
+      columns:
+        read2: "Read Code"
+      category: "2"
+      actions:
+        divide_col: "MMCode"
+    metadata: {}
+  - name: "DEPRESSION"
+    file:
+      path: "hanlon/Read_codes_for_diagnoses.csv"
+      columns:
+        read2: "Read Code"
+      category: "3"
+      actions:
+        divide_col: "MMCode"
+    metadata: {}