Commit f1f18dcd authored by mjbonifa

Merge branch '3-review-and-refactor-configuration-file-2' into 'dev'

refactor: completed refactoring of config to combine codes within concept...

Closes #3

See merge request meldb/concepts-processing!9
parents 32cf82d6 ca41792b
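In outline, this merge collapses the old two-section layout (a concept_sets summary plus a separate codes list) into a single phenotype section whose concept_sets entries each carry their own file definition. A minimal sketch of the new shape as a Python dict, with values taken from the example files at the end of this diff:

phenotype_config = {
    "phenotype": {
        "version": "v1.0.4",
        "omop": {
            "vocabulary_id": "ACMC_Example",
            "vocabulary_name": "ACMC example phenotype",
            "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example",
        },
        "concept_sets": [
            {
                "name": "CVD_EVENTS",
                "file": {
                    "path": "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv",
                    "columns": {"icd10": "code"},
                },
                "metadata": {},
            },
        ],
    }
}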
@@ -42,6 +42,7 @@ CODES_COL_ACTION = "codes_col"
DIVIDE_COL_ACTION = "divide_col"
COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION]
CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]
class PhenValidationException(Exception):
"""Custom exception class raised when validation errors in phenotype configuration file"""
@@ -190,16 +191,15 @@ def init(phen_dir, remote_url):
# create empty phen config file
config = {
"concept_sets": {
"phenotype": {
"version": initial_version,
"omop": {
"vocabulary_id": "",
"vocabulary_name": "",
"vocabulary_reference": "",
},
"concept_set": [],
},
"codes": [],
"concept_sets": [],
}
}
with open(phen_path / CONFIG_FILE, "w") as file:
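The hunk is truncated here, but for reference the empty config that init() now writes would serialize to roughly the following (assuming initial_version is "v1.0.0"; the actual initial version is defined elsewhere in the module):

import yaml

config = {
    "phenotype": {
        "version": "v1.0.0",  # assumed value of initial_version
        "omop": {"vocabulary_id": "", "vocabulary_name": "", "vocabulary_reference": ""},
        "concept_sets": [],
    }
}
print(yaml.dump(config, default_flow_style=False, sort_keys=False))
# phenotype:
#   version: v1.0.0
#   omop:
#     vocabulary_id: ''
#     vocabulary_name: ''
#     vocabulary_reference: ''
#   concept_sets: []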
@@ -257,7 +257,7 @@ def validate(phen_dir):
# Load configuration File
if config_path.suffix == ".yaml":
with config_path.open("r") as file:
mapping = yaml.safe_load(file)
phenotype = yaml.safe_load(file)
else:
raise Exception(
f"Unsupported configuration filetype: {str(config_path.resolve())}"
@@ -265,103 +265,69 @@ def validate(phen_dir):
# initialise
validation_errors = []
concept_sets = mapping["concept_sets"]
concept_codes = mapping["codes"]
phenotype = phenotype["phenotype"]
code_types = parse.CodeTypeParser().code_types
# check the version number is of the format vn.n.n
match = re.match(r"v(\d+\.\d+\.\d+)", concept_sets["version"])
match = re.match(r"v(\d+\.\d+\.\d+)", phenotype["version"])
if not match:
validation_errors.append(
f"Invalid version format in configuration file: {concept_sets['version']}"
f"Invalid version format in configuration file: {phenotype['version']}"
)
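The version check in isolation:

import re

# Versions must look like v<major>.<minor>.<patch>, e.g. "v1.0.4".
print(bool(re.match(r"v(\d+\.\d+\.\d+)", "v1.0.4")))  # True
print(bool(re.match(r"v(\d+\.\d+\.\d+)", "1.0.4")))   # False -> error appended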
# create a list of all the concept set names defined in the concept set configuration
concept_set_names = []
for item in concept_sets["concept_set"]:
if item["concept_set_name"] in concept_set_names:
for item in phenotype["concept_sets"]:
if item["name"] in concept_set_names:
validation_errors.append(
f"Duplicate concept set defined in concept sets {item['concept_set_name'] }"
f"Duplicate concept set defined in concept sets {item['name'] }"
)
else:
concept_set_names.append(item["concept_set_name"])
concept_set_names.append(item["name"])
# TODO: change this to some sort of yaml schema validation
required_keys = {"name", "file", "metadata"}
# check codes definition
concept_set_mapping_names = []
for item in concept_codes:
for item in phenotype["concept_sets"]:
required_keys = {"folder", "files"}
if required_keys.issubset(item.keys()):
# check concept codes path is a directory
concept_code_dir_path = codes_path / item["folder"]
if not concept_code_dir_path.is_dir():
validation_errors.append(
f"Folder directory {str(concept_code_dir_path.resolve())} is not a directory"
)
for file in item["files"]:
# check concept code file exists
concept_code_file_path = concept_code_dir_path / file["file"]
concept_code_file_path = codes_path / item["file"]["path"]
if not concept_code_file_path.exists():
validation_errors.append(
f"Coding file {str(concept_code_file_path.resolve())} does not exist"
)
# check concept code file is not empty
concept_code_file_path = concept_code_dir_path / file["file"]
if concept_code_file_path.stat().st_size == 0:
validation_errors.append(
f"Coding file {str(concept_code_file_path.resolve())} is an empty file"
)
# check columns section exists
if "columns" not in file:
validation_errors.append(
f"Columns not defined for {concept_code_file_path}"
)
# check code file type is supported
if concept_code_file_path.suffix not in CODE_FILE_TYPES:
raise ValueError(f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types")
# check columns specified are a supported medical coding type
for column in file["columns"]:
if column not in code_types and column != "metadata":
for column in item["file"]["columns"]:
if column not in code_types:
validation_errors.append(
f"Column type {column} for file {concept_code_file_path} is not supported"
)
# check the actions are supported
if "actions" in file:
for action in file["actions"]:
if "actions" in item["file"]:
for action in item["file"]["actions"]:
if action not in COL_ACTIONS:
validation_errors.append(
f"Action {action} is not supported"
)
# check concept_set defined for the mapping
logger.debug(f"file {file}")
for concept_set_mapping in file["concept_set"]:
# store the concept set names found for later set operations
logger.debug(f"mapping {concept_set_mapping}")
if concept_set_mapping['name'] not in concept_set_mapping_names:
concept_set_mapping_names.append(concept_set_mapping['name'])
else:
validation_errors.append(
f"Missing required elements {required_keys} in codes {item}"
)
# create sets to perform set operations on the lists of concept set names
concept_set_names_set = set(concept_set_names)
concept_set_mapping_names_set = set(concept_set_mapping_names)
# check all concept sets in the summary section have at least one code mapping
concept_set_no_codes = list(concept_set_names_set - concept_set_mapping_names_set)
if len(concept_set_no_codes) > 0:
validation_errors.append(
f"Concept sets do not exist in codes {concept_set_no_codes}"
)
# check all concept sets included in the code mapping are defined in the summary concept_set section
codes_no_concept_set = list(concept_set_mapping_names_set - concept_set_names_set)
if len(codes_no_concept_set) > 0:
validation_errors.append(
f"Concept sets mapped in codes do not exist in the concept sets: {codes_no_concept_set}"
f"Missing required elements {required_keys} in concept set {item}"
)
if len(validation_errors) > 0:
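The branch is truncated by the hunk; presumably the accumulated errors are raised via the custom exception defined above. A self-contained, hedged sketch of that pattern (the message and arguments are assumptions, not the committed code):

class PhenValidationException(Exception):
    """Raised when the phenotype configuration fails validation."""

validation_errors = ["Invalid version format in configuration file: 1.0.4"]
if len(validation_errors) > 0:
    # Exception subclasses accept arbitrary args, so the error list rides along
    raise PhenValidationException("Configuration file failed validation", validation_errors)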
@@ -378,9 +344,11 @@ def read_table_file(path, excel_sheet=None):
"""
Load Code List File
"""
path = path.resolve()
if path.suffix == ".csv":
df = pd.read_csv(path, dtype=str)
elif path.suffix == ".xlsx":
elif path.suffix == ".xlsx" or path.suffix == ".xls":
if excel_sheet:
df = pd.read_excel(path, sheet_name=excel_sheet, dtype=str)
else:
@@ -388,21 +356,21 @@ def read_table_file(path, excel_sheet=None):
elif path.suffix == ".dta":
df = pd.read_stata(path, dtype=str)
else:
raise Exception(f"Unsupported filetype provided for source file {path.suffix}")
raise ValueError(f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types")
return df
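Example usage with files from the example configs below (paths are illustrative and relative to the phenotype's codes directory):

from pathlib import Path

df = read_table_file(
    Path("codes/clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv")
)
df = read_table_file(Path("codes/hanlon/Read_codes_for_diagnoses.csv"))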
def process_actions(df, file):
def process_actions(df, concept_set):
# Perform Structural Changes to file before preprocessing
logger.debug("Processing file structural actions")
if (
"actions" in file
and "split_col" in file["actions"]
and "codes_col" in file["actions"]
"actions" in concept_set["file"]
and "split_col" in concept_set["file"]["actions"]
and "codes_col" in concept_set["file"]["actions"]
):
split_col = file["actions"]["split_col"]
codes_col = file["actions"]["codes_col"]
split_col = concept_set["file"]["actions"]["split_col"]
codes_col = concept_set["file"]["actions"]["codes_col"]
logger.debug(
"Action: Splitting",
split_col,
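The split itself is truncated by the hunk. As a hedged sketch, a split_col/codes_col action of this kind typically splits a delimited source column and reads the codes from the resulting column; the column names and delimiter below are illustrative, not the committed behaviour:

import pandas as pd

df = pd.DataFrame({"raw": ["A01;A02", "B01"]})
df["codes"] = df["raw"].str.split(";")  # split_col: the delimited source column
df = df.explode("codes")                # one code per row in codes_col
print(df["codes"].tolist())             # ['A01', 'A02', 'B01']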
@@ -419,33 +387,33 @@ def process_actions(df, file):
# Perform QA Checks on columns individually and append to df
def preprocess_codes(df, file, target_code_type=None, codes_file=None):
def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
"""Parses each column individually - Order and length will not be preserved!"""
out = pd.DataFrame([]) # create output df to append to
code_errors = [] # list of errors from processing
metadata_df = pd.DataFrame()
meta_columns = [] # meta columns to keep with codes
if "actions" in file and "divide_col" in file["actions"]:
meta_columns += [file["actions"]["divide_col"]]
# TODO: enable metacolumns to be outputted - problem with map_file appending
if "metadata" in file["columns"]:
meta_columns += file["columns"]["metadata"]
if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]:
meta_columns += [concept_set["file"]["actions"]["divide_col"]]
metadata_df = df[meta_columns]
# TODO: enable metacolumns to be outputted - problem with map_file appending
# if "metadata" in file["columns"]:
# meta_columns += file["columns"]["metadata"]
# Preprocess codes
code_types = parse.CodeTypeParser().code_types
for code_type_name, code_type_parser in code_types.items():
if code_type_name in file["columns"]:
if code_type_name in concept_set["file"]["columns"]:
logger.info(f"Processing {code_type_name} codes...")
# get code types
codes = df[file["columns"][code_type_name]].dropna()
codes = df[concept_set["file"]["columns"][code_type_name]].dropna()
codes = codes.astype(str) # convert to string
codes = codes.str.strip() # remove excess spaces
# process codes, validating them using parser and returning the errors
codes, errors = code_type_parser.process(codes, codes_file)
codes, errors = code_type_parser.process(codes, code_file_path)
if len(errors) > 0:
code_errors.extend(errors)
logger.warning(f"Codes validation failed with {len(errors)} errors")
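The per-column clean-up in isolation:

import pandas as pd

codes = pd.Series([" G20..", None, "E10  "])
codes = codes.dropna()     # drop missing entries
codes = codes.astype(str)  # convert to string
codes = codes.str.strip()  # remove excess spaces
print(codes.tolist())      # ['G20..', 'E10']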
@@ -491,10 +459,7 @@ def translate_codes(df, target_code_type):
# Append file's codes to output Df with concept
def map_file(df, target_code_type, out, concept_names, meta_columns=[]):
# separate out meta_columns
metadata_df = df[meta_columns]
df = df.drop(columns=meta_columns)
def map_file(df, target_code_type, out, concept_name):
# translate codes
codes = translate_codes(df, target_code_type)
@@ -503,9 +468,7 @@ def map_file(df, target_code_type, out, concept_names, meta_columns=[]):
# Append to output if translated
if len(codes) > 0:
codes = pd.DataFrame({"CONCEPT": codes})
codes = codes.join(metadata_df)
for concept in concept_names:
codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
codes["CONCEPT_SET"] = np.repeat(concept_name.strip(), len(codes))
out = pd.concat([out, codes])
else:
logger.debug(f"No codes converted with target code type {target_code_type}")
@@ -588,37 +551,29 @@ def map(phen_dir, target_code_type):
# load configuration
with config_path.open("r") as file:
config = yaml.safe_load(file)
concept_sets = config["concept_sets"]
codes = config["codes"]
phenotype = config["phenotype"]
# Create output dataframe
out = pd.DataFrame([])
code_errors = []
# Process each folder in codes section
for folder in codes:
for file in folder["files"]:
logger.debug(f"--- {file['file']} ---")
codes_file_path = codes_path / folder["folder"] / file["file"]
for concept_set in phenotype["concept_sets"]:
logger.debug(f"--- {concept_set['file']} ---")
# Load Code File
if "excel_sheet" in file:
df = read_table_file(
path=codes_file_path, excel_sheet=file["excel_sheet"]
)
else:
df = read_table_file(path=codes_file_path)
codes_file_path = Path(codes_path / concept_set["file"]["path"])
df = read_table_file(codes_file_path)
# process structural actions
df = process_actions(df, file)
df = process_actions(df, concept_set)
# Preprocessing & Validation Checks
logger.debug("Processing and validating code formats")
df, meta_columns, errors = preprocess_codes(
df,
file,
codes_file=str(codes_file_path.resolve()),
concept_set,
codes_file_path,
target_code_type=target_code_type,
)
@@ -628,40 +583,35 @@ def map(phen_dir, target_code_type):
logger.debug(f" Length of code_errors {len(code_errors)}")
# partition table by categorical column
if "actions" in file and "divide_col" in file["actions"] and len(df) > 0:
divide_col = file["actions"]["divide_col"]
logger.debug(f"Action: Dividing Table by {divide_col} column into: {df[divide_col].unique()}")
if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0:
divide_col = concept_set["file"]["actions"]["divide_col"]
logger.debug(f"Action: Dividing Table by {divide_col}")
logger.debug(df.head())
logger.debug(f"column into: {df[divide_col].unique()}")
df = df.groupby(divide_col)
# Map to Concept/Phenotype
# TODO: this code needs refactoring, as handling of the concept_set categories seems like it should happen elsewhere
logger.debug(f"instance of df before if: {type(df)}")
if isinstance(df, pd.core.frame.DataFrame):
concept_names = [concept['name'] for concept in file["concept_set"]]
out = map_file(
df,
target_code_type,
out,
concept_names=concept_names,
meta_columns=meta_columns,
concept_name=concept_set['name']
)
elif isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
meta_columns.remove(divide_col) # delete categorical column
for cat, grp in df:
for concept_set in file['concept_set']:
# what if there's no matching category? there's going to be an error
if cat == concept_set["category"]:
if cat == concept_set["file"]["category"]:
grp = grp.drop(
columns=[divide_col]
) # delete categorical column
logger.debug(f"Mapping category: {cat}")
concept_names = [concept_set["name"]]
out = map_file(
grp,
target_code_type,
out,
concept_names=concept_names,
meta_columns=meta_columns,
concept_name=concept_set['name']
)
else:
logger.debug(f"instance of df: {type(df)}")
@@ -682,26 +632,12 @@ def map(phen_dir, target_code_type):
raise Exception(
f"No output after map processing, check config {str(config_path.resolve())}"
)
# Final processing
out = out.reset_index(drop=True)
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
# Add concept set definition metadata
concept_sets_df = pd.DataFrame(
concept_sets["concept_set"]
) # transform to dataframe
if "metadata" in concept_sets_df.columns:
concept_sets_df = concept_sets_df.join(
pd.json_normalize(concept_sets_df["metadata"])
) # metadata to columns
concept_sets_df = concept_sets_df.drop(columns=["metadata"])
concept_sets_df = concept_sets_df.rename(
columns={"concept_set_name": "CONCEPT_SET"}
)
concept_sets_df = concept_sets_df.drop_duplicates() # remove duplicates
out = out.merge(concept_sets_df, how="left", on="CONCEPT_SET") # merge with output
# Save output to map directory
output_filename = target_code_type + ".csv"
map_path = phen_path / MAP_DIR / output_filename
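The final tidy-up in isolation:

import pandas as pd

out = pd.DataFrame({"CONCEPT_SET": ["B", "A", "A"], "CONCEPT": ["2", "1", "1"]})
out = out.reset_index(drop=True)
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
print(out)
#   CONCEPT_SET CONCEPT
# 1           A       1
# 0           B       2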
@@ -762,7 +698,7 @@ def publish(phen_dir):
config_path = phen_path / CONFIG_FILE
with config_path.open("r") as file:
config = yaml.safe_load(file)
match = re.match(r"v(\d+\.\d+)", config["concept_sets"]["version"])
match = re.match(r"v(\d+\.\d+)", config["phenotype"]["version"])
major_version = match.group(1)
# get latest minor version from git commit count
@@ -772,7 +708,7 @@ def publish(phen_dir):
next_minor_version = commit_count + 1
version = f"v{major_version}.{next_minor_version}"
logger.debug(f"New version: {version}")
config["concept_sets"]["version"] = version
config["phenotype"]["version"] = version
with open(config_path, "w") as file:
yaml.dump(config, file, default_flow_style=False, sort_keys=False)
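The version bump in publish(), in isolation (the commit count is illustrative):

import re

match = re.match(r"v(\d+\.\d+)", "v1.0.4")
major_version = match.group(1)      # "1.0"
commit_count = 8                    # e.g. taken from the git commit count
version = f"v{major_version}.{commit_count + 1}"
print(version)                      # v1.0.9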
@@ -826,8 +762,8 @@ def export(phen_dir, version):
export_db_path = omop.export(
map_path,
export_path,
config["concept_sets"]["version"],
config["concept_sets"]["omop"],
config["phenotype"]["version"],
config["phenotype"]["omop"],
)
# write to tables
@@ -925,7 +861,7 @@ def diff(phen_dir, phen_old_dir):
with new_config.open("r") as file:
new_config = yaml.safe_load(file)
report.write(
f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n"
f"\n\n# Report for version {new_config['phenotype']['version']}\n\n"
)
report.write(f"- Removed outputs: {list(removed_outputs)}\n")
report.write(f"- Added outputs: {list(added_outputs)}\n")
...
concept_sets:
phenotype:
version: "v1.0.1"
omop:
vocabulary_id: "ACMC_Example"
vocabulary_name: "ACMC example phenotype"
vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
concept_set:
- concept_set_name: "ABDO_PAIN"
metadata: {}
codes:
- folder: "clinical-codes-org"
description: "Downloaded 16/11/23"
files:
- file: "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
concept_sets:
- name: "ABDO_PAIN"
file:
path: "clinical-codes-org/Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
columns:
read2: "code"
metadata:
- "description"
concept_set:
- name: "ABDO_PAIN"
metadata: {}
concept_sets:
phenotype:
version: "v1.0.4"
omop:
vocabulary_id: "ACMC_Example"
vocabulary_name: "ACMC example phenotype"
vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
concept_set:
- concept_set_name: "CVD_EVENTS"
metadata: {}
- concept_set_name: "DID_NOT_ATTEND"
metadata: {}
codes:
- folder: "clinical-codes-org"
description: "Downloaded 16/11/23"
files:
- file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
concept_sets:
- name: "CVD_EVENTS"
file:
path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
columns:
icd10: "code"
metadata: []
concept_set:
- name: "CVD_EVENTS"
- file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
metadata: {}
- name: "DID_NOT_ATTEND"
file:
path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
columns:
read2: "code"
metadata: []
concept_set:
- name: "DID_NOT_ATTEND"
metadata: {}
\ No newline at end of file
concept_sets:
phenotype:
version: "v1.0.4"
omop:
vocabulary_id: "ACMC_Example"
vocabulary_name: "ACMC example phenotype"
vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
concept_set:
- concept_set_name: "CVD_EVENTS"
metadata: {}
- concept_set_name: "DID_NOT_ATTEND"
metadata: {}
- concept_set_name: "HYPERTENSION"
metadata: {}
- concept_set_name: "DEPRESSION"
metadata: {}
codes:
- folder: "clinical-codes-org"
description: "Downloaded 16/11/23"
files:
- file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
concept_sets:
- name: "CVD_EVENTS"
file:
path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
columns:
icd10: "code"
metadata: []
concept_set:
- name: "CVD_EVENTS"
- file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
metadata: {}
- name: "DID_NOT_ATTEND"
file:
path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
columns:
read2: "code"
metadata: []
concept_set:
- name: "DID_NOT_ATTEND"
- folder: hanlon
description: Hanlon Paper Code Lists
files:
- file: Read_codes_for_diagnoses.csv
metadata: {}
- name: "HYPERTENSION"
file:
path: "hanlon/Read_codes_for_diagnoses.csv"
columns:
read2: Read Code
actions:
divide_col: MMCode
concept_set:
- name: HYPERTENSION
read2: "Read Code"
category: "2"
- name: DEPRESSION
actions:
divide_col: "MMCode"
metadata: {}
- name: "DEPRESSION"
file:
path: "hanlon/Read_codes_for_diagnoses.csv"
columns:
read2: "Read Code"
category: "3"
actions:
divide_col: "MMCode"
metadata: {}
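Loading one of these new-format files mirrors what validate() and map() now do (the file name here is illustrative):

import yaml

with open("config.yaml") as file:
    phenotype = yaml.safe_load(file)["phenotype"]

for concept_set in phenotype["concept_sets"]:
    print(concept_set["name"], "->", concept_set["file"]["path"])
# CVD_EVENTS -> clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv
# DID_NOT_ATTEND -> clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv
# ...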