Skip to content
Snippets Groups Projects
Commit 6ecf468f authored by mjbonifa's avatar mjbonifa
Browse files

test: added new config3.yaml that includes hanlon. had to refactor phen map to...

test: added new config3.yaml that includes Hanlon. Had to refactor the phen map to handle the new format, and there's some weird code in the processing of the concept categories that needs further attention. The Hanlon config also includes an action to divide columns. #22
parent 55bf62f0
No related branches found
No related tags found
No related merge requests found
......@@ -336,10 +336,12 @@ def validate(phen_dir):
)
# check concept_set defined for the mapping
logger.debug(f"file {file}")
for concept_set_mapping in file["concept_set"]:
# store the concept set names found for later set operations
if concept_set_mapping not in concept_set_mapping_names:
concept_set_mapping_names.append(concept_set_mapping)
logger.debug(f"mapping {concept_set_mapping}")
if concept_set_mapping['name'] not in concept_set_mapping_names:
concept_set_mapping_names.append(concept_set_mapping['name'])
else:
validation_errors.append(
f"Missing required elements {required_keys} in codes {item}"
......@@ -489,7 +491,7 @@ def translate_codes(df, target_code_type):
# Append file's codes to output Df with concept
def map_file(df, target_code_type, out, concepts, meta_columns=[]):
def map_file(df, target_code_type, out, concept_names, meta_columns=[]):
# separate out meta_columns
metadata_df = df[meta_columns]
df = df.drop(columns=meta_columns)
......@@ -502,7 +504,7 @@ def map_file(df, target_code_type, out, concepts, meta_columns=[]):
if len(codes) > 0:
codes = pd.DataFrame({"CONCEPT": codes})
codes = codes.join(metadata_df)
for concept in concepts:
for concept in concept_names:
codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
out = pd.concat([out, codes])
else:
......@@ -628,52 +630,44 @@ def map(phen_dir, target_code_type):
# partition table by categorical column
if "actions" in file and "divide_col" in file["actions"] and len(df) > 0:
divide_col = file["actions"]["divide_col"]
logger.debug(
"Action: Dividing Table by",
divide_col,
"column into: ",
df[divide_col].unique(),
)
logger.debug(f"Action: Dividing Table by {divide_col} column into: {df[divide_col].unique()}")
df = df.groupby(divide_col)
# Map to Concept/Phenotype
# TODO: This code needs refactoring, as it seems the handling of concept_set_categories should happen elsewhere
if len(df.index) != 0:
if ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame):
logger.debug(f"instance of df before if: {type(df)}")
if isinstance(df, pd.core.frame.DataFrame):
concept_names = [concept['name'] for concept in file["concept_set"]]
out = map_file(
df,
target_code_type,
out,
concepts=file["concept_set"],
concept_names=concept_names,
meta_columns=meta_columns,
)
elif ("concept_set_categories" in file) and isinstance(
df, pd.core.groupby.generic.DataFrameGroupBy
):
elif isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
meta_columns.remove(divide_col) # delete categorical column
for cat, grp in df:
if (
cat in file["concept_set_categories"].keys()
): # check if category is mapped
for concept_set in file['concept_set']:
# NOTE: if a concept_set entry has no "category" key, the lookup below will raise a KeyError
if cat == concept_set["category"]:
grp = grp.drop(
columns=[divide_col]
) # delete categorical column
logger.debug("Category:", cat)
logger.debug(f"Mapping category: {cat}")
concept_names = [concept_set["name"]]
out = map_file(
grp,
target_code_type,
out,
concepts=file["concept_set_categories"][cat],
concept_names=concept_names,
meta_columns=meta_columns,
)
else:
raise AttributeError(
f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
)
else:
logger.warning(
f"File {file} has no output after preprocessing in config {str(config_path.resolve())}"
)
logger.debug(f"instance of df: {type(df)}")
# raise AttributeError(
# f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
# )
if len(code_errors) > 0:
logger.error(f"The map processing has {len(code_errors)} errors")
......
This diff is collapsed.
MMCode,Condition,
1,,
2,Hypertension,
3,Depression,
4,Painful_condition,
5,Asthma,
6,Coronary Heart disease,
7,Treated_dyspepsia,
8,Diabetes,
9,Thyroid_disease,
10,Rheumatoid_arthritis_Inflammatory_arthropathies_and_connective_tissue_disorders,
11,Deafness,
12,COPD,
13,Anxiety,
14,Irritable_bowel_syndrome,
15,Cancer,
16,Alcohol_problem,
17,Other_psychoactive_substance_misuse,
18,Treated_constipation,
19,Stroke_or_TIA,
20,Chronic_kidney_disease,
21,Diverticular_disease,Need to add
22,Atrial_fibrillation,
23,Peripheral_vascular_disease,
24,Heart_failure,
25,Prostate_disorders,
26,Glaucoma,
27,Epilepsy_(Currently_treated),
28,Dementia,
29,Schizophrenia_or_bipolar_disorder,
30,Psoriasis_or_eczema,
31,Inflammatory_bowel_disease,
32,Migraine,
33,Blindness_and_low_vision,
34,Chronic_sinusitis,
35,Learning_disability,
36,Anorexia_or_bulimia,
37,Bronchiectasis,
38,Parkinson's_disease,
39,Multiple_sclerosis,
40,Viral_hepatitis,
41,Chronic_liver_disease,
This diff is collapsed.
This diff is collapsed.
concept_sets:
version: "v1.0.4"
omop:
vocabulary_id: "ACMC_Example"
vocabulary_name: "ACMC example phenotype"
vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
concept_set:
- concept_set_name: "CVD_EVENTS"
metadata: {}
- concept_set_name: "DID_NOT_ATTEND"
metadata: {}
- concept_set_name: "HYPERTENSION"
metadata: {}
- concept_set_name: "DEPRESSION"
metadata: {}
codes:
- folder: "clinical-codes-org"
description: "Downloaded 16/11/23"
files:
- file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
columns:
icd10: "code"
metadata: []
concept_set:
- name: "CVD_EVENTS"
- file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
columns:
read2: "code"
metadata: []
concept_set:
- name: "DID_NOT_ATTEND"
- folder: hanlon
description: Hanlon Paper Code Lists
files:
- file: Read_codes_for_diagnoses.csv
columns:
read2: Read Code
actions:
divide_col: MMCode
concept_set:
- name: HYPERTENSION
category: "2"
- name: DEPRESSION
category: "3"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment