Commit ca41792b authored by mjbonifa

refactor: completed refactoring of config to combine codes within concept sets

refactor: completed refactoring of config to combine codes within concept sets. This optimisation will shorten the file overall, even though for cases such as the Hanlon code lists it will be longer. It also makes the file easier to understand, as there are no longer separate links between concept sets and the code definitions. Closes #3.
parent 32cf82d6
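For context: the old configuration declared concept sets in a "concept_sets" summary section and attached codes to them in a separate "codes" section, so each code file had to link back to its concept set by name. The new layout nests the file definition directly under each concept set in a single phenotype.concept_sets list. A minimal sketch of loading the new shape (the YAML fragment is adapted from the ABDO_PAIN example changed below):

import yaml

# Minimal sketch of the new combined config shape; the fragment is
# adapted from the ABDO_PAIN example config in this commit.
config = yaml.safe_load("""
phenotype:
  version: "v1.0.1"
  concept_sets:
  - name: "ABDO_PAIN"
    file:
      path: "clinical-codes-org/Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
      columns:
        read2: "code"
    metadata: {}
""")

# Each concept set now carries its own file definition.
for concept_set in config["phenotype"]["concept_sets"]:
    print(concept_set["name"], "->", concept_set["file"]["path"])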
@@ -42,6 +42,7 @@ CODES_COL_ACTION = "codes_col"
DIVIDE_COL_ACTION = "divide_col"
COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION]
CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]
class PhenValidationException(Exception):
"""Custom exception class raised when validation errors in phenotype configuration file"""
@@ -190,16 +191,15 @@ def init(phen_dir, remote_url):
# create empty phen config file
config = {
"concept_sets": {
"phenotype": {
"version": initial_version,
"omop": {
"vocabulary_id": "",
"vocabulary_name": "",
"vocabulary_reference": "",
},
"concept_set": [],
},
"codes": [],
"concept_sets": [],
}
}
with open(phen_path / CONFIG_FILE, "w") as file:
@@ -257,7 +257,7 @@ def validate(phen_dir):
# Load configuration File
if config_path.suffix == ".yaml":
with config_path.open("r") as file:
mapping = yaml.safe_load(file)
phenotype = yaml.safe_load(file)
else:
raise Exception(
f"Unsupported configuration filetype: {str(config_path.resolve())}"
@@ -265,103 +265,69 @@ def validate(phen_dir):
# initialise
validation_errors = []
concept_sets = mapping["concept_sets"]
concept_codes = mapping["codes"]
phenotype = phenotype["phenotype"]
code_types = parse.CodeTypeParser().code_types
# check the version number is of the format vn.n.n
match = re.match(r"v(\d+\.\d+\.\d+)", concept_sets["version"])
match = re.match(r"v(\d+\.\d+\.\d+)", phenotype["version"])
if not match:
validation_errors.append(
f"Invalid version format in configuration file: {concept_sets['version']}"
f"Invalid version format in configuration file: {phenotype['version']}"
)
# create a list of all the concept set names defined in the concept set configuration
concept_set_names = []
for item in concept_sets["concept_set"]:
if item["concept_set_name"] in concept_set_names:
for item in phenotype["concept_sets"]:
if item["name"] in concept_set_names:
validation_errors.append(
f"Duplicate concept set defined in concept sets {item['concept_set_name'] }"
f"Duplicate concept set defined in concept sets {item['name'] }"
)
else:
concept_set_names.append(item["concept_set_name"])
concept_set_names.append(item["name"])
# TODO: change this to some sort of yaml schema validation
required_keys = {"name", "file", "metadata"}
# check codes definition
concept_set_mapping_names = []
for item in concept_codes:
for item in phenotype["concept_sets"]:
required_keys = {"folder", "files"}
if required_keys.issubset(item.keys()):
# check concept codes path is a directory
concept_code_dir_path = codes_path / item["folder"]
if not concept_code_dir_path.is_dir():
validation_errors.append(
f"Folder directory {str(concept_code_dir_path.resolve())} is not a directory"
)
for file in item["files"]:
# check concept code file exists
concept_code_file_path = concept_code_dir_path / file["file"]
concept_code_file_path = codes_path / item["file"]["path"]
if not concept_code_file_path.exists():
validation_errors.append(
f"Coding file {str(concept_code_file_path.resolve())} does not exist"
)
# check concept code file is not empty
concept_code_file_path = concept_code_dir_path / file["file"]
if concept_code_file_path.stat().st_size == 0:
validation_errors.append(
f"Coding file {str(concept_code_file_path.resolve())} is an empty file"
)
# check columns section exists
if "columns" not in file:
validation_errors.append(
f"Columns not defined for {concept_code_file_path}"
)
# check code file type is supported
if concept_code_file_path.suffix not in CODE_FILE_TYPES:
raise ValueError(f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types")
# check columns specified are a supported medical coding type
for column in file["columns"]:
if column not in code_types and column != "metadata":
for column in item["file"]["columns"]:
if column not in code_types:
validation_errors.append(
f"Column type {column} for file {concept_code_file_path} is not supported"
)
# check the actions are supported
if "actions" in file:
for action in file["actions"]:
if "actions" in item["file"]:
for action in item["file"]["actions"]:
if action not in COL_ACTIONS:
validation_errors.append(
f"Action {action} is not supported"
)
# check concept_set defined for the mapping
logger.debug(f"file {file}")
for concept_set_mapping in file["concept_set"]:
# store the concept set names found for later set operations
logger.debug(f"mapping {concept_set_mapping}")
if concept_set_mapping['name'] not in concept_set_mapping_names:
concept_set_mapping_names.append(concept_set_mapping['name'])
else:
validation_errors.append(
f"Missing required elements {required_keys} in codes {item}"
)
# create sets to perform set operations on the lists of concept set names
concept_set_names_set = set(concept_set_names)
concept_set_mapping_names_set = set(concept_set_mapping_names)
# check all concept sets in the summary section have at least one code mapping
concept_set_no_codes = list(concept_set_names_set - concept_set_mapping_names_set)
if len(concept_set_no_codes) > 0:
validation_errors.append(
f"Concept sets do not exist in codes {concept_set_no_codes}"
)
# check all concept sets included in the code mapping are defined in the summary concept_set section
codes_no_concept_set = list(concept_set_mapping_names_set - concept_set_names_set)
if len(codes_no_concept_set) > 0:
validation_errors.append(
f"Concept sets mapped in codes do not exist in the concept sets: {codes_no_concept_set}"
f"Missing required elements {required_keys} in concept set {item}"
)
if len(validation_errors) > 0:
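The TODO above asks for proper YAML schema validation in place of these hand-rolled key checks. A minimal sketch of what that could look like with the third-party jsonschema package (an assumption, not a dependency of this code; the schema only covers the keys checked above, and phenotype_config stands for the dict loaded from the YAML file):

import jsonschema  # assumed dependency, not currently used by this module

# Assumed schema, limited to the keys the hand-rolled checks inspect.
PHENOTYPE_SCHEMA = {
    "type": "object",
    "required": ["phenotype"],
    "properties": {
        "phenotype": {
            "type": "object",
            "required": ["version", "concept_sets"],
            "properties": {
                "version": {"type": "string", "pattern": r"^v\d+\.\d+\.\d+$"},
                "concept_sets": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "required": ["name", "file", "metadata"],
                    },
                },
            },
        }
    },
}

try:
    jsonschema.validate(instance=phenotype_config, schema=PHENOTYPE_SCHEMA)
except jsonschema.ValidationError as e:
    validation_errors.append(e.message)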
@@ -378,9 +344,11 @@ def read_table_file(path, excel_sheet=None):
"""
Load Code List File
"""
path = path.resolve()
if path.suffix == ".csv":
df = pd.read_csv(path, dtype=str)
elif path.suffix == ".xlsx":
elif path.suffix == ".xlsx" or path.suffix == ".xls":
if excel_sheet:
df = pd.read_excel(path, sheet_name=excel_sheet, dtype=str)
else:
@@ -388,21 +356,21 @@ def read_table_file(path, excel_sheet=None):
elif path.suffix == ".dta":
df = pd.read_stata(path, dtype=str)
else:
raise Exception(f"Unsupported filetype provided for source file {path.suffix}")
raise ValueError(f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types")
return df
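A brief usage sketch of read_table_file (the path is illustrative; any .csv, .xlsx, .xls or .dta file works). Reading with dtype=str matters for clinical codes, since a code such as "0123" would otherwise lose its leading zero:

from pathlib import Path

# Illustrative path; every column comes back as a string column.
df = read_table_file(Path("codes/clinical-codes-org/example.csv"))
print(df.dtypes)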
def process_actions(df, file):
def process_actions(df, concept_set):
# Perform Structural Changes to file before preprocessing
logger.debug("Processing file structural actions")
if (
"actions" in file
and "split_col" in file["actions"]
and "codes_col" in file["actions"]
"actions" in concept_set["file"]
and "split_col" in concept_set["file"]["actions"]
and "codes_col" in concept_set["file"]["actions"]
):
split_col = file["actions"]["split_col"]
codes_col = file["actions"]["codes_col"]
split_col = concept_set["file"]["actions"]["split_col"]
codes_col = concept_set["file"]["actions"]["codes_col"]
logger.debug(
"Action: Splitting",
split_col,
@@ -419,33 +387,33 @@ def process_actions(df, file):
# Perform QA Checks on columns individually and append to df
def preprocess_codes(df, file, target_code_type=None, codes_file=None):
def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
"""Parses each column individually - Order and length will not be preserved!"""
out = pd.DataFrame([]) # create output df to append to
code_errors = [] # list of errors from processing
metadata_df = pd.DataFrame()
meta_columns = [] # meta columns to keep with codes
if "actions" in file and "divide_col" in file["actions"]:
meta_columns += [file["actions"]["divide_col"]]
# TODO: enable metacolumns to be outputted - problem with map_file appending
if "metadata" in file["columns"]:
meta_columns += file["columns"]["metadata"]
if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]:
meta_columns += [concept_set["file"]["actions"]["divide_col"]]
metadata_df = df[meta_columns]
# TODO: enable metacolumns to be outputted - problem with map_file appending
# if "metadata" in file["columns"]:
# meta_columns += file["columns"]["metadata"]
# Preprocess codes
code_types = parse.CodeTypeParser().code_types
for code_type_name, code_type_parser in code_types.items():
if code_type_name in file["columns"]:
if code_type_name in concept_set["file"]["columns"]:
logger.info(f"Processing {code_type_name} codes...")
# get code types
codes = df[file["columns"][code_type_name]].dropna()
codes = df[concept_set["file"]["columns"][code_type_name]].dropna()
codes = codes.astype(str) # convert to string
codes = codes.str.strip() # remove excess spaces
# process codes, validating them using parser and returning the errors
codes, errors = code_type_parser.process(codes, codes_file)
codes, errors = code_type_parser.process(codes, code_file_path)
if len(errors) > 0:
code_errors.extend(errors)
logger.warning(f"Codes validation failed with {len(errors)} errors")
@@ -491,10 +459,7 @@ def translate_codes(df, target_code_type):
# Append file's codes to output Df with concept
def map_file(df, target_code_type, out, concept_names, meta_columns=[]):
# separate out meta_columns
metadata_df = df[meta_columns]
df = df.drop(columns=meta_columns)
def map_file(df, target_code_type, out, concept_name):
# translate codes
codes = translate_codes(df, target_code_type)
@@ -503,9 +468,7 @@ def map_file(df, target_code_type, out, concept_names, meta_columns=[]):
# Append to output if translated
if len(codes) > 0:
codes = pd.DataFrame({"CONCEPT": codes})
codes = codes.join(metadata_df)
for concept in concept_names:
codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
codes["CONCEPT_SET"] = np.repeat(concept_name.strip(), len(codes))
out = pd.concat([out, codes])
else:
logger.debug(f"No codes converted with target code type {target_code_type}")
@@ -588,37 +551,29 @@ def map(phen_dir, target_code_type):
# load configuration
with config_path.open("r") as file:
config = yaml.safe_load(file)
concept_sets = config["concept_sets"]
codes = config["codes"]
phenotype = config["phenotype"]
# Create output dataframe
out = pd.DataFrame([])
code_errors = []
# Process each folder in codes section
for folder in codes:
for file in folder["files"]:
logger.debug(f"--- {file['file']} ---")
codes_file_path = codes_path / folder["folder"] / file["file"]
for concept_set in phenotype["concept_sets"]:
logger.debug(f"--- {concept_set['file']} ---")
# Load Code File
if "excel_sheet" in file:
df = read_table_file(
path=codes_file_path, excel_sheet=file["excel_sheet"]
)
else:
df = read_table_file(path=codes_file_path)
codes_file_path = Path(codes_path / concept_set["file"]["path"])
df = read_table_file(codes_file_path)
# process structural actions
df = process_actions(df, file)
df = process_actions(df, concept_set)
# Preprocessing & Validation Checks
logger.debug("Processing and validating code formats")
df, meta_columns, errors = preprocess_codes(
df,
file,
codes_file=str(codes_file_path.resolve()),
concept_set,
codes_file_path,
target_code_type=target_code_type,
)
@@ -628,40 +583,35 @@ def map(phen_dir, target_code_type):
logger.debug(f" Length of code_errors {len(code_errors)}")
# partition table by categorical column
if "actions" in file and "divide_col" in file["actions"] and len(df) > 0:
divide_col = file["actions"]["divide_col"]
logger.debug(f"Action: Dividing Table by {divide_col} column into: {df[divide_col].unique()}")
if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0:
divide_col = concept_set["file"]["actions"]["divide_col"]
logger.debug(f"Action: Dividing Table by {divide_col}")
logger.debug(df.head())
logger.debug(f"column into: {df[divide_col].unique()}")
df = df.groupby(divide_col)
# Map to Concept/Phenotype
# TODO: This code needs refactoring; handling of the concept_set_categories should happen elsewhere
logger.debug(f"instance of df before if: {type(df)}")
if isinstance(df, pd.core.frame.DataFrame):
concept_names = [concept['name'] for concept in file["concept_set"]]
out = map_file(
df,
target_code_type,
out,
concept_names=concept_names,
meta_columns=meta_columns,
concept_name=concept_set['name']
)
elif isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
meta_columns.remove(divide_col) # delete categorical column
for cat, grp in df:
for concept_set in file['concept_set']:
# if there's no matching category, this will raise an error
if cat == concept_set["category"]:
if cat == concept_set["file"]["category"]:
grp = grp.drop(
columns=[divide_col]
) # delete categorical column
logger.debug(f"Mapping category: {cat}")
concept_names = [concept_set["name"]]
out = map_file(
grp,
target_code_type,
out,
concept_names=concept_names,
meta_columns=meta_columns,
concept_name=concept_set['name']
)
else:
logger.debug(f"instance of df: {type(df)}")
@@ -682,26 +632,12 @@ def map(phen_dir, target_code_type):
raise Exception(
f"No output after map processing, check config {str(config_path.resolve())}"
)
# Final processing
out = out.reset_index(drop=True)
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
# Add concept set definition metadata
concept_sets_df = pd.DataFrame(
concept_sets["concept_set"]
) # transform to dataframe
if "metadata" in concept_sets_df.columns:
concept_sets_df = concept_sets_df.join(
pd.json_normalize(concept_sets_df["metadata"])
) # metadata to columns
concept_sets_df = concept_sets_df.drop(columns=["metadata"])
concept_sets_df = concept_sets_df.rename(
columns={"concept_set_name": "CONCEPT_SET"}
)
concept_sets_df = concept_sets_df.drop_duplicates() # remove duplicates
out = out.merge(concept_sets_df, how="left", on="CONCEPT_SET") # merge with output
# Save output to map directory
output_filename = target_code_type + ".csv"
map_path = phen_path / MAP_DIR / output_filename
@@ -762,7 +698,7 @@ def publish(phen_dir):
config_path = phen_path / CONFIG_FILE
with config_path.open("r") as file:
config = yaml.safe_load(file)
match = re.match(r"v(\d+\.\d+)", config["concept_sets"]["version"])
match = re.match(r"v(\d+\.\d+)", config["phenotype"]["version"])
major_version = match.group(1)
# get latest minor version from git commit count
@@ -772,7 +708,7 @@
next_minor_version = commit_count + 1
version = f"v{major_version}.{next_minor_version}"
logger.debug(f"New version: {version}")
config["concept_sets"]["version"] = version
config["phenotype"]["version"] = version
with open(config_path, "w") as file:
yaml.dump(config, file, default_flow_style=False, sort_keys=False)
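Concretely, publish keeps the major version from the config and derives the next minor version from the commit count; a worked example (values are illustrative):

# e.g. a config at major version 1.0 with 7 commits recorded
major_version = "1.0"  # captured by re.match(r"v(\d+\.\d+)", ...)
commit_count = 7       # from the git commit count (assumed source)
version = f"v{major_version}.{commit_count + 1}"  # -> "v1.0.8"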
@@ -826,8 +762,8 @@ def export(phen_dir, version):
export_db_path = omop.export(
map_path,
export_path,
config["concept_sets"]["version"],
config["concept_sets"]["omop"],
config["phenotype"]["version"],
config["phenotype"]["omop"],
)
# write to tables
@@ -925,7 +861,7 @@ def diff(phen_dir, phen_old_dir):
with new_config.open("r") as file:
new_config = yaml.safe_load(file)
report.write(
f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n"
f"\n\n# Report for version {new_config['phenotype']['version']}\n\n"
)
report.write(f"- Removed outputs: {list(removed_outputs)}\n")
report.write(f"- Added outputs: {list(added_outputs)}\n")

-concept_sets:
+phenotype:
   version: "v1.0.1"
   omop:
     vocabulary_id: "ACMC_Example"
     vocabulary_name: "ACMC example phenotype"
     vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
-  concept_set:
-  - concept_set_name: "ABDO_PAIN"
-    metadata: {}
-codes:
-- folder: "clinical-codes-org"
-  description: "Downloaded 16/11/23"
-  files:
-  - file: "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
-    columns:
-      read2: "code"
-      metadata:
-      - "description"
-    concept_set:
-    - name: "ABDO_PAIN"
+  concept_sets:
+  - name: "ABDO_PAIN"
+    file:
+      path: "clinical-codes-org/Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
+      columns:
+        read2: "code"
+    metadata: {}

-concept_sets:
+phenotype:
   version: "v1.0.4"
   omop:
     vocabulary_id: "ACMC_Example"
     vocabulary_name: "ACMC example phenotype"
     vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
-  concept_set:
-  - concept_set_name: "CVD_EVENTS"
-    metadata: {}
-  - concept_set_name: "DID_NOT_ATTEND"
-    metadata: {}
-codes:
-- folder: "clinical-codes-org"
-  description: "Downloaded 16/11/23"
-  files:
-  - file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
-    columns:
-      icd10: "code"
-      metadata: []
-    concept_set:
-    - name: "CVD_EVENTS"
-  - file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
-    columns:
-      read2: "code"
-      metadata: []
-    concept_set:
-    - name: "DID_NOT_ATTEND"
+  concept_sets:
+  - name: "CVD_EVENTS"
+    file:
+      path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
+      columns:
+        icd10: "code"
+    metadata: {}
+  - name: "DID_NOT_ATTEND"
+    file:
+      path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
+      columns:
+        read2: "code"
+    metadata: {}
\ No newline at end of file

-concept_sets:
+phenotype:
   version: "v1.0.4"
   omop:
     vocabulary_id: "ACMC_Example"
     vocabulary_name: "ACMC example phenotype"
     vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
-  concept_set:
-  - concept_set_name: "CVD_EVENTS"
-    metadata: {}
-  - concept_set_name: "DID_NOT_ATTEND"
-    metadata: {}
-  - concept_set_name: "HYPERTENSION"
-    metadata: {}
-  - concept_set_name: "DEPRESSION"
-    metadata: {}
-codes:
-- folder: "clinical-codes-org"
-  description: "Downloaded 16/11/23"
-  files:
-  - file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
-    columns:
-      icd10: "code"
-      metadata: []
-    concept_set:
-    - name: "CVD_EVENTS"
-  - file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
-    columns:
-      read2: "code"
-      metadata: []
-    concept_set:
-    - name: "DID_NOT_ATTEND"
-- folder: hanlon
-  description: Hanlon Paper Code Lists
-  files:
-  - file: Read_codes_for_diagnoses.csv
-    columns:
-      read2: Read Code
-    actions:
-      divide_col: MMCode
-    concept_set:
-    - name: HYPERTENSION
-      category: "2"
-    - name: DEPRESSION
-      category: "3"
+  concept_sets:
+  - name: "CVD_EVENTS"
+    file:
+      path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
+      columns:
+        icd10: "code"
+    metadata: {}
+  - name: "DID_NOT_ATTEND"
+    file:
+      path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
+      columns:
+        read2: "code"
+    metadata: {}
+  - name: "HYPERTENSION"
+    file:
+      path: "hanlon/Read_codes_for_diagnoses.csv"
+      columns:
+        read2: "Read Code"
+      category: "2"
+      actions:
+        divide_col: "MMCode"
+    metadata: {}
+  - name: "DEPRESSION"
+    file:
+      path: "hanlon/Read_codes_for_diagnoses.csv"
+      columns:
+        read2: "Read Code"
+      category: "3"
+      actions:
+        divide_col: "MMCode"
+    metadata: {}