From ca41792b0c49bafa83a1cc0b7f791770fafc3531 Mon Sep 17 00:00:00 2001
From: Michael Boniface <m.j.boniface@soton.ac.uk>
Date: Mon, 24 Feb 2025 18:05:32 +0000
Subject: [PATCH] refactor: completed refactoring of config to combine codes
 within concept sets. This optimisation will shorten the file overall, even
 though for examples like hanlon it will be longer. It makes the file easier
 to understand because there are no longer separate links between concept
 sets and the code definitions. Closes #3.

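A minimal sketch of the new combined layout (taken from the updated
examples/config1.yaml in this patch), with the code file definition nested
directly inside its concept set:

    phenotype:
      version: "v1.0.1"
      concept_sets:
        - name: "ABDO_PAIN"
          file:
            path: "clinical-codes-org/Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
            columns:
              read2: "code"
          metadata: {}
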
---
 acmc/phen.py          | 318 +++++++++++++++++-------------------------
 examples/config1.yaml |  21 +--
 examples/config2.yaml |  30 ++--
 examples/config3.yaml |  60 ++++----
 4 files changed, 169 insertions(+), 260 deletions(-)

diff --git a/acmc/phen.py b/acmc/phen.py
index 4b22e84..43ad09e 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -42,6 +42,7 @@ CODES_COL_ACTION = "codes_col"
 DIVIDE_COL_ACTION = "divide_col"
 COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION]
 
+CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]
 
 class PhenValidationException(Exception):
     """Custom exception class raised when validation errors in phenotype configuration file"""
@@ -190,16 +191,15 @@ def init(phen_dir, remote_url):
 
     # create empty phen config file
     config = {
-        "concept_sets": {
+        "phenotype": {
             "version": initial_version,
             "omop": {
                 "vocabulary_id": "",
                 "vocabulary_name": "",
                 "vocabulary_reference": "",
             },
-            "concept_set": [],
-        },
-        "codes": [],
+            "concept_sets": [],
+        }
     }
 
     with open(phen_path / CONFIG_FILE, "w") as file:
@@ -257,7 +257,7 @@ def validate(phen_dir):
     # Load configuration File
     if config_path.suffix == ".yaml":
         with config_path.open("r") as file:
-            mapping = yaml.safe_load(file)
+            phenotype = yaml.safe_load(file)
     else:
         raise Exception(
             f"Unsupported configuration filetype: {str(config_path.resolve())}"
@@ -265,104 +265,70 @@ def validate(phen_dir):
 
-    # initiatise
+    # initialise
     validation_errors = []
-    concept_sets = mapping["concept_sets"]
-    concept_codes = mapping["codes"]
+    phenotype = phenotype["phenotype"]
     code_types = parse.CodeTypeParser().code_types
 
     # check the version number is of the format vn.n.n
-    match = re.match(r"v(\d+\.\d+\.\d+)", concept_sets["version"])
+    match = re.match(r"v(\d+\.\d+\.\d+)", phenotype["version"])
     if not match:
         validation_errors.append(
-            f"Invalid version format in configuration file: {concept_sets['version']}"
+            f"Invalid version format in configuration file: {phenotype['version']}"
         )
 
     # create a list of all the concept set names defined in the concept set configuration
     concept_set_names = []
-    for item in concept_sets["concept_set"]:
-        if item["concept_set_name"] in concept_set_names:
+    for item in phenotype["concept_sets"]:
+        if item["name"] in concept_set_names:
             validation_errors.append(
-                f"Duplicate concept set defined in concept sets {item['concept_set_name'] }"
+                f"Duplicate concept set defined in concept sets {item['name'] }"
             )
         else:
-            concept_set_names.append(item["concept_set_name"])
+            concept_set_names.append(item["name"])
 
+    # TODO: change this to some sort of yaml schema validation
+    required_keys = {"name", "file", "metadata"}
+
     # check codes definition
-    concept_set_mapping_names = []
-    for item in concept_codes:
+    for item in phenotype["concept_sets"]:
 
-        required_keys = {"folder", "files"}
         if required_keys.issubset(item.keys()):
-            # check concept codes path is a directory
-            concept_code_dir_path = codes_path / item["folder"]
-            if not concept_code_dir_path.is_dir():
+
+            # check concept code file exists
+            concept_code_file_path = codes_path / item["file"]["path"]
+            if not concept_code_file_path.exists():
                 validation_errors.append(
-                    f"Folder directory {str(concept_code_dir_path.resolve())} is not a directory"
+                    f"Coding file {str(concept_code_file_path.resolve())} does not exist"
                 )
 
-            for file in item["files"]:
-                # check concepte code file exists
-                concept_code_file_path = concept_code_dir_path / file["file"]
-                if not concept_code_file_path.exists():
-                    validation_errors.append(
-                        f"Coding file {str(concept_code_file_path.resolve())} does not exist"
-                    )
-
-                # check concepte code file is not empty
-                concept_code_file_path = concept_code_dir_path / file["file"]
-                if concept_code_file_path.stat().st_size == 0:
-                    validation_errors.append(
-                        f"Coding file {str(concept_code_file_path.resolve())} is an empty file"
-                    )
+            # check concept code file is not empty
+            if concept_code_file_path.stat().st_size == 0:
+                validation_errors.append(
+                    f"Coding file {str(concept_code_file_path.resolve())} is an empty file"
+                )
 
-                # check columns section exists
-                if "columns" not in file:
+            # check code file type is supported
+            if concept_code_file_path.suffix not in CODE_FILE_TYPES:
+                validation_errors.append(f"Unsupported filetype {concept_code_file_path.suffix}, only {CODE_FILE_TYPES} code file types are supported")
+
+            # check that specified columns are a supported medical coding type
+            for column in item["file"]["columns"]:
+                if column not in code_types:
                     validation_errors.append(
-                        f"Columns not defined for {concept_code_file_path}"
+                        f"Column type {column} for file {concept_code_file_path} is not supported"
                     )
 
-                # check columns specified are a supported medical coding type
-                for column in file["columns"]:
-                    if column not in code_types and column != "metadata":
+            # check the actions are supported
+            if "actions" in item["file"]:
+                for action in item["file"]["actions"]:
+                    if action not in COL_ACTIONS:
                         validation_errors.append(
-                            f"Column type {column} for file {concept_code_file_path} is not supported"
+                            f"Action {action} is not supported"
                         )
 
-                # check the actions are supported
-                if "actions" in file:
-                    for action in file["actions"]:
-                        if action not in COL_ACTIONS:
-                            validation_errors.append(
-                                f"Action {action} is not supported"
-                            )
-
-                # check concept_set defined for the mapping
-                logger.debug(f"file {file}")
-                for concept_set_mapping in file["concept_set"]:
-                    # store the concept set names found for later set operations
-                    logger.debug(f"mapping {concept_set_mapping}")
-                    if concept_set_mapping['name'] not in concept_set_mapping_names:
-                        concept_set_mapping_names.append(concept_set_mapping['name'])
         else:
             validation_errors.append(
-                f"Missing required elements {required_keys} in codes {item}"
+                f"Missing required elements {required_keys} in concept set {item}"
             )
-    # create sets to perform set operations on the lists of concept set names
-    concept_set_names_set = set(concept_set_names)
-    concept_set_mapping_names_set = set(concept_set_mapping_names)
-
-    # check all concept sets in the summary section have at least one code mapping
-    concept_set_no_codes = list(concept_set_names_set - concept_set_mapping_names_set)
-    if len(concept_set_no_codes) > 0:
-        validation_errors.append(
-            f"Concept sets do not exist in codes {concept_set_no_codes}"
-        )
-
-    # check all concept sets included in the code mapping are defined in the summary concept_set section
-    codes_no_concept_set = list(concept_set_mapping_names_set - concept_set_names_set)
-    if len(codes_no_concept_set) > 0:
-        validation_errors.append(
-            f"Concept sets mapped in codes do not exist in the concept sets: {codes_no_concept_set}"
-        )
 
     if len(validation_errors) > 0:
         logger.error(validation_errors)
@@ -378,9 +344,11 @@ def read_table_file(path, excel_sheet=None):
     """
     Load Code List File
     """
+
+    path = path.resolve()
     if path.suffix == ".csv":
         df = pd.read_csv(path, dtype=str)
-    elif path.suffix == ".xlsx":
+    elif path.suffix == ".xlsx" or path.suffix == ".xls":
         if excel_sheet:
             df = pd.read_excel(path, sheet_name=excel_sheet, dtype=str)
         else:
@@ -388,21 +356,21 @@ def read_table_file(path, excel_sheet=None):
     elif path.suffix == ".dta":
         df = pd.read_stata(path, dtype=str)
     else:
-        raise Exception(f"Unsupported filetype provided for source file {path.suffix}")
+        raise ValueError(f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types")   
 
     return df
 
 
-def process_actions(df, file):
+def process_actions(df, concept_set):
     # Perform Structural Changes to file before preprocessing
     logger.debug("Processing file structural actions")
     if (
-        "actions" in file
-        and "split_col" in file["actions"]
-        and "codes_col" in file["actions"]
+        "actions" in concept_set["file"]
+        and "split_col" in concept_set["file"]["actions"]
+        and "codes_col" in concept_set["file"]["actions"]
     ):
-        split_col = file["actions"]["split_col"]
-        codes_col = file["actions"]["codes_col"]
+        split_col = concept_set["file"]["actions"]["split_col"]
+        codes_col = concept_set["file"]["actions"]["codes_col"]
         logger.debug(
             "Action: Splitting",
             split_col,
@@ -419,33 +387,33 @@ def process_actions(df, file):
 
 
 # Perform QA Checks on columns individually and append to df
-def preprocess_codes(df, file, target_code_type=None, codes_file=None):
+def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
     """Parses each column individually - Order and length will not be preserved!"""
     out = pd.DataFrame([])  # create output df to append to
     code_errors = []  # list of errors from processing
 
+    metadata_df = pd.DataFrame()
     meta_columns = []  # meta columns to keep with codes
-    if "actions" in file and "divide_col" in file["actions"]:
-        meta_columns += [file["actions"]["divide_col"]]
+    if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]:
+        meta_columns += [concept_set["file"]["actions"]["divide_col"]]
+        metadata_df = df[meta_columns]
     # TODO: enable metacolumns to be outputted - problem with map_file appending
-    if "metadata" in file["columns"]:
-        meta_columns += file["columns"]["metadata"]
-
-    metadata_df = df[meta_columns]
+   # if "metadata" in file["columns"]:
+   #     meta_columns += file["columns"]["metadata"]
 
     # Preprocess codes
     code_types = parse.CodeTypeParser().code_types
     for code_type_name, code_type_parser in code_types.items():
-        if code_type_name in file["columns"]:
+        if code_type_name in concept_set["file"]["columns"]:
             logger.info(f"Processing {code_type_name} codes...")
 
             # get code types
-            codes = df[file["columns"][code_type_name]].dropna()
+            codes = df[concept_set["file"]["columns"][code_type_name]].dropna()
             codes = codes.astype(str)  # convert to string
             codes = codes.str.strip()  # remove excess spaces
 
             # process codes, validating them using parser and returning the errors
-            codes, errors = code_type_parser.process(codes, codes_file)
+            codes, errors = code_type_parser.process(codes, code_file_path)
             if len(errors) > 0:
                 code_errors.extend(errors)
                 logger.warning(f"Codes validation failed with {len(errors)} errors")
@@ -491,10 +459,7 @@ def translate_codes(df, target_code_type):
 
 
 # Append file's codes to output Df with concept
-def map_file(df, target_code_type, out, concept_names, meta_columns=[]):
-    # seperate out meta_columns
-    metadata_df = df[meta_columns]
-    df = df.drop(columns=meta_columns)
+def map_file(df, target_code_type, out, concept_name):
 
     # translate codes
     codes = translate_codes(df, target_code_type)
@@ -503,10 +468,8 @@ def map_file(df, target_code_type, out, concept_names, meta_columns=[]):
     # Append to output if translated
     if len(codes) > 0:
         codes = pd.DataFrame({"CONCEPT": codes})
-        codes = codes.join(metadata_df)
-        for concept in concept_names:
-            codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
-            out = pd.concat([out, codes])
+        codes["CONCEPT_SET"] = np.repeat(concept_name.strip(), len(codes))
+        out = pd.concat([out, codes])
     else:
         logger.debug(f"No codes converted with target code type {target_code_type}")
 
@@ -588,86 +551,73 @@ def map(phen_dir, target_code_type):
     # load configuration
     with config_path.open("r") as file:
         config = yaml.safe_load(file)
-
-    concept_sets = config["concept_sets"]
-    codes = config["codes"]
+    phenotype = config["phenotype"]
 
     # Create output dataframe
     out = pd.DataFrame([])
     code_errors = []
 
-    # Process each folder in codes section
+    # Process each concept set in the phenotype
-    for folder in codes:
-        for file in folder["files"]:
-            logger.debug(f"--- {file['file']} ---")
-            codes_file_path = codes_path / folder["folder"] / file["file"]
-
-            # Load Code File
-            if "excel_sheet" in file:
-                df = read_table_file(
-                    path=codes_file_path, excel_sheet=file["excel_sheet"]
-                )
-            else:
-                df = read_table_file(path=codes_file_path)
-
-            # process structural actions
-            df = process_actions(df, file)
+    for concept_set in phenotype["concept_sets"]:
+        logger.debug(f"--- {concept_set['file']} ---")
+
+        # Load Code File
+        codes_file_path = codes_path / concept_set["file"]["path"]
+        df = read_table_file(codes_file_path)
+
+        # process structural actions
+        df = process_actions(df, concept_set)
+
+        # Preprocessing & Validation Checks
+        logger.debug("Processing and validating code formats")
+        df, meta_columns, errors = preprocess_codes(
+            df,
+            concept_set,
+            codes_file_path,
+            target_code_type=target_code_type,
+        )
 
-            # Preprocessing & Validation Checks
-            logger.debug("Processing and validating code formats")
-            df, meta_columns, errors = preprocess_codes(
+        logger.debug(f"Length of errors from preprocess {len(errors)}")
+        if len(errors) > 0:
+            code_errors.extend(errors)
+        logger.debug(f" Length of code_errors {len(code_errors)}")
+
+        # partition table by categorical column
+        if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0:
+            divide_col = concept_set["file"]["actions"]["divide_col"]
+            logger.debug(f"Action: Dividing Table by {divide_col}")
+            logger.debug(df.head())
+            logger.debug(f"column into: {df[divide_col].unique()}")
+            df = df.groupby(divide_col)
+
+        # Map to Concept/Phenotype
+        # TODO: this code needs refactoring; handling of the concept_set_categories should happen elsewhere
+        logger.debug(f"instance of df before if: {type(df)}")
+        if isinstance(df, pd.core.frame.DataFrame):
+            out = map_file(
                 df,
-                file,
-                codes_file=str(codes_file_path.resolve()),
-                target_code_type=target_code_type,
+                target_code_type,
+                out,
+                concept_name=concept_set['name']
             )
-
-            logger.debug(f" Length of errors from preprocess {len(errors)}")
-            if len(errors) > 0:
-                code_errors.extend(errors)
-            logger.debug(f" Length of code_errors {len(code_errors)}")
-
-            # partition table by categorical column
-            if "actions" in file and "divide_col" in file["actions"] and len(df) > 0:
-                divide_col = file["actions"]["divide_col"]
-                logger.debug(f"Action: Dividing Table by {divide_col} column into: {df[divide_col].unique()}")
-                df = df.groupby(divide_col)
-
-            # Map to Concept/Phenotype
-            # TODO: This code needs refactory as it seems handling of the concept_set_categories should happen at another place
-            logger.debug(f"instance of df before if: {type(df)}")            
-            if isinstance(df, pd.core.frame.DataFrame):
-                concept_names = [concept['name'] for concept in file["concept_set"]]
-                out = map_file(
-                    df,
-                    target_code_type,
-                    out,
-                    concept_names=concept_names,
-                    meta_columns=meta_columns,
-                )
-            elif isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
-                meta_columns.remove(divide_col)  # delete categorical column             
-                for cat, grp in df:
-                    for concept_set in file['concept_set']:
-                        # what if there's no category, there's going to be an error                     
-                        if cat == concept_set["category"]:                       
-                            grp = grp.drop(
-                                columns=[divide_col]
-                            )  # delete categorical column
-                            logger.debug(f"Mapping category: {cat}")
-                            concept_names = [concept_set["name"]]
-                            out = map_file(
-                                grp,
-                                target_code_type,
-                                out,
-                                concept_names=concept_names,
-                                meta_columns=meta_columns,
-                            )                     
-                else:
-                    logger.debug(f"instance of df: {type(df)}")
-                   # raise AttributeError(
-                   #     f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
-                   # )
+        elif isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
+            for cat, grp in df:
+                # what if there's no category? there's going to be an error
+                if cat == concept_set["file"]["category"]:
+                    grp = grp.drop(
+                        columns=[divide_col]
+                    )  # delete categorical column
+                    out = map_file(
+                        grp,
+                        target_code_type,
+                        out,
+                        concept_name=concept_set['name']
+                    )
+        else:
+            logger.debug(f"instance of df: {type(df)}")
+            # raise AttributeError(
+            #     f"File {concept_set['file']} has either no concept_set or concept_set_categories, or the associated dataframe instance is incorrect; concept_set must be a DataFrame, concept_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
+            # )
 
     if len(code_errors) > 0:
         logger.error(f"The map processing has {len(code_errors)} errors")
@@ -682,26 +632,12 @@ def map(phen_dir, target_code_type):
         raise Exception(
             f"No output after map processing, check config {str(config_path.resolve())}"
         )
+
     # Final processing
     out = out.reset_index(drop=True)
     out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
     out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
 
-    # Add concept set definition metadata
-    concept_sets_df = pd.DataFrame(
-        concept_sets["concept_set"]
-    )  # transform to dataframe
-    if "metadata" in concept_sets_df.columns:
-        concept_sets_df = concept_sets_df.join(
-            pd.json_normalize(concept_sets_df["metadata"])
-        )  # metadata to columns
-        concept_sets_df = concept_sets_df.drop(columns=["metadata"])
-    concept_sets_df = concept_sets_df.rename(
-        columns={"concept_set_name": "CONCEPT_SET"}
-    )
-    concept_sets_df = concept_sets_df.drop_duplicates()  # remove duplicates
-    out = out.merge(concept_sets_df, how="left", on="CONCEPT_SET")  # merge with output
-
     # Save output to map directory
     output_filename = target_code_type + ".csv"
     map_path = phen_path / MAP_DIR / output_filename
@@ -762,7 +698,7 @@ def publish(phen_dir):
     config_path = phen_path / CONFIG_FILE
     with config_path.open("r") as file:
         config = yaml.safe_load(file)
-    match = re.match(r"v(\d+\.\d+)", config["concept_sets"]["version"])
+    match = re.match(r"v(\d+\.\d+)", config["phenotype"]["version"])
     major_version = match.group(1)
 
     # get latest minor version from git commit count
@@ -772,7 +708,7 @@ def publish(phen_dir):
     next_minor_version = commit_count + 1
     version = f"v{major_version}.{next_minor_version}"
     logger.debug(f"New version: {version}")
-    config["concept_sets"]["version"] = version
+    config["phenotype"]["version"] = version
     with open(config_path, "w") as file:
         yaml.dump(config, file, default_flow_style=False, sort_keys=False)
 
@@ -826,8 +762,8 @@ def export(phen_dir, version):
     export_db_path = omop.export(
         map_path,
         export_path,
-        config["concept_sets"]["version"],
-        config["concept_sets"]["omop"],
+        config["phenotype"]["version"],
+        config["phenotype"]["omop"],
     )
 
     # write to tables
@@ -925,7 +861,7 @@ def diff(phen_dir, phen_old_dir):
     with new_config.open("r") as file:
         new_config = yaml.safe_load(file)
     report.write(
-        f"\n\n# Report for version {new_config['concept_sets']['version']}\n\n"
+        f"\n\n# Report for version {new_config['phenotype']['version']}\n\n"
     )
     report.write(f"- Removed outputs: {list(removed_outputs)}\n")
     report.write(f"- Added outputs: {list(added_outputs)}\n")
diff --git a/examples/config1.yaml b/examples/config1.yaml
index f8c21bc..2f446aa 100644
--- a/examples/config1.yaml
+++ b/examples/config1.yaml
@@ -1,23 +1,14 @@
-concept_sets:
+phenotype:
   version: "v1.0.1"
   omop:
     vocabulary_id: "ACMC_Example"
     vocabulary_name: "ACMC example phenotype"
     vocabulary_reference: "https://git.soton.ac.uk/meldb/concepts-processing/-/tree/main/examples"
-  concept_set:
-    - concept_set_name: "ABDO_PAIN"
-      metadata: {}
-
-codes:
-  - folder: "clinical-codes-org"
-    description: "Downloaded 16/11/23"
-    files:
-      - file: "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
+  concept_sets:
+    - name: "ABDO_PAIN"
+      file:
+        path: "clinical-codes-org/Symptom code lists/Abdominal pain/res176-abdominal-pain.csv"
         columns:
           read2: "code"
-          metadata:
-            - "description"
-        concept_set:
-          - name: "ABDO_PAIN"
-
+      metadata: {}
 
diff --git a/examples/config2.yaml b/examples/config2.yaml
index ccf2839..24acf96 100644
--- a/examples/config2.yaml
+++ b/examples/config2.yaml
@@ -1,29 +1,19 @@
-concept_sets:
+phenotype:
   version: "v1.0.4"
   omop:
     vocabulary_id: "ACMC_Example"
     vocabulary_name: "ACMC example phenotype"
     vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
-  concept_set:
-    - concept_set_name: "CVD_EVENTS"
-      metadata: {}
-    - concept_set_name: "DID_NOT_ATTEND"
-      metadata: {}
-
-codes:
-  - folder: "clinical-codes-org"
-    description: "Downloaded 16/11/23"
-    files:
-      - file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
+  concept_sets:
+    - name: "CVD_EVENTS"
+      file:
+        path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
         columns:
           icd10: "code"
-          metadata: []
-        concept_set:
-          - name: "CVD_EVENTS"
-      - file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
+      metadata: {}
+    - name: "DID_NOT_ATTEND"
+      file:
+        path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
         columns:
           read2: "code"
-          metadata: []
-        concept_set:
-          - name: "DID_NOT_ATTEND"
-
+      metadata: {}
\ No newline at end of file
diff --git a/examples/config3.yaml b/examples/config3.yaml
index 5ff5179..411606a 100644
--- a/examples/config3.yaml
+++ b/examples/config3.yaml
@@ -1,46 +1,38 @@
-concept_sets:
+phenotype:
   version: "v1.0.4"
   omop:
     vocabulary_id: "ACMC_Example"
     vocabulary_name: "ACMC example phenotype"
     vocabulary_reference: "https://www.it-innovation.soton.ac.uk/projects/meldb/concept-processing/example"
-  concept_set:
-    - concept_set_name: "CVD_EVENTS"
-      metadata: {}
-    - concept_set_name: "DID_NOT_ATTEND"
-      metadata: {}
-    - concept_set_name: "HYPERTENSION"
-      metadata: {}     
-    - concept_set_name: "DEPRESSION"
-      metadata: {}        
-
-codes:
-  - folder: "clinical-codes-org"
-    description: "Downloaded 16/11/23"
-    files:
-      - file: "Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
+  concept_sets:
+    - name: "CVD_EVENTS"
+      file:
+        path: "clinical-codes-org/Cardiovascular events (ICD10)/res52-cardiovascular-events-icd10.csv"
         columns:
           icd10: "code"
-          metadata: []
-        concept_set:
-          - name: "CVD_EVENTS"
-      - file: "Non-attendance codes/res201-did-not-attend-appointment.csv"
+      metadata: {}
+    - name: "DID_NOT_ATTEND"
+      file:
+        path: "clinical-codes-org/Non-attendance codes/res201-did-not-attend-appointment.csv"
         columns:
           read2: "code"
-          metadata: []
-        concept_set:
-          - name: "DID_NOT_ATTEND"
-  - folder: hanlon
-    description: Hanlon Paper Code Lists
-    files:
-      - file: Read_codes_for_diagnoses.csv
+      metadata: {}
+    - name: "HYPERTENSION"
+      file:
+        path: "hanlon/Read_codes_for_diagnoses.csv"
         columns:
-          read2: Read Code      
+          read2: "Read Code"
+        category: "2"
         actions:
-          divide_col: MMCode
-        concept_set:
-          - name: HYPERTENSION
-            category: "2"
-          - name: DEPRESSION
-            category: "3"
+          divide_col: "MMCode"
+      metadata: {}
+    - name: "DEPRESSION"
+      file:
+        path: "hanlon/Read_codes_for_diagnoses.csv"
+        columns:
+          read2: "Read Code"
+        category: "3"
+        actions:
+          divide_col: "MMCode"
+      metadata: {}
 
-- 
GitLab