Commit cae3acc7 authored by mjbonifa

Merge branch '27-fix-divide-action-using-metadata-columns-to-control-program-logic-2' into 'dev'

refactor: tidied up the action code for categories but still not optimal as...

Closes #27

See merge request meldb/concepts-processing!10
parents f1f18dcd 49d78a3f
@@ -392,41 +392,39 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
     out = pd.DataFrame([])  # create output df to append to
     code_errors = []  # list of errors from processing
-    metadata_df = pd.DataFrame()
-    meta_columns = []  # meta columns to keep with codes
+    # TODO: Is there a better way of processing this action as it's distributed across
+    # different parts of the programme.
     if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]:
-        meta_columns += [concept_set["file"]["actions"]["divide_col"]]
-        metadata_df = df[meta_columns]
-    # TODO: enable metacolumns to be outputted - problem with map_file appending
-    # if "metadata" in file["columns"]:
-    #     meta_columns += file["columns"]["metadata"]
+        divide_col_df = df[concept_set["file"]["actions"]["divide_col"]]
+    else:
+        divide_col_df = pd.DataFrame()

     # Preprocess codes
     code_types = parse.CodeTypeParser().code_types
-    for code_type_name, code_type_parser in code_types.items():
-        if code_type_name in concept_set["file"]["columns"]:
-            logger.info(f"Processing {code_type_name} codes...")
+    for code_type in concept_set["file"]["columns"]:
+        parser = code_types[code_type]
+        logger.info(f"Processing {code_type} codes...")

-            # get code types
-            codes = df[concept_set["file"]["columns"][code_type_name]].dropna()
-            codes = codes.astype(str)  # convert to string
-            codes = codes.str.strip()  # remove excess spaces
+        # get code types
+        codes = df[concept_set["file"]["columns"][code_type]].dropna()
+        codes = codes.astype(str)  # convert to string
+        codes = codes.str.strip()  # remove excess spaces

-            # process codes, validating them using parser and returning the errors
-            codes, errors = code_type_parser.process(codes, code_file_path)
-            if len(errors) > 0:
-                code_errors.extend(errors)
-                logger.warning(f"Codes validation failed with {len(errors)} errors")
+        # process codes, validating them using parser and returning the errors
+        codes, errors = parser.process(codes, code_file_path)
+        if len(errors) > 0:
+            code_errors.extend(errors)
+            logger.warning(f"Codes validation failed with {len(errors)} errors")

-            # add metadata columns
-            out = pd.concat(
-                [out, pd.DataFrame({code_type_name: codes}).join(metadata_df)],
-                ignore_index=True,
-            )
+        # append to output dataframe
+        out = pd.concat(
+            [out, pd.DataFrame({code_type: codes}).join(divide_col_df)],
+            ignore_index=True,
+        )

-    return out, meta_columns, code_errors
+    return out, code_errors


 # Translate Df with multiple codes into single code type Series
 def translate_codes(df, target_code_type):
     codes = pd.Series([], dtype=str)
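For orientation, a minimal self-contained sketch of the join behaviour on the new right-hand side: the divide column is kept as a Series and joined back onto each code-type frame. The column names ("read2", "category") are made up for illustration and are not taken from this repository.

import pandas as pd

# toy input: one code column plus a categorical divide column
df = pd.DataFrame({
    "read2": ["C10..", "C10E.", None],
    "category": ["diabetes", "diabetes", "hypertension"],
})

divide_col_df = df["category"]  # metadata kept as a Series, as in the new code
out = pd.DataFrame([])

codes = df["read2"].dropna().astype(str).str.strip()
out = pd.concat(
    [out, pd.DataFrame({"read2": codes}).join(divide_col_df)],
    ignore_index=True,
)
print(out)  # each cleaned code now carries its "category" value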
@@ -561,7 +559,7 @@ def map(phen_dir, target_code_type):
     for concept_set in phenotype["concept_sets"]:
         logger.debug(f"--- {concept_set['file']} ---")

-        # Load Code File
+        # Load code file
         codes_file_path = Path(codes_path / concept_set["file"]["path"])
         df = read_table_file(codes_file_path)
@@ -570,7 +568,7 @@ def map(phen_dir, target_code_type):
         # Preprocessing & Validation Checks
         logger.debug("Processing and validating code formats")
-        df, meta_columns, errors = preprocess_codes(
+        df, errors = preprocess_codes(
             df,
             concept_set,
             codes_file_path,
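A hedged sketch of what the narrowed return contract means for the caller: the divide column now travels inside the returned dataframe, so the separate meta_columns list is redundant. Column names below are illustrative only.

import pandas as pd

# stand-in for the frame returned by the new preprocess_codes
out = pd.DataFrame({
    "read2": ["C10..", "C10E."],
    "category": ["diabetes", "diabetes"],
})
actions = {"divide_col": "category"}  # as read from concept_set["file"]["actions"]

divide_col = actions["divide_col"]
assert divide_col in out.columns  # metadata is recoverable from the frame itself
print(out.groupby(divide_col).size())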
@@ -580,29 +578,16 @@ def map(phen_dir, target_code_type):
         logger.debug(f"Length of errors from preprocess {len(errors)}")
         if len(errors) > 0:
             code_errors.extend(errors)
         logger.debug(f" Length of code_errors {len(code_errors)}")

-        # partition table by categorical column
-        # if processing a source coding list with categorical data
+        # Map
         if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0:
             divide_col = concept_set["file"]["actions"]["divide_col"]
             logger.debug(f"Action: Dividing Table by {divide_col}")
-            logger.debug(df.head())
             logger.debug(f"column into: {df[divide_col].unique()}")
-            df = df.groupby(divide_col)
-
-        # Map to Concept/Phenotype
-        # TODO: This code needs refactory as it seems handling of the concept_set_categories should happen at another place
-        logger.debug(f"instance of df before if: {type(df)}")
-        if isinstance(df, pd.core.frame.DataFrame):
-            out = map_file(
-                df,
-                target_code_type,
-                out,
-                concept_name=concept_set['name']
-            )
-        elif isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
-            for cat, grp in df:
-                # what if there's no category, there's going to be an error
+            df_grp = df.groupby(divide_col)
+            for cat, grp in df_grp:
                 if cat == concept_set["file"]["category"]:
                     grp = grp.drop(
                         columns=[divide_col]
@@ -612,12 +597,14 @@ def map(phen_dir, target_code_type):
                         target_code_type,
                         out,
                         concept_name=concept_set['name']
                     )
         else:
-            logger.debug(f"instance of df: {type(df)}")
-            # raise AttributeError(
-            #     f"File {file} has either no concept_set or conceot_set_categories or the instance of dataframe objectives associated is incorrect, concept_set must be a DataFrame, conceot_set_categories must be pd.core.groupby.generic.DataFrameGroupBy"
-            # )
+            out = map_file(
+                df,
+                target_code_type,
+                out,
+                concept_name=concept_set['name']
+            )

     if len(code_errors) > 0:
         logger.error(f"The map processing has {len(code_errors)} errors")
...
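Finally, a minimal sketch of the category dispatch that replaces the old isinstance checks: group once, map only the group whose key matches the concept set's declared category, and fall back to mapping the whole frame when no divide action applies. Data and column names are toy values, not taken from the repository.

import pandas as pd

df = pd.DataFrame({
    "code": ["A1", "A2", "B1"],
    "category": ["keep", "keep", "other"],
})
wanted = "keep"  # plays the role of concept_set["file"]["category"]

for cat, grp in df.groupby("category"):
    if cat == wanted:
        grp = grp.drop(columns=["category"])
        print(grp)  # only the matching group would be passed to map_file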