Commit 7ccf8657 authored by mjbonifa

fix: re-added metadata to phen output. This is now done after the translation, based on any column that is not defined as a code column in the file configuration. The previous implementation required the user to list which metadata columns to KEEP in the configuration file, which resulted in a long config. Of course we may want to exclude metadata columns we don't want in the final phenotype, but if we need that we'll have to implement exclusion. In addition, we now retain the source code before it is preprocessed so that we know the starting point, which was previously lost. Closes 28
parent 5b3b1d29
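The core idea of the change: metadata columns are now derived from the file configuration rather than listed explicitly, i.e. any column not declared as a code column is treated as metadata. A minimal sketch of that derivation, assuming an illustrative config shape and made-up column names (not the exact phenotype schema):

import pandas as pd

# illustrative file configuration: only the code columns are declared
concept_set = {"file": {"columns": {"read2": {}, "snomed": {}}}}

# illustrative source file: code columns plus free-form metadata
df = pd.DataFrame(
    {
        "read2": ["C10..", "H33.."],
        "snomed": ["44054006", "195967001"],
        "description": ["diabetes", "asthma"],  # metadata, no KEEP entry needed
    }
)

# code columns come from the config; everything else is metadata by default
source_column_names = list(concept_set["file"]["columns"].keys())
metadata_columns = [c for c in df.columns if c not in source_column_names]
print(metadata_columns)  # ['description']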
@@ -472,6 +472,7 @@ def process_actions(df, concept_set):
return df
# Perform QA Checks on columns individually and append to df
def preprocess_source_concepts(df, concept_set, code_file_path):
"""Parses each column individually - Order and length will not be preserved!"""
@@ -508,29 +509,40 @@ def preprocess_source_concepts(df, concept_set, code_file_path):
ignore_index=True,
)
logger.debug(out.head())
return out, code_errors
def get_code_type_from_col_name(col_name):
return col_name.split("_")[0]
# Translate a df with multiple source code types into a single target code type
def translate_codes(source_df, target_code_type, concept_name):
"""Translates each source code type in the source coding list into the target code type and returns all conversions as a concept set"""
codes = pd.DataFrame(
columns=["SOURCE_CONCEPT", "SOURCE_CONCEPT_TYPE", "CONCEPT"], dtype="string"
)
# Convert codes to target type
logger.info(f"Converting to target code type {target_code_type}")
for source_code_type in source_df.columns:
# if the target code type is the same as the source code type, no translation is needed; just append the source as the target
if source_code_type == target_code_type:
copy_df = pd.DataFrame(
{
"SOURCE_CONCEPT": source_df[source_code_type],
"SOURCE_CONCEPT_TYPE": source_code_type,
"CONCEPT": source_df[source_code_type],
}
)
codes = pd.concat([codes, copy_df])
logger.debug(
f"Target code type {target_code_type} is the same as source code type, copying {len(source_df)} codes rather than translating"
)
else:
# get the translation filename using source to target code types
@@ -539,35 +551,31 @@ def translate_codes(df, source_code_types, target_code_type, concept_name):
# do the mapping if it exists
if map_path.exists():
# get mapping
df_map = pd.read_parquet(map_path)
# do mapping
translated_df = pd.merge(
source_df[source_code_type], df_map, how="left"
)
# normalise the output
translated_df.columns = ["SOURCE_CONCEPT", "CONCEPT"]
translated_df["SOURCE_CONCEPT_TYPE"] = source_code_type
# add to list of codes
codes = pd.concat([codes, translated_df])
logger.debug("CODES")
logger.debug(codes.head())
else:
logger.warning(
f"No mapping from {source_code_type} to {target_code_type}, file {str(map_path.resolve())} does not exist"
)
logger.debug("FULL CONCATED")
logger.debug(codes.head())
codes = codes.dropna() # remove rows with missing translations (NaN)
logger.debug(f"FULL CONCATED {len(codes.index)}")
# add the concept set name to the output if any codes were translated
if len(codes.index) > 0:
codes = pd.DataFrame({"CONCEPT": codes})
codes["CONCEPT_SET"] = np.repeat(concept_name.strip(), len(codes))
codes["CONCEPT_SET"] = concept_name
else:
logger.debug(f"No codes converted with target code type {target_code_type}")
@@ -699,6 +707,13 @@ def map_target_code_type(phen_path, phenotype, target_code_type):
codes_file_path,
)
# create df with just the source code columns
source_column_names = list(concept_set["file"]["columns"].keys())
source_df = df[source_column_names]
logger.debug(source_df.columns)
logger.debug(source_df.head())
logger.debug(f"Length of errors from preprocess_source_concepts {len(errors)}")
if len(errors) > 0:
code_errors.extend(errors)
@@ -718,26 +733,21 @@ def map_target_code_type(phen_path, phenotype, target_code_type):
for cat, grp in df_grp:
if cat == concept_set["file"]["category"]:
grp = grp.drop(columns=[divide_col]) # delete categorical column
source_df = grp[source_column_names]
trans_out = translate_codes(
source_df,
target_code_type=target_code_type,
concept_name=concept_set["name"],
)
out = pd.concat([out, trans_out])
else:
source_df = df[source_column_names]
trans_out = translate_codes(
source_df,
target_code_type=target_code_type,
concept_name=concept_set["name"],
)
out = pd.concat([out, trans_out])
logger.debug("TEST")
logger.debug(df.columns)
logger.debug(df.head)
logger.debug(out.columns)
logger.debug(out.head)
if len(code_errors) > 0:
logger.error(f"The map processing has {len(code_errors)} errors")
@@ -753,15 +763,54 @@ def map_target_code_type(phen_path, phenotype, target_code_type):
f"No output after map processing, check config {str(config_path.resolve())}"
)
# final processing
out = out.reset_index(drop=True)
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
out_count = len(out.index)
# add metadata
# Loop over each source_concept_type and left-join on all columns apart from the source code columns
result_list = []
source_column_names = list(concept_set["file"]["columns"].keys())
for source_concept_type in source_column_names:
# Filter output based on the current source_concept_type
out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
filtered_count = len(out_filtered_df.index)
# Removing all source type columns except the current one leaves the metadata and the join column
remove_types = [
col for col in source_column_names if col != source_concept_type
]
metadata_df = df.drop(columns=remove_types)
metadata_df = metadata_df.rename(
columns={source_concept_type: "SOURCE_CONCEPT"}
)
metadata_df_count = len(metadata_df.index)
# Perform a left join with metadata_df on SOURCE_CONCEPT to add the metadata
result = pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
result_count = len(result.index)
logger.debug(
f"Adding metadata for {source_concept_type}: out_count {out_count}, filtered_count {filtered_count}, metadata_df_count {metadata_df_count}, result_count {result_count}"
)
# Append the result to the result_list
result_list.append(result)
# Concatenate all the results into a single DataFrame
final_out = pd.concat(result_list, ignore_index=True)
final_out = final_out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
logger.debug(
f"Check metadata processing counts: before {len(out.index)} : after {len(final_out.index)}"
)
# Save output to map directory
output_filename = target_code_type + ".csv"
map_path = phen_path / MAP_DIR / output_filename
final_out.to_csv(map_path, index=False)
logger.info(f"Saved mapped concepts to {str(map_path.resolve())}")
# save concept sets as separate files
@@ -776,7 +825,7 @@ def map_target_code_type(phen_path, phenotype, target_code_type):
concept_set_path.mkdir(parents=True, exist_ok=True)
# write each concept as a separate file
for name, concept in final_out.groupby("CONCEPT_SET"):
concept = concept.sort_values(by="CONCEPT") # sort rows
concept = concept.dropna(how="all", axis=1) # remove empty cols
concept = concept.reindex(
......
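The metadata re-attachment in this commit works one source code type at a time: filter the translated output to rows of that SOURCE_CONCEPT_TYPE, drop the other code columns from the raw file so only the current code column and the metadata remain, rename the code column to SOURCE_CONCEPT, then left-join. A self-contained sketch of that join with toy data (column names and codes are illustrative):

import pandas as pd

# translated output in the normalised concept-set shape
out = pd.DataFrame(
    {
        "SOURCE_CONCEPT": ["C10..", "44054006"],
        "SOURCE_CONCEPT_TYPE": ["read2", "snomed"],
        "CONCEPT": ["44054006", "44054006"],
    }
)

# raw source file: code columns plus a metadata column
df = pd.DataFrame(
    {
        "read2": ["C10.."],
        "snomed": ["44054006"],
        "description": ["diabetes"],  # metadata to re-attach
    }
)

source_column_names = ["read2", "snomed"]
result_list = []
for source_concept_type in source_column_names:
    out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
    # keep only the current code column and the metadata for the join
    remove_types = [c for c in source_column_names if c != source_concept_type]
    metadata_df = df.drop(columns=remove_types).rename(
        columns={source_concept_type: "SOURCE_CONCEPT"}
    )
    result_list.append(
        pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
    )

final_out = pd.concat(result_list, ignore_index=True)
print(final_out)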