From a6636f578ac49ea6a685f51db93c2749a6bab48d Mon Sep 17 00:00:00 2001
From: Jakub Dylag <jjd1c23@soton.ac.uk>
Date: Fri, 28 Mar 2025 17:15:48 +0000
Subject: [PATCH] Remove metadata processing from map function

---
 acmc/phen.py | 65 +++++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 31 deletions(-)

diff --git a/acmc/phen.py b/acmc/phen.py
index 119a4c9..479605a 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -902,48 +902,51 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
     out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
     out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
 
-    out_count = len(out.index)
+    # out_count = len(out.index)
     # added metadata
     # Loop over each source_concept_type and perform the left join on all columns apart from source code columns
-    result_list = []
-    source_column_names = list(concept_set["file"]["columns"].keys())
-    for source_concept_type in source_column_names:
-        # Filter output based on the current source_concept_type
-        out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
-        filtered_count = len(out_filtered_df.index)
-
-        # Remove the source type columns except the current type will leave the metadata and the join
-        remove_types = [
-            type for type in source_column_names if type != source_concept_type
-        ]
-        metadata_df = df.drop(columns=remove_types)
-        metadata_df = metadata_df.rename(
-            columns={source_concept_type: "SOURCE_CONCEPT"}
-        )
-        metadata_df_count = len(metadata_df.index)
+    # result_list = []
+    # for files in phenotype["concept_sets"]:
+    #     concept_set_name = files["name"]
+    #     for concept_set in files["files"]:
+    #         source_column_names = list(concept_set["columns"].keys())
+    #         for source_concept_type in source_column_names:
+    #             # Filter output based on the current source_concept_type
+    #             out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
+    #             filtered_count = len(out_filtered_df.index)
+
+    #             # Remove the source type columns except the current type will leave the metadata and the join
+    #             remove_types = [
+    #                 type for type in source_column_names if type != source_concept_type
+    #             ]
+    #             metadata_df = df.drop(columns=remove_types)
+    #             metadata_df = metadata_df.rename(
+    #                 columns={source_concept_type: "SOURCE_CONCEPT"}
+    #             )
+    #             metadata_df_count = len(metadata_df.index)
 
         # Perform the left join with df2 on SOURCE_CONCEPT to add the metadata
-        result = pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
-        result_count = len(result.index)
+    #             result = pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
+    #             result_count = len(result.index)
 
-        _logger.debug(
-            f"Adding metadata for {source_concept_type}: out_count {out_count}, filtered_count {filtered_count}, metadata_df_count {metadata_df_count}, result_count {result_count}"
-        )
+    #             _logger.debug(
+    #                 f"Adding metadata for {source_concept_type}: out_count {out_count}, filtered_count {filtered_count}, metadata_df_count {metadata_df_count}, result_count {result_count}"
+    #             )
 
-        # Append the result to the result_list
-        result_list.append(result)
+    #             # Append the result to the result_list
+    #             result_list.append(result)
 
     # Concatenate all the results into a single DataFrame
-    final_out = pd.concat(result_list, ignore_index=True)
-    final_out = final_out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
-    _logger.debug(
-        f"Check metadata processing counts: before {len(out.index)} : after {len(final_out.index)}"
-    )
+    # final_out = pd.concat(result_list, ignore_index=True)
+    # final_out = final_out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
+    # _logger.debug(
+    #     f"Check metadata processing counts: before {len(out.index)} : after {len(final_out.index)}"
+    # )
 
     # Save output to map directory
     output_filename = target_code_type + ".csv"
     map_path = phen_path / MAP_DIR / output_filename
-    final_out.to_csv(map_path, index=False)
+    out.to_csv(map_path, index=False)
     _logger.info(f"Saved mapped concepts to {str(map_path.resolve())}")
 
     # save concept sets as separate files
@@ -958,7 +961,7 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
         concept_set_path.mkdir(parents=True, exist_ok=True)
 
     # write each concept as a separate file
-    for name, concept in final_out.groupby("CONCEPT_SET"):
+    for name, concept in out.groupby("CONCEPT_SET"):
         concept = concept.sort_values(by="CONCEPT")  # sort rows
         concept = concept.dropna(how="all", axis=1)  # remove empty cols
         concept = concept.reindex(
-- 
GitLab