Skip to content
Snippets Groups Projects
Commit a6636f57 authored by Jakub Dylag
Browse files

Remove metadata processing from map function

parent 46d4752f
Branches
No related tags found
No related merge requests found
......@@ -902,48 +902,51 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
out_count = len(out.index)
# out_count = len(out.index)
# added metadata
# Loop over each source_concept_type and perform the left join on all columns apart from source code columns
result_list = []
source_column_names = list(concept_set["file"]["columns"].keys())
for source_concept_type in source_column_names:
# Filter output based on the current source_concept_type
out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
filtered_count = len(out_filtered_df.index)
# Remove the source type columns except the current type will leave the metadata and the join
remove_types = [
type for type in source_column_names if type != source_concept_type
]
metadata_df = df.drop(columns=remove_types)
metadata_df = metadata_df.rename(
columns={source_concept_type: "SOURCE_CONCEPT"}
)
metadata_df_count = len(metadata_df.index)
# result_list = []
# for files in phenotype["concept_sets"]:
# concept_set_name = files["name"]
# for concept_set in files["files"]:
# source_column_names = list(concept_set["columns"].keys())
# for source_concept_type in source_column_names:
# # Filter output based on the current source_concept_type
# out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
# filtered_count = len(out_filtered_df.index)
# # Remove the source type columns except the current type will leave the metadata and the join
# remove_types = [
# type for type in source_column_names if type != source_concept_type
# ]
# metadata_df = df.drop(columns=remove_types)
# metadata_df = metadata_df.rename(
# columns={source_concept_type: "SOURCE_CONCEPT"}
# )
# metadata_df_count = len(metadata_df.index)
# Perform the left join with df2 on SOURCE_CONCEPT to add the metadata
result = pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
result_count = len(result.index)
# result = pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
# result_count = len(result.index)
_logger.debug(
f"Adding metadata for {source_concept_type}: out_count {out_count}, filtered_count {filtered_count}, metadata_df_count {metadata_df_count}, result_count {result_count}"
)
# _logger.debug(
# f"Adding metadata for {source_concept_type}: out_count {out_count}, filtered_count {filtered_count}, metadata_df_count {metadata_df_count}, result_count {result_count}"
# )
# Append the result to the result_list
result_list.append(result)
# # Append the result to the result_list
# result_list.append(result)
# Concatenate all the results into a single DataFrame
final_out = pd.concat(result_list, ignore_index=True)
final_out = final_out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
_logger.debug(
f"Check metadata processing counts: before {len(out.index)} : after {len(final_out.index)}"
)
# final_out = pd.concat(result_list, ignore_index=True)
# final_out = final_out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
# _logger.debug(
# f"Check metadata processing counts: before {len(out.index)} : after {len(final_out.index)}"
# )
# Save output to map directory
output_filename = target_code_type + ".csv"
map_path = phen_path / MAP_DIR / output_filename
final_out.to_csv(map_path, index=False)
out.to_csv(map_path, index=False)
_logger.info(f"Saved mapped concepts to {str(map_path.resolve())}")
# save concept sets as separate files
......@@ -958,7 +961,7 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
concept_set_path.mkdir(parents=True, exist_ok=True)
# write each concept as a separate file
for name, concept in final_out.groupby("CONCEPT_SET"):
for name, concept in out.groupby("CONCEPT_SET"):
concept = concept.sort_values(by="CONCEPT") # sort rows
concept = concept.dropna(how="all", axis=1) # remove empty cols
concept = concept.reindex(
......
0% Loading, or try again.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment