diff --git a/acmc/phen.py b/acmc/phen.py
index 119a4c998086792f2728a3d0e97d6eb228df35ac..479605a34603201b72aa8afba50445f8cead70ca 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -902,48 +902,51 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
     out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
     out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])

-    out_count = len(out.index)
+    # out_count = len(out.index)
     # added metadata
     # Loop over each source_concept_type and perform the left join on all columns apart from source code columns
-    result_list = []
-    source_column_names = list(concept_set["file"]["columns"].keys())
-    for source_concept_type in source_column_names:
-        # Filter output based on the current source_concept_type
-        out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
-        filtered_count = len(out_filtered_df.index)
-
-        # Remove the source type columns except the current type will leave the metadata and the join
-        remove_types = [
-            type for type in source_column_names if type != source_concept_type
-        ]
-        metadata_df = df.drop(columns=remove_types)
-        metadata_df = metadata_df.rename(
-            columns={source_concept_type: "SOURCE_CONCEPT"}
-        )
-        metadata_df_count = len(metadata_df.index)
+    # result_list = []
+    # for files in phenotype["concept_sets"]:
+    #     concept_set_name = files["name"]
+    #     for concept_set in files["files"]:
+    #         source_column_names = list(concept_set["columns"].keys())
+    #         for source_concept_type in source_column_names:
+    #             # Filter output based on the current source_concept_type
+    #             out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
+    #             filtered_count = len(out_filtered_df.index)
+
+    #             # Remove the source type columns except the current type will leave the metadata and the join
+    #             remove_types = [
+    #                 type for type in source_column_names if type != source_concept_type
+    #             ]
+    #             metadata_df = df.drop(columns=remove_types)
+    #             metadata_df = metadata_df.rename(
+    #                 columns={source_concept_type: "SOURCE_CONCEPT"}
+    #             )
+    #             metadata_df_count = len(metadata_df.index)

         # Perform the left join with df2 on SOURCE_CONCEPT to add the metadata
-        result = pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
-        result_count = len(result.index)
+        # result = pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
+        # result_count = len(result.index)

-        _logger.debug(
-            f"Adding metadata for {source_concept_type}: out_count {out_count}, filtered_count {filtered_count}, metadata_df_count {metadata_df_count}, result_count {result_count}"
-        )
+        # _logger.debug(
+        #     f"Adding metadata for {source_concept_type}: out_count {out_count}, filtered_count {filtered_count}, metadata_df_count {metadata_df_count}, result_count {result_count}"
+        # )

-        # Append the result to the result_list
-        result_list.append(result)
+        # # Append the result to the result_list
+        # result_list.append(result)

     # Concatenate all the results into a single DataFrame
-    final_out = pd.concat(result_list, ignore_index=True)
-    final_out = final_out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
-    _logger.debug(
-        f"Check metadata processing counts: before {len(out.index)} : after {len(final_out.index)}"
-    )
+    # final_out = pd.concat(result_list, ignore_index=True)
+    # final_out = final_out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
+    # _logger.debug(
+    #     f"Check metadata processing counts: before {len(out.index)} : after {len(final_out.index)}"
+    # )

     # Save output to map directory
     output_filename = target_code_type + ".csv"
     map_path = phen_path / MAP_DIR / output_filename
-    final_out.to_csv(map_path, index=False)
+    out.to_csv(map_path, index=False)
     _logger.info(f"Saved mapped concepts to {str(map_path.resolve())}")

     # save concept sets as separate files
@@ -958,7 +961,7 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
     concept_set_path.mkdir(parents=True, exist_ok=True)

     # write each concept as a separate file
-    for name, concept in final_out.groupby("CONCEPT_SET"):
+    for name, concept in out.groupby("CONCEPT_SET"):
         concept = concept.sort_values(by="CONCEPT")  # sort rows
         concept = concept.dropna(how="all", axis=1)  # remove empty cols
         concept = concept.reindex(