Skip to content
Snippets Groups Projects
Commit a6636f57 authored by Jakub Dylag
Browse files

Remove metadata processing from map function

parent 46d4752f
Branches
No related tags found
No related merge requests found
......@@ -902,48 +902,51 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
out_count = len(out.index)
# out_count = len(out.index)
# added metadata
# Loop over each source_concept_type and perform the left join on all columns apart from source code columns
result_list = []
source_column_names = list(concept_set["file"]["columns"].keys())
for source_concept_type in source_column_names:
# Filter output based on the current source_concept_type
out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
filtered_count = len(out_filtered_df.index)
# Remove the source type columns except the current type will leave the metadata and the join
remove_types = [
type for type in source_column_names if type != source_concept_type
]
metadata_df = df.drop(columns=remove_types)
metadata_df = metadata_df.rename(
columns={source_concept_type: "SOURCE_CONCEPT"}
)
metadata_df_count = len(metadata_df.index)
# result_list = []
# for files in phenotype["concept_sets"]:
# concept_set_name = files["name"]
# for concept_set in files["files"]:
# source_column_names = list(concept_set["columns"].keys())
# for source_concept_type in source_column_names:
# # Filter output based on the current source_concept_type
# out_filtered_df = out[out["SOURCE_CONCEPT_TYPE"] == source_concept_type]
# filtered_count = len(out_filtered_df.index)
# # Remove the source type columns except the current type will leave the metadata and the join
# remove_types = [
# type for type in source_column_names if type != source_concept_type
# ]
# metadata_df = df.drop(columns=remove_types)
# metadata_df = metadata_df.rename(
# columns={source_concept_type: "SOURCE_CONCEPT"}
# )
# metadata_df_count = len(metadata_df.index)
# Perform the left join with df2 on SOURCE_CONCEPT to add the metadata
result = pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
result_count = len(result.index)
# result = pd.merge(out_filtered_df, metadata_df, how="left", on="SOURCE_CONCEPT")
# result_count = len(result.index)
_logger.debug(
f"Adding metadata for {source_concept_type}: out_count {out_count}, filtered_count {filtered_count}, metadata_df_count {metadata_df_count}, result_count {result_count}"
)
# _logger.debug(
# f"Adding metadata for {source_concept_type}: out_count {out_count}, filtered_count {filtered_count}, metadata_df_count {metadata_df_count}, result_count {result_count}"
# )
# Append the result to the result_list
result_list.append(result)
# # Append the result to the result_list
# result_list.append(result)
# Concatenate all the results into a single DataFrame
final_out = pd.concat(result_list, ignore_index=True)
final_out = final_out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
_logger.debug(
f"Check metadata processing counts: before {len(out.index)} : after {len(final_out.index)}"
)
# final_out = pd.concat(result_list, ignore_index=True)
# final_out = final_out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
# _logger.debug(
# f"Check metadata processing counts: before {len(out.index)} : after {len(final_out.index)}"
# )
# Save output to map directory
output_filename = target_code_type + ".csv"
map_path = phen_path / MAP_DIR / output_filename
final_out.to_csv(map_path, index=False)
out.to_csv(map_path, index=False)
_logger.info(f"Saved mapped concepts to {str(map_path.resolve())}")
# save concept sets as separate files
......@@ -958,7 +961,7 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
concept_set_path.mkdir(parents=True, exist_ok=True)
# write each concept as a separate file
for name, concept in final_out.groupby("CONCEPT_SET"):
for name, concept in out.groupby("CONCEPT_SET"):
concept = concept.sort_values(by="CONCEPT") # sort rows
concept = concept.dropna(how="all", axis=1) # remove empty cols
concept = concept.reindex(
......
0% Loading, or try again.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment