diff --git a/main.py b/main.py
index e5439de10676ba11089a3a711f8f5d7c4e75e9d6..a6ad4ccbea1fd22aaf152854506ca6486397d32c 100644
--- a/main.py
+++ b/main.py
@@ -134,11 +134,11 @@ def map_file(df, target_code_type, out, concepts, meta_columns=[], no_translate=
     #Append to out df
     if len(codes) > 0:
         codes = pd.DataFrame({
-            "code":codes
+            "CONCEPT":codes
         })
         codes = codes.join(df_meta)
         for concept in concepts:
-            codes["MELDB_concept"] = np.repeat(concept.strip(), len(codes))
+            codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
         out = pd.concat([out, codes])
     return out
 
@@ -200,7 +200,7 @@ def omop_publish_concept_sets(out, db_path, vocab_output, vocab_type):
     conn = sqlite3.connect(db_path)
     cur = conn.cursor()
 
-    for concept_set_name, grp in out.groupby("MELDB_concept"):
+    for concept_set_name, grp in out.groupby("CONCEPT_SET"):
         #Create Concept_Set
         if not sql_row_exist(conn, "CONCEPT_SET", "concept_set_name", concept_set_name):
             cur.execute(f"INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES ('{concept_set_name}', 'MELDB');")
@@ -214,7 +214,7 @@ def omop_publish_concept_sets(out, db_path, vocab_output, vocab_type):
         concept_set_id = cur.fetchone()[0]
 
         #Get corresponing Concept_id (OMOP) for each Concept_code (e.g. SNOMED)
-        concept_codes = "'"+"', '".join(list(grp["code"].astype(str)))+"'"
+        concept_codes = "'"+"', '".join(list(grp["CONCEPT"].astype(str)))+"'"
         query = f"SELECT concept_id FROM CONCEPT WHERE vocabulary_id = ? AND concept_code IN ({concept_codes});"
         cur.execute(query, (vocab_type, ))
         df_out = pd.DataFrame(cur.fetchall(), columns=["concept_id"])
@@ -329,8 +329,8 @@ def run_all(mapping_file, target_code_type,
 
     #Final Processing
     out = out.reset_index(drop=True)
-    out = out.drop_duplicates(subset=["MELDB_concept", "code"])
-    out = out.sort_values(by=["MELDB_concept", "code"])
+    out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
+    out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
 
     #Merge with Concept Types in Summary Excel File
     summary_config = mapping["concepts"]
@@ -346,9 +346,9 @@ def run_all(mapping_file, target_code_type,
 
         summary_cols_all += v
     summary_df = summary_df[summary_cols_all] #select all relevant columns
-    summary_df = summary_df.rename(columns={summary_config["columns"]["concept_name"]: "MELDB_concept"})
+    summary_df = summary_df.rename(columns={summary_config["columns"]["concept_name"]: "CONCEPT_SET"})
     summary_df = summary_df.drop_duplicates() #remove duplicates
-    out = out.merge(summary_df, how="left", on='MELDB_concept')
+    out = out.merge(summary_df, how="left", on='CONCEPT_SET')
 
     # Save Output File
     print(bcolors.HEADER, "---"*5, "OUTPUT", "---"*5, bcolors.ENDC)
@@ -379,7 +379,7 @@ def run_all(mapping_file, target_code_type,
     if os.path.exists(log_errors_path):
         error_df = pd.read_csv(log_errors_path)
         error_df = error_df.drop_duplicates() #Remove Duplicates from Error file
-        error_df = error_df.sort_values(by=["SOURCE", "CODE_TYPE", "CODE"])
+        error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CODE"])
         error_df.to_csv(log_errors_path, index=False)
 
 
diff --git a/publish.py b/publish.py
index bfd7301394b0e357cbee27e2d0c5886574ac1c80..f39a2316fa845dd0f112abdee16a2fa6fbf22da9 100644
--- a/publish.py
+++ b/publish.py
@@ -9,8 +9,8 @@ def main(config):
     else:
         raise Exception("Concepts file must be '.csv' filetype")
 
-    for name, concept in df.groupby("MELDB_concept"):
-        concept = concept.sort_values(by="code") #sort rows
+    for name, concept in df.groupby("CONCEPT_SET"):
+        concept = concept.sort_values(by="CONCEPT") #sort rows
         concept = concept.dropna(how='all', axis=1) #remove empty cols
         concept = concept.reindex(sorted(concept.columns), axis=1) #sort cols alphabetically
 
diff --git a/report.py b/report.py
index d43e263ede3977c191fd226e5635afb6852cd3d9..c36a298f2cf7727f26d76f5fec1dfd6ca1c8898d 100644
--- a/report.py
+++ b/report.py
@@ -111,9 +111,9 @@ def test_concept_changes(config, report):
     report.write(f"`{out1}` to `{out2}`\n")
 
     df1 = pd.read_csv(out1)
-    df1 = df1[["code","MELDB_concept"]].groupby("MELDB_concept").count()
+    df1 = df1[["CONCEPT","CONCEPT_SET"]].groupby("CONCEPT_SET").count()
     df2 = pd.read_csv(out2)
-    df2 = df2[["code","MELDB_concept"]].groupby("MELDB_concept").count()
+    df2 = df2[["CONCEPT","CONCEPT_SET"]].groupby("CONCEPT_SET").count()
 
     #Added/Removed Concepts
     report.write("- Removed Concepts {}\n".format(list(set(df1.index) - set(df2.index))))
@@ -121,10 +121,10 @@ def test_concept_changes(config, report):
 
     #Changed Concepts
     diff = df2 - df1 #diff in counts
-    diff = diff[(~(diff["code"] == 0.0)) & diff["code"].notna()] #get non-zero counts
+    diff = diff[(~(diff["CONCEPT"] == 0.0)) & diff["CONCEPT"].notna()] #get non-zero counts
     s = "\n"
     for concept, row in diff.iterrows():
-        s += "\t - {} {}\n".format(concept, row["code"])
+        s += "\t - {} {}\n".format(concept, row["CONCEPT"])
     report.write("- Changed Concepts {}\n\n".format(s))
 
 # ✅ ❌