Skip to content
Snippets Groups Projects
Commit 09103276 authored by Jakub Dylag
Browse files

Remove Old Files

parent 3b7281fa
No related branches found
No related tags found
No related merge requests found
......@@ -13,12 +13,8 @@ __pycache__
~$*
# Build
output/
concepts-output/
archive/
maps/*
concepts-new/
codes/
medcoder/resources/maps/*
medcoder/resources/codes/*
# temporary
script
......
img/nihr-logo-1200-375.jpg

16.6 KiB

import os
import pandas as pd
import argparse
def main(config):
    """Split the combined concepts CSV into one CSV file per concept set.

    Args:
        config: Mapping with the keys ``"concepts"`` (path of the combined
            concepts ``.csv`` file) and ``"output"`` (folder that receives
            one ``<CONCEPT_SET>.csv`` file per concept set).

    Raises:
        ValueError: If the concepts path does not end with ``.csv``.
    """
    concepts_path = config["concepts"]
    if not concepts_path.endswith(".csv"):
        raise ValueError("Concepts file must be '.csv' filetype")

    # dtype=str disables numeric type inference, so every value is kept as text.
    df = pd.read_csv(concepts_path, dtype=str)

    # to_csv does not create missing folders, so make sure the target exists.
    # An empty folder name means "current directory" and needs no creation.
    output_dir = config["output"]
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for name, concept in df.groupby("CONCEPT_SET"):
        concept = concept.sort_values(by="CONCEPT")  # sort rows
        concept = concept.dropna(how="all", axis=1)  # remove empty cols
        # sort cols alphabetically
        concept = concept.reindex(sorted(concept.columns), axis=1)
        concept.to_csv(os.path.join(output_dir, f"{name}.csv"), index=False)
if __name__ == '__main__':
    # Command-line entry point: read the two positional paths and hand them
    # to main() as a plain dict keyed by argument name.
    cli = argparse.ArgumentParser(
        description="Script divides single CSV file into one CSV per concept",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    for arg_name, arg_help in (
        ("concepts", "Output Concepts CSV file"),
        ("output", "Output Folder"),
        # ("version", "Version of output"),  # disabled in the original script
    ):
        cli.add_argument(arg_name, help=arg_help)
    config = vars(cli.parse_args())
    main(config)
import json
import os
import pandas as pd
import numpy as np
import argparse
#Get all Files in JSON
def get_json_files(folders):
    """Flatten the JSON mapping into one row per (file, concept) pair.

    Args:
        folders: Iterable of folder entries from the mapping JSON. Each entry
            has a ``"folder"`` path and, optionally, a ``"files"`` list whose
            items carry ``"file"``, ``"columns"`` and either ``"concept_set"``
            or ``"concept_set_categories"``.

    Returns:
        DataFrame with the columns ``json_concept``, ``filepath`` and
        ``json_code_types``. Files that declare no concepts produce a single
        row with an empty ``json_concept``. The columns are always present,
        even when no files are found.
    """
    columns = ["json_concept", "filepath", "json_code_types"]
    rows = []
    for folder in folders:
        if "files" not in folder:
            continue
        for file in folder["files"]:
            file_path = folder["folder"] + "/" + file["file"]
            if "concept_set" in file:
                rows.extend(
                    {
                        "json_concept": concept,
                        "filepath": file_path,
                        "json_code_types": list(file["columns"].keys()),
                    }
                    for concept in file["concept_set"]
                )
            elif "concept_set_categories" in file:
                # Only the first concept listed for each category is recorded.
                rows.extend(
                    {
                        "json_concept": concepts[0],
                        "filepath": file_path,
                        "json_code_types": list(file["columns"].keys()),
                    }
                    for concepts in file["concept_set_categories"].values()
                )
            else:
                rows.append({"json_concept": None, "filepath": file_path})

    # Explicit columns keep the frame usable (and mergeable on "filepath")
    # when the mapping lists no files at all.
    out = pd.DataFrame(rows, columns=columns)
    out["filepath"] = out["filepath"].astype(str)
    return out
#Get all Files Excel Summary
def get_excel_files(out2):
    """List the source files referenced by in-use concepts in the summary sheet.

    Args:
        out2: DataFrame of the summary sheet. Must contain the columns
            ``"CONCEPT NAME "`` (with the trailing space), ``"CODING LIST"``,
            ``"AGREED"`` and ``"FUNCTION"``.

    Returns:
        DataFrame with the columns ``excel_concept`` and ``filepath``, one row
        per (concept, file) pair, restricted to rows whose ``AGREED`` is
        ``"USE"`` and whose ``FUNCTION`` is ``"QUERY BY CODING LIST"``.
    """
    url_prefix = "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/"

    # .loc[1:] skips the row labelled 0 -- presumably a sub-header row in the
    # sheet; confirm against the workbook.
    selected = out2[["CONCEPT NAME ", "CODING LIST", "AGREED", "FUNCTION"]].loc[1:]

    # Keep only concepts that are in use and queried by a coding list.
    in_use = (selected["AGREED"] == "USE") & (selected["FUNCTION"] == "QUERY BY CODING LIST")
    selected = selected[in_use].drop(["AGREED", "FUNCTION"], axis=1)

    # A cell may list several comma-separated files: one row per file.
    selected["CODING LIST"] = selected["CODING LIST"].str.split(",")
    selected = selected.explode("CODING LIST")

    # regex=False makes both replacements literal; the default for this flag
    # differs between pandas versions and the URL contains regex
    # metacharacters such as ".".
    paths = selected["CODING LIST"].str.strip()
    paths = paths.str.replace(url_prefix, "", regex=False)
    paths = paths.str.replace("%20", " ", regex=False)
    selected["CODING LIST"] = paths

    return selected.rename(
        columns={"CONCEPT NAME ": "excel_concept", "CODING LIST": "filepath"}
    )
#Get all Files in /codes
def get_code_files(path_codes):
    """Collect every data file found under the code source folder.

    Args:
        path_codes: Root folder to search recursively.

    Returns:
        DataFrame with a single ``filepath`` column listing each ``.csv``,
        ``.xlsx`` or ``.dta`` file, skipping anything inside a folder whose
        path contains ``.ipynb_checkpoint``. The column is present even when
        no file matches.
    """
    data_suffixes = (".csv", ".xlsx", ".dta")
    found = []
    for root, _dirs, files in os.walk(path_codes, topdown=False):
        if ".ipynb_checkpoint" in root:  # exclude notebook checkpoints
            continue
        found.extend(
            os.path.join(root, name)
            for name in files
            if name.endswith(data_suffixes)  # exclude non-data files
        )

    # Naming the column at construction keeps "filepath" available for the
    # later merge even when nothing was found.
    all_files = pd.DataFrame({"filepath": found})
    all_files["filepath"] = all_files["filepath"].astype(str)
    return all_files
def test_concept_def(config, report, folders, summary):
    """Report mismatches between the JSON mapping and the Excel summary.

    Merges the code source files found on disk, the files referenced by the
    JSON mapping and the files referenced by the Excel summary on their
    ``filepath``. It then writes to ``report`` the totals for each source and
    every file that has a concept in one of the two definitions but not the
    other.

    Args:
        config: Mapping holding the ``"codes"`` folder path.
        report: Writable text stream receiving the Markdown report.
        folders: Parsed JSON mapping, passed on to ``get_json_files``.
        summary: Summary sheet DataFrame, passed on to ``get_excel_files``.
    """
    report.write("## Check Concept Definitions")

    json_files = get_json_files(folders)
    excel_files = get_excel_files(summary)
    code_files = get_code_files(config["codes"])

    # Single table: outer joins keep files that appear in only one source.
    merged = pd.merge(code_files, json_files, how="outer", on="filepath")
    merged = pd.merge(merged, excel_files, how="outer", on="filepath")

    has_json = merged["json_concept"].notna()
    has_excel = merged["excel_concept"].notna()

    report.write("\n\nCode source files:\n")
    report.write(f"- {len(code_files)} total files\n\n")

    report.write("\n\nJSON concepts:\n")
    report.write(f"- {len(json_files['filepath'].unique())} unique concepts\n")
    only_in_excel = merged[~has_json & has_excel]
    if len(only_in_excel):
        report.write("- Missing from JSON\n")
        for _, entry in only_in_excel.iterrows():
            report.write(
                f"\t - ❌ {entry['filepath']} {entry['excel_concept']}\n\n"
            )

    report.write("\n\nEXCEL concepts:\n")
    report.write(f"- {len(excel_files['filepath'].unique())} unique concepts\n")
    only_in_json = merged[has_json & ~has_excel]
    if len(only_in_json):
        report.write("- Missing from EXCEL\n")
        for _, entry in only_in_json.iterrows():
            report.write(
                f"\t - ❌ {entry['filepath']} {entry['json_concept']} {entry['json_code_types']}\n\n"
            )
def get_output_files(version, output_dir="output"):
    """Build the paths of the versioned output files.

    Args:
        version: Version prefix used in the file names (e.g. ``"V3_2_10"``).
        output_dir: Folder holding the output files. Defaults to ``"output"``,
            which reproduces the previously hard-coded location.

    Returns:
        Tuple ``(output_files, error_file)``: the list of concept CSV paths
        (readv2, snomed, icd10, atc, in that order) and the errors CSV path.
    """
    # The "med" output is intentionally not listed (it was disabled in the
    # original list).
    suffixes = [
        "concepts_readv2",
        "snomed_no_translate",
        "icd10_no_translate",
        "atc_no_translate",
    ]
    output_files = [f"{output_dir}/{version}_MELD_{suffix}.csv" for suffix in suffixes]
    error_file = f"{output_dir}/{version}_MELD_errors.csv"
    return output_files, error_file
def _concept_counts(path):
    """Return the number of CONCEPT rows per CONCEPT_SET in the CSV at ``path``."""
    df = pd.read_csv(path)
    return df[["CONCEPT", "CONCEPT_SET"]].groupby("CONCEPT_SET").count()


def test_concept_changes(config, report):
    """Report how concept sets changed between two output versions.

    For each pair of output files (previous version, current version) writes
    to ``report`` the concept sets that were removed, the ones that were
    added, and the change in code count for every concept set present in
    both versions whose count differs.

    Args:
        config: Mapping holding the ``"previous"`` and ``"version"`` names.
        report: Writable text stream receiving the Markdown report.
    """
    version_1 = config["previous"]
    version_2 = config["version"]
    output1, _ = get_output_files(version_1)
    output2, _ = get_output_files(version_2)

    report.write(f"\n\n## Compare Concepts {version_1} to {version_2}\n\n")

    for out1, out2 in zip(output1, output2):
        report.write(f"`{out1}` to `{out2}`\n")
        df1 = _concept_counts(out1)
        df2 = _concept_counts(out2)

        # Added/Removed Concepts. Sorted so the report is reproducible; raw
        # set iteration order can differ from one run to the next.
        removed = sorted(set(df1.index) - set(df2.index), key=str)
        added = sorted(set(df2.index) - set(df1.index), key=str)
        report.write("- Removed Concepts {}\n".format(removed))
        report.write("- Added Concepts {}\n".format(added))

        # Changed Concepts: subtraction aligns on CONCEPT_SET, leaving NaN for
        # sets present in only one version; those are already listed above.
        diff = df2 - df1
        diff = diff[(diff["CONCEPT"] != 0) & diff["CONCEPT"].notna()]
        changed = "\n" + "".join(
            "\t - {} {}\n".format(concept, row["CONCEPT"])
            for concept, row in diff.iterrows()
        )
        report.write("- Changed Concepts {}\n\n".format(changed))
# ✅ ❌
def main(config):
    """Generate the Markdown report for one output version.

    Validates the file types of the report, mapping and summary paths, loads
    the mapping JSON and the ``CONCEPT_TRACKING`` sheet of the summary
    workbook, then appends the report sections to the report file.

    Args:
        config: Mapping with the keys ``"report"`` (``.md`` path, opened in
            append mode), ``"map"`` (``.json`` path), ``"summary"`` (``.xlsx``
            path), ``"codes"``, ``"version"`` and ``"previous"``.

    Raises:
        ValueError: If the report, map or summary path has an unsupported
            file extension.
    """
    # Validate every path before touching the filesystem, so a bad argument
    # does not leave a freshly created, empty report file behind.
    if not config["report"].endswith(".md"):
        raise ValueError("Unsupported filetype provided for report file")
    if not config["map"].endswith(".json"):
        raise ValueError("Unsupported filetype provided for map file")
    if not config["summary"].endswith(".xlsx"):
        raise ValueError("Unsupported filetype provided for summary file")

    # Load Mapping File
    with open(config["map"], "rb") as map_file:
        folders = json.load(map_file)

    # Load Excel Summary File
    summary = pd.read_excel(config["summary"], sheet_name="CONCEPT_TRACKING", dtype=str)

    # The report contains non-ASCII marks, so the encoding is fixed to UTF-8
    # instead of depending on the platform default. The context manager
    # guarantees the file is flushed and closed even if a step fails.
    with open(config["report"], "a", encoding="utf-8") as report:
        # Title with version
        report.write("\n\n# Report {} \n\n".format(config["version"]))

        # Compare JSON Mapping with Excel Summary
        test_concept_def(config, report, folders, summary)

        # Changes in Concept Codes between versions
        test_concept_changes(config, report)
if __name__ == '__main__':
    # Command-line entry point: six positional arguments, declared in the
    # order they must be given, forwarded to main() as a dict.
    cli = argparse.ArgumentParser(
        description="Script performs testing and generates report for output files",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    for arg_name, arg_help in (
        ("map", "Concept/Phenotype Assignment File (json)"),
        ("summary", "Summary working excel document"),
        ("codes", "Folder containing all code source files"),
        ("report", "Output Markdown file containing report"),
        ("version", "Version of output"),
        ("previous", "Previous version of output"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    config = vars(cli.parse_args())
    main(config)
\ No newline at end of file
This diff is collapsed.
#! /usr/bin/bash
# Build the versioned MELD concept outputs, generate the comparison report
# against the previous version, and split the outputs into per-concept CSV
# files inside the concepts-output repository.

# Stop at the first failing command, unset variable or failed pipeline stage,
# so a broken build step can never rename or publish a stale output file.
set -euo pipefail

version="V3_2_10"
previous="V3_2_9"

# Each main.py run writes output/MELD_concepts_read.csv, which is renamed to
# the versioned file for that run before the next run overwrites it.
python main.py -r2 PHEN_assign_v3.json CONC_summary_working.xlsx
mv output/MELD_concepts_read.csv "output/${version}_MELD_concepts_readv2.csv"

python main.py -i PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
mv output/MELD_concepts_read.csv "output/${version}_MELD_icd10_no_translate.csv"

python main.py -s PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
mv output/MELD_concepts_read.csv "output/${version}_MELD_snomed_no_translate.csv"

# python main.py -o PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
# mv output/MELD_concepts_read.csv "output/${version}_MELD_opcs4_no_translate.csv"

python main.py -a PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
mv output/MELD_concepts_read.csv "output/${version}_MELD_atc_no_translate.csv"

# python main.py -m PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
# mv output/MELD_concepts_read.csv "output/${version}_MELD_med_no_translate.csv"

mv output/MELD_errors.csv "output/${version}_MELD_errors.csv"

#Generate Report
# -f: a missing report from a previous run is not an error.
rm -f concepts-output/MELD-report.md
python report.py PHEN_assign_v3.json CONC_summary_working.xlsx codes/ concepts-output/MELD-report.md "${version}" "${previous}"

#Divide Concepts to Output Repo
for target in readv2 icd10 snomed atc; do
    mkdir -p "concepts-output/${target}"
    # ${target:?} aborts if the name is ever empty, so the glob can never
    # expand to concepts-output//*.
    rm -rf "concepts-output/${target:?}"/*
done

python publish.py "output/${version}_MELD_concepts_readv2.csv" concepts-output/readv2/
python publish.py "output/${version}_MELD_icd10_no_translate.csv" concepts-output/icd10/
python publish.py "output/${version}_MELD_snomed_no_translate.csv" concepts-output/snomed/
python publish.py "output/${version}_MELD_atc_no_translate.csv" concepts-output/atc/
cp "output/${version}_MELD_errors.csv" "concepts-output/${version}_MELD_errors.csv"

# Show Changes in Output repo (should be same as report)
cd concepts-output
git diff --stat
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment