Commit 50a29145 authored by mjbonifa

removed conda, notebook and scripts as they are no longer needed

parent 18c4229c
name: acmc
channels:
- conda-forge
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- asttokens=3.0.0=pyhd8ed1ab_1
- blosc=1.21.6=he440d0b_1
- brotli-python=1.1.0=py313h46c70d0_2
- bzip2=1.0.8=h4bc722e_7
- c-ares=1.34.4=hb9d3cd8_0
- c-blosc2=2.16.0=h3122c55_0
- ca-certificates=2025.1.31=hbcca054_0
- certifi=2025.1.31=pyhd8ed1ab_0
- cffi=1.17.1=py313hfab6e84_0
- charset-normalizer=3.4.1=pyhd8ed1ab_0
- comm=0.2.2=pyhd8ed1ab_1
- debugpy=1.8.12=py313h46c70d0_0
- decorator=5.1.1=pyhd8ed1ab_1
- exceptiongroup=1.2.2=pyhd8ed1ab_1
- executing=2.1.0=pyhd8ed1ab_1
- h2=4.2.0=pyhd8ed1ab_0
- hdf5=1.14.3=nompi_h2d575fe_109
- hpack=4.1.0=pyhd8ed1ab_0
- hyperframe=6.1.0=pyhd8ed1ab_0
- idna=3.10=pyhd8ed1ab_1
- importlib-metadata=8.6.1=pyha770c72_0
- ipykernel=6.29.5=pyh3099207_0
- ipython=8.32.0=pyh907856f_0
- jedi=0.19.2=pyhd8ed1ab_1
- jupyter_client=8.6.3=pyhd8ed1ab_1
- jupyter_core=5.7.2=pyh31011fe_1
- keyutils=1.6.1=h166bdaf_0
- krb5=1.21.3=h659f571_0
- ld_impl_linux-64=2.43=h712a8e2_2
- libaec=1.1.3=h59595ed_0
- libblas=3.9.0=28_h59b9bed_openblas
- libcblas=3.9.0=28_he106b2a_openblas
- libcurl=8.11.1=h332b0f4_0
- libedit=3.1.20250104=pl5321h7949ede_0
- libev=4.33=hd590300_2
- libexpat=2.6.4=h5888daf_0
- libffi=3.4.6=h2dba641_0
- libgcc=14.2.0=h77fa898_1
- libgcc-ng=14.2.0=h69a702a_1
- libgfortran=14.2.0=h69a702a_1
- libgfortran5=14.2.0=hd5240d6_1
- libgomp=14.2.0=h77fa898_1
- liblapack=3.9.0=28_h7ac8fdf_openblas
- liblzma=5.6.4=hb9d3cd8_0
- libmpdec=4.0.0=h4bc722e_0
- libnghttp2=1.64.0=h161d5f1_0
- libopenblas=0.3.28=pthreads_h94d23a6_1
- libsodium=1.0.20=h4ab18f5_0
- libsqlite=3.48.0=hee588c1_1
- libssh2=1.11.1=hf672d98_0
- libstdcxx=14.2.0=hc0a3c3a_1
- libstdcxx-ng=14.2.0=h4852527_1
- libuuid=2.38.1=h0b41bf4_0
- libzlib=1.3.1=hb9d3cd8_2
- lz4-c=1.10.0=h5888daf_1
- matplotlib-inline=0.1.7=pyhd8ed1ab_1
- ncurses=6.5=h2d0b736_3
- nest-asyncio=1.6.0=pyhd8ed1ab_1
- nomkl=1.0=h5ca1d4c_0
- numexpr=2.10.2=py313h5f97788_100
- numpy=2.2.3=py313h17eae1a_0
- openssl=3.4.1=h7b32b05_0
- packaging=24.2=pyhd8ed1ab_2
- pandas=2.2.3=py313ha87cce1_1
- parso=0.8.4=pyhd8ed1ab_1
- pexpect=4.9.0=pyhd8ed1ab_1
- pickleshare=0.7.5=pyhd8ed1ab_1004
- pip=25.0.1=pyh145f28c_0
- platformdirs=4.3.6=pyhd8ed1ab_1
- prompt-toolkit=3.0.50=pyha770c72_0
- psutil=6.1.1=py313h536fd9c_0
- ptyprocess=0.7.0=pyhd8ed1ab_1
- pure_eval=0.2.3=pyhd8ed1ab_1
- py-cpuinfo=9.0.0=pyhd8ed1ab_1
- pycparser=2.22=pyh29332c3_1
- pygments=2.19.1=pyhd8ed1ab_0
- pysocks=1.7.1=pyha55dd90_7
- pytables=3.10.2=py313hd261420_1
- python=3.13.1=ha99a958_105_cp313
- python-dateutil=2.9.0.post0=pyhff2d567_1
- python-tzdata=2025.1=pyhd8ed1ab_0
- python_abi=3.13=5_cp313
- pytz=2024.1=pyhd8ed1ab_0
- pyzmq=26.2.1=py313h8e95178_0
- readline=8.2=h8228510_1
- requests=2.32.3=pyhd8ed1ab_1
- six=1.17.0=pyhd8ed1ab_0
- snappy=1.2.1=h8bd8927_1
- stack_data=0.6.3=pyhd8ed1ab_1
- tk=8.6.13=noxft_h4845f30_101
- tornado=6.4.2=py313h536fd9c_0
- traitlets=5.14.3=pyhd8ed1ab_1
- typing-extensions=4.12.2=hd8ed1ab_1
- typing_extensions=4.12.2=pyha770c72_1
- tzdata=2025a=h78e105d_0
- urllib3=2.3.0=pyhd8ed1ab_0
- wcwidth=0.2.13=pyhd8ed1ab_1
- zeromq=4.3.5=h3b0a872_7
- zipp=3.21.0=pyhd8ed1ab_1
- zlib-ng=2.2.4=h7955e40_0
- zstandard=0.23.0=py313h80202fe_1
- zstd=1.5.6=ha6fb4c9_0
- pip:
- aiosqlite==0.21.0
- click==8.1.8
- cramjam==2.9.1
- et-xmlfile==2.0.0
- fastparquet==2024.11.0
- fsspec==2025.2.0
- gitdb==4.0.12
- gitpython==3.1.44
- greenlet==3.1.1
- iniconfig==2.0.0
- lxml==5.3.1
- openpyxl==3.1.5
- pluggy==1.5.0
- pyarrow==19.0.0
- pyomop==4.3.0
- pytest==8.3.4
- simpledbf==0.2.6
- smmap==5.0.2
- sqlalchemy==2.0.38
prefix: /opt/conda/envs/acmc
%% Cell type:code id:8c8f4cdf-04a5-4762-895e-6555781a1f05 tags:
``` python
import pandas as pd
import numpy as np
import json
import os  # needed by the file-existence checks below
```
%% Cell type:markdown id:c5786d78-7dc2-4f02-ad21-cee95e473823 tags:
### Generate JSON from the Ho code lists
%% Cell type:code id:0292dc90-e31a-4724-8536-d0b55533aaef tags:
``` python
# List v4 to json
df = pd.read_excel("PHEN_code_lists_sources_V4.xlsx", sheet_name="ho", dtype=str)
# df = df.sort_values(by="mapped_condition")

def json_file_template(file, cons, types, metadata):
    # build the comma-separated concept list
    concepts = ""
    for concept in cons:
        concepts += f'"{concept}", '
    concepts = concepts[:-2]  # remove trailing ", "
    # build the column-type mapping
    type_str = ""
    for k, v in types.items():
        type_str += f'"{k}":"{v}", '
    type_str = type_str[:-2]
    # build the metadata column list
    meta_str = '"metadata":['
    for v in metadata:
        meta_str += f'"{v}", '
    meta_str = meta_str[:-2]
    meta_str = meta_str + "]"
    return '''
    {
        "file":"''' + file + '''",
        "columns":{
            ''' + type_str + ''',
            ''' + meta_str + '''
        },
        "meldb_phenotypes":[''' + concepts + ''']
    },'''

out = '"files":['
folder = "codes/GitHub_TG_repository/"
for file, grp in df.groupby("mapped_condition"):
    file = file.replace("%20", " ")
    for ext in ["_CPRD_GOLD.csv", "_CPRD_AURUM.csv", "_IMRD.csv"]:
        path = file + "/" + file + ext
        if os.path.isfile(folder + path):
            out += json_file_template(path, grp["meldb_condition"],
                                      types={
                                          "read2_code": "READ_CODE",
                                          "snomed_code": "SNOMED_CT_CODE",
                                          # "med_code":"MEDICAL_CODE_ID",
                                      },
                                      metadata=["DESCRIPTION"])
        else:
            print("NOT FILE", folder + path)
    for ext in ["_ICD10.csv"]:
        path = file + "/" + file + ext
        if os.path.isfile(folder + path):
            out += json_file_template(path, grp["meldb_condition"],
                                      types={
                                          "icd10_code": "READ_CODE",  # NOTE: type name kept as in the original, though an ICD10 type may be intended
                                          "snomed_code": "SNOMED_CT_CODE",
                                          # "icd10_code":"MEDICAL_CODE_ID",
                                      },
                                      metadata=["DESCRIPTION"])
        else:
            print("NOT FILE", folder + path)
    # out += json_file_template(file+"/"+file+"_CPRD_AURUM.csv", grp["meldb_condition"])
    # out += json_file_template(file+"/"+file+"_ICD10.csv", grp["meldb_condition"])
    # out += json_file_template(file+"/"+file+"_IMRD.csv", grp["meldb_condition"])
    # out += f' "{file}/{file}_CPRD_GOLD.csv":[{conds}],\n'
    # out += f' "{file}/{file}_CPRD_AURUM.csv":[{conds}],\n'
    # out += f' "{file}/{file}_ICD10.csv":[{conds}],\n'
    # out += f' "{file}/{file}_IMRD.csv":[{conds}],\n'

out = out[:-1]  # remove last ,
out += "\n]"
out = out.replace("%20", " ")
print(out)
```
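%% Cell type:markdown tags:
The template above emits a JSON fragment (`"files":[...]`) rather than a complete document. A minimal sanity check, assuming the concept names and paths contain no characters that need JSON escaping, is to wrap the fragment in braces and parse it back:
%% Cell type:code tags:
``` python
# sketch: validate the generated fragment by parsing it as JSON
import json
parsed = json.loads("{" + out + "}")
print(len(parsed["files"]), "file entries generated")
```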
%% Cell type:code id:f155b635-b459-4aff-81b2-e065fc223858 tags:
``` python
```
%% Cell type:code id:d040eda5-4028-4047-834c-7315e307e415 tags:
``` python
df = pd.read_parquet("maps/processed/icd10_code.parquet")
df
```
%% Cell type:code id:e0228ac9-8852-4818-b7f0-98429ca5229c tags:
``` python
code = ["A00.0", "*00.0"]
code = pd.Series(code)
print(code.isin(df["icd10_code"]))
print(code.isin(df["icd10_alt_code"]))
# print( )
~(
~code.isin(df["icd10_code"])
&
~code.isin(df["icd10_alt_code"])
)
```
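%% Cell type:markdown tags:
By De Morgan's law, the negated conjunction above is just membership in either column; an equivalent, more direct form of the check would be:
%% Cell type:code tags:
``` python
# equivalent to ~(~a & ~b): the code appears in either column
code.isin(df["icd10_code"]) | code.isin(df["icd10_alt_code"])
```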
%% Cell type:markdown id:18efcacd-45f0-4341-86cc-d8e2e584350c tags:
### Analyse the JSON file
%% Cell type:code id:85dc197b-451e-4fa9-a53b-e6770c132123 tags:
``` python
import json
import os
path_json = "../concepts/PHEN_assign_v3.json"
#Load JSON Concept Definitions
mapping = json.load(open(path_json,'rb'))
summary_config = mapping["concept_sets"]["concept_set"]
summary_df = pd.DataFrame(summary_config) #change to dataframe
summary_df = summary_df.join(pd.json_normalize(summary_df["metadata"])) #metadata to columns
summary_df = summary_df.drop(columns=["metadata"])
summary_df = summary_df.rename(columns={"concept_set_name":"CONCEPT_SET"})
summary_df = summary_df.drop_duplicates() #remove duplicates
summary_df
```
%% Cell type:code id:4c9b6b3f-08aa-4f61-b9b2-44a24b5d00a0 tags:
``` python
import json
import os

path_json = "PHEN_assign_v3.json"
path_excel = "PHEN_summary_working.xlsx"
path_codes = "codes/"

# Get all files referenced in the JSON
def get_json_files(path_json):
    folders = json.load(open(path_json, 'rb'))
    out = []
    for folder in folders:
        if "files" in folder:
            for file in folder["files"]:
                file_path = folder["folder"] + "/" + file["file"]
                if "meldb_phenotypes" in file:
                    for concept in file["meldb_phenotypes"]:
                        out.append({"json_concept": concept, "filepath": file_path, "json_code_types": list(file["columns"].keys())})
                elif "meldb_phenotypes_categories" in file:
                    for code, concept in file["meldb_phenotypes_categories"].items():
                        out.append({"json_concept": concept[0], "filepath": file_path, "json_code_types": list(file["columns"].keys())})
                else:
                    out.append({"json_concept": None, "filepath": file_path})
    out = pd.DataFrame(out)
    out["filepath"] = out["filepath"].astype(str)
    return out

out = get_json_files(path_json)

# Get all files referenced in the Excel summary
def get_excel_files(path_excel):
    out2 = pd.read_excel(path_excel)
    out2 = out2[["CONCEPT NAME ", "CODING LIST", "AGREED", "FUNCTION"]].loc[1:]  # select relevant columns
    # Filter to concepts in use
    out2 = out2[out2["AGREED"] == "USE"]  # remove deprecated concepts
    out2 = out2[out2["FUNCTION"] == "QUERY BY CODING LIST"]  # keep only coding-list queries
    out2 = out2.drop(['AGREED', 'FUNCTION'], axis=1)
    # Get filepaths
    out2["CODING LIST"] = out2["CODING LIST"].str.split(",")  # split by ,
    out2 = out2.explode("CODING LIST")  # one row per file
    out2["CODING LIST"] = out2["CODING LIST"].str.strip()
    out2["CODING LIST"] = out2["CODING LIST"].str.replace("https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/", "")
    out2["CODING LIST"] = out2["CODING LIST"].str.replace("%20", " ")
    out2 = out2.rename(columns={"CONCEPT NAME ": "excel_concept", "CODING LIST": "filepath"})
    return out2

out2 = get_excel_files(path_excel)

# Get all files under /codes
def get_code_files(path_codes):
    all_files = []
    for root, dirs, files in os.walk(path_codes, topdown=False):
        for name in files:
            if ".ipynb_checkpoint" not in root:  # exclude notebook checkpoints
                if name.endswith(".csv") or name.endswith(".xlsx") or name.endswith(".dta"):  # exclude non-data files
                    all_files.append(os.path.join(root, name))
    all_files = pd.DataFrame(all_files)
    all_files = all_files.rename(columns={0: "filepath"})
    all_files["filepath"] = all_files["filepath"].astype(str)
    return all_files

all_files = get_code_files(path_codes)

print("ALL FILES", len(all_files), len(all_files["filepath"].unique()))
print("JSON CONCEPTS", len(out), len(out["filepath"].unique()))
print("EXCEL CONCEPTS", len(out2), len(out2["filepath"].unique()))

outs = pd.merge(all_files, out, how="outer", on="filepath")
outs = pd.merge(outs, out2, how="outer", on="filepath")
print(len(outs), len(outs["filepath"].unique()))
outs.to_csv("output/MELD_file_to_concept.csv", index=False)
# display(outs[outs["concept"].isna()])
# display(out)
```
%% Cell type:code id:f8e70c33-c869-46f8-953e-f6b52992cfbb tags:
``` python
display("JSON MISSING", outs[outs["json_concept"].isna() & outs["excel_concept"].notna()])
display("EXCEL MISSING", outs[outs["json_concept"].notna() & outs["excel_concept"].isna()])
```
%% Cell type:code id:9d84465f-f064-4df2-b0e4-2dfb217aea21 tags:
``` python
with open('concepts-output/MELD-report.md', 'a') as f:
    f.write(
"""
# Report
- One thing
- Two thing
- Three thing
""")
```
%% Cell type:code id:7f7fc771-e406-42c7-8a09-16a20b5298f5 tags:
``` python
# total rows across all code files
total_length = 0
for file in all_files["filepath"]:
    if file.endswith(".csv"):
        df_file = pd.read_csv(file)
        total_length += len(df_file)
    elif file.endswith(".xlsx"):
        df_file = pd.read_excel(file)
        total_length += len(df_file)
    elif file.endswith(".dta"):
        df_file = pd.read_stata(file)
        total_length += len(df_file)
total_length
```
%% Cell type:code id:08a9c565-28d6-46ee-9fa8-6fa0ee28a4d5 tags:
``` python
# Turn filepaths into GitLab links
outs2 = outs.copy()
outs2["filepath"] = "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/" + outs2["filepath"].str.replace(" ", "%20")
# Group by concept and concatenate filepaths
# NOTE: `outs` carries json_concept/excel_concept columns; a "concept" column is assumed to exist here
outs2 = outs2.groupby("concept")["filepath"].apply(', '.join).reset_index()
outs2 = outs2.sort_values(by=["concept"])
outs2.to_csv("output/MELD_GitLab_link_to_concept.csv", index=False)
```
%% Cell type:markdown id:357bb84c-90c2-4b5f-95c0-443191783a7f tags:
### Analyse Output Files
%% Cell type:code id:7d3f9cb7-be86-4f1f-92f6-991094eb7bb7 tags:
``` python
version = "V2_2_2"
output_files = [f"output/{version}_MELD_concepts_readv2.csv",
f"output/{version}_MELD_snomed_no_translate.csv",
f"output/{version}_MELD_icd10_no_translate.csv",
# f"output/{version}_MELD_med_no_translate.csv",
f"output/{version}_MELD_atc_no_translate.csv"
]
error_file = f"output/{version}_MELD_errors.csv"
for output_file in output_files:
print("---"*3,output_file,"---"*3,)
df = pd.read_csv(output_file)
# df["MELDB_concept"].loc[df["CONCEPT TYPE"].isna()]
print("MELDB missing concepts ", len(df[df["CONCEPT TYPE"].isna()]))
if df["code"].dtype == "object":
print("Chars present:", np.sort(df["code"].apply(lambda x : set(x)).explode().unique()))
# len(df["MELDB_concept"].unique())
print("---"*3,error_file,"---"*3,)
df = pd.read_csv(error_file)
df = df.drop_duplicates()
df["CODE_TYPE"].value_counts()
# for i, row in df.drop_duplicates().iterrows():
# print(row["CODE"], row["CODE_TYPE"])
```
%% Cell type:code id:08e0ecc1-9271-48c3-9c5b-094800072906 tags:
``` python
def get_output_files(version):
    output_files = [f"output/{version}_MELD_concepts_readv2.csv",
                    f"output/{version}_MELD_snomed_no_translate.csv",
                    f"output/{version}_MELD_icd10_no_translate.csv",
                    # f"output/{version}_MELD_med_no_translate.csv",
                    f"output/{version}_MELD_atc_no_translate.csv"
                    ]
    error_file = f"output/{version}_MELD_errors.csv"
    return output_files, error_file

# version_1 = "V1_0_0"
version_1 = "V2_1_4"
version_2 = "V2_2_3"
output1, err1 = get_output_files(version_1)
output2, err2 = get_output_files(version_2)
print("## Compare Concepts", version_1, "to", version_2)
for out1, out2 in zip(output1, output2):
    print(out1, out2)
    df1 = pd.read_csv(out1)
    df1 = df1[["code", "MELDB_concept"]].groupby("MELDB_concept").count()
    df2 = pd.read_csv(out2)
    df2 = df2[["code", "MELDB_concept"]].groupby("MELDB_concept").count()
    # Added/removed concepts
    print("- Removed Concepts", list(set(df1.index) - set(df2.index)))
    print("- Added Concepts", list(set(df2.index) - set(df1.index)))
    # Changed concepts
    diff = df2 - df1  # diff in counts
    diff = diff[(~(diff["code"] == 0.0)) & diff["code"].notna()]  # keep non-zero counts
    s = "\n"
    for concept, row in diff.iterrows():
        s += "\t - {} {}\n".format(concept, row["code"])
    print("- Changed Concepts", s)
# for output_file in output_files:
# print("---"*3,output_file,"---"*3,)
# df = pd.read_csv(output_file)
# # df["MELDB_concept"].loc[df["CONCEPT TYPE"].isna()]
# print("MELDB missing concepts ", len(df[df["CONCEPT TYPE"].isna()]))
# if df["code"].dtype == "object":
# print("Chars present:", np.sort(df["code"].apply(lambda x : set(x)).explode().unique()))
```
%% Cell type:code id:cc60c137-5a85-4155-af6b-6796f8c05980 tags:
``` python
import glob
import os
import pandas as pd
df = pd.read_csv("/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working.csv")
df = df.set_index("#")
for vocab in ["atc", "icd10", "readv2", "snomed"]:
df[vocab.upper()] = ""
for file in glob.glob(f"/home/jjd1c23/ssd/meldb/jjd1c23/concepts/{vocab}/*.csv"):
concept_set = os.path.basename(file)[:-4]
row_index = df[df["CONCEPT NAME "] == concept_set].index[0]
df.loc[row_index, vocab.upper()] = "YES"
df = df.drop(columns=["READv2_CODE", "ICD10_CODE"])
df.to_csv("/home/jjd1c23/ssd/meldb/jjd1c23/concepts/PHEN_summary_working_labelled.csv")
```
%% Cell type:markdown id:e5c4291f-847b-4c82-976e-bd5b3a7b6bcc tags:
### Mappings
%% Cell type:code id:08e34750-413c-469e-bcb8-e71bb188ff42 tags:
``` python
#NHS Read Browser
import simpledbf
import pandas as pd
#r2 only
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/ANCESTOR.DBF').to_dataframe()
df = pd.concat([df['READCODE'], df['DESCENDANT']])
df = pd.DataFrame(df.drop_duplicates())
df = df.rename(columns={0:"read2_code"})
df.to_parquet("maps/processed/read2_code.parquet", index=False)
#r2 -> atc
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/ATC.DBF').to_dataframe()
df = df[["READCODE", "ATC"]]
df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"})
df.to_parquet("maps/processed/read2_code_to_atc_code.parquet", index=False)
#r2 -> icd10
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/ICD10.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"})
df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read2_code_to_icd10_code.parquet", index=False)
#r2 -> opcs4
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/OPCS4V3.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"})
df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read2_code_to_opcs4_code.parquet", index=False)
#r3 only
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/ANCESTOR.DBF').to_dataframe()
df = pd.concat([df['READCODE'], df['DESCENDANT']])
df = pd.DataFrame(df.drop_duplicates())
df = df.rename(columns={0:"read3_code"})
df.to_parquet("maps/processed/read3_code.parquet", index=False)
#r3 -> icd10
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/ICD10.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"})
df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read3_code_to_icd10_code.parquet", index=False)
#r3 -> icd9
# dbf = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/ICD9V3.DBF')
#r3 -> opcs4
df = simpledbf.Dbf5('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V3/OPCS4V3.DBF').to_dataframe()
df = df[["READ_CODE", "TARG_CODE"]]
df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"})
df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
df.to_parquet("maps/processed/read3_code_to_opcs4_code.parquet", index=False)
```
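%% Cell type:markdown tags:
The DBF-to-parquet steps above repeat the same select/rename/filter pattern. A helper along these lines could reduce the duplication (a sketch: `cols` maps source to target column names, and the `-` filter is only applied where the originals used it):
%% Cell type:code tags:
``` python
# sketch: factor the repeated DBF -> parquet pattern into one helper
def dbf_to_parquet_map(dbf_path, cols, out_path, drop_dash=False):
    df = simpledbf.Dbf5(dbf_path).to_dataframe()
    df = df[list(cols.keys())].rename(columns=cols)
    if drop_dash:  # some tables carry range codes containing '-'
        for c in cols.values():
            df = df[~df[c].str.match("^.*-.*$")]
    df.to_parquet(out_path, index=False)

# e.g. the r2 -> icd10 step above becomes:
# dbf_to_parquet_map('maps/nhs_readbrowser_25.0.0_20180401000001/Standard/V2/ICD10.DBF',
#                    {"READ_CODE": "read2_code", "TARG_CODE": "icd10_code"},
#                    "maps/processed/read2_code_to_icd10_code.parquet", drop_dash=True)
```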
%% Cell type:code id:5fe95638-1f25-45f3-803c-2fff74a2a4fd tags:
``` python
#NHS Data Migrations
#r2 only
# df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/rctcremap_uk_20200401000001.txt', sep='\t')
#r3 only
# df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/ctv3cremap_uk_20200401000001.txt', sep='\t')
#snomed only
df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/sctcremap_uk_20200401000001.txt', sep='\t')
df = df[["SCT_CONCEPTID"]]
df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"})
df = df.drop_duplicates()
df = df.astype(str)
df.to_parquet("maps/processed/snomed_code.parquet", index=False)
#r2 -> r3
df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/rctctv3map_uk_20200401000001.txt', sep='\t')
df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]]
df = df.rename(columns={"V2_CONCEPTID":"read2_code",
"CTV3_CONCEPTID":"read3_code"})
df.to_parquet("maps/processed/read2_code_to_read3_code.parquet", index=False)
#r3->r2
df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/ctv3rctmap_uk_20200401000002.txt', sep='\t')
df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]]
df = df.rename(columns={"CTV3_CONCEPTID":"read3_code",
"V2_CONCEPTID":"read2_code"})
df = df.drop_duplicates()
df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_'
df.to_parquet("maps/processed/read3_code_to_read2_code.parquet", index=False)
#r2 -> snomed
df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/rcsctmap2_uk_20200401000001.txt', sep='\t', dtype=str)
df = df[["ReadCode", "ConceptId"]]
df = df.rename(columns={"ReadCode":"read2_code",
"ConceptId":"snomed_code"})
df.to_parquet("maps/processed/read2_code_to_snomed_code.parquet", index=False)
#r3->snomed
df = pd.read_csv('maps/nhs_datamigration_29.0.0_20200401000001/Mapping Tables/Updated/Clinically Assured/ctv3sctmap2_uk_20200401000001.txt', sep='\t')
df = df[["CTV3_TERMID", "SCT_CONCEPTID"]]
df = df.rename(columns={"CTV3_TERMID":"read3_code",
"SCT_CONCEPTID":"snomed_code"})
df["snomed_code"] = df["snomed_code"].astype(str)
df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_'
df.to_parquet("maps/processed/read3_code_to_snomed_code.parquet", index=False)
```
%% Cell type:code id:267fa1cc-5159-48c4-9eee-19af5039d627 tags:
``` python
#OPCS410 Data Files
df = pd.read_csv("maps/OPCS410 Data files txt/OPCS410 CodesAndTitles Nov 2022 V1.0.txt", sep='\t', dtype=str, header=None)
df = df.rename(columns={0:"opcs4_code", 1:"description"})
df.to_parquet("maps/processed/opcs4_code.parquet", index=False)
```
%% Cell type:code id:01d046fd-69af-44f3-acad-5d0edef3f745 tags:
``` python
#ICD10_edition5
df = pd.read_xml("maps/ICD10_Edition5_XML_20160401/Content/ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml",)
df = df[["CODE", "ALT_CODE", "DESCRIPTION"]]
df = df.rename(columns={"CODE":"icd10_code",
"ALT_CODE":"icd10_alt_code",
"DESCRIPTION":"description"
})
df.to_parquet("maps/processed/icd10_code.parquet", index=False)
```
%% Cell type:code id:36630e24-f56c-48e1-8ecf-4ccd2b41eaea tags:
``` python
code1="read2_code"
code2="icd10_code"
df_map = pd.read_parquet(f"maps/processed/{code1}_to_{code2}.parquet")
codes=df_map["read2_code"].iloc[:5]
pd.merge(codes, df_map, how='left')[code2]
```
%% Cell type:code id:9787adeb-8507-488b-9a91-b8df3fbbe21e tags:
``` python
#CPRD Code Browser
df = pd.read_csv('maps/CPRD_CodeBrowser_202211_Aurum/CPRDAurumMedical.txt', sep='\t')
df = df[["MedCodeId", "CleansedReadCode", "SnomedCTConceptId"]]
df = df.rename(columns={"MedCodeId":"med_code",
"CleansedReadCode":"read2_code",
"SnomedCTConceptId":"snomed_code"})
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_Aurum/CPRDAurumProduct.txt', sep='\t', dtype=str)
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_GOLD/medical.txt', sep='\t')
# df = df.reset_index().iloc[:,[1,6]]
# df = df.rename(columns={"level_1":"read2_code", "20220523":"description"})
# df = pd.read_csv('maps/CPRD_CodeBrowser_202211_GOLD/product.txt', sep='\t', dtype=str) #CANNOT OPEN
df
```
%% Cell type:code id:a968ffb1-4337-456b-8d20-419888b4044f tags:
``` python
#BNF
df = pd.read_excel("maps/BNF Snomed Mapping data 20231215.xlsx")
df = df.astype(str)
df = df.rename(columns={"BNF Code":"bnf_code",
"SNOMED Code":"snomed_code"})
df[["bnf_code", "snomed_code"]].to_parquet("maps/processed/bnf_code_to_snomed_code.parquet", index=False)
```
%% Cell type:code id:c70b1ce2-0f41-4d02-ad17-6fc44bc3c6bf tags:
``` python
#BNF to Readv2 Merge
df1 = pd.read_parquet("maps/processed/bnf_code_to_snomed_code.parquet").astype(str)
df2 = pd.read_parquet("maps/processed/read2_code_to_snomed_code.parquet").astype(str)
# df1.merge(df2, how="inner", on="snomed_code")
```
%% Cell type:code id:d5d34237-02d4-4dea-8c20-5adaf337f6b5 tags:
``` python
df1.merge(df2, how='inner', on='snomed_code')
```
%% Cell type:code id:b3166cf0-e4a5-43e0-aeac-78827427422e tags:
``` python
# scratch fragment (left incomplete): the dataframe expression it chained from is missing
# .astype(str).dtypes
```
%% Cell type:code id:c0a766f9-7959-4a10-b58f-cd946a878b60 tags:
``` python
df = pd.read_csv("../concepts/PHEN_summary_working.csv")
cols = list(df.columns)
cols.remove('CONCEPT NAME ')
cols.remove('AGREED')
df = df.applymap(lambda x: str(x) if isinstance(x, (int, float)) else x) #change to int
df_copy = df.rename(columns={
"CONCEPT NAME ":"concept_set_name",
"AGREED":"concept_set_status"
})
df_copy["concept_set_status"] = df_copy["concept_set_status"].replace("USE", "AGREED")
df_copy = df_copy[["concept_set_name", "concept_set_status"]]
outs = df_copy.to_dict(orient='records')
for i, out in enumerate(outs):
out["metadata"] = dict(df[cols].iloc[i])
json.dumps(outs)
```
%% Cell type:code id:8a204a95-dc4c-4183-9ea7-f5c5e95e9087 tags:
``` python
```
%% Cell type:code id:5ce1ab58-50b4-4c22-b72b-c698de6830f7 tags:
``` python
import json
```
%% Cell type:code id:f1ea81c6-d1db-408f-9d3a-b96f44efe21f tags:
``` python
```
%% Cell type:markdown id:5eb544a3-9dd1-41e8-88c2-a808646c6112 tags:
### OMOP Database
%% Cell type:code id:c9e58e62-9e44-4d0c-9d8d-35c175c07e6c tags:
``` python
import sqlite3
import csv
import pandas as pd
import os
```
%% Cell type:code id:4f67c9a1-373f-4799-8a85-72767662d912 tags:
``` python
```
%% Cell type:code id:d0ecdf69-ee90-42c1-ad25-d8357b603d1b tags:
``` python
# IMPORT OMOP VOCABS
conn = sqlite3.connect("codes/omop_54.sqlite")
folder_path = "codes/vocabulary_download_v5_{9424944c-2b76-4127-8f05-f535e0f15e2a}_1731661390540"

# Check that the folder exists
if not os.path.isdir(folder_path):
    raise Exception(f"Error: The folder '{folder_path}' does not exist.")

# Iterate through files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):  # check if the file is a CSV
        file_path = os.path.join(folder_path, filename)
        try:
            print(f"Reading file: {file_path}")
            # OMOP vocabulary CSVs are tab-delimited
            df = pd.read_csv(file_path, delimiter="\t", low_memory=False)
            table_name = os.path.splitext(os.path.basename(file_path))[0]  # table named after the file
            # Export table to the sqlite db
            df.to_sql(table_name, conn, if_exists='replace', index=False)
        except Exception as e:
            raise Exception(f"Error reading file {file_path}: {e}")

conn.commit()
conn.close()
```
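%% Cell type:markdown tags:
A quick way to confirm the import worked is to list the tables now present in the database (a sketch):
%% Cell type:code tags:
``` python
# sketch: list the tables created by the vocabulary import
conn = sqlite3.connect("codes/omop_54.sqlite")
print(pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn))
conn.close()
```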
%% Cell type:code id:b9cafd0c-a3bd-408b-bca8-b0de2acde1cd tags:
``` python
# Create a SQL connection to our SQLite database
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
#Print ALL Columns in Table
# table="CONCEPT_SET"
# cur.execute(f"PRAGMA table_info({table});")
# print(pd.DataFrame(cur.fetchall()))
#Print ALL TABLE NAMES
# cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=? ;", ("VOCABULARY",))
# print(cur.fetchone())
cur.execute("SELECT vocabulary_id FROM VOCABULARY WHERE vocabulary_id=? ;", ("MELDB",))
print(cur.fetchone())
#Print WHOLE TABLE
# cur.execute('SELECT * FROM CONCEPT;')
# cur.execute('SELECT * FROM CONCEPT WHERE standard_concept = "C";')
# cur.execute('SELECT * FROM CONCEPT WHERE concept_code = "119768002" LIMIT 1;')
# cur.execute('SELECT * FROM CONCEPT WHERE concept_code IN ("119768002", "5905001");')
# cur.execute('SELECT DISTINCT VOCABULARY_ID FROM CONCEPT;')
# df = pd.DataFrame(cur.fetchall())
# print(list(df[0]))
# display(df)
# for row in :
# print(row)
#Get Header of Table
# table="CONCEPT_CLASS"
# cur.execute(f"SELECT * FROM {table} LIMIT 3;")
# print(cur.fetchall())
#create meldb VOCABULARY
# meldb_version='v3.2.10'
# meldb_description = 'Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity'
# meldb_reference = 'https://www.it-innovation.soton.ac.uk/projects/meldb'
# df_test = pd.DataFrame([{
# "vocabulary_id": 'MELDB',
# "vocabulary_name": meldb_description,
# "vocabulary_reference": meldb_reference,
# "vocabulary_version": meldb_version,
# # "vocabulary_concept_id": 0,
# }])
# df_test.to_sql("VOCABULARY", conn, if_exists='append', index=False)
# cur.execute("""
# CREATE TABLE CONCEPT_SET (
# concept_set_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each concept set
# atlas_id INTEGER, -- Unique identifier generated by ATLAS
# concept_set_name TEXT, -- Optional name for the concept set
# concept_set_description TEXT, -- Optional description for the concept set
# vocabulary_id TEXT NOT NULL, -- Foreign key to VOCABULARY table
# FOREIGN KEY (vocabulary_id) REFERENCES VOCABULARY(vocabulary_id)
# );""")
# cur.execute("DROP TABLE CONCEPT_SET;")
# cur.execute("""
# CREATE TABLE CONCEPT_SET_ITEM (
# concept_set_item_id INTEGER PRIMARY KEY AUTOINCREMENT, -- Unique identifier for each mapping
# concept_set_id INTEGER NOT NULL, -- Foreign key to CONCEPT_SET table
# concept_id INTEGER NOT NULL, -- Foreign key to CONCEPT table
# FOREIGN KEY (concept_set_id) REFERENCES CONCEPT_SET(concept_set_id),
# FOREIGN KEY (concept_id) REFERENCES CONCEPT(concept_id)
# );""")
# cur.execute("DROP TABLE CONCEPT_SET_ITEM;")
# Be sure to close the connection
conn.close()
```
%% Cell type:code id:d03b75f3-902f-42d7-b52f-dac7e79ecb11 tags:
``` python
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv"
df = pd.read_csv(file_path, low_memory=False)
df = df.set_index("code")
df.to_sql(name='test', con=conn, if_exists='replace')
conn.commit()
conn.close()
```
%% Cell type:code id:d96c3511-3831-400e-ba40-0a36abcc60d3 tags:
``` python
#DISPLAY SQL TABLE
table="CONCEPT_SET_ITEM"
# Create a SQL connection to our SQLite database
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
#Print ALL Columns in Table
cur.execute(f"PRAGMA table_info({table});")
df_cols = pd.DataFrame(cur.fetchall())
print(df_cols)
df_cols = df_cols[1]
#Print TABLE
cur.execute(f"SELECT * FROM {table};")
df = pd.DataFrame(cur.fetchall())
df = df.rename(columns={i:s for i, s in enumerate(df_cols)})
display(df)
conn.close()
# a+s = 13364
# a+s+i = 13591
```
%% Cell type:code id:42d49a00-9646-4ba4-afb6-12297289b7a7 tags:
``` python
def sql_row_exist(conn, table, column, value):
    # Execute and check whether a matching row exists
    cur = conn.cursor()
    query = f"SELECT 1 FROM {table} WHERE {column} = ? LIMIT 1;"
    cur.execute(query, (value,))
    exists = cur.fetchone() is not None
    return exists
```
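%% Cell type:markdown tags:
A usage sketch, checking for the MELDB vocabulary row created earlier:
%% Cell type:code tags:
``` python
# sketch: returns True once the MELDB row has been inserted into VOCABULARY
conn = sqlite3.connect("codes/omop_54.sqlite")
print(sql_row_exist(conn, "VOCABULARY", "vocabulary_id", "MELDB"))
conn.close()
```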
%% Cell type:code id:f7b51bcd-6ee1-4023-8d36-7f419ce4120d tags:
``` python
# EXPORT MELDB CSV OUTPUT
conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
vocab_output = "MELDB"
vocab_type = "SNOMED"
file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/phenotype/output/V3_2_10_MELD_snomed_no_translate.csv"
# file_path = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/HEART_VALVE_DISORDERS.csv"
# Read the CSV file with the specified delimiter
out = pd.read_csv(file_path, low_memory=False)
print(out.columns)  # was print(df.columns); `out` holds the file just read
for concept_set_name, grp in out.groupby("MELDB_concept"):
    # display(concept_set_name, grp[["code", "MELDB_concept"]])
    # Create CONCEPT_SET row if missing
    if not sql_row_exist(conn, "CONCEPT_SET", "concept_set_name", concept_set_name):
        cur.execute(f"INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES ('{concept_set_name}', 'MELDB');")
    else:
        print("concept_set", concept_set_name, "already exists")
        # TODO: ask to remove old concept_set?
    # Get concept_set_id
    query = "SELECT concept_set_id FROM CONCEPT_SET WHERE concept_set_name = ? AND vocabulary_id = ?;"
    cur.execute(query, (concept_set_name, vocab_output))
    concept_set_id = cur.fetchone()[0]
    # Get corresponding concept_id (OMOP) for each concept_code (e.g. SNOMED)
    concept_codes = "'" + "', '".join(list(grp["code"].astype(str))) + "'"
    query = f"SELECT concept_id FROM CONCEPT WHERE vocabulary_id = ? AND concept_code IN ({concept_codes});"
    print(query)
    cur.execute(query, (vocab_type,))
    df_out = pd.DataFrame(cur.fetchall(), columns=["concept_id"])
    if not len(grp) == len(df_out):
        print("ERROR: Some", vocab_type, "codes do not exist in the OMOP database")
    # Create CONCEPT_SET_ITEM rows
    df_out["concept_set_id"] = concept_set_id
    df_out.to_sql("CONCEPT_SET_ITEM", conn, if_exists='append', index=False)
    display(df_out)
    # break

# # Create new CONCEPT_SET
# table_name = os.path.splitext(os.path.basename(file_path))[0]  # Get name of file
# cur.execute(f"INSERT INTO CONCEPT_SET (concept_class_name) VALUES ('{table_name}');")
conn.commit()
conn.close()
```
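%% Cell type:markdown tags:
The INSERT above interpolates `concept_set_name` directly into the SQL string, which breaks on names containing apostrophes. A parameterized variant (a sketch of what the statement inside the loop would become):
%% Cell type:code tags:
``` python
# sketch: parameterized INSERT instead of f-string interpolation
cur.execute(
    "INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES (?, ?);",
    (concept_set_name, "MELDB"),
)
```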
%% Cell type:code id:85007741-e34c-4112-a63c-9fb302b76958 tags:
``` python
"'"+"', '".join(list(grp["code"].astype(str)))+"'"
```
%% Cell type:markdown id:423e7c21-f3bd-439d-9dcb-c17cc2cc6854 tags:
### ATLAS
%% Cell type:code id:c6b45e4d-c7d2-42e7-9b4a-0e9c1c86d34b tags:
``` python
# Create ATLAS Concept Set
import requests

def atlas_create_concept(name, description="", items=[]):
    # `url` and `headers` are expected to be defined elsewhere (WebAPI endpoint + auth)
    data = {
        "id": 0,
        "name": name,
        "description": description,
        "expression": {
            "items": items
        }
    }
    try:
        # Send the POST request
        response = requests.post(url, json=data, headers=headers)
        # Check the response status
        if response.status_code == 200 or response.status_code == 201:
            print("POST request successful:")
            print(response.json())  # assuming the response is JSON
            return response.json()["id"]  # was response["id"], which raises TypeError
        else:
            print(f"POST request failed. HTTP Status Code: {response.status_code}")
            print("Response content:")
            print(response.text)
            return None
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

# Heart Test 1 - 1885487
# Heart Test 2 - 1885488
# Heart Valve Disorders - 1885449
```
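%% Cell type:markdown tags:
A usage sketch, assuming `url` points at a WebAPI concept-set endpoint and `headers` carries any required auth (both hypothetical here; `atlas_items` is the expression list built in the next cell):
%% Cell type:code tags:
``` python
# hypothetical endpoint and headers for the ATLAS demo WebAPI
url = "https://atlas-demo.ohdsi.org/WebAPI/conceptset/"
headers = {"Content-Type": "application/json"}
# new_set_id = atlas_create_concept("HEART_VALVE_DISORDERS", items=atlas_items)
```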
%% Cell type:code id:45497623-1da0-4f74-b21e-da8811c89b04 tags:
``` python
def get_omop_concepts(cur, codes, vocab_id):
    # Build a quoted list for the SQL IN clause
    mask = ""
    for c in codes:
        mask += f'"{c}", '
    mask = mask[:-2]  # remove trailing comma
    # Execute SQL
    cur.execute(f'SELECT * FROM CONCEPT WHERE concept_code IN ({mask}) AND VOCABULARY_ID = "{vocab_id}";')
    df = pd.DataFrame(cur.fetchall())  # convert to pandas df
    print("Identified", len(df[0]), "OMOP Concepts:", list(df[0]))
    return df

def omop_concepts_to_atlas_json(df):
    items = []  # renamed from `json` to avoid shadowing the json module
    for i, row in df.iterrows():
        # template for the ATLAS API
        out = {
            "concept": {
                'CONCEPT_ID': row[0],
                'CONCEPT_NAME': row[1],
                'STANDARD_CONCEPT': 'S',
                'STANDARD_CONCEPT_CAPTION': 'Standard',
                'INVALID_REASON': 'V',
                'INVALID_REASON_CAPTION': 'Valid',
                'CONCEPT_CODE': row[6],
                'DOMAIN_ID': row[2],
                'VOCABULARY_ID': row[3],
                'CONCEPT_CLASS_ID': row[4],
                'VALID_START_DATE': int(row[7]),
                'VALID_END_DATE': int(row[8])
            },
            'isExcluded': False,
            'includeDescendants': False,
            'includeMapped': False
        }
        items.append(out)
    return items

conn = sqlite3.connect("codes/omop_54.sqlite")
cur = conn.cursor()
vocab_id = "SNOMED"  # SNOMED, ATC, ICD10CM, ICD9CM, Read
csv_output = "/home/jjd1c23/ssd/meldb/jjd1c23/concepts/snomed/ANGER.csv"
# Load CSV output file
df_in = pd.read_csv(csv_output)
print(len(df_in))
# df = get_omop_concepts(cur, ["119768002", "5905001"], "SNOMED")
df = get_omop_concepts(cur, list(df_in["code"]), vocab_id)
atlas_items = omop_concepts_to_atlas_json(df)  # renamed from `json` to avoid shadowing
# display(atlas_items)
conn.close()
```
%% Cell type:code id:ea759907-c085-472a-82e2-07b6b19e2c8f tags:
``` python
# ATLAS GET CONCEPT SET
import requests

def request_get(url):
    try:
        # Send the GET request
        response = requests.get(url)
        # Check if the response status code is 200 (OK)
        if response.status_code == 200:
            print("Response data:")
            # print(response.json())  # assuming the response is in JSON format
            return response.json()
        else:
            print(f"Failed to fetch data. HTTP Status Code: {response.status_code}")
            print("Response content:")
            print(response.text)
            return None
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

# GET SET INFO
set_id = "1885449"
url = f"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}"
request_get(url)
```
%% Cell type:code id:5a70e636-6051-4930-bf1b-30d093fd0552 tags:
``` python
#GET SET ITEMS (Concepts)
set_id = "1885449"
url = f"https://atlas-demo.ohdsi.org/WebAPI/conceptset/{set_id}/expression/ATLASPROD"
response = request_get(url)
display(response)
```
%% Cell type:code id:96bfcd9c-27e8-4be4-a680-7553d908790e tags:
``` python
#ATLAS CREATE CONCEPT SET
```
#! /usr/bin/bash
echo "Removing Corrupted Files from Ho"
rm codes/GitHub_TG_repository/lymphoma_prevalence_birm_cam/lymphoma_prevalence_birm_cam_ICD10.csv
rm codes/GitHub_TG_repository/Menieresdisease_birm_cam/Menieresdisease_birm_cam_ICD10.csv
rm codes/GitHub_TG_repository/peripheral_neuropathy_birm_cam/peripheral_neuropathy_birm_cam_ICD10.csv
rm codes/GitHub_TG_repository/Sjogrenssyndrome_Bham_CAM/Sjogrenssyndrome_Bham_CAM_ICD10.csv
#! /usr/bin/bash
version="V3_2_10"
previous="V3_2_9"
python main.py -r2 PHEN_assign_v3.json CONC_summary_working.xlsx
mv output/MELD_concepts_read.csv output/${version}_MELD_concepts_readv2.csv
python main.py -i PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
mv output/MELD_concepts_read.csv output/${version}_MELD_icd10_no_translate.csv
python main.py -s PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
mv output/MELD_concepts_read.csv output/${version}_MELD_snomed_no_translate.csv
# python main.py -o PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
# mv output/MELD_concepts_read.csv output/${version}_MELD_opcs4_no_translate.csv
python main.py -a PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
mv output/MELD_concepts_read.csv output/${version}_MELD_atc_no_translate.csv
# python main.py -m PHEN_assign_v3.json CONC_summary_working.xlsx --no-translate
# mv output/MELD_concepts_read.csv output/${version}_MELD_med_no_translate.csv
mv output/MELD_errors.csv output/${version}_MELD_errors.csv
#Generate Report
rm concepts-output/MELD-report.md
python report.py PHEN_assign_v3.json CONC_summary_working.xlsx codes/ concepts-output/MELD-report.md ${version} ${previous}
#Divide Concepts to Output Repo
rm -rf concepts-output/readv2/*
rm -rf concepts-output/icd10/*
rm -rf concepts-output/snomed/*
rm -rf concepts-output/atc/*
python publish.py output/${version}_MELD_concepts_readv2.csv concepts-output/readv2/
python publish.py output/${version}_MELD_icd10_no_translate.csv concepts-output/icd10/
python publish.py output/${version}_MELD_snomed_no_translate.csv concepts-output/snomed/
python publish.py output/${version}_MELD_atc_no_translate.csv concepts-output/atc/
cp output/${version}_MELD_errors.csv concepts-output/${version}_MELD_errors.csv
# Show Changes in Output repo (should be same as report)
cd concepts-output
git diff --stat