Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
C
concepts-processing
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Package registry
Operate
Terraform modules
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
meldb
concepts-processing
Commits
c156c2d0
Commit
c156c2d0
authored
4 months ago
by
mjbonifa
Browse files
Options
Downloads
Patches
Plain Diff
refactored trud constants from MAPS to TRUD
parent
95c3d5cf
Branches
Branches containing commit
Tags
Tags containing commit
No related merge requests found
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
parse.py
+5
-5
5 additions, 5 deletions
parse.py
phen.py
+1
-1
1 addition, 1 deletion
phen.py
trud.py
+40
-40
40 additions, 40 deletions
trud.py
with
46 additions
and
46 deletions
parse.py
+
5
−
5
View file @
c156c2d0
...
@@ -69,7 +69,7 @@ class Read2_code(Proto_code):
...
@@ -69,7 +69,7 @@ class Read2_code(Proto_code):
def
__init__
(
self
,
file_path
=
None
):
def
__init__
(
self
,
file_path
=
None
):
super
().
__init__
(
file_path
)
super
().
__init__
(
file_path
)
input_path
=
trud
.
MAPS
_PROCESSED_DIR
/
'
read2_code.parquet
'
input_path
=
trud
.
TRUD
_PROCESSED_DIR
/
'
read2_code.parquet
'
if
not
input_path
.
is_file
():
if
not
input_path
.
is_file
():
raise
FileNotFoundError
(
f
"
Error: Read2 code file
'
{
input_path
}
'
does not exist. Please ensure you have installed TRUD correctly
"
)
raise
FileNotFoundError
(
f
"
Error: Read2 code file
'
{
input_path
}
'
does not exist. Please ensure you have installed TRUD correctly
"
)
self
.
db
=
pd
.
read_parquet
(
input_path
)
self
.
db
=
pd
.
read_parquet
(
input_path
)
...
@@ -115,7 +115,7 @@ class Read3_code(Proto_code):
...
@@ -115,7 +115,7 @@ class Read3_code(Proto_code):
def
__init__
(
self
,
file_path
=
None
):
def
__init__
(
self
,
file_path
=
None
):
super
().
__init__
(
file_path
)
super
().
__init__
(
file_path
)
input_path
=
trud
.
MAPS
_PROCESSED_DIR
/
'
read3_code.parquet
'
input_path
=
trud
.
TRUD
_PROCESSED_DIR
/
'
read3_code.parquet
'
if
not
input_path
.
is_file
():
if
not
input_path
.
is_file
():
raise
FileNotFoundError
(
f
"
Error: Read3 code file
'
{
input_path
}
'
does not exist. Please ensure you have installed TRUD correctly
"
)
raise
FileNotFoundError
(
f
"
Error: Read3 code file
'
{
input_path
}
'
does not exist. Please ensure you have installed TRUD correctly
"
)
self
.
db
=
pd
.
read_parquet
(
input_path
)
self
.
db
=
pd
.
read_parquet
(
input_path
)
...
@@ -160,7 +160,7 @@ class Icd10_code(Proto_code):
...
@@ -160,7 +160,7 @@ class Icd10_code(Proto_code):
def
__init__
(
self
,
file_path
=
None
):
def
__init__
(
self
,
file_path
=
None
):
super
().
__init__
(
file_path
)
super
().
__init__
(
file_path
)
input_path
=
trud
.
MAPS
_PROCESSED_DIR
/
'
icd10_code.parquet
'
input_path
=
trud
.
TRUD
_PROCESSED_DIR
/
'
icd10_code.parquet
'
if
not
input_path
.
is_file
():
if
not
input_path
.
is_file
():
raise
FileNotFoundError
(
f
"
Error: ICD10 code file
'
{
input_path
}
'
does not exist. Please ensure you have installed TRUD correctly
"
)
raise
FileNotFoundError
(
f
"
Error: ICD10 code file
'
{
input_path
}
'
does not exist. Please ensure you have installed TRUD correctly
"
)
self
.
db
=
pd
.
read_parquet
(
input_path
)
self
.
db
=
pd
.
read_parquet
(
input_path
)
...
@@ -223,7 +223,7 @@ class Snomed_code(Proto_code):
...
@@ -223,7 +223,7 @@ class Snomed_code(Proto_code):
def
__init__
(
self
,
file_path
=
None
):
def
__init__
(
self
,
file_path
=
None
):
super
().
__init__
(
file_path
)
super
().
__init__
(
file_path
)
input_path
=
trud
.
MAPS
_PROCESSED_DIR
/
'
snomed_code.parquet
'
input_path
=
trud
.
TRUD
_PROCESSED_DIR
/
'
snomed_code.parquet
'
if
not
input_path
.
is_file
():
if
not
input_path
.
is_file
():
raise
FileNotFoundError
(
f
"
Error: SNOMED code file
'
{
input_path
}
'
does not exist. Please ensure you have installed TRUD correctly
"
)
raise
FileNotFoundError
(
f
"
Error: SNOMED code file
'
{
input_path
}
'
does not exist. Please ensure you have installed TRUD correctly
"
)
self
.
db
=
pd
.
read_parquet
(
input_path
)
self
.
db
=
pd
.
read_parquet
(
input_path
)
...
@@ -280,7 +280,7 @@ class Opcs4_code(Proto_code):
...
@@ -280,7 +280,7 @@ class Opcs4_code(Proto_code):
def
__init__
(
self
,
file_path
=
None
):
def
__init__
(
self
,
file_path
=
None
):
super
().
__init__
(
file_path
)
super
().
__init__
(
file_path
)
input_path
=
trud
.
MAPS
_PROCESSED_DIR
/
'
opcs4_code.parquet
'
input_path
=
trud
.
TRUD
_PROCESSED_DIR
/
'
opcs4_code.parquet
'
if
not
input_path
.
is_file
():
if
not
input_path
.
is_file
():
raise
FileNotFoundError
(
f
"
Error: OPCS4 code file
'
{
input_path
}
'
does not exist. Please ensure you have installed TRUD correctly
"
)
raise
FileNotFoundError
(
f
"
Error: OPCS4 code file
'
{
input_path
}
'
does not exist. Please ensure you have installed TRUD correctly
"
)
self
.
db
=
pd
.
read_parquet
(
input_path
)
self
.
db
=
pd
.
read_parquet
(
input_path
)
...
...
This diff is collapsed.
Click to expand it.
phen.py
+
1
−
1
View file @
c156c2d0
...
@@ -191,7 +191,7 @@ def convert_codes(df, target, translate):
...
@@ -191,7 +191,7 @@ def convert_codes(df, target, translate):
print
(
f
"
target type
{
target
}
"
)
print
(
f
"
target type
{
target
}
"
)
for
col_name
in
df
.
columns
[
df
.
columns
!=
target
]:
for
col_name
in
df
.
columns
[
df
.
columns
!=
target
]:
filename
=
f
"
{
col_name
}
_to_
{
target
}
.parquet
"
filename
=
f
"
{
col_name
}
_to_
{
target
}
.parquet
"
map_path
=
trud
.
MAPS
_PROCESSED_DIR
/
filename
map_path
=
trud
.
TRUD
_PROCESSED_DIR
/
filename
if
map_path
.
exists
():
if
map_path
.
exists
():
col
=
df
[
col_name
]
col
=
df
[
col_name
]
df_map
=
pd
.
read_parquet
(
map_path
)
df_map
=
pd
.
read_parquet
(
map_path
)
...
...
This diff is collapsed.
Click to expand it.
trud.py
+
40
−
40
View file @
c156c2d0
...
@@ -15,9 +15,9 @@ import simpledbf
...
@@ -15,9 +15,9 @@ import simpledbf
# Constants
# Constants
FQDN
=
"
isd.digital.nhs.uk
"
FQDN
=
"
isd.digital.nhs.uk
"
MAPS
_DIR
=
Path
(
'
./build/trud
'
)
TRUD
_DIR
=
Path
(
'
./build/trud
'
)
MAPS
_DOWNLOADS_DIR
=
MAPS
_DIR
/
'
downloads
'
TRUD
_DOWNLOADS_DIR
=
TRUD
_DIR
/
'
downloads
'
MAPS
_PROCESSED_DIR
=
MAPS
_DIR
/
'
processed
'
TRUD
_PROCESSED_DIR
=
TRUD
_DIR
/
'
processed
'
def
error_exit
(
message
):
def
error_exit
(
message
):
print
(
message
,
"
error
"
)
print
(
message
,
"
error
"
)
...
@@ -44,7 +44,7 @@ def get_releases(item_id, API_KEY, latest=False):
...
@@ -44,7 +44,7 @@ def get_releases(item_id, API_KEY, latest=False):
return
data
.
get
(
"
releases
"
,
[])
return
data
.
get
(
"
releases
"
,
[])
def
download_release_file
(
item_id
,
release_ordinal
,
release
,
file_json_prefix
,
file_type
=
None
,
items_folder
=
MAPS
_DOWNLOADS_DIR
):
def
download_release_file
(
item_id
,
release_ordinal
,
release
,
file_json_prefix
,
file_type
=
None
,
items_folder
=
TRUD
_DOWNLOADS_DIR
):
"""
Download specified file type for a given release of an item.
"""
"""
Download specified file type for a given release of an item.
"""
# check folder is a directory
# check folder is a directory
...
@@ -54,7 +54,7 @@ def download_release_file(item_id, release_ordinal, release, file_json_prefix, f
...
@@ -54,7 +54,7 @@ def download_release_file(item_id, release_ordinal, release, file_json_prefix, f
file_type
=
file_type
or
file_json_prefix
file_type
=
file_type
or
file_json_prefix
file_url
=
release
.
get
(
f
"
{
file_json_prefix
}
FileUrl
"
)
file_url
=
release
.
get
(
f
"
{
file_json_prefix
}
FileUrl
"
)
file_name
=
release
.
get
(
f
"
{
file_json_prefix
}
FileName
"
)
file_name
=
release
.
get
(
f
"
{
file_json_prefix
}
FileName
"
)
file_destination
=
MAPS
_DOWNLOADS_DIR
/
file_name
file_destination
=
TRUD
_DOWNLOADS_DIR
/
file_name
if
not
file_url
or
not
file_name
:
if
not
file_url
or
not
file_name
:
error_exit
(
f
"
Missing
{
file_type
}
file information for release
{
release_ordinal
}
of item
{
item_id
}
.
"
)
error_exit
(
f
"
Missing
{
file_type
}
file information for release
{
release_ordinal
}
of item
{
item_id
}
.
"
)
...
@@ -78,7 +78,7 @@ def validate_download_hash(file_destination:str, item_hash:str):
...
@@ -78,7 +78,7 @@ def validate_download_hash(file_destination:str, item_hash:str):
else
:
else
:
error_exit
(
f
"
Could not validate origin of
{
file_destination
}
. The SHA-256 hash should be:
{
item_hash
}
, but got
{
hash
}
instead
"
)
error_exit
(
f
"
Could not validate origin of
{
file_destination
}
. The SHA-256 hash should be:
{
item_hash
}
, but got
{
hash
}
instead
"
)
def
unzip_download
(
file_destination
:
str
,
items_folder
=
MAPS
_DOWNLOADS_DIR
):
def
unzip_download
(
file_destination
:
str
,
items_folder
=
TRUD
_DOWNLOADS_DIR
):
# check folder is a directory
# check folder is a directory
if
not
items_folder
.
is_dir
():
if
not
items_folder
.
is_dir
():
...
@@ -89,24 +89,24 @@ def unzip_download(file_destination:str, items_folder=MAPS_DOWNLOADS_DIR):
...
@@ -89,24 +89,24 @@ def unzip_download(file_destination:str, items_folder=MAPS_DOWNLOADS_DIR):
def
extract_icd10
():
def
extract_icd10
():
#ICD10_edition5
#ICD10_edition5
file_path
=
MAPS
_DOWNLOADS_DIR
/
'
ICD10_Edition5_XML_20160401
'
/
'
Content
'
/
'
ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml
'
file_path
=
TRUD
_DOWNLOADS_DIR
/
'
ICD10_Edition5_XML_20160401
'
/
'
Content
'
/
'
ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml
'
df
=
pd
.
read_xml
(
file_path
)
df
=
pd
.
read_xml
(
file_path
)
df
=
df
[[
"
CODE
"
,
"
ALT_CODE
"
,
"
DESCRIPTION
"
]]
df
=
df
[[
"
CODE
"
,
"
ALT_CODE
"
,
"
DESCRIPTION
"
]]
df
=
df
.
rename
(
columns
=
{
"
CODE
"
:
"
icd10_code
"
,
df
=
df
.
rename
(
columns
=
{
"
CODE
"
:
"
icd10_code
"
,
"
ALT_CODE
"
:
"
icd10_alt_code
"
,
"
ALT_CODE
"
:
"
icd10_alt_code
"
,
"
DESCRIPTION
"
:
"
description
"
"
DESCRIPTION
"
:
"
description
"
})
})
output_path
=
MAPS
_PROCESSED_DIR
/
'
icd10_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
icd10_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
def
extract_opsc4
():
def
extract_opsc4
():
file_path
=
MAPS
_DOWNLOADS_DIR
/
'
OPCS410 Data files txt
'
/
'
OPCS410 CodesAndTitles Nov 2022 V1.0.txt
'
file_path
=
TRUD
_DOWNLOADS_DIR
/
'
OPCS410 Data files txt
'
/
'
OPCS410 CodesAndTitles Nov 2022 V1.0.txt
'
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
,
dtype
=
str
,
header
=
None
)
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
,
dtype
=
str
,
header
=
None
)
df
=
df
.
rename
(
columns
=
{
0
:
"
opcs4_code
"
,
1
:
"
description
"
})
df
=
df
.
rename
(
columns
=
{
0
:
"
opcs4_code
"
,
1
:
"
description
"
})
output_path
=
MAPS
_PROCESSED_DIR
/
'
opcs4_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
opcs4_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
...
@@ -114,30 +114,30 @@ def extract_nhs_data_migrations():
...
@@ -114,30 +114,30 @@ def extract_nhs_data_migrations():
#NHS Data Migrations
#NHS Data Migrations
#snomed only
#snomed only
file_path
=
MAPS
_DOWNLOADS_DIR
/
'
Mapping Tables
'
/
'
Updated
'
/
'
Clinically Assured
'
/
'
sctcremap_uk_20200401000001.txt
'
file_path
=
TRUD
_DOWNLOADS_DIR
/
'
Mapping Tables
'
/
'
Updated
'
/
'
Clinically Assured
'
/
'
sctcremap_uk_20200401000001.txt
'
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
)
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
)
df
=
df
[[
"
SCT_CONCEPTID
"
]]
df
=
df
[[
"
SCT_CONCEPTID
"
]]
df
=
df
.
rename
(
columns
=
{
"
SCT_CONCEPTID
"
:
"
snomed_code
"
})
df
=
df
.
rename
(
columns
=
{
"
SCT_CONCEPTID
"
:
"
snomed_code
"
})
df
=
df
.
drop_duplicates
()
df
=
df
.
drop_duplicates
()
df
=
df
.
astype
(
str
)
df
=
df
.
astype
(
str
)
output_path
=
MAPS
_PROCESSED_DIR
/
'
snomed_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
snomed_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
#r2 -> r3
#r2 -> r3
file_path
=
MAPS
_DOWNLOADS_DIR
/
'
Mapping Tables
'
/
'
Updated
'
/
'
Clinically Assured
'
/
'
rctctv3map_uk_20200401000001.txt
'
file_path
=
TRUD
_DOWNLOADS_DIR
/
'
Mapping Tables
'
/
'
Updated
'
/
'
Clinically Assured
'
/
'
rctctv3map_uk_20200401000001.txt
'
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
)
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
)
df
=
df
[[
"
V2_CONCEPTID
"
,
"
CTV3_CONCEPTID
"
]]
df
=
df
[[
"
V2_CONCEPTID
"
,
"
CTV3_CONCEPTID
"
]]
df
=
df
.
rename
(
columns
=
{
"
V2_CONCEPTID
"
:
"
read2_code
"
,
df
=
df
.
rename
(
columns
=
{
"
V2_CONCEPTID
"
:
"
read2_code
"
,
"
CTV3_CONCEPTID
"
:
"
read3_code
"
})
"
CTV3_CONCEPTID
"
:
"
read3_code
"
})
output_path
=
MAPS
_PROCESSED_DIR
/
'
read2_code_to_read3_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
read2_code_to_read3_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
#r3->r2
#r3->r2
file_path
=
MAPS
_DOWNLOADS_DIR
/
'
Mapping Tables
'
/
'
Updated
'
/
'
Clinically Assured
'
/
'
ctv3rctmap_uk_20200401000002.txt
'
file_path
=
TRUD
_DOWNLOADS_DIR
/
'
Mapping Tables
'
/
'
Updated
'
/
'
Clinically Assured
'
/
'
ctv3rctmap_uk_20200401000002.txt
'
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
)
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
)
df
=
df
[[
"
CTV3_CONCEPTID
"
,
"
V2_CONCEPTID
"
]]
df
=
df
[[
"
CTV3_CONCEPTID
"
,
"
V2_CONCEPTID
"
]]
df
=
df
.
rename
(
columns
=
{
"
CTV3_CONCEPTID
"
:
"
read3_code
"
,
df
=
df
.
rename
(
columns
=
{
"
CTV3_CONCEPTID
"
:
"
read3_code
"
,
...
@@ -145,23 +145,23 @@ def extract_nhs_data_migrations():
...
@@ -145,23 +145,23 @@ def extract_nhs_data_migrations():
df
=
df
.
drop_duplicates
()
df
=
df
.
drop_duplicates
()
df
=
df
[
~
df
[
"
read2_code
"
].
str
.
match
(
"
^.*_.*$
"
)]
#remove r2 codes with '_'
df
=
df
[
~
df
[
"
read2_code
"
].
str
.
match
(
"
^.*_.*$
"
)]
#remove r2 codes with '_'
output_path
=
MAPS
_PROCESSED_DIR
/
'
read3_code_to_read2_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
read3_code_to_read2_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
#r2 -> snomed
#r2 -> snomed
file_path
=
MAPS
_DOWNLOADS_DIR
/
'
Mapping Tables
'
/
'
Updated
'
/
'
Clinically Assured
'
/
'
rcsctmap2_uk_20200401000001.txt
'
file_path
=
TRUD
_DOWNLOADS_DIR
/
'
Mapping Tables
'
/
'
Updated
'
/
'
Clinically Assured
'
/
'
rcsctmap2_uk_20200401000001.txt
'
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
,
dtype
=
str
)
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
,
dtype
=
str
)
df
=
df
[[
"
ReadCode
"
,
"
ConceptId
"
]]
df
=
df
[[
"
ReadCode
"
,
"
ConceptId
"
]]
df
=
df
.
rename
(
columns
=
{
"
ReadCode
"
:
"
read2_code
"
,
df
=
df
.
rename
(
columns
=
{
"
ReadCode
"
:
"
read2_code
"
,
"
ConceptId
"
:
"
snomed_code
"
})
"
ConceptId
"
:
"
snomed_code
"
})
output_path
=
MAPS
_PROCESSED_DIR
/
'
read2_code_to_snomed_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
read2_code_to_snomed_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
#r3->snomed
#r3->snomed
file_path
=
MAPS
_DOWNLOADS_DIR
/
'
Mapping Tables
'
/
'
Updated
'
/
'
Clinically Assured
'
/
'
ctv3sctmap2_uk_20200401000001.txt
'
file_path
=
TRUD
_DOWNLOADS_DIR
/
'
Mapping Tables
'
/
'
Updated
'
/
'
Clinically Assured
'
/
'
ctv3sctmap2_uk_20200401000001.txt
'
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
,
dtype
=
str
)
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
,
dtype
=
str
)
df
=
df
[[
"
CTV3_TERMID
"
,
"
SCT_CONCEPTID
"
]]
df
=
df
[[
"
CTV3_TERMID
"
,
"
SCT_CONCEPTID
"
]]
df
=
df
.
rename
(
columns
=
{
"
CTV3_TERMID
"
:
"
read3_code
"
,
df
=
df
.
rename
(
columns
=
{
"
CTV3_TERMID
"
:
"
read3_code
"
,
...
@@ -169,70 +169,70 @@ def extract_nhs_data_migrations():
...
@@ -169,70 +169,70 @@ def extract_nhs_data_migrations():
df
[
"
snomed_code
"
]
=
df
[
"
snomed_code
"
].
astype
(
str
)
df
[
"
snomed_code
"
]
=
df
[
"
snomed_code
"
].
astype
(
str
)
df
=
df
[
~
df
[
"
snomed_code
"
].
str
.
match
(
"
^.*_.*$
"
)]
#remove snomed codes with '_'
df
=
df
[
~
df
[
"
snomed_code
"
].
str
.
match
(
"
^.*_.*$
"
)]
#remove snomed codes with '_'
output_path
=
MAPS
_PROCESSED_DIR
/
'
read3_code_to_snomed_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
read3_code_to_snomed_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
def
extract_nhs_read_browser
():
def
extract_nhs_read_browser
():
#r2 only
#r2 only
input_path
=
MAPS
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V2
'
/
'
ANCESTOR.DBF
'
input_path
=
TRUD
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V2
'
/
'
ANCESTOR.DBF
'
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
pd
.
concat
([
df
[
'
READCODE
'
],
df
[
'
DESCENDANT
'
]])
df
=
pd
.
concat
([
df
[
'
READCODE
'
],
df
[
'
DESCENDANT
'
]])
df
=
pd
.
DataFrame
(
df
.
drop_duplicates
())
df
=
pd
.
DataFrame
(
df
.
drop_duplicates
())
df
=
df
.
rename
(
columns
=
{
0
:
"
read2_code
"
})
df
=
df
.
rename
(
columns
=
{
0
:
"
read2_code
"
})
output_path
=
MAPS
_PROCESSED_DIR
/
'
read2_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
read2_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
#r2 -> atc
#r2 -> atc
input_path
=
MAPS
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V2
'
/
'
ATC.DBF
'
input_path
=
TRUD
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V2
'
/
'
ATC.DBF
'
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
df
[[
"
READCODE
"
,
"
ATC
"
]]
df
=
df
[[
"
READCODE
"
,
"
ATC
"
]]
df
=
df
.
rename
(
columns
=
{
"
READCODE
"
:
"
read2_code
"
,
"
ATC
"
:
"
atc_code
"
})
df
=
df
.
rename
(
columns
=
{
"
READCODE
"
:
"
read2_code
"
,
"
ATC
"
:
"
atc_code
"
})
output_path
=
MAPS
_PROCESSED_DIR
/
'
read2_code_to_atc_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
read2_code_to_atc_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
#r2 -> icd10
#r2 -> icd10
input_path
=
MAPS
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V2
'
/
'
ICD10.DBF
'
input_path
=
TRUD
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V2
'
/
'
ICD10.DBF
'
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
df
[[
"
READ_CODE
"
,
"
TARG_CODE
"
]]
df
=
df
[[
"
READ_CODE
"
,
"
TARG_CODE
"
]]
df
=
df
.
rename
(
columns
=
{
"
READ_CODE
"
:
"
read2_code
"
,
"
TARG_CODE
"
:
"
icd10_code
"
})
df
=
df
.
rename
(
columns
=
{
"
READ_CODE
"
:
"
read2_code
"
,
"
TARG_CODE
"
:
"
icd10_code
"
})
df
=
df
[
~
df
[
"
icd10_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
icd10_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
read2_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
read2_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
output_path
=
MAPS
_PROCESSED_DIR
/
'
read2_code_to_icd10_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
read2_code_to_icd10_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
#r2 -> opcs4
#r2 -> opcs4
input_path
=
MAPS
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V2
'
/
'
OPCS4V3.DBF
'
input_path
=
TRUD
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V2
'
/
'
OPCS4V3.DBF
'
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
df
[[
"
READ_CODE
"
,
"
TARG_CODE
"
]]
df
=
df
[[
"
READ_CODE
"
,
"
TARG_CODE
"
]]
df
=
df
.
rename
(
columns
=
{
"
READ_CODE
"
:
"
read2_code
"
,
"
TARG_CODE
"
:
"
opcs4_code
"
})
df
=
df
.
rename
(
columns
=
{
"
READ_CODE
"
:
"
read2_code
"
,
"
TARG_CODE
"
:
"
opcs4_code
"
})
df
=
df
[
~
df
[
"
opcs4_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
opcs4_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
read2_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
read2_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
output_path
=
MAPS
_PROCESSED_DIR
/
'
read2_code_to_opcs4_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
read2_code_to_opcs4_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
#r3 only
#r3 only
input_path
=
MAPS
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V3
'
/
'
ANCESTOR.DBF
'
input_path
=
TRUD
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V3
'
/
'
ANCESTOR.DBF
'
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
pd
.
concat
([
df
[
'
READCODE
'
],
df
[
'
DESCENDANT
'
]])
df
=
pd
.
concat
([
df
[
'
READCODE
'
],
df
[
'
DESCENDANT
'
]])
df
=
pd
.
DataFrame
(
df
.
drop_duplicates
())
df
=
pd
.
DataFrame
(
df
.
drop_duplicates
())
df
=
df
.
rename
(
columns
=
{
0
:
"
read3_code
"
})
df
=
df
.
rename
(
columns
=
{
0
:
"
read3_code
"
})
output_path
=
MAPS
_PROCESSED_DIR
/
'
read3_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
read3_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
#r3 -> icd10
#r3 -> icd10
input_path
=
MAPS
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V3
'
/
'
ICD10.DBF
'
input_path
=
TRUD
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V3
'
/
'
ICD10.DBF
'
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
df
[[
"
READ_CODE
"
,
"
TARG_CODE
"
]]
df
=
df
[[
"
READ_CODE
"
,
"
TARG_CODE
"
]]
df
=
df
.
rename
(
columns
=
{
"
READ_CODE
"
:
"
read3_code
"
,
"
TARG_CODE
"
:
"
icd10_code
"
})
df
=
df
.
rename
(
columns
=
{
"
READ_CODE
"
:
"
read3_code
"
,
"
TARG_CODE
"
:
"
icd10_code
"
})
df
=
df
[
~
df
[
"
icd10_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
icd10_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
read3_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
read3_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
output_path
=
MAPS
_PROCESSED_DIR
/
'
read3_code_to_icd10_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
read3_code_to_icd10_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
...
@@ -240,13 +240,13 @@ def extract_nhs_read_browser():
...
@@ -240,13 +240,13 @@ def extract_nhs_read_browser():
# dbf = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ICD9V3.DBF')
# dbf = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ICD9V3.DBF')
#r3 -> opcs4
#r3 -> opcs4
input_path
=
MAPS
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V3
'
/
'
OPCS4V3.DBF
'
input_path
=
TRUD
_DOWNLOADS_DIR
/
'
Standard
'
/
'
V3
'
/
'
OPCS4V3.DBF
'
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
simpledbf
.
Dbf5
(
input_path
).
to_dataframe
()
df
=
df
[[
"
READ_CODE
"
,
"
TARG_CODE
"
]]
df
=
df
[[
"
READ_CODE
"
,
"
TARG_CODE
"
]]
df
=
df
.
rename
(
columns
=
{
"
READ_CODE
"
:
"
read3_code
"
,
"
TARG_CODE
"
:
"
opcs4_code
"
})
df
=
df
.
rename
(
columns
=
{
"
READ_CODE
"
:
"
read3_code
"
,
"
TARG_CODE
"
:
"
opcs4_code
"
})
df
=
df
[
~
df
[
"
opcs4_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
opcs4_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
read3_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
df
=
df
[
~
df
[
"
read3_code
"
].
str
.
match
(
"
^.*-.*$
"
)]
#remove codes with '-'
output_path
=
MAPS
_PROCESSED_DIR
/
'
read3_code_to_opcs4_code.parquet
'
output_path
=
TRUD
_PROCESSED_DIR
/
'
read3_code_to_opcs4_code.parquet
'
df
.
to_parquet
(
output_path
,
index
=
False
)
df
.
to_parquet
(
output_path
,
index
=
False
)
print
(
f
"
Extracted:
{
output_path
}
"
)
print
(
f
"
Extracted:
{
output_path
}
"
)
...
@@ -255,11 +255,11 @@ def create_map_directories():
...
@@ -255,11 +255,11 @@ def create_map_directories():
# Check if build directory exists
# Check if build directory exists
create_map_dirs
=
False
create_map_dirs
=
False
if
MAPS
_DIR
.
exists
():
if
TRUD
_DIR
.
exists
():
user_input
=
input
(
f
"
The map directory
{
MAPS
_DIR
}
already exists. Do you want to download and process trud data again? (y/n):
"
).
strip
().
lower
()
user_input
=
input
(
f
"
The map directory
{
TRUD
_DIR
}
already exists. Do you want to download and process trud data again? (y/n):
"
).
strip
().
lower
()
if
user_input
==
"
y
"
:
if
user_input
==
"
y
"
:
# delete all build files
# delete all build files
shutil
.
rmtree
(
MAPS
_DIR
)
shutil
.
rmtree
(
TRUD
_DIR
)
create_map_dirs
=
True
create_map_dirs
=
True
elif
user_input
==
"
n
"
:
elif
user_input
==
"
n
"
:
print
(
"
Exiting TRUD installation
"
)
print
(
"
Exiting TRUD installation
"
)
...
@@ -269,9 +269,9 @@ def create_map_directories():
...
@@ -269,9 +269,9 @@ def create_map_directories():
if
create_map_dirs
:
if
create_map_dirs
:
# create maps directories
# create maps directories
MAPS
_DIR
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
TRUD
_DIR
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
MAPS
_DOWNLOADS_DIR
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
TRUD
_DOWNLOADS_DIR
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
MAPS
_PROCESSED_DIR
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
TRUD
_PROCESSED_DIR
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
def
install
(
api_key
):
def
install
(
api_key
):
print
(
f
"
Installing TRUD
"
)
print
(
f
"
Installing TRUD
"
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment