meldb / concepts-processing

Commit 945ff7ca, authored 4 months ago by mjbonifa
Parent: 24c5944e

    fixed default output and error files

Showing 4 changed files, with 83 additions and 149 deletions:

- README.md (+46 −44)
- acmc.py (+7 −5)
- map.py (+20 −68)
- parse.py (+10 −32)
README.md (+46 −44)

@@ -125,51 +125,53 @@ Phenotypes are defined in a JSON configuration file. The file describes how sour…

An example concept set and code list for Abdominal Pain is shown below. This commit corrects the example JSON (the previous version had mismatched brackets and a trailing comma) and renames the source code folder to "clinical-codes-org":

```json
{
    "concept_sets": {
        "version": "3.2.10",
        "omop": {
            "vocabulary_id": "MELDB",
            "vocabulary_name": "Multidisciplinary Ecosystem to study Lifecourse Determinants and Prevention of Early-onset Burdensome Multimorbidity",
            "vocabulary_reference": "https://www.it-innovation.soton.ac.uk/projects/meldb"
        },
        "concept_set": [
            {
                "concept_set_name": "ABDO_PAIN",
                "concept_set_status": "AGREED",
                "metadata": {
                    "#": "18",
                    "CONCEPT DESCRIPTION": "Abdominal pain",
                    "CONCEPT TYPE": "Workload indicator (symptom)",
                    "DATE ADDED ": "2023-08-25",
                    "REQUEST REASON ": "Clinician SF - requested by email - symptom example from Qualitative Evidence Synthesis",
                    "SOURCE INFO": "YES",
                    "FUNCTION": "QUERY BY CODING LIST",
                    "FUNCTION.1": "https://clinicalcodes.rss.mhs.man.ac.uk/",
                    "CODING LIST": "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/codes/ClinicalCodes.org%20from%20the%20University%20of%20Manchester/Symptom%20code%20lists/Abdominal%20pain/res176-abdominal-pain.csv ",
                    "NOTES": "2023-09-08: Clinical SF confirmed that the clinical view would be that this would need to be recurrent or persistent."
                }
            }
        ]
    },
    "codes": [
        {
            "folder": "clinical-codes-org",
            "description": "SF's clinical codes - downloaded 16/11/23",
            "files": [
                {
                    "file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv",
                    "columns": {
                        "read2_code": "code",
                        "metadata": ["description"]
                    },
                    "concept_set": ["ABDO_PAIN"]
                }
            ]
        }
    ]
}
```
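As a quick sanity check on this structure, a configuration with the shape shown above can be loaded and its concept sets and source code files listed. This is an illustrative sketch, not code from the repository; the `config.json` filename is assumed:

```python
import json
from pathlib import Path

# Illustrative sketch (not project code): read a phenotype configuration with the
# structure shown above and list its concept sets and the files they draw codes from.
config = json.loads(Path("config.json").read_text())  # hypothetical filename

for cs in config["concept_sets"]["concept_set"]:
    print(cs["concept_set_name"], "-", cs["concept_set_status"])

for folder in config["codes"]:
    for f in folder["files"]:
        print(folder["folder"], "/", f["file"], "->", f["concept_set"])
```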
acmc.py (+7 −5)

```diff
@@ -4,6 +4,8 @@ import trud
 import omop
 import map
+from pathlib import Path
 
 def trud_install(args):
     """
     Handle the `trud install` command.
     """
     print(f"Installing TRUD")
@@ -31,7 +33,7 @@ def omop_delete(args):
 def map_process(args):
     """
     Handle the `map process` command.
     """
     print(f"Processing map with phenotype config file: {args.config_file}")
-    print(f"Output directory: {args.output_dir}")
+    print(f"Output directory: {args.output_file}")
     print(f"Target coding format: {args.target_coding}")
     if args.translate:
         print("Translating code types.")
@@ -51,8 +53,8 @@ def map_process(args):
                 args.target_coding,
                 args.translate,
                 args.verify,
-                args.error_log,
-                output_path="MELD_concepts_read.csv")
+                error_path=Path(args.error_log),
+                output_path=Path(args.output_file))
 
     print(f"Phenotype processing completed")
@@ -95,15 +97,15 @@ def main():
     map_process_parser = map_subparsers.add_parser("process", help="Process map configuration file")
     map_process_parser.add_argument("-c", "--config-file", required=True, help="Phenotype configuration file")
     map_process_parser.add_argument("-s", "--source-codes-dir", required=True, help="Source codes root directory")
-    map_process_parser.add_argument("-o", "--output-dir", required=True, help="Output directory for CSV or OMOP database")
     map_process_parser.add_argument("-t", "--target-coding", required=True, choices=['read2', 'read3', 'icd10', 'snomed', 'opcs4'], help="Specify the target coding (read2, read3, icd10, snomed, opcs4)")
+    map_process_parser.add_argument("-o", "--output-file", type=str, default=str(map.OUTPUT_PATH.resolve()), help="Output directory for CSV or OMOP database")
     # Flags
     map_process_parser.add_argument("-tr", "--translate", action="store_true", default=False, help="Do not translate code types")
     map_process_parser.add_argument("-v", "--verify", action="store_true", default=False, help="Do not verify codes")
     # Error log file
-    map_process_parser.add_argument("-l", "--error-log", type=str, default='error.csv', help="Filepath to save error log to")
+    map_process_parser.add_argument("-l", "--error-log", type=str, default=str(map.ERROR_PATH.resolve()), help="Filepath to save error log to")
     # Set the function to call when 'process' subcommand is used
     map_process_parser.set_defaults(func=map_process)
```
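The net effect in acmc.py is that `map process` no longer requires `--output-dir`; it now takes an optional `--output-file` and `--error-log`, both defaulting to the paths defined in map.py, and forwards them to `map.process()` as `Path` objects. A minimal sketch of that call, with hypothetical config and source-directory names, looks like this:

```python
from pathlib import Path

import map  # the project's map.py module

# Sketch of how map_process() now invokes map.process(); the config file and
# source directory names are hypothetical, the defaults come from map.py.
output_file = str(map.OUTPUT_PATH.resolve())  # default: build/phenotype_mapping.csv
error_log = str(map.ERROR_PATH.resolve())     # default: build/errors.csv

map.process(
    "phenotype_config.json",   # --config-file (hypothetical)
    "codes/",                  # --source-codes-dir (hypothetical)
    "read2",                   # --target-coding
    True,                      # --translate
    True,                      # --verify
    error_path=Path(error_log),
    output_path=Path(output_file),
)
```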
map.py (+20 −68)

```diff
@@ -24,6 +24,8 @@ from omop import setup
 pd.set_option("mode.chained_assignment", None)
 
+OUTPUT_PATH = Path('build') / 'phenotype_mapping.csv'
+ERROR_PATH = Path('build') / 'errors.csv'
 
 def read_table_file(path, excel_sheet=None):
     """
@@ -166,8 +168,7 @@ def sql_row_exist(conn, table, column, value):
     return exists
 
-def process(config_file, source_codes_dir, target_code_type, translate=True, verify=True, log_errors_path="errors.csv", output_path="MELD_concepts_read.csv"):
+def process(config_file, source_codes_dir, target_code_type, translate=True, verify=True, error_path=ERROR_PATH, output_path=OUTPUT_PATH):
     config_path = Path(config_file)
     if not config_path.is_file():
         raise FileNotFoundError(f"Error: phenotype configuration file '{config_path}' does not exist.")
@@ -196,27 +197,16 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver…
             # Load Code File
             if "excel_sheet" in file:
-                df = read_table_file(
-                    path=file_path, excel_sheet=file["excel_sheet"]
-                )
+                df = read_table_file(path=file_path, excel_sheet=file["excel_sheet"])
             else:
                 df = read_table_file(path=file_path)
 
             # Perform Structural Changes to file before preprocessing
             # split column with multiple code types
-            if (
-                "actions" in file
-                and "split_col" in file["actions"]
-                and "codes_col" in file["actions"]
-            ):
+            if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]):
                 split_col = file["actions"]["split_col"]
                 codes_col = file["actions"]["codes_col"]
-                print(
-                    "Action: Splitting",
-                    split_col,
-                    "column into:",
-                    df[split_col].unique(),
-                )
+                print("Action: Splitting", split_col, "column into:", df[split_col].unique(),)
                 codes = df[codes_col]
                 oh = pd.get_dummies(df[split_col], dtype=bool)  # one hot encode
                 oh = oh.where((oh != True), codes, axis=0)  # fill in 1s with codes
@@ -231,74 +221,36 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver…
                 # TODO: enable metacolumns to be outputted - problem with map_file appending
                 if "metadata" in file["columns"]:
                     meta_columns += file["columns"]["metadata"]
-                df = preprocess(
-                    df,
-                    file["columns"],
-                    meta_columns=meta_columns,
-                    file_path=file_path,
-                    target_code_type=target_code_type,
-                    verify=verify,
-                    translate=translate,
-                )
+                df = preprocess(df, file["columns"], meta_columns=meta_columns, file_path=file_path, target_code_type=target_code_type, verify=verify, translate=translate)
             else:
                 raise Exception("No column format provided")
 
             # partition table by categorical column
-            if (
-                "actions" in file
-                and "divide_col" in file["actions"]
-                and len(df) > 0
-            ):
+            if ("actions" in file and "divide_col" in file["actions"] and len(df) > 0):
                 divide_col = file["actions"]["divide_col"]
-                print(
-                    "Action: Dividing Table by",
-                    divide_col,
-                    "column into:",
-                    df[divide_col].unique(),
-                )
+                print("Action: Dividing Table by", divide_col, "column into:", df[divide_col].unique(),)
                 df = df.groupby(divide_col)
 
             # Map to MELDB Concept/Phenotype
             if len(df) == 0:
                 pass
                 # out = df
             elif ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame):
-                out = map_file(
-                    df,
-                    target_code_type,
-                    out,
-                    concepts=file["concept_set"],
-                    meta_columns=meta_columns,
-                    translate=translate,
-                )
+                out = map_file(df, target_code_type, out, concepts=file["concept_set"], meta_columns=meta_columns, translate=translate,)
             elif ("concept_set_categories" in file) and isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
                 meta_columns.remove(divide_col)  # delete categorical column
                 for cat, grp in df:
-                    if (
-                        cat in file["concept_set_categories"].keys()
-                    ):  # check if category is mapped
-                        grp = grp.drop(
-                            columns=[divide_col]
-                        )  # delete categorical column
+                    if (cat in file["concept_set_categories"].keys()):  # check if category is mapped
+                        grp = grp.drop(columns=[divide_col])  # delete categorical column
                         print("Category:", cat)
-                        out = map_file(
-                            grp,
-                            target_code_type,
-                            out,
-                            concepts=file["concept_set_categories"][cat],
-                            meta_columns=meta_columns,
-                        )
+                        out = map_file(grp, target_code_type, out, concepts=file["concept_set_categories"][cat], meta_columns=meta_columns,)
     else:
         print("Folder is empty")
 
     # check if out is empty
     if len(out) <= 0:
-        raise Exception("Output file is empty")
+        raise Exception("Output dataframe is empty")
 
     # Final Processing
     out = out.reset_index(drop=True)
@@ -340,11 +292,11 @@ def process(config_file, source_codes_dir, target_code_type, translate=True, ver…
     else:
         # export as CSV to /output
         out.to_csv(output_path, index=False)
-        print("saved to", output_path)
+        print("Saved to", output_path)
 
     # Save Error File
-    if os.path.exists(log_errors_path):
-        error_df = pd.read_csv(log_errors_path)
+    if error_path.exists():
+        error_df = pd.read_csv(error_path)
         error_df = error_df.drop_duplicates()  # Remove Duplicates from Error file
         error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"])
-        error_df.to_csv(log_errors_path, index=False)
+        error_df.to_csv(error_path, index=False)
```
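One step in `process()` worth calling out is the `split_col` action: a column holding mixed code types is one-hot encoded and the `True` cells are then replaced with the actual codes, giving one column per code type. A toy illustration of that pandas idiom follows (made-up data, not project code):

```python
import pandas as pd

# Toy data: one column of codes plus a column naming each code's type.
df = pd.DataFrame({
    "code": ["A01..", "22298006", "B02.."],
    "code_type": ["read2", "snomed", "read2"],
})

oh = pd.get_dummies(df["code_type"], dtype=bool)  # one hot encode the type column
oh = oh.where((oh != True), df["code"], axis=0)   # fill the True cells with the codes
print(oh)
#    read2    snomed
# 0  A01..     False
# 1  False  22298006
# 2  B02..     False
```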
parse.py (+10 −32)

```diff
@@ -68,13 +68,12 @@ class Proto_code():
 class Read2_code(Proto_code):
     def __init__(self, file_path=None):
         super().__init__(file_path)
         input_path = trud.MAPS_PROCESSED_DIR / 'read2_code.parquet'
         if not input_path.is_file():
             raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")
         self.db = pd.read_parquet(input_path)
         self.arg_small = "-r2"
         self.arg_long = "--read2-code"
         self.arg_help = "Read V2 Codes Column name in Source File"
         self.checks = [
             ("Not Empty",
@@ -115,9 +114,6 @@ class Read2_code(Proto_code):
 class Read3_code(Proto_code):
     def __init__(self, file_path=None):
         super().__init__(file_path)
         self.arg_small = "-r3"
         self.arg_long = "--read3-code"
         self.arg_help = "Read V3 Codes Column name in Source File"
         input_path = trud.MAPS_PROCESSED_DIR / 'read3_code.parquet'
         if not input_path.is_file():
@@ -163,9 +159,6 @@ class Read3_code(Proto_code):
 class Icd10_code(Proto_code):
     def __init__(self, file_path=None):
         super().__init__(file_path)
         self.arg_small = "-i"
         self.arg_long = "--icd10-code"
         self.arg_help = "ICD10 Codes Column name in Source File"
         input_path = trud.MAPS_PROCESSED_DIR / 'icd10_code.parquet'
         if not input_path.is_file():
@@ -229,9 +222,6 @@ class Icd10_code(Proto_code):
 class Snomed_code(Proto_code):
     def __init__(self, file_path=None):
         super().__init__(file_path)
         self.arg_small = "-s"
         self.arg_long = "--snomed-code"
         self.arg_help = "SNOMED Codes Column name in Source File"
         input_path = trud.MAPS_PROCESSED_DIR / 'snomed_code.parquet'
         if not input_path.is_file():
@@ -289,9 +279,6 @@ class Snomed_code(Proto_code):
 class Opcs4_code(Proto_code):
     def __init__(self, file_path=None):
         super().__init__(file_path)
         self.arg_small = "-o"
         self.arg_long = "--opcs4-code"
         self.arg_help = "OPCS4 Codes Column name in Source File"
         input_path = trud.MAPS_PROCESSED_DIR / 'opcs4_code.parquet'
         if not input_path.is_file():
@@ -317,9 +304,6 @@ class Opcs4_code(Proto_code):
 class Atc_code(Proto_code):
     def __init__(self, file_path=None):
         super().__init__(file_path)
         self.arg_small = "-a"
         self.arg_long = "--atc-code"
         self.arg_help = "ATC Codes Column name in Source File"
         self.checks = [
             ("Not Empty",
@@ -340,9 +324,6 @@ class Atc_code(Proto_code):
 class Med_code(Proto_code):
     def __init__(self, file_path=None):
         super().__init__(file_path)
         self.arg_small = "-m"
         self.arg_long = "--med-code"
         self.arg_help = "Med Codes Column name in Source File"
         self.checks = [
             ("Not Empty",
@@ -354,9 +335,6 @@ class Med_code(Proto_code):
 class Cprd_code(Proto_code):
     def __init__(self, file_path=None):
         super().__init__(file_path)
         self.arg_small = "-c"
         self.arg_long = "--cprd-code"
         self.arg_help = "CPRD Product Codes Column name in Source File"
         self.checks = [
             ("Not Empty",
@@ -366,14 +344,14 @@ class Cprd_code(Proto_code):
         ]
 
 code_types = {
-    "read2_code": Read2_code,
-    "read3_code": Read3_code,
-    "icd10_code": Icd10_code,
-    "snomed_code": Snomed_code,
-    "opcs4_code": Opcs4_code,
-    "atc_code": Atc_code,
-    "med_code": Med_code,
-    "cprd_code": Cprd_code,
+    "read2": Read2_code,
+    "read3": Read3_code,
+    "icd10": Icd10_code,
+    "snomed": Snomed_code,
+    "opcs4": Opcs4_code,
+    "atc": Atc_code,
+    "med": Med_code,
+    "cprd": Cprd_code,
 }
 
 vocab_types = {
```
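The key rename in `code_types` brings the registry in line with the `--target-coding` choices exposed by acmc.py ("read2", "read3", "icd10", "snomed", "opcs4"), so the parser class for a target coding can be looked up directly from the CLI value. A sketch of that lookup, as assumed usage rather than code from this commit:

```python
import parse

# Assumed usage: the CLI value now doubles as the registry key.
target_coding = "read2"                       # e.g. args.target_coding from acmc.py
parser_cls = parse.code_types[target_coding]  # previously keyed as "read2_code"
code_parser = parser_cls()                    # loads read2_code.parquet installed via TRUD
```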