meldb / concepts-processing

Commit 633702e3
authored 4 months ago by mjbonifa
checked metadata_df
parent 75615a67
Showing 1 changed file: acmc/phen.py (+28 additions, -20 deletions)
@@ -313,15 +313,33 @@ def read_table_file(path, excel_sheet=None):
    return df


-def preprocess_code(out, codes, codes_file, checker, output_col, df_meta):
+def process_actions(df, file):
+    # Perform Structural Changes to file before preprocessing
+    logger.debug("Processing file structural actions")
+    if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]):
+        split_col = file["actions"]["split_col"]
+        codes_col = file["actions"]["codes_col"]
+        logger.debug(
+            "Action: Splitting", split_col, "column into:", df[split_col].unique(),
+        )
+        codes = df[codes_col]
+        oh = pd.get_dummies(df[split_col], dtype=bool)  # one hot encode
+        oh = oh.where((oh != True), codes, axis=0)  # fill in 1s with codes
+        oh[oh == False] = np.nan  # replace 0s with None
+        df = pd.concat([df, oh], axis=1)  # merge in new columns
+
+    return df
+
+
+def preprocess_code(out, codes, codes_file, checker, output_col, metadata_df):
    # preprocess codes
    codes = codes.astype(str)  # convert to string
    codes = codes.str.strip()  # remove excess spaces
    codes, errors = checker.process(codes, codes_file)  # resolve any identified issues
    if len(errors) > 0:
        raise Exception(f"Code validation failed with {len(errors)} errors")

    # add metadata columns
-    out = pd.concat([out, pd.DataFrame({output_col: codes}).join(df_meta)], ignore_index=True)
+    out = pd.concat([out, pd.DataFrame({output_col: codes}).join(metadata_df)], ignore_index=True)
    return out
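For orientation, here is a minimal sketch of the split action that process_actions now encapsulates, using invented column names ("code", "code_type") rather than anything from the repository's real configuration: get_dummies one-hot encodes the split column, where() swaps each True cell for that row's code, and the remaining False cells are nulled out.

# Sketch only: pandas/numpy as imported in acmc/phen.py; column names are hypothetical.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "code": ["A01", "B02", "C03"],              # stands in for codes_col
    "code_type": ["icd10", "snomed", "icd10"],  # stands in for split_col
})

codes = df["code"]
oh = pd.get_dummies(df["code_type"], dtype=bool)  # one boolean column per code type
oh = oh.where((oh != True), codes, axis=0)        # True cells take the row's code value
oh[oh == False] = np.nan                          # False cells become NaN
df = pd.concat([df, oh], axis=1)                  # per-type columns appended to the frame

print(df)
#   code code_type icd10 snomed
# 0  A01     icd10   A01    NaN
# 1  B02    snomed   NaN    B02
# 2  C03     icd10   C03    NaN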
@@ -339,7 +357,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N
                codes_file=codes_file,
                checker=code_types[target_code_type](file_path),
                output_col=target_code_type,
-                df_meta=df[meta_columns])
+                metadata_df=df[meta_columns])
        else:
            logger.warning(f"No {target_code_type} Codes to process")
    else:
@@ -352,7 +370,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N
                codes_file=codes_file,
                checker=v(),
                output_col=k,
-                df_meta=df[meta_columns])
+                metadata_df=df[meta_columns])
    return out

# Translate Df with multiple codes into single code type Series
@@ -390,7 +408,7 @@ def convert_codes(df, target, translate):
# Append file's codes to output Df with concept
def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=True):
    # seperate out meta_columns
-    df_meta = df[meta_columns]
+    metadata_df = df[meta_columns]
    df = df.drop(columns=meta_columns)
    codes = convert_codes(df, target_code_type, translate)
    codes = codes.dropna()  # delete NaNs
@@ -398,7 +416,7 @@ def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=Tru
    # Append to out df
    if len(codes) > 0:
        codes = pd.DataFrame({"CONCEPT": codes})
-        codes = codes.join(df_meta)
+        codes = codes.join(metadata_df)
        for concept in concepts:
            codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
            out = pd.concat([out, codes])
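As a reading aid for the join above, a toy run of the map_file body with invented codes and metadata: metadata_df re-attaches to the translated codes by row index, and CONCEPT_SET is broadcast per concept before the frame is appended to out.

# Sketch only: mirrors the body of map_file above; all data here is invented.
import numpy as np
import pandas as pd

codes = pd.Series(["A01", "B02"])
metadata_df = pd.DataFrame({"description": ["first code", "second code"]})
concepts = ["ASTHMA"]
out = pd.DataFrame()

codes = pd.DataFrame({"CONCEPT": codes})
codes = codes.join(metadata_df)  # metadata columns rejoin on the shared row index
for concept in concepts:
    codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
    out = pd.concat([out, codes])

print(out)
#   CONCEPT  description CONCEPT_SET
# 0     A01   first code      ASTHMA
# 1     B02  second code      ASTHMA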
@@ -446,18 +464,8 @@ def map(phen_dir, target_code_type, translate=True):
        else:
            df = read_table_file(path=codes_file_path)

-        # Perform Structural Changes to file before preprocessing
-        logger.debug("Processing file structural actions")
-        if ("actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]):
-            split_col = file["actions"]["split_col"]
-            codes_col = file["actions"]["codes_col"]
-            logger.debug(
-                "Action: Splitting", split_col, "column into:", df[split_col].unique(),
-            )
-            codes = df[codes_col]
-            oh = pd.get_dummies(df[split_col], dtype=bool)  # one hot encode
-            oh = oh.where((oh != True), codes, axis=0)  # fill in 1s with codes
-            oh[oh == False] = np.nan  # replace 0s with None
-            df = pd.concat([df, oh], axis=1)  # merge in new columns
+        # process structural actions
+        # split column with multiple code types
+        df = process_actions(df, file)

        # Preprocessing & Validation Checks
        logger.debug("Processing and validating code formats")
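The removed block is now a single call to the process_actions helper added earlier in this commit. A usage sketch, assuming acmc is importable and reusing the invented column names from above; the file dict only needs the keys that process_actions reads ("actions", "split_col", "codes_col").

# Sketch only: the column names and action values here are hypothetical.
import pandas as pd
from acmc.phen import process_actions

df = pd.DataFrame({"code": ["A01", "B02"], "code_type": ["icd10", "read2"]})
file = {"actions": {"split_col": "code_type", "codes_col": "code"}}

df = process_actions(df, file)  # replaces the inline splitting logic removed in this hunk
print(df.columns.tolist())      # ['code', 'code_type', 'icd10', 'read2']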