Commit 3a7d715c authored by mjbonifa

Started the pre-commit hook work, but it seems more complex as it requires downloads from GitHub etc. with usernames and passwords. #21
parent cae3acc7
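The commit message refers to pre-commit fetching hook repositories over the network: pre-commit reads a .pre-commit-config.yaml at the repository root and clones each listed repo from GitHub on first run, which is where usernames and passwords can come into play. The reformatting in the diff below matches black's style, so a minimal config sketch might look like this (the black hook and the pinned rev are illustrative assumptions, not part of this commit):

    repos:
      - repo: https://github.com/psf/black  # cloned from GitHub on first run
        rev: 24.1.1                         # assumed pin; substitute a real tag
        hooks:
          - id: black

After that, pre-commit install wires the hook into .git/hooks, and pre-commit run --all-files applies it to the whole tree.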
@@ -44,6 +44,7 @@ COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION]
 CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"]
 
+
 class PhenValidationException(Exception):
     """Custom exception class raised when validation errors in phenotype configuration file"""
@@ -308,7 +309,9 @@ def validate(phen_dir):
             # check code file type is supported
             if concept_code_file_path.suffix not in CODE_FILE_TYPES:
-                raise ValueError(f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types")
+                raise ValueError(
+                    f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types"
+                )
 
             # check columns specified are a supported medical coding type
             for column in item["file"]["columns"]:
@@ -321,9 +324,7 @@ def validate(phen_dir):
             if "actions" in item["file"]:
                 for action in item["file"]["actions"]:
                     if action not in COL_ACTIONS:
-                        validation_errors.append(
-                            f"Action {action} is not supported"
-                        )
+                        validation_errors.append(f"Action {action} is not supported")
             else:
                 validation_errors.append(
@@ -356,7 +357,9 @@ def read_table_file(path, excel_sheet=None):
     elif path.suffix == ".dta":
         df = pd.read_stata(path, dtype=str)
     else:
-        raise ValueError(f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types")
+        raise ValueError(
+            f"Unsupported filetype {codes_file_path.suffix}, only support{CODE_FILE_TYPES} code file types"
+        )
 
     return df
@@ -394,7 +397,10 @@ def preprocess_codes(df, concept_set, code_file_path, target_code_type=None):
     # TODO: Is there a better way of processing this action as it's distributed across
     # different parts of the programme.
-    if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"]:
+    if (
+        "actions" in concept_set["file"]
+        and "divide_col" in concept_set["file"]["actions"]
+    ):
         divide_col_df = df[concept_set["file"]["actions"]["divide_col"]]
     else:
         divide_col_df = pd.DataFrame()
@@ -582,29 +588,23 @@ def map(phen_dir, target_code_type):
         # Map
         # if processing a source coding list with categorical data
-        if "actions" in concept_set["file"] and "divide_col" in concept_set["file"]["actions"] and len(df) > 0:
+        if (
+            "actions" in concept_set["file"]
+            and "divide_col" in concept_set["file"]["actions"]
+            and len(df) > 0
+        ):
             divide_col = concept_set["file"]["actions"]["divide_col"]
             logger.debug(f"Action: Dividing Table by {divide_col}")
             logger.debug(f"column into: {df[divide_col].unique()}")
             df_grp = df.groupby(divide_col)
             for cat, grp in df_grp:
                 if cat == concept_set["file"]["category"]:
-                    grp = grp.drop(
-                        columns=[divide_col]
-                    )  # delete categorical column
+                    grp = grp.drop(columns=[divide_col])  # delete categorical column
                     out = map_file(
-                        grp,
-                        target_code_type,
-                        out,
-                        concept_name=concept_set['name']
+                        grp, target_code_type, out, concept_name=concept_set["name"]
                     )
         else:
-            out = map_file(
-                df,
-                target_code_type,
-                out,
-                concept_name=concept_set['name']
-            )
+            out = map_file(df, target_code_type, out, concept_name=concept_set["name"])
 
         if len(code_errors) > 0:
             logger.error(f"The map processing has {len(code_errors)} errors")
@@ -847,9 +847,7 @@ def diff(phen_dir, phen_old_dir):
     new_config = new_phen_path / CONFIG_FILE
     with new_config.open("r") as file:
         new_config = yaml.safe_load(file)
-    report.write(
-        f"\n\n# Report for version {new_config['phenotype']['version']}\n\n"
-    )
+    report.write(f"\n\n# Report for version {new_config['phenotype']['version']}\n\n")
     report.write(f"- Removed outputs: {list(removed_outputs)}\n")
     report.write(f"- Added outputs: {list(added_outputs)}\n")
     report.write(f"- Common outputs: {list(common_outputs)}\n")