Skip to content
Snippets Groups Projects
Commit ab787128 authored by mjbonifa's avatar mjbonifa
Browse files

removed -verify from option as always should be doing this and added some...

Removed the -verify option, since verification should always be performed, and added some validation checks to the config file
parent 131f74ac
No related branches found
No related tags found
No related merge requests found
...@@ -235,34 +235,46 @@ def validate(phen_dir): ...@@ -235,34 +235,46 @@ def validate(phen_dir):
# check codes definition # check codes definition
concept_set_mapping_names = [] concept_set_mapping_names = []
for item in concept_codes: for item in concept_codes:
# check concept codes path is a directory required_keys = {"folder", "files"}
concept_code_dir_path = codes_path / item['folder'] if required_keys.issubset(item.keys()):
if not concept_code_dir_path.is_dir(): # check concept codes path is a directory
validation_errors.append(f"Folder directory {str(concept_code_dir_path.resolve())} is not a directory") concept_code_dir_path = codes_path / item['folder']
if not concept_code_dir_path.is_dir():
for file in item["files"]: validation_errors.append(f"Folder directory {str(concept_code_dir_path.resolve())} is not a directory")
# check concept code file exists
concept_code_file_path = concept_code_dir_path / file['file'] for file in item["files"]:
if not concept_code_file_path.exists(): # check concepte code file exists
validation_errors.append(f"Coding file {str(concept_code_file_path.resolve())} does not exist") concept_code_file_path = concept_code_dir_path / file['file']
if not concept_code_file_path.exists():
# check columns specified are a supported medical coding type validation_errors.append(f"Coding file {str(concept_code_file_path.resolve())} does not exist")
for column in file['columns']:
if column not in code_types and column != 'metadata': # check concept code file is not empty
validation_errors.append(f"Column type {column} for file {concept_code_file_path} is not supported") concept_code_file_path = concept_code_dir_path / file['file']
if concept_code_file_path.stat().st_size == 0:
# check the actions are supported validation_errors.append(f"Coding file {str(concept_code_file_path.resolve())} is an empty file")
if 'actions' in file:
for action in file['actions']: # check columns section exists
if action not in COL_ACTIONS: if "columns" not in file:
validation_errors.append(f"Action {action} is not supported") validation_errors.append(f"Columns not defined for {concept_code_file_path}")
# check concept_set defined for the mapping # check columns specified are a supported medical coding type
for concept_set_mapping in file['concept_set']: for column in file['columns']:
# store the concept set names found for later set operations if column not in code_types and column != 'metadata':
if concept_set_mapping not in concept_set_mapping_names: validation_errors.append(f"Column type {column} for file {concept_code_file_path} is not supported")
concept_set_mapping_names.append(concept_set_mapping)
# check the actions are supported
if 'actions' in file:
for action in file['actions']:
if action not in COL_ACTIONS:
validation_errors.append(f"Action {action} is not supported")
# check concept_set defined for the mapping
for concept_set_mapping in file['concept_set']:
# store the concept set names found for later set operations
if concept_set_mapping not in concept_set_mapping_names:
concept_set_mapping_names.append(concept_set_mapping)
else:
validation_errors.append(f"Missing required elements {required_keys} in codes {item}")
# create sets to perform set operations on the lists of concept set names # create sets to perform set operations on the lists of concept set names
concept_set_names_set = set(concept_set_names) concept_set_names_set = set(concept_set_names)
concept_set_mapping_names_set = set(concept_set_mapping_names) concept_set_mapping_names_set = set(concept_set_mapping_names)
...@@ -302,22 +314,20 @@ def read_table_file(path, excel_sheet=None): ...@@ -302,22 +314,20 @@ def read_table_file(path, excel_sheet=None):
return df return df
def preprocess_code(out, codes, codes_file, checker, output_col, df_meta, verify=True): def preprocess_code(out, codes, codes_file, checker, output_col, df_meta):
logger.debug(f" Preprocess_code")
codes = codes.astype(str) # convert to string codes = codes.astype(str) # convert to string
codes = codes.str.strip() # remove excess spaces codes = codes.str.strip() # remove excess spaces
logger.debug(f" CODE TYPE IN PREPROCESS {type(codes)}")
if verify: codes, errors = checker.process(codes, codes_file) # resolve any identified issues
codes, errors = checker.process(codes, codes_file) # resolve any identified issues if len(errors) > 0:
if len(errors) > 0: raise Exception(f"Code validation failed with {len(errors)} errors")
raise Exception(f"Code validation failed with {len(errors)} errors")
# add metadata columns # add metadata columns
out = pd.concat([out, pd.DataFrame({output_col: codes}).join(df_meta)], ignore_index=True) out = pd.concat([out, pd.DataFrame({output_col: codes}).join(df_meta)], ignore_index=True)
return out return out
# Perform QA Checks on columns individually and append to df # Perform QA Checks on columns individually and append to df
def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=None, verify=True, translate=True,): def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=None, translate=True,):
""" Parses each column individually - Order and length will not be preserved! """ """ Parses each column individually - Order and length will not be preserved! """
out = pd.DataFrame([]) # create output df to append to out = pd.DataFrame([]) # create output df to append to
logger.debug(f"CODES file {codes_file}") logger.debug(f"CODES file {codes_file}")
...@@ -330,8 +340,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N ...@@ -330,8 +340,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N
codes_file=codes_file, codes_file=codes_file,
checker=code_types[target_code_type](file_path), checker=code_types[target_code_type](file_path),
output_col=target_code_type, output_col=target_code_type,
df_meta=df[meta_columns], df_meta=df[meta_columns])
verify=verify,)
else: else:
logger.warning(f"No {target_code_type} Codes to process") logger.warning(f"No {target_code_type} Codes to process")
else: else:
...@@ -344,8 +353,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N ...@@ -344,8 +353,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], codes_file=N
codes_file=codes_file, codes_file=codes_file,
checker=v(), checker=v(),
output_col=k, output_col=k,
df_meta=df[meta_columns], df_meta=df[meta_columns])
verify=verify,)
return out return out
# Translate Df with multiple codes into single code type Series # Translate Df with multiple codes into single code type Series
...@@ -408,11 +416,10 @@ def sql_row_exist(conn, table, column, value): ...@@ -408,11 +416,10 @@ def sql_row_exist(conn, table, column, value):
return exists return exists
def map(phen_dir, target_code_type, translate=True, verify=True): def map(phen_dir, target_code_type, translate=True):
logger.info(f"Processing phenotype directory: {phen_dir}") logger.info(f"Processing phenotype directory: {phen_dir}")
logger.debug(f"Target coding format: {target_code_type}") logger.debug(f"Target coding format: {target_code_type}")
logger.debug(f"Translating: {translate}") logger.debug(f"Translating: {translate}")
logger.debug(f"Verifying: {verify}")
# Validate configuration # Validate configuration
validate(phen_dir) validate(phen_dir)
...@@ -470,7 +477,6 @@ def map(phen_dir, target_code_type, translate=True, verify=True): ...@@ -470,7 +477,6 @@ def map(phen_dir, target_code_type, translate=True, verify=True):
meta_columns=meta_columns, meta_columns=meta_columns,
codes_file=str(codes_file_path.resolve()), codes_file=str(codes_file_path.resolve()),
target_code_type=target_code_type, target_code_type=target_code_type,
verify=verify,
translate=translate) translate=translate)
else: else:
raise Exception("No column format provided") raise Exception("No column format provided")
......
...@@ -76,7 +76,7 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog): ...@@ -76,7 +76,7 @@ def test_phen_workflow(tmp_dir, monkeypatch, caplog):
# map phenotype # map phenotype
with caplog.at_level(logging.DEBUG): with caplog.at_level(logging.DEBUG):
monkeypatch.setattr(sys, "argv", ["main.py", "phen", "map", "-d", str(phen_path.resolve()), "-t", "read2", "-tr", "-ve"]) monkeypatch.setattr(sys, "argv", ["main.py", "phen", "map", "-d", str(phen_path.resolve()), "-t", "read3", "-tr"])
main.main() main.main()
assert "Phenotype processed successfully" in caplog.text assert "Phenotype processed successfully" in caplog.text
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment