From 8bda4c7c3bf02473357e01c964512e1acfa7d63a Mon Sep 17 00:00:00 2001 From: Michael Boniface <m.j.boniface@soton.ac.uk> Date: Tue, 25 Feb 2025 13:54:34 +0000 Subject: [PATCH] fix: added schema validation using Cerberus to the validation in phen. Closes #16 --- acmc/phen.py | 131 +++++++++++++++++++++++++++++++++++-------------- pyproject.toml | 1 + 2 files changed, 95 insertions(+), 37 deletions(-) diff --git a/acmc/phen.py b/acmc/phen.py index 8669668..1ae243d 100644 --- a/acmc/phen.py +++ b/acmc/phen.py @@ -11,6 +11,7 @@ import re import logging import requests import yaml +from cerberus import Validator from pathlib import Path from urllib.parse import urlparse, urlunparse @@ -44,6 +45,59 @@ COL_ACTIONS = [SPLIT_COL_ACTION, CODES_COL_ACTION, DIVIDE_COL_ACTION] CODE_FILE_TYPES = [".xlsx", ".xls", ".csv"] +# config.yaml schema +CONFIG_SCHEMA = { + "phenotype": { + "type": "dict", + "required": True, + "schema": { + "version": { + "type": "string", + "required": True, + "regex": r"^v\d+\.\d+\.\d+$" # Enforces 'vN.N.N' format + }, + "omop": { + "type": "dict", + "required": True, + "schema": { + "vocabulary_id": {"type": "string", "required": True}, + "vocabulary_name": {"type": "string", "required": True}, + "vocabulary_reference": { + "type": "string", + "required": True, + "regex": r"^https?://.*" # Ensures it's a URL + }, + } + }, + "concept_sets": { + "type": "list", + "required": True, + "schema": { + "type": "dict", + "schema": { + "name": {"type": "string", "required": True}, + "file": { + "type": "dict", + "required": False, + "schema": { + "path": {"type": "string", "required": True}, + "columns": {"type": "dict", "required": True}, + "category": {"type": "string"}, # Optional but must be string if present + "actions": { + "type": "dict", + "schema": { + "divide_col": {"type": "string"} + }, + }, + }, + }, + "metadata": {"type": "dict", "required": True}, + }, + }, + }, + }, + } +} class PhenValidationException(Exception): """Custom exception class raised when validation errors in phenotype configuration file""" @@ -257,8 +311,21 @@ def validate(phen_dir): # Load configuration File if config_path.suffix == ".yaml": - with config_path.open("r") as file: - phenotype = yaml.safe_load(file) + try: + with config_path.open("r") as file: + phenotype = yaml.safe_load(file) + + validator = Validator(CONFIG_SCHEMA) + if validator.validate(phenotype): + logger.debug("YAML structure is valid.") + else: + logger.error(f"YAML structure validation failed: {validator.errors}") + raise Exception( + f"YAML structure validation failed: {validator.errors}" + ) + except yaml.YAMLError as e: + logger.error(f"YAML syntax error: {e}") + raise e else: raise Exception( f"Unsupported configuration filetype: {str(config_path.resolve())}" @@ -286,50 +353,40 @@ def validate(phen_dir): else: concept_set_names.append(item["name"]) - # TODO: change this to some sort of yaml schema validation - required_keys = {"name", "file", "metadata"} - # check codes definition for item in phenotype["concept_sets"]: + # check concepte code file exists + concept_code_file_path = codes_path / item["file"]["path"] + if not concept_code_file_path.exists(): + validation_errors.append( + f"Coding file {str(concept_code_file_path.resolve())} does not exist" + ) - if required_keys.issubset(item.keys()): + # check concepte code file is not empty + if concept_code_file_path.stat().st_size == 0: + validation_errors.append( + f"Coding file {str(concept_code_file_path.resolve())} is an empty file" + ) - # check concepte code file exists - concept_code_file_path = codes_path / item["file"]["path"] - if not concept_code_file_path.exists(): - validation_errors.append( - f"Coding file {str(concept_code_file_path.resolve())} does not exist" - ) + # check code file type is supported + if concept_code_file_path.suffix not in CODE_FILE_TYPES: + raise ValueError( + f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types" + ) - # check concepte code file is not empty - if concept_code_file_path.stat().st_size == 0: + # check columns specified are a supported medical coding type + for column in item["file"]["columns"]: + if column not in code_types: validation_errors.append( - f"Coding file {str(concept_code_file_path.resolve())} is an empty file" + f"Column type {column} for file {concept_code_file_path} is not supported" ) - # check code file type is supported - if concept_code_file_path.suffix not in CODE_FILE_TYPES: - raise ValueError( - f"Unsupported filetype {concept_code_file_path.suffix}, only support csv, xlsx, xls code file types" - ) + # check the actions are supported + if "actions" in item["file"]: + for action in item["file"]["actions"]: + if action not in COL_ACTIONS: + validation_errors.append(f"Action {action} is not supported") - # check columns specified are a supported medical coding type - for column in item["file"]["columns"]: - if column not in code_types: - validation_errors.append( - f"Column type {column} for file {concept_code_file_path} is not supported" - ) - - # check the actions are supported - if "actions" in item["file"]: - for action in item["file"]["actions"]: - if action not in COL_ACTIONS: - validation_errors.append(f"Action {action} is not supported") - - else: - validation_errors.append( - f"Missing required elements {required_keys} in concept set {item}" - ) if len(validation_errors) > 0: logger.error(validation_errors) diff --git a/pyproject.toml b/pyproject.toml index a690982..de414cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ requires-python = ">=3.9" dependencies = [ "aiosqlite", + "cerberus", "click", "cramjam", "et-xmlfile", -- GitLab