diff --git a/acmc.py b/acmc.py
index 528eaed42346247deca26a607de25d23d2934a1d..94b449e7181476b05f769f252eb3c12e9596be97 100644
--- a/acmc.py
+++ b/acmc.py
@@ -8,8 +8,8 @@ import omop
 import phen

 # setup logging
-from logging_config import setup_logger
-logger = setup_logger()
+import logging_config
+logger = logging_config.setup_logger()

 BUILD_PATH = Path('build')

@@ -66,8 +66,9 @@ def phen_diff(args):
               args.phen_dir_old)

 def main():
-    logger.info("ACMC Tool")
+    logger.info("ACMC Tool")
     parser = argparse.ArgumentParser(description="ACMC command-line tool")
+    parser.add_argument("--debug", action="store_true", help="Enable debug mode")

     # Top-level commands
     subparsers = parser.add_subparsers(dest="command", required=True, help="Available commands")
@@ -143,6 +144,10 @@ def main():
     # Parse arguments
     args = parser.parse_args()

+    # setup logging
+    if args.debug:
+        logging_config.set_log_level(logging.DEBUG)
+
     # Call the function associated with the command
     args.func(args)

diff --git a/logging_config.py b/logging_config.py
index d6bd75243bffe5ba9647b7282c5d56275e74a71a..b099ce458a5140d619d5b64cb1f3e7c2db4a1a98 100644
--- a/logging_config.py
+++ b/logging_config.py
@@ -2,19 +2,19 @@ import logging

 DEFAULT_LOG_FILE = "acmc.log"

-def setup_logger():
+def setup_logger(log_level=logging.INFO):
     # Create a logger
     logger = logging.getLogger('acmc_logger')
-    logger.setLevel(logging.DEBUG)
+    logger.setLevel(log_level)

     if not logger.hasHandlers():
         #Create a file handler that logs to a file
         file_handler = logging.FileHandler(DEFAULT_LOG_FILE)
-        file_handler.setLevel(logging.DEBUG)
+        file_handler.setLevel(log_level)

         # Create a stream handler that prints to the console
         stream_handler = logging.StreamHandler()
-        stream_handler.setLevel(logging.DEBUG)
+        stream_handler.setLevel(log_level)

         # Create a formatter for how the log messages should look
         formatter = logging.Formatter('%(asctime)s - - %(levelname)s - %(message)s')
@@ -28,3 +28,11 @@ def setup_logger():
         logger.addHandler(stream_handler)

     return logger
+
+def set_log_level(log_level):
+    logger = logging.getLogger('acmc_logger')
+    logger.setLevel(log_level)  # Set logger level
+
+    # Also update handlers to match the new level
+    for handler in logger.handlers:
+        handler.setLevel(log_level)
\ No newline at end of file
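A note on the wiring above, kept as a hedged sketch rather than a change: the `--debug` flag assumes `logging` itself is already imported at the top of acmc.py (only `import logging_config` is visible in this hunk), and `set_log_level` only affects handlers that exist, so it must run after `setup_logger()` has attached them.

    import logging
    import logging_config

    logger = logging_config.setup_logger()            # INFO by default; attaches handlers once

    def main():
        args = parser.parse_args()                    # parser built as in the hunk above
        if args.debug:
            logging_config.set_log_level(logging.DEBUG)   # logger and both handlers now emit DEBUG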
diff --git a/omop.py b/omop.py
index ca1319df3f4e0218fb5810b43c38b1013046dd84..3ad79f2005e879db86d6c9a3683ee43a36ea983c 100644
--- a/omop.py
+++ b/omop.py
@@ -3,8 +3,13 @@ import argparse
 import sqlite3
 import pandas as pd
 import json
+import logging
 from pathlib import Path

+# setup logging
+from logging_config import setup_logger
+logger = setup_logger()
+
 OMOP_DB_DIR = Path('./build/omop')
 OMOP_DB_PATH = OMOP_DB_DIR / 'omop_54.sqlite'
 VERSION_FILE = 'omop_version.json'
@@ -33,7 +38,7 @@ vocabularies = {
 #Populate SQLite3 Database with default OMOP CONCEPTS
 def install (omop_install_folder, version, db_path=OMOP_DB_PATH):
     """Installs the OMOP release csv files in a file-based sql database"""
-    print(f"Installing OMOP database from {omop_install_folder}")
+    logger.info(f"Installing OMOP database from {omop_install_folder}")

     # check folder for omop install files is a directory
     omop_install_path = Path(omop_install_folder)
@@ -43,7 +48,7 @@ def install (omop_install_folder, version, db_path=OMOP_DB_PATH):
     # check codes directory exists and if not create it
     if not OMOP_DB_DIR.exists():
         OMOP_DB_DIR.mkdir(parents=True)
-        print(f"OMOP directory '{OMOP_DB_DIR}' created.")
+        logger.debug(f"OMOP directory '{OMOP_DB_DIR}' created.")

     # connect to database, if it does not exist it will be created
     conn = sqlite3.connect(OMOP_DB_PATH)
@@ -52,7 +57,7 @@ def install (omop_install_folder, version, db_path=OMOP_DB_PATH):
         if filename.endswith(".csv"):  # Check if the file is a CSV
             file_path = os.path.join(omop_install_folder, filename)
             try:
-                print(f"Reading file: {file_path}")
+                logger.debug(f"Reading file: {file_path}")
                 # read the CSV file with the specified delimiter
                 df = pd.read_csv(file_path, delimiter="\t", low_memory=False)
                 table_name = os.path.splitext(os.path.basename(file_path))[0] #Get name of file
@@ -69,7 +74,7 @@ def install (omop_install_folder, version, db_path=OMOP_DB_PATH):
     # write version file
     write_version_file(version)

-    print(f"OMOP installation completed")
+    logger.info(f"OMOP installation completed")

 def write_version_file(version):
     """Writes the OMOP vocaburaries and version to a file"""
@@ -79,7 +84,7 @@ def write_version_file(version):

 def clear(db_path):
     """Clears the OMOP sql database"""
-    print(f"Clearing OMOP data from database")
+    logger.info(f"Clearing OMOP data from database")
     omop_db_path = Path(db_path)
     if not omop_db_path.is_file():
         raise FileNotFoundError(f"Error: OMOP DB file '{omop_db_path}' does not exist.")
@@ -89,23 +94,23 @@ def clear(db_path):

     # Fetch and print table names
     tables = cur.fetchall()
-    print("Tables in database:", [table[0] for table in tables])
+    logger.debug(f"Tables in database: {[table[0] for table in tables]}")

     #cur.execute("DROP TABLE CONCEPT_SET;")
     #cur.execute("DROP TABLE CONCEPT_SET_ITEM;")

     conn.close()
-    print(f"OMOP database cleared")
+    logger.info(f"OMOP database cleared")

 def delete(db_path):
     """Deletes the OMOP sql database"""
-    print(f"Deleting OMOP database")
+    logger.info(f"Deleting OMOP database")
     omop_db_path = Path(db_path)
     if not omop_db_path.is_file():
         raise FileNotFoundError(f"Error: OMOP DB file '{omop_db_path}' does not exist.")

     omop_db_path.unlink()
-    print(f"OMOP database deleted")
+    logger.info(f"OMOP database deleted")

 def table_exists(cursor, table_name):
     # Query to check if the table exists
@@ -194,7 +199,7 @@ def publish_concept_sets(out, db_path, vocab_output, vocab_type, output_version)
         if not sql_row_exist(conn, "CONCEPT_SET", "concept_set_name", concept_set_name):
             cur.execute(f"INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES ('{concept_set_name}', 'MELDB');")
         else:
-            print("concept_set", concept_set_name, "already exists")
+            logger.debug(f"concept_set {concept_set_name} already exists")
             #TODO: ask to remove old concept_set?

         #Get Concept_set_Id
@@ -209,7 +214,7 @@ def publish_concept_sets(out, db_path, vocab_output, vocab_type, output_version)
         df_out = pd.DataFrame(cur.fetchall(), columns=["concept_id"])

         if not len(grp) == len(df_out):
-            print("ERROR: Some", vocab_type, "Codes do not exist in OMOP Database")
+            logger.error(f"Some {vocab_type} Codes do not exist in OMOP Database")

         #Create Concept_set_item
         df_out["concept_set_id"] = concept_set_id
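A possible follow-up for publish_concept_sets, shown only as a sketch and not part of this diff: the INSERT above interpolates concept_set_name directly into the SQL string, while sqlite3 also supports parameter substitution:

    cur.execute(
        "INSERT INTO CONCEPT_SET (concept_set_name, vocabulary_id) VALUES (?, 'MELDB');",
        (concept_set_name,),
    )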
diff --git a/parse.py b/parse.py
index 85a0c948e84412ae62bf85be365b34804ee0ce85..ed68066355c838718fd5c61a774c932890b48e71 100644
--- a/parse.py
+++ b/parse.py
@@ -3,6 +3,10 @@ import numpy as np
 import os
 import trud

+# setup logging
+import logging_config
+logger = logging_config.setup_logger()
+
 from base import log_invalid_code
 from base import bcolors
 from base import raise_
@@ -36,14 +40,14 @@ class Proto():
         for msg, cond, process in self.checks:  #run each check
             if not cond(codes).all():  #if test failed
                 # print("Check:", msg, bcolors.FAIL+"FAILED"+bcolors.ENDC)
-                print("Check: ", msg, bcolors.FAIL+f"{(~cond(codes)).sum()} FAILED"+bcolors.ENDC,)
+                logger.error(f"Check: {msg} {(~cond(codes)).sum()} FAILED")
                 codes = process(codes)  #run process to fix issue
                 if cond(codes).all():  #is resloved by process
-                    print("Check:", msg, "is resolved")
+                    logger.debug(f"Check: {msg} is resolved")
                 else:  #not resolved by process
                     raise Exception(f"ERROR: Check {msg} is NOT resolved")
             else:
-                print("Check:", msg, bcolors.OKGREEN+"PASSED"+bcolors.ENDC)
+                logger.debug(f"Check: {msg} PASSED")
         return codes

     def verify(self, codes):
@@ -55,14 +59,14 @@ class Proto():
             out = cond(codes)
             conds = np.append(conds, out.all())
             if not out.all():  #if test failed
-                print("Verify:", msg, bcolors.FAIL+"FAILED"+bcolors.ENDC)
-                print(codes[out])  #show failed codes
+                logger.error(f"Verify: {msg} FAILED")
+                logger.error(codes[out])  #show failed codes

         if conds.all():  #check all have passed
-            print(f"Verify: {bcolors.OKGREEN}ALL PASSED{bcolors.ENDC}")
+            logger.debug(f"Verify: ALL PASSED")
             return True
         else:  #not all have passed
-            print("Verify: ", bcolors.FAIL, (len(conds) - conds.sum()), " FAILED", bcolors.ENDC)
+            logger.error(f"Verify: {(len(conds) - conds.sum())} FAILED")
            return False

 class Read2(Proto):
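As an aside on the converted calls in Proto.process and Proto.verify (a sketch, not part of this diff): the logging API also accepts deferred %-style arguments, which skips building the message string entirely when the record is filtered out by level:

    logger.debug("Check: %s is resolved", msg)    # formatted only if DEBUG records are emitted
    logger.error("Verify: %s FAILED", msg)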
diff --git a/phen.py b/phen.py
index f8a436d59eb21b4aa2bc010f219e92ef8dc37a65..e13ad4636090d1de42577a564cbd2f4b5fabcb0b 100644
--- a/phen.py
+++ b/phen.py
@@ -82,7 +82,7 @@ def create_empty_git_dir(path):

 def init(phen_dir, remote_url):
     """Initial phenotype directory as git repo with standard structure"""
-    print(f"Initialising Phenotype in directory: {phen_dir}")
+    logger.info(f"Initialising Phenotype in directory: {phen_dir}")
     phen_path = Path(phen_dir)

     # check if directory already exists and ask user if they want to recreate it
@@ -93,12 +93,12 @@ def init(phen_dir, remote_url):
             shutil.rmtree(phen_path)
             configure = True;
         else:
-            print("Phen directory was not recreated.")
+            logger.info("Phen directory was not recreated.")
     else:
         configure=True

     if not configure:
-        print(f"Exiting, phenotype not initiatised")
+        logger.info(f"Exiting, phenotype not initialised")
         return

     # Initialise repo from local or remote
@@ -116,16 +116,16 @@ def init(phen_dir, remote_url):
         # check if there are any commits (new repo has no commits)
         if len(repo.branches) == 0 or repo.head.is_detached:  # Handle detached HEAD (e.g., after init)
-            print("The phen repository has no commits yet.")
+            logger.debug("The phen repository has no commits yet.")
             commit_count = 0
         else:
             # Get the total number of commits in the default branch
             commit_count = sum(1 for _ in repo.iter_commits())
-            print(f"Repo has previous commits: {commit_count}")
+            logger.debug(f"Repo has previous commits: {commit_count}")
     else:
         # local repo, create the directories and init
         phen_path.mkdir(parents=True, exist_ok=True)
-        print(f"Phen directory '{phen_path}' has been created.")
+        logger.debug(f"Phen directory '{phen_path}' has been created.")
         repo = git.Repo.init(phen_path)
         commit_count = 0

@@ -150,10 +150,10 @@ def init(phen_dir, remote_url):
     # if the phen path does not contain the config file then initialise the phen type
     config_path = phen_path / CONFIG_FILE
     if config_path.exists():
-        print(f"Phenotype configuration files already exist")
+        logger.debug(f"Phenotype configuration files already exist")
         return

-    print("Creating phen directory structure and config files")
+    logger.info("Creating phen directory structure and config files")
     for d in DEFAULT_PHEN_DIR_LIST:
         create_empty_git_dir(phen_path / d)

@@ -187,11 +187,11 @@ def init(phen_dir, remote_url):
     repo.git.add(all=True)
     repo.index.commit("initialised the phen git repo.")

-    print(f"Phenotype initialised successfully")
+    logger.info(f"Phenotype initialised successfully")

 def validate(phen_dir):
     """Validates the phenotype directory is a git repo with standard structure"""
-    print(f"Validating phenotype configuration {phen_dir}")
+    logger.info(f"Validating phenotype configuration {phen_dir}")
     phen_path = Path(phen_dir)
     if not phen_path.is_dir():
         raise NotADirectoryError(f"Error: '{phen_path}' is not a directory")
@@ -280,11 +280,11 @@ def validate(phen_dir):
             validation_errors.append(f"Concept sets mapped in codes do not exist in the concept sets: {codes_no_concept_set}")

     if len(validation_errors) > 0:
-        print(validation_errors)
+        logger.error(validation_errors)
         raise PhenValidationException(f"Configuration file {str(config_path.resolve())} failed validation",
                                       validation_errors)

-    print(f"Phenotype validated successfully")
+    logger.info(f"Phenotype validated successfully")

 def read_table_file(path, excel_sheet=None):
     """
@@ -310,7 +310,7 @@ def preprocess_code(out, codes, checker, output_col, df_meta, verify=True):
     if verify:
         codes = checker.process(codes)  # resolve any identified issues
         if not checker.verify(codes):  # verify all issues resolved
-            print("ERROR: FAILED")
+            logger.error("ERROR: FAILED")
     # add metadata columns
     out = pd.concat(
         [out, pd.DataFrame({output_col: codes}).join(df_meta)], ignore_index=True
@@ -325,7 +325,7 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], file_path=No
     if target_code_type and not translate:
         # QA only on target codes
         if target_code_type in columns:
-            print(f"Processing {target_code_type} Codes...")
+            logger.info(f"Processing {target_code_type} Codes...")
            out = preprocess_code(out=out,
                                  codes=df[columns[target_code_type]].dropna(),
                                  checker=code_types[target_code_type](file_path),
@@ -333,12 +333,12 @@ def preprocess(df, columns, target_code_type=None, meta_columns=[], file_path=No
                                  df_meta=df[meta_columns],
                                  verify=verify,)
         else:
-            print(f"No {target_code_type} Codes to process")
+            logger.warning(f"No {target_code_type} Codes to process")
     else:
         # QA for every code type in df run preprocess_code()
         for k, v in code_types.items():
             if k in columns:
-                print(f"Processing {k} Codes...")
+                logger.info(f"Processing {k} Codes...")
                out = preprocess_code(out=out,
                                      codes=df[columns[k]].dropna(),
                                      checker=v(file_path),
@@ -354,14 +354,14 @@ def convert_codes(df, target, translate):

     # Append target column (if exists) - doesn't need conversion
     if target in df.columns:
-        print("Has", len(df), target, "in file")
+        logger.debug(f"Has {len(df)} {target} in file")
         codes = pd.concat([codes, df[target]])
     # else:
-    #     print("No",target,"in file")
+    #     logger.debug("No",target,"in file")

     if translate:
         # Convert codes to target type
-        print(f"target type {target}")
+        logger.info(f"target type {target}")
         for col_name in df.columns[df.columns != target]:
             filename = f"{col_name}_to_{target}.parquet"
             map_path = trud.TRUD_PROCESSED_DIR / filename
@@ -376,9 +376,9 @@ def convert_codes(df, target, translate):
                 #         cause=f"Translation to {target}")  #log codes with no translation
                 codes = pd.concat([codes, translated])  # merge to output
             else:
-                print(f"No mapping from {col_name} to {target}, file {str(map_path.resolve())} does not exist")
+                logger.warning(f"No mapping from {col_name} to {target}, file {str(map_path.resolve())} does not exist")
     else:
-        print(f"NOT TRANSLATING {col_name}")
+        logger.warning(f"NOT TRANSLATING {col_name}")

     return codes

@@ -409,10 +409,10 @@ def sql_row_exist(conn, table, column, value):
     return exists

 def map(phen_dir, target_code_type, translate=True, verify=True):
-    print(f"Processing phenotype directory: {phen_dir}")
-    print(f"Target coding format: {target_code_type}")
-    print(f"Translating: {translate}")
-    print(f"Verifying: {verify}")
+    logger.info(f"Processing phenotype directory: {phen_dir}")
+    logger.debug(f"Target coding format: {target_code_type}")
+    logger.debug(f"Translating: {translate}")
+    logger.debug(f"Verifying: {verify}")

     # Validate configuration
     validate(phen_dir)
@@ -432,10 +432,10 @@ def map(phen_dir, target_code_type, translate=True, verify=True):

     # Process each folder in codes section
     for folder in codes:
-        print(bcolors.HEADER, folder["description"], bcolors.ENDC)
+        logger.debug(folder["description"])
         if "files" in folder:
             for file in folder["files"]:
-                print("---" * 5, file["file"], "---" * 5)
+                logger.debug(f"{'---' * 5} {file['file']} {'---' * 5}")
                 file_path = codes_path / folder["folder"] / file["file"]

                 # Load Code File
logger.debug("Category:", cat) out = map_file(grp, target_code_type, out, @@ -503,7 +503,7 @@ def map(phen_dir, target_code_type, translate=True, verify=True): meta_columns=meta_columns,) else: - print("Folder is empty") + logger.warning("Folder is empty") # test if there's any output from processing if len(out) <= 0: @@ -524,8 +524,6 @@ def map(phen_dir, target_code_type, translate=True, verify=True): out = out.merge(concept_sets_df, how="left", on="CONCEPT_SET") # merge with output # Save output to map directory - print(bcolors.HEADER, "---" * 5, "OUTPUT", "---" * 5, bcolors.ENDC) - if translate: output_filename = target_code_type + '.csv' else: @@ -534,7 +532,7 @@ def map(phen_dir, target_code_type, translate=True, verify=True): map_path = phen_path / MAP_DIR / output_filename out.to_csv(map_path, index=False) - print(f"Saved mapped concepts to {str(map_path.resolve())}") + logger.info(f"Saved mapped concepts to {str(map_path.resolve())}") # save concept sets as separate files concept_set_path = phen_path / CONCEPT_SET_DIR / target_code_type @@ -569,9 +567,9 @@ def map(phen_dir, target_code_type, translate=True, verify=True): error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"]) error_df.to_csv(error_path, index=False) - print(f"Saved concept_sets to {str(concept_set_path.resolve())}") + logger.debug(f"Saved concept_sets to {str(concept_set_path.resolve())}") - print(f"Phenotype processed successfully") + logger.info(f"Phenotype processed successfully") def publish(phen_dir): """Publishes updates to the phenotype by commiting all changes to the repo directory""" @@ -590,7 +588,7 @@ def publish(phen_dir): # check if any changes to publish if not repo.is_dirty() and not repo.untracked_files: - print("Nothing to publish, no changes to the repo") + logger.info("Nothing to publish, no changes to the repo") return # get major version from configuration file @@ -605,7 +603,7 @@ def publish(phen_dir): # set version and write to config file so consistent with repo version next_minor_version = commit_count + 1 version = f"v{major_version}.{next_minor_version}" - print(f"New version: {version}") + logger.info(f"New version: {version}") config['concept_sets']['version'] = version with open(config_path, "w", encoding="utf-8") as f: json.dump(config, f, indent=4) @@ -614,24 +612,23 @@ def publish(phen_dir): commit_message = f"Committing updates to phenotype {phen_path}" repo.git.add('--all') repo.index.commit(commit_message) - print(commit_message) # Create and push the tag if version in repo.tags: raise Exception (f"Tag {version} already exists in repo {phen_path}") repo.create_tag(version, message=f"Release {version}") - print(f"Created tag: {version}") + logger.info(f"Created tag: {version}") # push to origin if a remote repo try: origin = repo.remotes.origin origin.push('main') origin.push(tags=True) - print("Changes pushed to 'origin'.") + logger.debug("Changes pushed to 'origin'.") except AttributeError: - print("No remote named 'origin' found, local repo.") + logger.debug("No remote named 'origin' found, local repo.") - print(f"Phenotype published successfully") + logger.info(f"Phenotype published successfully") def copy(phen_dir, target_dir, version=None): """Copys a phen repo at a specific tagged version into a target directory""" @@ -651,29 +648,29 @@ def copy(phen_dir, target_dir, version=None): else: copy_path = target_path / 'latest' - print(f"Copying repo {phen_path} to {copy_path}") + logger.info(f"Copying repo {phen_path} to {copy_path}") if not copy_path.exists(): # 
@@ -651,29 +648,29 @@ def copy(phen_dir, target_dir, version=None):
     else:
         copy_path = target_path / 'latest'

-    print(f"Copying repo {phen_path} to {copy_path}")
+    logger.info(f"Copying repo {phen_path} to {copy_path}")

     if not copy_path.exists():
         # If copy directory doesn't exist, clone the repo
-        print(f"Cloning repo from {phen_path} into {copy_path}...")
+        logger.debug(f"Cloning repo from {phen_path} into {copy_path}...")
         repo = git.Repo.clone_from(phen_path, copy_path)
     else:
         # If copy directory exists, open the repo
-        print(f"Copy of repository already exists in {copy_path}. Opening the repo...")
+        logger.debug(f"Copy of repository already exists in {copy_path}. Opening the repo...")
         repo = git.Repo(copy_path)

     # Check out the latest commit or specified version
     if version:
         # Checkout a specific version (e.g., branch, tag, or commit hash)
-        print(f"Checking out version {version}...")
+        logger.info(f"Checking out version {version}...")
         repo.git.checkout(version)
     else:
         # Checkout the latest commit (HEAD)
-        print(f"Checking out the latest commit...")
+        logger.info(f"Checking out the latest commit...")
         repo.git.checkout("HEAD")

-    print(f"Copied {phen_path} {repo.head.commit.hexsha[:7]} in {copy_path}")
+    logger.debug(f"Copied {phen_path} {repo.head.commit.hexsha[:7]} in {copy_path}")

-    print(f"Phenotype copied successfully")
+    logger.info(f"Phenotype copied successfully")

 def diff(phen_dir, phen_old_dir):
     """Compare the differences between two versions of a phenotype"""
@@ -689,7 +686,7 @@ def diff(phen_dir, phen_old_dir):
     report_path = new_phen_path / REPORT_FILE
     if report_path.suffix == ".md":
         report = open(report_path, 'w')
-        print(f"Writing to report file {str(report_path.resolve())}")
+        logger.debug(f"Writing to report file {str(report_path.resolve())}")
     else:
         raise ValueError(f"Unsupported filetype provided for report file {str(report_path.resolve())}")

@@ -746,8 +743,7 @@ def diff(phen_dir, phen_old_dir):
         else:
             report.write(f"- Changed concepts []\n\n")

-
-    print(f"Phenotypes diff'd successfully")
+    logger.info(f"Phenotypes diff'd successfully")

     # Here's the atlas code that needs to go into anotehr function
     # if output_path == "atlas":
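Two hedged observations on the phen.py hunks above, neither of which is changed by this diff: they assume a module-level `logger` already exists in phen.py (its definition sits outside the hunks shown), and `logger.error(validation_errors)` in validate() emits the whole list as a single record. If one record per error is preferred, a minimal sketch:

    for error in validation_errors:
        logger.error(error)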
"yes" to the question about reinitialising the directory + monkeypatch.setattr("builtins.input", lambda _: "y") + acmc.main() + assert "Phenotype initialised successfully" in caplog.text -def test_phen_workflow(tmp_dir, monkeypatch, capsys): - phen_path = tmp_dir / "phen" - phen_path = phen_path.resolve() - monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "init", "-d", str(phen_path.resolve())]) - # Mock input() to return "yes" to the question about reinitialising the directory - monkeypatch.setattr("builtins.input", lambda _: "y") - acmc.main() - captured = capsys.readouterr() - assert "Phenotype initialised successfully" in captured.out +def test_phen_init_local_specified(tmp_dir, monkeypatch, caplog): + with caplog.at_level(logging.DEBUG): + phen_path = tmp_dir / "phen" + monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "init", "-d", str(phen_path.resolve())]) + # Mock input() to return "yes" to the question about reinitialising the directory + monkeypatch.setattr("builtins.input", lambda _: "y") + acmc.main() + assert "Phenotype initialised successfully" in caplog.text - # copy examples across - shutil.rmtree(phen_path / 'codes') - ex_path = Path('./examples').resolve() - for item in ex_path.iterdir(): - source = ex_path / item.name - destination = phen_path / item.name - if source.is_dir(): - shutil.copytree(source, destination) - else: - shutil.copy2(source, destination) - shutil.copy( phen_path / 'config1.json', phen_path / 'config.json') +def test_phen_workflow(tmp_dir, monkeypatch, caplog): + with caplog.at_level(logging.DEBUG): + phen_path = tmp_dir / "phen" + phen_path = phen_path.resolve() + monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "init", "-d", str(phen_path.resolve())]) + # Mock input() to return "yes" to the question about reinitialising the directory + monkeypatch.setattr("builtins.input", lambda _: "y") + acmc.main() + assert "Phenotype initialised successfully" in caplog.text - # validate phenotype - monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "validate", "-d", str(phen_path.resolve())]) - acmc.main() - captured = capsys.readouterr() - assert "Phenotype validated successfully" in captured.out + with caplog.at_level(logging.DEBUG): + # validate phenotype + # copy examples across + shutil.rmtree(phen_path / 'codes') + ex_path = Path('./examples').resolve() + for item in ex_path.iterdir(): + source = ex_path / item.name + destination = phen_path / item.name + if source.is_dir(): + shutil.copytree(source, destination) + else: + shutil.copy(source, destination) + shutil.copy( phen_path / 'config1.json', phen_path / 'config.json') + + monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "validate", "-d", str(phen_path.resolve())]) + acmc.main() + assert "Phenotype validated successfully" in caplog.text # map phenotype - monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "map", "-d", str(phen_path.resolve()), "-t", "read2", "-tr", "-ve"]) - acmc.main() - captured = capsys.readouterr() - assert "Phenotype processed successfully" in captured.out + with caplog.at_level(logging.DEBUG): + monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "map", "-d", str(phen_path.resolve()), "-t", "read2", "-tr", "-ve"]) + acmc.main() + assert "Phenotype processed successfully" in caplog.text # publish phenotype - monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "publish", "-d", str(phen_path.resolve())]) - acmc.main() - captured = capsys.readouterr() - assert "Phenotype published successfully" in captured.out + with caplog.at_level(logging.DEBUG): + 

-    # validate phenotype
-    monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "validate", "-d", str(phen_path.resolve())])
-    acmc.main()
-    captured = capsys.readouterr()
-    assert "Phenotype validated successfully" in captured.out
+    with caplog.at_level(logging.DEBUG):
+        # validate phenotype
+        # copy examples across
+        shutil.rmtree(phen_path / 'codes')
+        ex_path = Path('./examples').resolve()
+        for item in ex_path.iterdir():
+            source = ex_path / item.name
+            destination = phen_path / item.name
+            if source.is_dir():
+                shutil.copytree(source, destination)
+            else:
+                shutil.copy(source, destination)
+        shutil.copy( phen_path / 'config1.json', phen_path / 'config.json')
+
+        monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "validate", "-d", str(phen_path.resolve())])
+        acmc.main()
+        assert "Phenotype validated successfully" in caplog.text

     # map phenotype
-    monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "map", "-d", str(phen_path.resolve()), "-t", "read2", "-tr", "-ve"])
-    acmc.main()
-    captured = capsys.readouterr()
-    assert "Phenotype processed successfully" in captured.out
+    with caplog.at_level(logging.DEBUG):
+        monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "map", "-d", str(phen_path.resolve()), "-t", "read2", "-tr", "-ve"])
+        acmc.main()
+        assert "Phenotype processed successfully" in caplog.text

     # publish phenotype
-    monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "publish", "-d", str(phen_path.resolve())])
-    acmc.main()
-    captured = capsys.readouterr()
-    assert "Phenotype published successfully" in captured.out
+    with caplog.at_level(logging.DEBUG):
+        monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "publish", "-d", str(phen_path.resolve())])
+        acmc.main()
+        assert "Phenotype published successfully" in caplog.text

     # copy phenotype'
-    monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "copy", "-d", str(phen_path.resolve()), "-td", str(tmp_dir.resolve()), "-v", "v1.0.3"])
-    acmc.main()
-    captured = capsys.readouterr()
-    assert "Phenotype copied successfully" in captured.out
+    with caplog.at_level(logging.DEBUG):
+        monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "copy", "-d", str(phen_path.resolve()), "-td", str(tmp_dir.resolve()), "-v", "v1.0.3"])
+        acmc.main()
+        assert "Phenotype copied successfully" in caplog.text

     # diff phenotype
-    old_path = tmp_dir / "v1.0.3"
-    monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "diff", "-d", str(phen_path.resolve()), "-old", str(old_path.resolve())])
-    acmc.main()
-    captured = capsys.readouterr()
-    assert "Phenotypes diff'd successfully" in captured.out
+    with caplog.at_level(logging.DEBUG):
+        old_path = tmp_dir / "v1.0.3"
+        monkeypatch.setattr(sys, "argv", ["acmc.py", "phen", "diff", "-d", str(phen_path.resolve()), "-old", str(old_path.resolve())])
+        acmc.main()
+        assert "Phenotypes diff'd successfully" in caplog.text
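One note on the test changes (an observation, not a change in this diff): pytest's caplog fixture captures records through a handler on the root logger, so these assertions rely on 'acmc_logger' keeping its default propagate=True. caplog.at_level can also target the named logger directly if its own level ever needs raising:

    with caplog.at_level(logging.DEBUG, logger="acmc_logger"):
        acmc.main()
        assert "Phenotype initialised successfully" in caplog.text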
diff --git a/trud.py b/trud.py
index 78b01ba88be42e80ad175898b0b33be0cbea6eb8..74bd856ca82ddab845b2d692ed9ee48b0410668b 100644
--- a/trud.py
+++ b/trud.py
@@ -4,14 +4,17 @@ import requests
 import json
 import argparse
 import shutil
-from pathlib import Path
-
-from base import bcolors
-
 import hashlib
 import zipfile
 import pandas as pd
 import simpledbf
+from pathlib import Path
+
+# setup logging
+import logging_config
+logger = logging_config.setup_logger()
+
+from base import bcolors

 # Constants
 FQDN = "isd.digital.nhs.uk"
@@ -22,7 +25,7 @@ TRUD_DOWNLOADS_DIR = TRUD_PATH / 'downloads'
 TRUD_PROCESSED_DIR = TRUD_PATH / 'processed'

 def error_exit(message):
-    print(message, "error")
+    logger.error(message)
     sys.exit(1)

 def get_releases(item_id, API_KEY, latest=False):
@@ -56,7 +59,7 @@ def download_release_file(item_id, release_ordinal, release, file_json_prefix, f
     if not file_url or not file_name:
         error_exit(f"Missing {file_type} file information for release {release_ordinal} of item {item_id}.")

-    print(f"Downloading item {item_id} {file_type} file: {file_name} from {file_url} to {file_destination}")
+    logger.info(f"Downloading item {item_id} {file_type} file: {file_name} from {file_url} to {file_destination}")

     response = requests.get(file_url, stream=True)
     if response.status_code == 200:
@@ -69,9 +72,9 @@ def download_release_file(item_id, release_ordinal, release, file_json_prefix, f
 def validate_download_hash(file_destination:str, item_hash:str):
     with open(file_destination, "rb") as f:
         hash = hashlib.sha256(f.read()).hexdigest()
-    print(hash)
+    logger.debug(hash)
     if hash.upper() == item_hash.upper():
-        print(f"Verified hash of {file_destination} {hash}")
+        logger.debug(f"Verified hash of {file_destination} {hash}")
     else:
         error_exit(f"Could not validate origin of {file_destination}. The SHA-256 hash should be: {item_hash}, but got {hash} instead")
@@ -95,7 +98,7 @@ def extract_icd10():
     })
     output_path = TRUD_PROCESSED_DIR / 'icd10.parquet'
     df.to_parquet(output_path, index=False)
-    print(f"Extracted: {output_path}")
+    logger.info(f"Extracted: {output_path}")

 def extract_opsc4():
     file_path = TRUD_DOWNLOADS_DIR / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt'
@@ -105,7 +108,7 @@ def extract_opsc4():

     output_path = TRUD_PROCESSED_DIR / 'opcs4.parquet'
     df.to_parquet(output_path, index=False)
-    print(f"Extracted: {output_path}")
+    logger.info(f"Extracted: {output_path}")

 def extract_nhs_data_migrations():
     #NHS Data Migrations
@@ -120,7 +123,7 @@ def extract_nhs_data_migrations():

     output_path = TRUD_PROCESSED_DIR / 'snomed.parquet'
     df.to_parquet(output_path, index=False)
-    print(f"Extracted: {output_path}")
+    logger.info(f"Extracted: {output_path}")

     #r2 -> r3
     file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt'
@@ -131,7 +134,7 @@ def extract_nhs_data_migrations():

     output_path = TRUD_PROCESSED_DIR / 'read2_to_read3.parquet'
     df.to_parquet(output_path, index=False)
-    print(f"Extracted: {output_path}")
+    logger.info(f"Extracted: {output_path}")

     #r3->r2
     file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt'
@@ -144,7 +147,7 @@ def extract_nhs_data_migrations():

     output_path = TRUD_PROCESSED_DIR / 'read3_to_read2.parquet'
     df.to_parquet(output_path, index=False)
-    print(f"Extracted: {output_path}")
+    logger.info(f"Extracted: {output_path}")

     #r2 -> snomed
     file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt'
@@ -155,7 +158,7 @@ def extract_nhs_data_migrations():

     output_path = TRUD_PROCESSED_DIR / 'read2_to_snomed.parquet'
     df.to_parquet(output_path, index=False)
-    print(f"Extracted: {output_path}")
+    logger.info(f"Extracted: {output_path}")

     #r3->snomed
     file_path = TRUD_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt'
@@ -168,7 +171,7 @@ def extract_nhs_data_migrations():

     output_path = TRUD_PROCESSED_DIR / 'read3_to_snomed.parquet'
     df.to_parquet(output_path, index=False)
-    print(f"Extracted: {output_path}")
+    logger.info(f"Extracted: {output_path}")

 def extract_nhs_read_browser():
     #r2 only
@@ -179,7 +182,7 @@ def extract_nhs_read_browser():
     df = df.rename(columns={0:"read2"})
     output_path = TRUD_PROCESSED_DIR / 'read2.parquet'
     df.to_parquet(output_path, index=False)
-    print(f"Extracted: {output_path}")
+    logger.info(f"Extracted: {output_path}")

     #r2 -> atc
     input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ATC.DBF'
@@ -188,7 +191,7 @@ def extract_nhs_read_browser():
     df = df.rename(columns={"READCODE":"read2", "ATC":"atc"})
     output_path = TRUD_PROCESSED_DIR / 'read2_to_atc.parquet'
     df.to_parquet(output_path, index=False)
-    print(f"Extracted: {output_path}")
+    logger.info(f"Extracted: {output_path}")

     #r2 -> icd10
     input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ICD10.DBF'
@@ -199,7 +202,7 @@ def extract_nhs_read_browser():
     df = df[~df["read2"].str.match("^.*-.*$")] #remove codes with '-'
     output_path = TRUD_PROCESSED_DIR / 'read2_to_icd10.parquet'
     df.to_parquet(output_path, index=False)
-    print(f"Extracted: {output_path}")
+    logger.info(f"Extracted: {output_path}")

     #r2 -> opcs4
     input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V2' / 'OPCS4V3.DBF'
df[~df["read2"].str.match("^.*-.*$")] #remove codes with '-' output_path = TRUD_PROCESSED_DIR / 'read2_to_opcs4.parquet' df.to_parquet(output_path, index=False) - print(f"Extracted: {output_path}") + logger.info(f"Extracted: {output_path}") #r3 only input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ANCESTOR.DBF' @@ -220,7 +223,7 @@ def extract_nhs_read_browser(): df = df.rename(columns={0:"read3"}) output_path = TRUD_PROCESSED_DIR / 'read3.parquet' df.to_parquet(output_path, index=False) - print(f"Extracted: {output_path}") + logger.info(f"Extracted: {output_path}") #r3 -> icd10 input_path = TRUD_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ICD10.DBF' @@ -231,7 +234,7 @@ def extract_nhs_read_browser(): df = df[~df["read3"].str.match("^.*-.*$")] #remove codes with '-' output_path = TRUD_PROCESSED_DIR / 'read3_to_icd10.parquet' df.to_parquet(output_path, index=False) - print(f"Extracted: {output_path}") + logger.info(f"Extracted: {output_path}") #r3 -> icd9 # dbf = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ICD9V3.DBF') @@ -245,7 +248,7 @@ def extract_nhs_read_browser(): df = df[~df["read3"].str.match("^.*-.*$")] #remove codes with '-' output_path = TRUD_PROCESSED_DIR / 'read3_to_opcs4.parquet' df.to_parquet(output_path, index=False) - print(f"Extracted: {output_path}") + logger.info(f"Extracted: {output_path}") def create_map_directories(): """Create map directories.""" @@ -259,7 +262,7 @@ def create_map_directories(): shutil.rmtree(TRUD_PATH) create_map_dirs = True elif user_input == "n": - print("Exiting TRUD installation") + logger.info("Exiting TRUD installation") sys.exit(0) else: create_map_dirs = True @@ -271,7 +274,7 @@ def create_map_directories(): TRUD_PROCESSED_DIR.mkdir(parents=True,exist_ok=True) def install(): - print(f"Installing TRUD") + logger.info(f"Installing TRUD") # get TRUD api key from environment variable api_key = os.getenv("ACMC_TRUD_API_KEY") @@ -318,7 +321,7 @@ def install(): # Validate and process each item ID for item in items: item_id = item["id"] - print(bcolors.HEADER, "---"+item["name"]+"---", bcolors.ENDC) + logger.info(f"---{item["name"]}---") releases = get_releases(item_id, API_KEY=api_key, latest=items_latest) if not releases: @@ -348,6 +351,6 @@ def install(): if "extract" in item: item["extract"]() - print(f"Downloaded {release_ordinal} release(s) for item {item_id}.") + logger.info(f"Downloaded {release_ordinal} release(s) for item {item_id}.") - print(f"TRUD installation completed") \ No newline at end of file + logger.info(f"TRUD installation completed") \ No newline at end of file