From 9c72a9b39dff32617502a03e7fbf6918b5941dfb Mon Sep 17 00:00:00 2001
From: Michael Boniface <m.j.boniface@soton.ac.uk>
Date: Sat, 15 Feb 2025 14:03:54 +0000
Subject: [PATCH] added acmc command line wrapper for the entire toolkit,
 moving parameter passing from trud, omop and main into acmc so that they are
 now just functions. Still work to do on parse.py, which includes command line
 arguments but does not need to, as this should all be in the arg parser
 configuration. Refactored trud and omop so that the paths are all constants,
 and also check that the required directories exist, as it was failing when
 the code directory was missing.

---
 README.md              | 251 +++++++++++++++---------------
 acmc.py                | 111 ++++++++++++++
 main.py                | 335 ----------------------------------------
 map.py                 | 341 +++++++++++++++++++++++++++++++++++++++++
 mjb-conda.yaml         |   3 +
 omop_api.py => omop.py |  85 +++++-----
 parse.py               |  47 ++++--
 trud.py                | 187 +++++++++++-----------
 8 files changed, 763 insertions(+), 597 deletions(-)
 create mode 100644 acmc.py
 delete mode 100644 main.py
 create mode 100644 map.py
 rename omop_api.py => omop.py (73%)

diff --git a/README.md b/README.md
index de767fc..a09ca11 100644
--- a/README.md
+++ b/README.md
@@ -26,11 +26,11 @@ This tool automates the verification, translation and organisation of medical co
 
 The high level steps to use the tools are outlined below:
 
-**1. Define concept sets:** A domain expert defines a list of [concept sets](#concept-set-assigment) for each observable characteristic of the phenotype using CSV file format (e.g., `PHEN_concept_sets.csv`).
+**1. Define concept sets:** A domain expert defines a list of [concept sets](#defining-concept-sets) for each observable characteristic of the phenotype using CSV file format (e.g., `PHEN_concept_sets.csv`).
 
-**2. Define code lists for concept sets:** A domain expert defines [code lists](#???) for each concept set within the phenotype using supported coding list formats and stores them in the `/src` directory.
+**2. Define concept code lists for concept sets:** A domain expert defines [code lists](#defining-concept-codes) for each concept set within the phenotype using supported coding list formats and stores them in the `/src` directory.
 
-**3. Define mapping from code lists to concept sets:** A domain expert defines a [phenotype mapping](#???) that maps code lists to concept sets in JSON file format (PHEN_assign_v3.json)
+**3. Define mapping from code lists to concept sets:** A domain expert defines a [phenotype mapping](#mapping-codes-to-concept-sets) that maps code lists to concept sets.
 
 **4. Generate versioned phenotype coding lists and translations:** A domain expert or data engineer processes the phenotype mappings [using the command line tool](#usage) to validate against NHS TRUD-registered codes and mapping and to generate versioned concept set code lists with translations between coding standards. 
 
@@ -54,132 +54,78 @@ The tool supports verification and mapping across diagnostic coding formats belo
 
 ## Installation
 
-1. **Setup Conda Enviroment:** Download and Install Python Enviroment. Follow insturctions to install minicoda from [https://docs.conda.io/en/latest/miniconda.html](https://docs.conda.io/en/latest/miniconda.html).
+**1. Setup Conda Environment**
 
- - Run the following command to recreate the environment: `conda env create -f conda.yaml`.
- - Activate the environment: `conda activate acmc`
+ACMC requires Python and its environment is maintained using conda.
 
-2. **Sign Up:** Register at [NHS TRUD](https://isd.digital.nhs.uk/trud/user/guest/group/0/account/form)
+* Ensure you have conda installed, e.g. following instructions for miniconda from [https://docs.conda.io/en/latest/miniconda.html](https://docs.conda.io/en/latest/miniconda.html).
+* Create the environment: `conda env create -f conda.yaml`
+* Activate the environment: `conda activate acmc`
 
-3. **Subscribe** and accept the following licenses:
+**2. Register at TRUD** to access clinically assured terminology mappings: [NHS TRUD](https://isd.digital.nhs.uk/trud/user/guest/group/0/account/form)
 
-   - [NHS Read Browser](https://isd.digital.nhs.uk/trud/users/guest/filters/2/categories/9/items/8/releases)
-   - [NHS Data Migration](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/9/items/9/releases)
-   - https://isd.digital.nhs.uk/trud/users/authenticated/filters/0/categories/8/items/9/releases
-   - [ICD10 Edition 5 XML](https://isd.digital.nhs.uk/trud/users/authenticated/filters/0/categories/28/items/259/releases)
-   - [OPCS-4.10 Data Files](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/10/items/119/releases)
-   	<!-- - [BNF/Snomed Mapping data.xlsx](https://www.nhsbsa.nhs.uk/prescription-data/understanding-our-data/bnf-snomed-mapping) -->
+**3. Subscribe and accept the following licenses**
 
-Each data file has a "Subscribe" link that will take you to the licence. You will need to "Tell us about your subscription request" that summarises why you need access to the data. Your subscription will not be approved immediately and will remain in the "pending" state until it is. This is usually approved within 24 hours. 
-	
-4. **Get API Key:** Retrieve your API key from [NHS TRUD Account Management](https://isd.digital.nhs.uk/trud/users/authenticated/filters/0/account/manage).
-
-5. **Install TRUD:** Download and install NHS TRUD medical code resources.
-
-Executing the script using the command: `python trud.py --key <API_KEY>`.
-
-Processed tables will be saved as `.parquet` files in the `maps/processed/` directory.
-	- *Note: NHS TRUD defines one-way mappings and does <b>NOT ADVISE</b> reversing the mappings. If you still wish to reverse these into two-way mappings, duplicate the given `.parquet` table and reverse the filename (e.g. `read2_code_to_snomed_code.parquet` to `snomed_code_to_read2_code.parquet`)*
-
-6. ***Optional: Install OMOP Database:** Download and install OMOP vocabularies from [Athena OHDSI](https://athena.ohdsi.org/vocabulary/list). 
-	- Required vocabularies include:
-   		- 1) SNOMED
-		- 2) ICD9CM
-		- 17) Readv2
-		- 21) ATC
-		- 55) OPCS4
-		- 57) HES Specialty
-		- 70) ICD10CM
-		- 75) dm+d
-		- 144) UK Biobank
-		- 154) NHS Ethnic Category
-		- 155) NHS Place of Service
-   - Un-zip the downloaded folder and copy it's path.  
-   - Install vocabularies using:  
-     `python omop_api.py --install <PATH_TO_DOWNLOADED_FILES>`
-
-## Configuration 
-
-The JSON configuration file specifies how input codes are grouped into **concept sets**, which are collections of related codes used for defining phenotypes or other data subsets. The configuration is divided into two main components: the `"concept_sets"` object and the `"codes"` object. The `"codes"` objects specifies the inputted codes; their filepaths, column names and code types, as well as any formatting actions that maybe be neccessary. The `"concept_sets"` object defines a grouping that each of the inputted codes will be assigned to. All files must be formatted as shown below. 
-```json
-{
-	"concept_sets": {
-	},
-	"codes":[
-	]
-}
-```
+ACMC uses clinically assured medical terminologies provided by the NHS. The data files are downloaded automatically, but you need to register, request subscriptions and obtain an API key.
 
-> **_EXAMPLE:_**  Configuration file used in the MELD-B project: https://git.soton.ac.uk/meldb/concepts/-/blob/main/PHEN_assign_v3.json?ref_type=heads 
+* [NHS Read Browser](https://isd.digital.nhs.uk/trud/users/guest/filters/2/categories/9/items/8/releases)
+* [NHS Data Migration](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/8/items/9/releases)
+* [ICD10 Edition 5 XML](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/28/items/259/releases)
+* [OPCS-4.10 Data Files](https://isd.digital.nhs.uk/trud/users/guest/filters/0/categories/10/items/119/releases)
+   	<!-- - [BNF/Snomed Mapping data.xlsx](https://www.nhsbsa.nhs.uk/prescription-data/understanding-our-data/bnf-snomed-mapping) -->
 
+Each data file has a "Subscribe" link that will take you to the licence. You will need to complete a "Tell us about your subscription request" form summarising why you need access to the data, e.g. for a specific research project. Your subscription will not be approved immediately and will remain in the "pending" state until it is; approval usually takes around 24 hours.
+	
+**4. Get TRUD API Key** 
 
-### Folder and File Definitions
+Go to your [NHS TRUD Account Management](https://isd.digital.nhs.uk/trud/users/authenticated/filters/0/account/manage) page and copy your API key to a safe place, e.g. a personal key store. The API key is required by the ACMC tools to download TRUD resources.
 
-The `"codes"` section defines the location and description of all input medical coding lists required for processing. Each `"folder"` is defined as an object of within the `"codes"` list. Similarily all files are objects within the `"files"` list.
+**5. Download and install TRUD resources** 
 
-- **`folder`**: Specifies the directory containing the input files.  
-- **`description`**: Provides a brief summary of the content or purpose of the files, often including additional context such as the date the data was downloaded.  
-- **`files`**: Lists the files within the specified folder. Each file is represented as an object with the key `"file"` and the file name as its value. Definitions of the columns in each file are detailed below.
+Run the following command to download, install and process the TRUD resources:
 
-```json
-"codes":[
-	{
-		"folder": "codes/Medication code source",
-		"description": "Medication Codes - downloaded 15/12/23",
-		"files": [
-			{
-				"file": "WP02_SAIL_WILK_matched_drug_codes_with_categories.xlsx"
-			}
-		]
-	}
-]
-```
+`python acmc.py trud install --api-key <API_KEY>`
 
-### Column Definitions in Files
-The `"columns"` property within a file object specifies the type and corresponding names of columns in the input file. Each key in the object represents a column type, while the associated value denotes the name of the column in the input file. 
+Processed TRUD resources are saved as `.parquet` files in the `build/maps/processed/` directory.
 
-The supported column types include:
-- **`read2_code`**: Read Version 2 codes  
-- **`read3_code`**: Read Version 3 codes  
-- **`icd10_code`**: International Classification of Diseases, 10th Revision  
-- **`snomed_code`**: SNOMED-CT codes  
-- **`opcs4_code`**: OPCS Classification of Interventions and Procedures, Version 4  
-- **`atc_code`**: Anatomical Therapeutic Chemical classification codes  
+*Note: NHS TRUD defines one-way mappings and does <b>NOT ADVISE</b> reversing the mappings. If you still wish to reverse these into two-way mappings, duplicate the given `.parquet` table and reverse the filename (e.g. `read2_code_to_snomed_code.parquet` to `snomed_code_to_read2_code.parquet`)*
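+
+A minimal sketch of creating such a reversed duplicate (paths are illustrative and assume the default processed-maps directory):
+
+```python
+import shutil
+
+# Duplicate the one-way map under the reversed filename.
+# NHS TRUD does not advise this; only do it if you understand the implications.
+src = "build/maps/processed/read2_code_to_snomed_code.parquet"
+dst = "build/maps/processed/snomed_code_to_read2_code.parquet"
+shutil.copy(src, dst)
+```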
 
-Additionally, the `"metadata"` object ensures that any remaining columns not explicitly categorized by the supported column types are preserved in the output file. These columns are specified as an array of column names to be copied directly.
+**6. Optional: Install OMOP Database**
 
-```json
-"files": [
-	{
-		"file":"WP02_SAIL_WILK_matched_drug_codes_with_categories.xlsx",
-		"columns": {
-			"read2_code": "READCODE",
-			"metadata": ["DESCRIPTION"]
-		}
-	}
-]
-```
+ACMC optionally supports outputting coding lists to a structured OMOP database. To do this you will need to register with [Athena](https://athena.ohdsi.org/auth/login?forceSSO=true) and then download the following vocabularies manually from [Athena OHDSI](https://athena.ohdsi.org/vocabulary/list).
 
+* Required vocabularies include:
+  * 1) SNOMED
+  * 2) ICD9CM
+  * 17) Readv2
+  * 21) ATC
+  * 55) OPCS4
+  * 57) HES Specialty
+  * 70) ICD10CM
+  * 75) dm+d
+  * 144) UK Biobank
+  * 154) NHS Ethnic Category
+  * 155) NHS Place of Service
 
+The vocabularies will not be available immediately; you will be notified by email when they are ready. This process cannot be automated due to the way that Athena delivers vocabularies for download.
 
-### Concept Set Assigment
+* Unzip the downloaded folder and copy its path.
 
-The `"concept_sets"` object defines the structure and rules for grouping input codes into concept sets based on a source CSV file. Key elements include:
+* Install vocabularies using the following command:  
 
-- **`file`**: Specifies the CSV file used as the input for defining concept sets. 
+`python acmc.py omop install --omop-folder <PATH_TO_DOWNLOADED_FILES>`
 
-- **`version`**: Identifies the version of the concept set definitions being used. This can help track changes over time.  
+## Defining phenotypes
 
-- **`concept_set`**: Defines a list of concept_set objects along with their attributes:
-  - **`concept_set_name`**: Specifies the name of the concept set.  
-  - **`concept_set_status`**: Indicates the status of the concept set. Only concept sets the **"AGREED"** status will be outputted! 
-  - **`metadata`** (optional): A list of additional properties that will be copied into the output. Can be used for descriptive or contextual purposes.
+Phenotypes are defined in a JSON configuration file. The file describes how source **concept codes** (i.e. code lists) are mapped to the collection of **concept sets** included in the phenotype.
 
-The `"codes"` object specifies the source files containing input codes and assigns them to the corresponding concept sets through the `"concept_set"` field. 
+* **concept_sets**: defines the collection of observable characteristics of the phenotype. See Observational Health Data Sciences and Informatics (OHDSI) definition for [Concept Set](https://github.com/OHDSI/Atlas/wiki/Concept-Sets)
+* **codes**: defines the lists of source concept codes associated with each concept set, together with the declarative information needed to process the source code files (e.g. filepaths, column names, code types and preprocessing actions). See OMOP Common Data Model definition for [Concept Codes](https://ohdsi.github.io/TheBookOfOhdsi/StandardizedVocabularies.html#concept-codes)
 
- - **`concept_set`**: Lists the concept sets to which all codes within this file will be assigned.
+An example concept set and code list for Abdominal Pain is shown below:
 
 ```json
+
 {
 	"concept_sets": {
 		"version": "3.2.10",
@@ -193,31 +139,89 @@ The `"codes"` object specifies the source files containing input codes and assig
 				"concept_set_name": "ABDO_PAIN",
 				"concept_set_status": "AGREED",
 				"metadata": {
-					"concept_set_description": "Abdominal pain",
+					"#": "18",
+					"CONCEPT DESCRIPTION": "Abdominal pain",
+					"CONCEPT TYPE": "Workload indicator (symptom)",
+					"DATE ADDED ": "2023-08-25",
+					"REQUEST REASON ": "Clinician SF - requested by email - symptom example from Qualitative Evidence Synthesis",
+					"SOURCE INFO": "YES",
+					"FUNCTION": "QUERY BY CODING LIST",
+					"FUNCTION.1": "https://clinicalcodes.rss.mhs.man.ac.uk/",
+					"CODING LIST": "https://git.soton.ac.uk/meld/meldb-external/phenotype/-/tree/main/codes/ClinicalCodes.org%20from%20the%20University%20of%20Manchester/Symptom%20code%20lists/Abdominal%20pain/res176-abdominal-pain.csv ",
+					"NOTES": "2023-09-08: Clinical SF confirmed that the clinical view would be that this would need to be recurrent or persistent.",
 				}
-			}
-		]
-	},
-	"codes":[
+			}
+		]
+	},
+	"codes": [
 		{
-			"folder": "codes/Medication code source",
-			"description": "Medication Codes - downloaded 15/12/23",
+			"folder": "codes/ClinicalCodes.org from the University of Manchester",
+			"description": "SF's clinical codes - downloaded 16/11/23",
 			"files": [
 				{
-					"file": "WP02_SAIL_WILK_matched_drug_codes_with_categories.xlsx",
-					"concept_set": ["ALL_MEDICATIONS"]
-				}
-			]
+					"file": "Symptom code lists/Abdominal pain/res176-abdominal-pain.csv",
+					"columns": {
+						"read2_code": "code",
+						"metadata": [
+							"description"
+						]
+					},
+					"concept_set": [
+						"ABDO_PAIN"
+					]
+				}
+			]
 		}
 	]
 }
 ```
 
-### Additional preprocessing (if required):
+A full example of the phenotype for burdensome multiple long-term conditions from the MELD-B project can be found [here](https://git.soton.ac.uk/meldb/concepts/-/blob/main/PHEN_assign_v3.json?ref_type=heads)
+
+### Defining concept sets
+
+The `"concept_sets"` object defines the structure for grouping input codes into concept sets based on source concepts. Key elements include:
+
+- **`version`**: Identifies the version of the concept set definitions being used. This can help track changes over time.  
+
+- **`concept_set`**: Defines a list of concept_set objects along with their attributes:
+  - **`concept_set_name`**: Specifies the name of the concept set.  
+  - **`concept_set_status`**: Indicates the status of the concept set. Only concept sets with the **"AGREED"** status will be included in the output.
+  - **`metadata`** (optional): A list of additional properties that will be copied into the output. Can be used for descriptive or contextual purposes.
+
+### Defining concept codes
+
+The `"codes"` object defines the location and description of all input medical coding lists required for processing. Each `"folder"` is defined as an object within the `"codes"` list. Similarily all files are objects within the `"files"` list.
+
+* **`folder`**: Specifies the directory containing the input files.  
+* **`description`**: Provides a brief summary of the content or purpose of the files, often including additional context such as the date the data was downloaded.  
+* **`files`**: Lists the files within the specified folder. Each file is represented as an object with the key `"file"` and the file name as its value. Definitions of the columns in each file are detailed below.
+
+### Mapping source column definitions in files to standard vocabulary types
+
+The `"columns"` property within a file object specifies the type and corresponding names of columns in the input file. Each key in the object represents a column type, while the associated value denotes the name of the column in the input file. 
+
+The supported column types include:
+
+* **`read2_code`**: Read Version 2 codes  
+* **`read3_code`**: Read Version 3 codes  
+* **`icd10_code`**: International Classification of Diseases, 10th Revision  
+* **`snomed_code`**: SNOMED-CT codes  
+* **`opcs4_code`**: OPCS Classification of Interventions and Procedures, Version 4  
+* **`atc_code`**: Anatomical Therapeutic Chemical classification codes  
+
+Additionally, the `"metadata"` object ensures that any remaining columns not explicitly categorized by the supported column types are preserved in the output file. These columns are specified as an array of column names to be copied directly.
+
+### Mapping codes to concept sets
+
+The `"codes"` object are mapping to a corresponding concept sets through the `"concept_set"` field.
+
+* **`concept_set`**: Lists the concept sets to which all codes within this file will be assigned.
+
+### Additional preprocessing actions supported
+
 In certain cases, where you wish to sub-divide a code list table or where a column contains multiple code types, additional processing is required. Add an `actions` object inside the `file` object.
 
 #### Table with a sub-categorical column:
 In order to sub-divide a table by a categorical column use the "divide_col" action
+
 ```json
 "actions":{
 	"divide_col": "MMCode"
@@ -225,9 +229,11 @@ In order to sub-divide a table by a categorical column use the "divide_col" acti
 ```
 
 #### Table with multiple code types in single column:
+
 Need to split column into multiple columns, so only one code type per column.
-- The "split_col" attribute is the categorical column indicating the code type in that row. The <b>category names should replace column</b> names in the `columns` properties.
-- The "codes_col" attribute is the code column with mulitple code types in a single column
+* The "split_col" attribute is the categorical column indicating the code type in that row. The <b>category names should replace column</b> names in the `columns` properties.
+* The "codes_col" attribute is the code column with mulitple code types in a single column
+
 ```json
 "actions":{
 	"split_col":"coding_system",
@@ -243,15 +249,15 @@ Need to split column into multiple columns, so only one code type per column.
 
 **<b>Large Code lists</b> with numerous phenotypes (e.g. Ho et al), require lots of JSON to be generated. See the "Ho generate JSON" section in process_codes_WP.ipynb for example code to generate* 
 
-
-
 ## Usage
+
 Script to preprocess code lists and map them to a given concept/phenotype.
 
 ### Execute Command Line
 Execute via shell with customizable parameters:
+
 ```bash
-python main.py [-h] [-r2] [-r3] [-i] [-s] [-o] [-a] [--no-translate] [--no-verify] [--output] [--error-log] mapping_file
+python acmc.py map process -c <config_file> -o <output_dir> -t <target_coding> [--translate] [--verify] [--error-log <file>]
 ```
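+
+The same processing step can also be invoked directly from Python, which is all the `map process` subcommand in `acmc.py` does. A minimal sketch (argument values are illustrative; the function signature is taken from `map.py`):
+
+```python
+import map
+
+# Run phenotype processing programmatically (mirrors `python acmc.py map process ...`)
+map.process(
+    "PHEN_assign_v3.json",         # phenotype configuration file
+    "read2",                       # target coding: read2, read3, icd10, snomed or opcs4
+    translate=True,                # translate source codes to the target coding
+    verify=True,                   # verify codes against the TRUD-derived tables
+    log_errors_path="errors.csv",  # where invalid codes are logged
+    output_path="concepts_read2.csv",
+)
+```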
 
 **Required Arguments:**
@@ -274,6 +280,7 @@ python main.py [-h] [-r2] [-r3] [-i] [-s] [-o] [-a] [--no-translate] [--no-verif
 ## Contributing
 
 ### Commit to GitLab
+
 ```
 git add .
 git commit -m "my message ..."
@@ -282,9 +289,11 @@ git push
 ```
 
 ## Acknowledgements  
+
 This project was developed in the context of the [MELD-B](https://www.southampton.ac.uk/publicpolicy/support-for-policymakers/policy-projects/Current%20projects/meld-b.page) project, which is funded by the UK [National Institute of Health Research](https://www.nihr.ac.uk/) under grant agreement NIHR203988.
 
 ## License
+
 This work is licensed under a [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
 
 ![apache2](https://img.shields.io/github/license/saltstack/salt)
diff --git a/acmc.py b/acmc.py
new file mode 100644
index 0000000..7c36a38
--- /dev/null
+++ b/acmc.py
@@ -0,0 +1,111 @@
+import argparse
+
+import trud
+import omop
+import map
+
+def trud_install(args):
+    """Handle the `trud install` command."""
+    print(f"Installing TRUD")
+    trud.install(args.api_key)
+    print(f"TRUD installation completed")
+
+def omop_install(args):
+    """Handle the `omop install` command."""
+    print(f"Installing OMOP database")
+    omop.install(omop.OMOP_DB_PATH, args.omop_folder)
+    print(f"OMOP installation completed")    
+
+def omop_clear(args):
+    """Handle the `omop clear` command."""
+    print(f"Clearing OMOP data from database")
+    omop.clear(omop.OMOP_DB_PATH)
+    print(f"OMOP database cleared")        
+
+def omop_delete(args):
+    """Handle the `omop delete` command."""
+    print(f"Deleting OMOP database")
+    omop.delete(omop.OMOP_DB_PATH)
+    print(f"OMOP database deleted")    
+
+def map_process(args):
+	"""Handle the `map process` command."""    
+	print(f"Processing map with phenotype config file: {args.config_file}")
+	print(f"Output directory: {args.output_dir}")
+	print(f"Target coding format: {args.target_coding}")	
+	if args.translate:
+		print("Translating code types.")
+	else:
+		print("Not translating codes")
+	if args.verify:
+		print("Verifying codes.")
+	else:
+		print("Not verifying codes.")
+	if args.error_log:
+		print(f"Saving errors to: {args.error_log}")
+	else:
+		args.error_log = 'errors.csv'
+		
+	map.process(args.config_file, args.target_coding, args.translate, args.verify, args.error_log, output_path="MELD_concepts_read.csv")
+
+	print(f"Phenotype processing completed")
+
+def main():
+	parser = argparse.ArgumentParser(description="ACMC command-line tool")
+	
+    # Top-level commands
+	subparsers = parser.add_subparsers(dest="command", required=True, help="Available commands")
+
+	### TRUD Command ###
+	trud_parser = subparsers.add_parser("trud", help="TRUD commands")
+	trud_subparsers = trud_parser.add_subparsers(dest="subcommand", required=True, help="TRUD subcommands")
+	
+	# trud install
+	trud_install_parser = trud_subparsers.add_parser("install", help="Install TRUD components")
+	trud_install_parser.add_argument("-k", "--api-key", required=True, help="TRUD API Key")
+	trud_install_parser.set_defaults(func=trud_install)
+	
+	### OMOP Command ###
+	omop_parser = subparsers.add_parser("omop", help="OMOP commands")
+	omop_subparsers = omop_parser.add_subparsers(dest="subcommand", required=True, help="OMOP subcommands")
+	
+	# omop install
+	omop_install_parser = omop_subparsers.add_parser("install", help="Install OMOP codes within database")
+	omop_install_parser.add_argument("-f", "--omop-folder", required=True, help="Path to OMOP downloads folder")
+	omop_install_parser.set_defaults(func=omop_install)
+	
+	# omop clear
+	omop_clear_parser = omop_subparsers.add_parser("clear", help="Clear OMOP data from database")
+	omop_clear_parser.set_defaults(func=omop_clear)
+	# omop delete
+	omop_delete_parser = omop_subparsers.add_parser("delete", help="Delete OMOP database")
+	omop_delete_parser.set_defaults(func=omop_delete)
+	
+	### MAP Command ###
+	map_parser = subparsers.add_parser("map", help="Map commands")
+	map_subparsers = map_parser.add_subparsers(dest="subcommand", required=True, help="Map subcommands")
+	
+	# map process
+	map_process_parser = map_subparsers.add_parser("process", help="Process map configuration file")
+	map_process_parser.add_argument("-c", "--config-file", required=True, help="Phenotype configuration file")
+	map_process_parser.add_argument("-o", "--output-dir", required=True, help="Output directory for CSV or OMOP database")
+	map_process_parser.add_argument("-t", "--target-coding", required=True, choices=['read2', 'read3', 'icd10', 'snomed', 'opcs4'], help="Specify the target coding (read2, read3, icd10, snomed, opcs4)")
+	
+	# Flags
+	map_process_parser.add_argument("-tr", "--translate", action="store_true", default=False, help="Do not translate code types")
+	map_process_parser.add_argument("-v", "--verify", action="store_true", default=False, help="Do not verify codes")
+	
+	# Error log file
+	map_process_parser.add_argument("-l", "--error-log", type=str, default='error.csv', help="Filepath to save error log to")
+
+	# Set the function to call when 'process' subcommand is used
+	map_process_parser.set_defaults(func=map_process)    
+
+	# Parse arguments
+	args = parser.parse_args()
+
+	# Call the function associated with the command
+	args.func(args)
+
+if __name__ == "__main__":
+    main()
diff --git a/main.py b/main.py
deleted file mode 100644
index ab32f81..0000000
--- a/main.py
+++ /dev/null
@@ -1,335 +0,0 @@
-import argparse
-import pandas as pd
-import numpy as np
-# import pathlib
-import json
-import os
-import sqlite3
-
-from base import log_invalid_code
-from base import bcolors
-from base import raise_
-from parse import Read2_code
-from parse import Read3_code
-from parse import Icd10_code
-from parse import Snomed_code
-from parse import Opcs4_code
-from parse import Atc_code
-from parse import Med_code
-from parse import code_types
-from parse import omop_vocab_types
-from omop_api import db_path
-from omop_api import omop_publish_concept_sets
-from omop_api import omop_setup
-
-pd.set_option('mode.chained_assignment', None)
-
-def read_table_file(path, excel_sheet=None):
-	"""
-	Load Code List File
-	"""
-	if path.endswith(".csv"):
-		df = pd.read_csv(path, dtype=str)
-	elif path.endswith(".xlsx"):
-		if excel_sheet:
-			df = pd.read_excel(path, sheet_name=excel_sheet, dtype=str)
-		else:
-			df = pd.read_excel(path, dtype=str)
-	elif path.endswith(".dta"):
-		df = pd.read_stata(path, dtype=str)
-	else:
-		raise Exception("Unsupported filetype provided for source file")
-	return df
-
-def preprocess_code(out, codes, checker, output_col, df_meta, no_verify=False):
-	codes = codes.astype(str) #convert to string
-	codes = codes.str.strip() #remove excess spaces
-	if not no_verify:
-		codes = checker.process(codes) #resolve any identified issues
-		if not checker.verify(codes): #verify all issues resolved
-			print("ERROR: FAILED")
-	#add metadata columns
-	out = pd.concat([out,
-					 pd.DataFrame({output_col:codes}).join(df_meta)],
-					 ignore_index=True
-				   )
-	return out
-	
-#Perform QA Checks on columns individually and append to df
-def preprocess(df, columns, target_code_type=None, meta_columns=[], file_path=None,  no_verify=False, no_translate=False):
-	"""
-	Parses each column individually - Order and length will not be preserved!
-	"""
-	out = pd.DataFrame([]) #create output df to append to
-	
-	if target_code_type and no_translate:
-		#QA only on target codes
-		if target_code_type in columns:
-			print(f"Processing {target_code_type} Codes...")
-			out = preprocess_code(
-				out=out,
-				codes=df[columns[target_code_type]].dropna(),
-				checker=code_types[target_code_type](file_path),
-				output_col=target_code_type,
-				df_meta =df[meta_columns],
-				no_verify=no_verify
-			)
-		else:
-			print(f"No {target_code_type} Codes to process")
-	else:
-		#QA for every code type in df run preprocess_code()
-		for k, v in code_types.items():
-			if k in columns:
-				print(f"Processing {k} Codes...")
-				out = preprocess_code(
-					out=out,
-					codes=df[columns[k]].dropna(),
-					checker=v(file_path),
-					output_col=k,
-					df_meta = df[meta_columns],
-					no_verify=no_verify
-				)
-	
-	return out
-
-#Translate Df with multiple codes into single code type Series
-def convert_codes(df, target, no_translate):
-	codes = pd.Series([], dtype=str)
-	
-	#Append target column (if exists) - doesn't need conversion
-	if target in df.columns:
-		print("Has", len(df), target,"in file")
-		codes = pd.concat([codes,df[target]])
-	# else:
-	# 	print("No",target,"in file")
-	
-	if not no_translate:
-		#Convert codes to target type
-		for col_name in df.columns[df.columns != target]:
-			path_map = f"maps/processed/{col_name}_to_{target}.parquet"
-			if os.path.exists(path_map):
-				col = df[col_name]
-				df_map = pd.read_parquet(path_map)
-				translated = pd.merge(col,
-									  df_map,
-									  how='left')[target] #merge on corresponding codes and take target column
-				#TODO: BUG mask does not match column
-				# log_invalid_code(col,
-				# 				 ~translated.isna(),
-				# 				 code_type=col_name,
-				# 				 cause=f"Translation to {target}") #log codes with no translation
-				codes = pd.concat([codes, translated]) #merge to output
-			else:
-				print(f"No mapping from {col_name} to {target}")
-	else:
-		print("NOT TRANSLATING")
-
-	return codes
-
-#Append file's codes to output Df with meldb concept
-def map_file(df, target_code_type, out, concepts, meta_columns=[], no_translate=False):
-	# seperate out meta_columns
-	df_meta = df[meta_columns] 
-	df = df.drop(columns=meta_columns)
-	codes = convert_codes(df, target_code_type, no_translate)
-	codes = codes.dropna() #delete NaNs
-	
-	#Append to out df
-	if len(codes) > 0:
-		codes = pd.DataFrame({
-			"CONCEPT":codes
-		})
-		codes = codes.join(df_meta)
-		for concept in concepts:
-			codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
-			out = pd.concat([out, codes])
-	return out
-
-def sql_row_exist(conn, table, column, value):
-	# Execute and check if a result exists
-	cur = conn.cursor()
-	query = f"SELECT 1 FROM {table} WHERE {column} = ? LIMIT 1;"
-	cur.execute(query, (value,))
-	exists = cur.fetchone() is not None
-	
-	return exists
-
-def run_all(mapping_file, target_code_type, 
-			no_translate=False, no_verify=False, 
-			log_errors_path="MELD_errors.csv",
-			output_path="MELD_concepts_read.csv"):
-	
-	#Load Mapping File
-	if mapping_file.endswith(".json"):
-		mapping = json.load(open(mapping_file,'rb'))
-		folders = mapping["codes"]
-		summary_config = mapping["concept_sets"]
-	else:
-		raise Exception("Unsupported filetype provided for source file")
-	
-	out = pd.DataFrame([]) #Create Output File to append to
-	
-	#Iterate JSON mapping file (OBJECT FORMAT)
-	for folder in folders:
-		print(bcolors.HEADER, folder["description"], bcolors.ENDC)
-		if "files" in folder:
-			for file in folder["files"]:
-				print("---"*5, file["file"], "---"*5)
-				file_path = folder["folder"]+"/"+file["file"]
-				
-				#Load Code File
-				if "excel_sheet" in file:
-					df = read_table_file(path=file_path,
-									    excel_sheet = file["excel_sheet"])
-				else:
-					df = read_table_file(path=file_path)
-					
-				#Perform Structural Changes to file before preprocessing
-				#split column with multiple code types
-				if "actions" in file and "split_col" in file["actions"] and "codes_col" in file["actions"]: 
-					split_col = file["actions"]["split_col"]
-					codes_col = file["actions"]["codes_col"]
-					print("Action: Splitting",split_col,"column into:",
-						  df[split_col].unique())
-					codes = df[codes_col]
-					oh = pd.get_dummies(df[split_col], dtype=bool) #one hot encode
-					oh = oh.where((oh != True), codes, axis=0) #fill in 1s with codes
-					oh[oh == False] = np.NaN #replace 0s with None
-					df = pd.concat([df, oh], axis=1) #merge in new columns
-				
-				#Preprocessing & Validation Checks 
-				if "columns" in file:
-					meta_columns=[] #meta columns to keep with codes
-					if "actions" in file and "divide_col" in file["actions"]: 
-						meta_columns += [file["actions"]["divide_col"]]
-					#TODO: enable metacolumns to be outputted - problem with map_file appending
-					if "metadata" in file["columns"]:
-						meta_columns += file["columns"]["metadata"]
-					df = preprocess(df,
-									file["columns"],
-									meta_columns=meta_columns,
-									file_path=file_path,
-									target_code_type=target_code_type,
-									no_verify=no_verify,
-									no_translate=no_translate)
-				else:
-					raise Exception("No column format provided")
-				
-				# partition table by categorical column
-				if "actions" in file and "divide_col" in file["actions"] and len(df) > 0: 
-					divide_col = file["actions"]["divide_col"]
-					print("Action: Dividing Table by",divide_col,"column into: ",
-						  df[divide_col].unique())
-					df = df.groupby(divide_col)
-				
-				#Map to MELDB Concept/Phenotype
-				if len(df) == 0:
-					pass
-					# out = df
-				elif ("concept_set" in file) and isinstance(df, pd.core.frame.DataFrame):
-					out = map_file(df,
-								   target_code_type,
-								   out, 
-								   concepts=file["concept_set"],
-								   meta_columns=meta_columns,
-								   no_translate=no_translate)
-				elif ("concept_set_categories" in file) and isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
-					meta_columns.remove(divide_col) #delete categorical column
-					for cat, grp in df:		
-						if cat in file["concept_set_categories"].keys(): #check if category is mapped
-							grp = grp.drop(columns=[divide_col]) #delete categorical column
-							print("Category:", cat)
-							out = map_file(grp, target_code_type, out, 
-										   concepts = file["concept_set_categories"][cat],
-										   meta_columns=meta_columns)
-					
-		else:
-			print("Folder is empty")
-	
-	#check if out is empty
-	if len(out) <= 0:
-		raise Exception("Output file is empty")
-	
-	#Final Processing
-	out = out.reset_index(drop=True)
-	out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
-	out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
-	
-	#Add Concept Set Defintions metadata
-	summary_df = pd.DataFrame(summary_config["concept_set"]) #transform to dataframe
-	if "metadata" in summary_df.columns:
-		summary_df = summary_df.join(pd.json_normalize(summary_df["metadata"])) #metadata to columns
-		summary_df = summary_df.drop(columns=["metadata"])
-	summary_df = summary_df.rename(columns={"concept_set_name":"CONCEPT_SET"})
-	summary_df = summary_df.drop_duplicates() #remove duplicates
-	out = out.merge(summary_df, how="left", on='CONCEPT_SET') #merge with output
-	
-	# Save Output File
-	print(bcolors.HEADER, "---"*5, "OUTPUT", "---"*5, bcolors.ENDC)
-	print(out)
-	if output_path == "atlas":
-		
-		vocab_id = summary_config["omop"]["vocabulary_id"]
-		vocab_version = summary_config["version"]
-		vocab_name = summary_config["omop"]["vocabulary_name"]
-		vocab_reference = summary_config["omop"]["vocabulary_reference"]
-
-		#Create New OMOP Vocabulary
-		omop_setup(db_path, vocab_id, vocab_version, vocab_name, vocab_reference)
-		
-		#Export to DB
-		omop_publish_concept_sets(out, db_path, vocab_id, omop_vocab_types[target_code_type], vocab_version)
-	else:
-		# export as CSV to /output
-		out.to_csv(output_path, index=False)
-		print("saved to", output_path)
-	
-	# Save Error File
-	if os.path.exists(log_errors_path):
-		error_df = pd.read_csv(log_errors_path)
-		error_df = error_df.drop_duplicates() #Remove Duplicates from Error file
-		error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"])
-		error_df.to_csv(log_errors_path, index=False)
-	
-
-if __name__ == '__main__':
-	parser = argparse.ArgumentParser(description="Script preprocess code lists and to map to given concept/phenotype",
-									 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-	
-	for v in code_types.values():
-		code_type = v()
-		parser.add_argument(code_type.arg_small, code_type.arg_long,
-							action='store_true',
-							help=code_type.arg_help)
-	parser.add_argument("mapping_file", help="Concept/Phenotype Assignment File (json)")
-	parser.add_argument("--no-translate", action='store_true', help="Do not translate code types")
-	parser.add_argument("--no-verify", action='store_true', help="Do not verify codes are correct")
-	parser.add_argument("--output", type=str, help="File Location to save output csv to")
-	parser.add_argument("--error-log", type=str, help="File Location to save error log csv to")
-
-	args = parser.parse_args()
-	config = vars(args)
-
-	#Check which code type is the target
-	specified = False
-	for k in code_types.keys():
-		if config[k]:
-			specified = True
-			target_code_type = k
-	if not specified:
-		raise Exception("Specify target code type")
-
-	#Format Arguments for python function
-	params={}
-	#Required Params
-	params["mapping_file"] = config["mapping_file"] if "mapping_file" in config else Exception("Must specify Location of JSON Mapping File")
-	params["target_code_type"] = target_code_type
-	#Optional Params
-	params["no_translate"] = config["no_translate"]
-	params["no_verify"] = config["no_verify"]
-	if not config["output"] == None :
-		params["output_path"] = config["output"]
-	if not config["error_log"] == None:
-		params["log_errors_path"] = config["error_log"]
-
-	run_all(**params)
\ No newline at end of file
diff --git a/map.py b/map.py
new file mode 100644
index 0000000..828ec4a
--- /dev/null
+++ b/map.py
@@ -0,0 +1,341 @@
+import argparse
+import pandas as pd
+import numpy as np
+import json
+import os
+import sqlite3
+
+import trud
+
+from base import log_invalid_code
+from base import bcolors
+from base import raise_
+from parse import Read2_code
+from parse import Read3_code
+from parse import Icd10_code
+from parse import Snomed_code
+from parse import Opcs4_code
+from parse import Atc_code
+from parse import Med_code
+from parse import code_types
+from parse import vocab_types
+from omop import OMOP_DB_PATH
+from omop import publish_concept_sets
+from omop import setup
+
+pd.set_option("mode.chained_assignment", None)
+
+
+def read_table_file(path, excel_sheet=None):
+    """
+    Load Code List File
+    """
+    if path.endswith(".csv"):
+        df = pd.read_csv(path, dtype=str)
+    elif path.endswith(".xlsx"):
+        if excel_sheet:
+            df = pd.read_excel(path, sheet_name=excel_sheet, dtype=str)
+        else:
+            df = pd.read_excel(path, dtype=str)
+    elif path.endswith(".dta"):
+        df = pd.read_stata(path, dtype=str)
+    else:
+        raise Exception("Unsupported filetype provided for source file")
+    return df
+
+
+def preprocess_code(out, codes, checker, output_col, df_meta, verify=True):
+    codes = codes.astype(str)  # convert to string
+    codes = codes.str.strip()  # remove excess spaces
+    if verify:
+        codes = checker.process(codes)  # resolve any identified issues
+        if not checker.verify(codes):  # verify all issues resolved
+            print("ERROR: FAILED")
+    # add metadata columns
+    out = pd.concat(
+        [out, pd.DataFrame({output_col: codes}).join(df_meta)], ignore_index=True
+    )
+    return out
+
+
+# Perform QA Checks on columns individually and append to df
+def preprocess(
+    df,
+    columns,
+    target_code_type=None,
+    meta_columns=[],
+    file_path=None,
+    verify=True,
+    translate=True,
+):
+    """
+    Parses each column individually - Order and length will not be preserved!
+    """
+    out = pd.DataFrame([])  # create output df to append to
+
+    if target_code_type and not translate:
+        # QA only on target codes
+        if target_code_type in columns:
+            print(f"Processing {target_code_type} Codes...")
+            out = preprocess_code(
+                out=out,
+                codes=df[columns[target_code_type]].dropna(),
+                checker=code_types[target_code_type](file_path),
+                output_col=target_code_type,
+                df_meta=df[meta_columns],
+                verify=verify,
+            )
+        else:
+            print(f"No {target_code_type} Codes to process")
+    else:
+        # QA for every code type in df run preprocess_code()
+        for k, v in code_types.items():
+            if k in columns:
+                print(f"Processing {k} Codes...")
+                out = preprocess_code(
+                    out=out,
+                    codes=df[columns[k]].dropna(),
+                    checker=v(file_path),
+                    output_col=k,
+                    df_meta=df[meta_columns],
+                    verify=verify,
+                )
+
+    return out
+
+
+# Translate Df with multiple codes into single code type Series
+def convert_codes(df, target, translate):
+    codes = pd.Series([], dtype=str)
+
+    # Append target column (if exists) - doesn't need conversion
+    if target in df.columns:
+        print("Has", len(df), target, "in file")
+        codes = pd.concat([codes, df[target]])
+    # else:
+    # 	print("No",target,"in file")
+
+    if translate:
+        # Convert codes to target type
+        for col_name in df.columns[df.columns != target]:
+            path_map = trud.MAPS_PROCESSED_DIR / f"{col_name}_to_{target}.parquet"
+            if os.path.exists(path_map):
+                col = df[col_name]
+                df_map = pd.read_parquet(path_map)
+                translated = pd.merge(col, df_map, how="left")[
+                    target
+                ]  # merge on corresponding codes and take target column
+                # TODO: BUG mask does not match column
+                # log_invalid_code(col,
+                # 				 ~translated.isna(),
+                # 				 code_type=col_name,
+                # 				 cause=f"Translation to {target}") #log codes with no translation
+                codes = pd.concat([codes, translated])  # merge to output
+            else:
+                print(f"No mapping from {col_name} to {target}")
+    else:
+        print("NOT TRANSLATING")
+
+    return codes
+
+
+# Append file's codes to output Df with meldb concept
+def map_file(df, target_code_type, out, concepts, meta_columns=[], translate=True):
+    # seperate out meta_columns
+    df_meta = df[meta_columns]
+    df = df.drop(columns=meta_columns)
+    codes = convert_codes(df, target_code_type, translate)
+    codes = codes.dropna()  # delete NaNs
+
+    # Append to out df
+    if len(codes) > 0:
+        codes = pd.DataFrame({"CONCEPT": codes})
+        codes = codes.join(df_meta)
+        for concept in concepts:
+            codes["CONCEPT_SET"] = np.repeat(concept.strip(), len(codes))
+            out = pd.concat([out, codes])
+    return out
+
+
+def sql_row_exist(conn, table, column, value):
+    # Execute and check if a result exists
+    cur = conn.cursor()
+    query = f"SELECT 1 FROM {table} WHERE {column} = ? LIMIT 1;"
+    cur.execute(query, (value,))
+    exists = cur.fetchone() is not None
+
+    return exists
+
+
+def process(config_file, target_code_type, translate=True, verify=True, log_errors_path="errors.csv", output_path="MELD_concepts_read.csv"):
+
+    # Load configuration File
+    if config_file.endswith(".json"):
+        mapping = json.load(open(config_file, "rb"))
+        folders = mapping["codes"]
+        summary_config = mapping["concept_sets"]
+    else:
+        raise Exception(f"Unsupported filetype provided for configuration file: {config_file}")
+
+    out = pd.DataFrame([])  # Create Output File to append to
+
+    # Iterate JSON mapping file (OBJECT FORMAT)
+    for folder in folders:
+        print(bcolors.HEADER, folder["description"], bcolors.ENDC)
+        if "files" in folder:
+            for file in folder["files"]:
+                print("---" * 5, file["file"], "---" * 5)
+                file_path = folder["folder"] + "/" + file["file"]
+
+                # Load Code File
+                if "excel_sheet" in file:
+                    df = read_table_file(
+                        path=file_path, excel_sheet=file["excel_sheet"]
+                    )
+                else:
+                    df = read_table_file(path=file_path)
+
+                # Perform Structural Changes to file before preprocessing
+                # split column with multiple code types
+                if (
+                    "actions" in file
+                    and "split_col" in file["actions"]
+                    and "codes_col" in file["actions"]
+                ):
+                    split_col = file["actions"]["split_col"]
+                    codes_col = file["actions"]["codes_col"]
+                    print(
+                        "Action: Splitting",
+                        split_col,
+                        "column into:",
+                        df[split_col].unique(),
+                    )
+                    codes = df[codes_col]
+                    oh = pd.get_dummies(df[split_col], dtype=bool)  # one hot encode
+                    oh = oh.where((oh != True), codes, axis=0)  # fill in 1s with codes
+                    oh[oh == False] = np.NaN  # replace 0s with None
+                    df = pd.concat([df, oh], axis=1)  # merge in new columns
+
+                # Preprocessing & Validation Checks
+                if "columns" in file:
+                    meta_columns = []  # meta columns to keep with codes
+                    if "actions" in file and "divide_col" in file["actions"]:
+                        meta_columns += [file["actions"]["divide_col"]]
+                    # TODO: enable metacolumns to be outputted - problem with map_file appending
+                    if "metadata" in file["columns"]:
+                        meta_columns += file["columns"]["metadata"]
+                    df = preprocess(
+                        df,
+                        file["columns"],
+                        meta_columns=meta_columns,
+                        file_path=file_path,
+                        target_code_type=target_code_type,
+                        verify=verify,
+                        translate=translate,
+                    )
+                else:
+                    raise Exception("No column format provided")
+
+                # partition table by categorical column
+                if (
+                    "actions" in file
+                    and "divide_col" in file["actions"]
+                    and len(df) > 0
+                ):
+                    divide_col = file["actions"]["divide_col"]
+                    print(
+                        "Action: Dividing Table by",
+                        divide_col,
+                        "column into: ",
+                        df[divide_col].unique(),
+                    )
+                    df = df.groupby(divide_col)
+
+                # Map to MELDB Concept/Phenotype
+                if len(df) == 0:
+                    pass
+                    # out = df
+                elif ("concept_set" in file) and isinstance(
+                    df, pd.core.frame.DataFrame
+                ):
+                    out = map_file(
+                        df,
+                        target_code_type,
+                        out,
+                        concepts=file["concept_set"],
+                        meta_columns=meta_columns,
+                        translate=translate,
+                    )
+                elif ("concept_set_categories" in file) and isinstance(
+                    df, pd.core.groupby.generic.DataFrameGroupBy
+                ):
+                    meta_columns.remove(divide_col)  # delete categorical column
+                    for cat, grp in df:
+                        if (
+                            cat in file["concept_set_categories"].keys()
+                        ):  # check if category is mapped
+                            grp = grp.drop(
+                                columns=[divide_col]
+                            )  # delete categorical column
+                            print("Category:", cat)
+                            out = map_file(
+                                grp,
+                                target_code_type,
+                                out,
+                                concepts=file["concept_set_categories"][cat],
+                                meta_columns=meta_columns,
+                            )
+
+        else:
+            print("Folder is empty")
+
+    # check if out is empty
+    if len(out) <= 0:
+        raise Exception("Output file is empty")
+
+    # Final Processing
+    out = out.reset_index(drop=True)
+    out = out.drop_duplicates(subset=["CONCEPT_SET", "CONCEPT"])
+    out = out.sort_values(by=["CONCEPT_SET", "CONCEPT"])
+
+    # Add Concept Set Defintions metadata
+    summary_df = pd.DataFrame(summary_config["concept_set"])  # transform to dataframe
+    if "metadata" in summary_df.columns:
+        summary_df = summary_df.join(
+            pd.json_normalize(summary_df["metadata"])
+        )  # metadata to columns
+        summary_df = summary_df.drop(columns=["metadata"])
+    summary_df = summary_df.rename(columns={"concept_set_name": "CONCEPT_SET"})
+    summary_df = summary_df.drop_duplicates()  # remove duplicates
+    out = out.merge(summary_df, how="left", on="CONCEPT_SET")  # merge with output
+
+    # Save Output File
+    print(bcolors.HEADER, "---" * 5, "OUTPUT", "---" * 5, bcolors.ENDC)
+    print(out)
+    if output_path == "atlas":
+
+        vocab_id = summary_config["omop"]["vocabulary_id"]
+        vocab_version = summary_config["version"]
+        vocab_name = summary_config["omop"]["vocabulary_name"]
+        vocab_reference = summary_config["omop"]["vocabulary_reference"]
+
+        # Create New OMOP Vocabulary
+        setup(OMOP_DB_PATH, vocab_id, vocab_version, vocab_name, vocab_reference)
+
+        # Export to DB
+        publish_concept_sets(
+            out,
+            OMOP_DB_PATH,
+            vocab_id,
+            vocab_types[target_code_type],
+            vocab_version,
+        )
+    else:
+        # export as CSV to /output
+        out.to_csv(output_path, index=False)
+        print("saved to", output_path)
+
+    # Save Error File
+    if os.path.exists(log_errors_path):
+        error_df = pd.read_csv(log_errors_path)
+        error_df = error_df.drop_duplicates()  # Remove Duplicates from Error file
+        error_df = error_df.sort_values(by=["SOURCE", "VOCABULARY", "CONCEPT"])
+        error_df.to_csv(log_errors_path, index=False)
diff --git a/mjb-conda.yaml b/mjb-conda.yaml
index 176ca83..eb437f4 100644
--- a/mjb-conda.yaml
+++ b/mjb-conda.yaml
@@ -76,12 +76,15 @@ dependencies:
   - zstandard=0.23.0=py313h80202fe_1
   - zstd=1.5.6=ha6fb4c9_0
   - pip:
+      - aiosqlite==0.21.0
+      - click==8.1.8
       - cramjam==2.9.1
       - fastparquet==2024.11.0
       - fsspec==2025.2.0
       - greenlet==3.1.1
       - lxml==5.3.1
       - pyarrow==19.0.0
+      - pyomop==4.3.0
       - simpledbf==0.2.6
       - sqlalchemy==2.0.38
 prefix: /opt/conda/envs/acmc
diff --git a/omop_api.py b/omop.py
similarity index 73%
rename from omop_api.py
rename to omop.py
index 1e5d7a6..37b2b0c 100644
--- a/omop_api.py
+++ b/omop.py
@@ -3,20 +3,32 @@ import argparse
 import sqlite3
 import pandas as pd
 
-db_path = "codes/omop_54.sqlite"
+from pathlib import Path
+
+OMOP_DB_DIR = Path('./codes/omop')
+OMOP_DB_PATH = OMOP_DB_DIR / 'omop_54.sqlite'
 
 #Populate SQLite3 Database with default OMOP CONCEPTS 
-def omop_install (db_path, folder_path):
-    conn = sqlite3.connect(db_path)
-    
-    # Check if the folder exists
-    if not os.path.isdir(folder_path):
-        raise Exception(f"Error: The folder '{folder_path}' does not exist.")  
+def install(db_path, omop_install_folder):
+
+    print(f"Installing OMOP files from {omop_install_folder}")
+
+    # check folder for omop install files is a directory
+    omop_install_path = Path(omop_install_folder)  
+    if not omop_install_path.is_dir():
+        raise NotADirectoryError(f"Error: '{omop_install_path}' for OMOP installation files is not a directory")    
 
+    # check codes directory exists and if not create it
+    if not OMOP_DB_DIR.exists():  
+        OMOP_DB_DIR.mkdir(parents=True)
+        print(f"Codes directory '{OMOP_DB_DIR}' created.")    
+
+    # connect to database, if it does not exist it will be created
+    conn = sqlite3.connect(db_path)
     # Iterate through files in the folder
-    for filename in os.listdir(folder_path):
+    for filename in os.listdir(omop_install_folder):
         if filename.endswith(".csv"):  # Check if the file is a CSV
-            file_path = os.path.join(folder_path, filename)
+            file_path = os.path.join(omop_install_folder, filename)
             try:
                 print(f"Reading file: {file_path}")
                 # Read the CSV file with the specified delimiter
@@ -47,7 +59,7 @@ def table_exists(cursor, table_name):
 
     return result is not None
 
-def omop_vocab_exists(cursor, vocab_id):
+def vocab_exists(cursor, vocab_id):
     # Query to check if the table exists
     cursor.execute(
         """
@@ -63,8 +75,8 @@ def omop_vocab_exists(cursor, vocab_id):
 
     return result is not None
 
-#Setup SQLite3 Database for OMOP
-def omop_setup(db_path, vocab_id, vocab_version, vocab_name, vocab_reference):
+def setup(db_path, vocab_id, vocab_version, vocab_name, vocab_reference):
+    #Setup SQLite3 Database for OMOP    
     conn = sqlite3.connect(db_path)
     cur = conn.cursor()
 
@@ -109,7 +121,7 @@ def omop_setup(db_path, vocab_id, vocab_version, vocab_name, vocab_reference):
 
     conn.close()
 
-def omop_publish_concept_sets(out, db_path, vocab_output, vocab_type, output_version):
+def publish_concept_sets(out, db_path, vocab_output, vocab_type, output_version):
     conn = sqlite3.connect(db_path)
     cur = conn.cursor()
 
@@ -141,37 +153,28 @@ def omop_publish_concept_sets(out, db_path, vocab_output, vocab_type, output_ver
 
     conn.close()
 
-def omop_clear(db_path):
+def clear(db_path):
+    omop_db_path = Path(db_path)
+    if not omop_db_path.is_file():  
+        raise FileNotFoundError(f"Error: OMOP DB file '{omop_db_path}' does not exist.")    
+    
     conn = sqlite3.connect(db_path)
     cur = conn.cursor()
-    
-    cur.execute("DROP TABLE CONCEPT_SET;")
-    cur.execute("DROP TABLE CONCEPT_SET_ITEM;")
-
-    conn.close()
-
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Installation of SQLite3 OMOP Database.",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("-i", "--install", type=str, help="Install OMOP Vocabularies in Database with download from athena.ohdsi.org")
-    parser.add_argument("--clear", action="store_true", help="Delete ALL CONCEPT_SETS from OMOP Database")
-    parser.add_argument("--delete", action="store_true", help="Delete ALL DATA from OMOP Database")
 
-    args = parser.parse_args()
-    config = vars(args)
-
-    if config["install"] is not None:
-        omop_install(db_path, config["install"])
-    elif config["clear"]:
-        omop_clear(db_path)
-    elif config["delete"]:
-        omop_reset(db_path)
+    cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
 
+    # Fetch and print table names
+    tables = cur.fetchall()
+    print("Tables in database:", [table[0] for table in tables])    
+    
+    #cur.execute("DROP TABLE CONCEPT_SET;")
+    #cur.execute("DROP TABLE CONCEPT_SET_ITEM;")
 
+    conn.close()
 
-if __name__ == "__main__":
-    main()
\ No newline at end of file
+def delete(db_path):
+    omop_db_path = Path(db_path)
+    if not omop_db_path.is_file():  
+        raise FileNotFoundError(f"Error: OMOP DB file '{omop_db_path}' does not exist.")    
+    
+    omop_db_path.unlink()   
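
The omop module now exposes `setup`, `clear` and `delete` as plain functions with the argument parsing removed. As a hedged illustration (not part of this patch), a caller such as the new command-line wrapper might drive them as follows; only the function signatures and the `OMOP_DB_PATH` constant come from the code above, and the vocabulary metadata values are placeholders.

```python
# Illustrative sketch only: driving the refactored omop helpers directly.
# The signatures of setup(), clear() and delete() come from omop.py above;
# the vocabulary metadata values below are made-up placeholders.
import omop

# create the OMOP tables and register a custom vocabulary entry
omop.setup(omop.OMOP_DB_PATH,
           vocab_id="ACMC",                              # placeholder id
           vocab_version="v1.0.0",                       # placeholder version
           vocab_name="ACMC concept sets",               # placeholder name
           vocab_reference="https://example.org/acmc")   # placeholder reference

omop.clear(omop.OMOP_DB_PATH)   # raises FileNotFoundError if the DB file is missing
omop.delete(omop.OMOP_DB_PATH)  # removes the SQLite database file entirely
```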
diff --git a/parse.py b/parse.py
index a509369..33907e6 100644
--- a/parse.py
+++ b/parse.py
@@ -1,13 +1,14 @@
 import pandas as pd
 import numpy as np
 import os
+import trud
 
 from base import log_invalid_code
 from base import bcolors
 from base import raise_
 		
 def in_database(codes, db, col):
-	return codes.isin(db[col])
+    return codes.isin(db[col])
 
 class Proto_code():
 	"""
@@ -65,13 +66,16 @@ class Proto_code():
 			return False
 	
 class Read2_code(Proto_code):
-	def __init__(self, file_path=None):
-		super().__init__(file_path)
-		self.db = pd.read_parquet("maps/processed/read2_code.parquet")
-		self.arg_small = "-r2"
-		self.arg_long = "--read2-code"
-		self.arg_help = "Read V2 Codes Column name in Source File"
-		self.checks = [
+    def __init__(self, file_path=None):
+        super().__init__(file_path)
+        input_path = trud.MAPS_PROCESSED_DIR / 'read2_code.parquet'
+        if not input_path.is_file():  
+            raise FileNotFoundError(f"Error: Read2 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")   
+        self.db = pd.read_parquet(input_path)
+        self.arg_small = "-r2"
+        self.arg_long = "--read2-code"
+        self.arg_help = "Read V2 Codes Column name in Source File"
+        self.checks = [
 			(
 				"Not Empty",
 				lambda codes : pd.Series([len(codes) > 0]),
@@ -114,7 +118,12 @@ class Read3_code(Proto_code):
 		self.arg_small = "-r3"
 		self.arg_long = "--read3-code"
 		self.arg_help = "Read V3 Codes Column name in Source File"
-		self.db = pd.read_parquet("maps/processed/read3_code.parquet")
+
+		input_path = trud.MAPS_PROCESSED_DIR / 'read3_code.parquet'
+		if not input_path.is_file():  
+			raise FileNotFoundError(f"Error: Read3 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")               
+		self.db = pd.read_parquet(input_path)
+            
 		self.checks = [
 			(
 				"Not Empty",
@@ -157,7 +166,11 @@ class Icd10_code(Proto_code):
 		self.arg_small = "-i"
 		self.arg_long = "--icd10-code"
 		self.arg_help = "ICD10 Codes Column name in Source File"
-		self.db = pd.read_parquet("maps/processed/icd10_code.parquet")
+
+		input_path = trud.MAPS_PROCESSED_DIR / 'icd10_code.parquet'
+		if not input_path.is_file():  
+			raise FileNotFoundError(f"Error: ICD10 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")               
+		self.db = pd.read_parquet(input_path)
 		self.checks = [
 			(
 				"Not Empty",
@@ -219,7 +232,11 @@ class Snomed_code(Proto_code):
 		self.arg_small = "-s"
 		self.arg_long = "--snomed-code"
 		self.arg_help = "SNOMED Codes Column name in Source File"
-		self.db = pd.read_parquet("maps/processed/snomed_code.parquet")
+
+		input_path = trud.MAPS_PROCESSED_DIR / 'snomed_code.parquet'
+		if not input_path.is_file():  
+			raise FileNotFoundError(f"Error: SNOMED code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")               
+		self.db = pd.read_parquet(input_path)        
 		self.checks = [
 			# (
 			# 	"Not Empty",
@@ -275,7 +292,11 @@ class Opcs4_code(Proto_code):
 		self.arg_small = "-o"
 		self.arg_long = "--opcs4-code"
 		self.arg_help = "OPCS4 Codes Column name in Source File"
-		self.db = pd.read_parquet("maps/processed/opcs4_code.parquet")
+
+		input_path = trud.MAPS_PROCESSED_DIR / 'opcs4_code.parquet'
+		if not input_path.is_file():  
+			raise FileNotFoundError(f"Error: OPCS4 code file '{input_path}' does not exist. Please ensure you have installed TRUD correctly")               
+		self.db = pd.read_parquet(input_path)          
 		self.checks = [
 			(
 				"Not Empty",
@@ -355,7 +376,7 @@ code_types = {
 	"cprd_code": Cprd_code,
 }
 
-omop_vocab_types = {
+vocab_types = {
 	"read2_code": "Read",
 	"read3_code": None,
 	"icd10_code": "ICD10CM",
diff --git a/trud.py b/trud.py
index 87dfbf4..230231c 100644
--- a/trud.py
+++ b/trud.py
@@ -15,6 +15,9 @@ import simpledbf
 
 # Constants
 FQDN = "isd.digital.nhs.uk"
+MAPS_DIR = Path('./build/maps')
+MAPS_DOWNLOADS_DIR = MAPS_DIR / 'downloads'
+MAPS_PROCESSED_DIR = MAPS_DIR / 'processed'
 
 def error_exit(message):
     print(message, "error")
@@ -41,12 +44,17 @@ def get_releases(item_id, API_KEY, latest=False):
 
     return data.get("releases", [])
 
-def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None, items_folder="build/maps/downloads"):
+def download_release_file(item_id, release_ordinal, release, file_json_prefix, file_type=None, items_folder=MAPS_DOWNLOADS_DIR):
     """Download specified file type for a given release of an item."""
+
+    # check folder is a directory
+    if not items_folder.is_dir():
+        raise NotADirectoryError(f"Error: '{items_folder}' for TRUD download files is not a directory") 
+    
     file_type = file_type or file_json_prefix
     file_url = release.get(f"{file_json_prefix}FileUrl")
     file_name = release.get(f"{file_json_prefix}FileName")
-    file_destination = os.path.join(items_folder, file_name)
+    file_destination = MAPS_DOWNLOADS_DIR / file_name 
 
     if not file_url or not file_name:
         error_exit(f"Missing {file_type} file information for release {release_ordinal} of item {item_id}.")
@@ -70,197 +78,205 @@ def validate_download_hash(file_destination:str, item_hash:str):
     else:
         error_exit(f"Could not validate origin of {file_destination}. The SHA-256 hash should be: {item_hash}, but got {hash} instead")
 
-def unzip_download(file_destination:str, items_folder="build/maps/downloads"):
+def unzip_download(file_destination:str, items_folder=MAPS_DOWNLOADS_DIR):
+
+    # check folder is a directory
+    if not items_folder.is_dir():
+        raise NotADirectoryError(f"Error: '{items_folder}' for TRUD download files is not a directory") 
+        
     with zipfile.ZipFile(file_destination, 'r') as zip_ref:
         zip_ref.extractall(items_folder)
 
 def extract_icd10():
     #ICD10_edition5
-    file_path = Path('build') / 'maps' / 'downloads' / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml'
-
+    file_path = MAPS_DOWNLOADS_DIR / 'ICD10_Edition5_XML_20160401' / 'Content' / 'ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.xml'
     df = pd.read_xml(file_path)
     df = df[["CODE", "ALT_CODE", "DESCRIPTION"]]
     df = df.rename(columns={"CODE":"icd10_code",
                             "ALT_CODE":"icd10_alt_code",
                             "DESCRIPTION":"description"
                         })
-    df.to_parquet("build/maps/processed/icd10_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/icd10_code.parquet")
+    output_path = MAPS_PROCESSED_DIR / 'icd10_code.parquet'
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")
 
 def extract_opsc4():
-    file_path = Path('build') / 'maps' / 'downloads' / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt'
+    file_path = MAPS_DOWNLOADS_DIR / 'OPCS410 Data files txt' / 'OPCS410 CodesAndTitles Nov 2022 V1.0.txt'
     
     df = pd.read_csv(file_path, sep='\t', dtype=str, header=None)
     df = df.rename(columns={0:"opcs4_code", 1:"description"})
-    df.to_parquet("build/maps/processed/opcs4_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/opcs4_code.parquet")
+    
+    output_path = MAPS_PROCESSED_DIR / 'opcs4_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")
 
 def extract_nhs_data_migrations():
     #NHS Data Migrations
     
     #snomed only
-    file_path = Path('build') / 'maps' / 'downloads' / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt'    
+    file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'sctcremap_uk_20200401000001.txt'    
     df = pd.read_csv(file_path, sep='\t')    
     df = df[["SCT_CONCEPTID"]]
     df = df.rename(columns={"SCT_CONCEPTID":"snomed_code"})
     df = df.drop_duplicates()
     df = df.astype(str)
-    df.to_parquet("build/maps/processed/snomed_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/snomed_code.parquet")
 
+    output_path = MAPS_PROCESSED_DIR / 'snomed_code.parquet'    
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")
+    
     #r2 -> r3
-    file_path = Path('build') / 'maps' / 'downloads' / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt'
+    file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rctctv3map_uk_20200401000001.txt'
     df = pd.read_csv(file_path, sep='\t')
     df = df[["V2_CONCEPTID", "CTV3_CONCEPTID"]]
     df = df.rename(columns={"V2_CONCEPTID":"read2_code",
                             "CTV3_CONCEPTID":"read3_code"})
-    df.to_parquet("build/maps/processed/read2_code_to_read3_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/read2_code_to_read3_code.parquet")
+
+    output_path = MAPS_PROCESSED_DIR / 'read2_code_to_read3_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")
 
     #r3->r2
-    file_path = Path('build') / 'maps' / 'downloads' / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt'
+    file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3rctmap_uk_20200401000002.txt'
     df = pd.read_csv(file_path, sep='\t')
     df = df[["CTV3_CONCEPTID", "V2_CONCEPTID"]]
     df = df.rename(columns={"CTV3_CONCEPTID":"read3_code", 
                             "V2_CONCEPTID":"read2_code"})
     df = df.drop_duplicates()
     df = df[~df["read2_code"].str.match("^.*_.*$")] #remove r2 codes with '_'
-    df.to_parquet("build/maps/processed/read3_code_to_read2_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/read3_code_to_read2_code.parquet")
 
+    output_path = MAPS_PROCESSED_DIR / 'read3_code_to_read2_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")
+    
     #r2 -> snomed
-    file_path = Path('build') / 'maps' / 'downloads' / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt'
+    file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'rcsctmap2_uk_20200401000001.txt'
     df = pd.read_csv(file_path, sep='\t', dtype=str)
     df = df[["ReadCode", "ConceptId"]]
     df = df.rename(columns={"ReadCode":"read2_code",
                             "ConceptId":"snomed_code"})
-    df.to_parquet("build/maps/processed/read2_code_to_snomed_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/read2_code_to_snomed_code.parquet")
+
+    output_path = MAPS_PROCESSED_DIR / 'read2_code_to_snomed_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")
 
     #r3->snomed
-    file_path = Path('build') / 'maps' / 'downloads' / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt'
+    file_path = MAPS_DOWNLOADS_DIR / 'Mapping Tables' / 'Updated' / 'Clinically Assured' / 'ctv3sctmap2_uk_20200401000001.txt'
     df = pd.read_csv(file_path, sep='\t', dtype=str)
     df = df[["CTV3_TERMID", "SCT_CONCEPTID"]]
     df = df.rename(columns={"CTV3_TERMID":"read3_code",
                             "SCT_CONCEPTID":"snomed_code"})
     df["snomed_code"] = df["snomed_code"].astype(str)
     df = df[~df["snomed_code"].str.match("^.*_.*$")] #remove snomed codes with '_'
-    df.to_parquet("build/maps/processed/read3_code_to_snomed_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/read3_code_to_snomed_code.parquet")
+
+    output_path = MAPS_PROCESSED_DIR / 'read3_code_to_snomed_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")    
 
 def extract_nhs_read_browser():
     #r2 only
-    df = simpledbf.Dbf5('build/maps/downloads/Standard/V2/ANCESTOR.DBF').to_dataframe()
+    input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ANCESTOR.DBF'
+    df = simpledbf.Dbf5(input_path).to_dataframe()
     df = pd.concat([df['READCODE'], df['DESCENDANT']])
     df = pd.DataFrame(df.drop_duplicates())
     df = df.rename(columns={0:"read2_code"})
-    df.to_parquet("build/maps/processed/read2_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/read2_code.parquet")
+    output_path = MAPS_PROCESSED_DIR / 'read2_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")    
 
     #r2 -> atc
-    df = simpledbf.Dbf5('build/maps/downloads/Standard/V2/ATC.DBF').to_dataframe()
+    input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ATC.DBF'   
+    df = simpledbf.Dbf5(input_path).to_dataframe()
     df = df[["READCODE", "ATC"]]
     df = df.rename(columns={"READCODE":"read2_code", "ATC":"atc_code"})
-    df.to_parquet("build/maps/processed/read2_code_to_atc_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/read2_code_to_atc_code.parquet")
+    output_path = MAPS_PROCESSED_DIR / 'read2_code_to_atc_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")        
 
     #r2 -> icd10
-    df = simpledbf.Dbf5('build/maps/downloads/Standard/V2/ICD10.DBF').to_dataframe()
+    input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'ICD10.DBF'      
+    df = simpledbf.Dbf5(input_path).to_dataframe()        
     df = df[["READ_CODE", "TARG_CODE"]]
     df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"icd10_code"})
     df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
     df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
-    df.to_parquet("build/maps/processed/read2_code_to_icd10_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/read2_code_to_icd10_code.parquet")
+    output_path = MAPS_PROCESSED_DIR / 'read2_code_to_icd10_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")      
 
     #r2 -> opcs4
-    df = simpledbf.Dbf5('build/maps/downloads/Standard/V2/OPCS4V3.DBF').to_dataframe()
+    input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V2' / 'OPCS4V3.DBF'  
+    df = simpledbf.Dbf5(input_path).to_dataframe()        
     df = df[["READ_CODE", "TARG_CODE"]]
     df = df.rename(columns={"READ_CODE":"read2_code", "TARG_CODE":"opcs4_code"})
     df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
     df = df[~df["read2_code"].str.match("^.*-.*$")] #remove codes with '-'
-    df.to_parquet("build/maps/processed/read2_code_to_opcs4_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/read2_code_to_opcs4_code.parquet")
+    output_path = MAPS_PROCESSED_DIR / 'read2_code_to_opcs4_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")      
 
     #r3 only
-    df = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ANCESTOR.DBF').to_dataframe()
+    input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ANCESTOR.DBF'    
+    df = simpledbf.Dbf5(input_path).to_dataframe()    
     df = pd.concat([df['READCODE'], df['DESCENDANT']])
     df = pd.DataFrame(df.drop_duplicates())
     df = df.rename(columns={0:"read3_code"})
-    df.to_parquet("build/maps/processed/read3_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/read3_code.parquet")
+    output_path = MAPS_PROCESSED_DIR / 'read3_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")     
 
     #r3 -> icd10
-    df = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ICD10.DBF').to_dataframe()
+    input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'ICD10.DBF'        
+    df = simpledbf.Dbf5(input_path).to_dataframe()
     df = df[["READ_CODE", "TARG_CODE"]]
     df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"icd10_code"})
     df = df[~df["icd10_code"].str.match("^.*-.*$")] #remove codes with '-'
     df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
-    df.to_parquet("build/maps/processed/read3_code_to_icd10_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/read3_code_to_icd10_code.parquet")
+    output_path = MAPS_PROCESSED_DIR / 'read3_code_to_icd10_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")  
 
     #r3 -> icd9
     # dbf = simpledbf.Dbf5('build/maps/downloads/Standard/V3/ICD9V3.DBF')
 
     #r3 -> opcs4
-    df = simpledbf.Dbf5('build/maps/downloads/Standard/V3/OPCS4V3.DBF').to_dataframe()
+    input_path = MAPS_DOWNLOADS_DIR / 'Standard' / 'V3' / 'OPCS4V3.DBF'      
+    df = simpledbf.Dbf5(input_path).to_dataframe()
     df = df[["READ_CODE", "TARG_CODE"]]
     df = df.rename(columns={"READ_CODE":"read3_code", "TARG_CODE":"opcs4_code"})
     df = df[~df["opcs4_code"].str.match("^.*-.*$")] #remove codes with '-'
     df = df[~df["read3_code"].str.match("^.*-.*$")] #remove codes with '-'
-    df.to_parquet("build/maps/processed/read3_code_to_opcs4_code.parquet", index=False)
-    print("Extracted ", "build/maps/processed/read3_code_to_opcs4_code.parquet")
+    output_path = MAPS_PROCESSED_DIR / 'read3_code_to_opcs4_code.parquet'   
+    df.to_parquet(output_path, index=False)
+    print(f"Extracted: {output_path}")      
 
-def create_build_directories(build_dir='build'):
-    """Create build directories.""" 
-    build_path = Path(build_dir)
-
-    if not build_path.exists():
-        build_path.mkdir(parents=True, exist_ok=True)
+def create_map_directories():
+    """Create map directories.""" 
 
     # Check if build directory exists
-    maps_path = build_path / 'maps'
     create_map_dirs = False   
-    if maps_path.exists(): 
-        user_input = input(f"The map directory {maps_path} already exists. Do you want to download and process trud data again? (y/n): ").strip().lower()
+    if MAPS_DIR.exists(): 
+        user_input = input(f"The map directory {MAPS_DIR} already exists. Do you want to download and process trud data again? (y/n): ").strip().lower()
         if user_input == "y":
             # delete all build files
-            shutil.rmtree(maps_path)
+            shutil.rmtree(MAPS_DIR)
             create_map_dirs = True
         elif user_input == "n":
-            print("Exiting TRUD processing")
+            print("Exiting TRUD installation")
             sys.exit(0)
     else:
         create_map_dirs = True  
 
     if create_map_dirs:
         # create maps directories
-        maps_path.mkdir(parents=True, exist_ok=True)
-        maps_download_path = maps_path / 'downloads'
-        maps_download_path.mkdir(parents=True, exist_ok=True)            
-        maps_processed_path = maps_path / 'processed'
-        maps_processed_path.mkdir(parents=True,exist_ok=True)                                 
-
-def main():
-    print("Processing TRUD files")
-    
-    parser = argparse.ArgumentParser(
-        description="Download releases of items using the TRUD API.",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--key", type=str, help="TRUD API Key")
-#     parser.add_argument("item_ids", nargs="+", help="Item IDs to download releases for.")
-#     parser.add_argument("-l", "--latest", action="store_true", help="Download only the latest release")
-#     parser.add_argument("-c", "--checksum", action="store_true", help="Also download the checksum file")
-#     parser.add_argument("-s", "--signature", action="store_true", help="Also download the signature file")
-#     parser.add_argument("-p", "--public_key", action="store_true", help="Also download the public key file")
-    
-    args = parser.parse_args()
+        MAPS_DIR.mkdir(parents=True, exist_ok=True)
+        MAPS_DOWNLOADS_DIR.mkdir(parents=True, exist_ok=True)            
+        MAPS_PROCESSED_DIR.mkdir(parents=True,exist_ok=True)                                 
 
-    create_build_directories()
+def install(api_key):   
+    create_map_directories()
 
     items_latest = True
-    items_folder = "build/maps/downloads"
     items = [
         {
             "id": 259,
@@ -294,14 +310,14 @@ def main():
         item_id = item["id"]
         print(bcolors.HEADER, "---"+item["name"]+"---", bcolors.ENDC)
 
-        releases = get_releases(item_id, API_KEY=args.key, latest=items_latest)
+        releases = get_releases(item_id, API_KEY=api_key, latest=items_latest)
         if not releases:
             error_exit(f"No releases found for item {item_id}.")
 
         # Process each release in reverse order
         for release_ordinal, release in enumerate(releases[::-1], 1):
             # Download archive file
-            file_destination = download_release_file(item_id, release_ordinal, release, "archive", items_folder=items_folder)
+            file_destination = download_release_file(item_id, release_ordinal, release, "archive")
             
             # Optional files
             # if items.checksum:
@@ -316,14 +332,11 @@ def main():
                 validate_download_hash(file_destination, item["hash"])
 
             #Unzip downloaded .zip
-            unzip_download(file_destination, items_folder=items_folder)
+            unzip_download(file_destination)
 
             #Extract Tables to parquet
             if "extract" in item:
                 item["extract"]()
             
         print(f"Downloaded {release_ordinal} release(s) for item {item_id}.")
-
-    print(f"Successfully completed TRUD processing")
-if __name__ == "__main__":
-    main()
+    
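
With the argparse entry point removed, trud.py is now driven through the module-level `install(api_key)`. A minimal usage sketch, assuming the API key is supplied via an environment variable (the acmc wrapper may pass it differently):

```python
# Minimal sketch: calling the refactored trud module directly.
# Sourcing the key from TRUD_API_KEY is an assumption for this example.
import os
import trud

api_key = os.environ.get("TRUD_API_KEY")
if not api_key:
    raise SystemExit("Set TRUD_API_KEY to your NHS TRUD API key")

# creates ./build/maps, downloads each TRUD release and extracts parquet tables
trud.install(api_key)
```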
-- 
GitLab