From 46d4752f75e4d43f2741acdb2aa5bf467303a1ce Mon Sep 17 00:00:00 2001
From: Jakub Dylag <jjd1c23@soton.ac.uk>
Date: Fri, 28 Mar 2025 17:12:55 +0000
Subject: [PATCH] Allow multiple files per concept set - loop over map function

---
 acmc/phen.py | 117 ++++++++++++++++++++++++++-------------------------
 1 file changed, 60 insertions(+), 57 deletions(-)

diff --git a/acmc/phen.py b/acmc/phen.py
index 4893144..119a4c9 100644
--- a/acmc/phen.py
+++ b/acmc/phen.py
@@ -818,67 +818,70 @@ def _map_target_code_type(phen_path: Path, phenotype: dict, target_code_type: st
     code_errors = []
 
     # Process each folder in codes section
-    for concept_set in phenotype["concept_sets"]:
-        _logger.debug(f"--- {concept_set['file']} ---")
-
-        # Load code file
-        codes_file_path = Path(concepts_path / concept_set["file"]["path"])
-        df = _read_table_file(codes_file_path)
-
-        # process structural actions
-        df = _process_actions(df, concept_set)
-
-        # preprocessing and validate of source concepts
-        _logger.debug("Processing and validating source concept codes")
-        df, errors = _preprocess_source_concepts(
-            df,
-            concept_set,
-            codes_file_path,
-        )
-
-        # create df with just the source code columns
-        source_column_names = list(concept_set["file"]["columns"].keys())
-        source_df = df[source_column_names]
+    for files in phenotype["concept_sets"]:
+        concept_set_name = files["name"]
+        for concept_set in files["files"]:
+
+            _logger.debug(f"--- {concept_set} ---")
+
+            # Load code file
+            codes_file_path = Path(concepts_path / concept_set["path"])
+            df = _read_table_file(codes_file_path)
+
+            # process structural actions
+            df = _process_actions(df, concept_set)
+
+            # preprocess and validate source concepts
+            _logger.debug("Processing and validating source concept codes")
+            df, errors = _preprocess_source_concepts(
+                df,
+                concept_set,
+                codes_file_path,
+            )
 
-        _logger.debug(source_df.columns)
-        _logger.debug(source_df.head())
+            # create df with just the source code columns
+            source_column_names = list(concept_set["columns"].keys())
+            source_df = df[source_column_names]
 
-        _logger.debug(
-            f"Length of errors from _preprocess_source_concepts {len(errors)}"
-        )
-        if len(errors) > 0:
-            code_errors.extend(errors)
-        _logger.debug(f" Length of code_errors {len(code_errors)}")
+            _logger.debug(source_df.columns)
+            _logger.debug(source_df.head())
 
-        # Map source concepts codes to target codes
-        # if processing a source coding list with categorical data
-        if (
-            "actions" in concept_set["file"]
-            and "divide_col" in concept_set["file"]["actions"]
-            and len(df) > 0
-        ):
-            divide_col = concept_set["file"]["actions"]["divide_col"]
-            _logger.debug(f"Action: Dividing Table by {divide_col}")
-            _logger.debug(f"column into: {df[divide_col].unique()}")
-            df_grp = df.groupby(divide_col)
-            for cat, grp in df_grp:
-                if cat == concept_set["file"]["category"]:
-                    grp = grp.drop(columns=[divide_col])  # delete categorical column
-                    source_df = grp[source_column_names]
-                    trans_out = translate_codes(
-                        source_df,
-                        target_code_type=target_code_type,
-                        concept_name=concept_set["name"],
-                    )
-                    out = pd.concat([out, trans_out])
-        else:
-            source_df = df[source_column_names]
-            trans_out = translate_codes(
-                source_df,
-                target_code_type=target_code_type,
-                concept_name=concept_set["name"],
+            _logger.debug(
+                f"Length of errors from _preprocess_source_concepts {len(errors)}"
             )
-            out = pd.concat([out, trans_out])
+            if len(errors) > 0:
+                code_errors.extend(errors)
+            _logger.debug(f" Length of code_errors {len(code_errors)}")
+
+            # Map source concept codes to target codes
+            # if processing a source coding list with categorical data
+            if (
+                "actions" in concept_set
+                and "divide_col" in concept_set["actions"]
+                and len(df) > 0
+            ):
+                divide_col = concept_set["actions"]["divide_col"]
+                _logger.debug(f"Action: Dividing Table by {divide_col}")
+                _logger.debug(f"column into: {df[divide_col].unique()}")
+                df_grp = df.groupby(divide_col)
+                for cat, grp in df_grp:
+                    if cat == concept_set["category"]:
+                        grp = grp.drop(columns=[divide_col])  # delete categorical column
+                        source_df = grp[source_column_names]
+                        trans_out = translate_codes(
+                            source_df,
+                            target_code_type=target_code_type,
+                            concept_name=concept_set_name,
+                        )
+                        out = pd.concat([out, trans_out])
+            else:
+                source_df = df[source_column_names]
+                trans_out = translate_codes(
+                    source_df,
+                    target_code_type=target_code_type,
+                    concept_name=concept_set_name,
+                )
+                out = pd.concat([out, trans_out])
 
     if len(code_errors) > 0:
         _logger.error(f"The map processing has {len(code_errors)} errors")
-- 
GitLab