diff --git a/DA.sv b/DA.sv
index b8d1d3fdde1093515e1828a4c07fd8c5b43a9e65..52a022c88f7bb435ee48ca196206411d40ed3098 100644
--- a/DA.sv
+++ b/DA.sv
@@ -8,17 +8,18 @@
 ======= Signed DA MAC =======
 */
 
-import DA_LUT::multiplication_coefficients;
 import DA_LUT::clog2;
 
 module DA_MAC #(parameter BW = DA_LUT::BW, parameter N = DA_LUT::N)
                (input logic signed [BW-1:0] in [0:N-1],
                 input logic input_ready, ck, rst,
+                //input logic [clog2(N)+BW:0] multiplication_coefficients [0:2**N-1],
+                input logic [(N-1)+8:0] multiplication_coefficients [0:2**N-1],
                 output logic signed [(2*N-1)+8:0] out,
                 output logic output_ready);
 
 // ==== Local Variables ==== 
-logic signed [N-1:0]       shifted_out;
+logic signed [BW-1:0]       shifted_out;
 logic signed [(N-1)+8:0]   partial_sum; 
 
 typedef enum logic [1:0] {waiting, loading, processing, saving} state_type;
@@ -48,7 +49,7 @@ always_ff@(posedge ck, posedge rst)
 
 
 // ==== DA accumulator ==== 
-always_ff @(posedge ck, posedge rst)
+always_ff @(posedge ck)
   if (reset_accumulator)
   begin
 	partial_sum <= '0;
diff --git a/DA_LUT.sv b/DA_LUT.sv
index b89de544626ef2879e2081967ad1eec60111353f..459f36870c8758c94f466032ac78769796bbd00d 100644
--- a/DA_LUT.sv
+++ b/DA_LUT.sv
@@ -3,7 +3,30 @@ package DA_LUT;
 parameter N = 16;
 parameter BW = 16;
 
-const logic signed [(BW-1)+8:0] multiplication_coefficients [0:2**N - 1] = '{
+function int clog2(input int n); // ceiling of log2(n) for n >= 1; yields 0 when n <= 1
+	begin 
+	clog2 = 0; // result accumulates in the implicit return variable
+	n--; // work on n-1 so that exact powers of two are not rounded up by one
+	while(n > 0) // one iteration per significant bit of n-1
+		begin
+			clog2++;
+			n >>= 1;
+		end
+	end
+endfunction
+
+/*
+- The input to the LUT is an N-bit input vector (hence 2^N entries, indexed 0 ... 2^N - 1)
+- The maximum value of the output would be N*(2^BW-1), assuming BW-bit coefficients and an input vector of all 1's
+- Hence, this requires log2(N*(2^BW-1))+1 (+1 for sign bit) bits to store = log2(N) + log2(2^BW-1) + 1 = (BW) + log2(N)
+
+   N  |  BW   | log2(N)  | output size
+---------------------------------------
+  16  |  16   |    4     |     20         <----- this implementation
+  32  |  16   |    5     |     21
+  32  |  32   |    5     |     37
+*/
+const logic signed [(N-1)+8:0] multiplication_coefficients [0:2**N-1] = '{
 24'd0, -24'd79, -24'd136, -24'd215, 24'd312, 24'd233, 24'd176, 24'd97, 24'd654, 
 24'd575, 24'd518, 24'd439, 24'd966, 24'd887, 24'd830, 24'd751, -24'd1244, 
 -24'd1323, -24'd1380, -24'd1459, -24'd932, -24'd1011, -24'd1068, -24'd1147, -24'd590, 
@@ -8195,19 +8218,6 @@ const logic signed [(BW-1)+8:0] multiplication_coefficients [0:2**N - 1] = '{
 24'd33180, 24'd33123, 24'd33044, 24'd33571, 24'd33492, 24'd33435, 24'd33356, 24'd33913, 
 24'd33834, 24'd33777, 24'd33698, 24'd34225, 24'd34146, 24'd34089, 24'd34010, 24'd32015, 
 24'd31936, 24'd31879, 24'd31800, 24'd32327, 24'd32248, 24'd32191, 24'd32112, 24'd32669, 
-24'd32590, 24'd32533, 24'd32454, 24'd32981, 24'd32902, 24'd32845, 24'd32766
-};
-
-function int clog2(input int n);
-	begin 
-	clog2 = 0;
-	n--;
-	while(n > 0)
-		begin
-			clog2++;
-			n >>= 1;
-		end
-	end
-endfunction
+24'd32590, 24'd32533, 24'd32454, 24'd32981, 24'd32902, 24'd32845, 24'd32766};
 
 endpackage
diff --git a/DA_test.sv b/DA_test.sv
index c0f134901cde0d28537e896e8f3e989a5e6c9296..d6bfc2a957ab722a7dca1d4585a7e2ba21b3b0d2 100644
--- a/DA_test.sv
+++ b/DA_test.sv
@@ -3,14 +3,14 @@ module DA_test;
 timeunit 1ns;
 timeprecision 100ps;
 
-logic signed [15:0] in [0:15] = '{2,5,123,345,123,5,10,2342,344,22,234,543,23,65,1111,54};
+logic signed [DA_LUT::BW-1:0] in [0:DA_LUT::N-1] = '{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
 logic input_ready, ck, rst;
-logic signed [39:0] out;
+logic signed [(2*DA_LUT::N-1)+8:0] out;
 logic output_ready;
 
 const int input_frequency = 5000;
 
-DA_MAC #(.BW(DA_LUT::BW), .N(DA_LUT::N))  DA  (.*);
+DA_MAC #(.BW(DA_LUT::BW), .N(DA_LUT::N))  DA  (.*, .multiplication_coefficients(DA_LUT::multiplication_coefficients));
 
 // clock generator
 // generates a 1 MHz clock
diff --git a/generate_LUT.cpp b/generate_LUT.cpp
index bb1fdcea71b04e91a4d838ca44d611ab74b09f62..421abbebe768c0bfc6a3e6808b6d775005617880 100644
--- a/generate_LUT.cpp
+++ b/generate_LUT.cpp
@@ -8,7 +8,8 @@
 
 /*
 - 1st arg is the propgram name
-- 2nd ... last args are the coefficients of the FIR filter
+- 2nd arg is the bit width of the coefficients
+- 3rd ... last args are the coefficients of the FIR filter
 */
 
 int clog2(int n)
@@ -26,15 +27,16 @@ int clog2(int n)
 
 int main(int argc, char *argv[])
 {
-    std::size_t num_coeff = argc - 1;
-    std::size_t address_bit_width = num_coeff;
-    std::size_t data_bit_width = num_coeff+8;
+    std::size_t num_coeff = argc - 2;
+    std::size_t N = num_coeff;
+    std::size_t BW = std::stoi(argv[1]);
+	std::size_t output_size = N+8;
     std::vector<double> coefficients;
 
     //generate a vector of the coefficients
     for(std::size_t i = 0; i < num_coeff; ++i)
     {
-        coefficients.push_back(std::stod(argv[i+1]));
+        coefficients.push_back(std::stod(argv[i+2]));
     }
 
     std::ofstream dest_file;
@@ -42,7 +44,7 @@ int main(int argc, char *argv[])
     
     //for each number in the range 0 ... pow(2,num_coeff-1), multiply each bit by the coefficients
     //assuming SIGNED 2c multiplication, so num_coeff-1
-    for(std::size_t i = 0; i < pow(2,num_coeff); ++i)
+    for(std::size_t i = 0; i < pow(2,N); ++i)
     {
         std::bitset<64> bits(i);
         int sum = 0; 
@@ -54,9 +56,9 @@ int main(int argc, char *argv[])
 
         //write the value of the input, i, and the corresponding sum, sum, to a file
         if(sum >= 0)
-            dest_file <<  data_bit_width << "'d" << sum << ", ";
+            dest_file <<  output_size << "'d" << sum << ", ";
         else
-            dest_file << "-" << data_bit_width << "'d" << abs(sum) << ", ";
+            dest_file << "-" << output_size << "'d" << abs(sum) << ", ";
 		
 		if(!(i%8) && i>0)
 			dest_file << "\n";