Made some graphs

3ccca28c · samuel.m · ba5e3b48 · 3ccca28c · 3ccca28c · 3ccca28c
Commit 3ccca28c authored 6 months ago by samuel.m
--- a/dashboard_code/__init__.py
+++ b/dashboard_code/__init__.py
--- a/dashboard_code/dashboard.py
+++ b/dashboard_code/dashboard.py
+import plotly.graph_objects as go
+import plotly.io as pio
+
+
+def generate_html():
+    # Step 1: Generate Graphs with Plotly
+    # Example Graph 1: Bar Chart
+    fig1 = go.Figure(data=[
+        go.Bar(name='Category A', x=['Region 1', 'Region 2', 'Region 3'], y=[10, 20, 30]),
+        go.Bar(name='Category B', x=['Region 1', 'Region 2', 'Region 3'], y=[15, 25, 35])
+    ])
+    fig1.update_layout(title='Bar Chart Example', barmode='group')
+
+    # Example Graph 2: Line Chart
+    fig2 = go.Figure(data=go.Scatter(x=['Jan', 'Feb', 'Mar'], y=[10, 15, 13], mode='lines+markers'))
+    fig2.update_layout(title='Line Chart Example')
+
+    # Step 2: Generate HTML Content
+    # Convert graphs to HTML divs
+    graph1_html = pio.to_html(fig1, full_html=False)
+    graph2_html = pio.to_html(fig2, full_html=False)
+
+    # Template for the webpage
+    html_template = f"""
+    <!DOCTYPE html>
+    <html lang="en">
+    <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <title>Graphs Display</title>
+        <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
+        <style>
+            body {{
+                font-family: Arial, sans-serif;
+                margin: 20px;
+            }}
+            .graph-container {{
+                margin-bottom: 40px;
+            }}
+        </style>
+    </head>
+    <body>
+        <h1>Interactive Graphs</h1>
+        <div class="graph-container">
+            <h2>Bar Chart</h2>
+            {graph1_html}
+        </div>
+        <div class="graph-container">
+            <h2>Line Chart</h2>
+            {graph2_html}
+        </div>
+    </body>
+    </html>
+    """
+
+    # Step 3: Save the HTML File
+    output_path = "graphs_display.html"
+    with open(output_path, "w") as html_file:
+        html_file.write(html_template)
+
+    print(f"HTML page generated: {output_path}")
+    
+    
+# When this module is run directly as a script, generate the HTML page.
+if __name__ == "__main__":
+    generate_html()
--- a/data_code/querys.py
+++ b/data_code/querys.py
+"""
+Queries for specific data
+"""
+
+# DATABASES FOR UN DATA
+# SYB67_314_202411_Internet Usage.db
+# SYB67_245_202411_Public expenditure on education and access to computers.db
+# SYB67_328_202411_Intentional homicides and other crimes.db
+# SYB67_325_202411_Expenditure on health.db
+# master_UN.db
+# SYB67_128_202411_Consumer Price Index.db
+# SYB67_145_202411_Land.db
+# SYB67_230_202411_GDP and GDP Per Capita.db
+# SYB67_323_202411_Teaching Staff in education.db
+# SYB67_319_202411_Ratio of girls to boys in education.db
+# SYB67_329_202411_Labour Force and Unemployment.db
+# SYB67_309_202411_Education.db
+# SYB67_200_202411_Employment.db
+# SYB67_154_202411_Health Personnel.db
+
+
+# Getting "Percentage of male and female intentional homicide victims, Male" and "Capital expenditure as % of total expenditure in public institutions (%)"
+def male_homicide_rate_education_rate():
+    homicide_rate = """
+    SELECT Region_Country_Area, Year, Value FROM master_table 
+    WHERE Database_Name = 'SYB67_328_202411_Intentional homicides and other crimes.db' 
+    AND Row_Descriptor = 'Percentage of male and female intentional homicide victims, Male'; 
+    """
+    education_rate = """
+    SELECT Region_Country_Area, Year, Value FROM master_table 
+    WHERE Database_Name = 'SYB67_245_202411_Public expenditure on education and access to computers.db'
+    AND Row_Descriptor = 'Capital expenditure as % of total expenditure in public institutions (%)';
+    """
+    
+    return homicide_rate, education_rate
+
+
+# Getting "Percentage of male and female intentional homicide victims, Female" and  "Capital expenditure as % of total expenditure in public institutions (%)"
+def female_homicide_rate_education_rate():
+    female_homicide_rate = """
+    SELECT Region_Country_Area, Year, Value FROM master_table 
+    WHERE Database_Name = 'SYB67_328_202411_Intentional homicides and other crimes.db'
+    AND Row_Descriptor = 'Percentage of male and female intentional homicide victims, Female';
+    """
+    education_rate = """
+    SELECT Region_Country_Area, Year, Value FROM master_table 
+    WHERE Database_Name = 'SYB67_245_202411_Public expenditure on education and access to computers.db'
+    AND Row_Descriptor = 'Capital expenditure as % of total expenditure in public institutions (%)';
+    """
+    return female_homicide_rate, education_rate
+
+def overall_homicide_rate():
+    """
+    Return the SQL text selecting Region_Country_Area, Year and Value from
+    master_table for the 'Intentional homicide rates per 100,000' series.
+    """
+    homicide_rate = """
+    SELECT Region_Country_Area, Year, Value FROM master_table 
+    WHERE Database_Name = 'SYB67_328_202411_Intentional homicides and other crimes.db' 
+    AND Row_Descriptor = 'Intentional homicide rates per 100,000';
+    """
+    return homicide_rate
+
+def education_rate():
+    """
+    Return the SQL text selecting Region_Country_Area, Year and Value from
+    master_table for the 'Capital expenditure as % of total expenditure in
+    public institutions (%)' series.
+    """
+    # NOTE: the local below shares its name with this function; inside the
+    # body the name refers to the string, not the function.
+    education_rate = """
+    SELECT Region_Country_Area, Year, Value FROM master_table 
+    WHERE Database_Name = 'SYB67_245_202411_Public expenditure on education and access to computers.db'
+    AND Row_Descriptor = 'Capital expenditure as % of total expenditure in public institutions (%)';
+    """
+    return education_rate
+
+def unemployment_rate():
+    """
+    Return the SQL text selecting Region_Country_Area, Year and Value from
+    master_table for the 'Unemployment rate - Total' series.
+    """
+    # NOTE: the local below shares its name with this function; inside the
+    # body the name refers to the string, not the function.
+    unemployment_rate = """
+    SELECT Region_Country_Area, Year, Value FROM master_table 
+    WHERE Database_Name = 'SYB67_329_202411_Labour Force and Unemployment.db' 
+    AND Row_Descriptor = 'Unemployment rate - Total';
+    """
+    return unemployment_rate
+
+def female_homicide():
+    """
+    Return the SQL text selecting Region_Country_Area, Year and Value from
+    master_table for the 'Percentage of male and female intentional homicide
+    victims, Female' series.
+    """
+    female_homicide_rate = """
+    SELECT Region_Country_Area, Year, Value FROM master_table 
+    WHERE Database_Name = 'SYB67_328_202411_Intentional homicides and other crimes.db'
+    AND Row_Descriptor = 'Percentage of male and female intentional homicide victims, Female';
+    """
+    return female_homicide_rate
+
+# Percentage of individuals using the internet
+def internet_usage():
+    """
+    Return the SQL text selecting Region_Country_Area, Year and Value from
+    master_table for the 'Percentage of individuals using the internet' series.
+    """
+    # NOTE: the local below shares its name with this function; inside the
+    # body the name refers to the string, not the function.
+    internet_usage = """
+    SELECT Region_Country_Area, Year, Value FROM master_table 
+    WHERE Database_Name = 'SYB67_314_202411_Internet Usage.db' 
+    AND Row_Descriptor = 'Percentage of individuals using the internet';
+    """
+    return internet_usage
+
+def consumer_price_index_general():
+    """
+    Return the SQL text selecting Region_Country_Area, Year and Value from
+    master_table for the 'Consumer price index: General' series.
+    """
+    consumer_price_index = """
+    SELECT Region_Country_Area, Year, Value FROM master_table 
+    WHERE Database_Name = 'SYB67_128_202411_Consumer Price Index.db' 
+    AND Row_Descriptor = 'Consumer price index: General';
+    """
+    return consumer_price_index
\ No newline at end of file
--- a/data_code/sql_tool.py
+++ b/data_code/sql_tool.py
@@ -3,14 +3,16 @@ import pandas as pd
 import os
 from rich.console import Console
 from rich.table import Table
-
+import csv

 class SQL_tool():
    
    
    def __init__(self, data_storage_location : str):
+        """
+        Remembers the storage location, collects database paths via init_db,
+        then calls consolidate_tables.
+
+        Args:
+            data_storage_location (str): Path handed to init_db; also the folder
+                in which consolidate_tables places master_UN.db.
+        """
+        self.data_storage_location = data_storage_location
        self.db_locations = []
        self.init_db(data_storage_location)
+        self.consolidate_tables()
        


@@ -29,6 +31,9 @@ class SQL_tool():
                    
                    try:
                        if verbose: print(f"Processing {file_name}")
+                        # Fix bad headers
+                        self.fix_un_csv_headers(file_path, 1, ["Index","Region_Country_Area","Year" ,"Row_Descriptor","Value","Footnotes","Source"])
+                        
                        df = pd.read_csv(file_path, header=1) # Depending on the CSV select the correct row the headers are on
                        with sqlite3.connect(db_path) as conn:
                            try:
@@ -38,11 +43,37 @@ class SQL_tool():
                            except:
                                if verbose: print(f"Data base {db_name} at location {db_path} exists... Doing nothing....")
                                self.db_locations.append(db_path)
-                    except:
+                    except Exception as e:
                        if verbose: print(f"Failed to process file {file_name}")
+                        print(e)
+                elif os.path.isfile(file_path) and file_name.endswith('.db'): # If the file is a db add it to the list
+                    self.db_locations.append(file_path)
                elif os.path.isdir(file_path): # Recurse through the data folders, should work for any folder csv configuration
                    self.init_db(file_path)
        
+        # Remove duplicates
+        self.db_locations = list(set(self.db_locations))
+        
+    #Index,Region/Country/Area,Year,Series,Value,Footnotes,Source <- Correct headers
+    def fix_un_csv_headers(self,file_path, line_number, new_data):
+        """
+        Replaces the headers of a CSV file with new (correct) headers.
+        """
+        with open(file_path, mode='r', newline='') as file:
+            reader = csv.reader(file)
+            rows = list(reader)
+        
+        if 0 < line_number <= len(rows):
+            rows[line_number] = new_data
+        else:
+            print(f"Line number {line_number} is out of range.")
+            return
+        
+        with open(file_path, mode='w', newline='') as file:
+            writer = csv.writer(file)
+            writer.writerows(rows)
+        
+                
    def list_db(self, verbose: bool = False):
        """
        Pretty prints the DB
@@ -55,7 +86,7 @@ class SQL_tool():
            table.add_column("Database Name", style="cyan", justify="left")
            table.add_column("Path", style="magenta", justify="left")
            table.add_column("Tables", style="purple", justify="left") 
-            table.add_column("Colunms", style="green", justify="left")          
+            table.add_column("Colunms", style="green", justify="left", overflow="fold")          

            counter = 0
            for db_path in self.db_locations:
@@ -147,8 +178,69 @@ class SQL_tool():
                
            return table_lst
    
+    def query_db(self, query : str, table_name : str , verbose : bool=False):
+        """
+        Executes a query on the master table
+        """
+        if verbose: print(f"Querying: {query}")
+        db_path = None
+        for table_paths in self.db_locations: # Find path from table name
+            if table_name == os.path.basename(table_paths):
+                db_path = table_paths
+                break
+        
+        with sqlite3.connect(db_path) as conn:
+            df = pd.read_sql_query(query, conn)
+            return df
        
    
+    def consolidate_tables(self):
+        """
+        Consolidates all the tables in the database into one table
+        """
+        # Check if the master database already exists
+        if os.path.exists(os.path.join(self.data_storage_location, "master_UN.db")):
+            print("Master database already exists.")
+            return
+        master_db_path = os.path.join(self.data_storage_location, "master_UN.db")
+        master_conn = sqlite3.connect(master_db_path)
+        master_cursor = master_conn.cursor()
+        
+        master_cursor.execute('''
+        CREATE TABLE IF NOT EXISTS master_table (
+            Region_Country_Area TEXT,
+            Year INTEGER,
+            Row_Descriptor TEXT,
+            Value REAL,
+            Footnotes TEXT,
+            Source TEXT,
+            Database_Name TEXT
+        )
+        ''')
+        master_conn.commit()
+
+        for db_paths in self.db_locations:
+                db_name = os.path.basename(db_paths)
+                print(f"Processing {db_name}...")
+
+                # Connect to the individual database
+                conn = sqlite3.connect(db_paths)
+                df = pd.read_sql_query("SELECT * FROM data", conn)
+                #df = df.iloc[:, 0:] # Remove the index column
+
+
+                # Add a column for the database name
+                df["Database_Name"] = db_name
+
+                # Append the data to the master table
+                df[["Region_Country_Area","Year" ,"Row_Descriptor","Value","Footnotes","Source","Database_Name"]].to_sql(
+                    "master_table", master_conn, if_exists="append", index=False
+                )
+
+                conn.close()
+                
+        self.db_locations.append(master_db_path)
+        print("Consolidation complete.")
            
                
        

--- a/graph_code/graph_tool.py
+++ b/graph_code/graph_tool.py
@@ -53,30 +53,4 @@ class Graph_tool():
        plt.show()
        
        
-    def graph_yearly_data(self , formatted_data, title="Yearly Data Trends"):
-        """
-        Plots yearly data for multiple countries.
-
-        Args:
-            formatted_data (list): A list of lists where each sublist contains (year, stat) tuples for a country.
-            title (str): The title of the graph.
-        """
-        plt.figure(figsize=(10, 6))  # Set the figure size
-        
-        for country_index, country_data in enumerate(formatted_data):
-            # Separate years and stats for the current country
-            years, stats = zip(*country_data)  # Unzip the (year, stat) tuples
        
\ No newline at end of file
-            # Plot the line for this country's data
-            plt.plot(years, stats, marker='o', label=f"Country {country_index + 1}")
-
-        # Add labels, title, and grid
-        plt.xlabel("Year")
-        plt.ylabel("Stat")
-        plt.title(title)
-        plt.grid(True, linestyle='--', alpha=0.7)
-        plt.legend(title="Countries")
-        
-        # Show the plot
-        plt.tight_layout()
-        plt.show()
--- a/graphs_display.html
+++ b/graphs_display.html
--- a/main.py
+++ b/main.py
 from data_code.sql_tool import SQL_tool
 from graph_code.graph_tool import Graph_tool
 from stats_code.stats_tool import Stat_tool
-
-
+import data_code.querys as querys
+import pandas as pd
+import plotly.express as px


 if __name__ == "__main__":
@@ -10,28 +11,139 @@ if __name__ == "__main__":
    g_tool = Graph_tool()
    s_tool = Stat_tool()
    table_list = sql_t.list_db(verbose=True)
-    inp = input("x_table x_column y_table y_column(e.g 2 Year 4 Value)\n").split(" ")
-    x_tab, x_col, y_tab, y_col = (int(inp[0]), str(inp[1]), int(inp[2]), str(inp[3]))
-    # Get x
-    path = table_list[x_tab]
-    colunm_data_x = sql_t.get_column(db_path=path, column_name=x_col)
-    # Get y
-    path = table_list[y_tab]
-    colunm_data_y = sql_t.get_column(db_path=path, column_name=y_col)
    
-    formatted_yearly_data = s_tool.process_yearly_data(year_data=colunm_data_x, stats=colunm_data_y)
-    top_countries = s_tool.filter_top_countries(formatted_data=formatted_yearly_data, top_n=5)
-    g_tool.graph_yearly_data(formatted_data=top_countries, title="Yearly usage change")

-    #g_tool.gen_graph(x_data=colunm_data_x, x_name=x_col, y_data=colunm_data_y, y_name=y_col)
   
+    # # GRAPH 1 HOMICIDES AGAINST MEN AND WOMEN AGAINST EDUCATION LEVEL GLOBALLY
+    
+    # male_homicide_rate_query, education_rate_query = querys.male_homicide_rate_education_rate()
+    
+    # male_homicide_rate = sql_t.query_db(male_homicide_rate_query, table_name="master_UN.db")
+    # education_rate = sql_t.query_db(education_rate_query, table_name="master_UN.db")
+    
+    
+    # # Merge both dataframes on 'Region_Country_Area' and 'Year'
+    # merged_df_male = pd.merge(male_homicide_rate, education_rate, on=['Region_Country_Area', 'Year'], suffixes=('_Male_Homicide', '_Education'))
+
+    # merged_df_male['Gender'] = 'Male' # Gender label
+
+
+    # fem_homicide_rate_query, education_rate = querys.female_homicide_rate_education_rate()
+    
+    # fem_homicide_rate = sql_t.query_db(fem_homicide_rate_query, table_name="master_UN.db")
+    # education_rate = sql_t.query_db(education_rate, table_name="master_UN.db")
+    
+    # # Merge both dataframes on 'Region_Country_Area' and 'Year'
+    # merged_df_female = pd.merge(fem_homicide_rate, education_rate, on=['Region_Country_Area', 'Year'], suffixes=('_Fem_Homicide', '_Education'))
+    
+    # merged_df_female['Gender'] = 'Female' # Gender Label
+    
+    # combined_df = pd.concat([merged_df_male[['Region_Country_Area', 'Year', 'Value_Male_Homicide', 'Value_Education', 'Gender']], 
+    #                      merged_df_female[['Region_Country_Area', 'Year', 'Value_Fem_Homicide', 'Value_Education', 'Gender']]])
+
+    
+    # # Combine male and female dataframes
+    # combined_df['Homicide_Rate'] = combined_df.apply(
+    # lambda row: row['Value_Male_Homicide'] if row['Gender'] == 'Male' else row['Value_Fem_Homicide'], axis=1)
+
+    
+    # # Create an interactive scatter plot with Plotly (overlay male and female)
+    # fig = px.scatter(combined_df, 
+    #                 x='Homicide_Rate', 
+    #                 y='Value_Education', 
+    #                 color='Gender',  # Different colors for Male/Female
+    #                 symbol='Gender',  # Different symbols for Male/Female
+    #                 hover_data={'Region_Country_Area': True, 'Year': True},
+    #                 labels={'Homicide_Rate': 'Homicide Rate (per 100,000)', 
+    #                         'Value_Education': 'Education Expenditure (% of Total)', 
+    #                         'Value_Fem_Homicide': 'Homicide Rate (Female, per 100,000)',
+    #                         'Value_Male_Homicide': 'Homicide Rate (Male, per 100,000)'},
+    #                 title='Homicide Rate (Men and Women) vs Education Expenditure')
+
+    # # Show the plot
+    # fig.show()
+    
+    # GRAPH 2 UNEMPLOYMENT RATE AGAINST CRIME RATE GLOBALLY
+    
+    # unemployment_rate_query = querys.unemployment_rate()
+    # unemployment_rate = sql_t.query_db(unemployment_rate_query, table_name="master_UN.db")
+    
+    # homicide_rate_query = querys.overall_homicide_rate()
+    # homicide_rate = sql_t.query_db(homicide_rate_query, table_name="master_UN.db")
+    
+    # # Merge both dataframes on 'Region_Country_Area' and 'Year'
+    # merged_df = pd.merge(unemployment_rate, homicide_rate, on=['Region_Country_Area', 'Year'], suffixes=('_Unemployment', '_Homicide'))
+    
+    # # Create an interactive scatter plot with Plotly
+    # fig = px.scatter(merged_df, 
+    #                 x='Value_Unemployment', 
+    #                 y='Value_Homicide', 
+    #                 hover_data={'Region_Country_Area': True, 'Year': True},
+    #                 labels={'Value_Unemployment': 'Unemployment Rate (%)', 
+    #                         'Value_Homicide': 'Homicide Rate (per 100,000)'},
+    #                 title='Unemployment Rate vs Homicide Rate')
+    
+    # # Show the plot
+    # fig.show()
+    
+    # GRAPH 3 INTERNET USAGE AGAINST CRIME RATE (FEMALE)
+    
+    # fem_homicide_rate_query = querys.female_homicide()
+    # fem_homicide_rate = sql_t.query_db(fem_homicide_rate_query, table_name="master_UN.db")
+    
+    # oveall_homicide_rate_query = querys.overall_homicide_rate()
+    # oveall_homicide_rate = sql_t.query_db(oveall_homicide_rate_query, table_name="master_UN.db")
+    
+    # # Get value of women killed in homicide (per 100,000)
+    # # Divide the female victim percentage by 100 and multiply by the overall homicide rate to get the number of women killed
+    # fem_homicide_rate['Num_fem_homicide'] = (fem_homicide_rate['Value'] / 100) * oveall_homicide_rate['Value']
+    
+    
+    
+    
+    # internet_usage = querys.internet_usage()
+    # internet_usage = sql_t.query_db(internet_usage, table_name="master_UN.db")
+    
+    # # Merge both dataframes on 'Region_Country_Area' and 'Year'
+    # merged_df = pd.merge(fem_homicide_rate, internet_usage, on=['Region_Country_Area', 'Year'], suffixes=('_Fem_Homicide', '_Internet'))
+    # merged_df_2022 = merged_df[merged_df['Year'] == 2022]
+    
+    # # Create an interactive scatter plot with Plotly
+    # fig = px.scatter(merged_df, 
+    #                 x='Value_Internet', 
+    #                 y='Num_fem_homicide',  # Use 'Num_fem_homicide' to show number of women killed
+    #                 hover_data={'Region_Country_Area': True, 'Year': True},
+    #                 labels={'Value_Internet': 'Internet Usage (%)', 
+    #                         'Num_fem_homicide': 'Number of Women Killed (per 100,000)',
+    #                         'Value_Fem_Homicide': 'Percentage of Female Homicide Victims'},
+    #                 title='Internet Usage vs Number of Women Killed in Homicides')
+
+    # # Show the plot
+    # fig.show()
    
+    # GRAPH 4 CONSUMER PRICE INDEX AGAINST CRIME RATE 

    
+    # oveall_homicide_rate_query = querys.overall_homicide_rate()
+    # oveall_homicide_rate = sql_t.query_db(oveall_homicide_rate_query, table_name="master_UN.db")
    
+    # consumer_price_index_query = querys.consumer_price_index_general()
+    # consumer_price_index = sql_t.query_db(consumer_price_index_query, table_name="master_UN.db")
    
+    # # Merge both dataframes on 'Region_Country_Area' and 'Year'
+    # merged_df = pd.merge(consumer_price_index, oveall_homicide_rate, on=['Region_Country_Area', 'Year'], suffixes=('_Consumer_Price_Index', '_Homicide'))
    
+    # # Create an interactive scatter plot with Plotly
+    # fig = px.scatter(merged_df, 
+    #                 x='Value_Consumer_Price_Index', 
+    #                 y='Value_Homicide', 
+    #                 hover_data={'Region_Country_Area': True, 'Year': True},
+    #                 labels={'Value_Consumer_Price_Index': 'Consumer Price Index', 
+    #                         'Value_Homicide': 'Homicide Rate (per 100,000)'},
+    #                 title='Consumer Price Index vs Homicide Rate')
    
+    # # Show the plot
+    # fig.show()

        
        
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
 pandas
 rich
 matplotlib
+plotly
\ No newline at end of file
--- a/stats_code/stats_tool.py
+++ b/stats_code/stats_tool.py
@@ -6,55 +6,5 @@ class Stat_tool():
    def __init__(self):
        pass
    
-    def process_yearly_data(self, year_data, stats):
-        """
-        Processes yearly data and groups stats for each country by year.
-        Assumes `year_data` repeats in a fixed pattern for each country.

-        Args:
-            year_data (list): List of years (repeated for each country).
-            stats (list): List of statistics corresponding to the years.
-        """
-        # Determine the looping constant (number of years per country)
-        start_year = year_data[0]
-        looping_constant = 0
-        for year in year_data:
-            if year == start_year and looping_constant > 0:
-                break
-            looping_constant += 1
-
-        # Format the data: group stats by country and year
-        formatted_data = []
-        for i in range(0, len(stats), looping_constant):
-            country_data = []
-            for j in range(looping_constant):
-                if i + j < len(stats):  # Ensure we don't exceed list bounds
-                    country_data.append((year_data[j], stats[i + j]))
-            formatted_data.append(country_data)
-
-        return formatted_data
-    
-    def filter_top_countries(self, formatted_data, top_n=5, metric_index=1):
-        """
-        Filters top N countries based on the sum of a metric (e.g., stats).
-        
-        Args:
-            formatted_data (list): List of lists with (year, stat) tuples.
-            top_n (int): Number of countries to include.
-            metric_index (int): Index of the stat in the tuple.
-
-        Returns:
-            list: Filtered data for top N countries.
-        """
-        # Calculate total stats for each country
-        country_totals = [
-            (country_index, sum(data[metric_index] for data in country_data))
-            for country_index, country_data in enumerate(formatted_data)
-        ]
-        
-        # Sort by total stats and get the top N
-        top_countries = sorted(country_totals, key=lambda x: x[1], reverse=True)[:top_n]
-        
-        # Return only the data for the top N countries
-        return [formatted_data[country[0]] for country in top_countries]