Skip to content
Snippets Groups Projects
Commit 3ccca28c authored by samuel.m's avatar samuel.m
Browse files

Made some graphs

parent ba5e3b48
Branches
No related tags found
No related merge requests found
import plotly.graph_objects as go
import plotly.io as pio
def generate_html(output_path="graphs_display.html"):
    """Render two example Plotly figures into a single HTML page.

    Builds a grouped bar chart and a line chart from hard-coded sample data,
    converts each to an HTML ``<div>`` fragment, embeds both fragments in a
    small page template and writes the page to disk.

    Args:
        output_path: Where to write the generated page. Defaults to
            ``graphs_display.html`` in the current working directory.
    """
    # Step 1: Generate Graphs with Plotly
    # Example Graph 1: Bar Chart
    regions = ['Region 1', 'Region 2', 'Region 3']
    fig1 = go.Figure(data=[
        go.Bar(name='Category A', x=regions, y=[10, 20, 30]),
        go.Bar(name='Category B', x=regions, y=[15, 25, 35]),
    ])
    fig1.update_layout(title='Bar Chart Example', barmode='group')

    # Example Graph 2: Line Chart
    fig2 = go.Figure(data=go.Scatter(x=['Jan', 'Feb', 'Mar'], y=[10, 15, 13], mode='lines+markers'))
    fig2.update_layout(title='Line Chart Example')

    # Step 2: Generate HTML Content
    # Convert graphs to HTML divs. By default to_html() embeds the whole
    # plotly.js bundle in every fragment; instead the first fragment pulls the
    # library from the CDN (in the version matching the installed plotly
    # package) and the second fragment reuses it.
    graph1_html = pio.to_html(fig1, full_html=False, include_plotlyjs='cdn')
    graph2_html = pio.to_html(fig2, full_html=False, include_plotlyjs=False)

    # Template for the webpage. No separate plotly <script> tag is needed:
    # the first fragment already carries one, and the "plotly-latest" CDN
    # alias is no longer updated by Plotly, so it can lag behind the figures.
    html_template = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Graphs Display</title>
    <style>
    body {{
    font-family: Arial, sans-serif;
    margin: 20px;
    }}
    .graph-container {{
    margin-bottom: 40px;
    }}
    </style>
    </head>
    <body>
    <h1>Interactive Graphs</h1>
    <div class="graph-container">
    <h2>Bar Chart</h2>
    {graph1_html}
    </div>
    <div class="graph-container">
    <h2>Line Chart</h2>
    {graph2_html}
    </div>
    </body>
    </html>
    """

    # Step 3: Save the HTML File
    # The page declares charset UTF-8, so write it as UTF-8 regardless of the
    # platform's default encoding.
    with open(output_path, "w", encoding="utf-8") as html_file:
        html_file.write(html_template)
    print(f"HTML page generated: {output_path}")


if __name__ == "__main__":
    generate_html()
"""
Queries for specific data
"""
# DATABASES FOR UN DATA
# SYB67_314_202411_Internet Usage.db
# SYB67_245_202411_Public expenditure on education and access to computers.db
# SYB67_328_202411_Intentional homicides and other crimes.db
# SYB67_325_202411_Expenditure on health.db
# master_UN.db
# SYB67_128_202411_Consumer Price Index.db
# SYB67_145_202411_Land.db
# SYB67_230_202411_GDP and GDP Per Capita.db
# SYB67_323_202411_Teaching Staff in education.db
# SYB67_319_202411_Ratio of girls to boys in education.db
# SYB67_329_202411_Labour Force and Unemployment.db
# SYB67_309_202411_Education.db
# SYB67_200_202411_Employment.db
# SYB67_154_202411_Health Personnel.db
# Database file names exactly as stored in master_table.Database_Name.
_HOMICIDE_DB = 'SYB67_328_202411_Intentional homicides and other crimes.db'
_EDUCATION_DB = 'SYB67_245_202411_Public expenditure on education and access to computers.db'
_LABOUR_DB = 'SYB67_329_202411_Labour Force and Unemployment.db'
_INTERNET_DB = 'SYB67_314_202411_Internet Usage.db'
_CPI_DB = 'SYB67_128_202411_Consumer Price Index.db'

# Row_Descriptor values that are requested by more than one query function.
_MALE_VICTIMS = 'Percentage of male and female intentional homicide victims, Male'
_FEMALE_VICTIMS = 'Percentage of male and female intentional homicide victims, Female'
_EDUCATION_CAPITAL = 'Capital expenditure as % of total expenditure in public institutions (%)'


def _master_query(database_name, row_descriptor):
    """Return the SELECT statement for one data series of master_table.

    Both arguments are module-level constants, never user input, so they are
    interpolated directly into the SQL text.
    """
    return (
        "\n"
        "SELECT Region_Country_Area, Year, Value FROM master_table\n"
        f"WHERE Database_Name = '{database_name}'\n"
        f"AND Row_Descriptor = '{row_descriptor}';\n"
    )


# Getting "Percentage of male and female intentional homicide victims, Male" and "Capital expenditure as % of total expenditure in public institutions (%)"
def male_homicide_rate_education_rate():
    """Return a (male homicide victims query, education expenditure query) tuple."""
    homicide_rate = _master_query(_HOMICIDE_DB, _MALE_VICTIMS)
    education_query = _master_query(_EDUCATION_DB, _EDUCATION_CAPITAL)
    return homicide_rate, education_query


# Getting "Percentage of male and female intentional homicide victims, Female" and "Capital expenditure as % of total expenditure in public institutions (%)"
def female_homicide_rate_education_rate():
    """Return a (female homicide victims query, education expenditure query) tuple."""
    female_homicide_rate = _master_query(_HOMICIDE_DB, _FEMALE_VICTIMS)
    education_query = _master_query(_EDUCATION_DB, _EDUCATION_CAPITAL)
    return female_homicide_rate, education_query


def overall_homicide_rate():
    """Return the query for 'Intentional homicide rates per 100,000'."""
    return _master_query(_HOMICIDE_DB, 'Intentional homicide rates per 100,000')


def education_rate():
    """Return the query for capital expenditure as % of total education expenditure."""
    return _master_query(_EDUCATION_DB, _EDUCATION_CAPITAL)


def unemployment_rate():
    """Return the query for 'Unemployment rate - Total'."""
    return _master_query(_LABOUR_DB, 'Unemployment rate - Total')


def female_homicide():
    """Return the query for the percentage of female intentional homicide victims."""
    return _master_query(_HOMICIDE_DB, _FEMALE_VICTIMS)


# Percentage of individuals using the internet
def internet_usage():
    """Return the query for 'Percentage of individuals using the internet'."""
    return _master_query(_INTERNET_DB, 'Percentage of individuals using the internet')


def consumer_price_index_general():
    """Return the query for 'Consumer price index: General'."""
    return _master_query(_CPI_DB, 'Consumer price index: General')
\ No newline at end of file
......@@ -3,14 +3,16 @@ import pandas as pd
import os
from rich.console import Console
from rich.table import Table
import csv
class SQL_tool():
def __init__(self, data_storage_location : str) -> None:
    """
    Set up the tool for the data stored under `data_storage_location`.

    Construction does work immediately: it calls init_db() on the storage
    folder and then consolidate_tables().
    """
    self.data_storage_location = data_storage_location
    # Paths of the SQLite databases known to this tool. Must exist before
    # init_db() runs, because init_db() appends to it.
    self.db_locations = []
    self.init_db(data_storage_location)
    # Runs after init_db() so that db_locations is already populated.
    self.consolidate_tables()
......@@ -29,6 +31,9 @@ class SQL_tool():
try:
if verbose: print(f"Processing {file_name}")
# Fix bad headers
self.fix_un_csv_headers(file_path, 1, ["Index","Region_Country_Area","Year" ,"Row_Descriptor","Value","Footnotes","Source"])
df = pd.read_csv(file_path, header=1) # Depending on the CSV select the correct row the headers are on
with sqlite3.connect(db_path) as conn:
try:
......@@ -38,11 +43,37 @@ class SQL_tool():
except:
if verbose: print(f"Data base {db_name} at location {db_path} exists... Doing nothing....")
self.db_locations.append(db_path)
except:
except Exception as e:
if verbose: print(f"Failed to process file {file_name}")
print(e)
elif os.path.isfile(file_path) and file_name.endswith('.db'): # If the file is a db add it to the list
self.db_locations.append(file_path)
elif os.path.isdir(file_path): # Recurse through the data folders, should work for any folder csv configuration
self.init_db(file_path)
# Remove duplicates
self.db_locations = list(set(self.db_locations))
#Index,Region/Country/Area,Year,Series,Value,Footnotes,Source <- Correct headers
def fix_un_csv_headers(self, file_path, line_number, new_data):
    """
    Replaces one row of a CSV file (normally the header row) in place.

    Args:
        file_path: Path of the CSV file to rewrite.
        line_number: 0-based index of the row to replace. It uses the same
            numbering as the `header=` argument of `pd.read_csv`.
        new_data: List of cell values that becomes the new row.

    If `line_number` does not refer to an existing row, a message is printed
    and the file is left untouched.
    """
    with open(file_path, mode='r', newline='') as file:
        rows = list(csv.reader(file))

    # `rows` is indexed from 0, so the valid indices are 0 .. len(rows) - 1.
    if not 0 <= line_number < len(rows):
        print(f"Line number {line_number} is out of range.")
        return

    rows[line_number] = new_data

    with open(file_path, mode='w', newline='') as file:
        csv.writer(file).writerows(rows)
def list_db(self, verbose: bool = False):
"""
Pretty prints the DB
......@@ -55,7 +86,7 @@ class SQL_tool():
table.add_column("Database Name", style="cyan", justify="left")
table.add_column("Path", style="magenta", justify="left")
table.add_column("Tables", style="purple", justify="left")
table.add_column("Colunms", style="green", justify="left")
table.add_column("Colunms", style="green", justify="left", overflow="fold")
counter = 0
for db_path in self.db_locations:
......@@ -147,8 +178,69 @@ class SQL_tool():
return table_lst
def query_db(self, query : str, table_name : str , verbose : bool=False):
    """
    Executes a query on one of the known databases and returns the result.

    Args:
        query: SQL text to execute.
        table_name: File name of the database to query (e.g. "master_UN.db").
            Despite the parameter name, it is compared against the base name
            of each path in `self.db_locations`, not against a table name.
        verbose: When true, print the query before running it.

    Returns:
        A pandas DataFrame holding the query result.

    Raises:
        ValueError: If no known database has the requested file name.
    """
    # Local import so this method does not rely on a file-level import.
    from contextlib import closing

    if verbose: print(f"Querying: {query}")

    # Find path from database file name (first match wins)
    db_path = next(
        (path for path in self.db_locations if os.path.basename(path) == table_name),
        None,
    )
    if db_path is None:
        raise ValueError(f"No database named {table_name!r} among the known database locations.")

    # A sqlite3 connection used as a context manager only commits or rolls
    # back; closing() is what actually releases the file handle.
    with closing(sqlite3.connect(db_path)) as conn:
        return pd.read_sql_query(query, conn)
def consolidate_tables(self):
    """
    Consolidates the `data` table of every known database into one master table.

    Creates `master_UN.db` inside `self.data_storage_location` with a single
    `master_table`, copies the rows of each database listed in
    `self.db_locations` into it (tagged with the source file name in the
    `Database_Name` column) and finally registers the master database in
    `self.db_locations`.

    If `master_UN.db` already exists, nothing is done. If consolidation fails
    part-way, the partial master file is removed and the error is re-raised.
    """
    # Local import so this method does not rely on a file-level import.
    from contextlib import closing

    master_db_path = os.path.join(self.data_storage_location, "master_UN.db")

    # Check if the master database already exists
    if os.path.exists(master_db_path):
        print("Master database already exists.")
        return

    master_columns = [
        "Region_Country_Area", "Year", "Row_Descriptor", "Value",
        "Footnotes", "Source", "Database_Name",
    ]

    try:
        with closing(sqlite3.connect(master_db_path)) as master_conn:
            master_conn.execute('''
                CREATE TABLE IF NOT EXISTS master_table (
                    Region_Country_Area TEXT,
                    Year INTEGER,
                    Row_Descriptor TEXT,
                    Value REAL,
                    Footnotes TEXT,
                    Source TEXT,
                    Database_Name TEXT
                )
            ''')
            master_conn.commit()

            for db_path in self.db_locations:
                db_name = os.path.basename(db_path)
                print(f"Processing {db_name}...")

                # Read the individual database; closing() releases it even
                # when the read fails.
                with closing(sqlite3.connect(db_path)) as conn:
                    df = pd.read_sql_query("SELECT * FROM data", conn)

                # Add a column for the database name
                df["Database_Name"] = db_name

                # Append the data to the master table (any other source
                # column, such as the index column, is dropped here)
                df[master_columns].to_sql(
                    "master_table", master_conn, if_exists="append", index=False
                )
            master_conn.commit()
    except Exception:
        # The existence check above would mistake a half-built master file
        # for a finished one on the next run, so remove it before re-raising.
        if os.path.exists(master_db_path):
            os.remove(master_db_path)
        raise

    self.db_locations.append(master_db_path)
    print("Consolidation complete.")
......
......@@ -53,30 +53,4 @@ class Graph_tool():
plt.show()
def graph_yearly_data(self , formatted_data, title="Yearly Data Trends"):
    """
    Plots yearly data for multiple countries, one line per country.

    Args:
        formatted_data (list): A list of lists where each sublist contains (year, stat) tuples for a country.
        title (str): The title of the graph.

    Countries are labelled "Country 1", "Country 2", ... by their position in
    `formatted_data`. A country with no data points is skipped.
    """
    plt.figure(figsize=(10, 6))  # Set the figure size

    for country_index, country_data in enumerate(formatted_data):
        if not country_data:
            # zip(*[]) yields nothing, so unpacking it into (years, stats)
            # below would raise ValueError.
            continue

        # Separate years and stats for the current country
        years, stats = zip(*country_data)  # Unzip the (year, stat) tuples

        # Plot the line for this country's data
        plt.plot(years, stats, marker='o', label=f"Country {country_index + 1}")

    # Add labels, title, and grid
    plt.xlabel("Year")
    plt.ylabel("Stat")
    plt.title(title)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(title="Countries")

    # Show the plot
    plt.tight_layout()
    plt.show()
This diff is collapsed.
from data_code.sql_tool import SQL_tool
from graph_code.graph_tool import Graph_tool
from stats_code.stats_tool import Stat_tool
import data_code.querys as querys
import pandas as pd
import plotly.express as px
if __name__ == "__main__":
......@@ -10,28 +11,139 @@ if __name__ == "__main__":
g_tool = Graph_tool()
s_tool = Stat_tool()
table_list = sql_t.list_db(verbose=True)
inp = input("x_table x_column y_table y_column(e.g 2 Year 4 Value)\n").split(" ")
x_tab, x_col, y_tab, y_col = (int(inp[0]), str(inp[1]), int(inp[2]), str(inp[3]))
# Get x
path = table_list[x_tab]
colunm_data_x = sql_t.get_column(db_path=path, column_name=x_col)
# Get y
path = table_list[y_tab]
colunm_data_y = sql_t.get_column(db_path=path, column_name=y_col)
formatted_yearly_data = s_tool.process_yearly_data(year_data=colunm_data_x, stats=colunm_data_y)
top_countries = s_tool.filter_top_countries(formatted_data=formatted_yearly_data, top_n=5)
g_tool.graph_yearly_data(formatted_data=top_countries, title="Yearly usage change")
#g_tool.gen_graph(x_data=colunm_data_x, x_name=x_col, y_data=colunm_data_y, y_name=y_col)
# # GRAPH 1 HOMICIDES AGAINST MEN AND WOMEN AGAINST EDUCATION LEVEL GLOBALLY
# male_homicide_rate_query, education_rate_query = querys.male_homicide_rate_education_rate()
# male_homicide_rate = sql_t.query_db(male_homicide_rate_query, table_name="master_UN.db")
# education_rate = sql_t.query_db(education_rate_query, table_name="master_UN.db")
# # Merge both dataframes on 'Region_Country_Area' and 'Year'
# merged_df_male = pd.merge(male_homicide_rate, education_rate, on=['Region_Country_Area', 'Year'], suffixes=('_Male_Homicide', '_Education'))
# merged_df_male['Gender'] = 'Male' # Gender label
# fem_homicide_rate_query, education_rate = querys.female_homicide_rate_education_rate()
# fem_homicide_rate = sql_t.query_db(fem_homicide_rate_query, table_name="master_UN.db")
# education_rate = sql_t.query_db(education_rate, table_name="master_UN.db")
# # Merge both dataframes on 'Region_Country_Area' and 'Year'
# merged_df_female = pd.merge(fem_homicide_rate, education_rate, on=['Region_Country_Area', 'Year'], suffixes=('_Fem_Homicide', '_Education'))
# merged_df_female['Gender'] = 'Female' # Gender Label
# combined_df = pd.concat([merged_df_male[['Region_Country_Area', 'Year', 'Value_Male_Homicide', 'Value_Education', 'Gender']],
# merged_df_female[['Region_Country_Area', 'Year', 'Value_Fem_Homicide', 'Value_Education', 'Gender']]])
# # Combine male and female dataframes
# combined_df['Homicide_Rate'] = combined_df.apply(
# lambda row: row['Value_Male_Homicide'] if row['Gender'] == 'Male' else row['Value_Fem_Homicide'], axis=1)
# # Create an interactive scatter plot with Plotly (overlay male and female)
# fig = px.scatter(combined_df,
# x='Homicide_Rate',
# y='Value_Education',
# color='Gender', # Different colors for Male/Female
# symbol='Gender', # Different symbols for Male/Female
# hover_data={'Region_Country_Area': True, 'Year': True},
# labels={'Homicide_Rate': 'Homicide Rate (per 100,000)',
# 'Value_Education': 'Education Expenditure (% of Total)',
# 'Value_Fem_Homicide': 'Homicide Rate (Female, per 100,000)',
# 'Value_Male_Homicide': 'Homicide Rate (Male, per 100,000)'},
# title='Homicide Rate (Men and Women) vs Education Expenditure')
# # Show the plot
# fig.show()
# GRAPH 2 UNEMPLOYMENT RATE AGAINST CRIME RATE GLOBALLY
# unemployment_rate_query = querys.unemployment_rate()
# unemployment_rate = sql_t.query_db(unemployment_rate_query, table_name="master_UN.db")
# homicide_rate_query = querys.overall_homicide_rate()
# homicide_rate = sql_t.query_db(homicide_rate_query, table_name="master_UN.db")
# # Merge both dataframes on 'Region_Country_Area' and 'Year'
# merged_df = pd.merge(unemployment_rate, homicide_rate, on=['Region_Country_Area', 'Year'], suffixes=('_Unemployment', '_Homicide'))
# # Create an interactive scatter plot with Plotly
# fig = px.scatter(merged_df,
# x='Value_Unemployment',
# y='Value_Homicide',
# hover_data={'Region_Country_Area': True, 'Year': True},
# labels={'Value_Unemployment': 'Unemployment Rate (%)',
# 'Value_Homicide': 'Homicide Rate (per 100,000)'},
# title='Unemployment Rate vs Homicide Rate')
# # Show the plot
# fig.show()
# GRAPH 3 INTERNET USAGE AGAINST CRIME RATE (FEMALE)
# fem_homicide_rate_query = querys.female_homicide()
# fem_homicide_rate = sql_t.query_db(fem_homicide_rate_query, table_name="master_UN.db")
# oveall_homicide_rate_query = querys.overall_homicide_rate()
# oveall_homicide_rate = sql_t.query_db(oveall_homicide_rate_query, table_name="master_UN.db")
# # Get value of women killed in homicide (per 100,000)
# # Divide the female victim percentage by 100 and multiply by the overall homicide rate to get the number of women killed
# fem_homicide_rate['Num_fem_homicide'] = (fem_homicide_rate['Value'] / 100) * oveall_homicide_rate['Value']
# internet_usage = querys.internet_usage()
# internet_usage = sql_t.query_db(internet_usage, table_name="master_UN.db")
# # Merge both dataframes on 'Region_Country_Area' and 'Year'
# merged_df = pd.merge(fem_homicide_rate, internet_usage, on=['Region_Country_Area', 'Year'], suffixes=('_Fem_Homicide', '_Internet'))
# merged_df_2022 = merged_df[merged_df['Year'] == 2022]
# # Create an interactive scatter plot with Plotly
# fig = px.scatter(merged_df,
# x='Value_Internet',
# y='Num_fem_homicide', # Use 'Num_fem_homicide' to show number of women killed
# hover_data={'Region_Country_Area': True, 'Year': True},
# labels={'Value_Internet': 'Internet Usage (%)',
# 'Num_fem_homicide': 'Number of Women Killed (per 100,000)',
# 'Value_Fem_Homicide': 'Percentage of Female Homicide Victims'},
# title='Internet Usage vs Number of Women Killed in Homicides')
# # Show the plot
# fig.show()
# GRAPH 4 CONSUMER PRICE INDEX AGAINST CRIME RATE
# oveall_homicide_rate_query = querys.overall_homicide_rate()
# oveall_homicide_rate = sql_t.query_db(oveall_homicide_rate_query, table_name="master_UN.db")
# consumer_price_index_query = querys.consumer_price_index_general()
# consumer_price_index = sql_t.query_db(consumer_price_index_query, table_name="master_UN.db")
# # Merge both dataframes on 'Region_Country_Area' and 'Year'
# merged_df = pd.merge(consumer_price_index, oveall_homicide_rate, on=['Region_Country_Area', 'Year'], suffixes=('_Consumer_Price_Index', '_Homicide'))
# # Create an interactive scatter plot with Plotly
# fig = px.scatter(merged_df,
# x='Value_Consumer_Price_Index',
# y='Value_Homicide',
# hover_data={'Region_Country_Area': True, 'Year': True},
# labels={'Value_Consumer_Price_Index': 'Consumer Price Index',
# 'Value_Homicide': 'Homicide Rate (per 100,000)'},
# title='Consumer Price Index vs Homicide Rate')
# # Show the plot
# fig.show()
\ No newline at end of file
pandas
rich
matplotlib
plotly
\ No newline at end of file
......@@ -6,55 +6,5 @@ class Stat_tool():
def __init__(self) -> None:
    """Create the tool; it stores no state, so there is nothing to set up."""
    pass
def process_yearly_data(self, year_data, stats):
    """
    Processes yearly data and groups stats for each country by year.
    Assumes `year_data` repeats in a fixed pattern for each country.

    Args:
        year_data (list): List of years (repeated for each country).
        stats (list): List of statistics corresponding to the years.

    Returns:
        list: One list per country, each holding (year, stat) tuples. The
        years are always taken from the first repetition of the pattern.
        The last country may be shorter if `stats` runs out early. Empty
        when `year_data` or `stats` is empty.
    """
    if len(year_data) == 0:
        # No years means no pattern to detect and nothing to group.
        return []

    # Determine the looping constant (number of years per country): the
    # position at which the first year shows up again, or the whole list
    # if it never repeats.
    start_year = year_data[0]
    looping_constant = len(year_data)
    for position in range(1, len(year_data)):
        if year_data[position] == start_year:
            looping_constant = position
            break

    # Format the data: pair each chunk of stats with the years of one period.
    # zip() stops at the shorter input, which handles a truncated last chunk.
    period_years = year_data[:looping_constant]
    return [
        list(zip(period_years, stats[start:start + looping_constant]))
        for start in range(0, len(stats), looping_constant)
    ]
def filter_top_countries(self, formatted_data, top_n=5, metric_index=1):
    """
    Keeps the N countries whose summed metric is largest.

    Args:
        formatted_data (list): List of lists with (year, stat) tuples.
        top_n (int): Number of countries to include.
        metric_index (int): Index of the stat in the tuple.

    Returns:
        list: The entries of `formatted_data` for the top N countries,
        ordered from the largest total to the smallest.
    """
    # One total per country, in the same order as formatted_data
    totals = [
        sum(point[metric_index] for point in series)
        for series in formatted_data
    ]

    # Rank country positions by their total, largest first
    ranking = sorted(range(len(totals)), key=totals.__getitem__, reverse=True)

    # Hand back the original per-country data for the best-ranked positions
    return [formatted_data[position] for position in ranking[:top_n]]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment