import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
import scipy
import statsmodels
import seaborn as sb
import sklearn 
from sklearn import datasets, metrics
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
# import statsmodels.formula.api as sm
import statsmodels.api as sm
from IPython.display import display, HTML
import pycountry as pyco
%matplotlib inline
# Data Retrieved From: https://www.kaggle.com/unsdsn/world-happiness


# All input files are stored under the data directory. The yearly happiness
# tables are the files whose name starts with a digit; the first four
# characters of the file name (the year) become the dictionary key.
path = "data/"
# os.listdir returns names in arbitrary order, so sort them: this keeps the
# years in `data` (and every per-year loop, plot and table built from it) in a
# deterministic, chronological order.
data = {
    file[:4]: pd.read_csv(os.path.join(path, file))
    for file in sorted(os.listdir(path))
    if file[0].isdigit()
}


# Canonical column names, in the order every yearly table should end up with.
columns = ["Country", "Region", "Happiness Rank", "Happiness Score", "Economy (GDP per Capita)", "Social support",
           "Health (Life Expectancy)", "Freedom", "Trust (Government Corruption)", "Generosity"]

# 2015 and 2016 only need "Family" renamed.
names_2015_2016 = {"Family": "Social support"}

# 2017 is renamed by header name rather than by position: relabelling by
# position silently mislabels columns whenever the file's column order differs
# from `columns`.
# NOTE(review): assumes the dot-separated headers of the Kaggle 2017 file
# (where Generosity precedes Trust) -- confirm against data/2017*.csv. The
# check in the loop below fails loudly if a header does not match.
names_2017 = {"Happiness.Rank": "Happiness Rank", "Happiness.Score": "Happiness Score",
              "Economy..GDP.per.Capita.": "Economy (GDP per Capita)", "Family": "Social support",
              "Health..Life.Expectancy.": "Health (Life Expectancy)",
              "Trust..Government.Corruption.": "Trust (Government Corruption)"}

# 2018 and 2019 share one naming scheme.
new_names = {"Overall rank": "Happiness Rank", "Country or region": "Country", "Score": "Happiness Score",
     "GDP per capita": "Economy (GDP per Capita)", "Healthy life expectancy": "Health (Life Expectancy)",
     "Freedom to make life choices": "Freedom", "Perceptions of corruption": "Trust (Government Corruption)"}

header_renames = {"2015": names_2015_2016, "2016": names_2015_2016, "2017": names_2017,
                  "2018": new_names, "2019": new_names}

for year, renames in header_renames.items():
    renamed = data[year].rename(columns=renames)
    # Keep only the canonical columns (this also drops the whisker/residual
    # columns); "Region" is absent from some years and is filled in later.
    data[year] = renamed[[c for c in columns if c in renamed.columns]]
    missing = [c for c in columns if c != "Region" and c not in data[year].columns]
    if missing:
        raise KeyError("%s data is missing expected columns: %s" % (year, missing))


# Inconsistent country naming: these spellings are replaced, pairwise and in
# this order, by the pycountry names of the alpha-2 codes listed below.
# NOTE(review): ISO 3166 assigns "CD" to the Democratic Republic of the Congo
# (capital Kinshasa) and "CG" to the Republic of the Congo (Brazzaville), so
# the first two pairs look swapped. The World Bank clean-up later in this file
# appears to rely on the same ordering -- verify both before changing either.
errors = ["Congo (Kinshasa)", "Congo (Brazzaville)", "Palestinian Territories", "Ivory Coast", "South Korea"]
alpha_2_names = [pyco.countries.get(alpha_2=a2).name for a2 in ["CG", "CD","PS", "CI","KR"]]

# Use pycountry to standardize the "Country" column of every yearly table.
for year, df in data.items():
    # The loop iterates the column as it was when the loop started; each
    # replace() below rebinds df["Country"] to a new Series.
    for c in df["Country"]:
        try:
            # Every fuzzy match is written under the same key `c`, so when
            # several matches pass the filter the last one wins.
            # NOTE(review): `x not in df["Country"]` tests the Series' index
            # labels, not its values, so this filter probably never excludes a
            # match -- confirm whether `.values` was intended.
            d = {c: country.name for country in pyco.countries.search_fuzzy(c) if country.name not in df["Country"]}
            df["Country"] = df["Country"].replace(d)
        except LookupError:
            # No fuzzy match: fall back to the hand-written spelling table.
            df["Country"] = df["Country"].replace(dict(zip(errors, alpha_2_names)))


# Keep only the countries that appear in every yearly table, so that all
# years describe the same set of countries.
countries_list = [df["Country"] for df in data.values()]
common_countries = set(countries_list[0]).intersection(*countries_list[1:])

for year, df in data.items():
    # .copy() makes each filtered table an independent frame; columns are
    # added to these tables later, and assigning into a filtered slice is what
    # triggers pandas' SettingWithCopy warning.
    data[year] = df[df["Country"].isin(common_countries)].copy()


# The 2017, 2018 and 2019 tables have no region information, so the regions
# recorded in the 2015 table are reused for them (a country's region does not
# change between years).
country_to_region = {
    country: region
    for country, region in zip(data["2015"]["Country"], data["2015"]["Region"])
    if country in common_countries
}

# Add the "Region" column to the years that lack it. A country without an
# entry in the lookup raises KeyError rather than silently producing a gap.
pd.set_option('mode.chained_assignment', None)
for regionless_year in ("2017", "2018", "2019"):
    data[regionless_year]["Region"] = data[regionless_year]["Country"].apply(country_to_region.__getitem__)

# Put the columns of every table into the canonical order.
for year in list(data):
    data[year] = data[year][columns]
    
# pd.set_option("display.max_rows", None, "display.max_columns", None)


# Build a table with the number of countries in each region, based on the
# "Region" column of the 2015 data set.

# Regions in the order they should appear in the table.
regions = ['Western Europe', 'North America', 'Australia and New Zealand',
           'Middle East and Northern Africa', 'Latin America and Caribbean', 'Southeastern Asia',
           'Central and Eastern Europe', 'Eastern Asia', 'Sub-Saharan Africa', 'Southern Asia']

# value_counts tallies every region in one pass; a region with no countries
# is reported as 0 instead of being left out.
occurrences = data["2015"]["Region"].value_counts()

# dictionary (and data frame) that stores the counts
region_count = {'Region': regions,
                'Number of Countries in Region': [int(occurrences.get(region, 0)) for region in regions]}

count_df = pd.DataFrame(region_count, columns = ['Region', 'Number of Countries in Region'])
count_df


# Draw one boxplot of happiness score per region for every year and collect
# the regional medians along the way.
sb.set_style("whitegrid")  # the style and figure size are the same for every plot
dims = (15,10)

medians = {}
# Sort by year so the plots and the columns of the median table come out in
# chronological order regardless of how `data` was populated.
for year, df in sorted(data.items()):
    _, ax = plt.subplots(figsize=dims)
    medians[year] = df.groupby(["Region"])["Happiness Score"].median()
    sb.boxplot(x="Region", y="Happiness Score", data=df, ax=ax)
    # Label the axes after plotting so the labels match what is drawn:
    # regions on the x-axis, happiness score on the y-axis.
    ax.set_title("Boxplot of the Region and their Happiness Score in " + year, fontsize = 20)
    ax.set_xlabel("Region", fontsize = 15)
    ax.set_ylabel("Happiness Score", fontsize = 15)
    plt.show()


# shows the median Happiness score for each region, one column per year
median_data = pd.DataFrame.from_dict(medians)
median_data


# Load the income classification workbook and lift pandas' display limits so
# whole tables are shown.
econ = pd.read_excel(path + "OGHIST.xls", sheet_name = "Country Analytical History")
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Format and extract the country names and the five yearly columns.
# NOTE(review): the positions below are tied to the layout of this particular
# sheet -- presumably row 4 holds the year labels, rows 10-227 the countries,
# column 1 the names and columns 30-34 the years 2015-2019; confirm.
header_row = [4]
country_rows = list(range(10, 228))
name_column = [1]
year_columns = list(range(30, 35))
econ = econ.iloc[header_row + country_rows, name_column + year_columns].reset_index(drop=True)

# The first extracted row supplies the column labels (as strings); once it
# has been used it is removed and the rows are renumbered from zero.
econ.columns = [str(label) for label in econ.iloc[0]]
econ = econ.rename(columns={"4": "index", "Data for calendar year :": "Country"})
econ = econ.drop([0], axis=0).reset_index(drop=True)

# Deal with inconsistent country names in the income classification table.
# `a2` starts as a list of ISO alpha-2 codes and is immediately replaced by
# the corresponding pycountry names.
a2 = ["BS", "CG", "CD", "EG", "GM", "HK", "KP", "KR", "LA", "FM", "KN", "LC", "MF", "VC", "VE", "TW", "VI", "PS", "YE"]
a2 = [pyco.countries.get(alpha_2=c).name for c in a2]
# Entries that fail the lookup but are deliberately left unchanged.
discard = ["Channel Islands", "Faeroe Islands", "Macao SAR, China"]
# Names pycountry could not resolve, in the order they are encountered
# (this rebinds the `errors` list used for the happiness tables).
errors = []

# Use pycountry to get standard names: the first fuzzy match wins.
for c in econ["Country"]:
    try:
        d = {c: pyco.countries.search_fuzzy(c)[0].name}
        # "Niger" is pinned to itself instead of using the fuzzy result.
        if c == "Niger":
            d = {c: "Niger"}
        econ["Country"] = econ["Country"].replace(d)
    except LookupError:
        if c not in discard:
            errors.append(c)

# The unresolved names are paired with `a2` purely by position, and zip()
# stops at the shorter list without complaint.
# NOTE(review): this is only correct if the lookup failures occur in exactly
# the order of the codes above -- verify against the sheet, in particular the
# "CG"/"CD" order for the two Congo entries.
econ["Country"] = econ["Country"].replace(dict(zip(errors, a2)))

# Create independent copies of the yearly tables for the income class
# learning portion, so the originals in `data` stay untouched.
new_data = {year: df.copy(deep=True) for year, df in data.items()}

for year, df in new_data.items():
    # Country -> income class for this year, restricted to the countries
    # common to every happiness table. If a country is listed more than once,
    # the last listing wins.
    income_by_country = {
        country: income_class
        for country, income_class in zip(econ["Country"], econ[year])
        if country in common_countries
    }

    # One vectorised lookup per year; countries without an entry get NaN.
    df["Income Class"] = df["Country"].map(income_by_country)

    # Drop every row that contains a NaN in any column: this removes the
    # countries without an income class as well as rows with a missing metric.
    df.dropna(inplace=True)


# Create the KNN classifier and the (train year, test year) pairs: each model
# is trained on one year and evaluated on the following year.
knn = KNeighborsClassifier()
pairs = [("2015", "2016"), ("2016", "2017"), ("2017", "2018"), ("2018", "2019")]

# Perform classification. Columns 4-9 of each table are used as the features
# and "Income Class" is the label.
for train_year, test_year in pairs:
    train, test = new_data[train_year], new_data[test_year]

    X, y = np.array(train.iloc[:, 4:10]), np.array(train["Income Class"])
    knn.fit(X, y)

    # Score on the held-out year. Building the test arrays from the training
    # X and y would report training accuracy instead.
    X_test, y_test = np.array(test.iloc[:, 4:10]), np.array(test["Income Class"])
    print("Mean accuracy (train year: %s test year: %s): %f" % (train_year, test_year, knn.score(X_test, y_test)))

Mean accuracy (train year: 2015 test year: 2016): 0.840278
Mean accuracy (train year: 2016 test year: 2017): 0.863014
Mean accuracy (train year: 2017 test year: 2018): 0.819444
Mean accuracy (train year: 2018 test year: 2019): 0.805556


# Load the HDI data and drop the "Human Development Index (HDI)" column.
hdi = pd.read_csv(path + "Human_Development_Index.csv")
hdi = hdi.drop(columns=["Human Development Index (HDI)"])


# Clean and format the HDI data.
def remove_nan(r):
    """Return the entries of *r* whose string form is not "nan"."""
    return [e for e in r if str(e) != "nan"]


# The values are read from each row's index label (row.name).
# NOTE(review): presumably pandas parsed the leading fields of this file into
# a multi-level index -- confirm against the CSV.
rows = []
for _, record in hdi.iterrows():
    kept = remove_nan(record.name)
    # Keep the first two and the last four non-NaN entries of each row.
    rows.append(kept[:2] + kept[-4:])

# Discard the final 18 rows, then take the first remaining row as the header.
# Note that this rebinds the module-level name `columns`.
rows = rows[:len(rows) - 18]
columns = rows.pop(0)

# Create the HDI data frame, one column per header entry.
hdi = pd.DataFrame(data={label: [r[pos] for r in rows] for pos, label in enumerate(columns)})

# Standardize country names in the HDI data frame. Only names that pycountry
# cannot resolve as-is are rewritten.
for c in hdi["Country"]:
    try:
        # The result is discarded; the call only checks that the name resolves.
        pyco.countries.search_fuzzy(c)
    except LookupError:
        # Retry with the leading run of word characters and whitespace only.
        # This second lookup is not guarded: a LookupError here propagates.
        name = re.search(r"^([\w\s]*).*$", c).group(1)
        # Every match is written under the same key `c`, so the last one wins.
        # NOTE(review): `x not in hdi["Country"]` tests the Series' index
        # labels, not its values -- confirm whether `.values` was intended.
        d = {c: country.name for country in pyco.countries.search_fuzzy(name) if country.name not in hdi["Country"]}
        hdi["Country"] = hdi["Country"].replace(d)

# Keep only the countries common to every happiness table and renumber rows.
hdi = hdi[hdi["Country"].isin(common_countries)]
hdi.reset_index(drop=True, inplace=True)


# Collect one (happiness score, HDI score) pair per country and year for the
# regression of HDI on happiness.
happiness_score, hdi_score = [], []

# Build the lookups once instead of re-scanning the HDI table for every
# country. drop_duplicates keeps the first row per country, matching a
# "first match" lookup.
hdi_countries = set(hdi["Country"])
hdi_by_country = hdi.drop_duplicates(subset="Country").set_index("Country")

# add the data to the arrays
for year, df in data.items():
    # 2019 is skipped: no HDI value is looked up for that year.
    if year == "2019":
        continue
    score_by_country = df.drop_duplicates(subset="Country").set_index("Country")["Happiness Score"]
    for country in df["Country"]:
        if country in hdi_countries:
            happiness_score.append(score_by_country[country])
            hdi_score.append(float(hdi_by_country.at[country, year]))

# create regression model (scikit-learn expects 2-D column arrays)
happiness_score, hdi_score = np.array(happiness_score), np.array(hdi_score)
reshape_x, reshape_y = happiness_score.reshape(-1, 1), hdi_score.reshape(-1, 1)
reg = LinearRegression().fit(reshape_x, reshape_y)
print("R^2 coefficient: %f" % reg.score(reshape_x, reshape_y))

# plot data on scatter plot
plt.figure(figsize = (15, 10))
plt.title("Happiness Score Relation with HDI")
plt.xlabel("Happiness Score")
plt.ylabel("HDI Score")
plt.scatter(happiness_score, hdi_score)

# plot best fit line (degree-1 polynomial fit: slope m, intercept b)
m, b = np.polyfit(happiness_score, hdi_score, 1)
plt.plot(happiness_score, m * happiness_score + b, color = "orange")

# Fit the same model with statsmodels to get the full summary table;
# add_constant supplies the intercept term.
X = sm.add_constant(happiness_score)
y = hdi_score
model = sm.OLS(y, X).fit()
print(model.summary())

R^2 coefficient: 0.626395
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.626
Model:                            OLS   Adj. R-squared:                  0.626
Method:                 Least Squares   F-statistic:                     942.3
Date:                Mon, 21 Dec 2020   Prob (F-statistic):          2.99e-122
Time:                        00:55:20   Log-Likelihood:                 534.16
No. Observations:                 564   AIC:                            -1064.
Df Residuals:                     562   BIC:                            -1056.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1424      0.020      7.303      0.000       0.104       0.181
x1             0.1073      0.003     30.696      0.000       0.100       0.114
==============================================================================
Omnibus:                       22.188   Durbin-Watson:                   1.887
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               25.654
Skew:                          -0.421   Prob(JB):                     2.69e-06
Kurtosis:                       3.618   Cond. No.                         28.3
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

	2015	2016	2017	2018	2019
Region
Australia and New Zealand	7.2850	7.3235	7.2990	7.2980	7.2675
Central and Eastern Europe	5.2860	5.4880	5.3950	5.6200	5.5290
Eastern Asia	5.7290	5.6465	5.5555	5.6525	5.6580
Latin America and Caribbean	6.1300	6.1260	6.0080	6.0705	6.0955
Middle East and Northern Africa	5.1920	5.3030	5.5000	5.3580	5.2110
North America	7.2730	7.2540	7.1545	7.1070	7.0850
Southeastern Asia	5.3795	5.2965	5.3460	5.3135	5.2655
Southern Asia	4.5650	4.6430	4.7850	4.6900	4.6845
Sub-Saharan Africa	4.2520	4.1300	4.1390	4.3500	4.4900
Western Europe	6.9385	6.9180	6.9510	6.9770	7.0540

Happiness in the World¶

Overview¶

Required Tools¶

Data Collection¶

Data Tidying and Cleaning¶

Inconsistent column names¶

Inconsistent country names¶

Some countries are not in all the datasets¶

Add region column for certain datasets¶

Exploratory Analysis and Data Visualization¶

Number of Countries in Each Region¶

Analysis of Number of Countries in Region¶

Part One: Boxplots of Data¶

Analysis of Boxplots¶

Analysis of Medians¶

Part Two: Income Classification Based on Happiness Metrics¶

Data Cleaning and Tidying for Income Status¶

Adding Income Class Column¶

Using the K-Nearest Neighbors Classifier¶

Analysis¶

Part Three: Linear Regressions¶

Tidying and Cleaning HDI Dataset¶

Hypothesis¶

Regression Model and Graph¶

Analysis of Regression Graph¶

Conclusion¶

Sources¶