import pandas as pd
import requests as req
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("seaborn-whitegrid")  # note: matplotlib >= 3.6 renames this style "seaborn-v0_8-whitegrid"
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
This notebook will walk through the process of using Python for analyzing data through the "data science pipeline".
The data science pipeline has the following steps: data collection, data processing, exploratory analysis and visualization, analysis and modeling, and interpretation of results.
First, let's begin with data collection.
Data can be found all over the internet. If you're lucky there's an API that makes it easy to request data from a database, but sometimes it takes a bit more elbow grease. If the data is shown on a website, you can read the HTML into Python and parse it to find the relevant pieces (libraries such as "Beautiful Soup" help with this). The data you find doesn't need to arrive perfectly formatted for you to be able to analyze it.
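To give a flavor of what that HTML parsing looks like, here is a minimal Beautiful Soup sketch. The table, its id, and the numbers in it are all made up for illustration; a real page would come from requests.get(url).text instead of a hard-coded string.

```python
from bs4 import BeautifulSoup  # pip install beautifulsoup4

# a made-up HTML table standing in for a real page's markup
html = """
<table id="emissions">
  <tr><th>Year</th><th>Value</th></tr>
  <tr><td>2013</td><td>396913.9</td></tr>
  <tr><td>2014</td><td>393126.9</td></tr>
</table>
"""

soup = BeautifulSoup(html, "html.parser")
rows = []
for tr in soup.find("table", id="emissions").find_all("tr")[1:]:  # [1:] skips the header row
    year, value = (td.get_text() for td in tr.find_all("td"))
    rows.append((int(year), float(value)))

print(rows)
```

From here the list of tuples could go straight into a Pandas dataframe.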
In this notebook I am going to look at how greenhouse gas emissions have changed over time from 1990-2014, and global temperatures over that same span. I initially found greenhouse gas data for UN member nations (I couldn't find a database with global emissions) on the United Nations' website, but each variable (carbon dioxide, methane, etc.) was stored in a different dataset. I was able to find the same data on Kaggle, a website where you can find many datasets and other data science resources, which allowed me to download everything I wanted directly as a CSV file.
UN website link: http://data.un.org/Explorer.aspx
Kaggle link: https://www.kaggle.com/unitednations/international-greenhouse-gas-emissions
With that CSV file I can easily read the data into Python using a library called Pandas, which helps store the data in memory as a "dataframe" and provides many functions for manipulating it.
# imports the csv into a pandas dataframe object
ghg_data = pd.read_csv("greenhouse_gas_inventory_data.csv")
# shows the first 5 rows
ghg_data.head()
| | country_or_area | year | value | category |
|---|---|---|---|---|
| 0 | Australia | 2014 | 393126.946994 | carbon_dioxide_co2_emissions_without_land_use_... |
| 1 | Australia | 2013 | 396913.936530 | carbon_dioxide_co2_emissions_without_land_use_... |
| 2 | Australia | 2012 | 406462.847704 | carbon_dioxide_co2_emissions_without_land_use_... |
| 3 | Australia | 2011 | 403705.528314 | carbon_dioxide_co2_emissions_without_land_use_... |
| 4 | Australia | 2010 | 406200.993184 | carbon_dioxide_co2_emissions_without_land_use_... |
I found the global temperature data on NASA's website as a .txt file; it has the recorded "temperature anomaly" for each year from 1880 to 2020, where "temperature anomaly" is the temperature in Celsius relative to the average temperature from 1951-1980. It contains the exact values as well as "smoothed" values, but I will only deal with the exact data here. To import this data I'll use the requests library to get the data as a string, and then manipulate it using Python until I can put it into a dataframe.
Link to data: https://data.giss.nasa.gov/gistemp/graphs/graph_data/Global_Mean_Estimates_based_on_Land_and_Ocean_Data/graph.txt
Description of data: https://climate.nasa.gov/vital-signs/global-temperature/
# gets the text file from nasa.gov
web = req.get("https://data.giss.nasa.gov/gistemp/graphs/graph_data/Global_Mean_Estimates_based_on_Land_and_Ocean_Data/graph.txt")
# first 500 characters
web.text[:500]
# this is going to need a bit of cleaning up...
'Land-Ocean Temperature Index (C)\n--------------------------------\n\nYear No_Smoothing Lowess(5)\n----------------------------\n1880 -0.17 -0.09\n1881 -0.08 -0.13\n1882 -0.10 -0.16\n1883 -0.17 -0.20\n1884 -0.28 -0.23\n1885 -0.33 -0.26\n1886 -0.31 -0.27\n1887 -0.36 -0.27\n1888 -0.17 -0.26\n1889 -0.10 -0.25\n1890 -0.35 -0.25\n1891 -0.22 -0.25\n1892 -0.27 -0.26\n1893 -0.31 -0.26\n1894 -0.30 -0.24\n'
# makes an array of each line as a string, and removes junk from the beginning and end
arr = web.text.split("\n")[5:-1]
arr[:5]
['1880 -0.17 -0.09', '1881 -0.08 -0.13', '1882 -0.10 -0.16', '1883 -0.17 -0.20', '1884 -0.28 -0.23']
# makes a dataframe from a 2d matrix of the year and temp columns
temp_data = pd.DataFrame(list(map(lambda s: s.split()[:2], arr)))  # split() with no argument handles runs of spaces
temp_data.columns = ["year", "temp"]
temp_data["year"] = temp_data["year"].astype(int)  # makes consistent with ghg data
temp_data["temp"] = temp_data["temp"].astype(float)
temp_data.head()
| | year | temp |
|---|---|---|
| 0 | 1880 | -0.17 |
| 1 | 1881 | -0.08 |
| 2 | 1882 | -0.10 |
| 3 | 1883 | -0.17 |
| 4 | 1884 | -0.28 |
Now that the data is scraped and loaded into memory, we can begin exploring it to get a better understanding of what we're working with. To do this, I'll first organize the dataframes, and then make some preliminary plots to see how the data changes with time.
# prints all categories in ghg_data, will rename to make dataframe easier to read
for s in ghg_data.category.drop_duplicates().values:
print(s + "\n")
carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent
greenhouse_gas_ghgs_emissions_including_indirect_co2_without_lulucf_in_kilotonne_co2_equivalent
greenhouse_gas_ghgs_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent
hydrofluorocarbons_hfcs_emissions_in_kilotonne_co2_equivalent
methane_ch4_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent
nitrogen_trifluoride_nf3_emissions_in_kilotonne_co2_equivalent
nitrous_oxide_n2o_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent
perfluorocarbons_pfcs_emissions_in_kilotonne_co2_equivalent
sulphur_hexafluoride_sf6_emissions_in_kilotonne_co2_equivalent
unspecified_mix_of_hydrofluorocarbons_hfcs_and_perfluorocarbons_pfcs_emissions_in_kilotonne_co2_equivalent
# pivots table to give each category a column, takes the mean across all countries/areas
df = ghg_data.pivot_table(index="year", columns=["category"], aggfunc="mean")
df.columns = df.columns.droplevel()
df.columns = ["CO2", "GHGs", "GHGs_2", "HFCs", "CH4", "NF3", "N2O", "PFCs", "SF6", "HFCs_PFCs_mix"]
# GHGs_2 does not have indirect co2, so let's just drop it
df = df.drop("GHGs_2", axis=1)
# some data has HFCs and PFCs combined, so let's just combine them for the whole dataset
df["HFCs_PFCs_mix"] = df["HFCs_PFCs_mix"] + df["HFCs"] + df["PFCs"]
df = df.drop(["HFCs", "PFCs"], axis=1)
# adds a column for temp data
df = df.join(temp_data.set_index("year"), how="inner")
df.head()
| year | CO2 | GHGs | CH4 | NF3 | N2O | SF6 | HFCs_PFCs_mix | temp |
|---|---|---|---|---|---|---|---|---|
| 1990 | 455379.936166 | 633243.305620 | 87094.095299 | 18.383797 | 36194.407289 | 2074.533801 | 14357.312190 | 0.45 |
| 1991 | 446193.986253 | 618145.225307 | 84511.860775 | 18.871227 | 34748.195912 | 2055.211806 | 13363.035486 | 0.41 |
| 1992 | 431901.700232 | 597397.641955 | 81528.146297 | 19.408413 | 33609.735740 | 2059.102498 | 12162.126651 | 0.22 |
| 1993 | 426304.676894 | 588482.640738 | 79167.104710 | 23.278081 | 32976.558788 | 2075.474610 | 11839.852256 | 0.23 |
| 1994 | 421997.498173 | 580717.203385 | 77076.932886 | 25.988809 | 32253.689517 | 2029.666582 | 11960.795374 | 0.32 |
To get a dataframe I could easily use to make plots, I first pivoted the greenhouse gas data so each category has its own column, averaging across all countries for each year. I also got rid of the data for total greenhouse gas emissions without indirect CO2; I will only use the full total for my analysis in this tutorial. I noticed that some countries logged emissions for HFCs and PFCs separately but others didn't, so to simplify analysis I combined all of those emissions into one column called HFCs_PFCs_mix. Lastly, I added a column with the temperature data.
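The join that adds the temperature column uses how="inner", which is what trims the 1880-2020 temperature series down to 1990-2014: only years present in both frames survive. A toy sketch of that behavior, with made-up numbers:

```python
import pandas as pd

# made-up values, just to show inner-join behavior on the year index
emissions = pd.DataFrame({"year": [1990, 1991, 1992],
                          "GHGs": [633243.3, 618145.2, 597397.6]}).set_index("year")
temps = pd.DataFrame({"year": [1988, 1989, 1990, 1991, 1992, 1993],
                      "temp": [0.41, 0.27, 0.45, 0.41, 0.22, 0.23]}).set_index("year")

joined = emissions.join(temps, how="inner")  # keeps only the overlapping years
print(joined.index.tolist())
```

The 1988, 1989, and 1993 temperature rows disappear because the emissions frame has no matching years.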
The column names represent the categories as follows:

- GHGs: total greenhouse gas emissions
- CO2: carbon dioxide
- CH4: methane
- NF3: nitrogen trifluoride
- N2O: nitrous oxide
- SF6: sulfur hexafluoride
- HFCs_PFCs_mix: hydrofluorocarbons and perfluorocarbons (perfluorocarbons can also simply be called "fluorocarbons")
- temp: NASA temperature data

Now, with the dataframe organized, we can make some preliminary plots.
# plots all columns in df except temp
df.drop(["temp"], axis=1).plot(figsize=(10, 7))
plt.xlabel("Year")
plt.ylabel("Kiloton of CO2 equivalent")
plt.title("Greenhouse Gas Emissions in UN Member Nations Over Time")
plt.legend(bbox_to_anchor=(1,1), loc="upper left");
The plot above has a line for each column in the dataframe (except temp, which has different units). This plot shows that total greenhouse gas emissions (GHGs) are dominated by carbon dioxide (CO2); it has by far the highest emissions by mass of all the greenhouse gasses shown here. It's hard to see whether there are trends in the other greenhouse gas emissions over time because the y-axis is scaled for CO2 emissions. To fix this, I'll make a separate plot for each of the greenhouse gasses.
I predict that when I make those plots, the total greenhouse gas plot and CO2 plot will look very similar, like they do here, because most of the total greenhouse gas emissions are CO2 emissions.
# plots all columns in df except temp, separately this time
fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(20, 15))
fig.delaxes(ax[0][1])
fig.delaxes(ax[0][2])
df["GHGs"].plot(ax=ax[0][0], title="Total Greenhouse Gas Emissions in UN members, 1990-2014", color="orange")
df["CO2"].plot(ax=ax[1][0], title="Carbon Dioxide Emissions in UN members, 1990-2014")
df["CH4"].plot(ax=ax[1][1], title="Methane Emissions in UN members, 1990-2014")
df["NF3"].plot(ax=ax[1][2], title="Nitrogen Trifluoride Emissions in UN members, 1990-2014")
df["N2O"].plot(ax=ax[2][0], title="Nitrous Oxide Emissions in UN members, 1990-2014")
df["SF6"].plot(ax=ax[2][1], title="Sulfur Hexafluoride Emissions in UN members, 1990-2014")
df["HFCs_PFCs_mix"].plot(ax=ax[2][2], title="(Hydro)fluorocarbon Emissions in UN members, 1990-2014")
plt.setp(ax, xlabel="Year")
plt.setp(ax, ylabel="GHG Emissions (Kiloton of CO2 equivalent)");
I created one plot for each of the six greenhouse gas categories, plus one for the total greenhouse gas emissions. The y-axes show the scale of each type of emission. The largest, as I noted earlier, is carbon dioxide (CO2), followed by methane (CH4), nitrous oxide (N2O), hydrofluorocarbons and (per)fluorocarbons (HFCs/PFCs), sulfur hexafluoride (SF6), and finally nitrogen trifluoride (NF3).
We can also see from the plots a general trend in most of the categories. For example, methane, nitrous oxide, and sulfur hexafluoride all appear to have clear downward trends with time, although I can't tell whether the relationships are linear. My prediction that the greenhouse gas and carbon dioxide plots would look similar holds mostly true, although the right-hand side appears to be a little lower in the GHGs plot than in the CO2 one. This, I believe, is because many of the greenhouse gasses other than CO2 have downward trends with time, including the second largest contributor: methane.
Now I'll plot the temperature data to see if there are any obvious trends there. My prediction here is that the temperature will be shown increasing with time, based on what I've heard about global warming.
# plots temp data vs year
df.reset_index().plot(x="year", y="temp", kind="scatter", title="Global Temperatures, 1990-2014", \
color="purple", figsize=(10, 7))
plt.xlabel("Year")
plt.ylabel("Temperature Anomaly (C)");
This scatter plot has a point for each year from 1990-2014 representing the global temperature anomaly in that year. It looks to have a linear upward trend with time, strong enough that I'd expect a formal test to reject the null hypothesis of no correlation between temperature and year. To make this trend even clearer, I'll add a linear regression line to the plot using a library called scikit-learn.
# plots temp data vs year
df.reset_index().plot(x="year", y="temp", kind="scatter", title="Global Temperatures, 1990-2014", \
color="purple", figsize=(10, 7))
plt.xlabel("Year")
plt.ylabel("Temperature Anomaly (C)")
# makes linear regression
X = df.index.values.reshape(-1, 1)
y = df.temp
reg = LinearRegression().fit(X, y)
# dataframe with model's predictions
reg_df = pd.DataFrame(reg.predict(X)).set_index(df.index.values)
# plots linear regression
plt.plot(reg_df, color="red");
In my opinion, this linear regression matches the data pretty well. Scikit-learn fits it by least squares, finding the slope and intercept that minimize the sum of squared differences between the observed data and the line's predictions.
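To see what least squares does under the hood, here is a sketch on noise-free toy data; the slope of 0.02 is an arbitrary made-up trend, not a result from the real data. On a perfect line, the fit recovers the true slope and intercept exactly (up to floating point).

```python
import numpy as np

years = np.arange(1990, 2015, dtype=float)
temps = 0.02 * (years - 1990) + 0.2   # an idealized, noise-free linear trend

# ordinary least squares fit of a degree-1 polynomial (a line)
slope, intercept = np.polyfit(years, temps, deg=1)
print(round(slope, 6))
```

With noisy real data the fit won't be exact, but it is still the line with the smallest possible sum of squared residuals.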
After performing exploratory data analysis and getting a solid understanding of the data we're looking at, we can move into deeper analysis.
I want to see if there is a correlation between greenhouse gas emissions and global temperature, but I don't know what that relationship would be. Do greater emissions make greater temperatures for that year? It might make more sense for it to be a relationship with the derivative of temperature, however, because more greenhouse gasses in the atmosphere make greater temperatures on Earth, and emissions represent a change in the amount of greenhouse gasses in the atmosphere. It might even be a relationship with the second derivative of emissions for all I know, because of how the atmosphere changes surface temperatures.
To start, I'll just try plotting temperature vs. greenhouse gas emissions in each year.
# plots data
ax = df.plot(x="GHGs", y="temp", kind="scatter", title="Temperature vs. Greenhouse Gas Emissions", \
color="blue", figsize=(10, 7))
plt.xlabel("Greenhouse Gas Emissions (Kiloton of CO2 equivalent)")
plt.ylabel("Temperature Anomaly (C)")
plt.xticks(rotation = 45)
# plots linear regression
X = df.GHGs.values.reshape(-1, 1)
y = df.temp
reg = LinearRegression().fit(X, y)
reg_df = pd.DataFrame(reg.predict(X)).set_index(df.GHGs.values)
plt.plot(reg_df, color="orange");
This data does not appear to have any meaningful trend, but that's okay.
Now let's try the first derivative. I will approximate the "derivative" using the diff function on a Pandas dataframe, which replaces each entry with the difference between it and the entry "before it", which is by default the previous entry in the index.
# new dataframe with derivative of temp data
diff_df = df.copy()
diff_df["temp"] = diff_df["temp"].diff()
diff_df.head()
| year | CO2 | GHGs | CH4 | NF3 | N2O | SF6 | HFCs_PFCs_mix | temp |
|---|---|---|---|---|---|---|---|---|
| 1990 | 455379.936166 | 633243.305620 | 87094.095299 | 18.383797 | 36194.407289 | 2074.533801 | 14357.312190 | NaN |
| 1991 | 446193.986253 | 618145.225307 | 84511.860775 | 18.871227 | 34748.195912 | 2055.211806 | 13363.035486 | -0.04 |
| 1992 | 431901.700232 | 597397.641955 | 81528.146297 | 19.408413 | 33609.735740 | 2059.102498 | 12162.126651 | -0.19 |
| 1993 | 426304.676894 | 588482.640738 | 79167.104710 | 23.278081 | 32976.558788 | 2075.474610 | 11839.852256 | 0.01 |
| 1994 | 421997.498173 | 580717.203385 | 77076.932886 | 25.988809 | 32253.689517 | 2029.666582 | 11960.795374 | 0.09 |
Because the first row (1990) doesn't have a row before it, the change in temperature is NaN, so let's just remove that row.
# drops the row with NaN
diff_df.drop(1990, inplace=True)
# plots data
diff_df.plot(x="GHGs", y="temp", kind="scatter", \
title="Change in Temperature vs. Greenhouse Gas Emissions", color="blue", figsize=(10, 7))
plt.xlabel("Greenhouse Gas Emissions (Kiloton of CO2 equivalent)")
plt.ylabel("Temperature Change (C)")
plt.xticks(rotation = 45)
# plots linear regression
X = diff_df.GHGs.values.reshape(-1, 1)
y = diff_df.temp
reg = LinearRegression().fit(X, y)
reg_df = pd.DataFrame(reg.predict(X)).set_index(diff_df.GHGs.values)
plt.plot(reg_df, color="orange");
Again, there appears to be no meaningful trend in the data. Now let's try the second derivative of temperature.
# dataframe with second derivative of temp
diff2_df = diff_df.copy()
diff2_df["temp"] = diff2_df["temp"].diff()
diff2_df.dropna(inplace=True)
# plots data
diff2_df.plot(x="GHGs", y="temp", kind="scatter", \
title="Second Order Change in Temperature vs. Greenhouse Gas Emissions", \
color="blue", figsize=(10, 7))
plt.xlabel("Greenhouse Gas Emissions (Kiloton of CO2 equivalent)")
plt.ylabel("Second Order Temperature Change (C)")
plt.xticks(rotation = 45)
# plots linear regression
X = diff2_df.GHGs.values.reshape(-1, 1)
y = diff2_df.temp
reg = LinearRegression().fit(X, y)
reg_df = pd.DataFrame(reg.predict(X)).set_index(diff2_df.GHGs.values)
plt.plot(reg_df, color="orange");
Okay, it seems like we are not going to find the relationship I was expecting. The best-fit lines suggest all three plots trend slightly downward, but given the spread of the data points I don't think the evidence is strong enough to reject the null hypothesis of no relationship. My prediction was based on what I know in general about greenhouse gas emissions leading to increased temperatures in the long term. The fact that this relationship cannot be replicated here means I assumed something wrong: either there is no relationship between greenhouse gas emissions and temperature, or, more likely, one of my other assumptions doesn't hold. Perhaps this window of time (1990-2014) is too small a sample, or maybe the greenhouse gas emission data I used, which only covers UN member nations, is not as representative of global emissions as I thought.
What I'm going to try now is to throw the data from every greenhouse gas category at a scikit-learn linear regression model with temperature as the target, and see if I get any meaningful result. Hopefully the model will work, and by looking at it I'll be able to understand why it works.
# creates the linear regression model
X = df.reset_index(drop=True).drop(["GHGs", "temp"], axis=1)
y = df["temp"]
reg = LinearRegression().fit(X, y)
To test how well it works, I'll calculate the residuals, which are the differences between each actual data point and what the model predicts it should be. So in this case, for each year, I'll subtract the predicted temperature anomaly from the actual one.
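In NumPy terms, the residual computation is one subtraction. The numbers below are made up for illustration, not the model's actual predictions:

```python
import numpy as np

observed  = np.array([0.45, 0.41, 0.22])   # actual temperature anomalies
predicted = np.array([0.44, 0.37, 0.30])   # hypothetical model outputs

residuals = observed - predicted   # positive means the model under-predicted that year
print(residuals.round(2))
```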
# creates dataframe with the model's predictions for 1990-2014
reg_df = pd.DataFrame(reg.predict(X))
reg_df.columns = ["model"]
model_df = reg_df.set_index(df.index).join(df["temp"])
model_df["residual"] = model_df["temp"] - model_df["model"]
model_df.head()
| year | model | temp | residual |
|---|---|---|---|
| 1990 | 0.437675 | 0.45 | 0.012325 |
| 1991 | 0.365398 | 0.41 | 0.044602 |
| 1992 | 0.300855 | 0.22 | -0.080855 |
| 1993 | 0.271325 | 0.23 | -0.041325 |
| 1994 | 0.267307 | 0.32 | 0.052693 |
Now I'll make a boxplot to get some info about the distribution of residuals.
# makes the boxplot
plt.figure(figsize=(10, 7))
plt.boxplot(model_df["residual"], vert=False, showmeans=True)
plt.title("Boxplot of Residuals")
plt.xlabel("Residual (degrees C)")
plt.tick_params(left=False, labelleft=False)
This boxplot shows that the model is a pretty good predictor of temperature, but it's not perfect. There are no outliers, and no residuals with magnitude much greater than 0.1 degrees C. However, the median is a little off from zero.
# prints the coefficients of the model
coefs = pd.DataFrame(reg.coef_).set_index(X.columns)
coefs.columns = ["coefficient"]
coefs
| | coefficient |
|---|---|
| CO2 | -6.737207e-08 |
| CH4 | -7.161686e-06 |
| NF3 | 4.014544e-04 |
| N2O | 5.707049e-05 |
| SF6 | -4.109046e-04 |
| HFCs_PFCs_mix | 1.708625e-05 |
These coefficients show the change in temperature in degrees Celsius for each kiloton of CO2 equivalent of emissions of that greenhouse gas. It makes sense that they are all very small, because we are dealing with large quantities of greenhouse gasses (total emissions are on the scale of 10^5 kt CO2 equivalent), while temperature only changes by a fraction of a degree Celsius.
However, I don't think the rest of this model makes much sense. It says that some greenhouse gasses, such as CO2 and CH4, have an inverse relationship with temperature, while others like NF3 and N2O have a direct relationship. This doesn't make sense to me logically, because I have been led to believe all greenhouse gasses in the atmosphere should have similar effects on temperature, which is why they are all known as "greenhouse gasses".
So, unfortunately, this model has turned out to be a bit of a nothing burger in terms of learning more about the data. Before we move on, though, I'll plot the model and the actual data on the same axes just to see how similar its predictions are visually.
model_df.plot(y=["temp", "model"], style=["o", "rx"], title="Observed Temperature Data and Model, 1990-2014", \
figsize=(10, 7));
plt.xlabel("Year")
plt.ylabel("Temperature Anomaly (C)");
Judging by this plot, the model does seem to function as a pretty good predictor of temperature over this time span. However, there does appear to be a bit of "overfitting" between 1990 and 1995, where the predicted temperature decreases every year. Overfitting means the model is fitting to the noise in the data, so it may produce results very similar to the observed data while failing to represent the actual underlying trends.
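Overfitting is easy to demonstrate on toy data: a degree-9 polynomial pushed through 10 noisy points drives the training error to essentially zero, but only by chasing the noise, while the straight-line fit leaves larger residuals. The data here is synthetic, generated with a fixed random seed.

```python
import numpy as np

rng = np.random.default_rng(0)
x = np.arange(10, dtype=float)
y = 0.5 * x + rng.normal(0, 0.3, size=x.size)  # a linear trend plus noise

rss = {}
for deg in (1, 9):
    coefs = np.polyfit(x, y, deg)                       # least-squares polynomial fit
    rss[deg] = np.sum((y - np.polyval(coefs, x)) ** 2)  # training residual sum of squares
    print(deg, round(rss[deg], 6))
```

The degree-9 fit "wins" on training error, but its wild wiggles between the points would make its predictions for new years much worse than the line's.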
The linear regression of temperature vs. year from the exploratory data analysis stage is in my opinion a better model of temperature over time, so let's explore that more. Is a line the best model of that data, or is there some other function that could better represent it? Let's try a second degree polynomial.
X = df.index.values.reshape(-1, 1)
y = df.temp
# creates polynomial regression
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X)
lin_reg = LinearRegression()
lin_reg.fit(X_poly, y)
# evenly spaced grid of years so the fitted curve plots smoothly
X_grid = np.arange(min(X), max(X), 0.1).reshape(-1, 1)
# plots temp data
plt.figure(figsize=(10, 7))
plt.scatter(X,y, color='purple')
# plots polynomial regression model
plt.plot(X_grid, lin_reg.predict(poly_reg.fit_transform(X_grid)),color="red")
plt.title("Global Temperatures With Polynomial Regression, 1990-2014")
plt.xlabel("Year")
plt.ylabel("Temperature Anomaly (C)");
The second-degree polynomial regression of temperature data with time actually appears almost linear. This indicates to me that the trend in the data is pretty close to linear, and using anything other than a first-order linear regression might just lead to overfitting.
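One way to check that intuition numerically is to fit degree 1 and degree 2 to the same series and look at the quadratic coefficient. On an idealized, exactly linear series (the 0.02 slope is made up, not the real anomalies), the quadratic term comes out essentially zero, meaning the extra degree adds nothing. Centering the years first keeps the fit numerically well-behaved.

```python
import numpy as np

years = np.arange(1990, 2015, dtype=float)
temps = 0.02 * (years - 1990) + 0.2   # idealized, exactly linear "anomalies"

x = years - years.mean()              # center the years to improve conditioning
c2, c1, c0 = np.polyfit(x, temps, deg=2)
print(abs(c2) < 1e-8, round(c1, 6))   # quadratic term vanishes; the slope survives
```

On the real, noisy anomalies the quadratic coefficient wouldn't be exactly zero, but the same comparison shows whether it is large enough to matter.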
After all of this analysis, unfortunately, I could not find a clear relationship between greenhouse gas emissions and temperature within this data. Total greenhouse gas emissions among UN member nations for the most part decreased from 1990-2014, whereas global temperatures increased. That gives the impression of an inverse correlation between emissions and temperatures, which doesn't make sense given what I know about global warming. I thought the underlying relationship could actually be between greenhouse gas emissions and the change in global temperatures, but I could not find that trend in the data either. I concluded this dataset may be too small a sample, or not representative enough of global emissions.
What I did find, however, is a clear linear direct relationship between temperature and time from 1990-2014. Even when using a second-degree polynomial regression, the relationship seemed close to linear, so I decided the linear model was best.
Thanks for taking the time to read this tutorial. Hopefully, after following me through the data science pipeline, you better understand how to glean insight from data, and can use this process to discover interesting things on your own in the future!