import sqlite3 as sql
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# Two alternative pandas display configurations. Executed top to bottom, the
# second pair wins, so frames are truncated to 10 rows and 5 columns.

# toggle off the display limits (show every row and every column)
pd.options.display.max_rows = None
pd.options.display.max_columns = None


# toggle on the display limits (10 rows, 5 columns)
pd.options.display.max_rows = 10
pd.options.display.max_columns = 5


# SQLite database file, resolved relative to the current working directory
sqlite_file = "lahman2014.sqlite"
conn = sql.connect(sqlite_file)


# sql code to retrieve the data of interest from the database:
# one row per team and year with wins, games, win percentage and payroll.
# Payroll is the sum of every salary recorded for that team in that year; the
# INNER JOIN keeps only team-years that have matching salary rows.
teams_query = """
                SELECT 
                    A.yearID, 
                    A.teamID, 
                    franchID,   
                    W, 
                    G, 
                    100*CAST(W AS float)/CAST(G AS float) AS winPercent,
                    payroll
                FROM 
                    Teams A
                INNER JOIN 
                    (SELECT yearID, teamID, SUM(salary) AS payroll FROM Salaries GROUP BY yearID, teamID) B 
                ON
                    A.yearID = B.yearID AND A.teamId = B.teamId
            """

# runs the sql code and stores the data in a dataframe; this is the only read
# from the database, so the connection is released afterwards even if the
# query fails
try:
    teams = pd.read_sql(teams_query, conn)
finally:
    conn.close()


teams


# organizes dataframe for analyzing payrolls: keeps only the year, team and
# payroll columns and indexes the rows by year.
# Columns are selected by name instead of drop(labels, 1): the positional
# axis argument of DataFrame.drop is no longer accepted by pandas >= 2.0.
payrolls = teams[["yearID", "teamID", "payroll"]].set_index("yearID")

# removes the years before 1990 (1985-1989 in this data); we're only looking
# at 1990-2014. A boolean mask does not raise if one of those years is absent.
payrolls = payrolls[payrolls.index >= 1990]


payrolls


# "unmelts" the long table: one column per team, rows still indexed by yearID.
# Because no value column is named, the result has two column levels
# ("payroll" on top, teamID below).
payrolls_team_cols = payrolls.pivot(columns="teamID")


payrolls_team_cols


# strips the outer "payroll" column level so the plot legend shows only team IDs
payrolls_team_cols = payrolls_team_cols.droplevel(0, axis="columns")


# draws one payroll line per team on a single wide axes
fig, axes = plt.subplots(figsize=(14, 7))
payrolls_team_cols.plot(
    ax=axes,
    title="Payroll vs. Time for each MLB Team from 1990-2014",
)
axes.set_xlabel("Year")
axes.set_ylabel("Payroll (USD)")

# borderless, three-column legend in the upper-left corner
_ = axes.legend(ncol=3, loc="upper left", frameon=False)


# creates a groupby object holding the league's team payrolls for each year;
# it can be used to find the mean, std deviation, etc. of payrolls per year.
# Only the numeric "payroll" column is selected: aggregating the string teamID
# column as well raises a TypeError on pandas >= 2.0 (older versions silently
# dropped it). The double brackets keep the aggregates as one-column dataframes.
payrolls_by_year = payrolls.groupby("yearID")[["payroll"]]


# plots the yearly average (left) and standard deviation (right) side by side
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
payrolls_by_year.mean().plot(ax=axes[0], legend=False, ylabel="Average Payroll (USD)", 
                     title="Average Payrolls of MLB Teams from 1990-2014")
payrolls_by_year.std().plot(ax=axes[1], legend=False, ylabel="Std. Deviation for Payrolls (USD)",
                    title="Std. Deviation for Payrolls of MLB Teams from 1990-2014")
# "_ =" keeps the return value of setp out of the output
_ = plt.setp(axes, xlabel="Year")


# puts each row of the teams dataframe in a five-year bin based on its year.
# Bins are closed on the left (right=False): "A" is 1990-1994, "B" 1995-1999,
# "C" 2000-2004, "D" 2005-2009, "E" 2010-2014. Years before 1990 fall outside
# every bin and get NaN.
# The year column is addressed by name rather than by position so the code
# does not depend on the column order of the SELECT.
years = teams["yearID"].rename("bin")
bins = pd.cut(x=years, bins=[1990, 1995, 2000, 2005, 2010, 2015], labels=["A", "B", "C", "D", "E"], right=False)

# creates new dataframe with a bin column and select columns from the teams dataframe
# (selected by name; drop(labels, 1) is rejected by pandas >= 2.0)
binned = teams[["teamID", "winPercent", "payroll"]].join(bins)


binned


# averages win percentage and payroll for every (bin, team) pair.
# "bin" is categorical, so observed=False produces a row for every bin/team
# combination, including ones with no data. It is spelled out because relying
# on the implicit default is deprecated in recent pandas releases.
means = binned.groupby(by=["bin", "teamID"], observed=False).mean()


means

# some bins have NaN because the teams don't exist in that time period, 
# for example the Diamondbacks (ARI) weren't founded until 1998, so they're NaN in bin A (1990-1994)


# removes the bin/team combinations that have no data
means = means.dropna()


# bin label -> year range shown in the subplot title, in plotting order
periods = [
    ("A", "1990-1994"),
    ("B", "1995-1999"),
    ("C", "2000-2004"),
    ("D", "2005-2009"),
    ("E", "2010-2014"),
]
# teams annotated on every subplot: teamID in the data -> text drawn on the plot
# (the Oakland A's keep their ID; the New York Yankees' ID "NYA" is shown as "NYY")
highlighted = [("OAK", "OAK"), ("NYA", "NYY")]

# creates 5 subplots: a 2x3 grid with the unused sixth panel removed
fig_p4, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
fig_p4.delaxes(axes[1][2])

# plots the data, one panel per five-year period (axes.flat walks the grid row
# by row, so zip pairs the five periods with the first five panels)
for (bin_label, span), ax in zip(periods, axes.flat):
    period_means = means.loc[bin_label]
    period_means.plot.scatter(x="payroll", y="winPercent", ax=ax,
                              title=f"Winning Percentage vs. Payroll ({span})")
    for team_id, text in highlighted:
        # a team with no data in this period is skipped instead of raising KeyError
        if team_id in period_means.index:
            ax.text(period_means.at[team_id, "payroll"], period_means.at[team_id, "winPercent"], text)

plt.setp(axes, xlabel="Average Annual Payroll (USD)")
# added "_ =" for clean output
_ = plt.setp(axes, ylabel="Average Winning Percentage")


# working copy holding just the year and payroll of every row
teams_temp = teams[["yearID", "payroll"]].copy()

# adds columns for the mean and standard deviation payrolls in the row's yearID to each row.
# Only the payroll column is aggregated: calling .mean()/.std() on the whole
# grouped frame also hits the string columns, which raises a TypeError on
# pandas >= 2.0. transform() broadcasts each year's statistic back onto that
# year's rows, so no join is needed.
payroll_by_year = teams_temp.groupby("yearID")["payroll"]
teams_temp["payrollMean"] = payroll_by_year.transform("mean")
teams_temp["payrollStd"] = payroll_by_year.transform("std")


# adds a column that contains the row's payroll, mean, and std as a tuple
teams_temp["asTuple"] = list(zip(teams_temp.payroll, teams_temp.payrollMean, teams_temp.payrollStd))


# helper that turns one (payroll, mean, std) triple into a standardized payroll
def standardize(tup):
    """Return the standardized value of a ``(payroll, avg, std)`` triple.

    The triple is unpacked into exactly three items and the result is
    ``(payroll - avg) / std``: the distance of the payroll from the average,
    measured in standard deviations.
    """
    value, center, spread = tup
    deviation = value - center
    return deviation / spread


# standardizes every payroll against the mean and std of its own year and adds
# the result to the teams dataframe. Computed as one vectorized expression on
# the teams_temp columns (which share the teams index) instead of a Python
# call per row; a zero std gives inf/NaN rather than a ZeroDivisionError.
teams["standardizedPayroll"] = (teams_temp.payroll - teams_temp.payrollMean) / teams_temp.payrollStd


teams


# can reuse the "bins" series from before

# creates new dataframe with a bin column and select columns from the teams dataframe
# (selected by name; drop(labels, 1) is rejected by pandas >= 2.0)
binned = teams[["teamID", "winPercent", "standardizedPayroll"]].join(bins)

# averages per (bin, team) pair; observed=False is spelled out because relying
# on the implicit default for categorical groupers is deprecated in recent
# pandas releases. Bin/team combinations with no data are then removed.
means = binned.groupby(by=["bin", "teamID"], observed=False).mean()
means = means.dropna()


means


# bin label -> year range shown in the subplot title, in plotting order
periods = [
    ("A", "1990-1994"),
    ("B", "1995-1999"),
    ("C", "2000-2004"),
    ("D", "2005-2009"),
    ("E", "2010-2014"),
]
# teams annotated on every subplot: teamID in the data -> text drawn on the plot
# (the Oakland A's keep their ID; the New York Yankees' ID "NYA" is shown as "NYY")
highlighted = [("OAK", "OAK"), ("NYA", "NYY")]

# creates 5 subplots: a 2x3 grid with the unused sixth panel removed
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
fig.delaxes(axes[1][2])

# plots the data, one panel per five-year period (axes.flat walks the grid row
# by row, so zip pairs the five periods with the first five panels)
for (bin_label, span), ax in zip(periods, axes.flat):
    period_means = means.loc[bin_label]
    period_means.plot.scatter(x="standardizedPayroll", y="winPercent", ax=ax,
                              title=f"Winning Percentage vs. Std. Payroll ({span})")
    for team_id, text in highlighted:
        # a team with no data in this period is skipped instead of raising KeyError
        if team_id in period_means.index:
            ax.text(period_means.at[team_id, "standardizedPayroll"], period_means.at[team_id, "winPercent"], text)

plt.setp(axes, xlabel="Average Annual Standardized Payroll")
# added "_ =" for clean output
_ = plt.setp(axes, ylabel="Average Winning Percentage")


# figure from problem 4 for comparison: re-displays the raw-payroll scatter
# grid (fig_p4) next to the standardized-payroll grid drawn just before
fig_p4


fig, axes = plt.subplots(figsize=(14, 7))

# scatter of every team-season: standardized payroll against win percentage
teams.plot.scatter(
    x="standardizedPayroll",
    y="winPercent",
    ax=axes,
    title="Winning Percentage vs. Std. Payroll for MLB teams from 1985-2014",
)
axes.set(xlabel="Standardized Payroll", ylabel="Winning Percentage")

# degree-1 least-squares fit; `fit` holds the coefficients and `fun` is the
# callable polynomial built from them
fit = np.polyfit(teams.standardizedPayroll, teams.winPercent, 1)
fun = np.poly1d(fit)

# evaluates the fitted line at every observed x value and draws it in red
x = teams.standardizedPayroll
y = fun(x)
_ = axes.plot(x, y, color="r")


# expected win percentage for every team-season, using the equation given in
# the project description: 50 + 2.5 * standardized payroll
ex_win_pcts = 50 + 2.5 * teams.standardizedPayroll


# pairs each actual win percent with its expected win percent, one tuple per row
win_pct_pairs = list(zip(teams.winPercent, ex_win_pcts))
tups = pd.Series(win_pct_pairs)


def get_efficiency(tup):
    """Return actual minus expected win percentage for one pair.

    ``tup`` is unpacked into exactly two items, ``(win_pct, ex_win_pct)``.
    A positive result means the actual value exceeded the expected one.
    """
    actual, expected = tup
    return actual - expected


# adds column for efficiency (actual minus expected win percent) to teams dataframe.
# Both operands carry the teams index, so the subtraction lines up row by row
# whatever that index looks like; assigning a series with a freshly numbered
# index would only line up while teams keeps its default 0..n-1 index.
teams["efficiency"] = teams.winPercent - ex_win_pcts


teams


# long form first: one row per team-season, indexed by yearID
efficiency_long = teams[["yearID", "teamID", "efficiency"]].set_index("yearID")

# wide form: yearID stays the index and each team's efficiency becomes a column
# (with an outer "efficiency" column level, since no value column is named)
efficiencies = efficiency_long.pivot(columns="teamID")


efficiencies


# strips the outer "efficiency" column level so the plot legend shows only team IDs
efficiencies = efficiencies.droplevel(0, axis="columns")


fig, axes = plt.subplots(figsize=(14, 7))

# plots the yearly spending efficiency of the four teams being compared
efficiencies[["OAK", "NYA", "BOS", "DET"]].plot(ax=axes, title="Spending Efficiency vs. Time for MLB Teams")
# y label spelling corrected (was "Spedning")
plt.setp(axes, xlabel="Year", ylabel="Spending Efficiency")

# customizes legend: borderless, single column, upper-left corner
_ = axes.legend(frameon=False, loc='upper left', ncol=1)


fig, axes = plt.subplots(figsize=(14, 7))

# plots the rolling average of the data: each point is the mean of a 5-row
# window, so the first four rows of every team have no value
efficiencies[["OAK", "NYA", "BOS", "DET"]].rolling(5).mean().plot(ax=axes, title="Spending Efficiency vs. Time for MLB Teams (5 Year Rolling Average)")
# y label spelling corrected (was "Spedning")
plt.setp(axes, xlabel="Year", ylabel="Spending Efficiency (5 year Rolling Average)")

# customizes legend: borderless, single column, upper-left corner
_ = axes.legend(frameon=False, loc='upper left', ncol=1)

	yearID	teamID	...	winPercent	payroll
0	1985	ATL	...	40.740741	14807000.0
1	1985	BAL	...	51.552795	11560712.0
2	1985	BOS	...	49.693252	10897560.0
3	1985	CAL	...	55.555556	14427894.0
4	1985	CHA	...	52.147239	9846178.0
...	...	...	...	...	...
853	2014	SLN	...	55.555556	120693000.0
854	2014	TBA	...	47.530864	72689100.0
855	2014	TEX	...	41.358025	112255059.0
856	2014	TOR	...	51.234568	109920100.0
857	2014	WAS	...	59.259259	131983680.0

	teamID	payroll
yearID
1990	ATL	14555501.0
1990	BAL	9680084.0
1990	BOS	20558333.0
1990	CAL	21720000.0
1990	CHA	9491500.0
...	...	...
2014	SLN	120693000.0
2014	TBA	72689100.0
2014	TEX	112255059.0
2014	TOR	109920100.0
2014	WAS	131983680.0

	payroll
teamID	ANA	ARI	...	TOR	WAS
yearID
1990	NaN	NaN	...	17756834.0	NaN
1991	NaN	NaN	...	19902417.0	NaN
1992	NaN	NaN	...	44788666.0	NaN
1993	NaN	NaN	...	47279166.0	NaN
1994	NaN	NaN	...	43433668.0	NaN
...	...	...	...	...	...
2010	NaN	60718166.0	...	62234000.0	61400000.0
2011	NaN	53639833.0	...	62567800.0	63856928.0
2012	NaN	73804833.0	...	75009200.0	80855143.0
2013	NaN	90132000.0	...	126288100.0	113703270.0
2014	NaN	97861500.0	...	109920100.0	131983680.0

	teamID	winPercent	payroll	bin
0	ATL	40.740741	14807000.0	NaN
1	BAL	51.552795	11560712.0	NaN
2	BOS	49.693252	10897560.0	NaN
3	CAL	55.555556	14427894.0	NaN
4	CHA	52.147239	9846178.0	NaN
...	...	...	...	...
853	SLN	55.555556	120693000.0	E
854	TBA	47.530864	72689100.0	E
855	TEX	41.358025	112255059.0	E
856	TOR	51.234568	109920100.0	E
857	WAS	59.259259	131983680.0	E

		winPercent	payroll
bin	teamID
A	ANA	NaN	NaN
	ARI	NaN	NaN
	ATL	56.497726	31721852.8
	BAL	50.444080	23785204.0
	BOS	49.514761	34863216.8
...	...	...	...
E	SLN	55.679012	104445659.0
	TBA	54.992047	60558982.8
	TEX	53.881694	98567688.2
	TOR	48.888889	87203840.0
	WAS	53.024308	90359804.2

Part 1: Wrangling

Problem 1

Part 2: Exploratory Data Analysis

Payroll Distribution

Problem 2

Question 1

Problem 3

Correlation between payroll and winning percentage

Problem 4

Question 2

Part 3: Data transformations

Standardizing Across Years

Problem 5

Problem 6

Question 3

Expected Wins

Problem 7

Spending Efficiency

Problem 8

Question 4

	yearID	teamID	...	payroll	standardizedPayroll
0	1985	ATL	...	14807000.0	1.914905
1	1985	BAL	...	11560712.0	0.601068
2	1985	BOS	...	10897560.0	0.332678
3	1985	CAL	...	14427894.0	1.761474
4	1985	CHA	...	9846178.0	-0.092838
...	...	...	...	...	...
853	2014	SLN	...	120693000.0	0.457126
854	2014	TBA	...	72689100.0	-0.593171
855	2014	TEX	...	112255059.0	0.272509
856	2014	TOR	...	109920100.0	0.221422
857	2014	WAS	...	131983680.0	0.704160

		winPercent	standardizedPayroll
bin	teamID
A	ATL	56.497726	0.381441
	BAL	50.444080	-0.658533
	BOS	49.514761	1.014231
	CAL	45.704777	0.338304
	CHA	56.426308	-0.309019
...	...	...	...
E	SLN	55.679012	0.199817
	TBA	54.992047	-0.850728
	TEX	53.881694	0.036064
	TOR	48.888889	-0.274002
	WAS	53.024308	-0.195302

	yearID	teamID	...	standardizedPayroll	efficiency
0	1985	ATL	...	1.914905	-14.046522
1	1985	BAL	...	0.601068	0.050124
2	1985	BOS	...	0.332678	-1.138442
3	1985	CAL	...	1.761474	1.151872
4	1985	CHA	...	-0.092838	2.379333
...	...	...	...	...	...
853	2014	SLN	...	0.457126	4.412740
854	2014	TBA	...	-0.593171	-0.986208
855	2014	TEX	...	0.272509	-9.323248
856	2014	TOR	...	0.221422	0.681014
857	2014	WAS	...	0.704160	7.498860