/
pga_tour_sum_funs.py
221 lines (156 loc) · 6.4 KB
/
pga_tour_sum_funs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 7 10:32:10 2021
@author: BaillieD
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from os import getcwd
import os
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
import plotly.io as pio; pio.renderers.default='notebook'
import pga_tour_sum_funs# DB: Custom functions to reduce size of notebook
from IPython.display import Markdown as md
from sklearn.linear_model import LinearRegression
def load_data(year):
    """Load the hole-level and event-level data files for one season.

    Reads ``hole-<year>.TXT`` and ``event-<year>.TXT`` relative to the
    current working directory. Both files are parsed as semicolon-separated
    text with latin-1 encoding.

    Args:
        year: Season identifier; it is converted to text and inserted into
            both file names.

    Returns:
        A ``(df_hole, df_event)`` tuple of DataFrames. ``df_event`` gains a
        ``'Player Number'`` column that is a copy of the raw
        ``' Player Number'`` column (the raw header has a leading space).

    Raises:
        FileNotFoundError: If either data file is missing.
        KeyError: If the event file has no ``' Player Number'`` column.
    """
    df_hole = pd.read_csv('hole-{}.TXT'.format(year), sep=';',
                          encoding='latin-1')
    df_event = pd.read_csv('event-{}.TXT'.format(year), sep=';',
                           encoding='latin-1')

    # Expose the player id under a header without the stray leading space so
    # downstream code can use a clean column name.
    df_event['Player Number'] = df_event[' Player Number']

    # NOTE(review): earlier revisions built an "official events only" mask
    # here but never applied it; rows are intentionally returned unfiltered.
    return df_hole, df_event
def get_measured_data(df, min_drives):
    """Select measured drives and average them per player.

    A row counts as a measured drive when ``'DrivingDistMeasuredFlag'`` is
    ``'Y'`` and ``'DrivingDistance_rounded_'`` is greater than zero.

    Args:
        df: Hole-level DataFrame containing at least the columns
            ``'Player_'``, ``'Tour'``, ``'DrivingDistMeasuredFlag'`` and
            ``'DrivingDistance_rounded_'``.
        min_drives: Minimum number of measured drives a player needs to be
            kept in the per-player averages.

    Returns:
        A ``(dfMeas, dfMeasMean)`` tuple: the filtered rows, and the
        per-player mean of the numeric columns (indexed by ``'Player_'``)
        with an added ``'Count'`` column, restricted to players with at
        least ``min_drives`` rows.
    """
    # Parenthesise the comparison: `a & b > 0` parses as `(a & b) > 0`,
    # which bitwise-ANDs the flag with the distance instead of testing it.
    is_measured = (df['DrivingDistMeasuredFlag'] == 'Y') & (
        df['DrivingDistance_rounded_'] > 0
    )
    dfMeas = df[is_measured]

    by_player = dfMeas.groupby('Player_')
    # numeric_only=True keeps the mean from failing on text columns
    # (pandas >= 2.0 no longer drops them silently).
    dfMeasMean = by_player.mean(numeric_only=True)
    # 'Count' is the number of non-null 'Tour' entries per player.
    dfMeasMean['Count'] = by_player['Tour'].count()
    dfMeasMean = dfMeasMean[dfMeasMean['Count'] >= min_drives]
    return dfMeas, dfMeasMean
def get_hole_average_data(df, min_holes):
    """Compute per-player means and sums over hole-level rows.

    Args:
        df: Hole-level DataFrame containing at least the columns
            ``'Player_'`` and ``'Tour'``.
        min_holes: Minimum number of rows a player needs to be kept in the
            returned means. The sums are not filtered.

    Returns:
        A ``(dfHoleMean, dfHoleSum)`` tuple, both indexed by ``'Player_'``:
        the mean of the numeric columns with an added ``'Count'`` column
        (players with fewer than ``min_holes`` rows removed), and the sum of
        the numeric columns for every player.
    """
    by_player = df.groupby('Player_')

    # numeric_only=True: text columns cannot be averaged (pandas >= 2.0
    # raises) and summing them would just concatenate strings.
    dfHoleMean = by_player.mean(numeric_only=True)
    dfHoleSum = by_player.sum(numeric_only=True)

    # 'Count' is the number of non-null 'Tour' entries per player.
    dfHoleMean['Count'] = by_player['Tour'].count()
    dfHoleMean = dfHoleMean[dfHoleMean['Count'] >= min_holes]
    return dfHoleMean, dfHoleSum
def player_level_data(df_hole, df_event, min_events):
    """Aggregate event-level rows into one summary row per player.

    Players are emitted in order of first appearance in ``df_event`` and
    only when they have at least ``min_events`` rows.

    Args:
        df_hole: Hole-level DataFrame. Kept for interface compatibility; it
            is not used to build the result.
        df_event: Event-level DataFrame with the columns ``'Player Number'``,
            ``'Player Name'``, ``'Money'`` and the driving / greens / putting
            / rounds columns read below.
        min_events: Minimum number of event rows required for a player.

    Returns:
        A new DataFrame with the columns ``'Name'``, ``'Player Number'``,
        ``'Money'``, ``'Driving Distance'``, ``'Driving Accuracy'``,
        ``'GIR'`` and ``'Putts Per Round'``.
    """
    columns = ['Name', 'Player Number', 'Money', 'Driving Distance',
               'Driving Accuracy', 'GIR', 'Putts Per Round']
    rows = []

    # One pass over the groups instead of one boolean mask per player;
    # sort=False keeps the first-appearance order of the players.
    for _, df_p in df_event.groupby('Player Number', sort=False):
        if len(df_p) < min_events:
            continue

        # Money is stored as text such as ' 1,234.00'; drop missing entries,
        # remove thousands separators and padding, then total it.
        winnings = [float(str(value).replace(',', '').strip())
                    for value in df_p['Money'].dropna()]

        # Every ratio is "season total / season total" rather than a mean of
        # per-event ratios, so events are weighted by their size.
        rows.append({
            # iloc[0]: a player may have a single row, so index 1 is unsafe.
            'Name': df_p['Player Name'].iloc[0],
            'Player Number': df_p['Player Number'].iloc[0],
            'Money': np.sum(winnings),
            'Driving Distance': (
                df_p['Driving Distance(Total Distance)'].sum()
                / df_p['Driving Distance(Total Drives)'].sum()
            ),
            'Driving Accuracy': 100 * (
                df_p['Driving Acc. %(Fairways Hit)'].sum()
                / df_p['Driving Acc. %(Possible Fairways)'].sum()
            ),
            'GIR': 100 * (
                df_p['Total Greens in Regulation'].sum()
                / df_p['Total Holes Played'].sum()
            ),
            'Putts Per Round': (
                df_p['Overall Putting Avg(# of Putts)'].sum()
                / df_p['Total Rounds)'].sum()
            ),
        })

    # NOTE(review): a stroke average ('Total Strokes' / 'Total Rounds)') used
    # to be computed here but was never part of the returned frame.
    return pd.DataFrame(rows, columns=columns)
def add_putting(df_event_player, df):
    """Attach putts-per-hole and putts-per-round columns to a player frame.

    Putts per hole is the sum of ``'Putts'`` divided by the number of rows
    for each player in ``df``, ignoring rows of the event
    ``'World Golf Championships-Dell Technologies Match Play'``.

    Args:
        df_event_player: Player-level DataFrame with a ``'Player Number'``
            column. It is modified in place.
        df: Hole-level DataFrame with the columns ``'Player #'``,
            ``'Event Name'`` and ``'Putts'``.

    Returns:
        The same ``df_event_player`` object, with ``'Putts Per Hole'`` added
        and ``'Putts Per Round'`` set to 18 times that value. Players with no
        matching rows in ``df`` get NaN.
    """
    counted = df[df['Event Name']
                 != 'World Golf Championships-Dell Technologies Match Play']

    putts = counted.groupby('Player #')['Putts']
    # Sum skips missing putt values while size counts every row, i.e. the
    # ratio is "total putts / rows played".
    putts_per_hole = putts.sum() / putts.size()

    # Look the value up per row rather than assigning a list built from the
    # unique ids, so repeated player numbers cannot cause a length mismatch.
    df_event_player['Putts Per Hole'] = (
        df_event_player['Player Number'].map(putts_per_hole)
    )
    df_event_player['Putts Per Round'] = 18 * df_event_player['Putts Per Hole']
    return df_event_player
# Per-season reference value returned as ``av`` by getAllData.
# NOTE(review): presumably the tour-average driving distance for the season;
# confirm against the notebook that consumes it.
_AV_BY_YEAR = {
    2021: 295.7,
    2020: 296.4,
    2019: 293.3,
    2018: 296.1,
    2017: 292.1,
    2016: 290.0,
    2015: 289.7,
}


def getAllData(year, min_events):
    """Load one season and build every derived table in a single call.

    Args:
        year: Season to load; passed to ``load_data`` and used to look up
            the season reference value.
        min_events: Minimum number of events a player needs to appear in the
            player-level event summary.

    Returns:
        A tuple ``(df, df_event, df_measured, df_measured_player,
        df_hole_average, df_hole_sum, df_event_player, av)`` where ``df`` and
        ``df_event`` are the raw hole and event tables, the next four come
        from ``get_measured_data`` and ``get_hole_average_data`` (both called
        with a minimum of 0, i.e. unfiltered), ``df_event_player`` comes from
        ``player_level_data`` and ``av`` is the season reference value, or
        NaN for a season without one.
    """
    df, df_event = load_data(year)
    df_measured, df_measured_player = get_measured_data(df, 0)
    df_hole_average, df_hole_sum = get_hole_average_data(df, 0)
    df_event_player = player_level_data(df, df_event, min_events)

    # Fall back to NaN instead of leaving `av` unbound for seasons outside
    # the table, so the loaded data is still returned.
    av = _AV_BY_YEAR.get(year, float('nan'))

    return (df, df_event, df_measured, df_measured_player,
            df_hole_average, df_hole_sum, df_event_player, av)
def bubble(df, x_var, y_var, title_str, year):
    """Draw a bubble chart of two player metrics with a fitted straight line.

    Args:
        df: DataFrame holding the columns named by ``x_var`` and ``y_var``
            plus ``'Name'`` (colour and hover label) and ``'Money'`` (bubble
            size).
        x_var: Column plotted on the x axis and used as the regressor.
        y_var: Column plotted on the y axis and used as the response.
        title_str: Title prefix; ``str(year)`` is appended to it.
        year: Season shown at the end of the title.

    Returns:
        A ``(fig, model, rsq)`` tuple: the plotly figure, the fitted
        ``LinearRegression`` and its score on the plotted data.
    """
    figure_title = title_str + str(year)
    fig = px.scatter(
        df,
        x=x_var,
        y=y_var,
        trendline='ols',
        color="Name",
        size="Money",
        hover_name="Name",
        title=figure_title,
    )

    # The regression expects column vectors, hence the (-1, 1) reshape.
    predictor = df[x_var].to_numpy().reshape(-1, 1)
    response = df[y_var].to_numpy().reshape(-1, 1)
    model = LinearRegression().fit(predictor, response)
    rsq = model.score(predictor, response)

    # A straight line only needs its two end points: evaluate the fit at the
    # smallest and largest x value.
    slope = model.coef_[0]
    offset = model.intercept_[0]
    line_x = [np.min(predictor), np.max(predictor)]
    line_y = np.asarray(line_x) * slope + offset

    fig.add_scatter(x=line_x, y=line_y, mode='lines')
    return fig, model, rsq