/
prob_added.py
122 lines (80 loc) · 3.6 KB
/
prob_added.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
Calculate marginal utility of each stat
This uses an exponential decay to weight previous years' scores. The current year has a weight of 1, the previous year has a weight of 0.25, and the year before that has a weight of 0.0625.
Since each year has 25 scoring weeks, that means that 4 weeks into the current season, the weights are:
* current season = 1 * 4 weeks
* previous season = 0.25 * 25 weeks
* 2 seasons ago = 0.25 * 0.25 * 25 weeks
= ~11.8 total weight
Therefore the current season would have an influence of about 34%. Halfway through the current season (12 weeks), it would have an influence of about 61%.
"""
import os
import joblib
import numpy
import pandas
from sklearn.linear_model import LogisticRegression
from config import CURRENT_YEAR, HISTORICAL_DATA_YEARS, LEAGUE_DATA_DIRECTORY
def main():
    """
    Fit and persist the batting models first, then the pitching models.
    """
    steps = (
        calculate_probability_added_batting,
        calculate_probability_added_pitching,
    )
    for run_step in steps:
        run_step()
def calculate_probability_added_batting():
    """
    Fit one weighted logistic regression per batting category and save them.

    Two derived columns are added to the loaded scores: ``ePA`` (``AB`` +
    ``BB``) and ``OBP_big`` (``OBP`` * 1000). For each category, rows without
    a value are discarded, every row holding its matchup's maximum is labelled
    as a winner (tied rows are all labelled 1), and a model predicting that
    label from the category value is fitted using ``year_weight`` as the
    sample weight.

    The result is a dict with the fitted models under ``'models'`` (keyed by
    category) and an empty ``'p_added'`` dict, written with ``joblib.dump`` to
    ``batters.pickle`` inside ``LEAGUE_DATA_DIRECTORY``.

    Returns:
        True once the file has been written.
    """
    def mark_top(values):
        # 1 where the row equals the matchup's maximum, 0 everywhere else
        return numpy.where(values == values.max(), 1, 0)

    history = load_scores(HISTORICAL_DATA_YEARS)
    history['ePA'] = history['AB'] + history['BB']
    # NOTE(review): presumably rescaled so OBP is on a scale closer to the
    # counting stats -- confirm
    history['OBP_big'] = history['OBP'] * 1000

    fitted_models = {}
    for stat in ('ePA', 'R', 'RBI', 'HR', 'TB', 'SB', 'OBP_big'):
        has_value = history[stat].notna()
        sample = history.loc[has_value, ['year', 'year_weight', 'matchup_id', stat]].copy()
        sample['winner'] = sample.groupby('matchup_id')[stat].transform(mark_top)

        classifier = LogisticRegression()
        classifier.fit(
            sample[[stat]],
            sample['winner'],
            sample_weight=sample['year_weight'],
        )
        fitted_models[stat] = classifier

    batter_categories_info = {'models': fitted_models, 'p_added': {}}
    target = os.path.join(LEAGUE_DATA_DIRECTORY, 'batters.pickle')
    joblib.dump(batter_categories_info, target)
    return True
def calculate_probability_added_pitching():
    """
    Fit one weighted logistic regression per pitching category and save them.

    For each category, rows without a value are discarded, the row(s) holding
    the best value of their matchup are labelled as winners (tied rows are all
    labelled 1), and a model predicting that label from the category value is
    fitted using ``year_weight`` as the sample weight. "Best" is the highest
    value, except for ERA and WHIP where the lowest value wins the category.

    The result is a dict with the fitted models under ``'models'`` (keyed by
    category) and an empty ``'p_added'`` dict, written with ``joblib.dump`` to
    ``pitchers.pickle`` inside ``LEAGUE_DATA_DIRECTORY``.

    Returns:
        True once the file has been written.
    """
    scores = load_scores(HISTORICAL_DATA_YEARS)
    pitcher_categories = ['IP', 'W', 'SV', 'ERA', 'WHIP', 'K9']
    # Categories a team wins by posting the LOWEST value; labelling the
    # maximum as the winner for these would invert the fitted relationship.
    lower_is_better = {'ERA', 'WHIP'}
    pitcher_categories_info = {
        'models': {},
        'p_added': {},
    }
    for category in pitcher_categories:
        train = scores[['year', 'year_weight', 'matchup_id', category]].copy()
        # Same filter as the batting calculation: LogisticRegression.fit
        # rejects NaN inputs, and a missing value cannot win a category.
        train = train[~train[category].isna()].copy()
        pick_best = pandas.Series.min if category in lower_is_better else pandas.Series.max
        train['winner'] = train.groupby('matchup_id')[category].transform(
            # bind pick_best as a default so the lambda never sees a later value
            lambda x, best=pick_best: numpy.where(x == best(x), 1, 0)
        )
        lm = LogisticRegression()
        lm.fit(train[[category]], train['winner'], sample_weight=train['year_weight'])
        pitcher_categories_info['models'][category] = lm
    joblib.dump(pitcher_categories_info, os.path.join(LEAGUE_DATA_DIRECTORY, 'pitchers.pickle'))
    return True
def load_scores(years):
    """
    Read the per-year score CSVs and combine them into one DataFrame.

    Each year is read from ``scores/scores_<year>.csv`` under
    ``LEAGUE_DATA_DIRECTORY``. Two columns are added to the combined frame:

    * ``matchup_id`` -- ``<year>_<matchup_period>_<larger team id>_<smaller
      team id>``, so both rows of one matchup share the same id.
    * ``year_weight`` -- ``0.25 ** (CURRENT_YEAR - year)``.

    Rows whose ``team_id`` is 6 are removed before returning.
    """
    frames = [
        pandas.read_csv(
            os.path.join(LEAGUE_DATA_DIRECTORY, 'scores', 'scores_{year}.csv'.format(year=season))
        )
        for season in years
    ]
    combined = pandas.concat(frames)

    # Build a unique matchup id by joining year, matchup period and the two
    # team ids; ordering the ids (max, then min) makes it side-independent.
    team_pair = combined[['team_id', 'opponent_team_id']]
    id_parts = [
        combined['matchup_period'].astype(str),
        team_pair.max(axis=1).astype(str),
        team_pair.min(axis=1).astype(str),
    ]
    combined['matchup_id'] = combined['year'].astype(str).str.cat(id_parts, sep='_')

    combined['year_weight'] = 0.25 ** (CURRENT_YEAR - combined['year'])

    # NOTE(review): team id 6 is hard-coded and only its own rows are dropped;
    # the opponent's row stays behind as a single-row matchup -- confirm this
    # is intended.
    keep = combined['team_id'] != 6
    return combined[keep]
# Fit and save both sets of models only when executed as a script, not on import.
if __name__ == '__main__':
    main()