<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta http-equiv="Content-Style-Type" content="text/css" />
<meta name="generator" content="pandoc" />
<meta name="author" content="Kenny Shirley" />
<title>Visualizing and Modeling Baseball Hall of Fame Voting</title>
<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
margin: 0; padding: 0; vertical-align: baseline; border: none; }
table.sourceCode { width: 100%; line-height: 100%; }
td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
td.sourceCode { padding-left: 5px; }
code > span.kw { color: #007020; font-weight: bold; }
code > span.dt { color: #902000; }
code > span.dv { color: #40a070; }
code > span.bn { color: #40a070; }
code > span.fl { color: #40a070; }
code > span.ch { color: #4070a0; }
code > span.st { color: #4070a0; }
code > span.co { color: #60a0b0; font-style: italic; }
code > span.ot { color: #007020; }
code > span.al { color: #ff0000; font-weight: bold; }
code > span.fu { color: #06287e; }
code > span.er { color: #ff0000; font-weight: bold; }
</style>
<link rel="stylesheet" type="text/css" media="screen, projection, print"
href="http://www.w3.org/Talks/Tools/Slidy2/styles/slidy.css" />
<script src="http://www.w3.org/Talks/Tools/Slidy2/scripts/slidy.js"
charset="utf-8" type="text/javascript"></script>
</head>
<body>
<div class="slide titlepage">
<h1 class="title">Visualizing and Modeling Baseball Hall of Fame Voting</h1>
<p class="author">
Kenny Shirley
</p>
<p class="date">NYC Sports Analytics Meetup, August 19, 2014</p>
</div>
<div id="welcome-to-att-labs-at-33-thomas-street-in-nyc" class="slide section level1">
<h1>Welcome to AT&amp;T Labs at 33 Thomas Street in NYC!</h1>
<p><img src="figures/33thomas.jpg" height="500" alt="The AT&amp;T Long Lines building at 33 Thomas Street" /></p>
<p>Built for machines; safe, secure, and habitable by humans.</p>
</div>
<div id="about-me" class="slide section level1">
<h1>About Me</h1>
<ul>
<li>Long-time (~ 10 years) sports statistics researcher/hobbyist</li>
<li>Undergrad thesis on the Markov model for baseball</li>
<li>In statistics grad school, co-authored "Bayesball", a relatively early (Bayesian) model for fielding that used location information about all batted balls.
<ul>
<li>Our method was called SAFE = Spatial Aggregate Fielding Evaluation (Annals of Applied Statistics, 2009)</li>
</ul></li>
<li>I've also dabbled with some basketball and football statistics...</li>
</ul>
</div>
<div id="todays-outline" class="slide section level1">
<h1>Today's Outline</h1>
<ol class='incremental'>
<li>
Introduction: Baseball Hall of Fame voting is awful
</li>
<li>
But... if you can't beat 'em, join 'em
</li>
<li>
Part 1: Visualize the data
</li>
<li>
Part 2: Model the outcome
</li>
</ol>
<p><img src="figures/halloffamebuilding.jpg" width="700" alt="The Baseball Hall of Fame building in Cooperstown" /></p>
</div>
<div id="collaborators" class="slide section level1">
<h1>Collaborators</h1>
<p>Joint work with:</p>
<p>Carlos Scheidegger (University of Arizona)</p>
<p>
<img src="figures/cscheid.jpg" width="100" alt="Carlos Scheidegger" />
</p>
<p>and</p>
<p>Carson Sievert (Iowa State University)</p>
<p><img src="figures/sievert.png" width="100" alt="Carson Sievert" /></p>
</div>
<div id="introduction-baseball-hall-of-fame-voting-is-awful" class="slide section level1">
<h1>Introduction: Baseball Hall of Fame Voting is Awful</h1>
</div>
<div id="first-the-rules" class="slide section level1">
<h1>First, the rules:</h1>
<ol class='incremental'>
<li>
A player can appear on the ballot after having played for at least 10 years and having been retired for at least 5 years.
</li>
<li>
A committee chooses who appears on the ballot, and they are... "generous"
</li>
</ol>
</div>
<div id="jeff-cirillo" class="slide section level1">
<h1>Jeff Cirillo</h1>
<p><img src="figures/cirillo.jpg" alt="Jeff Cirillo" /></p>
<p>Jeff Cirillo, 3B, 1994 - 2007: .296 BA, 112 HR, 32 WAR, 2 All-Star Teams, 2013 HOF ballot</p>
</div>
<div id="jeromy-burnitz" class="slide section level1">
<h1>Jeromy Burnitz</h1>
<p><img src="figures/burnitz.jpg" alt="Jeromy Burnitz" /></p>
<p>Jeromy Burnitz, OF, 1993 - 2006: .253 BA, 315 HR, 17.4 WAR, 1 All-Star Team, 2012 HOF ballot</p>
</div>
<div id="dan-plesac" class="slide section level1">
<h1>Dan Plesac</h1>
<p><img src="figures/plesac.jpg" alt="Dan Plesac" /></p>
<p>Dan Plesac, P, 1986 - 2003: 65 - 71 Win-Loss, 3.64 ERA, 17.2 WAR, 3 All-Star Teams, 2009 HOF ballot</p>
</div>
<div id="these-are-just-a-few-players-who-have-appeared-on-the-ballot" class="slide section level1">
<h1>These are just a few players who have appeared on the ballot</h1>
<ol class='incremental'>
<li>
... and these are just some of the former Brewers!
</li>
<li>
None of them received a single HOF vote, thankfully.
</li>
<li>
Unlike:
</li>
<li>
<pre><code>## Name Year WAR Votes NumBallots
## 1 Jacque Jones 2014 11.5 1 571
## 2 David Segui 2010 7.8 1 539
## 3 Shawon Dunston 2008 9.1 1 543
## 4 Walt Weiss 2006 14.6 1 520
## 5 Randy Myers 2004 14.2 1 506
## 6 Cecil Fielder 2004 14.7 1 506
## 7 Mark Davis 2003 6.8 1 496
## 8 Jim Deshaies 2001 10.2 1 515
## 9 Steve Bedrosian 2001 13.2 1 515
## 10 Ray Knight 1994 10.9 1 456</code></pre>
</li>
</ol>
</div>
<div id="more-rules" class="slide section level1">
<h1>More Rules:</h1>
<ul>
<li>About 625 members (as of 2014) of the Baseball Writers Association of America (BBWAA) receive a ballot, and 570 of them voted last year.</li>
<li>Ballots are due by 12/31, and results are announced in the first week of January.</li>
<li>A player must be selected on at least 75% of the returned ballots to be inducted into the HOF that year.</li>
<li>In 1967 they started to enforce the rule that if you appear on less than 5% of ballots, you are permanently removed from future ballots.</li>
<li>A player may appear on a maximum of 15 ballots (i.e. consecutive years) before being permanently removed.</li>
<li>In a given year, each voter may vote for as few as 0 players, and as many as 10 players (where an average year's ballot contains about 40 players).</li>
</ul>
</div>
<div id="some-statistics" class="slide section level1">
<h1>Some Statistics</h1>
<ol class='incremental'>
<li>
1936 was the first year of Hall of Fame voting
</li>
<li>
Five players were elected:
</li>
<li>
<pre><code>## Year Name Pos NumBallots Votes Percentage
## 1 1936 Ty Cobb OF 226 222 98.20%
## 2 1936 Honus Wagner SS 226 215 95.10%
## 3 1936 Babe Ruth OF 226 215 95.10%
## 4 1936 Christy Mathewson P 226 205 90.70%
## 5 1936 Walter Johnson P 226 189 83.60%</code></pre>
</li>
<li>
From 1936 - 2014, 1089 unique players have appeared on the ballot
</li>
<li>
115 have been elected, 47 on their first ballot appearance (<em>not</em> Lou Gehrig, Cy Young, Warren Spahn)
</li>
<li>
Famously, no player has been unanimously elected.
</li>
<li>
We consider 1967 to be the first year of 'modern' HOF voting (when the 5% rule was established)
</li>
</ol>
</div>
<div id="some-problems" class="slide section level1">
<h1>Some Problems</h1>
<ol class='incremental' style='list-style-type:none'>
<li>
<p>Does it really take 15 years to decide?</p>
</li>
<li>
<p>The so-called 'morals' clause, rule 5 out of 9:</p>
<blockquote>
<p>Voting: Voting shall be based upon the player's record, playing ability, integrity, sportsmanship, character, and contributions to the team(s) on which the player played.</p>
</blockquote>
<blockquote>
<p>(from http://baseballhall.org/hall-famers/rules-election/BBWAA)</p>
</blockquote>
</li>
<li>
<p>The voters don't actively cover baseball!</p>
<blockquote>
<p>Q: Does that mean some Hall of Fame voters don’t even cover baseball any more?</p>
</blockquote>
<blockquote>
<p>A: Yes. The BBWAA trusts that its voters take their responsibility seriously, and even those honorary members who are no longer covering baseball do their due diligence to produce a thoughtful ballot.</p>
</blockquote>
<blockquote>
<p>(from http://bbwaa.com/voting-faq/)</p>
</blockquote>
</li>
</ol>
</div>
<div id="some-responses" class="slide section level1">
<h1>Some Responses</h1>
<ul>
<li>Will Leitch: "Voter Fraud" 11/14/2013, sportsonearth.com</li>
</ul>
<blockquote>
<p>Stupid things are called out as stupid, and people try to figure out how to correct them. It's encouraging. It's kinda nice. And then there is Baseball Hall of Fame voting.</p>
</blockquote>
<ul>
<li>Jonathan Mahler: "Kill The HOF Character Clause, For The Sake Of The Writers", 1/10/2013, deadspin.com</li>
<li>Deadspin offered to buy a vote in 2013, and mainstream journalist Dan Le Batard (Miami Herald) gave his vote to the website (for no money). He was banned for life from voting on future HOF ballots.</li>
<li>Last month the BBWAA announced that they will only allow players to appear up to 10 times on the ballot, rather than 15. (with Mattingly, Trammell, and Smith grandfathered in to the 15-year clock)</li>
</ul>
</div>
<div id="my-thoughts" class="slide section level1">
<h1>My Thoughts</h1>
<ul>
<li>In a nutshell: the voting process is a phenomenon unto itself. It is not a yes-or-no question!</li>
<li>We might as well embrace this, and study it.</li>
<li>(Hasn't baseball always been famous for embracing the 'human element' of the game?)</li>
</ul>
</div>
<div id="part-1-visualize-the-data" class="slide section level1">
<h1>Part 1: Visualize the data</h1>
</div>
<div id="getting-the-data" class="slide section level1">
<h1>Getting the data</h1>
<ul>
<li>Baseball Reference has it all! Fantastic website.</li>
</ul>
<p><img src="figures/baseballreference.png" width="500" alt="Screenshot of a Baseball Reference Hall of Fame voting page" /></p>
<ul>
<li>Career statistics and voting percentages for each player, each year, from 1936-2014.</li>
<li>We did a bit of web scraping to download and parse each year's ballot.</li>
</ul>
</div>
<div id="a-few-plots-in-r" class="slide section level1">
<h1>A few plots in R</h1>
<p>We were really interested in the trajectories of voting percentages of players who had appeared on the ballot multiple times.</p>
<pre class="sourceCode r"><code class="sourceCode r">dat <-<span class="st"> </span><span class="kw">read.csv</span>(<span class="dt">file=</span><span class="st">"HOFregression_updated.csv"</span>, <span class="dt">as.is=</span><span class="ot">TRUE</span>)
<span class="kw">par</span>(<span class="dt">mfrow=</span><span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">2</span>))
sel <-<span class="st"> </span>dat[, <span class="st">"Name"</span>] ==<span class="st"> "Alan Trammell"</span>
<span class="kw">plot</span>(dat[sel, <span class="st">"Year"</span>], dat[sel, <span class="st">"p"</span>], <span class="dt">ylim=</span><span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">1</span>), <span class="dt">las=</span><span class="dv">1</span>, <span class="dt">pch=</span><span class="dv">19</span>, <span class="dt">xlab=</span><span class="st">"Year"</span>,
<span class="dt">ylab=</span><span class="st">"Voting Proportion"</span>)
<span class="kw">lines</span>(dat[sel, <span class="st">"Year"</span>], dat[sel, <span class="st">"p"</span>])
<span class="kw">title</span>(<span class="dt">main=</span><span class="st">"Alan Trammell"</span>)
<span class="kw">abline</span>(<span class="dt">h =</span> <span class="fl">0.05</span>, <span class="dt">col=</span><span class="dv">2</span>, <span class="dt">lwd=</span><span class="dv">2</span>)
<span class="kw">abline</span>(<span class="dt">h =</span> <span class="fl">0.75</span>, <span class="dt">col=</span><span class="dv">3</span>, <span class="dt">lwd=</span><span class="dv">2</span>)
sel <-<span class="st"> </span>dat[, <span class="st">"Name"</span>] ==<span class="st"> "Bert Blyleven"</span>
<span class="kw">plot</span>(dat[sel, <span class="st">"Year"</span>], dat[sel, <span class="st">"p"</span>], <span class="dt">ylim=</span><span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">1</span>), <span class="dt">las=</span><span class="dv">1</span>, <span class="dt">pch=</span><span class="dv">19</span>, <span class="dt">xlab=</span><span class="st">"Year"</span>,
<span class="dt">ylab=</span><span class="st">"Voting Proportion"</span>)
<span class="kw">lines</span>(dat[sel, <span class="st">"Year"</span>], dat[sel, <span class="st">"p"</span>])
<span class="kw">title</span>(<span class="dt">main=</span><span class="st">"Bert Blyleven"</span>)
<span class="kw">abline</span>(<span class="dt">h =</span> <span class="fl">0.05</span>, <span class="dt">col=</span><span class="dv">2</span>, <span class="dt">lwd=</span><span class="dv">2</span>)
<span class="kw">abline</span>(<span class="dt">h =</span> <span class="fl">0.75</span>, <span class="dt">col=</span><span class="dv">3</span>, <span class="dt">lwd=</span><span class="dv">2</span>)</code></pre>
</div>
<div id="a-few-plots-in-r-1" class="slide section level1">
<h1>A few plots in R</h1>
<p>We were really interested in the trajectories of voting percentages of players who had appeared on the ballot multiple times.</p>
<div class="figure">
<img src="figure/edaplots2-1.png" alt="plot of chunk edaplots2" /><p class="caption">plot of chunk edaplots2</p>
</div>
<p>How did these guys end up with such different voting trajectories?</p>
</div>
<div id="we-built-an-interactive-plot-using-d3" class="slide section level1">
<h1>We built an interactive plot using D3</h1>
<ul>
<li>Link: http://cscheid.net/static/mlb-hall-of-fame-voting</li>
<li>Two main parts of the visualization:
<ul>
<li>Brushable plot of voting % vs. year</li>
<li>Linked histograms for each statistical category</li>
</ul></li>
</ul>
</div>
<div id="lots-of-interesting-trivia-was-uncovered-here" class="slide section level1">
<h1>Lots of interesting trivia was uncovered here:</h1>
<ul>
<li>Lefty Grove vs. Orval Grove</li>
<li>Runoff Elections</li>
<li>Dips in voting percentages for most players when a group of strong first-ballot players appears (1999, 2007)</li>
<li>Edd Roush's 19 ballots</li>
<li>Minnie Minoso and Jose Rijo's comebacks</li>
<li>Warren Spahn's early vote</li>
<li>HR > 400</li>
<li>OPS > .850</li>
</ul>
</div>
<div id="part-2-model-the-data-and-make-predictions" class="slide section level1">
<h1>Part 2: Model the data and make predictions</h1>
</div>
<div id="the-obvious-next-question-is-can-we-predict-next-years-vote" class="slide section level1">
<h1>The obvious next question is: Can we predict next year's vote?</h1>
<ul>
<li>Others have worked on this, of course:
<ul>
<li>JAWS: Jay Jaffe's 'Jaffe WAR Scoring System', an average of WAR and 'peak' WAR, normalized for position</li>
<li>Bill James has a few systems. Hall of Fame Monitor, for example, as described on baseball-reference.com (leader glossary section):
<pre style='white-space: pre-wrap'>
[1] For Batting Average, 2.5 points for each season over .300, 5.0 for over .350, 15 for over .400. Seasons are not double-counted. I require 100 games in a season to qualify for this bonus.
[2] For hits, 5 points for each season of 200 or more hits.
[3] 3 points for each season of 100 RBI's and 3 points for each season of 100 runs.
[4] 10 points for 50 home runs, 4 points for 40 HR, and 2 points for 30 HR.
[5] 2 points for 45 doubles and 1 point for 35 doubles.
[6] 8 points for each MVP award and 3 for each AllStar Game, and 1 point for a Rookie of the Year award.
[7] 2 points for a gold glove at C, SS, or 2B, and 1 point for any other gold glove.
[8] 6 points if they were the regular SS or C on a WS winning team, 5 points for 2B or CF, 3 for 3B, 2 for LF or RF, and 1 for 1B. I don't have the OF distribution, so I give 3 points for OF (requires at least 82 games as the position).
...
[19] ...
</pre></li>
<li>Michael Freiman's paper in JQAS a few years back. Random Forests for 0/1 induction into HOF.</li>
<li>Mills and Salaga, 2011 JQAS paper using random forests as well.</li>
</ul></li>
<li>Most of these focus on the yes/no question of induction, rather than predicting voting percentages.</li>
</ul>
</div>
<div id="what-predictors-should-we-use" class="slide section level1">
<h1>What predictors should we use?</h1>
<ul>
<li>Career Statistics? (including position, of course)</li>
<li>Awards? (Rookie of the Year, MVP, All-star appearances, Gold glove awards, etc.)</li>
<li>Measures of post-season success?</li>
<li>Measures of popularity with the press/public?</li>
<li>We used the first two, and the third should be easy to incorporate in future work. The fourth is a little tougher. (Some type of meta-analysis on newspaper articles, perhaps?)</li>
</ul>
</div>
<div id="what-data-do-we-use" class="slide section level1">
<h1>What data do we use?</h1>
<ul>
<li>I'll talk about making predictions last year (ballots were due 12/31/13, and results were announced January, 2014), and I'll finish by presenting my 2015 predictions.</li>
<li>We discarded anybody whose first year of eligibility was before 1967.</li>
<li>Through the 2013 vote, this gave us a historical data set of:
<ul>
<li>406 position players</li>
<li>229 pitchers</li>
<li><span class="math"> = 635</span> total unique players</li>
<li>and 1458 total data points, or voting outcomes (for an average of 2.3 years on the ballot per person)</li>
</ul></li>
<li>We'll test the accuracy of our model by making one-year-ahead (out-of-sample) predictions, starting in 1997, and going through 2014.</li>
</ul>
</div>
<div id="our-methodology" class="slide section level1">
<h1>Our methodology:</h1>
<ul>
<li>For each year, we fit a different model to the historical data from three different subsets of players:
<ul>
<li>First-ballot batters</li>
<li>First-ballot pitchers</li>
<li>Returning ballot players</li>
</ul></li>
<li>Career statistics are basically all we have for the first two groups.</li>
<li>For the third group, we have their previous HOF voting percentages, of course. (which should make our job much easier)</li>
</ul>
</div>
<div id="modeling" class="slide section level1">
<h1>Modeling:</h1>
<ul>
<li>All the models we fit were logistic regression models: assume the <span class="math"><em>n</em></span> voters in a given year act independently, and model the number of 'yes' votes as a binomial random variable: <br /><span class="math"><em>y</em><sub><em>i</em><em>t</em></sub> ∼ Binom(<em>n</em><sub><em>t</em></sub>, <em>p</em><sub><em>i</em><em>t</em></sub>), </span><br /> where
<ul>
<li><span class="math"><em>y</em><sub><em>i</em><em>t</em></sub></span> is the number of 'yes' votes for player <span class="math"><em>i</em></span> in year <span class="math"><em>t</em></span></li>
<li><span class="math"><em>n</em><sub><em>t</em></sub></span> is the number of ballots returned in year <span class="math"><em>t</em></span>, and</li>
<li><span class="math"><em>p</em><sub><em>i</em><em>t</em></sub></span> is the probability of a given voter voting 'yes' for player <span class="math"><em>i</em></span> in year <span class="math"><em>t</em></span>.</li>
</ul></li>
<li>Next, model <span class="math"><em>p</em><sub><em>i</em><em>t</em></sub></span> as a linear function of the predictor variables: <br /><span class="math"><em>p</em><sub><em>i</em><em>t</em></sub> = <em>β</em><sub>0</sub> + <em>β</em><sub>1</sub><em>X</em><sub><em>i</em></sub><sup>HR</sup> + <em>β</em><sub>2</sub><em>X</em><sub><em>i</em></sub><sup>RBI</sup> + <em>β</em><sub>3</sub><em>X</em><sub><em>i</em></sub><sup>OBP</sup> + ...</span><br /> for batters, for instance.</li>
<li>An alternative would be to use something like random forests, support vector machines, or whatever. We tried random forests and got similar results, and logistic regression is easy and interpretable, so we chose it.</li>
<li>Given the estimated regression coefficients, <span class="math"><em>β</em><sub>0</sub>, <em>β</em><sub>1</sub>, ...</span>, and the predictors for the players on next year's ballot, we can easily compute an estimate of <span class="math"><em>p</em><sub><em>i</em><em>t</em></sub></span> for them.</li>
</ul>
</div>
<div id="baseline-model" class="slide section level1">
<h1>Baseline Model:</h1>
<pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Group 1: batters</span>
var.names[[<span class="dv">1</span>]] <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Yrs"</span>, <span class="st">"G"</span>, <span class="st">"AB"</span>, <span class="st">"R"</span>, <span class="st">"H"</span>, <span class="st">"HR"</span>, <span class="st">"RBI"</span>, <span class="st">"SB"</span>, <span class="st">"BB"</span>,
<span class="st">"BA"</span>, <span class="st">"OBP"</span>, <span class="st">"SLG"</span>,
<span class="st">"posC"</span>, <span class="st">"pos1B"</span>, <span class="st">"pos2B"</span>, <span class="st">"pos3B"</span>, <span class="st">"posSS"</span>, <span class="st">"posLF"</span>, <span class="st">"posCF"</span>, <span class="st">"posRF"</span>)
<span class="co"># Group 2: pitchers</span>
var.names[[<span class="dv">2</span>]] <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Yrs"</span>, <span class="st">"W"</span>, <span class="st">"L"</span>, <span class="st">"G"</span>, <span class="st">"GS"</span>, <span class="st">"SV"</span>, <span class="st">"IP"</span>, <span class="st">"H"</span>, <span class="st">"HR"</span>, <span class="st">"BB"</span>, <span class="st">"SO"</span>,
<span class="st">"ERA"</span>, <span class="st">"WHIP"</span>)
<span class="co"># Group 3: returning players</span>
<span class="co"># Just use the previous year's voting percentage as the sole predictor</span>
var.names[[<span class="dv">3</span>]] <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"prev1"</span>)</code></pre>
</div>
<div id="lets-head-back-in-time...-to-1996" class="slide section level1">
<h1>Let's head back in time... to 1996:</h1>
<p><img src="figures/backtothefuture.jpg" width="1000" alt="Back to the Future DeLorean time-travel scene" /></p>
</div>
<div id="example-first-ballot-batters-from-1967---1996-n-255" class="slide section level1">
<h1>Example: First-ballot batters from 1967 - 1996 (n = 255):</h1>
<pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Fit the model using weak priors:</span>
fit <-<span class="st"> </span><span class="kw">bayesglm</span>(data[sel, <span class="st">"p"</span>] ~<span class="st"> </span>X.scale, <span class="dt">weights=</span>data[sel, <span class="st">"NumBallots"</span>],
<span class="dt">family=</span><span class="kw">binomial</span>(<span class="dt">link =</span> <span class="st">"logit"</span>),
<span class="dt">prior.mean=</span><span class="dv">0</span>, <span class="dt">prior.scale=</span><span class="fl">2.5</span>)</code></pre>
</div>
<div id="example-first-ballot-batters-from-1967---1996-n-255-1" class="slide section level1">
<h1>Example: First-ballot batters from 1967 - 1996 (n = 255):</h1>
<pre class="sourceCode r"><code class="sourceCode r">Coefficients:
<span class="st"> </span>Estimate Std. Error z value <span class="kw">Pr</span>(><span class="er">|</span>z|)
(Intercept) -<span class="fl">4.95399</span> <span class="fl">0.05830</span> -<span class="fl">84.972</span> <<span class="st"> </span><span class="fl">2e-16</span> **<span class="er">*</span>
X.scaleYrs <span class="fl">0.50693</span> <span class="fl">0.05896</span> <span class="fl">8.597</span> <<span class="st"> </span><span class="fl">2e-16</span> **<span class="er">*</span>
X.scaleG <span class="fl">1.02455</span> <span class="fl">0.19500</span> <span class="fl">5.254</span> <span class="fl">1.49e-07</span> **<span class="er">*</span>
X.scaleAB -<span class="fl">3.63447</span> <span class="fl">0.48990</span> -<span class="fl">7.419</span> <span class="fl">1.18e-13</span> **<span class="er">*</span>
X.scaleR <span class="fl">2.27816</span> <span class="fl">0.14228</span> <span class="fl">16.012</span> <<span class="st"> </span><span class="fl">2e-16</span> **<span class="er">*</span>
X.scaleH <span class="fl">3.09098</span> <span class="fl">0.51523</span> <span class="fl">5.999</span> <span class="fl">1.98e-09</span> **<span class="er">*</span>
X.scaleHR <span class="fl">1.02895</span> <span class="fl">0.11611</span> <span class="fl">8.862</span> <<span class="st"> </span><span class="fl">2e-16</span> **<span class="er">*</span>
X.scaleRBI -<span class="fl">0.96718</span> <span class="fl">0.11939</span> -<span class="fl">8.101</span> <span class="fl">5.44e-16</span> **<span class="er">*</span>
X.scaleSB <span class="fl">0.05451</span> <span class="fl">0.02301</span> <span class="fl">2.370</span> <span class="fl">0.0178</span> *<span class="st"> </span>
X.scaleBB <span class="fl">0.11784</span> <span class="fl">0.10958</span> <span class="fl">1.075</span> <span class="fl">0.2822</span>
X.scaleBA <span class="fl">0.36248</span> <span class="fl">0.14991</span> <span class="fl">2.418</span> <span class="fl">0.0156</span> *<span class="st"> </span>
X.scaleOBP -<span class="fl">0.87497</span> <span class="fl">0.12853</span> -<span class="fl">6.807</span> <span class="fl">9.93e-12</span> **<span class="er">*</span>
X.scaleSLG <span class="fl">0.66728</span> <span class="fl">0.12253</span> <span class="fl">5.446</span> <span class="fl">5.15e-08</span> **<span class="er">*</span>
X.scaleposC <span class="fl">1.23696</span> <span class="fl">0.08342</span> <span class="fl">14.828</span> <<span class="st"> </span><span class="fl">2e-16</span> **<span class="er">*</span>
X.scalepos1B <span class="fl">0.62907</span> <span class="fl">0.08655</span> <span class="fl">7.268</span> <span class="fl">3.65e-13</span> **<span class="er">*</span>
X.scalepos2B <span class="fl">0.69809</span> <span class="fl">0.07841</span> <span class="fl">8.903</span> <<span class="st"> </span><span class="fl">2e-16</span> **<span class="er">*</span>
X.scalepos3B <span class="fl">0.54610</span> <span class="fl">0.07735</span> <span class="fl">7.060</span> <span class="fl">1.66e-12</span> **<span class="er">*</span>
X.scaleposSS <span class="fl">0.98036</span> <span class="fl">0.07683</span> <span class="fl">12.759</span> <<span class="st"> </span><span class="fl">2e-16</span> **<span class="er">*</span>
X.scaleposLF <span class="fl">0.40763</span> <span class="fl">0.08836</span> <span class="fl">4.613</span> <span class="fl">3.97e-06</span> **<span class="er">*</span>
X.scaleposCF -<span class="fl">0.01915</span> <span class="fl">0.08636</span> -<span class="fl">0.222</span> <span class="fl">0.8245</span>
X.scaleposRF <span class="fl">0.49648</span> <span class="fl">0.08293</span> <span class="fl">5.987</span> <span class="fl">2.14e-09</span> **<span class="er">*</span>
---
Signif. codes:<span class="st"> </span><span class="dv">0</span> ‘**<span class="er">*</span>’ <span class="fl">0.001</span> ‘**’ <span class="fl">0.01</span> ‘*’ <span class="fl">0.05</span> ‘.’ <span class="fl">0.1</span> ‘ ’ <span class="dv">1</span></code></pre>
</div>
<div id="interpretation" class="slide section level1">
<h1>Interpretation?</h1>
<ul>
<li>Position coefficients are reasonable:
<ul>
<li>DH = 0, as it is the baseline position.</li>
<li>SS and C have the highest estimates.</li>
</ul></li>
<li>OBP has a negative coefficient? But SLG and BA are positive... (and of course, coefficients for correlated predictors in linear models can't be interpreted in isolation).</li>
</ul>
</div>
<div id="making-out-of-sample-predictions-for-1997" class="slide section level1">
<h1>Making out-of-sample predictions for 1997:</h1>
<pre>
Name Prediction Actual
1 Dave Parker 39.4 17.5
2 Dwight Evans 51.8 5.9
3 Ken Griffey 12.7 4.7
4 Garry Templeton 5.3 0.4
5 Terry Kennedy 0.5 0.2
6 Terry Puhl 0.2 0.2
</pre>
<ul>
<li>Pretty bad!</li>
<li>The RMSE here (for the percentage difference) is about 21%.</li>
<li>If we had just guessed the intercept for everybody we would have gotten a 25% RMSE.</li>
<li>But this is a tiny sample...</li>
</ul>
</div>
<div id="overall-results-for-baseline-model" class="slide section level1">
<h1>Overall Results for Baseline Model:</h1>
<ul>
<li>Pseudo-code:</li>
</ul>
<pre class="sourceCode r"><code class="sourceCode r">for (year in <span class="dv">1997</span>:<span class="dv">2014</span>) {
for (group in <span class="kw">c</span>(<span class="st">"batters"</span>, <span class="st">"pitchers"</span>, <span class="st">"returning"</span>)) {
train &lt;-<span class="st"> </span>Year &lt;<span class="st"> </span>year &amp;<span class="st"> </span>Group ==<span class="st"> </span>group
test &lt;-<span class="st"> </span>Year ==<span class="st"> </span>year &amp;<span class="st"> </span>Group ==<span class="st"> </span>group
historical.fit &lt;-<span class="st"> </span><span class="kw">glm</span>(y[train] ~<span class="st"> </span>data[train, ])
predict &lt;-<span class="st"> </span><span class="kw">predict</span>(historical.fit, <span class="dt">newdata=</span>data[test, ])
}
}</code></pre>
<table>
<thead>
<tr class="header">
<th align="left">Group</th>
<th align="left">Baseline</th>
<th align="left"></th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">First-ballot Batters (n = 151)</td>
<td align="left">18.4%</td>
<td align="left"></td>
</tr>
<tr class="even">
<td align="left">First-ballot Pitchers (n = 85)</td>
<td align="left">9.7%</td>
<td align="left"></td>
</tr>
<tr class="odd">
<td align="left">Returning Players (n = 262)</td>
<td align="left">5.7%</td>
<td align="left"></td>
</tr>
<tr class="even">
<td align="left">Overall</td>
<td align="left">11.7%</td>
<td align="left"></td>
</tr>
</tbody>
</table>
</div>
<div id="lets-make-2014-predictions" class="slide section level1">
<h1>Let's make 2014 Predictions</h1>
<pre style='font-size:14px'>
Name Previous Predicted
1 Craig Biggio 68.2 77.0
2 Jack Morris 67.7 76.5
3 Jeff Bagwell 59.6 67.5
4 Mike Piazza 57.8 65.3
5 Tim Raines 52.2 58.0
6 Lee Smith 47.8 52.0
7 Curt Schilling 38.8 39.7
8 Frank Thomas 0.0 39.2
9 Jeff Kent 0.0 38.7
10 Roger Clemens 37.6 38.1
11 Greg Maddux 0.0 37.5
12 Barry Bonds 36.2 36.3
13 Edgar Martinez 35.9 35.9
14 Alan Trammell 33.6 33.0
15 Luis Gonzalez 0.0 23.0
16 Larry Walker 21.6 20.3
17 Fred McGriff 20.7 19.5
18 Mark McGwire 16.9 16.4
19 Mike Mussina 0.0 16.3
20 Tom Glavine 0.0 15.1
21 Don Mattingly 13.2 13.8
22 Sammy Sosa 12.5 13.3
23 Rafael Palmeiro 8.8 11.1
24 Moises Alou 0.0 10.5
25 Ray Durham 0.0 7.1
26 Armando Benitez 0.0 3.3
27 Sean Casey 0.0 0.9
28 Eric Gagne 0.0 0.8
29 Richie Sexson 0.0 0.6
30 Paul Lo Duca 0.0 0.5
31 J.T. Snow 0.0 0.4
32 Kenny Rogers 0.0 0.4
33 Hideo Nomo 0.0 0.2
34 Jacque Jones 0.0 0.1
35 Todd Jones 0.0 0.1
36 Mike Timlin 0.0 0.1
</pre>
<p><audio controls> <source src='figures/record-scratch-1.wav' type='audio/wav'> </audio></p>
</div>
<div id="residuals-for-baseline-model" class="slide section level1">
<h1>Residuals for Baseline Model</h1>
<p><img src='figures/fig_residuals_without2014.jpg' width=1200></p>
</div>
<div id="residual-analysis" class="slide section level1">
<h1>Residual Analysis:</h1>
<ul>
<li>Stats 101: Fit a model, look at the residuals, see if you can think of a variable to incorporate that would explain some of the residuals.
<pre>
Year Name Actual Predicted Residual
1 2002 Ozzie Smith 91.7 9.3 82.4
2 2001 Kirby Puckett 82.1 2.8 79.3
3 2005 Wade Boggs 91.9 57.1 34.8
4 2004 Paul Molitor 85.2 56.0 29.2
5 2010 Edgar Martinez 36.2 7.6 28.6
</pre>
<pre>
1 2011 Rafael Palmeiro 11.0 93.8 -82.8
2 2013 Barry Bonds 36.2 99.5 -63.3
3 2013 Roger Clemens 37.6 99.5 -61.9
4 2010 Fred McGriff 21.5 73.9 -52.4
5 2013 Julio Franco 1.1 52.3 -51.3
</pre></li>
<li>Let's try:
<ul>
<li>Awards, like All-Star teams and Gold Gloves, and...</li>
<li>Drugs! (i.e. steroid or HGH suspicion or suspension) Specifically, we can gather data on drug suspensions, and on being named in the Mitchell Report, as our proxy for suspected use.</li>
</ul></li>
</ul>
</div>
<div id="a-side-note..." class="slide section level1">
<h1>A side note...</h1>
<p>Don't google image search "Barry Bonds before vs. after" unless you have half an hour to kill...</p>
<p><img src='figures/bonds1.jpg' width=700></p>
</div>
<div id="a-side-note...-1" class="slide section level1">
<h1>A side note...</h1>
<p>Don't google image search "Barry Bonds before vs. after" unless you have half an hour to kill...</p>
<p><img src='figures/bonds2.jpg' width=700></p>
</div>
<div id="a-side-note...-2" class="slide section level1">
<h1>A side note...</h1>
<p>Don't google image search "Barry Bonds before vs. after" unless you have half an hour to kill...</p>
<p><img src='figures/bonds3.jpg' width=700></p>
<p>People have really put a lot of work into this sort of comparison</p>
<p>Related searches: McGwire, Clemens, Sosa.</p>
</div>
<div id="model-2-awards-drugs-batters-from-1967---2013" class="slide section level1">
<h1>Model 2 ('Awards + Drugs'), Batters from 1967 - 2013:</h1>
<pre style='font-size:14px'>
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.06179 0.04106 -123.281 &lt; 2e-16 ***
X.scaleYrs 0.45714 0.03482 13.127 &lt; 2e-16 ***
X.scaleG 0.04859 0.14207 0.342 0.732324
X.scaleAB 1.19308 0.35354 3.375 0.000739 ***
X.scaleR 0.77132 0.08730 8.835 &lt; 2e-16 ***
X.scaleH -0.45612 0.33860 -1.347 0.177963
X.scaleHR 0.23513 0.07811 3.010 0.002611 **
X.scaleRBI -0.25777 0.07589 -3.397 0.000682 ***
X.scaleSB 0.05965 0.01926 3.098 0.001950 **
X.scaleBB 0.19772 0.07434 2.660 0.007821 **
X.scaleBA 0.70078 0.09828 7.130 1.00e-12 ***
X.scaleOBP -0.34334 0.09172 -3.743 0.000182 ***
X.scaleSLG 0.44604 0.08467 5.268 1.38e-07 ***
X.scaleposC 0.15346 0.02416 6.351 2.14e-10 ***
X.scalepos1B 0.12147 0.02200 5.523 3.34e-08 ***
X.scalepos2B -0.11253 0.02412 -4.665 3.09e-06 ***
X.scalepos3B -0.05741 0.02361 -2.431 0.015055 *
X.scaleposSS 0.10689 0.02301 4.646 3.38e-06 ***
X.scaleposLF 0.03260 0.02365 1.379 0.168038
X.scaleposCF -0.20443 0.02516 -8.127 4.41e-16 ***
X.scaleposRF -0.17331 0.02406 -7.203 5.87e-13 ***
X.scaledrugs -0.91577 0.02574 -35.583 &lt; 2e-16 ***
X.scaleAllStarpy 1.12873 0.01691 66.752 &lt; 2e-16 ***
X.scalegold.gloves 0.20908 0.01136 18.411 &lt; 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
</pre>
<p>Nice -- the z-scores for the three new predictors are highly significant, and have signs that we expected.</p>
</div>
<div id="new-rmse-results" class="slide section level1">
<h1>New RMSE results:</h1>
<table>
<thead>
<tr class="header">
<th align="left">Group</th>
<th align="left">Baseline</th>
<th align="left">Awards + Drugs</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">First-ballot Batters (n = 151)</td>
<td align="left">18.4%</td>
<td align="left">15.2%</td>
</tr>
<tr class="even">
<td align="left">First-ballot Pitchers (n = 85)</td>
<td align="left">9.7%</td>
<td align="left">8.6%</td>
</tr>
<tr class="odd">
<td align="left">Returning Players (n = 262)</td>
<td align="left">5.7%</td>
<td align="left">5.7%</td>
</tr>
<tr class="even">
<td align="left">Overall</td>
<td align="left">11.7%</td>
<td align="left">10.0%</td>
</tr>
</tbody>
</table>
</div>
<div id="updated-2014-predictions" class="slide section level1">
<h1>Updated 2014 predictions:</h1>
<pre style='font-size:14px'>
Name Previous Predicted
1 Greg Maddux 0.0 96.0
2 Craig Biggio 68.2 77.0
3 Jack Morris 67.7 76.5
4 Jeff Bagwell 59.6 67.5
5 Mike Piazza 57.8 65.3
6 Tim Raines 52.2 58.0
7 Mike Mussina 0.0 57.7
8 Lee Smith 47.8 52.0
9 Frank Thomas 0.0 51.0
10 Tom Glavine 0.0 50.5
11 Curt Schilling 38.8 39.7
12 Roger Clemens 37.6 38.1
13 Barry Bonds 36.2 36.3
14 Edgar Martinez 35.9 35.9
15 Alan Trammell 33.6 33.0
16 Luis Gonzalez 0.0 20.9
17 Larry Walker 21.6 20.3
18 Fred McGriff 20.7 19.5
19 Mark McGwire 16.9 16.4
20 Don Mattingly 13.2 13.8
21 Sammy Sosa 12.5 13.3
22 Moises Alou 0.0 13.1
23 Rafael Palmeiro 8.8 11.1
24 Jeff Kent 0.0 11.0
</pre>
</div>
<div id="residuals-for-awards-drugs-model" class="slide section level1">
<h1>Residuals for 'Awards + Drugs' Model</h1>
<p><img src='figures/fig_residuals_M2_without2014.jpg' width=1200> Whew. At least now the results are plausible. Maddux is a lock, and Glavine and Thomas are above 50% (just barely).</p>
</div>
<div id="residual-analysis-1" class="slide section level1">
<h1>Residual Analysis:</h1>
Top-5 and Bottom-5 residuals:
<pre>
Year Name Actual Predicted Residual
1 2001 Kirby Puckett 82.1 12.0 70.1
2 1999 Robin Yount 77.5 8.6 68.9
3 1999 George Brett 98.2 54.3 43.9
4 2004 Paul Molitor 85.2 50.3 34.9
5 2005 Wade Boggs 91.9 60.4 31.4
</pre>
<pre>
Year Name Actual Predicted Residual
1 2013 Barry Bonds 36.2 99.3 -63.1
2 2013 Roger Clemens 37.6 97.9 -60.3
3 2008 Tim Raines 24.3 81.7 -57.4
4 2007 Jose Canseco 1.1 38.6 -37.5
5 2007 Mark McGwire 23.5 58.7 -35.2
</pre>
<ul>
<li>Any thoughts?</li>
</ul>
</div>
<div id="residual-analysis-2" class="slide section level1">
<h1>Residual Analysis:</h1>
Top-5 and Bottom-5 residuals:
<pre>
Year Name Actual Predicted Residual
1 2001 Kirby Puckett 82.1 12.0 70.1
2 1999 Robin Yount 77.5 8.6 68.9
3 1999 George Brett 98.2 54.3 43.9
4 2004 Paul Molitor 85.2 50.3 34.9
5 2005 Wade Boggs 91.9 60.4 31.4
</pre>
<pre>
Year Name Actual Predicted Residual
1 2013 Barry Bonds 36.2 99.3 -63.1
2 2013 Roger Clemens 37.6 97.9 -60.3
3 2008 Tim Raines 24.3 81.7 -57.4
4 2007 Jose Canseco 1.1 38.6 -37.5
5 2007 Mark McGwire 23.5 58.7 -35.2
</pre>
<ul>
<li>Any thoughts?</li>
<li>We can try some milestones, like 3000 hits, and also a variable for 'Played for only one team entire career'.</li>
<li>Specifically, we'll add: Rookie of the Year, MVP, Cy Young, 3000 hits, 500 HR, 300 wins, 3000 strikeouts, and 'Oneteam' (there is a list available online for those with > 15 years of service. I cheated and included Puckett).</li>
</ul>
</div>
<div id="improving-the-model-for-returning-players" class="slide section level1">
<h1>Improving the model for returning players</h1>
<ul>
<li>Previous Year's voting percentage</li>
<li>Previous Year's voting percentage squared (to model a potentially quadratic relationship)</li>
<li>"Top 3 First Ballot": The mean voting percentage of the top 3 first-ballot players in a given year. This variable we expect to have a negative association with a returning player's voting percentage, as it did in 1999, for example, when first-timers George Brett, Nolan Ryan, and Robin Yount all received very high voting percentages, and most returning players received a lower voting percentage than their previous year.</li>
<li>"Top 5 Returning Ballot": The mean of the top five voting percentages from the previous year of players who are re-appearing on this year's ballot; this should also have a negative correlation with the current year's voting percentages.</li>
<li>Indicator of whether this is a player's second or final ballot (two years in which an extra bump often occurs)</li>
<li>Interaction between indicator of second ballot and previous year's percentage -- included specifically to capture the fact that first-ballot players who get between 60 and 75% of the vote percentage tend to increase a lot in their second year.</li>
<li>All of this based on EDA and residual plots</li>
</ul>
</div>
<div id="returning-player-updated-model" class="slide section level1">
<h1>Returning Player Updated Model</h1>
<ul>
<li>All the coefficients are statistically significant (so there's that, at least)</li>
<li>Do the predictions improve? Recall that the baseline RMSE for this group (n = 262) was 5.7%.
<pre>
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.104016 0.004748 -232.535 &lt; 2e-16 ***
X.scaleprev1 1.456224 0.018842 77.288 &lt; 2e-16 ***
X.scaleprev1.squared -0.419837 0.017857 -23.510 &lt; 2e-16 ***
X.scaletop3 -0.189887 0.004369 -43.460 &lt; 2e-16 ***
X.scalereturn -0.016023 0.004591 -3.490 0.000482 ***
X.scaleballot2ndyear -0.071682 0.009175 -7.813 5.60e-15 ***
X.scaleballotfinal 0.026895 0.004286 6.276 3.48e-10 ***
X.scaleballot2nd.x.prev1 0.091116 0.008260 11.031 &lt; 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
</pre>
</li>
</ul>
</div>
<div id="new-rmse-results-1" class="slide section level1">
<h1>New RMSE results:</h1>
<table>
<thead>
<tr class="header">
<th align="left">Group</th>
<th align="left">Baseline</th>
<th align="left">Awards + Drugs</th>
<th align="left">Milestones, One-team, and 'Returning'</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">First-ballot Batters (n = 151)</td>
<td align="left">18.4%</td>
<td align="left">15.2%</td>
<td align="left">13.5%</td>
</tr>
<tr class="even">
<td align="left">First-ballot Pitchers (n = 85)</td>
<td align="left">9.7%</td>
<td align="left">8.6%</td>
<td align="left">9.6%</td>
</tr>
<tr class="odd">
<td align="left">Returning Players (n = 262)</td>
<td align="left">5.7%</td>
<td align="left">5.7%</td>
<td align="left">4.6%</td>
</tr>
<tr class="even">
<td align="left">Overall</td>
<td align="left">11.7%</td>
<td align="left">10.0%</td>
<td align="left">9.1%</td>
</tr>
</tbody>
</table>
<p>Well, it's not great that the 1st-ballot pitchers RMSE went up (adding milestones, or rookie of the year, or something made the model perform worse for them), but we'll keep the effect in.</p>
</div>
<div id="one-more-change" class="slide section level1">
<h1>One more change:</h1>
<ul>
<li>Let's add in a few interaction effects between the 'drugs' variable and slugging variables, since you could argue that drug use is more damaging to one's reputation (and helpful to one's career) for sluggers.
<ul>
<li>Include drugs <span class="math"> × </span> SLG and drugs <span class="math"> × </span> HR.</li>
<li>Also let's include 'drugs' interaction with MVP awards and Cy Young Awards (to additionally 'downweight' Clemens and Bonds -- a little bit of snooping here...)</li>
</ul></li>
<li>With this change, our final 2014 predictions are:
<pre style='font-size:14px'>
Name Previous Predicted
1 Greg Maddux 0.0 97.4
2 Craig Biggio 68.2 73.7
3 Jack Morris 67.7 70.2
4 Frank Thomas 0.0 64.2
5 Mike Piazza 57.8 63.7
6 Jeff Bagwell 59.6 60.4
7 Tom Glavine 0.0 53.5
8 Tim Raines 52.2 52.7
9 Lee Smith 47.8 47.7
10 Mike Mussina 0.0 42.4
11 Curt Schilling 38.8 38.5
12 Roger Clemens 37.6 36.8
13 Barry Bonds 36.2 34.8
14 Edgar Martinez 35.9 33.1
15 Alan Trammell 33.6 30.4
16 Larry Walker 21.6 17.5
17 Fred McGriff 20.7 16.7
18 Mark McGwire 16.9 13.5
19 Don Mattingly 13.2 10.8
20 Luis Gonzalez 0.0 9.6
21 Sammy Sosa 12.5 9.4
22 Moises Alou 0.0 8.7
23 Rafael Palmeiro 8.8 8.2
24 Jeff Kent 0.0 5.1
</pre></li>
<li>These predictions looked decent at the time, although I thought Frank Thomas should have been a bit higher.</li>
</ul>
</div>
<div id="checking-the-2014-predictions-ouch" class="slide section level1">
<h1>Checking the 2014 Predictions (ouch!)</h1>
<p><img src='figures/fig_2014_intervals_updated.jpg' width=1050></p>
</div>
<div id="lessons-learned" class="slide section level1">
<h1>Lessons learned?</h1>
<ul>
<li>RMSE for 2014 was 9.9%, compared to 9.0% historically. Not terrible. But misleading since very low data points (&lt; 2%, for example) are equally weighted in RMSE calculation. They're just not that interesting or important.</li>
<li>Postseason success might be a big factor. The data is a bit harder to gather.</li>
<li>Is there a teammate effect? (Maddux + Glavine?)</li>
<li>The predictions are dependent. We underestimated the first-ballot voting percentages in 2014, and thus, our model also overestimated the voting percentages of <em>every</em> returning player (because of the 'top-3' variable in the Group 3 = 'Returning' regression.)</li>
<li>Also, historical RMSE is fine to look at, but with Clemens/Bonds, there may have simply been no way for us to predict their voting percentages, since there were truly none like them before (in the modern era, or in any era when including the drugs variable).</li>
</ul>
</div>
<div id="predictions-drum-roll-please..." class="slide section level1">
<h1>2015 predictions (drum roll, please...)</h1>
<pre style='font-size:14px'>
Name Previous Predicted
1 Randy Johnson 0.0 99.8
2 Pedro Martinez 0.0 93.5
3 John Smoltz 0.0 72.4
4 Craig Biggio 74.8 69.2
5 Mike Piazza 62.2 59.2
6 Jeff Bagwell 54.3 51.3
7 Tim Raines 46.1 41.9
8 Roger Clemens 35.4 29.3
9 Barry Bonds 34.7 28.5
10 Lee Smith 29.9 23.4
11 Curt Schilling 29.2 22.6
12 Edgar Martinez 25.2 18.7
13 Alan Trammell 20.8 14.9
14 Mike Mussina 20.3 13.8
15 Jeff Kent 15.2 9.9
16 Fred McGriff 11.7 8.7
17 Mark McGwire 11.0 8.3
18 Larry Walker 10.2 7.8
19 Don Mattingly 8.2 7.8
20 Nomar Garciaparra 0.0 7.8
21 Gary Sheffield 0.0 7.6
22 Sammy Sosa 7.2 6.4
23 Troy Percival 0.0 5.3
24 Carlos Delgado 0.0 1.9
</pre>
</div>
<div id="predictions-drum-roll-please...-1" class="slide section level1">
<h1>2015 predictions (drum roll, please...)</h1>
<pre style='font-size:14px'>
Name Previous Predicted
1 Randy Johnson 0.0 99.8
2 Pedro Martinez 0.0 93.5
3 John Smoltz 0.0 72.4
4 Craig Biggio 74.8 69.2
5 Mike Piazza 62.2 59.2
6 Jeff Bagwell 54.3 51.3
7 Tim Raines 46.1 41.9
8 Roger Clemens 35.4 29.3
9 Barry Bonds 34.7 28.5
10 Lee Smith 29.9 23.4
11 Curt Schilling 29.2 22.6
12 Edgar Martinez 25.2 18.7
13 Alan Trammell 20.8 14.9
14 Mike Mussina 20.3 13.8
15 Jeff Kent 15.2 9.9
16 Fred McGriff 11.7 8.7
17 Mark McGwire 11.0 8.3
18 Larry Walker 10.2 7.8
19 Don Mattingly 8.2 7.8
20 Nomar Garciaparra 0.0 7.8
21 Gary Sheffield 0.0 7.6
22 Sammy Sosa 7.2 6.4
23 Troy Percival 0.0 5.3
24 Carlos Delgado 0.0 1.9
</pre>
<p><img src='figures/johnson-plaque.png' width=200></p>
</div>
<div id="predictions-drum-roll-please...-2" class="slide section level1">
<h1>2015 predictions (drum roll, please...)</h1>
<pre style='font-size:14px'>
Name Previous Predicted
1 Randy Johnson 0.0 99.8
2 Pedro Martinez 0.0 93.5
3 John Smoltz 0.0 72.4
4 Craig Biggio 74.8 69.2
5 Mike Piazza 62.2 59.2
6 Jeff Bagwell 54.3 51.3
7 Tim Raines 46.1 41.9
8 Roger Clemens 35.4 29.3
9 Barry Bonds 34.7 28.5
10 Lee Smith 29.9 23.4
11 Curt Schilling 29.2 22.6
12 Edgar Martinez 25.2 18.7
13 Alan Trammell 20.8 14.9
14 Mike Mussina 20.3 13.8
15 Jeff Kent 15.2 9.9
16 Fred McGriff 11.7 8.7
17 Mark McGwire 11.0 8.3
18 Larry Walker 10.2 7.8
19 Don Mattingly 8.2 7.8
20 Nomar Garciaparra 0.0 7.8
21 Gary Sheffield 0.0 7.6
22 Sammy Sosa 7.2 6.4
23 Troy Percival 0.0 5.3
24 Carlos Delgado 0.0 1.9
</pre>
<p><img src='figures/pedro-plaque.png' width=200></p>
</div>
<div id="predictions-drum-roll-please...-3" class="slide section level1">
<h1>2015 predictions (drum roll, please...)</h1>
<pre style='font-size:14px'>
Name Previous Predicted
1 Randy Johnson 0.0 99.8
2 Pedro Martinez 0.0 93.5
3 John Smoltz 0.0 72.4
4 Craig Biggio 74.8 69.2
5 Mike Piazza 62.2 59.2
6 Jeff Bagwell 54.3 51.3
7 Tim Raines 46.1 41.9
8 Roger Clemens 35.4 29.3
9 Barry Bonds 34.7 28.5
10 Lee Smith 29.9 23.4
11 Curt Schilling 29.2 22.6
12 Edgar Martinez 25.2 18.7
13 Alan Trammell 20.8 14.9
14 Mike Mussina 20.3 13.8