/
surveySetup.py
executable file
·208 lines (166 loc) · 8.31 KB
/
surveySetup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# python_version: 3.5.2
'''
title: surveySetup.py
author: Paula Conn
version: 3
status: in-use
description: This python script can be used after each IST and SWEN data
collection period. It will automatically select a survey winner among eligible
students (Eligible-Participants.csv), output repeated entries, and format the
results to numeric values with no extraneous columns created by SurveyMonkey.
latest update: added encoding to work with other computers (11/1/18), added
functionality for CS surveys and created ordered output in csv (11/2/18),
added isCS to arrangedata function (2/15/19).
'''
try:
import sys
import pandas as pd
import numpy as np
import re
from random import randint
import csv
except ImportError:
print("Error: missing a library (pandas, numpy, random, re, etc.).",
"Please install. \n NOTE: If error is with pandas module, ensure that the",
"code is running on the latest version of python")
sys.exit()
def formatScale (column_list, data, scale):
data.iloc[:,column_list] = data.iloc[:,column_list].applymap(lambda s:
scale.get(s) if s in scale else s)
def assignScale(data):
data = data
# place numeric equivalents for likert questions
defaultScale = {'Agree very much': 6, 'Agree pretty much': 5, 'Agree a little': 4,
'Disagree a little': 3, 'Disagree pretty much': 2, 'Disagree very much': 1}
IDPcolumns = [i for i in range(8,28) if i not in (17,21,22)]
formatScale (IDPcolumns, data, defaultScale)
# place reverse numeric equivalents for likert (scale:1-6) questions: 10,14,15
reverseScale = {'Agree very much': 1, 'Agree pretty much': 2, 'Agree a little': 3,
'Disagree a little': 4, 'Disagree pretty much': 5, 'Disagree very much': 6}
data = data.replace({'I am aware of the problems that disabled people face': reverseScale,
'I dont pity them': reverseScale,
'After frequent contact I find I just notice the person not the disability': reverseScale})
# change questions for experience questions
expScale = {'I have personal experience with this': 2, 'I have knowledge of this': 1}
formatScale (list(range(29,37)), data, expScale)
# change software questions
swScale = {'I have heard or read about this': 1, 'I have done this before': 2}
formatScale (list(range(38,46)), data, swScale)
# change web design questions. If you receive an error of:
# 'SyntaxError: Non-ASCII character '\xe2' ensure that the code is running on
# the latest version of Python. If you are using SublimeText, a custom build
# will be needed to use Python3.
webScale = {'I’m familiar with this issue': 1,
'I have taken this issue into account to make the site more accessible for people with disabilities': 2,
'I have taken this issue into account to make it more accessible for people with disabilities': 2}
formatScale (list(range(46,62)), data, webScale)
# change boolean questions
booleanScale = {"Yes": 1, "No": 0}
formatScale ([62,63], data, booleanScale)
return(data)
def selectRaffleWinner (df, university, collector1, collector2):
# format student e-mails by removing the @email.com
df.iloc[:,10] = df.iloc[:,10].str.lower()
if university.lower() == 'rit':
df.iloc[:,10] = df.iloc[:,10].str.replace('@rit.edu','')
df.iloc[:,10] = df.iloc[:,10].str.replace('@g.rit.edu','')
if university.lower() == 'unt':
df.iloc[:,10] = df.iloc[:,10].str.replace('@unt.edu','')
df.iloc[:,10] = df.iloc[:,10].str.replace('@g.unt.edu','')
# subset current collection list
df_ISTE = df.loc[df['Collector ID'] == collector1].dropna(axis=1, how='all')
df_SWEN= df.loc[df['Collector ID'] == collector2].dropna(axis=1, how='all')
# combine both survey sets and remove empty columns created by Survey Monkey
df_ISTE_SWEN = df_ISTE.append(df_SWEN)
if ('IP Address' in df_ISTE_SWEN.columns):
df_ISTE_SWEN = df_ISTE_SWEN.drop('IP Address', 1)
# check eligible students (completed 80% of the survey) and choose winner
eligible = df_ISTE_SWEN.copy()
raffle_threshold = len(df_ISTE_SWEN.columns)*.2
# document and display warning if any entries are duplicates
try:
dup_emails = pd.concat(g for _, g in eligible.groupby(eligible.iloc[:, 5])if len(g) > 1)
dup_emails.to_csv('Duplicate-Entries.csv', sep=",")
dup_eligible = list(set(dup_emails.iloc[:, 5]).intersection(set(eligible.iloc[:, 5])))
for dup in dup_eligible:
if (eligible.iloc[:,5] == dup).sum()>1:
print("Warning duplicate: " + dup)
print("\n Program stopped due to duplicates. Please resolve duplicate",
"entries in the spreadsheet and run the program again. \n")
sys.exit()
except:
print("No duplicate entries\n")
# add a column calculating the incomplete answers for future reference
eligible['Total Incomplete Answers']= pd.isnull(eligible).astype(int).sum(axis=1)
eligible = eligible.loc[eligible['Total Incomplete Answers']<=raffle_threshold]
# select the raffle winner
random_winner = randint(0,len(eligible))
print('Raffle Winner: ', eligible.iloc[random_winner,4], eligible.iloc[random_winner,5])
eligible.to_csv('Eligible-Participants.csv', sep=",")
return(df_ISTE_SWEN)
def arrangeData (data, isCS):
# arrange data based on master spreadsheet
try:
if isCS:
order=pd.read_csv('CS-Column-Order.csv', encoding = "utf-8")
else:
order=pd.read_csv('Column-Order.csv', encoding = "utf-8")
except (FileNotFoundError, OSError, UnicodeDecodeError):
print("\nThe CSV file for the final column order was not found OR was not in utf-8 format.",
"Please add the spreadsheet to the location of this file and name it: Column-Order.csv\n")
sys.exit()
# remove punctuation in column headers
data.columns = data.columns.str.replace(r'[^\w\s]+', '')
reordered_data = data.reindex(columns=list(order))
mismatch_columns = list(set(reordered_data) - set(data))
if len(mismatch_columns) > 0:
print('\nError: Columns the columns do not match, please check both the',
'Column-Order.csv and Survey-for-ISTE-and-SWEN.csv. It is possible that no',
'responses were submitted for a column and the column header was deleted.',
' Below are the mismatched columns:\n %s \n', mismatch_columns)
# sys.exit()
return(reordered_data)
def main():
##################
# Change the variables below to what you need:
# Ensure that both collector IDs are for the same term and interval (e.g., Fall Pre)
##################
collector1 = 99123866
collector2 = 99123877 #Assign to None if CS
university = 'rit'
isCS = False
################
try:
df=pd.read_csv('Survey-for-ISTE-and-SWEN.csv', encoding = "utf-8", sep=',')
except (FileNotFoundError, OSError):
print('\nThe CSV file was not found. Please add the spreadsheet to the location',
'of this file and name it: Survey-for-ISTE-and-SWEN.csv\n')
sys.exit()
# Uncomment the code below if CS data includes new formatting by SurveyMonkey
# if isCS:
# if ('Respondent ID' in df.columns):
# df = df.drop('Respondent ID', 1)
# import and format column headers
df.columns.values[13:33] = df.iloc[0,13:33]
df.columns.values[43:67] = df.iloc[0,43:67]
df.columns.values[43:51] = [str(col) + '_web' for col in df.columns[43:51]]
df.columns.values[34:42] = df.iloc[0,34:42]
df.columns.values[34:42] = [str(col) + '_exp' for col in df.columns[34:42]]
df.columns=df.columns.str.replace('[^\w\s]','')
if isCS:
df = df.loc[df['Collector ID'] == collector1].dropna(axis=1, how='all')
if ('IP Address' in df.columns):
df = df.drop('IP Address', 1)
else:
if None in (collector1, collector2):
print('\nA collector ID was not found. When running non-CS data, please'
'enter both collector IDs for IT and SE students.\n')
sys.exit()
df = selectRaffleWinner(df, university, collector1, collector2)
df = assignScale(df)
df = arrangeData(df, isCS)
df.to_csv('Formatted-Data.csv', sep=",", encoding = 'utf-8-sig')
if __name__ == '__main__':
main()