-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert SAS to STATA import DO file.py
260 lines (218 loc) · 9.63 KB
/
convert SAS to STATA import DO file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
"""
June 2016
Robert Gambrel
Context: I need to take the SAS import text file and convert it to a STATA
import do file. They use different syntax but have patterns that should convert
from one to the other.
"""
import re
import os
import collections
## Changes for USER to Make:
# directory of ASCII Dataset, which is also where SAS code should be and
# STATA text will be output to
data_directory = '/Users/worker/ASCII_DIRECTORY'
# Name of the SAS script
sas_script_name = 'original_SAS.sas.txt'
# ASCII name, which will also be recoded as the dataset for STATA to save to
base_name = 'raw ASCII name'
## END THINGS TO CHANGE
os.chdir(data_directory)
# use base name to construct name of ASCII files and output Stata dataset
# use extra quotes so they're saved as strings
ascii_name = '"{}.asc"'.format(base_name)
stata_name = '"{}.dta"'.format(base_name)
# import SAS script
sas = open(sas_script_name, 'r').readlines()
# set up dictionary for recoding missing items
# This is the top section of the SAS dataset. Depending on variable type, different
# values are recoded as missing. This searches them and sets up a list automatically.
# It also looks up each SAS data type (string, int, float) and sets up a corresponding
# Stata data type (char, int, byte)
missing_dict = {}
types_dict = {}
for i in range(0, len(sas)-1):
line = sas[i]
# token for start of lines indicating how to recode missing values
if re.search("INVALUE", line):
# what is the name for the recode rule
missing_key = re.findall('INVALUE ([a-zA-Z0-9_]+)', line)[0]
missing_dict[missing_key] = []
j = 1
while True:
# go line by line until you see ';', which is SAS's end delimiter
if re.search(';', sas[i+j]):
break
# if there's a match for the missing key, add it to a list. If not,
# then catch the IndexError and move on
try:
recoded_value = re.findall('\'(-[0-9\.]+)', sas[i+j])[0]
except IndexError:
pass
# the missing are currently coded as strings. Convert to float /
# int as appropriate
if float(recoded_value) % 1 != 0:
recoded_value = float(recoded_value)
else:
recoded_value = int(recoded_value)
missing_dict[missing_key].append(recoded_value)
j += 1
# SAS uses this formatting rule in the import scripts I'm modifying:
# N#P#F. N# indicates digits before the decimal, P# indicates digits after.
# So use that to translate to Stata format: long (integer) or byte (float)
if re.search("N[0-9]+PF", missing_key):
types_dict[missing_key] = "long"
elif re.search("N[0-9]+P[0-9]+F", missing_key):
types_dict[missing_key] = "byte"
elif re.search("DATE", missing_key):
types_dict[missing_key] = 'date'
# Define the 'variable' class. Each variable has a variable name, label, length, and
# type
class Variable():
""" A Variable has a name, label, length, start point (in the raw data), end point,
a variable type (str, int, etc.), and a recode list for missing values
"""
def __init__(self, name):
self.name = name
self.label = None
self.length = None
self.vartype = None
self.recode_list = None
self.start = None
self.end = None
def get_name(self):
return self.name
def add_label(self, new_label):
self.label = new_label
def get_label(self):
return self.label
def add_length(self, new_length):
self.length = new_length
def get_length(self):
return self.length
def add_vartype(self, new_vartype):
self.vartype = new_vartype
def get_vartype(self):
return self.vartype
def add_recode_list(self, new_recode_list):
self.recode_list = new_recode_list
def get_recode_list(self):
return self.recode_list
def add_start(self, new_start):
self.start = new_start
def get_start(self):
return self.start
def add_end(self, new_end):
self.end = new_end
def get_end(self):
return self.end
# set up a list to hold variables
stata_values = collections.OrderedDict()
line_start = False
# use indexing, so I can pull information from subsequent lines when necessary
for i in range(0, len(sas)):
line = sas[i]
# "length=" in the line means it's a variable name. Pull the name and length
if re.search("LENGTH=", line):
variable_name = re.findall('([a-zA-Z0-9_]+) ', line)[0]
stata_var = Variable(variable_name)
variable_length = int(re.findall('LENGTH=\$*([0-9]+)', line)[0])
stata_var.add_length(variable_length)
# labels start at various points. account for some that take two lines,
# and some only take one. They can also start 2 lines below, 3 lines below,
# or directly below
if re.search("\"", sas[i+2]):
first_label_line = re.findall("\"(.*)\\n", sas[i+2])[0]
try:
second_label_line = re.findall("(.*)\"", sas[i+3])[0]
except IndexError:
second_label_line = ''
variable_label = first_label_line + second_label_line
elif re.search("\"", sas[i+3]):
first_label_line = re.findall("\"(.*)\\n", sas[i+3])[0]
try:
second_label_line = re.findall("(.*)\"", sas[i+4])[0]
except IndexError:
second_label_line = ''
variable_label = first_label_line + second_label_line
# single-line labels start directly below
else:
variable_label = re.findall("LABEL=\"(.*)\"", sas[i+1])[0]
# preserver it as a hard string
stata_var.add_label('"{}"'.format(variable_label))
#globals()[variable_name] = stata_var
stata_values[variable_name] = stata_var
# at the bottom of the file, the @ values indicate starting points, and the
# numbers and letters at the end of the line indicate variable type. Pull the
# starting points and use the type to convert to Stata type, using the conversion
# dictionary above
# example line: @12 Var2 N2P4F.
if re.search("\@[0-9]+", line):
variable_name = re.findall("\@[0-9]+\s*([a-zA-Z0-9_]+) ", line)[0]
variable_start = int(re.findall("\@([0-9]+)", line)[0])
# base it off the next row if I can. If it's the last one, do start + length,
# where length is defined above as an attribute of each variable object (including
# the last one)
try:
stop = int(re.findall("\@([0-9]+)", sas[i+1])[0]) - 1
except IndexError:
stop = variable_start + stata_values[variable_name].get_length()
stata_values[variable_name].add_start(variable_start)
stata_values[variable_name].add_end(stop)
# Find a series of characters before a period - this is the recode key
recode_key = re.findall("(\S+)\.", line)[0]
# pull recode key if it exists
recode_list = missing_dict.get(recode_key, None)
stata_values[variable_name].add_recode_list(recode_list)
# pull type; if it's not in the list, assume a character type
stata_values[variable_name].add_vartype(types_dict.get(recode_key, "char"))
# the ASCII raw data might have a line or two at the top with descriptions.
# Pull the info in the SAS Import that says which line to start with.
# Note above line_start was set to False. Immediately below, this condition
# affects how the beginning of the Stata file is written.
if re.search("INFILE", line):
try:
line_start = re.findall("FIRSTOBS=([0-9]+)", line)[0]
except IndexError:
line_start = False
# start writing the output file:
if line_start:
output_lines = ['clear all',
'cd "{}"'.format(data_directory),
'/// Code to import {}'.format(ascii_name),
'infix {} firstlineoffile ///'.format(line_start)]
else:
output_lines = ['clear all',
'cd "{}"'.format(data_directory),
'/// Code to import {}'.format(ascii_name),
'infix ///']
# lines to read in data (each line is a type, variable name, start-end, and ///
# to continue to next line):
for variable in stata_values.keys():
new_line = '{} {} {}-{} ///'.format(stata_values[variable].get_vartype(),
variable,
stata_values[variable].get_start(),
stata_values[variable].get_end())
output_lines.append(new_line)
# using data source:
output_lines.append('using ' + ascii_name)
# label variables:
for variable in stata_values.keys():
new_line = 'label var {} {}'.format(variable, str(stata_values[variable].get_label()))
output_lines.append(new_line)
# recode as needed
for variable in stata_values.keys():
recode = stata_values[variable].get_recode_list()
if recode:
recode = ' '.join(str(item) for item in recode)
new_line = 'recode {} ( {} = .)'.format(variable, recode)
output_lines.append(new_line)
recode = None
# line in STATA script to save the file as a dta
output_lines.append('save {}, replace'.format(stata_name))
# create a new file to write a DO file to
output_file = open('{}.do'.format(base_name), 'w')
output_file.flush()
for line in output_lines:
output_file.write(line + ' \n')
output_file.close()