# Class 3 - Reading and Writing Data.R
################## Reading Data #########################
## Reading data from program editor

## Create a matrix
# 1:100 laid out in 5 columns gives a 20 x 5 matrix.
input.matrix <- matrix(1:100, ncol = 5)

## See the data
input.matrix # we see that a matrix with 5 columns and 20 rows has been formed

# Uniform random numbers scaled to the range 0 to 1000
rr <- 1000 * runif(2)
class(rr)
# as.integer() truncates toward zero, so these integers fall in 0..999
rr <- as.integer(1000 * runif(2))
class(rr)
mode(rr)

unif <- as.integer(1000 * runif(200000))
plot(unif)
head(unif)
hist(unif)

# Scenario
# Application: 300,000
# Selection: 60,000
Prob <- 60000 / 300000
Appl.num <- seq(1, 300000, 1)
head(Appl.num)
# Draw 60,000 distinct application numbers (sampling without replacement).
selected <- sample(Appl.num, 60000, replace = FALSE)
selected.df <- data.frame(selected)

LETTERS[1:4]
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned.
sample(letters[1:4], 10, replace = TRUE)
## Create a Data Frame
# ID: 1..10; Class: 10 random draws from "a".."d"; Value: 1..5 repeated twice.
# rep() states the repetition explicitly; seq(1:5) only produced 10 values
# because data.frame() silently recycled a length-5 column.
input.df <- data.frame(
  ID = 1:10,
  Class = sample(letters[1:4], 10, replace = TRUE),
  Value = rep(1:5, times = 2)
)

## View the data frame
# View() opens a viewer window, so it is skipped in non-interactive runs.
if (interactive()) {
  View(input.df)
}
# we see that the data frame has 3 columns named ID, Class, and Value
# The data frame also has 10 rows of data

## See the column names
colnames(input.df)
names(input.df)
row.names(input.df)

## See the dimensions
dim(input.df)
## Set the Working Directory
getwd()

## Reading data from Comma separated file (csv)
# Absolute paths are machine-specific, so only one location is active;
# reading from both back to back fails on whichever machine lacks the other.
input_csv.df <- read.csv(file = "C:\\Ram\\R for Data Science\\data\\binary.csv")
# Alternative location of the same file on another machine:
# input_csv.df <- read.csv(file = "C:\\Moksha\\MICA\\PGP2\\AMMA\\Data\\data_2017\\binary.csv")

# reset directory and read file
# After setwd() the relative file names below resolve against this folder.
setwd("C:\\Ram\\R for Data Science\\data")
input_csv.df <- read.csv("binary.csv", header = TRUE)
getwd()

## See the structure of the data
str(input_csv.df)

## reading the file to validate the data
sum(input_csv.df$gre)

## Reading Date Values
# stringsAsFactors = TRUE is explicit: since R 4.0 the default is FALSE, and
# without it the date column would arrive as character, not factor.
input_wthdt.df <- read.csv("binary_withdate.csv", stringsAsFactors = TRUE)
str(input_wthdt.df)
tab <- data.frame(table(input_wthdt.df$application_date))
sum(tab$Freq)
# we see the date field has been read as factor

# read Date as character instead of factor
input_wthdt.df <- read.csv("binary_withdate.csv", stringsAsFactors = FALSE)
str(input_wthdt.df)
## working with dates
d <- "2004-12-03"
d
class(d)
# as.Date() parses "YYYY-MM-DD" strings without a format argument.
d1 <- as.Date(d)
class(d1)
mode(d1)

## find system date
s <- Sys.Date()
s

## Current date and time
# NOTE(review): the variable name `c` shadows base::c as a value; calls such
# as c(1, 2) still find the function. The name is kept so existing
# references keep working.
c <- date()
c

# reading dates with other than default format
# %d - day of month, e.g. 1, 2 etc
# %m - month as a number
# %b - abbreviated month name (Jan, Feb)
# %B - full month name (January)
# %y - 2 digit year
# %Y - 4 digit year
# Month names (%b, %B) are matched in the current locale.
d2 <- as.Date("12-January-2012", format = "%d-%B-%Y")
d2
class(d2)
format(d2, "%B")
d3 <- as.Date("12-February-2012", format = "%d-%B-%Y")
d3 <- as.Date("12-February-12", format = "%d-%B-%y")
d4 <- as.Date("12-12-12", format = "%d-%m-%y")
d2
dd <- format(d2, "%d/%B/%Y")

## Calculate age
dob <- as.Date("12-Jan-1983", format = "%d-%b-%Y")
dob
today <- Sys.Date()
age <- difftime(today, dob, units = "days")
# Completed years: difference of calendar years, minus one when this year's
# birthday has not happened yet. Dividing the day count by 365 ignores leap
# days and can be off by one around the birthday.
age_years <- as.integer(format(today, "%Y")) - as.integer(format(dob, "%Y"))
if (format(today, "%m%d") < format(dob, "%m%d")) {
  age_years <- age_years - 1L
}
age_years
## Correct the date format
# Parse the month/day/year text into a Date column stored next to the
# original.
input_wthdt.df$application_date1 <- as.Date(
  input_wthdt.df$application_date,
  format = "%m/%d/%Y"
)
str(input_wthdt.df)

## Read the data specifying the Class of the data
# For a colClasses entry that is not a built-in type, read.csv() converts
# the column with methods::as(), so "myDate" must be registered together
# with a character -> myDate coercion before the file is read.
methods::setClass("myDate")
methods::setAs(
  "character", "myDate",
  function(from) as.Date(from, format = "%m/%d/%Y")
)
input_wthdt.df1 <- read.csv(
  "binary_withdate.csv",
  colClasses = c(application_date = "myDate")
)
str(input_wthdt.df1)
## Read data from the web
## You can read a file directly from the internet by specifying the URL
input_webdata.df <- read.table("http://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat")
str(input_webdata.df)
# (A dangling `input_webdata.df$` used to sit here; the parser joined it to
# the head() call below and the script failed at that point.)

## Read the first few records of a dataset
head(input_webdata.df)
## The first 6 rows of data have been displayed
## Ques 1 : What if we need to display the first 10 rows instead?
## Ques 2 : What if we want to display the last few rows?

## Read a comma separated file with read.table by giving the separator
csv_file <- read.table(
  file = "C:\\Ram\\General 20150804 v1\\Trainings\\R Programming for Data Science\\data\\binary.csv",
  header = TRUE,
  sep = ","
)

## Read a Tab Delimited file
input_tabdlmtd.df <- read.table(
  file = "C:\\Ram\\R for Data Science\\data\\tab_delimited_data.txt",
  header = TRUE,
  sep = "\t"
)
head(input_tabdlmtd.df)

## We can use the same functions as with a csv file
## to read dates and modify formats with a tab-delimited file as well

## Read a file that uses "$" as the field separator
input_dollar.df <- read.table(
  file = "C:\\Ram\\R for Data Science\\data\\dollar_delimited_data.txt",
  header = TRUE,
  sep = "$"
)
## Read SAS dataset Data
# Install only when the package is missing, so re-running the script does
# not reinstall it every time.
if (!requireNamespace("sas7bdat", quietly = TRUE)) {
  install.packages("sas7bdat")
}
library(sas7bdat)
library(help = sas7bdat)
bank_ins <- read.sas7bdat("C:\\Ram\\R for Data Science\\data\\bank_additional_full.sas7bdat")
names(bank_ins)
str(bank_ins)

library(foreign)
library(help = foreign)
# Read SPSS file
# read.spss() returns a list here; data.frame() turns it into a table.
spss <- data.frame(read.spss("p004.sav"))
names(spss)
table(spss$PROTEIN)

# library(help = ) only shows the package index; the package itself has to
# be attached before sqldf() can be called.
library(help = sqldf)
library(sqldf)
# Count the rows per PROTEIN value, keeping only PROTEIN > 1.
sum.tb <- sqldf("select PROTEIN,count(*)
from spss
where PROTEIN>1
group by PROTEIN")
## Read Data From Facebook
# Install only the packages that are not available yet.
fb_pkgs <- c("Rfacebook", "RCurl", "rjson")
fb_missing <- fb_pkgs[!vapply(fb_pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(fb_missing) > 0) {
  install.packages(fb_missing)
}
library(Rfacebook)
library(RCurl)
library(rjson)
library(help = Rfacebook)

# connecting to Facebook
# https://developers.facebook.com/tools/explorer
# The access token is a credential and must not be stored in the script;
# it is read from the FB_ACCESS_TOKEN environment variable instead.
accessToken <- Sys.getenv("FB_ACCESS_TOKEN")
if (!nzchar(accessToken)) {
  stop("Set the FB_ACCESS_TOKEN environment variable to a Facebook access token.",
       call. = FALSE)
}

# Get data from a company page, extract data from a company's page on facebook
flipkartPage <- getPage(page = "flipkart",
                        token = accessToken,
                        n = 10)

# take the name from the link
flipkartPage <- getPage(page = "flipkart",
                        token = accessToken,
                        n = 150)

# NOTE(review): the URL is empty in the original script; fill in the page to
# download before running the lines below.
pages <- getURL("")
library(XML)
# Reads the second table found in the downloaded HTML (which = 2).
# NOTE(review): the original call was misspelled (reahHTMLTable) and passed an
# undefined object `string`; stringsAsFactors = FALSE is assumed to be what
# was meant - confirm.
overall_matces <- readHTMLTable(pages, header = TRUE, which = 2,
                                stringsAsFactors = FALSE)
################## Writing Data #########################
# NOTE: despite its name, one_row holds the third column of bank_ins.
one_row <- bank_ins[, 3]
# First 1000 rows as a smaller sample.
bank_ins.smpl <- bank_ins[1:1000, ]

## Save the R object
# NOTE(review): the file is named after the sample, but the object written
# to it is the full bank_ins - confirm which one was intended.
save(bank_ins, file = "bank_ins.smpl.Rda")

# remove
rm(bank_ins.smpl)
rm(bank_ins)

## Load the data back
# load() has to run first: bank_ins does not exist any more after rm(), so
# inspecting it before loading fails with "object not found".
load("bank_ins.smpl.Rda")
names(bank_ins)
## load datasets package
# datasets ships with R as a base package, so there is nothing to install;
# library() (unlike require()) stops with an error if loading fails.
library(datasets)
library(help = "datasets")

## Save the data as a csv file
tt <- mtcars
names(tt)
write.csv(mtcars, "mtcars.csv")
# Keep the car names as a regular column and drop them from the row names.
tt$carmodel <- row.names(mtcars)
row.names(tt) <- NULL
write.csv(tt, "C:\\Ram\\R for Data Science\\data\\mtcars.csv",
          row.names = FALSE)

## Save the data as a delimited text file ("%" is the field separator)
write.table(mtcars, "mtcars.txt", sep = "%", quote = FALSE, row.names = FALSE)

## write to file without column names
write.table(mtcars,
            "mtcars_noheader.txt",
            sep = "%",
            quote = FALSE,
            row.names = FALSE,
            col.names = FALSE)

# write.csv() always writes the header: it ignores col.names and only raises
# a warning, so the argument is left out here. The file content is unchanged;
# use write.table() as in the next call to really drop the header.
write.csv(mtcars,
          "mtcars_noheader1.txt",
          quote = FALSE,
          row.names = FALSE)

write.table(mtcars,
            "mtcars_noheadercsv.csv",
            sep = ",",
            quote = FALSE,
            row.names = FALSE,
            col.names = FALSE)
## Accessing data frame columns by bare name with attach()
# NOTE(review): `male` is not created anywhere in the visible script; it is
# assumed to be a data frame with an Age column that already exists in the
# workspace - confirm.
names(male)

# Explicit access works without attaching anything.
mean(male$Age)

# A global variable called Age would hide the attached column, so remove it
# if present (a bare rm(Age) warns when there is none).
if (exists("Age", envir = globalenv(), inherits = FALSE)) {
  rm(Age)
}

# attach() needs the object to attach; calling it with no argument is an
# error.
attach(male)
summary(Age)
Age
mean(Age)
# Undo the attach so the columns do not stay on the search path.
detach(male)
## Build a small data frame from inline text and write it out
# The first line of the text supplies the column names (header = TRUE).
myfile <- read.table(text="MyName Age
Ram 38
Shyam 25", header = TRUE)
write.csv(myfile, file = "myfile.csv")
# "$"-separated copy without row names and without a header line.
write.table(myfile,
            "myfile.txt",
            sep = "$",
            row.names = FALSE,
            col.names = FALSE)

## Data frame with a randomly generated Age column (normal, mean 50, sd 20)
students <- data.frame(Name = c("Ram", "Dinesh", "Mona", "Sona"),
                       Gender = c("Male", "Male", "Female", "Female"),
                       Age = rnorm(4, 50, 20))
# View() opens a viewer window, so it is skipped in non-interactive runs.
if (interactive()) {
  View(students)
}
write.csv(students,
          file = "students.csv",
          row.names = FALSE)
getwd()
# ----------------- Reference --------------------------
# http://dni-institute.in/blogs/read-large-files-into-r/
# Assignment 2
# Q(1): Download a csv file from https://data.oecd.org/healthstat/infant-mortality-rates.htm and read the csv file
# And remove the last variable
# Q(2): Read 2 tables from the html page https://en.wikipedia.org/wiki/India%E2%80%93Pakistan_cricket_rivalry
# Find the number of ODI matches won by India