/
run_analysis.r
94 lines (78 loc) · 4.55 KB
/
run_analysis.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# ---- Configuration ----------------------------------------------------------
# Folder the script switches into before doing any work.
WORKING_DIR <- "D:/Dropbox/Classes/GettingData/Project"

# Where the raw data archive is fetched from, and the local file it is saved as.
PROJECT_URL <- "http://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
PROJECT_ARCHIVE <- "archive.zip"

# File name of the summarised data set written at the end of the script.
PROJECT_OUTPUT <- "Summarized_HAR_Data.txt"

# Column names shared by every step of the script.
# NOTE: "DSESC" in CONST_BODYPOSITIONDSESC is a historical misspelling of
# "DESC"; the identifier is kept because the rest of the script refers to it.
CONST_SUBJECTID <- "subject_ID"
CONST_BODYPOSITIONID <- "bodyPosition_ID"
CONST_BODYPOSITIONDSESC <- "bodyPosition_Desc"
# ---- Working folder, download and extraction --------------------------------
# Switch to WORKING_DIR unless the session is already there.
# The two paths are compared literally after normalisation. The earlier check
# used the regular expression "^.+<dir>$", which requires at least one
# character in front of the path and interprets the path itself as a pattern,
# so it failed exactly when getwd() was equal to WORKING_DIR.
# TODO: submission probably does not want this
current_dir <- normalizePath(getwd(), winslash = "/", mustWork = FALSE)
target_dir <- normalizePath(WORKING_DIR, winslash = "/", mustWork = FALSE)
if (!identical(current_dir, target_dir)) {
  setwd(WORKING_DIR)
}

# Download the archive to the working directory (skipped when already present).
if (!file.exists(PROJECT_ARCHIVE)) {
  # mode = "wb" requests a binary transfer explicitly; a zip file must not go
  # through text-mode line-ending translation.
  download.file(url = PROJECT_URL, destfile = PROJECT_ARCHIVE, mode = "wb")
}

# ASSUMPTION: every file in the archive lives under one top-level subfolder.
# unzip(..., list = TRUE) only lists the archive contents; the first entry's
# path is split on "/" and its first component is taken as that subfolder.
archive_entries <- unzip(PROJECT_ARCHIVE, list = TRUE)$Name
if (length(archive_entries) == 0) {
  stop("Archive '", PROJECT_ARCHIVE, "' lists no files", call. = FALSE)
}
DATA_SUBFOLDER <- strsplit(archive_entries[1], "/", fixed = TRUE)[[1]][1]

# Extract only when the subfolder is not there yet.
if (!file.exists(DATA_SUBFOLDER)) {
  unzip(PROJECT_ARCHIVE)
}
#"Merges the training and the test sets to create one data set."
# Files used (notes from inspecting the archive):
# X_test.txt -- 561 datapoints x 2948 observations
# features.txt -- 2 datapoints x 561 observations; (column number, column name of datapoints in X_test)
# Y_test.txt -- 1 datapoint x 2948 observations; (number representing body position)
# activity_labels.txt -- 2 datapoints x 6 observations; (data value from Y_test, descriptive text)
# All relative paths below are resolved from inside the extracted subfolder.
setwd(DATA_SUBFOLDER)
FEATURE_NAMES <- read.table("features.txt", col.names = c("order", "featureName"), check.names = TRUE)
POSITIONS <- read.table("activity_labels.txt", col.names = c(CONST_BODYPOSITIONID, CONST_BODYPOSITIONDSESC))

# Adds the descriptive position text to a one-column table of position ids.
#
# The rows must stay in file order because they are later cbind()-ed next to
# the X and subject tables. merge() sorts its result by the key, which paired
# every measurement row with the wrong activity, so the text is looked up with
# match() instead.
#
# position_ids: data frame holding the CONST_BODYPOSITIONID column.
# positions:    lookup table with the id and description columns.
# Returns position_ids with the description column appended (id first,
# description second). Stops if an id has no entry in the lookup table.
label_positions <- function(position_ids, positions) {
  lookup_row <- match(position_ids[[CONST_BODYPOSITIONID]],
                      positions[[CONST_BODYPOSITIONID]])
  if (anyNA(lookup_row)) {
    stop("Found position ids that are missing from activity_labels.txt",
         call. = FALSE)
  }
  position_ids[[CONST_BODYPOSITIONDSESC]] <-
    positions[[CONST_BODYPOSITIONDSESC]][lookup_row]
  position_ids
}

# Read the test data
TEST_X <- read.table("test/X_test.txt", col.names = FEATURE_NAMES$featureName, check.names = TRUE)
TEST_Y <- read.table("test/Y_test.txt", col.names = c(CONST_BODYPOSITIONID))
TEST_Y <- label_positions(TEST_Y, POSITIONS)
TEST_SUBJECT <- read.table("test/subject_test.txt", col.names = c(CONST_SUBJECTID))
#TODO: Remove this before submission! (debug aid: work on the first rows only)
#TEST_X <- head(TEST_X)
#TEST_Y <- head(TEST_Y)
#TEST_SUBJECT <- head(TEST_SUBJECT)

# Read the training data the same way as the test data
TRAIN_X <- read.table("train/X_train.txt", col.names = FEATURE_NAMES$featureName, check.names = TRUE)
TRAIN_Y <- read.table("train/Y_train.txt", col.names = c(CONST_BODYPOSITIONID))
TRAIN_Y <- label_positions(TRAIN_Y, POSITIONS)
TRAIN_SUBJECT <- read.table("train/subject_train.txt", col.names = c(CONST_SUBJECTID))
#TODO: Remove this before submission! (debug aid: work on the first rows only)
#TRAIN_X <- head(TRAIN_X)
#TRAIN_Y <- head(TRAIN_Y)
#TRAIN_SUBJECT <- head(TRAIN_SUBJECT)

# Put measurements, position and subject side by side for each set, then stack
# the test rows on top of the training rows.
TESTTRAIN <- rbind(cbind(TEST_X, TEST_Y, TEST_SUBJECT),
                   cbind(TRAIN_X, TRAIN_Y, TRAIN_SUBJECT))
#"Extracts only the measurements on the mean and standard deviation for each measurement. "
#Uses descriptive activity names to name the activities in the data set
#Appropriately labels the data set with descriptive variable names. "
# From feature_info.txt: "The set of variables that were estimated from these signals are:
# mean(): Mean value
# std(): Standard deviation"
# The literal texts "mean()" and "std()" are matched (fixed = TRUE), so only
# those two estimates are kept. A bare "mean" pattern would also pull in the
# meanFreq() features, which are a different estimate.
feature_labels <- as.character(FEATURE_NAMES$featureName)
AVGS_STDS <- c(grep("mean()", feature_labels, value = TRUE, fixed = TRUE),
               grep("std()", feature_labels, value = TRUE, fixed = TRUE))

# make.names() applies the same sanitising that read.table(check.names = TRUE)
# applied to the column headers, so the selected names line up with TESTTRAIN.
COLNAMES <- make.names(c(CONST_SUBJECTID, CONST_BODYPOSITIONDSESC, AVGS_STDS))

# Keep subject, activity description and the selected measurements, in that order.
TESTTRAIN <- TESTTRAIN[, COLNAMES, drop = FALSE]
#"Creates a second, independent tidy data set with the average of each variable for each activity and each subject."
# Average every measurement column for each subject / activity combination.
# Base R's aggregate() is used so the script runs in a fresh session: melt()
# and dcast() were called although no package providing them is loaded anywhere
# in this script. It also lets the grouping columns come from the CONST_* names
# instead of being hard-coded in a formula.
id_columns <- c(CONST_SUBJECTID, CONST_BODYPOSITIONDSESC)
measure_columns <- setdiff(names(TESTTRAIN), id_columns)
FINAL_ANSWER <- aggregate(TESTTRAIN[measure_columns],
                          by = TESTTRAIN[id_columns],
                          FUN = mean)

# aggregate() returns the groups with the first grouping column varying
# fastest; re-sort by subject, then activity, and renumber the rows so the
# output keeps a subject-major layout with row names 1..n.
row_order <- order(FINAL_ANSWER[[CONST_SUBJECTID]],
                   FINAL_ANSWER[[CONST_BODYPOSITIONDSESC]])
FINAL_ANSWER <- FINAL_ANSWER[row_order, , drop = FALSE]
rownames(FINAL_ANSWER) <- NULL

FRIENDLY_NAMES <- gsub("\\.+", "_", names(FINAL_ANSWER)) # replace each run of periods with one underscore
FRIENDLY_NAMES <- gsub("_$", "", FRIENDLY_NAMES) # drop a trailing underscore

# Tell the reader that everything except subject id and position is an average.
# seq_along()[-...] stays empty when there are no measurement columns, unlike
# 3:length(x), which would count backwards.
averaged <- seq_along(FRIENDLY_NAMES)[-seq_along(id_columns)]
FRIENDLY_NAMES[averaged] <- paste0("Avg_", FRIENDLY_NAMES[averaged])
names(FINAL_ANSWER) <- FRIENDLY_NAMES

# Written relative to the current working directory.
write.table(FINAL_ANSWER, file = PROJECT_OUTPUT)