Skip to content

Preparing Transcripts for qdap

trinker edited this page Aug 2, 2012 · 19 revisions

##Import Microsoft Word Transcript Into R

###The following video demonstrates how to clean a Microsoft Word-based transcript and read it into R. ------------------------INSERT VIDEO UPON APPROVAL---------------

Some rich text characters to be aware of include:

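| name | rich char | replacement |
|------|-----------|-------------|
| ellipsis | … | ... or (pause) |
| left curly quote | “ | " |
| right curly quote | ” | " |
| left curly apostrophe | ‘ | ' |
| right curly apostrophe | ’ | ' |
| en dash | – | ... or (pause) |
| em dash | — | ... or (pause) |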
###Bracket types that `bracketX` and `bracketXtract` can parse:
| bracket types | names |
|---------------|-------|
| `<text>` | angle |
| `(text)` | round |
| `{text}` | curly |
| `[text]` | square |

R code used in the clean and import video

# Packages used by the import walkthrough.
library(qdap)
library(gdata)

# `doc` depends on the name of the researcher's document; point it at your
# own exported transcript.
doc <- "TCH 7 Pre-data Les 2,  Year 1, 1-15-09.csv"

# Read the headerless, comma-separated transcript, stripping white space
# around fields and keeping text columns as factors (as.is = FALSE).
dat1 <- read.csv(
  file = doc,
  header = FALSE,
  sep = ",",
  strip.white = TRUE,
  as.is = FALSE,
  na.strings = " "
)

# Inspect the imported data through the truncated-display helpers.
truncdf(dat1, 80)
htruncdf(dat1, 15, 80)
htruncdf(dat1)
left.just(htruncdf(dat1, 15, 80), 2)

The bracketX and bracketXtract functions

# Small example transcript: one speaker (factor) and one utterance (character)
# per row. The utterances carry square, round and curly bracketed annotations
# for the bracket helpers to work on.
examp2 <- data.frame(
  person = factor(c("bob", "greg", "bob", "sue")),
  text = c(
    "I love chicken [unintelligible]!",
    "Me too! (laughter) It's so good.[interupting]",
    "Yep it's awesome {reading}.",
    "Agreed. {is so much fun}"
  ),
  stringsAsFactors = FALSE  # keep `text` as character, as in the original data
)

# bracketX: one bracket type at a time, then the default call.
examp2
bracketX(examp2$text, "square")
bracketX(examp2$text, "curly")
bracketX(examp2$text)

# bracketXtract: the same three calls for comparison.
examp2
bracketXtract(examp2$text, "square")
bracketXtract(examp2$text, "curly")
bracketXtract(examp2$text)

# Combine the curly-bracket results with paste2, using a space separator.
paste2(bracketXtract(examp2$text, "curly"), " ")

##Cleaning Transcript Data Imported Into R for qdap

###The following video demonstrates how to clean a transcript data set for qdap. ------------------------INSERT VIDEO UPON APPROVAL---------------

R code used in the cleaning for qdap video

library(qdap)
library(gdata)

# `doc` must already hold the path of the transcript CSV (it is assigned in the
# import script earlier on this page).
dat1 <- read.csv(doc, header = FALSE, strip.white = TRUE, sep = ",",
    as.is = FALSE, na.strings = " ")

# rename the columns
colnames(dat1) <- c("person", "dialogue")

# function for finding blank cells; missing values count as blank too
is.blank <- function(x) is.na(x) | x %in% c("", " ", "   ")

# select non blank cases: drop rows in which every cell is blank. The
# per-column masks are combined with `&` instead of nesting apply() calls,
# which errors when the data frame has a single row (the inner apply() then
# returns a plain vector without dimensions).
blank.rows <- Reduce("&", lapply(dat1, is.blank))
dat1 <- dat1[!blank.rows, ]

# standardize people's names
dat1$person <- as.factor(mgsub(c("#", " ", ":"), "", dat1$person, spacer=FALSE))
#strWrap(paste2(bracketXtract(dat1$dialogue, bracket = "curly")," "), 80)         #the story

# remove extraneous marks
dat1$dialogue <- bracketX(scrubber(dat1$dialogue))

# check to make sure everything looks good (rows with very short dialogue)
dat1[nchar(dat1$dialogue) < 3, ]

# drop unused people names (factor levels)
dat1$person <- drop.levels(dat1$person)

# remove abbreviated names and web address fragments that contain periods
# (presumably so the periods are not treated as sentence ends -- confirm).
abbrev <- c("Mrs.", "Ms.", "Mr.", "www.", ".com")
expanded <- c("Misses", "Miss", "Mister", "www dot ", " dot com")

# The patterns cover the original and the lower-case spellings, so the
# replacements are repeated explicitly to stay aligned one-to-one with them
# rather than relying on implicit recycling inside mgsub.
# NOTE(review): with fixed = TRUE the lower-case patterns ("ms.", "mr.") may
# also match the ends of ordinary words such as "items." -- confirm this is
# acceptable for your transcripts.
dat1$dialogue <- mgsub(c(abbrev, tolower(abbrev)), rep(expanded, 2),
    dat1$dialogue, fixed = TRUE)

# split TOT into sentences and stem
dat1s <- sentSplit(dat1, "dialogue")

# SANITY CHECKS
truncdf(dat1s[nchar(as.character(dat1s$dialogue)) < 3, ])
truncdf(dat1s, 50)