# Modifying data ----
# Tutorial script: reads the "High school and beyond" data, labels it with
# comment(), creates factor variables, recodes values to NA, derives a total
# score and a letter grade, and binds the new variables onto the data frame.

# Cleaning up.
# NOTE: this wipes every object in the calling environment; it is kept because
# the script has always started from an empty workspace.
rm(list = ls())

# Reading in data.
# NOTE(review): confirm this URL still resolves before relying on it.
hs0 <- read.table(
  "http://www.ats.ucla.edu/stat/R/notes/hs0.csv",
  header = TRUE,
  sep = ","
)

# Adding a comment (a free-text label stored as an attribute) to the data frame.
comment(hs0) <- "High school and beyond data"

# Checking.
comment(hs0)

# Variable labels using comment.
comment(hs0$write) <- "writing score"
comment(hs0$read) <- "reading score"

# More checking: the comments survive a save/load round trip.
save(hs0, file = "hs0.rda")
rm(list = ls())
load(file = "hs0.rda")
comment(hs0)
comment(hs0$write)

# Checking that hs0 is on the search path (before and after attaching).
# NOTE: attach() exposes a snapshot of the columns; later changes made to hs0
# (such as the race recode below) are not reflected in the attached copy.
search()
attach(hs0)
search()

# Checking which variables are factor variables.
vapply(hs0, is.factor, logical(1))

# Creating a factor, i.e. a categorical variable.
schtyp.f <- factor(schtyp, levels = c(1, 2), labels = c("public", "private"))
female <- factor(gender, levels = c(0, 1), labels = c("male", "female"))

# schtyp.f or female is a factor variable, so the mean function does not
# apply: this call is intentional and returns NA with a warning.
mean(schtyp.f)

# But the table function makes perfect sense.
table(schtyp.f)
table(female)

# Plotting using the variable label.
plot(write, read, xlab = comment(write))

# Recoding ----
# Looking at the race variable, then turning the value 5 into missing.
table(hs0$race)
hs0$race[hs0$race == 5] <- NA
table(hs0$race)

# Displaying the missings as well.
table(hs0$race, useNA = "ifany")

total <- read + write + math + science

# Noticing the missing values generated (NA in any score makes total NA).
summary(total)

# Initializing a variable.
# Start from an all-NA vector of the right length so that rows whose total is
# missing stay NA instead of inheriting a placeholder grade.
grade <- rep(NA_real_, length(total))
grade[total <= 140] <- 0
grade[total > 140 & total <= 180] <- 1
grade[total > 180 & total <= 210] <- 2
grade[total > 210 & total <= 234] <- 3
grade[total > 234] <- 4
grade <- factor(
  grade,
  levels = c(0, 1, 2, 3, 4),
  labels = c("F", "D", "C", "B", "A")
)
# The label is attached after factor(): factor() builds a new vector and would
# drop a comment set beforehand.
comment(grade) <- "combined grades of read, write, math, science"
table(grade)

# Difference between the mean function and manually computing the mean.
# m1 and the first m2 are NA for any row with a missing score; the second
# assignment overwrites m2 with the mean of the scores that are present.
m1 <- (read + write + math + science) / 4
m2 <- rowMeans(cbind(read, write, math, science))
m2 <- rowMeans(cbind(read, write, math, science), na.rm = TRUE)

# Adding the newly-created variables to the original data set.
# data.frame() keeps schtyp.f, female and grade as factors; an inner cbind()
# would build a numeric matrix and reduce them to their integer codes.
hs1 <- cbind(hs0, data.frame(schtyp.f, female, total, grade))
table(hs1$race)
is.data.frame(hs1)