# Exploring data: a guided tour of the hs0 data set.
#
# The script reads hs0.csv, prints descriptive statistics (overall and by
# program type), draws base-graphics and lattice plots, and finishes with
# correlations and scatter plots. It is written to be stepped through
# interactively; bare expressions are there so their value gets printed.
#
# Objects created: hs0, read.sci, col_stat_by, m, v, med, n, s.

library(lattice)  # trellis graphics: histogram(), bwplot()

# Entering data ----

# NOTE(review): this is the address used by the original notes; it may have
# moved since. If the download fails, point read.table() at a local copy of
# hs0.csv instead -- TODO confirm the current location.
hs0 <- read.table(
  "http://www.ats.ucla.edu/stat/R/notes/hs0.csv",
  header = TRUE, sep = ","
)

# listing the first 20 observations, noticing missing values of variable
# science (head() does not pad with NA rows if there are fewer than 20)
head(hs0, n = 20)

# listing the names of all the objects in hs0, in this case, listing all the
# variable names
names(hs0)

# creating a subset containing fewer variables, in this case, read-science
read.sci <- hs0[, 7:10]

# checking the type of object
class(read.sci)

# listing the first 10 observations
head(read.sci, n = 10)

# Descriptive statistics ----

# displaying the dimension
dim(read.sci)
length(read.sci)       # a data frame's length is its number of columns
length(read.sci$read)  # a column's length is the number of observations
summary(read.sci)

# range function; without na.rm = TRUE a column with missing values gives NA
range(read.sci$write)
range(read.sci$science)
range(read.sci$science, na.rm = TRUE)

# the minimum and the maximum among all the variables
range(read.sci, na.rm = TRUE)

# looking at the mean and standard deviation of each variable in read.sci.
# mean() and sd() work on vectors, not on whole data frames (mean() of a data
# frame returns NA with a warning, sd() of a data frame is an error), so the
# statistic is applied column by column.
vapply(read.sci, mean, numeric(1))
vapply(read.sci, mean, numeric(1), na.rm = TRUE)
vapply(read.sci, sd, numeric(1), na.rm = TRUE)

# Apply a summary statistic to every numeric column of `data`, separately for
# each level of `group`.
#
# data  : a data frame; its non-numeric columns are skipped.
# group : a grouping vector with one entry per row of `data`.
# stat  : a function that reduces a numeric vector to one number (mean, sd).
# ...   : further arguments handed to `stat`, e.g. na.rm = TRUE.
#
# Returns the object produced by by(): one named numeric vector per group.
col_stat_by <- function(data, group, stat, ...) {
  numeric_cols <- vapply(data, is.numeric, logical(1))
  by(
    data[, numeric_cols, drop = FALSE],
    group,
    function(d) vapply(d, stat, numeric(1), ...)
  )
}

# looking at the entire data set, the means and standard deviations of all
# the variables by program type (prgtype)
table(hs0$prgtype)
col_stat_by(hs0, hs0$prgtype, mean)
col_stat_by(hs0, hs0$prgtype, sd)

# hs0 is deliberately NOT attach()ed: columns are reached through hs0$... or
# with(hs0, ...), so a column such as `write` never masks base::write.

# changing the number of printed digits to 2, remembering the previous
# setting so it can be restored afterwards
getOption("digits")
old_opts <- options(digits = 2)
col_stat_by(hs0, hs0$prgtype, mean, na.rm = TRUE)
col_stat_by(hs0, hs0$prgtype, sd, na.rm = TRUE)

# displaying the stats in a nicer way
# ragged array: http://www.encyclopedia.com/doc/1O11-raggedarray.html
m <- tapply(hs0$write, hs0$prgtype, mean)
v <- tapply(hs0$write, hs0$prgtype, var)
med <- tapply(hs0$write, hs0$prgtype, median)
n <- tapply(hs0$write, hs0$prgtype, length)
# stored as `s`, not `sd`, so the stats::sd function is not shadowed
s <- tapply(hs0$write, hs0$prgtype, sd)
cbind(mean = m, var = v, std.dev = s, median = med, n = n)

# put the number of digits back to what it was before
options(old_opts)

# Graphics ----

# histogram; with() keeps the plain column name in the title and axis label
with(hs0, hist(write))

# trellis graphs
histogram(~write, hs0, type = "count")

# histogram of write by gender
histogram(~write | gender, hs0, type = "count")

# change the number of bins to 15
with(hs0, hist(write, breaks = 15))

# boxplot function in the graphics package
with(hs0, boxplot(write))

# trellis graphs
bwplot(ses ~ write, hs0)

# boxplot by gender
bwplot(ses ~ write | gender, hs0)

# The ses-by-gender counts, legend labels and colours are shared by all the
# bar charts below, so they are built once.
ses_by_gender <- with(hs0, table(ses, gender))
ses_labels <- c("low", "medium", "high")
ses_colors <- c("lightblue", "blue", "dark blue")
gender_labels <- c("male", "female")

# bar chart with default options
barplot(ses_by_gender)

# side by side bar chart with legend
barplot(ses_by_gender, beside = TRUE, legend.text = ses_labels)

# changing the default color
barplot(
  ses_by_gender,
  beside = TRUE, legend.text = ses_labels,
  ylim = c(0, 50), col = ses_colors
)

# labeling gender variable
barplot(
  ses_by_gender,
  beside = TRUE, legend.text = ses_labels,
  ylim = c(0, 50), col = ses_colors,
  names.arg = gender_labels
)

# spacing
barplot(
  ses_by_gender,
  beside = TRUE, legend.text = ses_labels,
  ylim = c(0, 50), col = ses_colors,
  names.arg = gender_labels, space = c(.1, 2)
)

# changing the location of legend and adding a title
barplot(
  ses_by_gender,
  beside = TRUE, legend.text = ses_labels,
  ylim = c(0, 50), space = c(.1, 1), col = ses_colors,
  names.arg = gender_labels,
  main = "Distribution of SES by gender",
  args.legend = list(x = 9, y = 45, cex = .6)
)

# Correlations and scatter plots ----

# correlation of a pair of variables; the second call shows that missing
# values propagate to NA unless `use` says how to handle them
with(hs0, cor(write, math))
with(hs0, cor(write, science))
with(hs0, cor(write, science, use = "complete.obs"))

# correlation matrix
cor(read.sci, use = "complete.obs")
cor(read.sci, use = "pairwise.complete.obs")

with(hs0, plot(math, write))

# scatter plot matrix
plot(read.sci)