############################################################################### # Powerful Calculator # # Author: Jonathan Rosenblatt ############################################################################### # This is a comment. #--------------------- Simple Calculator ------------------------# 10+5 70*81 2 ** 4 2 ^ 4 6 %% 4 #Modulus 6 %/% 4 #Integer division ("Erech Tachton") 6 %% 4 #Modulus log(10) log(16, 2) log(1000, 10) Inf+7 1/Inf 1/0 0/0 Inf/Inf ?Syntax #For operator precedence ?Arithmetic #For a comprehensive list of functions and operators # Vector operations x<- c(1,2,3,4,5,6) x x+2 log(x) sum(x) prod(x) x+ c(1,2) #Vectors are recycled x+ c(1,2,3,4) y<- c(1/6, 1/6, 1/6, 1/6, 1/6, 1/6) x * y t(x) %*% y #Inner product x %*% y crossprod(x,y) #Same. But more efficient. x %*% t(y) #Outer product outer(x,y) #Same. But more efficient. # Complex numbers 7+0.5i + 2 a<- 1i #Useful when programming a sin(6i) exp(6i) Re(3+2i) #Real element Im(3+2i) #Imaginary element abs(3+4i) #Modulus Mod(3+4i) #------------------ Probability Calculator ---------------------# # What is the PDF of X~B(n=10,p=0.5) at 3? dbinom(x=3, size=10, prob=0.5) # Without names, arguments are evaluated by order of appearance dbinom(3, 10, 0.5) # How do I know the argument names of a know function? ?dbinom # And if I don't know the function's name? help.search('binomial') ??binomial help.start() # What about the CDF? pbinom(q=3, size=10, prob=0.5) dbinom(x=0, size=10, prob=0.5)+dbinom(x=1, size=10, prob=0.5)+dbinom(x=2, size=10, prob=0.5)+dbinom(x=3, size=10, prob=0.5) #Percentiles qbinom(p=0.1718, size=10, prob=0.5) # Generating random numbers rbinom(n=1, size=10, prob=0.5) rbinom(n=10, size=10, prob=0.5) rbinom(n=100, size=10, prob=0.5) # Assigning output to variables x<-rbinom(n=1000, size=10, prob=0.5) x # Simple summary statistics hist(x) rm(x) #How about other distributions? hist( rexp(n=100,rate=1) ) hist( runif(n=100,min=0,max=1) ) hist( rnorm(n=100,mean=0,sd=1) ) # For more information on distributions see # http://cran.r-project.org/web/views/Distributions.html ############################################################################### # Data Import # # Author: Jonathan Rosenblatt ############################################################################### # For a complete review see: # http://cran.r-project.org/doc/manuals/R-data.html # or # "Import and Export Manual" in help.start() # --------------- Local .csv file------------# # Note! # R is portable over platforms but path specification differs. # This means import functions are platform depedant. my.data<- read.csv(file='/home/jonathan/Projects/R Workshop/what women want.csv') my.data<- read.csv(file='C:\\Documents and Settings\\Jonathan\\My Documents\\R Workshop\\what women want.csv') names(my.data) my.data edit(my.data) # --------------Local txt file ------------# women<- read.table(file='C:\\Documents and Settings\\Jonathan\\My Documents\\R Workshop\\what women want.txt') edit(women) #Something is wrong with the data! women<- read.table(file='C:\\Documents and Settings\\Jonathan\\My Documents\\R Workshop\\what women want.txt', header=T, sep="\t") edit(women) edit(women) women<- edit(women) #Saves the changed file edit(women) fix(women) # Also saves the changed file edit(women) ?read.table #Get help on importing options #-----------Writing Data------------------# getwd() #What is the working directory? setwd('/home/jonathan/Projects/R Workshop/') #Setting the working directory in Linux setwd('C:\\Documents and Settings\\Jonathan\\My Documents\\R Workshop\\') #Setting the working directory in Windows write.csv(x=women, file='write_women.csv', append=F, row.names=F ) ?write.table # ------------- On-line files--------------# web.data<- read.table( 'http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/bone.data', header=T) edit(web.data) #---------------.XLS files-------------------# # Strongly recommended to convert to .csv # If you still insist see: # http://cran.r-project.org/doc/manuals/R-data.html#Reading-Excel-spreadsheets #------------- SAS XPORT files--------------# #See read.xport in package "foreign" install.packages('foreign') library(foreign) ?read.xport #--------------SPSS .sav files-----------# #See read.spss in package "foreign" install.packages('foreign') library(foreign) ?read.spss #------------------- MASSIVE files-----------------# # scan() is faster then read.table() but less convenient: start<- proc.time() #Initializing stopper A<- read.table('matrix.txt', header=F) proc.time()-start #Stopping stopper dim(A) start<- proc.time() A <- matrix(scan("matrix.txt", n = 20*2000), 20, 2000, byrow = TRUE) proc.time()-start # On Linux/Mac differences are less notable. #----------------MySQL-----------------# # MySQL is the best integrated relational database. # This is done with package RMySQL # Here is an example assuming you have a MySQL server setup with a database named "test". install.packages('RMySQL') library(RMySQL) # will load package DBI as well # open a connection to a MySQL database names "test". con <- dbConnect(dbDriver("MySQL"), dbname = "test") ## list the tables in the database dbListTables(con) ## load a data frame named "USAarrests" into database "test", deleting any existing copy data(USArrests) dbWriteTable(con, "arrests", USArrests, overwrite = TRUE) ## get the whole table dbReadTable(con, "arrests") ## Send SQL query to database dbGetQuery(con, paste("select row_names, Murder from arrests", "where Rape > 30 order by Murder")) dbRemoveTable(con, "arrests") #Removed the table "arrests" from database "test" dbDisconnect(con) #Closes the connection to database "test" # For more information see: # http://cran.r-project.org/doc/manuals/R-data.html#Relational-databases ########################################################################### # A Programming Environment # # Author: Liad Shekel ########################################################################### x<-100 x rm(x) x ls() x1<-5 x1 x1.aaa<-123 x1.aaa x_1<-6 x_1 ls() 1x<-6 ##Error ########################################################################### ##vector (numeric) x<-c(2,4,6,8,10,12,14,16,18) x x<-seq(0,5,by=0.1) x x<-seq(0,5,length=100) x class(x) x<-c(11,16, 18,17,14,19,12 ,13,15) x[2] x[3:7] x[ c(1,4,6) ] x[ c(1,6,4) ] sort(x) sort(x, decreasing = T) args( sort ) ?sort order(x) order(x, decreasing = T) ##logical ( T / F ) TRUE T FALSE F a<-T a b<- 1 == 0 b c<- 1 < 4 c class(c) b+c c*9 x<-c(11,12,13,14,15,16) x x[ c(T,F,F,T,F,T) ] x[c(1,0,0,1,0,1)] x[ as.logical( c(1,0,0,1,0,1) ) ] ##character a<-2010 t<-a t t<-"a" t x<-c("a","b","c","d") x month.name letters LETTERS x<-month.name x class(x) x<-c("a","a","a","a","b","b","c","c","c","d","d","e","e") x table(x) ##factor x.f<-factor(x) x.f x.f<-factor(x, labels = c("Num1","Num2","Num3","Num4","Num5") ) x.f ##list name<-c("a","b","c") month<-c(11,3,7) number.a<-round( rnorm(n=10, mean=0, sd=1) , digits = 2) number.b<-round( rnorm(n=10, mean=0, sd=1) , digits = 2) number.c<-round( rnorm(n=10, mean=0, sd=1) , digits = 2) list.a<-list( name[1] , month[1] , number.a ) list.a list.b<-list( name[2] , month[2] , number.b ) list.b list.c<-list( name[3] , month[3] , number.c ) list.c data<-list(A = list.a, B=list.b , C=list.c) data data[[1]] data$A data$A[[3]] data$A[[3]][1] data$A[[3]][1:5] data[1:2] mean(data$A[[3]]) ##matrix mat1<-matrix(data = c(seq(1,7),1,1) , ncol=3) mat1 mat2<-matrix(c(1:4,-5:2),nrow=3) mat2 edit(mat1) mat1<-edit(mat1) fix(mat1) mat1 %*% mat2 solve(mat1) eigen(mat1) eigen(mat1)$v eigen(mat1)$va eigen(mat1)$ve ##data.frame A<-as.data.frame(mat1) B<-as.data.frame(mat2) A;B A %*% B ## Error x<-c(1,2,3,4) n<-c("H","I","J","K") q<-c(0,0.33,0.66,1) x;n;q data.frame(x,n,q) x<-c("a","b","c") B B<-cbind(B,x) B A<-rbind(A,x) A x<-c("a",1,2,3,4,5,6,7,8,9,10) x y<-c("151","143","176","187","651","311") y class(y) mean(y) ## NA y<-as.numeric(y) y mean(y) a<-c(1,2,3,4,5,6,7,8,9,10) b<-c(5,32,5,7,2,4,5,6,7,8) c<-c(5,1,4,10,7,2,6,8,9,3) ord<-data.frame(a,b,c) ord ord$c ind <- order( ord[,3] ) ord[ ind, ] x<-c(0,1,2,3,4) class(x) as.factor(x) as.list(x) as.character(x) as.logical(x) ########################################################################### days<-c(31,28,31,30,31,30,31,31,30,31,30,31) data<-data.frame(Number=1:12 , month.name , month.abb) data data<- cbind( data , days ) data ##subset subset(data, days==31) subset(data, days==31, select=month.name ) subset(data, days!=31 & Number >= 6 , select= -month.abb ) J1<-round( rnorm(9,100,25) , digits=2) F1<-round( rnorm(9,100,25) , digits=2) M1<-round( rnorm(9,100,25) , digits=2) J1;F1;M1 J2<-T F2<-T M2<-F J2;F2;M2 J<- list(Numbers= J1, Logic = J2, Day =days[1]) F<- list(Numbers= F1, Logic = F2, Day =days[2]) M<- list(Numbers= M1, Logic = M2, Day =days[3]) J J$Numbers J$Day tbl<- list(Jan = J,Feb = F,Mar = M) tbl tbl$Jan$Day tbl[[1]][[3]] sum(tbl$Jan$Numbers) data<- data.frame ( jan = tbl$J$N, feb = tbl$F$N, mar = tbl$M$N) data class(data[,1]) class(data) min(data) max(data) #--------------------------- plot(data[,1], type="l") plot(data[,2], type="l") plot(data[,1], type="b") lines(data[,2], type="b" ,col=2) lines(data[,3], type="l" ,col=3) #--------------------------- ########################################################################### ## Loops i<-1 i*2 for (i in 1:5) { i*2 } for (i in 1:5) { print(i*2) } num<-0 while (num<10) { num <- runif(n=1, min= 0, max= 12) print(num) } i<-0; j<-1 while (i<10) { i <- i+1 j <- j*2 } i;j data t<-0 for (i in 1:9) { if (data[ i ,1] > mean(data[,1])) { t <- t + data[ i , 1 ] } else {break} } t mean(data$jan) dim(data) dim(data)[1] n<- dim(data)[1] plot(as.numeric(data[1,]),ylim=c(min(data),max(data)) ) for (i in 2:n) lines(as.numeric ( data[ i , ]) , col=i , type="p") ########################################################################### months<-month.name tbl2<-data.frame(month=months,val=rnorm(12)) tbl2 library(car) # -10 :-0.5 --> 1 # -0.5: 0 --> 2 # 0 : 0.5 --> 3 # 0.5: 10 --> 4 gr<-recode (tbl2$val , " -10:-0.5 =1; -0.5:0 = 2; 0:0.5 =3; 0.5:10=4 ") gr tbl2<- cbind(tbl2,group=gr) tbl2 ##switch switch(3 , "a","b","c","d","e","f","g") tbl tbl$J tbl[[1]] for (i in 1:3) { s<-tbl[[i]]$Num print(s) print( switch( i , quantile(s, 0.33) ,quantile(s, 0.667) ,quantile(s, 1) ) ) } ########################################################################### ## Functions sum.for<-function(val , data ) { t<-0 for (i in 1:9) { if (data[ i ,1] > val) { t<-t+data[ i ,1] } else {break} } return(t) } sum.for( 90 ,data ) f1<-function(x) { if ( identical( x,sort(x) ) ) return ( mean(x) ) quant<-quantile(x, seq(0,1,by=0.25)) E<-mean(x) V<-var(x) y<-list(mean= E, var = V, q=quant) return(y) } f1(c(1,2,3,4,5,6)) f1(rnorm(10000)) f2<-function(a,b,c) { SU<-a+b+c MU<-a*b*c if ( MU < SU ) { print("SU>MU") return(SU) } else { print("MU>SU") return(MU) } } f2(1,1,1) f2(9,1,2) debug(f2) f2(1,1,1) undebug(f2) rm(V) rm(y) g<-function(x) { if ( identical(x,sort(x) )) return (mean(x)) quant<-quantile(x, seq(0,1,by=0.1)) E<-mean(x) browser() #-------------------------------------------# V<-var(x) y<-list(mean= E, var = V, q=quant) return(y) } g(rnorm(100)) E quant V g2<-function(x) { if ( identical(x,sort(x) )) return (mean(x)) quant<-quantile(x, seq(0,1,by=0.1)) E<-mean(x) V<-var(x) y<-list(mean= E, var = V, q=quant) return(y) } debug(g2) g2( rnorm(100) ) undebug(g2) ####################################################### # Elementary Statistics # # # Author: Tal Galili # E-mail: Tal.galili@gmail.com (For any questions, feel welcome to contact me) # Blog: www.r-statistics.com ############################################################################### # R ref card: http://cran.r-project.org/doc/contrib/Short-refcard.pdf #?airquality # About our data: # Description: Daily air quality measurements in New York, May to September 1973. #A data frame with 154 observations on 6 variables. # Format: #[,1] Ozone numeric Ozone (ppb) #[,2] Solar.R numeric Solar R (lang) #[,3] Wind numeric Wind (mph) #[,4] Temp numeric Temperature (degrees F) #[,5] Month numeric Month (1–12) #[,6] Day numeric Day of month (1–31) # Details: Daily readings of the following air quality values for May 1, 1973 (a Tuesday) to September 30, 1973. #Ozone: Mean ozone in parts per billion from 1300 to 1500 hours at Roosevelt Island #Solar.R: Solar radiation in Langleys in the frequency band 4000–7700 Angstroms from 0800 to 1200 hours at Central Park #Wind: Average wind speed in miles per hour at 0700 and 1000 hours at LaGuardia Airport #Temp: Maximum daily temperature in degrees Fahrenheit at La Guardia Airport. # Source: The data were obtained from the New York State Department of Conservation (ozone data) and the National Weather Service (meteorological data). # References: Chambers, J. M., Cleveland, W. S., Kleiner, B. and Tukey, P. A. (1983) Graphical Methods for Data Analysis. Belmont, CA: Wadsworth. # This is the data we will work with data(airquality) # "data" Loads specified data sets #?airquality head(airquality) # first look on the data. notice the "NA's" (NA = not available) dim(airquality) # 153 6 attach(airquality) # move the variables (column vectors) in the data.frame into the working environment ############################################################################### # 1 factor # make a integer into a factor Month # print it length(Month) # 153 class(Month) # it's "integer" (we'd want to change that) f.Month <- factor(Month) f.Month class(f.Month) # "factor" # ways to present the content of the factor table(f.Month) summary(f.Month) # notice how this function will give different outcome for different "objects" barplot( table(f.Month) ) # the difference between a factor and integer/character table(Month) # It seems both give the same table(f.Month) # but they won't here: Month[1:10] f.Month[1:10] table(Month[1:10]) table(f.Month[1:10]) # changing factor levels levels(f.Month) levels(f.Month) <- c("May", "June", "July", "August", "September") levels(f.Month) # Notice the change in factor levels in the output table(f.Month) pie( table(f.Month) ) barplot( table(f.Month) ) # Simple statistical test # chisq test: chisq.test(table(f.Month)) # goodness-of-fit for uniform test # NOT signif ############################################################################### # 2 factors # let's look at a second factor # We'll work with a factor of "is there a missing data here" - we could have done the same with any "factor" object Ozone class(Ozone) # We have NA's is.na(Ozone) # tell for each cell if it's content is NA (TRUE) or not. (na.Ozone <- factor(is.na(Ozone))) table(na.Ozone) barplot( table(na.Ozone) ) # how much of our data is missing ? pie( table(na.Ozone) ) prop.table( table(na.Ozone) ) # table of proportion of the values from the orig table # Do we have different amounts of NA's for different months ? table(na.Ozone, f.Month) plot( table(na.Ozone, f.Month) ) # this gives us the month dist conditioned on the missing (and we want the other way) plot(t( table(na.Ozone, f.Month) )) # let's add some color plot(t( table(na.Ozone, f.Month) ), col = c("green", "red")) # to mention printer friendly - there are other color palates that downgrade to grey scale well. # (and also good for color blind people) see package "colorspace" # Statistical tests for chisq.test(table(na.Ozone, f.Month)) # very signif fisher.test(table(na.Ozone, f.Month)) # very signif ############################################################################### # 1 Numeric Ozone length(Ozone) # 153 hist(Ozone) # first look at our data mean(Ozone) # wouldn't work. Why? beacuse of the NA's... mean(Ozone, na.rm = T) na.omit(Ozone) # removes the NA's from the vector Ozone.2 <- na.omit(Ozone) length(Ozone.2) # 116 (instead of 153 with NA values) # statistics: mean(Ozone.2) var(Ozone.2) sd(Ozone.2) min(Ozone.2) max(Ozone.2) range(Ozone.2) median(Ozone.2) mad(Ozone.2) IQR(Ozone.2) quantile(Ozone.2) quantile(Ozone.2, probs = c(.05,.95)) fivenum(Ozone.2) # Tukey's five number summary: (minimum, lower-hinge, median, upper-hinge, maximum) summary(Ozone) # summary is applied for (almost) all object types # Notice the different results for summary(f.Month) # statistical tests t.test(Ozone) # computing the lower CI M <- mean(Ozone.2) M (M <- mean(Ozone.2)) # using (command) gives us the command + print(command output) (M <- mean(Ozone.2)) (S <- sd(Ozone.2)) (N <- length(Ozone.2)) (T <- qt(.975, df = N-1)) M - T * S/sqrt(N) # bingo... # could we have assumed normality? Not so much... shapiro.test(Ozone) # P < 2.790e-08 # After transformation - better, but not enough shapiro.test(log(Ozone)) # P < 0.014 wilcox.test(Ozone) # Wilcoxon signed rank test # Other Plots stem(Ozone) plot(Ozone) # will give us different results for different objects boxplot(Ozone) plot(density(Ozone)) # wouldn't work because of the NA! plot(density(Ozone.2)) hist(Ozone, col = "grey") # adding the plots one on top of the other hist(Ozone, col = "grey") lines(density(Ozone.2), col = "red", lwd = 4) # wouldn't work hist(Ozone, col = "grey", freq = F) lines(density(Ozone.2), col = "red", lwd = 4) rug(Ozone) # adding small lines distribution at the bottom boxplot(Ozone, horizontal = T, add = T, at = 0.01, boxwex = .02, lwd = 3, border = "blue") # Looking at normality par(mfrow = c(1,2)) qqnorm(Ozone.2) qqline(Ozone.2) qqnorm(log(Ozone.2)) ; qqline(log(Ozone.2)) # transformation of Ozone using log par(mfrow = c(1,1)) # This return us to view only one window (instead of 2) hist(log(Ozone), col = "grey", freq = F) lines(density(log(Ozone.2)), col = "red", lwd = 4) rug(log(Ozone)) boxplot(log(Ozone), horizontal = T, add = T, at = 0.1, boxwex = .2, lwd = 3, border = "blue") ############################################################################### # 2 numerics # Ozone Temp summary(Temp) hist(Temp, col = "orange") rug(Temp, col = "orange") c(1,1,1) jitter(c(1,1,1)) rug(jitter(Temp)) #use jitter to spread the lines a bit cor(Temp, Ozone) ss <- !is.na(Ozone) # shortcut Temp.2 <- Temp[ss] Ozone.2 <- Ozone[ss] cor(Temp.2, Ozone.2) # 0.69 cor.test(Temp.2, Ozone.2) # 0.69 signif... plot(x= Temp.2,y= Ozone.2 , sub = "Cor: 0.69" ) # notice the move from assignment to a function notation plot(Ozone.2 ~ Temp.2, sub = "Cor: 0.69" ) # notice the different output of plot, per object # notice the "~" notation => Y ~ X rug(jitter(Temp.2), side = 1) # notice the equal spacing! 1 = X rug(Ozone.2, side = 2) # Notice the use of "side" 2 = Y # a bit about linear regression # ta tadaaam; (lm.1 <- lm(Ozone.2 ~ Temp.2)) # OLS attributes(lm.1) # the lm object holds MANY objects in it (a list of lists) summary(lm.1) # we can extract some highlights through the summary # anova(lm.1) # the anova table of the lm == ancova # we can extract data from the object through various commands, such as: lm.1$coefficients coef(lm.1) # both gives the same. But the latter offers (sometimes) more control lm.1$fitted.values fitted(lm.1) lm.1$residuals residuals(lm.1) confint(lm.1) # this one is not inside the lm object, but being computes it... model.matrix(lm.1) # how did our X matrix look like head(model.matrix(lm.1)) # very important for later # we can plot our fitted line on the scatter plot plot(Ozone.2 ~ Temp.2) # by plotting the scatter plot abline(lm.1, col = "red") # and using abline on the lm object # diagnostic plot(lm.1) # We get a bunch (4) of diagnostic plots, when plotting on lm # we can also create such plots "manually" qqnorm( residuals(lm.1) ) qqline( residuals(lm.1) ) shapiro.test(residuals(lm.1)) # the resid or not very normal... # consider durbin watson test (from car or lmstat) plot( residuals(lm.1) ~ fitted(lm.1) ) abline( h= 0, lwd = 3) par(mfrow = c(1,3)) plot(Ozone.2 ~ Temp.2) abline(lm.1, col = "red") plot(lm.1, which = 1:2) par(mfrow = c(1,1)) plot(Ozone.2 ~ Temp.2) abline(lm.1, col = "red") (lowess.1 <- lowess(Ozone.2 ~ Temp.2)) lines(lowess.1$y ~ lowess.1$x, type = "l", col = "green", lwd = 3) abline(v = 78) ############################################################################### # 1 numerics 1 factor (two levels) # turning a numeric vector into a factor f.Temp <- cut(Temp, breaks = c(min(Temp), 78, max(Temp)), include.lowest = T) # this is like SPSS recode # cut(Temp, breaks = c(min(Temp), 78, max(Temp)), # include.lowest =F) # If we didn't use T, we would get f.Temp levels(f.Temp) levels(f.Temp) <- c("low", "high") f.Temp # table(f.Temp) # pie(table(f.Temp)) f.Temp == "low" f.Temp == "high" stripchart(Ozone~f.Temp) # scatter plot of Ozone by f.Temp levels mean( Ozone [f.Temp == "low"], na.rm = T) mean( Ozone [f.Temp == "high"], na.rm = T) tapply(X=Ozone, INDEX=f.Temp, FUN = mean, na.rm = T) #explain tapply boxplot(Ozone ~ f.Temp) var.test(Ozone ~ f.Temp) # F test for variance t.test(Ozone ~ f.Temp, var.equal = FALSE) # this is also the default wilcox.test(Ozone ~ f.Temp) # Mann-Whitney (a.k.a: Wilcoxon rank-sum test NOT Wilcoxon signed-rank test # contrasts in lm summary( lm(Ozone ~ f.Temp ) ) # t.test(Ozone ~ f.Temp, var.equal = T) # we'll get the same if we will assum equal variances (f.Temp.c <- C(f.Temp, contr="contr.sum")) coef( lm(Ozone ~ f.Temp ) ) # treatment contrasts 0 VS 1 (dummy coding) coef( lm(Ozone ~ f.Temp.c ) ) # treatment contrasts -1 VS 1 (effect coding) tapply(X=Ozone, INDEX=f.Temp, FUN = mean, na.rm = T) #explain tapply tail(model.matrix(lm(Ozone ~ f.Temp )), 7) tail(model.matrix(lm(Ozone ~ f.Temp.c )), 7) ############################################################################### # 1 numerics 1 factor (many levels) Ozone f.Month tapply(X=Ozone, INDEX=f.Month, FUN = mean) #wouldn't work tapply(X=Ozone.2, INDEX=f.Month, FUN = mean)#wouldn't work ss <- !is.na(Ozone) tapply(X=Ozone[ss], INDEX=f.Month[ss], FUN = mean) # works barplot(tapply(X=Ozone[ss], INDEX=f.Month[ss], FUN = mean)) barplot(tapply(X=Ozone[ss], INDEX=f.Month[ss], FUN = mean), ylab = "Mean Ozone", xlab = "Month") tapply(X=Ozone[ss], INDEX=f.Month[ss], FUN = sd) barplot(tapply(X=Ozone[ss], INDEX=f.Month[ss], FUN = sd), ylab = "SD", xlab = "Month") tapply(X=Ozone[ss], INDEX=f.Month[ss], FUN = range) tapply(X=Ozone[ss], INDEX=f.Month[ss], FUN = summary) tapply(X=Ozone, INDEX=f.Month, FUN = summary) # summary works with NA boxplot(Ozone ~ f.Month) boxplot(Ozone ~ f.Month, varwidth = T) # we have less data in June! boxplot(Temp ~ f.Month, col = "orange") # Notice the use of a function in a plot (aov.1 <- aov(Ozone ~ f.Month)) summary(aov.1) # ANOVA table # plot(aov.1, which = 1) # diagnostic on the anova TukeyHSD(aov.1) # multiple comparisons correction for "which of the couples is signif" par(mfrow = c(1,2)) par(mar = c(5, 9, 4, 2)) # c(bottom, left, top, right) plot(TukeyHSD(aov.1), las = 1) par(mar = c(5, 4, 4, 2) + 0.1) # the defaults boxplot(Ozone ~ f.Month, varwidth = T, las = 2) # we have less data in June! # a non parametric, one way anova (for many levels) kruskal.test(Ozone ~ f.Month) ############################################################################### # 2 numeric , 1 factors Ozone f.Temp Temp f.Temp.2 <- f.Temp[ss] plot(Ozone.2 ~ Temp.2) lines(lowess.1$y ~ lowess.1$x, type = "l", col = "green", lwd = 3) abline(v = 78) pairs(data.frame(Ozone.2,f.Temp.2,Temp.2)) # pairs get's a data.frame, and gives a "matrix plot" coplot(Ozone ~ Temp| f.Month, columns = 5) # scatter plot of Ozone ~ Temp, conditioned on f.Month (lm.2 <- lm(Ozone.2 ~ Temp.2 * f.Temp.2)) summary(lm.2) model.matrix(lm.2) par(mfrow = c(1,2)) plot(lm.1, which = 1, main = "lm.1 => Ozone.2 ~ Temp.2") plot(lm.2, which = 1, main = "lm.2 => Ozone.2 ~ Temp.2 * f.Temp.2") # a word about contrasts (f.Temp.3 <- C(f.Temp.2, contr=c("contr.sum", "contr.sum"))) (lm.3 <- lm(Ozone.2 ~ Temp.2 * f.Temp.3)) model.matrix(lm.3) summary(lm.3) f.Temp.col <- ifelse(f.Temp == "low", "blue", "orange") plot(Ozone.2 ~ Temp.2, col = f.Temp.col) lines(lowess.1$y ~ lowess.1$x, type = "l", col = "green", lwd = 3) abline(lm(Ozone.2 ~ Temp.2, subset = (f.Temp.2 == "low")), col = "blue", lwd = 2) abline(lm(Ozone.2 ~ Temp.2, subset = (f.Temp.2 == "high")), col = "orange", lwd = 2) # predicting with lm (lm.2 <- lm(Ozone.2 ~ Temp.2 * f.Temp.2)) summary(lm.2) predict(lm.2) # will return us our regression prediction. par(mfrow = c(1,2)) plot(Ozone.2 ~ Temp.2, col = f.Temp.col) points(predict(lm.2)~ Temp.2, col = "green", pch = 19) # will return us our regression prediction. # what if we want to predict "new" data? # let's fill in our na's our.new.data <- data.frame( Temp, f.Temp) [is.na(Ozone),] predict(lm.2, newdata = our.new.data) # won't work - we must have the EXACT same header our.new.data <- data.frame(Temp.2 = Temp, f.Temp.2 = f.Temp) [is.na(Ozone),] (predict.na.Ozone <- predict(lm.2, newdata = our.new.data)) points(predict.na.Ozone ~ Temp[is.na(Ozone)], col = "red", pch = 19) ############################################################################### # 1 numeric , 2 factors Ozone f.Temp f.Month table(f.Temp, f.Month) plot(t(table(f.Temp, f.Month)), col = c("blue", "orange")) boxplot(Ozone ~ f.Month) points(Ozone ~ f.Month) f.Temp.col <- ifelse(f.Temp == "low", "blue", "orange") points(Ozone ~ f.Month, col = f.Temp.col, pch = 19) # too many points on top of each other. boxplot(Ozone ~ f.Month, xlab = "Month", ylab = "Ozone") rep(1,4) jitter(rep(1,4)) points(Ozone ~ jitter(as.numeric(f.Month)), col = f.Temp.col, pch = 19) legend("topright", legend = c("<78", ">78"), fill = c("blue", "orange")) ############################################################################### # many numerics head(airquality) boxplot(airquality) pairs(airquality) pairs(data.frame(log(Ozone),airquality)) cor(airquality) (na.in.airquality <- apply(airquality, 2, is.na)) (sum.na.in.any.row <- apply(na.in.airquality,1, sum)) (na.in.any.row <- sum.na.in.any.row > 0) ss.all <- !na.in.any.row cor(airquality[ss.all,]) # filled.contour(cor(airquality[ss.all,])) panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- abs(cor(x, y)) txt <- format(c(r, 0.123456789), digits=digits)[1] txt <- paste(prefix, txt, sep="") if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex.cor * r) } pairs(airquality[ss.all,], lower.panel=panel.smooth, upper.panel=panel.cor) pairs(data.frame(log(Ozone),airquality)[ss.all,], lower.panel=panel.smooth, upper.panel=panel.cor) head(airquality) lm.4 <- lm(log(Ozone) ~ Solar.R + Wind + Temp + f.Temp + f.Month, subset = ss.all) summary(lm.4) lm.5 <- lm(log(Ozone) ~ (Solar.R + Wind + Temp + f.Temp )^2, subset = ss.all) summary(lm.5) step.lm.5 <-step(lm.5) summary(step.lm.5) # update and model selection here, if time permits... # saving the outputs ! pdf("images.pdf") # also png and jpeg sink("output.txt") # Notcie it's courier new roman !! ## all our code here sink() dev.off() detach(airquality) # remove the variables (column vectors) in the data.frame from the working environment