#By Stefan Zeugner, 2007-02-14 #R commands used in TA sesssion 2 on R # augmented by some topics in plotting, time series and programming #____ Contents: ______________________ #+ REPEAT SOME KNOWLEDGE # o matrices, lists, random numbers # o reading data, data.frame # o working with matrices # o basic statistical models # SOME OTHER USEFUL COMMANDS # PACKAGES # PLOTTING # o high-level plotting commands # o low-level plotting commands # o combine plots into one # o managing devices # TIME SERIES # PROGRAMMING # o if statement and for loop # o functions and examples # o vectorization and usability #______________________________________ ############## START: REPEAT SOME KNOWLEDGE ################# setwd("C:/Dokumente und Einstellungen/zeugner/Eigene Dateien/IHS/ta_ecomtI") #set this to the path of your data #important: use "/" instead of "\" !! #____ matrices, lists, random numbers ______ #there are four random number commands "d"=density, "p"=prob, #"q"=quantile, and "r"=random number - all combined with name of dist'n #so pnorm gives probability of normal distribution (mean 0, stdev=1) pnorm(1.96,mean=0,sd=1) runif(1) # one random nb from uniform dist'n tryvec<-rnorm(10) #10 random nbs from normal dist'n #access 3rd element: tryvec[3] #converts tryvec into matrix of 5 rows, 2 cols: tryvec <- matrix(tryvec,5,2) #a LIST ist a collection of different objects diffrnds=list(rnorm(5),"test,diag(2)) print(diffrnds) #access second element: diffrnds[[2]] #list can also have names me=list(forename="Stefan",room=314) #access room number in different fashions, these two are most common: me[[2]] me$room #moreover there exist general "attributes" for objects: girls=list("Marianna","Shima","Theresa") attr(girls,"room")=323 #read and access attribute by attr print(attr(girls,"room")) #matrix() does nothing else than setting an attribute called "dim" newvec=runif(100) attr(newvec,"dim")=c(10,10) print(newvec) #____ reading data, data.frame _____________ usdata=read.table("u_cpi_usa_old.csv",sep=",") #read from file: here, csv #remark: first row only has two elements: therefore R thinks the first data columns actually contains the 'row names' class(usdata) # usdata is data.frame object mode(usdata) # a data.frame is a special list summary(usdata) # the generic summary function displays nice output for many objects... names(usdata) # the 'list' usdata has several names usdata$usa_cpi #accesible via dollar sign #we calculate inflation for further analysis, first entry is NA = "not available (NA is a genuine R command, like TRUE) infl=c(NA,diff(log(usdata$usa_cpi))) usdata=cbind(usdata,infl) #add infl to the usdata data.frame names(usdata)[3]="usa_infl" #set the name in the data.frame differently #Remark: get data from clipboard (e.g. copy in Excel) by the command trydata1=read.table("clipboard") #remark: scan() reads in data as a long vector (not data.frame) trydata2 <- scan("somenumbers.txt") rm(list=ls(pat="trydata")) #remove trydata* #____ working with matrices _____________ #transform data.frame to matrix: dtmatrix=data.matrix(usdata) class(dtmatrix) #drop first row (the one with NA): dtmatrix=dtmatrix[-1,] #get number of rows (cf. "ncol()"): N=nrow(dtmatrix) #make regressor matrix with constant and infl: X=cbind(rep(1,N),dtmatrix[,3]) y=dtmatrix[,1] #unemp #calculate efficiently: betahat=chol2inv(chol(crossprod(X))) %*% crossprod(X,y) #chol2inv(chol(A)) is an effiencient inverse of the pos.def. matrix A (much faster than "solve()" ) #crossprod(X) makes X'X ; crossprod(X,y) makes X'y ; %*% is usual matrix multiplication #show output on console: print(betahat) #____ basic statistical models _______ mod=lm(usa_unemp ~usa_infl,data=usdata) #OLS regression #note that R recognizes "usa_unemp" And "usa_infl" although #they are not variables as shown by: ls() #list all varibales in workspace #this effect comes from including parameter "data" in "ls()" mod$coefficients #get an element of the "lm" class names(mod) #other elements available summary(mod) #display output summary(mod)$coef #get an element of summary usdatanew=read.table("u_cpi_usa_new.csv",sep=",") #get longer data set modnew =update(mod,data=usdatanew) #make new regression model by 'iupdating' the old one: #here the old formula "usa_unemp ~usa_infl" is executed with new data fore=predict(modnew) #use the generic method 'predict' (equivalent to the attribute modnew$fitted.values) methods(class="lm") #see other generic methods for class "lm" by this command ############## END: REPEAT SOME KNOWLEDGE ################# ############## START: PACKAGES ################# #There is the useful "package" car, a collection of additional R commands #try this command library(car) # if the command gives an error, goto the console menu, #choose "package installer", selct "Austria" then scroll to "car" and click OK #Then try library(car) again durbin.watson(mod) #durbin Watson test, one of the few tests in car durbin.watson(modnew) #compare the DW test of the new model: Its far to low -> We have got a problem... #test the linear hypothesis that the inflation coefficient = 20 #for this, use "linear.hypothesis" which tests the restriction # Rb=r R=cbind(0,1) r=20 linear.hypothesis(modnew,R,r) # note that you may like the package "xlsReadWrite" ############## END: PACKAGES ################# ############## START: SOME OTHER USEFUL COMMANDS ################# attach(usdatanew) #write variable names into search path #other regressors #generate one: nothing=rnorm(nrow(usdatanew)) mod2=lm(usa_unemp~usa_infl+nothing) #now with other regressor (since we did attach(), we don't have to use parameter "data" anymore) #remark: "mod2=lm(usa_unemp ~ 1 + usa_infl + nothing)" would be equivalent summary(mod2) mod3=lm(usa_unemp~0+usa_infl+nothing) #no constant included summary(mod3) ############## END: SOME OTHER USEFUL COMMANDS ################# ############## START: PLOTTING ################# #_____ high-level plotting commands ________ #...create a new plot plot(usa_cpi) #time series plot plot(usa_cpi,usa_unemp) #scatter plot plot(usa_unemp,type="l",main="USA Unemp rate",xlab="time") #compare the follwoing: ?plot # (respectively: help(plot) ) #self-explaining: boxplot(usa_unemp) qqnorm(usa_infl) barplot(usa_infl) hist(usa_unemp) plot(density(usa_unemp)) #sometimes nicer than hist() #using options: # type="l"-line, "p"- points or "b"- both (and many more) # xlab=X-axis label # lty=line style (dotted, dashed, etc.) # col=color (4=blue, 1=black) plot(usa_cpi,type="l",xlab="SDFA",lty=1,col=4,lwd=2) #do nice scatter plot: plot(usdata$usa_infl,usdata$usa_unemp,xlab="inflation",ylab="unemployment") #_____ low-level plotting commands ________ #... add info to plot abline(mod$coef[1],mod$coef[2]) #add a regression line title(main="unep vs infl") #add title (also an option in plot) dev.off() #whats this? the plot has disappeareed ?!? #use high-level and low-level together #A line plot of fitted vs observed values for unemp: plot(usdata$usa_unemp,type="l",lty=1,col=1,main="USA Phillips Model",xlab="time",ylab="unemp") lines(mod$fitted.values,type="l",lty=2,col=4) #add a line with the fitted values from the bad model "mod" #now we want to add a legend. For this we need to determine its position # Therefore use interactive commad "locator()": Execute it and then # click on the plot where you want to have it. legpos=locator(1) legend(legpos$x,legpos$y,c("observed","predicted"),lty=c(1,2),col=c(1,4)) #______ combine plots into one _______________ #layout() makes a matrix of plots #e.g. the following trymat=matrix(1:4,2,2) layout(trymat) #now the next four plots are put into the order as 1, 2, 3, 4 in the matrix, resp.ly plot(usa_unemp) plot(usa_cpi,type="l") plot(usa_infl,type="l") plot(usa_infl,usa_unemp) #more sophistacited layouts are possible, too: layout(cbind(c(1,2),3)) plot(usa_unemp,type="l",main="unemp") plot(usa_infl,type="l",main="infl") plot(usa_infl,usa_unemp,main="scatter") #if you do a new plot it starts again with layout, so use dev.off() plot(usa_infl,type="l",main="infl") #_______ managing devices _____________ #What is this dev.off()? #R puts its graphics commands to a device: that may be your graphics window, some graphic file, postscript, etc. #at the motment there is only the cplot window open as shown by dev.list() dev.list() dev.cur() #this ios also the current device, so all graphics commands go to it #now open a file as a graphics device: png("testplot.png") #creates testplot.png in your working directory plot(usa_infl,type="l",col=4,width=2) #do some commands - see that the plot in your graphics window doesnt change title(main="testplot") dev.cur() #the current device is currently "PNG" dev.off() #now turn the current device off - only then the .PNG file is completed! #Note: to change between devices use "dev.set()" plot(usa_infl,type="b") #now output goes again to the graphics window ############## END: PLOTTING ################# ############## START: TIME SERIES ################# # do some time series analysis on our data library(tseries) # the standard time series package usa_resid=modnew$residuals #get residuals from modnew plot(usa_resid,type="l") acf(usa_resid) # plots the autocorrelations # the ts class: usts=ts(usdatanew,start=c(1960,1),frequency=4) #converts a data set # to a 'Timeseries' ts class beginning in 1960:1, monthly frequency class(usts) #check: it's "ts" #take subcolumns infl and unemp: infl=usts[,3] unemp=usts[,1] class(unemp) #apply some nice functions from tseries package adf.test(unemp) #augmented Dickey Fuller test for stationarity: unemp is integrated! acf(unemp) #auto-corr plot # since both unemp and infl are I(1), have to take differences (at least) modd=lm(diff(unemp) ~diff(infl)) AIC(modd) #get the akaike IC usa_resid=modd$residuals #put resids into new vector plot(usa_resid,type="l") #looks better, right? #put acf and pacf into one chart layout(c(1,2)) acf(usa_resid) pacf(usa_resid) #'close' the chart dev.off() #estimate an arima model based on acf/pacf analysis: #order=c(AR,I,MA) means: # AR: number of AR terms (here AR(1)) # I: level of integration (here I(0) - no integration) # MA: number MA terms (here: zero) #xreg: exogenous regressors moda=arima(diff(unemp),order=c(1,0,0),xreg=diff(infl)) mode(moda) #analyse output: looks better acf(moda$residuals[-1]) pacf(moda$residuals[-1]) tsdiag(moda) # nice function for ARMA geeks print(moda) ############## END: TIME SERIES ################# ############## START: PROGRAMMING ################# #_______ if statement and for loop ________ #***if statement***: # syntax: if ( EVALUATION ) { TO BE EVALUATED IF ECVALUATION=TRUE } [else {TO BE EVALUATED IF ECVALUATION=FALSE } ] # remind binary operators >,>=,==,!= and=& or=| if (2==2) {print("hello world!")} x=3 if (x==2 | x==3) { print("x is either 2 or 3") } else { print("x is either 2 or 3") } #***for loop***: #syntax for (x in SEQUENCE) {EXECUTE FOR EACH ELEMENT OF SEQUENCE x} for (i in 1:length(usdata)) { print(names(usdata)[i]) } #remark: "length()" is number of elements in an object #***EXAMPLE***: mylist=list(a=runif(3,min=1,max=9),b=runif(5),c=rchisq(20,100)) #EXAMPLE: for each element (vector) in mylist, determine whether all entries are larger # than one. If s, display the mean of the log of this vector, else the mean of 1+this vector #use the vector mylist[[2]]>1 mylist[[2]]>1 #is a vector of TRUE And FALSE whether each element is >1 all(mylist[[i]]>1) #determines whether all elements are TRUE for (i in 1:length(mylist)) { if ( all( mylist[[i]]>1 ) ) { print(mean(log(mylist[[i]]))) } else { print(mean(log(1+mylist[[i]]))) } } #______ FUNCTIONS ___________ #we have all used function such as mean now - you may write your own functions as well... #syntax: FUNCTIONNAME <- function(arg1 [,MOREARGUMENTS] ) {function statements } #SIMPLE EXAMPLE: a=34 plus1 <- function(x) { #take input argument a=x+1 #add one return(a) #retrun the variable a and finish } b=plus1(a) #note that the value of a in the "global environment" did not # change, although we said a=x+1 in the function print(a) #MORE SOPHIST. EXAMPLE -- c #we cannot do mean(mylist) mean(mylist) #gives an error #but we can write a function that returns the mean for all list elements meaner <- function(input) { if (!is.list(input)) { #check if input argument is a list, stop function if not stop("only works with lists!") } output=numeric(0) #initialise the vector output containing zero elements for (i in 1:length(input)) { output[i]=mean(input[[i]]) } return(output) #retuns the element output } meaner(mylist) #EVEN MORE SOPHIST. EXAMPLE meaner2 <- function(input,display=FALSE) { #there is an optional argument "display" that is # set to false if the user doesnt set it if (!is.list(input)) { #check if input argument is a list, stop function if not stop("only works with lists!") } output=numeric(0) #initialise the vector output containing zero elements for (i in 1:length(input)) { output[i]=mean(input[[i]]) if (display) { print(output[i]) } } return(output) #retuns the element output } #try the following test=meaner2(mylist) test=meaner2(mylist,TRUE) # USEFUL EXAMPLE: displaying mean, StDev, Median, Min, Max for a data.frame descdata <- function(ddata) { if (!is.data.frame(ddata)) { stop("only works with data frames!") } aout=numeric(0) for (j in 1:length(ddata)) { aout<-cbind(aout,c(mean(ddata[,j]),var(ddata[,j])^0.5,median(ddata[,j]),min(ddata[,j]), max(ddata[,j]))) } rownames(aout)=c("Mean","Stdev","Median","Min","Max") colnames(aout)=names(ddata) return(aout) } descdata(usdatanew) #_________ VECTORIZATION äAND USABILITY ________________ #Although convenient, using loops is very slow - therefore # try to avoid them by using other commands: # e.g. instead of meaner the extremely useful "sapply()" command #that applies a function to each element of a list: sapply(mylist,"mean") # does the same as the example "LISTMEAN" above, but is far faster sapply(mylist,"plus1") # general tip: your function should output a list - this increases usability #_______ EXAMPLE: a hand-made OLS function ________________ OLS <- function (y,X) { if ( length(y) != nrow(X) ) { stop("y and X dimensions do not match") } # you should add some other consistency checks here #calcuate some things XtXinv=chol2inv(chol(crossprod(X))) bhat=XtXinv%*% crossprod(X,y) sigmahat2=crossprod(y-X %*% bhat)/(nrow(X)-ncol(X)) stnde=sqrt( diag(XtXinv) * sigmahat2 ) #prepare a list as output out=list( coefs=bhat, stderrs=stnde, se=sqrt(sigmahat2), display= function() { #you may also put a function as list element outtable=cbind(bhat,stnde,bhat/stnde) colnames(outtable)=c("Coef.","Std.Err","t-Stat") rownames(outtable)=colnames(X) print(outtable) } ) return(out) #return list and quit } # now apply the function: X=cbind(rep(1,N),dtmatrix[,3]) colnames(X)=c("intercept","infl") y=dtmatrix[,1] myobj=OLS(y,X) mybobj$coefs myobj$display() ## By now you should have an understanding of basic R functionalities,. ## I would recommend you to apply them directly to your projects, and refer to www.rseek.org for references