#By Stefan Zeugner, 2007-02-14
#R commands used in TA sesssion 2 on R
# augmented by some topics in plotting, time series and programming

#____ Contents: ______________________

#+ REPEAT SOME KNOWLEDGE 
#    o matrices, lists, random numbers
#    o reading data, data.frame
#    o working with matrices 
#    o basic statistical models
# SOME OTHER USEFUL COMMANDS
# PACKAGES
# PLOTTING
#    o high-level plotting commands
#    o low-level plotting commands
#    o combine plots into one 
#    o managing devices
# TIME SERIES
# PROGRAMMING
#    o if statement and for loop 
#    o functions and examples 
#    o vectorization and usability
#______________________________________



############## START: REPEAT SOME KNOWLEDGE #################

setwd("C:/Dokumente und Einstellungen/zeugner/Eigene Dateien/IHS/ta_ecomtI") #set this to the path of your data
#important: use "/" instead of "\" !!


#____ matrices, lists, random numbers ______

#there are four random number commands "d"=density, "p"=prob, 
#"q"=quantile, and "r"=random number - all combined with name of dist'n

#so pnorm gives probability of normal distribution (mean 0, stdev=1)
 pnorm(1.96,mean=0,sd=1)
 runif(1) # one random nb from uniform dist'n
 tryvec<-rnorm(10) #10 random nbs from normal dist'n

#access 3rd element:
 tryvec[3] 

#converts tryvec into matrix of 5 rows, 2 cols:
 tryvec <- matrix(tryvec,5,2) 


#a LIST ist a collection of different objects
 diffrnds=list(rnorm(5),"test,diag(2))
 print(diffrnds)
#access second element:
 diffrnds[[2]]

#list can also have names
 me=list(forename="Stefan",room=314)
#access room number in different fashions, these two are most common:
 me[[2]]
 me$room

#moreover there exist general "attributes" for objects:
 girls=list("Marianna","Shima","Theresa")
 attr(girls,"room")=323

#read and access attribute by attr
 print(attr(girls,"room"))

#matrix() does nothing else than setting an attribute called "dim"
 newvec=runif(100)
 attr(newvec,"dim")=c(10,10)
 print(newvec)



#____ reading data, data.frame _____________

  usdata=read.table("u_cpi_usa_old.csv",sep=",") #read from file: here, csv
#remark: first row only has two elements: therefore R thinks the first data columns actually contains the 'row names'

 class(usdata) # usdata is data.frame object
 mode(usdata) # a data.frame is a special list

 summary(usdata) # the generic summary function displays nice output for many objects...
 names(usdata)  # the 'list' usdata has several names
 usdata$usa_cpi #accesible via dollar sign


#we calculate inflation for further analysis, first entry is NA = "not available (NA is a genuine R command, like TRUE)
 infl=c(NA,diff(log(usdata$usa_cpi))) 
 usdata=cbind(usdata,infl) #add infl to the usdata data.frame
 names(usdata)[3]="usa_infl" #set the name in the data.frame differently

#Remark: get data from clipboard (e.g. copy in Excel) by the command
 trydata1=read.table("clipboard")
#remark: scan() reads in data as a long vector (not data.frame)
 trydata2 <- scan("somenumbers.txt") 

 rm(list=ls(pat="trydata")) #remove trydata*


#____ working with matrices _____________ 

#transform data.frame to matrix:
 dtmatrix=data.matrix(usdata) 
 class(dtmatrix)

#drop first row (the one with NA):
 dtmatrix=dtmatrix[-1,] 

#get number of rows (cf. "ncol()"):
 N=nrow(dtmatrix) 

#make regressor matrix with constant and infl:
 X=cbind(rep(1,N),dtmatrix[,3]) 
 y=dtmatrix[,1] #unemp

#calculate efficiently:
  betahat=chol2inv(chol(crossprod(X))) %*% crossprod(X,y)
 #chol2inv(chol(A)) is an effiencient inverse of the pos.def. matrix A (much faster than "solve()" )
 #crossprod(X) makes X'X ; crossprod(X,y) makes X'y ; %*% is usual matrix multiplication

#show output on console:
 print(betahat) 


#____ basic statistical models _______

mod=lm(usa_unemp ~usa_infl,data=usdata) #OLS regression 
  #note that R recognizes "usa_unemp" And "usa_infl" although 
  #they are not variables as shown by:
 ls() #list all varibales in workspace
 #this effect comes from including parameter "data" in "ls()"

mod$coefficients #get an element of the "lm" class
names(mod) #other elements available

summary(mod) #display output
summary(mod)$coef #get an element of summary



usdatanew=read.table("u_cpi_usa_new.csv",sep=",") #get longer data set

modnew =update(mod,data=usdatanew) #make new regression model by 'iupdating' the old one:
 #here the old formula "usa_unemp ~usa_infl" is executed with new data

fore=predict(modnew) #use the generic method 'predict' (equivalent to the attribute modnew$fitted.values)

methods(class="lm") #see other generic methods for class "lm" by this command


############## END: REPEAT SOME KNOWLEDGE #################

############## START: PACKAGES #################

#There is the useful "package" car, a collection of additional R commands
#try this command
 library(car)
 # if the command gives an error, goto the console menu, 
 #choose "package installer", selct "Austria" then scroll to "car" and click OK
 #Then try library(car) again

 durbin.watson(mod) #durbin Watson test, one of the few tests in car
 durbin.watson(modnew) #compare the DW test of the new model: Its far to low -> We have got a problem...

#test the linear hypothesis that the inflation coefficient = 20
#for this, use "linear.hypothesis" which tests the restriction
# Rb=r
 R=cbind(0,1)
 r=20
 linear.hypothesis(modnew,R,r)


# note that you may like the package "xlsReadWrite"

############## END: PACKAGES #################



############## START: SOME OTHER USEFUL COMMANDS #################

 attach(usdatanew) #write variable names into search path

 #other regressors
  #generate one:
  nothing=rnorm(nrow(usdatanew))
  mod2=lm(usa_unemp~usa_infl+nothing) #now with other regressor (since we did attach(), we don't have to use parameter "data" anymore)
   #remark: "mod2=lm(usa_unemp ~ 1 + usa_infl + nothing)" would be equivalent
  summary(mod2)

  mod3=lm(usa_unemp~0+usa_infl+nothing) #no constant included
  summary(mod3)


############## END: SOME OTHER USEFUL COMMANDS #################



############## START: PLOTTING #################

#_____ high-level plotting commands ________
#...create a new plot

 plot(usa_cpi) #time series plot
 plot(usa_cpi,usa_unemp) #scatter plot
 plot(usa_unemp,type="l",main="USA Unemp rate",xlab="time")
  #compare the follwoing:
  ?plot # (respectively: help(plot) )

 #self-explaining:
  boxplot(usa_unemp)
  qqnorm(usa_infl)
  barplot(usa_infl)
  hist(usa_unemp)
  plot(density(usa_unemp)) #sometimes nicer than hist()



#using options: 
#    type="l"-line, "p"- points  or "b"- both (and many more)
#    xlab=X-axis label
#    lty=line style (dotted, dashed, etc.)
#    col=color (4=blue, 1=black)
 plot(usa_cpi,type="l",xlab="SDFA",lty=1,col=4,lwd=2)

#do nice scatter plot:
 plot(usdata$usa_infl,usdata$usa_unemp,xlab="inflation",ylab="unemployment")




#_____ low-level plotting commands ________
#... add info to plot

 abline(mod$coef[1],mod$coef[2]) #add a regression line
 title(main="unep vs infl") #add title (also an option in plot)

 dev.off() #whats this? the plot has disappeareed ?!?

 #use high-level and low-level together
 #A line plot of fitted vs observed values for unemp:
  plot(usdata$usa_unemp,type="l",lty=1,col=1,main="USA Phillips Model",xlab="time",ylab="unemp")
  lines(mod$fitted.values,type="l",lty=2,col=4) #add a line with the fitted values from the bad model "mod"

#now we want to add a legend. For this we need to determine its position
# Therefore use interactive commad "locator()": Execute it and then 
# click on the plot where you want to have it.
 legpos=locator(1)
 legend(legpos$x,legpos$y,c("observed","predicted"),lty=c(1,2),col=c(1,4))


#______ combine plots into one _______________

#layout() makes a matrix of plots
 #e.g. the following
 trymat=matrix(1:4,2,2)
 layout(trymat)
 #now the next four plots are put into the order as 1, 2, 3, 4 in the matrix, resp.ly
  plot(usa_unemp)
  plot(usa_cpi,type="l")
  plot(usa_infl,type="l")
  plot(usa_infl,usa_unemp)

#more sophistacited layouts are possible, too:
  layout(cbind(c(1,2),3))
  plot(usa_unemp,type="l",main="unemp")
  plot(usa_infl,type="l",main="infl")
  plot(usa_infl,usa_unemp,main="scatter")

#if you do a new plot it starts again with layout, so use dev.off()
 plot(usa_infl,type="l",main="infl")


#_______ managing devices _____________

#What is this dev.off()?
 #R puts its graphics commands to a device: that may be your graphics window, some graphic file, postscript, etc.
 #at the motment there is only the cplot window open as shown by dev.list()
 dev.list()
 dev.cur() #this ios also the current device, so all graphics commands go to it

#now open a file as a graphics device:
 png("testplot.png") #creates testplot.png in your working directory
 plot(usa_infl,type="l",col=4,width=2) #do some commands - see that the plot in your graphics window doesnt change
 title(main="testplot")
 dev.cur() #the current device is currently "PNG"
 dev.off() #now turn the current device off - only then the .PNG  file is completed!

#Note: to change between devices use "dev.set()"
 plot(usa_infl,type="b") #now output goes again to the graphics window

############## END: PLOTTING #################






############## START: TIME SERIES #################

# do some time series analysis on our data
 library(tseries) # the standard time series package
 usa_resid=modnew$residuals #get residuals from modnew
 plot(usa_resid,type="l")
 acf(usa_resid) # plots the autocorrelations

# the ts class:
 usts=ts(usdatanew,start=c(1960,1),frequency=4) #converts a data set 
 # to a 'Timeseries' ts class beginning in 1960:1, monthly frequency
 class(usts) #check: it's "ts"

#take subcolumns infl and unemp:
 infl=usts[,3] 
 unemp=usts[,1]
 class(unemp)

#apply some nice functions from tseries package
 adf.test(unemp) #augmented Dickey Fuller test for stationarity: unemp is integrated!
 acf(unemp) #auto-corr plot

# since both unemp and infl are I(1), have to take differences (at least)
 modd=lm(diff(unemp) ~diff(infl)) 
 AIC(modd) #get the akaike IC

 usa_resid=modd$residuals #put resids into new vector
 plot(usa_resid,type="l") #looks better, right?

#put acf and pacf into one chart
 layout(c(1,2)) 
 acf(usa_resid)
 pacf(usa_resid)
#'close' the chart
 dev.off()

#estimate an arima model based on acf/pacf analysis: 
 #order=c(AR,I,MA) means:
 #    AR: number of AR terms (here AR(1))
 #    I: level of integration (here I(0) - no integration)
 #    MA: number MA terms (here: zero)
 #xreg: exogenous regressors
 moda=arima(diff(unemp),order=c(1,0,0),xreg=diff(infl))
 mode(moda)

#analyse output: looks better
 acf(moda$residuals[-1])
 pacf(moda$residuals[-1])
 tsdiag(moda) # nice function for ARMA geeks

 print(moda)
    
############## END: TIME SERIES #################




############## START: PROGRAMMING #################

#_______ if statement and for loop ________

#***if statement***:
# syntax: if ( EVALUATION ) { TO BE EVALUATED IF ECVALUATION=TRUE } [else {TO BE EVALUATED IF ECVALUATION=FALSE } ]
# remind binary operators >,>=,==,!=   and=& or=|
  if (2==2) {print("hello world!")}

 x=3
 if (x==2 | x==3) { 
     print("x is either 2 or 3") 
 } else { 
     print("x is either 2 or 3") 
 }

#***for loop***:
#syntax for (x in SEQUENCE) {EXECUTE FOR EACH ELEMENT OF SEQUENCE x}

 for (i in 1:length(usdata)) {
     print(names(usdata)[i])
 }
 #remark: "length()" is number of elements in an object



#***EXAMPLE***:

  mylist=list(a=runif(3,min=1,max=9),b=runif(5),c=rchisq(20,100))

#EXAMPLE: for each element (vector) in mylist, determine whether all entries are larger
# than one. If s, display the mean of the log of this vector, else the mean of 1+this vector

#use the vector mylist[[2]]>1
mylist[[2]]>1 #is a vector of TRUE And FALSE whether each element is >1
all(mylist[[i]]>1) #determines whether all elements are TRUE


for (i in 1:length(mylist)) {

    if ( all( mylist[[i]]>1 ) ) {

        print(mean(log(mylist[[i]])))

    } else {

        print(mean(log(1+mylist[[i]])))

    }
}


#______ FUNCTIONS ___________

#we have all used function such as mean now - you may write your own functions as well...
#syntax: FUNCTIONNAME <- function(arg1 [,MOREARGUMENTS] ) {function statements }

#SIMPLE EXAMPLE:
 a=34

 plus1 <- function(x) { #take input argument
     a=x+1 #add one
     return(a) #retrun the variable a and finish
 }

 b=plus1(a)

 #note that the value of a in the "global environment" did not 
 # change, although we said a=x+1 in the function
 print(a)


#MORE SOPHIST. EXAMPLE -- c
 #we cannot do mean(mylist)
 mean(mylist) #gives an error

 #but we can write a function that returns the mean for all list elements 

 meaner <- function(input) { 

    if (!is.list(input)) { #check if input argument is a list, stop function if not
        stop("only works with lists!")
    }

    output=numeric(0) #initialise the vector output containing zero elements
    for (i in 1:length(input)) {
        output[i]=mean(input[[i]])
    }
    return(output) #retuns the element output
 }

 meaner(mylist)


#EVEN MORE SOPHIST. EXAMPLE

 meaner2 <- function(input,display=FALSE) { 
    #there is an optional argument "display" that is 
    # set to false if the user doesnt set it

    if (!is.list(input)) { #check if input argument is a list, stop function if not
        stop("only works with lists!")
    }

    output=numeric(0) #initialise the vector output containing zero elements
    for (i in 1:length(input)) {
        output[i]=mean(input[[i]])
        if (display) {
            print(output[i])
        }
    }
    return(output) #retuns the element output
 }

 #try the following
 test=meaner2(mylist)
 test=meaner2(mylist,TRUE)



# USEFUL EXAMPLE: displaying mean, StDev, Median, Min, Max for a data.frame
 descdata <- function(ddata) {
    if (!is.data.frame(ddata)) {
        stop("only works with data frames!")
    }

    aout=numeric(0)
    for (j in 1:length(ddata)) {
        aout<-cbind(aout,c(mean(ddata[,j]),var(ddata[,j])^0.5,median(ddata[,j]),min(ddata[,j]), max(ddata[,j])))
    }
    
    rownames(aout)=c("Mean","Stdev","Median","Min","Max")
    colnames(aout)=names(ddata)
    return(aout)

 } 

 descdata(usdatanew)


#_________ VECTORIZATION äAND USABILITY ________________

#Although convenient, using loops is very slow - therefore
# try to avoid them by using other commands:
# e.g. instead of meaner the extremely useful "sapply()" command 
#that applies a function to each element of a list:

 sapply(mylist,"mean") # does the same as the example "LISTMEAN" above, but is far faster

 sapply(mylist,"plus1")

 # general tip: your function should output a list - this increases usability



#_______ EXAMPLE: a hand-made OLS function ________________

 OLS <- function (y,X) {
    if ( length(y) != nrow(X) ) {
        stop("y and X dimensions do not match")
    }
    # you should add some other consistency checks here

    #calcuate some things
    XtXinv=chol2inv(chol(crossprod(X))) 
    bhat=XtXinv%*% crossprod(X,y)
    sigmahat2=crossprod(y-X %*% bhat)/(nrow(X)-ncol(X))
    stnde=sqrt( diag(XtXinv) * sigmahat2 )

    #prepare a list as output
    out=list(
        coefs=bhat,
        stderrs=stnde,
        se=sqrt(sigmahat2),
        display= function() { #you may also put a function as list element
                outtable=cbind(bhat,stnde,bhat/stnde)
                colnames(outtable)=c("Coef.","Std.Err","t-Stat")
                rownames(outtable)=colnames(X)
                print(outtable)
            }
        )
    return(out) #return list and quit
 }

# now apply the function:
 X=cbind(rep(1,N),dtmatrix[,3])
 colnames(X)=c("intercept","infl")
 y=dtmatrix[,1]

 myobj=OLS(y,X)

 mybobj$coefs
 myobj$display()

## By now you should have an understanding of basic R functionalities,.
## I would recommend you to apply them directly to your projects, and refer to www.rseek.org for references