Ruimeg.files.wordpress.com



SUMMARY OF R COMMANDSGENERALgetwd() # shows the actual directorysetwd(choose.dir()) # sets the directorysetwd("C:/Users/rui.pinto/Documents/Rdocs/experiments")setwd("../") # goes down one directory in the pathsetwd("./data") #adds one directory to the pathinstall.packages("calibrate") # to download new packages from the internetpackage ? calibrate # to get info on the packagelibrary(help = calibrate)class(x) # shows the Class of objects: character, numeric, integer, complex, logical.sapply(X[1,],class) #shows the class of every variablestr(x) # shows info on the objectas.numeric(x): # to exchange the class of the object. Inf, NA and NaN exist:is.na() # used to logical test if objects are NA. is.nan() the same for NaN. NOTE: NA includes NaN but inverse not true.table(c(0,1,2,3,NA,3,3,2,2,3),useNA="ifany") #shows how many times each type of value appearsTo remove missing values: miss_vals <- is.na(x), x[!miss_vals].To remove missing values in more than one vector (or matrix): good_vals <- complete.cases(x, y), x[good_vals].na.rm= FALSE # does not include NA values in the calculation of a function e.g. mean(x=mydata , na.rm=FALSE).na.omit #is used in some functions to not use NAsreport[is.na(report)] <- 0 # to use zeros instead of NAIf one needs an integer, we can specify it with "L", e.g. 1 is numeric, 1L is integerhelp(zemanel) # lists help for the function or datasetlibrary () # lists a short description of the libraries availablelibrary (help="libraryname") # shows functions in that librarylibrary(calibrate) #search() # lists all the packages installed at the moment. Functions are used according to a certain order of their package.Order of packages : 1 - workspace (.Global environment), 2 - loaded libraries, 3 - packages according to "search()""environment" is a collection of pairs "symbol,value" (e.g. x as symbol and 4 as its value)."closure" (or function closure) is the pair "function, environment")ls() # lists all objects createddata () # lists all available datasets included in Rdata(trees) # shows the data "trees"rm(x,y) #removes (delete) x and yrm(list=ls()) # deletes all the variables in the workspacegetwd() # gets the work directorylist.files() # lists all the files in the work directory. It is the same as dir()b=list.files("C:/My Documents") # gets all the files in this directory.history(max.show=100, pattern="dimnames") #shows the last 100 commands that include the word "dimnames"AND, OR, AND/OR - &&, options(warn=-1) # to stop showing warningsstop("need to input two values") #error messagesregexpr(string1,string2) # comparison of stringsageGroups<-cut2(X$age,g=5) # turns a numeric variable into a factor variableVECTORSvector(mode = "logical", length = 10) # creates a vector with 10 values of FALSE (in case of "numeric", it's 0's.z<-c(5,9,1,0) # creates vectornames(z) <- c("foo", "bar", "norf","ze") # gives a name to each of the objects of z# create sequences of numbersx<-1:10seq(1,9,by=2) # Result [1] 1 3 5 7 9seq(8,20,length=6) # Result [1] 8.0 10.4 12.8 15.2 17.6 20.0rep(0,100) # creates 100 zerosrep(1:3,2) # Result [1] 1 2 3 1 2 3rep(1:3,rep(6,3)) # Result [1] 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3INDEXING[ # always returns object of the same class as the original (can be used to more than one element)[[ # used to extract elements of a list or data frame (single element, class is not necessarily the same)$ # used to extract elements of a list of data frame by name (similar to [[ )z[1,1] # first value of matrixz[c(2,3),2] # indexing for two values in column 2z[,2] # column 2z[1:2,] # lines 1 and 2x[1:6] # first six values of vector xx[c(2,4,9)] # selects only the values with index 2, 4 and 9x[-(1:6)] # excludes some values. It is the same as x[7:12]y=subset(x,x<7) #select those cases that meet the logical condition (x<7)MATRICES (every element is the same class)head(x) # displays the first 6 lines of the matrixz<-cbind(x,y) # creates a matrix of column vectors x y. Can be applied to matrices (rbind also works, for lines)dim(x)<- c(2,5) # creates a matrix 2 x 5 from a vector with 10 valuesz<-matrix(c(5,7,9,6,3,4),nrow=3) # creates matrix with 3 rows (along columns)z<-matrix(c(5,7,9,6,3,4),nrow=3,byrow=T) # creates matrix with 3 rows (along rows)dim(z) # to see the dimension of a matrixnrow(x) and ncol(x) # to see nr of rows or columns of matrixy%*%x # matrix multiplicationt(x) # transposed matrixnew=old[,-5] # deletes column 5attach(trees) # the labels of the variables become variables (then use e.g. mean(height))detach(trees) # "un"-attaches the labelstrees$height # works as attachapply(trees,2,mean) # calculates the mean of columns of matrix "trees" (for lines use 1)m <- matrix(1:4, nrow = 2, ncol = 2), dimnames(m) <- list(c("a", "b"), c("c", "d")) # names the elements of mdimnames(m) <- list(NULL,c("e", "f")) # deletes name of lines and puts a name in columns. Note: "names" and "rownames" do not work on matrices, only in data frames.rownames(x, do.NULL = TRUE, prefix = "row") # NOTE: prefix="col" can be used for the columns.rownames(x) <- value # value is a vector with the names of the rows. Used together with line below.order(X$var) #gives the indexes of the var by order of magnitudeY <- X[order(X$var1),] or we can use Y <- X[order(X$var1, X$var2),] to order a matrix according to variables y=x[1, 2, drop = FALSE] # using the drop=FALSE, y is a matrix, not a vectorLISTS (vectors that can contain elements of different classes)x<-list(1,"a", TRUE) # creates a list with these 3 elementsx <- list(a = 1, b = 2, c = 3) # to create names in listsze<-x[[1]] # creates vector ze with the first element of the list. b<-x[1] # creates list b. And in this case it is the same as b=x["a"]b<-x[c(1,3)] # to extract multiple elements of a listx$a # to access the value of a (which is an element of x). The same as x[["a"]]. But the $ cannot use computed names.x[[c(1,3)]] # is a vector with the 3rd value of the 1st element of list x. The same as x[[1]][[3]]FACTORS (used to represent categorical data. May be ordered or unordered)x <- factor(c("yes", "yes", "no", "yes", "no"))unclass(x) # returns the numerical values of each levelx <- factor(c("yes", "yes", "no", "yes", "no"), levels = c("yes", "no")) # defines the order of the valuesfactorsx<-gl(2,5) #makes a vector of 2 levels with 5 points eachinteraction(factor1,factor2); split(table(X) # counts the number of times each level appears in XDATA FRAMESData frames are a special type of list, where every element of the list has the same length (matrix). Elements are columns and length of each element is nr rows. Can store different classes of objects in each position.Looking at data: dim(X), names(X), nrow(X), ncol(X), quantile(X$var1), summary(X), head(X), class(X),sapply(X[1,],class)unique(X$var1), length(unique(X$var1)), table(X$var1) (for qual vars), table(X$var1,X$var2) (relation between vars, for qual vars), any(X$var>40), all(X$var>40); X[X$var1 > 0 & X$var2 > 0,c("Lat","Lon")] (we can also use "or" |); colSums(X) (has a problem with NA's); colMeans(X, na.rm=TRUE); rowMeans(X, na.rm=TRUE);mergedXY=merge(X,Y,all=TRUE) #other possibilities are x, y, by.x, by.y (eg. to merge according to one of the variables)x <- data.frame(foo = 1:4, bar = c(T, T, F, F))(the next 2 functions can also be applied in vectors)ze<-x[[1]] # creates vector ze from data frame x, with the first element of the data frame (column 1). Same as ze<-x$a b<-x[1] # creates list b from data frame x, with the 1st element of the data frame (column 1). And in this case it is the same as b=x["a"]row.names # to use with data framesrow.names(x)<-c("ze","manel","pedro", "filipe") # gives these names to the linesdata.matrix() # transforms a data frame into a matrix.nrow(x) and ncol(x) can be used with data framesdataX_frame$time<-dataX$time # attribution of one column of one data frame to another data framedataX_frame[,2]<-dataX[,2] # the same as abovedataX_frame[,1]<-dataX_matrix[,2] # attribution of one column of matrix to a data framedataX_frame[,2]<-dataX_matrix$time # ERROR can't pass atomic vector to frame (as above) using $melt(misShaped,id.vars="people",variable.name="treatment",value.name="value") #reshapes matrixBASIC PLOTSpar (col="red")# definitions to plots: col= "red" (color), pch=3 (symbol type), lty (line type), lwd =3 (line width), las=2 (orientation of the axis labels), bg (background color), mar =c(3.4, 4.2, 3.1, 5) (margins size), oma=c (0,0,1,0)(outer margin size), mfrow = c(2,3) (nr plots per row and column, filled row wise), mfcol (nr plots per row and column, filled column wise), cex=0.5 (point size)par(no.readonly = TRUE) # shows the actual values of all the par settingspar("col") # shows the actual colorpar(mfrow=c(1,2)) # creates a window of graphics of 1 x 2. Can also use mfcolexample(points) #opens a demo that shows different types of plotsdev.new() # opens a new window for figuredev.set(1) # changes the window to nr 1, so we can produce a plot theredev.off(2) # closes graphical window nr 2dev.list() # to know the number of plots opengraphics.off() # closes all graphspar(mfrow=c(1,1)) # to return graphics window to standard sizex=c(2,3) y=c(90,80) lines(x,y) # draws a line from point (2,90) to point (3,80)abline(50,2) # draws a line with origin in y=50 and slope = 2abline(v=3) # vertical line at x = 3abline(h=60) # horizontal line at y = 60fit <- lm(y ~ x); abline(fit) #linear regression on points x,y lines (x,y) # creates lines text(x,y,labels) #adds text labels to pointstitle # adds annotations to x, y axis labels, titles, subtitles, outer marginplot(x,y,xlab="travel time", ylab="whatever unit", cex.lab=2,cex.axis=1.5) #puts x and y labels and defines sizemtext #adds arbitrary text to the margins (inner or outer)legend("topleft",legend="Data",pch=20)axis #adds axis ticks/labelsexport formats: pdf, postscript, xfig (to edit a plot by hand in Unix), png (nice bitmap format), jpeg (photos), bitmap (bnp), bmp (windows bitmap format)# to define ranges in a plot, histogram, etcrange11<-range(as.numeric(outcome[, 11]), na.rm=TRUE)plot(jitter(x)) # to be able to see points that are on top of each otherTYPES OF PLOTSboxplot(Height,col="blue") # boxplot with median, 25% and 75%, plus bars that are 1.5 times the limits of the boxesboxplot(X$var1 ~ as.factor(X$var2),col="blue","orange", names=c("yes","no"), varwidth=TRUE))barplot(table(X$var1),col="blue")hist(X, main="Heart Attack",xlab="30-day Death Rate",xlim= range11,breaks=100)dens<-density(X$var1), plot(dens,lwd=3,col="blue") # its like a smoothed histogram, but with percentages. Allows to show more than one distribution.dens <- density(pData$AGEP), densMales <- density(pData$AGEP[which(pData$SEX==1)]), plot(dens,lwd=3,col="blue"),lines(densMales,lwd=3,col="orange")plot(Height,Volume, xlab="sample nr",ylab="temperature") # scatter plot (its identical to pairs) with axis labels# parameters for scatterplots x,y,type,xlab,ylab,xlim,ylim,cex,col,bgpairs(trees) # scatter plot of all the variables in matrix "trees"points(3,4) # draws a dot in coordinates (3,4)qqplot(x,y), abline(c(0,1)) #plots of percentiles of x against percentiles of yto plot groups points in different colors:plot(x,y,type="n") #creates axis but without the pointspoints(x[g=="Male"], y[g==["Male"], col="green") # first plot one of the groupspoints(x[g=="Female"], y[g==["Female"], col="blue", pch=19)plot(X$var1,Xvar2,col=X$var3) #it can also be done like thisimage(1:10, 161:236, as.matrix(X[1:10, 161:236])) # heatmap transposed (maybe transpose the matrix first)pdf export commands:pdf(file = "testRplot.pdf") # open the pdfx <- rnorm(100) hist(x)# plot whatever we wantdev.off()# it has to be called to close the pdf devicedev.copy2pdf(file="graph1.pdf") #this will copy exactly the graph that we seeLATTICE FUNCTIONS PLOTSlibrary(lattice)library(nlme)library(lattice)library(nlme)xyplot(distance ~ age | Subject, data = Orthodont) # this will plot all individuals in different plotsxyplot(distance ~ age | Subject, data = Orthodont, type = "b") # same, but with a linexyplot, bwplot,histogram,stripplot, dotplot, splot, levelplot, contourplotCONTROL STRUCTURESif, else; for; while; repeat; break; next; returnFORfor(i in 1:ncol(dat)){asdklfjaldfjk}for (i in 1:length(id){jadfjakjsdflkjalsdj}if (X==TRUE){ALKDJFADJKF}else# NOTE: when using "else" or "else if", the preceding } must be on the same line as "else"{alkdjflajkdfkl}# one can also use else if (with space between them)WHILEwhile (count<10) {asdfasdf}REPEAT/BREAKrepeat {asdfalsdkfjalsdjkif (asdf<0) {break}}NEXTfor (i in 1:100){if (i<=20{next # skips the first 20 iterations}}RETURNFUNCTIONSsource("functions_meg.R") # will load all the functions in this script to the workspacefix (sd) # opens the editor to write a function called sd (although in general one opens it in FILE/OPEN)help(functionX) # shows the functionX helpinvisible(x) # when x is the output, it does not print it (for some functions)structure of a function:sd2<- function (x) {2*x}formals(sd) # lists all the input arguments of the function (formal arguments). In this case "x"... # indicates a variable number of arguments that are usually passed to other functions, eg:myplot<- function (x,y,type="l", ...) {plot(x,y,type = type, ...) } #this passes all the other arguments of "plot" to "myplot"#in case "..." is the first argument of a function, it means the function does not know how many arguments are going to be input into it (e.g. functions "paste" and "args")#arguments that come after the "..." must be named explicitly (no positional or partial matching).echo=T # use it in the source function, to see the functions in the console as they are processed#if one object is defined in the body of a function and has no value, the function will look for its value in memory;#but functions can be defined inside another function. In that case, the function will look for its value in the function.example:make.power<-function(n) {pow <- function(x) {x^n}pow}Results: cube<make.power(3)cube(3) result is 27square(3) result is 9SPECIAL FUNCTIONSlapply(mtcars,mean) # calculates the mean of columns of "mtcars" (always returns a "list")y1<-lapply(mat1.df,function(x,y) sum(x)+y,y = 5) # example of using built-in functionsapply(split(mtcars$mpg, mtcars$cyl), mean, na.rm=TRUE) # similar to lapply. But if simplify=T, it simplifies (to a vector or matrix)apply(trees,2,mean) # calculates the mean of columns of matrix "trees" (for lines use 1)apply(X,1,quantile,probs=c(0.25,0.75)) # calculates the desired percentiles of each rowsplit(mtcars$mpg, mtcars$cyl) #splits variable mpg into list with groups defined in variable cyl. Used with lapply, sapplytapply(iris$Sepal.Length,levels_x,mean) # used in vectors, to calculate something (mean) based on coder "levels_x"rowSums, rowMeans, colSums, colMeans # used only on matrices (only interesting in large ones)mapply(rep,1:4, 4:1) # it is like apply, but multivariate. newdata <- subset(mydata, age >=20 | age <10newdata<-subset(mydata, sex=="m" | age>25)newdata<- mydata [which(mydata$gender=='F' & mydata$age >65)newdata<-subset(mydata, sex=="m" & age>25)order(vector) # gives the indexes of vector by (alphabetical, numerical) orderordered_vec<-matA[order (matA [,1], matA [,2]),] # orders matrix matA according to its column 1 and then column 2sort(vector) #sorts the vector or matrixREADSome file types: tab-delimited text; comma-separated text; Excel file; JSON File; HTML/XML file; Databaseb<-scan("ap1.txt") # reads a text file with only numbers to a vector bX<-read.csv(file.choose()) # opens the window to choose the fileread.table # arguments: file, header, sep, colClasses, nrows, comment.char, skip, stringsAsFactorsd<-read.table("ap1.txt", sep=",",header = FALSE) # reads a text file with a table (only numbers) into a data frame dd<read.table("ap.txt",header=TRUE,row.names=1) # read tab or space delimited with labels not including pos. (1,1)if row.names=3 it means that the row names are in column 3read.table("ap2.txt",header=TRUE) # read a tab or space delimited file (including position (1,1)NOTE: one can use also read.delimited (tab delimited) or read.csv (comma delimited) files.A <- matrix(scan("matrix.dat", n = 200*2000), 200, 2000, byrow = TRUE) # read big file (only numbers)#read files with labels in first rowread.table(filename,header=TRUE,sep=',') #read csv filesread.xls("doe.xls", sep="",stringsAsFactors=FALSE) #NOTE: needs library(xlsReadWrite) and only reads .xls (not .xlsx)read.xlsx(); read.xlsx2() # needs the xlsx packagereadLines # to read lines of a text filesource("ap_data.R") # to read in R code files (inverse of dump). Doesn't need to attribute it to new variable.k<-dget("ap_data.R") # for reading in R code files (inverse of dput). Needs to be attributed to new variable.load("workspace_meg.RData") # for reading in saved workspaces. In binaryload("ap.02") # loads the variables that are inside the file "ap.02". In binary (saved with "save")unserialize, for reading single R objects in binary form## read files directlycon <- file("./data/cameras.csv","r")cameraData <- read.csv(con)close(con)##download files from the internet (may use "library(XML)")con <- url("", "r") , x <- readLines(con,10) # to read 10 lines of an internet pagefileUrl <- ""download.file(fileUrl,destfile="./data/cameras.csv") list.files("./data"), dateDownloaded <- date()# to read and parse an internet pagehtml3 <- htmlTreeParse("", useInternalNodes=xpathSApply(html3, "//title", xmlValue)xpathSApply(html3, "//td[@id='col-citedby']", xmlValue) # this one will get "col-citedby" of the table ("td") WRITEwrite.tablewriteLinesdump("ap1",file="ap_data.R") # creates file with metadata ready to load using "source". Creates variable ap1dump(c("a","k"),file="multi_dump.R") # to create files with more than one variabledput(ap1,file="ap_data.R") # creates file with metadata ready to load using "dget". Doesn't create variable ap1save.image(file="workspace_meg.RData") # saves the workspace. In binarysavehistory("filename.Rhistory") # saves the history of commands. To reload, do loadhistory("filename.Rhistory")save(a,file="ap.02") # saves the object "a" in a file "ap.02". In binaryserializefilename_X<-paste("Pelagia_ctr_",depth_type[[1]],"_",season_type[[1]],".txt",sep="")write.table(final_data[[1]], file = filename_X,append = FALSE, sep = "\t")write.xls(response,paste(workpath,currentDOE,"/response.xls",sep="")) write.table(temp,file=paste(workpath,currentDOE,"/Run",i,".txt",sep=""),sep="\t",dec=",")save(X,Y,file="./data/cameras.rda") # allows one to save multiple datatables (also can use list instead of X,Y)DEBUGGING# message (function continues, maybe o problem), warning (something wrong, but not fatal) , error (fatal error), condition (indicates that something unexpected occurred; programmers can create their own conditions)traceback() # prints out the function call stack after an error occurs (does nothing if there's no error)debug # more important one. flags a function for debug (then it goes one line at a time). Use ENTER to continue.browser # suspends the execution of a function wherever it is called and puts the function in debug modetrace# allows to insert debugging code into a function in specific places (in general "browser" command)recover # allows to modify the error behavior so you can browse the function call stack. Use "options(error=recover)".print/cat # usefulSTATISTICSmean(x)var(x)summary(x)svd(x) # to do pcaCOMPLETE PCA example# How to open a txt (tab delimited) with a matrix (nothing on position 1,1) with labels on objects and variables and with commas as decimal point, and then run PCAC:\Users\rui.pinto\Documents\WorkMEg\data\Carragheenan ParisdataX<-read.table("carragR.txt",header=TRUE,sep="\t",dec=",",row.names=1)X<-data.matrix(dataX[6:1795])Xclass<-dataX[1:5]plot(1:1790,X[1,], type="l")plot(1:1790,t(X), type="l")num_comp<-6 # if one wants to find six componentspca_res<-svd(X, num_comp, num_comp) #Has to be applied on a matrix or dataframeeigenval<-pca_res$d[1: num_comp]diag_eig<-diag(eigenval)scoX<-pca_res$u%*%diag_eigloadX<-pca_res$vplot(scoX[,1],scoX[,2],type="p") # plots with dots. For lines do "type="l" (L)text(scoX[,1],scoX[,2],Xclass[[2]], pos=2,offset=0.5,cex=0.5) # plots labels 0.5 (offset) on the left (pos), size 0.5 (cex)PACKAGEShttr - to work with http connectionsforeign - to get data into R from SAS, SPSS, Octave, etclibrary(maps) DATA ANALYSISDATA MUNGING## CLEANINGnames(X)tolower(names(X)) # to change everything to lowercase# to split names in the dots:splitNames = strsplit(names(X),"\\.") firstElement <- function(x){x[1]}sapply(splitNames,firstElement) # to split namesnames(X)=sapply(splitNames,firstElement)# to replace symbols or lettersnames(X)=sub("_","",names(X),) #use gsub to substitute all of them# TO CREATE RANGES IN QUANTITATIVE VARIABLES (KINDA HISTOGRAM)timeRanges <- cut(reviews$time_left,seq(0,3600,by=600))table(timeRanges,useNA="ifany")# (ALTERNATIVELY, we can define the nr of ranges, using cut2)library(Hmisc)timeRanges<- cut2(reviews$time_left,g=6)table(timeRanges,useNA="ifany")# create new variableX$newvar<-varX# merge lines of Y that have the same value as X (in the columns defined)#NOTE: we can use by, by.x, by.y, allXY<-merge(X,Y,by.x="sporder",by.y="DDRS")# SORT AND ORDERXvector_sorted<-sort(X)Xmatrix_sorted<-Xmatrix(order(Xmatrix$ind_nr))# RESHAPE (create more lines, using only the defined variables)melt(X, id.vars="people",variable.name="treatment",value.name="value")INITIAL PLOTSplot(jitter(pData$sporder))# boxplotsboxplot(pData$AGEP ~ as.factor(pData$DDRS),col="blue")boxplot(pData$AGEP ~ as.factor(pData$DDRS),col=c("blue","orange"),names=c("yes","no"),varwidth=TRUE)# density plots (line histograms)dens <- density(pData$AGEP), densMales <- density(pData$AGEP[which(pData$SEX==1)])density(na.omit(X$v1)) #does not uses the NAsplot(dens,lwd=3,col="blue"), lines(densMales,lwd=3,col="orange") #for multiple distributions# histogramshist(pData$AGEP,col="blue",breaks=100,main="Age")# scatterplots (with colors according to a column)plot(pData$JWMNP,pData$WAGP,pch=19,col=pData$PUMA,cex=0.5)# scatterplots (with size according to a column)size_dots<-pData$JWMNP/max(pData$JWMNP)plot(pData$JWMNP,Pdata$wagp,pch=19,cex=size_dots*10)# to plot the points of a variable that are NA in another variabley[x < 0] <- NAboxplot(x ~ is.na(y))#NOTE: some plots may need library(Hmisc)HIERARCHICAL CLUSTERINGset.seed(1234); par(mar=c(0,0,0,0))x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2)y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2)plot(x,y,col="blue",pch=19,cex=2)text(x+0.05,y+0.05,labels=as.character(1:12))dataFrame <- data.frame(x=x,y=y)distxy <- dist(dataFrame)hClustering <- hclust(distxy) # col=color_vector will add colourplot(hClustering)(heatmap)dataMatrix <- as.matrix(dataFrame)[sample(1:12),]heatmap(dataMatrix)K-MEANSdataFrame <- data.frame(x,y)kmeansObj <- kmeans(dataFrame,centers=3)names(kmeansObj)(heatmap)set.seed(1234)dataMatrix <- as.matrix(dataFrame)[sample(1:12),]kmeansObj2 <- kmeans(dataMatrix,centers=3) #nstart=100 will produce 100 models with random start points and average thempar(mfrow=c(1,2),mar=rep(0.2,4))image(t(dataMatrix)[,nrow(dataMatrix):1],yaxt="n")image(t(dataMatrix)[,order(kmeansObj$cluster)],yaxt="n")(to plot the variable centers, do the command below to the different cluster centers)plot(kClust$center[1,1:10],pch=19,ylab="Cluster Center",xlab="")PCAsvd1 <- svd(scale(dataMatrixOrdered)) #UV-scaledpar(mfrow=c(1,3))image(t(dataMatrixOrdered)[,nrow(dataMatrixOrdered):1])plot(svd1$u[,1],40:1,,xlab="Row",ylab="First left singular vector",pch=19)plot(svd1$v[,1],xlab="Column",ylab="First right singular vector",pch=19)(eigenvalues)par(mfrow=c(1,2))plot(svd1$d,xlab="Column",ylab="Singluar value",pch=19)plot(svd1$d^2/sum(svd1$d^2),xlab="Column",ylab="Percent of variance explained",pch=19)(for faster pca calculation)fast.svd(scale(bigMatrix),tol=0)# tol=0 is the tolerance in variance that it calculates more pc'sNOTE: in the case of missing values, we can impute (calculates close values to the ones missing, using knn)library(impute)dataMatrix2 <- impute.knn(dataMatrix2)$data(to regenerate the data)svd1 <- svd(scale(faceData))# %*% is matrix multiplication# Here svd1$d[1] is a constantapprox1 <- svd1$u[,1] %*% t(svd1$v[,1]) * svd1$d[1]# In these examples we need to make the diagonal matrix out of dapprox5 <- svd1$u[,1:5] %*% diag(svd1$d[1:5])%*% t(svd1$v[,1:5])approx10 <- svd1$u[,1:10] %*% diag(svd1$d[1:10])%*% t(svd1$v[,1:10])REGRESSIONplot(galton$parent,galton$child,pch=19,col="blue")lm1 <- lm(galton$child ~ galton$parent) # y~xlines(galton$parent,lm1$fitted,col="red",lwd=3)summary(lm1)(residuals)plot(galton$parent,lm1$residuals,col="blue",pch=19)abline(c(0,0),col="red",lwd=3)NOTE: plot(lm1) # gives some plots to evaluate the model (residuals, outliers)MULTIPLE REGRESSIONlmBoth <- lm(hunger$Numeric ~ hunger$Year + hunger$Sex + hunger$Sex*hunger$Year)plot(hunger$Year,hunger$Numeric,pch=19)points(hunger$Year,hunger$Numeric,pch=19,col=((hunger$Sex=="Male")*1+1))abline(c(lmBoth$coeff[1],lmBoth$coeff[2]),col="red",lwd=3)abline(c(lmBoth$coeff[1] + lmBoth$coeff[3],lmBoth$coeff[2] +lmBoth$coeff[4]),col="black",lwd=3)REGRESSION WITH FACTOR VARIABLES lm1 <- lm(movies$score ~ as.factor(movies$rating))summary(lm1)REGRESSION WITH BINARY OUTCOME (logistic regression)logRegRavens <- glm(ravensData$ravenWinNum ~ ravensData$ravenScore,family="binomial")summary(logRegRavens)(confidence intervals)exp(logRegRavens$coeff)(ANOVA)anova(logRegRavens,test="Chisq")CONFIDENCE INTERVALSconfint(sampleLm4,level=0.95)REGRESSION FOR COUNT OUTCOMESplot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits")lm1 <- lm(gaData$visits ~ gaData$julian)abline(lm1,col="red",lwd=3)(CONFIDENCE INTERVALS)library(sandwich)confint.agnostic <- function (object, parm, level = 0.95, ...){cf <- coef(object); pnames <- names(cf)if (missing(parm))parm <- pnameselse if (is.numeric(parm))parm <- pnames[parm]a <- (1 - level)/2; a <- c(a, 1 - a)pct <- stats:::format.perc(a, 3)fac <- qnorm(a)ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm,pct))ses <- sqrt(diag(sandwich::vcovHC(object)))[parm]ci[] <- cf[parm] + ses %o% facci}confint(glm1)############################################glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1),family="poisson",data=gaData)plot(julian(gaData$date),gaData$simplystats/(gaData$visits+1),col="grey",xlab="Date",ylab="Fitted Rates",pch=19)lines(julian(gaData$date),glm2$fitted/(gaData$visits+1),col="blue",lwd=3)REGRESSION FOR RATESglm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1),family="poisson",data=gaData)plot(julian(gaData$date),glm2$fitted,col="blue",pch=19,xlab="Date",ylab="Fitted Counts")points(julian(gaData$date),glm1$fitted,col="red",pch=19)ROBUST LINEAR REGRESSIONlm1 <- lm(y ~ x); rlm1 <- rlm(y ~ x)ANOVAanova(lm1)#another way of doing anova:lm1 <- aov(movies$score ~ as.factor(movies$rating))TukeyHSD(lm1)#honestly significant difference testANOVA WITH MULTIPLE FACTORSaovObject4 <- aov(movies$score ~ movies$genre + movies$rating + movies$box.office)Model selectionlm1 <- lm(score ~ .,data=movies)aicFormula <- step(lm1)library(leaps);regSub <- regsubsets(score ~ .,data=movies)plot(regSub)library(BMA)bicglm1 <- bic.glm(score ~.,data=movies,glm.family="gaussian")print(bicglm1)DATA #google analytics ................
................

In order to avoid copyright disputes, this page is only a partial summary.

Google Online Preview   Download