#R-pieSales - Fordham



#R-pieSales Regression of pie sales on price and advertising expenses.

#all lines starting with # are ignored by R. They are comments for u to

#to understand what the R code is doing. Good programmers include lots of comments

# fordham.edu/economics/vinod/ch14ppln.ppt has a powerpoint

# Note that R ignores all material on a line beyond # symbol

# Clear the workspace so results from a previous session cannot leak in.
rm(list = ls())

# Echo the current date so the session printout is time-stamped.
paste("Today is", date())

# Apple pie sales (the dependent variable).  c() concatenates values
# into one vector.  If the input needs more than one line, end the
# earlier lines with a comma: the comma tells R that more is coming.
# You should get the > prompt back when entry is complete; a + prompt
# means R wants more -- press the escape key to get out.
y <- c(350, 460, 350, 430, 350, 380, 430, 470, 450, 490,
       340, 300, 440, 450, 300)

# Price charged for the apple pies.
x <- c(5.5, 7.5, 8.0, 8.0, 6.8, 7.5, 4.5, 6.4, 7.0, 5.0,
       7.2, 7.9, 5.9, 5.0, 7.0)

# Advertising expenses.
z <- c(3.3, 3.3, 3.0, 4.5, 3.0, 4.0, 3.0, 3.7, 3.5, 4.0,
       3.5, 3.2, 4.0, 3.5, 2.7)

#advertising expenses

# ---- Check data entry ----------------------------------------------
# Print each vector and draw quick pictures; typing mistakes in the
# data usually show up immediately in these plots.  A semicolon would
# be needed to put several commands on one line.

print(y)     # same output as typing y at the prompt
plot(y)      # index plot of sales
hist(y)      # histogram of sales
boxplot(y)   # box plot -- see whether these plots make sense

print(x)     # R is case sensitive: the wrong upper case will fail
plot(x)
hist(x)      # histogram of price
boxplot(x)

print(z)
plot(z)
hist(z)      # histogram of advertising
boxplot(z)

#Advanced Check data Entry Omit at the elementary level

#sophisticated box plots for fun

# ---- Sophisticated box plots (optional at the elementary level) ----
# The gtools package must first be installed on the hard drive via the
# "install packages" menu in the R GUI; library() then loads its
# functions into the current session's memory.
library(gtools)

# quantcut() turns x into a factor with one level per quarter of the data.
price_quartile <- quantcut(x)

# One notched box plot of x for each quartile group (4 in all).
boxplot(split(x, price_quartile), col = "lavender", notch = TRUE)

# ---- One-sample t tests --------------------------------------------
# For each variable in turn we test the null hypothesis that its
# population mean mu is zero, and print the sorted data alongside.
t.test(x)
sort(x)   # sorted prices printed to the screen

t.test(y)
sort(y)   # sorted sales

t.test(z)
sort(z)   # sorted advertising

#Learn how to download a package from the Internet into R

# click on packages menu and choose Set CRAN mirror ...

#choose a location near you say PA2 Pennsylvania from New York

#Click on on install packages ... A long alphabetic list will appear

#HIGHLIGHT the name fBasics and click on OK at bottom

#another R package called car is needed for the Durbin Watson test

# fBasics supplies basicStats(); install it once from a CRAN mirror
# (Packages menu), then load it into the current session.
library(fBasics)

# All the basic descriptive statistics for y, x and z in one table.
dat <- cbind(y, x, z)
basicStats(dat)

# y x z

#nobs 15.000000 15.000000 15.000000

#NAs 0.000000 0.000000 0.000000

#Minimum 300.000000 4.500000 2.700000

#Maximum 490.000000 8.000000 4.500000

#1. Quartile 350.000000 5.700000 3.100000

#3. Quartile 450.000000 7.500000 3.850000

#Mean 399.333333 6.613333 3.480000

#Median 430.000000 7.000000 3.500000

#Sum 5990.000000 99.200000 52.200000

#SE Mean 16.401703 0.302508 0.126190

#LCL Mean 364.155178 5.964518 3.209350

#UCL Mean 434.511488 7.262149 3.750650

#Variance 4035.238095 1.372667 0.238857

#Stdev 63.523524 1.171609 0.488730

#Skewness -0.215868 -0.431478 0.373353

#Kurtosis -1.572078 -1.343088 -0.847871

#nobs= number of observations,

#NAs=not available or missing data, usually We want zero missing data!

#1. Quartile means first quartile, 3. Quartile is third quartile

#SE Mean is standard error of mean for testing mu=0,

#LCL Mean is lower 95% confidence limit for mean, UCL Mean=upper limit

# Skewness, Kurtosis are Pearson's measures based on moment orders 3 and 4.

#******

#BEGIN OUTLIER DETECTION FUNCTION

#following function (about 30-lines of code) automatically computes outliers

get.outliers = function(x) {
  # Detect outliers in a numeric vector with the 1.5*IQR rule of thumb:
  # a value is flagged when it lies below Q1 - 1.5*IQR or above
  # Q3 + 1.5*IQR.  Prints a short report and returns a list with the
  # flagged values, their counts, and the two fences.
  # Original author: H. D. Vinod, Fordham University, New York, 2006.
  #
  # Args:
  #   x: a numeric vector; a matrix with 2+ columns is rejected.
  # Returns:
  #   list(below, nLO, above, nUP, low.lim, up.lim)
  if (ncol(as.matrix(x)) > 1) {
    print("Error: input to get.outliers function has 2 or more columns")
    return(0)
  }
  su = summary(x)           # su[2] = 1st quartile, su[5] = 3rd quartile
  iqr = su[5] - su[2]       # inter-quartile range
  dn = su[2] - 1.5 * iqr    # lower fence
  up = su[5] + 1.5 * iqr    # upper fence
  # FIX: the original line "LO=x[xup]" was garbled -- the < and >
  # comparisons were lost, and nLO / UP were never defined.
  LO = x[x < dn]            # observations below the lower fence
  nLO = length(LO)
  UP = x[x > up]            # observations above the upper fence
  nUP = length(UP)
  print(c(" Q1-1.5*(inter quartile range)=",
          as.vector(dn), "number of outliers below it are=",
          as.vector(nLO)), quote = F)
  if (nLO > 0) {
    print(c("Actual values below the lower limit are:", LO), quote = F)
  }
  print(c(" Q3+1.5*(inter quartile range)=",
          as.vector(up), " number of outliers above it are=",
          as.vector(nUP)), quote = F)
  if (nUP > 0) {
    print(c("Actual values above the upper limit are:", UP), quote = F)
  }
  list(below = LO, nLO = nLO, above = UP, nUP = nUP,
       low.lim = dn, up.lim = up)
}

#xx=get.outliers(x)

# END FUNCTION here copy+paste all the way to this line

#Now assuming x, y and z are already in memory, use the outlier function as:

# Run the outlier scan on each variable in turn (x, y, z assumed to be
# in memory already).  If outliers appear on either side, look at the
# data again: fix or delete genuine data-entry errors.  If there is no
# error, what to do is controversial in statistics -- some authors
# suggest omitting outliers, others say NO.
xx <- get.outliers(x)
xx <- get.outliers(y)
xx <- get.outliers(z)

# ---- Pairwise correlation tests ------------------------------------
# Null hypothesis: the correlation coefficient is ZERO, one pair at a
# time.  Three variables give three possible pairs, hence three tests.
# To save any printed result to a file, wrap the call like:
#   capture.output(cor.test(x,y), file="c:/stat2/PieSaleOutput.txt", append=T)
cor.test(x, y)   # price vs sales
cor.test(z, y)   # advertising vs sales
cor.test(x, z)   # price vs advertising

# A matrix of all pairwise correlation coefficients at once.
vmtx <- cbind(y, x, z)   # binds columns of data into a matrix
cor(vmtx)

# Caution: these tests only reveal possible LINEAR relations for each
# pair; they may fail to detect nonlinear relations.  Plotting helps.

#Now REGRESSION ANALYSIS needed if 3 or more variables are involved

# ---- Multiple regression (needed once 3+ variables are involved) ---
# lm() = linear model fits y = b0 + b1*x + b2*z + error.  The result
# is stored in the object reg1 (any name would do) and reused below.
reg1 = lm(y ~ x + z)

# Detailed regression output: coefficients, t values, multiple
# R-square, adjusted R-square, standard error, degrees of freedom.
# Excel also reports "multiple R" = sqrt(multiple R-square); it need
# not be reported.
summary(reg1)

# plot(reg1) draws sophisticated residual diagnostics (hit "enter" a
# few times to page through).  They are for your information and later
# use -- do not attach them to the term paper.  To save screen output:
# capture.output(summary(reg1), file="c:/stat2/PieSaleOutput.txt", append=T)

confint(reg1)               # 95% confidence intervals for coefficients
# capture.output(confint(reg1), file="c:/stat2/PieSaleOutput.txt", append=T)
confint(reg1, level = 0.99) # 99% intervals; confint is generic to stats
                            # and does not need the car package

r1 = resid(reg1)            # object r1 holds the residuals
sum(r1^2)                   # prints the sum of squared residuals

# ANOVA table with an F value for each regressor separately.  For the
# overall F test needed in the paper, use the LAST line of
# summary(reg1) output, e.g.:
#   F-statistic: 6.539 on 2 and 12 DF,  p-value: 0.01201
anova(reg1)

ano = anova(reg1)           # keep the table for further manipulation
# FIX: the original extracted the column as ano$S, which depends on
# fragile partial name matching; the full column name "Sum Sq" is
# exact and gives identical values.
regSS = ano[["Sum Sq"]][1] + ano[["Sum Sq"]][2]  # regression sum of squares
regSS
residSS = ano[["Sum Sq"]][3]                     # residual sum of squares
residSS
totalSS = regSS + residSS   # INDIRECT estimate of total sum of squares
totalSS
totSS = var(y) * (length(y) - 1)
totSS                       # direct estimate; should match exactly

# ---- Critical values from the t and F tables (qt and qf) -----------
alp = 0.05           # the alpha (significance) level
alpby2 = alp / 2     # alpha/2 for the typical two-sided t test
dft = length(y) - 3  # t degrees of freedom: regressing y on x and z
                     # loses 3 degrees of freedom

qt(alpby2, df = dft) # quantile of the t distribution = critical value

# FIX: the next sentence was missing its leading # in the original,
# which stopped the script with a syntax error at this point.
# To find the critical value of the F test, use this command:
qf(alp, df1 = 2, df2 = dft, lower.tail = F)  # quantile of F distribution

# ---- p-values from the t and F tables (pt and pf) ------------------
# p-values are already given by R; here is how to reproduce them.
# Example: the t value in the summary table is -2.306 for pie sales.
pt(-2.306, df = dft, lower.tail = T) * 2
# The tail area is multiplied by 2 because the t test is two-sided;
# this gives the p value 0.03976325.

# The overall F test uses the one-sided upper tail; DO NOT double it.
pf(6.539, df1 = 2, df2 = 12, lower.tail = F)  # answer: 0.01200411

#PLOTTING Headings: main means main heading, xlab is Label for x axis

# ---- Scatter plots (main= is the title, xlab=/ylab= label the axes) -
plot(x,y, main="Fig. 1: Demand Curve ", xlab="Price", ylab="Sales")
# abline(lm(y~x)) would overlay the fitted regression line, and
# lines(lowess(x,y)) the best-fitting free-hand nonlinear line; if the
# free-hand curve differs much from the straight line, a nonlinear
# model may be needed -- worth saying in the future-work discussion.

library(car)  # install once, then bring into the current session
# scatterplot() also shows box plots for both variables + lowess lines.
scatterplot(x,y, main="Fig. 1: Demand Curve ", xlab="Price", ylab="Sales")

# Same idea for advertising (z) against sales (y).  If the lowess
# smoother suggests nonlinearity, add square terms: a square regressor
# must be written I(z^2) inside lm(); a bare +z^2 will NOT work, e.g.
#   reg2 = lm(y ~ x + z + I(z^2))
scatterplot(z,y, main="Fig.2: Z against Y", xlab="Z", ylab="Y")

# ---- Residual diagnostics ------------------------------------------
r1 <- resid(reg1)

# Ideally neither plot shows any particular pattern; a visible pattern
# means a basic assumption of regression is not satisfied by our model.
plot(x,r1, main="Fig. 3: Residuals against X", xlab="X", ylab="Residuals")
plot(z,r1, main="Fig. 4: Residuals against Z", xlab="Z", ylab="Residuals")

# Durbin-Watson (DW) test, needed if the residual plots show a serial
# correlation (autocorrelation) pattern; the car package prints it.
# Type help(durbin.watson) for details.
library(car)
durbin.watson(reg1)

# Hand-made check: correlation between residuals at time t and the
# residuals at time t-1 (time lag of order 1).
r1 <- resid(reg1)
n <- length(r1)
cor(r1[1:(n - 1)], r1[2:n])

# ---- Fitted against actual values ----------------------------------
fity <- fitted(reg1)

# A good fit hugs a virtual 45-degree line.  The visual angle is
# exactly 45 degrees only when both axes share origin and scale.
plot(y,fity, main="Fig. 5: Fitted against Actual y", xlab="Actual Y", ylab="Fitted Y")
lines(c(min(y),max(y)), c(min(y),max(y)))  # draws the 45-degree line

# Redraw forcing a common range on both axes so the reference line
# really is at 45 degrees.
miny <- min(y)
minfy <- min(fity)
minboth <- min(miny, minfy)
maxy <- max(y)
maxfy <- max(fity)
maxboth <- max(maxy, maxfy)
plot(y,fity, main="Fig. 5: Fitted against Actual y", xlab="Actual Y", ylab="Fitted Y", xlim=c(minboth,maxboth), ylim=c(minboth,maxboth) )
lines(c(minboth,maxboth), c(minboth,maxboth))  # the 45-degree line

# scatterplot() from car adds quartile information for y and fity.
# Note this is less useful than plotting residuals against x and z,
# where possible nonlinearities -- and model improvements -- show up.
library(car)
scatterplot(y,fity,main="Fig. 5: Fitted against Actual y", xlab="Actual Y", ylab="Fitted Y")
lines(c(min(y),max(y)), c(min(y),max(y)))  # virtual 45-degree solid line

#Regression with additional regressor for squared x

# ---- Regression with an added squared-price regressor --------------
reg2 <- lm(y ~ x + z + I(x^2))
anova(reg2)    # shows the contribution of adding the square term
summary(reg2)  # compare adjusted R-squares to decide whether the
               # square term helps or hurts; nonlinear terms are
               # sometimes needed even when they lower adjusted R-sq

# Collinearity: if two or more regressors are highly correlated with
# each other, standard errors may be inflated and coefficients can be
# nonsense.  Diagnose (multi)collinearity with the vif() function of
# the car package (type ?vif); computed values above 5 signal trouble.

APPENDIX METHODS OF READING DATA INTO R

If you do not know all the following terms, your best bet is to learn these basic computing concepts from someone or by going on the Internet. Learning these things could take a couple of hours:

1) file: All information inside a computer is remembered as a file. Each file must have a name.

2) file extension. The file name consists of two parts (name)(extension). The extension tells the operating system the kind of file it is. Microsoft word documents have the extension doc, Excel files have the extension xls . Most files seen on the internet have the extension htm or html which stand for hypertext markup language. Many read-only files have extension pdf. These are universally readable on any computer and follow a format owned by Adobe corporation. There are many more formats.

3) Directory. Since computer files with the same name can be confusing to the operating system, computers place the files into different directories (file folders) and sub directories, sub-sub directories.

4) c drive. This is the place where most Windows computers keep all information. It is called a hard disk and resides inside the PC. The drive is denoted and addressed by "c:" with a colon.

5) text files. The files created by MS word or Excel are not human readable. They are in a binary format, which only computers can understand. Simple text files are the only ones I know which are human readable. Open source Latex for document preparation and R software often use the text format for ease of human reading. The text files take up minimal space on the computer and are robust (not sensitive to changes in operating systems, software vendor whims, etc).

6) The file menu for MS Word or Excel allow you to save the information in various formats. There is an option called "save as …" I often keep text versions of some files

for reasons mentioned above.

7) csv extension is for "comma separated values". Excel files can be "save as …" csv. This is way down in the list of options under save as . . . .

8) Some naïve computer users do not learn about the directories and let the computer choose the directory where to save the information. If more than one user uses a computer, this location becomes cumbersome to access.

9) Creating a directory on the c drive

This is an important skill worth learning. Here is what you do:

Double Left click on "My computer"

Double Left click on "Local Disk: (c:)"

Right Click on what you see and choose "NEW" for new folder and give it a name of your choice (pr1) for project 1. Choose short names without spaces.

10) Always seeing file names and extensions. Once you get into c drive, click on Tools menu and choose "Folder Options" and "View" Tab of it.

Also insert a check mark on "Display the full path in the address bar"

Also insert a check mark on "Display the full path in the title bar"

Further down, there is one place where it has a check mark for "Hide extension for Known File Types". This means the extensions doc, xls, pdf etc are hidden from view. This is a bad idea, if you want to read or access the file from a computer program. I recommend that you remove the check mark

Before leaving, click on "Apply to All Folders".

11) How to save a file. Once you create information, you want to save it as a file somewhere. The main point of saving is to get back to it at will. If you save all files for project 1 in the directory pr1, you will have access to it by clicking on my computer, c drive and then clicking on folder named pr1. Viewing the folders you should always opt for details and click on the date header so that latest file is at the top.

If you are unable or unwilling to learn all the above concepts, you should simply type all data numbers separated by commas. For example as:

x=c(2,5,7,99)

y=c(23,1,4,5)

METHOD A for reading data Create and Read a text file. You have to know how the directories in a computer work. c: means c drive etc. etc.

I am attempting to teach this to you, but it may be difficult for some of you.

In the following example, create a text file with 3 columns Open a blank MS word document. Type in the first row x y z as column names (so we will say header=True)

The actual data is lined up under the column names.

x y z

1 2 3

4 5 7

8 9 10

7 11 11

6 5 10

Save this MS Word document as a file using "save as" a text file "test-r-data.txt" in some directory. In my case I use the directory "data". It is a good idea to pick the txt extension for text files. Use windows explorer (My computer--> local disk--> data--> etc.). In the tools menu click on folder options . . . Then get view and be sure that there is no check mark on "Hide extensions for known file types". Now make sure the file is there under the right name. Only then use the following R command

myxyz ................
................

In order to avoid copyright disputes, this page is only a partial summary.

Google Online Preview   Download