# TODO: Add comment # # Author: mike-bowles ############################################################################### rm(list=ls()) setwd("/home/mike-bowles/Documents/StatisticsPapers/ESL/DataSets/Prostate") pdata <- read.table(file="data", header = TRUE, row.names=1) #put predictors into X X <- pdata[,1:8] #put response into Y Y <- pdata[,9] #pull out indices which authors used as training set Itrain <- which(pdata[,10]) #demean X and bring to unit variance for(i in 1:8){ m <- sum(X[,i]) m <- m/length(X[,i]) X[,i] <- X[,i]-m v <- var(X[,i]) X[,i] <- X[,i]/sqrt(v) } #put X into matrix form X <- as.matrix(X) #check covariances/correlation cov(X) #linear model on full set (train + test) linMod <- lm(Y ~ X) linMod #restrict model to training set YY <- Y[Itrain] XX <- X[Itrain,1:8] linMod <- lm(YY ~ XX) linMod ###################### ####################### #best subsets ############## ########## require(leaps) leapOut <- leaps(X,Y,nbest = 40) #have a look at leapOut str(leapOut) #have a look at leapOut$which w <- leapOut$which w #Compute cross-validation error for these subsets. xvalErr <- matrix(0.0, 8,40) num2char <- c("1","2","3","4","5","6","7","8") #start a loop for different subset sizes colIndex <- 1:8 #loop on different subset sizes for(isize in 2:8){ iSubsets <- which(rownames(w)[]==num2char[isize]) #start a loop to run though best subsets for(isets in 1:length(iSubsets)){ iCol <- which(w[iSubsets[isets],]) Xtemp <- as.matrix(X[,iCol]) nxval <- 10 Index <- 1:length(X[,1]) #crossvalidation for(ixval in 1:nxval){ #hold out 1/10th of data for out of sample (oos) error check Iout <- which(Index%%nxval==(ixval-1)) XtempTemp <- Xtemp[-Iout,1:ncol(Xtemp)] Xnew <- Xtemp[Iout,] Ytemp <- Y[-Iout] Ynew <- Y[Iout] #fit a linear model linMod <- lm(Ytemp ~ XtempTemp) v <- as.array(linMod$coefficients) yHat <- rep(0.0,length(Xnew[,1])) #use coefficients from lm to generate predictions on oos set #could also use "predict" function for(i in 1:length(Xnew[,1])){ yHat[i] <- v[1] for(j in 1:isize){ yHat[i] <- yHat[i] + Xnew[i,j]*v[j+1] } } #calculate residuals (fit error) dY <- yHat - Ynew #calculate sum squared error se <- (1/length(Xnew[,1]))*sum(dY*dY) #accumulate sum sq error for all 10 xval passes xvalErr[isize,isets] <- xvalErr[isize,isets] + se/nxval } } } #run separate for subset size = 1 isize <- 1 iSubsets <- which(rownames(w)[]==num2char[isize]) #start a loop to run though best subsets for(isets in 1:length(iSubsets)){ iCol <- which(w[iSubsets[isets],]) Xtemp <- as.matrix(X[,iCol]) nxval <- 10 Index <- 1:length(X[,1]) for(ixval in 1:nxval){ Iout <- which(Index%%nxval==(ixval-1)) XtempTemp <- Xtemp[-Iout,1:ncol(Xtemp)] Xnew <- Xtemp[Iout,] Ytemp <- Y[-Iout] Ynew <- Y[Iout] linMod <- lm(Ytemp ~ XtempTemp) v <- as.array(linMod$coefficients) yHat <- rep(0.0,length(Xnew)) for(i in 1:length(Xnew)){ yHat[i] <- v[1] for(j in 1:isize){ yHat[i] <- yHat[i] + Xnew[i]*v[j+1] } } dY <- yHat - Ynew se <- (1/length(Xnew))*sum(dY*dY) xvalErr[isize,isets] <- xvalErr[isize,isets] + se/nxval } } #let's have a look at the xvalErr xvalErr #collect all the non-zero elements of xvalErr and plot according to subset size output <- matrix(0.0,1,2) temp <- output output[1,1] <- 8 output[1,2] <- sqrt(xvalErr[8,1]) icount <- 1 for(i in 1:7){ for(j in 1:40){ if(xvalErr[i,j]>0.0){ temp[1,1] <- i temp[1,2] <- sqrt(xvalErr[i,j]) output <- rbind(output,temp) } } } plot(output) #Forward Step-wise - using cross-validation for variable selection. #Pick the first variable Index <- 1:nrow(X) colIndex <- 1:8 seBest <- 1000000.0 seArray <- rep(0.0,7) Xtemp <- X[,1] nxval <- 10 for(iTry in 1:8){ Xtemp <- X[,iTry] se <- 0.0 for(ixval in 1:nxval){ Iout <- which(Index%%nxval==(ixval-1)) XtempTemp <- Xtemp[-Iout] Xnew <- Xtemp[Iout] Ytemp <- Y[-Iout] Ynew <- Y[Iout] linMod <- lm(Ytemp ~ XtempTemp) v <- as.array(linMod$coefficients) yHat <- rep(0.0,length(Xnew)) for(i in 1:length(Xnew)){ yHat[i] <- v[1] + Xnew[i]*v[2] } dY <- yHat -Ynew seTemp <- (1/length(Xnew))*sum(dY*dY) se <- se + seTemp/nxval } #print(se) if(se