Introduction

What is big data? According to Wikipedia, big data refers to data sets that are too large or complex for traditional data-processing application software to deal with adequately. In practice, big data means that the sample size (\(n\)) is extremely large, the number of explanatory variables (\(p\)) is extremely large, or both are.

Suppose we have a response variable \(Y\) and \(p=1000\) predictor variables. It is highly unlikely that all of the predictors are relevant for explaining the variation in \(Y\), and just as unlikely that all of them are useful for making future predictions \(\hat{Y}\). Today, we take a look at helpful methods for simultaneous parameter estimation and variable selection in the classic linear model.
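
To fix notation for what follows (a generic sketch, not specific to any one package), the linear model is

\[
Y = X\beta + \epsilon, \qquad \epsilon \sim N(0,\sigma^2 I),
\]

and the shrinkage methods in Part 2 estimate \(\beta\) by minimizing a penalized residual sum of squares,

\[
\hat{\beta} = \arg\min_{\beta}\; \|Y - X\beta\|_2^2 + \lambda\, P(\beta),
\]

where \(\lambda \ge 0\) controls the amount of shrinkage and \(P(\beta)\) is a penalty such as \(\|\beta\|_2^2\) (ridge) or \(\|\beta\|_1\) (lasso).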

Part 1: Simulate and Meditate

Chunk 1: Simulate Data

# Packages used throughout: tidyverse (ggplot2/dplyr/tibble), broom (glance),
# glmnet (ridge/lasso/elastic net), modelr (model_matrix), Ecdat (Participation data)
library(tidyverse)
library(broom)
library(glmnet)
library(modelr)
library(Ecdat)

set.seed(216)
X=matrix(rnorm(100000),500,200)
beta=c(rep(5,5),rep(-2,5),rep(0,190))
set.seed(480)
epsilon=rnorm(500,0,10)
y=X%*%beta+epsilon

SIM.DATA=data.frame(y=y,X=X)
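
As a quick sanity check, the simulated design has \(n=500\) observations and \(p=200\) predictors, and only the first 10 coefficients are nonzero (X1:X5 equal 5, X6:X10 equal -2):

# Sanity check: 500 rows, 1 response + 200 predictors, 10 truly nonzero coefficients
dim(SIM.DATA)
sum(beta != 0)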

Chunk 2: Fit Linear Model

lm.model=lm(y~.,data=SIM.DATA)
glance(lm.model)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.759         0.597  10.3      4.70 1.67e-33   200 -1745. 3894. 4745.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
param.est=lm.model$coefficients
param.conf=confint(lm.model)
param.lm=data.frame(cbind(param.est,param.conf))[-1,] #Remove Intercept
names(param.lm)=c("Estimate","Lower","Upper")
param.lm = param.lm %>%
              mutate(Significant=factor(ifelse(0>Lower & 0<Upper,"No","Yes")))

ggplot(param.lm[1:5,]) +
  geom_pointrange(aes(x=1:5,y=Estimate,ymin=Lower,ymax=Upper,color=Significant),size=2)+
  theme_minimal()+
  scale_color_manual(drop=F,values=c("lightskyblue2","gray"))+
  xlab("X1:X5")

ggplot(param.lm[6:10,]) +
  geom_pointrange(aes(x=6:10,y=Estimate,ymin=Lower,ymax=Upper,color=Significant),size=2)+
  theme_minimal()+
  scale_color_manual(values=c("lightskyblue2","gray"))+
  xlab("X6:X10")

ggplot(param.lm[11:200,]) +
  geom_pointrange(aes(x=11:200,y=Estimate,ymin=Lower,ymax=Upper,color=Significant))+
  theme_minimal()+
  scale_color_manual(values=c("lightskyblue2","gray"))+
  xlab("X11:X200")

Chunk 3: Linear Model for Each Potential Predictor

COEF=rep(NA,200)
P.VAL=rep(NA,200)
for(j in 1:200){
  individual.mod=lm(y~.,data=SIM.DATA[,c(1,j+1)])
  COEF[j]=coef(individual.mod)[2]
  P.VAL[j]=summary(individual.mod)$coefficients[2,4]
}

KEEP=P.VAL<0.01
ACTUAL=c(rep("NonZero",10),rep("Zero",190))

tibble(COEF,P.VAL,KEEP) %>% 
  ggplot() +
  geom_point(aes(x=COEF,y=P.VAL,color=KEEP),size=2) +
  geom_hline(yintercept=0.01,linetype="dashed")+
  scale_color_manual(values=c("lightskyblue2","gray"))+
  theme_minimal()

tibble(ACTUAL=ACTUAL,KEEP=factor(KEEP,levels = c(TRUE, FALSE), labels = c('P-val<0.01', 'P-val>0.01'))) %>% 
  table() %>% prop.table()
##          KEEP
## ACTUAL    P-val<0.01 P-val>0.01
##   NonZero       0.04       0.01
##   Zero          0.01       0.94
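
The 1% of truly zero variables that we mistakenly keep is roughly what multiple testing predicts: with 190 null predictors each tested at level 0.01, we expect about two false positives. A quick check using the vectors defined above:

# Expected vs. observed false positives among the 190 truly-zero predictors
190 * 0.01                  # expected number of false discoveries at this cutoff
sum(KEEP[ACTUAL=="Zero"])   # observed number of truly-zero predictors kept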

Chunk 4: Modifying the Cutoff for P-Values

# Please try different Cutoff values
# Try to find out the smallest Cutoff value that can keep all important variables
Cutoff = 0.05

COEF=rep(NA,200)
P.VAL=rep(NA,200)
for(j in 1:200){
  individual.mod=lm(y~.,data=SIM.DATA[,c(1,j+1)])
  COEF[j]=coef(individual.mod)[2]
  P.VAL[j]=summary(individual.mod)$coefficients[2,4]
}

KEEP=P.VAL<Cutoff
ACTUAL=c(rep("NonZero",10),rep("Zero",190))

tibble(ACTUAL=ACTUAL,KEEP=factor(KEEP,levels = c(TRUE, FALSE), labels = c(paste0('P-val<',Cutoff), paste0('P-val>',Cutoff)))) %>% 
  table() %>% prop.table()

lm.model=lm(y~.,data=SIM.DATA[,c(1,which(KEEP)+1)])
param.est=lm.model$coefficients
param.conf=confint(lm.model,level=1-Cutoff)
param.lm=data.frame(cbind(param.est,param.conf))[-1,] #Remove Intercept
names(param.lm)=c("Estimate","Lower","Upper")
param.lm = param.lm %>%
  mutate(Significant=factor(ifelse(0>Lower & 0<Upper,"No","Yes")))

ggplot(param.lm[1:5,]) +
  geom_pointrange(aes(x=1:5,y=Estimate,ymin=Lower,ymax=Upper,color=Significant),size=2)+
  theme_minimal()+
  scale_color_manual(drop=F,values=c("lightskyblue2","gray"))+
  xlab("X1:X5")

ggplot(param.lm[6:10,]) +
  geom_pointrange(aes(x=6:10,y=Estimate,ymin=Lower,ymax=Upper,color=Significant),size=2)+
  theme_minimal()+
  scale_color_manual(drop=F,values=c("lightskyblue2","gray"))+
  xlab("X6:X10")

ggplot(param.lm[11:(dim(param.lm)[1]),]) +
  geom_pointrange(aes(x=11:(dim(param.lm)[1]),y=Estimate,ymin=Lower,ymax=Upper,color=Significant))+
  theme_minimal()+
  scale_color_manual(values=c("lightskyblue2","gray"))+
  xlab("X11:X200")

Part 2: Shrinkage Estimation and More Meditation

Chunk 1: Penalized Estimation Path for Ridge

ridge.mod=glmnet(x=as.matrix(SIM.DATA[,-1]),
                 y=as.vector(SIM.DATA[,1]),
                 alpha=0)
plot(ridge.mod,xvar="lambda")
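
Notice that ridge shrinks the coefficients toward zero but never sets them exactly to zero, so it does no variable selection. You can check this by extracting the coefficients anywhere along the path; s = 5 below is just an arbitrary example value of \(\lambda\):

# Ridge keeps all 200 predictors in the model
ridge.coef=as.numeric(coef(ridge.mod,s=5))   # intercept + 200 slopes at lambda = 5
sum(ridge.coef[-1]==0)                       # expect 0: nothing is exactly zero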

Chunk 2: Penalized Estimation Path for Lasso

lasso.mod=glmnet(x=as.matrix(SIM.DATA[,-1]),
                 y=as.vector(SIM.DATA[,1]),
                 alpha=1)
plot(lasso.mod,xvar="lambda")
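
In contrast, the lasso path shows coefficients hitting exactly zero as \(\lambda\) grows, so the lasso performs variable selection automatically. Again, s = 1 is just an arbitrary example value of \(\lambda\):

# Lasso zeroes out many coefficients, dropping those predictors from the model
lasso.coef=as.numeric(coef(lasso.mod,s=1))   # intercept + 200 slopes at lambda = 1
sum(lasso.coef[-1]!=0)                       # number of predictors still in the model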

Chunk 3: Penalized Estimation Path for Elastic Net

enet.mod=glmnet(x=as.matrix(SIM.DATA[,-1]),
                 y=as.vector(SIM.DATA[,1]),
                 alpha=1/2)
plot(enet.mod,xvar="lambda")
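
The elastic net mixes the two penalties. In glmnet's parameterization (see the glmnet documentation), the penalty for a given \(\alpha \in [0,1]\) is

\[
\lambda\left[\frac{1-\alpha}{2}\|\beta\|_2^2 + \alpha\|\beta\|_1\right],
\]

so \(\alpha=0\) gives ridge, \(\alpha=1\) gives lasso, and intermediate values blend the two. This \(\alpha\) is exactly the argument used above and tuned by cross-validation below.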

Chunk 4: Using 10-Fold CV for Selection of \(\alpha\) and \(\lambda\)

set.seed(216)
in.train=sample(1:500,floor(0.66*500))
SIM.TRAIN=SIM.DATA[in.train,]
SIM.TEST=SIM.DATA[-in.train,]

#Default: 10 Fold Cross Validation
RESULT=NULL
for (i in 0:10) {
    cv.out = cv.glmnet(x=as.matrix(SIM.TRAIN[,-1]),
                       y=as.vector(SIM.TRAIN[,1]),
                       type.measure="mse", 
                       alpha=i/10)
    alpha=i/10
    best.lambda=cv.out$lambda.1se
    y.test=predict(cv.out,s=best.lambda,newx=as.matrix(SIM.TEST[,-1]))
    out.mse=mean((SIM.TEST$y-y.test)^2)
    RESULT=rbind(RESULT,c(alpha,best.lambda,out.mse))
}
colnames(RESULT)=c("alpha","lambda","MSE")
print(RESULT)
##       alpha    lambda      MSE
##  [1,]   0.0 18.320277 165.3322
##  [2,]   0.1  8.308023 137.2618
##  [3,]   0.2  6.026759 135.8090
##  [4,]   0.3  3.335681 125.3393
##  [5,]   0.4  2.745679 124.3696
##  [6,]   0.5  2.196543 122.9306
##  [7,]   0.6  2.204787 126.2065
##  [8,]   0.7  1.721931 123.0226
##  [9,]   0.8  1.653590 124.6896
## [10,]   0.9  1.339280 122.1634
## [11,]   1.0  1.451851 126.0858
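
The loop selects lambda.1se, the largest \(\lambda\) whose CV error is within one standard error of the minimum, which favors a sparser and more conservative model. If you prefer the error-minimizing value instead, cv.glmnet also stores lambda.min; a small sketch (cv.half is just an illustrative object name):

# Compare the two lambda choices stored by cv.glmnet for a single alpha
set.seed(216)
cv.half=cv.glmnet(x=as.matrix(SIM.TRAIN[,-1]),y=as.vector(SIM.TRAIN[,1]),
                  type.measure="mse",alpha=0.5)
c(lambda.min=cv.half$lambda.min, lambda.1se=cv.half$lambda.1se)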

Chunk 5: Visualizing the Top 4 Models

RESULT2=as.data.frame(RESULT) %>% filter(rank(MSE)<=4)
head(RESULT2)
##   alpha   lambda      MSE
## 1   0.4 2.745679 124.3696
## 2   0.5 2.196543 122.9306
## 3   0.7 1.721931 123.0226
## 4   0.9 1.339280 122.1634
RESULT3=NULL
for(k in 1:4){
  fit=glmnet(x=as.matrix(SIM.DATA[,-1]),y=as.matrix(SIM.DATA[,1]),alpha=RESULT2$alpha[k],nlambda=1,lambda = RESULT2$lambda[k])
  RESULT3=rbind(RESULT3,cbind(k,1:201,as.numeric(coef(fit,s=RESULT2$lambda[k])),c(0,beta)))
}
colnames(RESULT3)=c("Model","Parameter","Estimate","Actual")
RESULT3=as.data.frame(RESULT3)

RESULT3 %>% 
  ggplot() +
  geom_point(aes(x=Parameter,y=Estimate,color=as.factor(Model)),size=2) +
  geom_line(aes(x=Parameter,y=Actual),linetype="dashed",size=1.25,alpha=0.4) +
  theme_minimal() +
  facet_grid(as.factor(Model)~.) +
  guides(color="none")

Part 3: Less Meditation and More Application

Chunk 1: Data in mpg

DATA=mpg
DATA2=DATA[,c("year","displ","cyl","drv","cty","hwy","fl","class")]
head(DATA2)
## # A tibble: 6 × 8
##    year displ   cyl drv     cty   hwy fl    class  
##   <int> <dbl> <int> <chr> <int> <int> <chr> <chr>  
## 1  1999   1.8     4 f        18    29 p     compact
## 2  1999   1.8     4 f        21    29 p     compact
## 3  2008   2       4 f        20    31 p     compact
## 4  2008   2       4 f        21    30 p     compact
## 5  1999   2.8     6 f        16    26 p     compact
## 6  1999   2.8     6 f        18    26 p     compact
y=DATA2$hwy
X=model_matrix(DATA2,hwy~.*.)[,-1]
var.names=names(X)
dim(X)
## [1] 234 114
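
The formula hwy ~ .*. passed to model_matrix expands the 7 predictors into all main effects plus all pairwise interactions, with factors converted to dummy columns, which is how we end up with 114 columns. A quick peek at the generated names:

# Inspect the generated column names (main effects first, interaction terms after)
head(var.names)
tail(var.names)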

Chunk 2: Multiple Regularized Models

set.seed(216)
cvmod.0=cv.glmnet(y=y,x=as.matrix(X),alpha=0)
set.seed(216)
cvmod.25=cv.glmnet(y=y,x=as.matrix(X),alpha=0.25)
set.seed(216)
cvmod.5=cv.glmnet(y=y,x=as.matrix(X),alpha=0.5)
set.seed(216)
cvmod.75=cv.glmnet(y=y,x=as.matrix(X),alpha=0.75)
set.seed(216)
cvmod.1=cv.glmnet(y=y,x=as.matrix(X),alpha=1)

CV.0.ERROR=cvmod.0$cvm[which(cvmod.0$lambda==cvmod.0$lambda.1se)]
CV.25.ERROR=cvmod.25$cvm[which(cvmod.25$lambda==cvmod.25$lambda.1se)]
CV.5.ERROR=cvmod.5$cvm[which(cvmod.5$lambda==cvmod.5$lambda.1se)]
CV.75.ERROR=cvmod.75$cvm[which(cvmod.75$lambda==cvmod.75$lambda.1se)]
CV.1.ERROR=cvmod.1$cvm[which(cvmod.1$lambda==cvmod.1$lambda.1se)]

MOD.RESULT=tibble(alpha=c(0,0.25,0.5,0.75,1),
                  lambda=c(cvmod.0$lambda.1se,cvmod.25$lambda.1se,
                           cvmod.5$lambda.1se,cvmod.75$lambda.1se,
                           cvmod.1$lambda.1se),
                  CV.Error=c(CV.0.ERROR,CV.25.ERROR,CV.5.ERROR,
                             CV.75.ERROR,CV.1.ERROR))
print(MOD.RESULT)
## # A tibble: 5 × 3
##   alpha lambda CV.Error
##   <dbl>  <dbl>    <dbl>
## 1  0    1.31       1.61
## 2  0.25 0.416      1.47
## 3  0.5  0.208      1.49
## 4  0.75 0.139      1.51
## 5  1    0.0948     1.51
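
Re-setting the seed before each cv.glmnet call above forces all five fits to use the same random fold assignment. An equivalent, more explicit approach (a sketch using cv.glmnet's foldid argument; cvmod.0.alt is just an illustrative name) is to build the fold labels once and pass them to every call:

# Fix the CV folds once so every alpha is evaluated on identical splits
set.seed(216)
foldid=sample(rep(1:10,length.out=nrow(X)))
cvmod.0.alt=cv.glmnet(y=y,x=as.matrix(X),alpha=0,foldid=foldid)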

Chunk 3: Using the Best Model

best.alpha=MOD.RESULT$alpha[which.min(MOD.RESULT$CV.Error)]
best.lambda=MOD.RESULT$lambda[which.min(MOD.RESULT$CV.Error)]

best.mod=glmnet(y=y,x=as.matrix(X),nlambda=1,lambda=best.lambda,alpha=best.alpha)
best.coef=as_tibble(as.matrix(coef(best.mod)))
best.coef2=best.coef %>% 
              mutate(Parameter=c("Int",var.names)) %>%
              rename(Estimate=s0) %>%
              select(Parameter,Estimate)
nonzero.best.coef=best.coef2 %>%
                    filter(Estimate!=0)
print(nonzero.best.coef,n=1e3)
## # A tibble: 39 × 2
##    Parameter              Estimate
##    <chr>                     <dbl>
##  1 Int                 -91.8      
##  2 year                  0.0512   
##  3 cyl                  -0.0917   
##  4 drvf                  0.0868   
##  5 cty                   0.417    
##  6 fle                  -0.530    
##  7 classmidsize          0.140    
##  8 classminivan         -0.224    
##  9 classpickup          -0.799    
## 10 classsuv             -0.653    
## 11 year:cyl             -0.0000326
## 12 year:drvf             0.0000273
## 13 year:cty              0.000208 
## 14 year:fle             -0.000249 
## 15 year:classmidsize     0.0000507
## 16 year:classminivan    -0.000135 
## 17 year:classpickup     -0.000404 
## 18 year:classsuv        -0.000315 
## 19 displ:drvr            0.0802   
## 20 displ:fle            -0.0531   
## 21 displ:classmidsize    0.0196   
## 22 displ:classpickup    -0.0551   
## 23 displ:classsuv       -0.125    
## 24 cyl:fle              -0.0723   
## 25 cyl:flr              -0.0486   
## 26 cyl:classcompact      0.0353   
## 27 cyl:classpickup      -0.0481   
## 28 cyl:classsuv         -0.0936   
## 29 drvf:cty              0.0576   
## 30 drvr:cty              0.0249   
## 31 drvf:fld              2.89     
## 32 drvr:flp              0.0581   
## 33 drvf:classminivan    -0.289    
## 34 cty:fle              -0.00719  
## 35 cty:classminivan     -0.0105   
## 36 cty:classpickup      -0.0615   
## 37 flp:classcompact      0.0932   
## 38 flr:classmidsize      0.0923   
## 39 flp:classsubcompact  -0.547
DATA2$hwy.hat=predict(best.mod,newx=as.matrix(X))

ggplot(DATA2) +
  geom_point(aes(x=hwy,y=hwy.hat),color="lightskyblue2") +
  geom_abline(intercept=0,slope=1,linetype="dashed") +
  theme_minimal() +
  ylab("Predicted Highway MPG") +
  xlab("Actual Highway MPG")

ggplot(DATA2) +
  geom_histogram(aes(x=hwy-hwy.hat),fill="lightskyblue2") +
  theme_minimal() +
  xlab("Residuals") +
  ylab("Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
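
As a rough numerical summary of the fit (in-sample, so it understates true prediction error), we can also compute the root mean squared error of these residuals:

# In-sample RMSE of the selected elastic net model
sqrt(mean((DATA2$hwy-DATA2$hwy.hat)^2))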

Chunk 4: Data in Participation

Part=Participation
dim(Part)
## [1] 872   7
head(Part)
##   lfp  lnnlinc age educ nyc noc foreign
## 1  no 10.78750 3.0    8   1   1      no
## 2 yes 10.52425 4.5    8   0   1      no
## 3  no 10.96858 4.6    9   0   0      no
## 4  no 11.10500 3.1   11   2   0      no
## 5  no 11.10847 4.4   12   0   2      no
## 6 yes 11.02825 4.2   12   0   1      no
Part$lfp=ifelse(Part$lfp=="yes",1,0)
Part$foreign=ifelse(Part$foreign=="yes",1,0)

Chunk 5: Multiple Regularized Models

set.seed(216)
cvmod.0=cv.glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),alpha=0,
                  family="binomial",type.measure="class")
set.seed(216)
cvmod.25=cv.glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),alpha=0.25,
                   family="binomial",type.measure="class")
set.seed(216)
cvmod.5=cv.glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),alpha=0.5,
                  family="binomial",type.measure="class")
set.seed(216)
cvmod.75=cv.glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),alpha=0.75,
                   family="binomial",type.measure="class")
set.seed(216)
cvmod.1=cv.glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),alpha=1,
                  family="binomial",type.measure="class")

CV.0.ERROR=cvmod.0$cvm[which(cvmod.0$lambda==cvmod.0$lambda.1se)]
CV.25.ERROR=cvmod.25$cvm[which(cvmod.25$lambda==cvmod.25$lambda.1se)]
CV.5.ERROR=cvmod.5$cvm[which(cvmod.5$lambda==cvmod.5$lambda.1se)]
CV.75.ERROR=cvmod.75$cvm[which(cvmod.75$lambda==cvmod.75$lambda.1se)]
CV.1.ERROR=cvmod.1$cvm[which(cvmod.1$lambda==cvmod.1$lambda.1se)]

MOD.RESULT=tibble(alpha=c(0,0.25,0.5,0.75,1),
                  lambda=c(cvmod.0$lambda.1se,cvmod.25$lambda.1se,
                           cvmod.5$lambda.1se,cvmod.75$lambda.1se,
                           cvmod.1$lambda.1se),
                  CV.Error=c(CV.0.ERROR,CV.25.ERROR,CV.5.ERROR,
                             CV.75.ERROR,CV.1.ERROR))
print(MOD.RESULT)
## # A tibble: 5 × 3
##   alpha lambda CV.Error
##   <dbl>  <dbl>    <dbl>
## 1  0    0.329     0.346
## 2  0.25 0.0718    0.339
## 3  0.5  0.0521    0.342
## 4  0.75 0.0381    0.341
## 5  1    0.0314    0.341

Chunk 6: Using the Best Model

best.alpha=MOD.RESULT$alpha[which.min(MOD.RESULT$CV.Error)]
best.lambda=MOD.RESULT$lambda[which.min(MOD.RESULT$CV.Error)]

best.mod=glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),
                nlambda=1,lambda=best.lambda,alpha=best.alpha,
                family="binomial")
best.coef=as.matrix(coef(best.mod))
best.coef
##                       s0
## (Intercept)  5.365197629
## lnnlinc     -0.443171930
## age         -0.203613098
## educ         0.000000000
## nyc         -0.592670562
## noc          0.001456132
## foreign      0.801230320
Part$Predict=predict(best.mod,newx=as.matrix(Part[,-1]),type="class")
Part$lfp=ifelse(Part$lfp==1,"Yes","No")
Part$Predict=ifelse(Part$Predict=="1","Yes","No")

table(Part[,c("lfp","Predict")])
##      Predict
## lfp    No Yes
##   No  394  77
##   Yes 210 191
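
The confusion matrix can be reduced to a single number; for instance, the overall in-sample misclassification rate:

# Overall in-sample misclassification rate
mean(Part$lfp!=Part$Predict)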