What is big data? According to Wikipedia, big data refers to data sets that are too large or complex for traditional data-processing software to handle adequately. In practice, big data means that the sample size (\(n\)) is extremely large, the number of explanatory variables (\(p\)) is extremely large, or both.
Suppose we have a response variable \(Y\) and \(p=1000\) predictor variables. It is highly unlikely that all of the predictors are relevant for explaining the variation in \(Y\), and just as unlikely that all of them are useful for making future predictions \(\hat{Y}\). Today, we take a look at helpful methods for simultaneous parameter estimation and variable selection in the classic linear model. To make the issues concrete, we first simulate data with \(n=500\) observations, \(p=200\) predictors, and only 10 truly nonzero coefficients.
set.seed(216)
X=matrix(rnorm(100000),500,200) #n=500 observations, p=200 predictors
beta=c(rep(5,5),rep(-2,5),rep(0,190)) #Only the first 10 coefficients are nonzero
set.seed(480)
epsilon=rnorm(500,0,10) #Noise with standard deviation 10
y=X%*%beta+epsilon
SIM.DATA=data.frame(y=y,X=X)
lm.model=lm(y~.,data=SIM.DATA)
glance(lm.model)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.759 0.597 10.3 4.70 1.67e-33 200 -1745. 3894. 4745.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
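As a quick side check (this computation is an addition, not part of the original output), we can count how many of the 200 coefficients in the full model look significant at the 5% level; with 190 truly zero coefficients, a number of false positives are expected from chance alone.

pvals=summary(lm.model)$coefficients[-1,4] #Drop the intercept row, keep p-values
sum(pvals<0.05)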
param.est=lm.model$coefficients
param.conf=confint(lm.model)
param.lm=data.frame(cbind(param.est,param.conf))[-1,] #Remove Intercept
names(param.lm)=c("Estimate","Lower","Upper")
param.lm = param.lm %>%
mutate(Significant=factor(ifelse(0>Lower & 0<Upper,"No","Yes")))
ggplot(param.lm[1:5,]) +
geom_pointrange(aes(x=1:5,y=Estimate,ymin=Lower,ymax=Upper,color=Significant),size=2)+
theme_minimal()+
scale_color_manual(drop=F,values=c("lightskyblue2","gray"))+
xlab("X1:X5")
ggplot(param.lm[6:10,]) +
geom_pointrange(aes(x=6:10,y=Estimate,ymin=Lower,ymax=Upper,color=Significant),size=2)+
theme_minimal()+
scale_color_manual(values=c("lightskyblue2","gray"))+
xlab("X6:X10")
ggplot(param.lm[11:200,]) +
geom_pointrange(aes(x=11:200,y=Estimate,ymin=Lower,ymax=Upper,color=Significant))+
theme_minimal()+
scale_color_manual(values=c("lightskyblue2","gray"))+
xlab("X11:X200")
COEF=rep(NA,200)
P.VAL=rep(NA,200)
for(j in 1:200){
individual.mod=lm(y~.,data=SIM.DATA[,c(1,j+1)])
COEF[j]=coef(individual.mod)[2]
P.VAL[j]=summary(individual.mod)$coefficients[2,4]
}
KEEP=P.VAL<0.01
ACTUAL=c(rep("NonZero",10),rep("Zero",190))
tibble(COEF,P.VAL,KEEP) %>%
ggplot() +
geom_point(aes(x=COEF,y=P.VAL,color=KEEP),size=2) +
geom_hline(yintercept=0.01,linetype="dashed")+
scale_color_manual(values=c("lightskyblue2","gray"))+
theme_minimal()
tibble(ACTUAL=ACTUAL,KEEP=factor(KEEP,levels = c(TRUE, FALSE), labels = c('P-val<0.01', 'P-val>0.01'))) %>%
table() %>% prop.table()
## KEEP
## ACTUAL P-val<0.01 P-val>0.01
## NonZero 0.04 0.01
## Zero 0.01 0.94
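The same table in raw counts (a small addition) makes the trade-off easier to read: at the 0.01 cutoff, 2 of the 10 truly nonzero variables are missed and 2 of the 190 noise variables slip through.

table(ACTUAL,KEEP) #Counts rather than proportions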
# Please try different Cutoff values
# Try to find out the smallest Cutoff value that can keep all important variables
Cutoff = 0.1
#Recompute the marginal fits (identical to above) so this chunk is self-contained
COEF=rep(NA,200)
P.VAL=rep(NA,200)
for(j in 1:200){
individual.mod=lm(y~.,data=SIM.DATA[,c(1,j+1)])
COEF[j]=coef(individual.mod)[2]
P.VAL[j]=summary(individual.mod)$coefficients[2,4]
}
KEEP=P.VAL<Cutoff
ACTUAL=c(rep("NonZero",10),rep("Zero",190))
tibble(ACTUAL=ACTUAL,KEEP=factor(KEEP,levels = c(TRUE, FALSE), labels = c(paste0("P-val<",Cutoff), paste0("P-val>",Cutoff)))) %>%
table() %>% prop.table()
lm.model=lm(y~.,data=SIM.DATA[,c(1,which(KEEP)+1)])
param.est=lm.model$coefficients
param.conf=confint(lm.model,level=1-Cutoff)
param.lm=data.frame(cbind(param.est,param.conf))[-1,] #Remove Intercept
names(param.lm)=c("Estimate","Lower","Upper")
param.lm = param.lm %>%
mutate(Significant=factor(ifelse(0>Lower & 0<Upper,"No","Yes")))
ggplot(param.lm[1:5,]) +
geom_pointrange(aes(x=1:5,y=Estimate,ymin=Lower,ymax=Upper,color=Significant),size=2)+
theme_minimal()+
scale_color_manual(drop=F,values=c("lightskyblue2","gray"))+
xlab("X1:X5")
ggplot(param.lm[6:10,]) +
geom_pointrange(aes(x=6:10,y=Estimate,ymin=Lower,ymax=Upper,color=Significant),size=2)+
theme_minimal()+
scale_color_manual(drop=F,values=c("lightskyblue2","gray"))+
xlab("X6:X10")
ggplot(param.lm[11:(dim(param.lm)[1]),]) +
geom_pointrange(aes(x=11:(dim(param.lm)[1]),y=Estimate,ymin=Lower,ymax=Upper,color=Significant))+
theme_minimal()+
scale_color_manual(values=c("lightskyblue2","gray"))+
xlab("X11:X200")
ridge.mod=glmnet(x=as.matrix(SIM.DATA[,-1]),
y=as.vector(SIM.DATA[,1]),
alpha=0) #alpha=0: ridge penalty
plot(ridge.mod,xvar="lambda")
lasso.mod=glmnet(x=as.matrix(SIM.DATA[,-1]),
y=as.vector(SIM.DATA[,1]),
alpha=1) #alpha=1: lasso penalty
plot(lasso.mod,xvar="lambda")
enet.mod=glmnet(x=as.matrix(SIM.DATA[,-1]),
y=as.vector(SIM.DATA[,1]),
alpha=1/2) #0<alpha<1: elastic net
plot(enet.mod,xvar="lambda")
set.seed(216)
in.train=sample(1:500,floor(0.8*500))
SIM.TRAIN=SIM.DATA[in.train,]
SIM.TEST=SIM.DATA[-in.train,]
#Default: 10 Fold Cross Validation
RESULT=NULL
for (i in 0:10) {
cv.out = cv.glmnet(x=as.matrix(SIM.TRAIN[,-1]),
y=as.vector(SIM.TRAIN[,1]),
type.measure="mse",
alpha=i/10)
alpha=i/10
best.lambda=cv.out$lambda.1se
y.test=predict(cv.out,s=best.lambda,newx=as.matrix(SIM.TEST[,-1]))
out.mse=mean((SIM.TEST$y-y.test)^2)
RESULT=rbind(RESULT,c(alpha,best.lambda,out.mse))
}
colnames(RESULT)=c("alpha","lambda","MSE")
print(RESULT)
## alpha lambda MSE
## [1,] 0.0 12.979017 154.6540
## [2,] 0.1 6.459687 119.7430
## [3,] 0.2 4.685950 116.8040
## [4,] 0.3 3.428550 115.3693
## [5,] 0.4 2.342975 112.8951
## [6,] 0.5 2.057130 113.4255
## [7,] 0.6 2.064850 115.5236
## [8,] 0.7 1.769871 115.0673
## [9,] 0.8 1.411061 113.7625
## [10,] 0.9 1.510780 115.5342
## [11,] 1.0 1.359702 115.3059
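Out-of-sample MSE is only one side of the story; we can also check how many variables a fit keeps. A minimal sketch (a small addition, reusing cv.out, which after the loop above holds the alpha=1 fit):

b=as.numeric(coef(cv.out,s="lambda.1se"))
sum(b[-1]!=0) #Number of nonzero slope estimates at lambda.1se (intercept excluded)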
RESULT2=as.data.frame(RESULT) %>% filter(rank(MSE)<=4)
head(RESULT2)
## alpha lambda MSE
## 1 0.4 2.342975 112.8951
## 2 0.5 2.057130 113.4255
## 3 0.7 1.769871 115.0673
## 4 0.8 1.411061 113.7625
RESULT3=NULL
for(k in 1:4){
fit=glmnet(x=as.matrix(SIM.DATA[,-1]),y=as.matrix(SIM.DATA[,1]),alpha=RESULT2$alpha[k],nlambda=1,lambda = RESULT2$lambda[k])
RESULT3=rbind(RESULT3,cbind(k,1:201,as.numeric(coef(fit,s=RESULT2$lambda[k])),c(0,beta)))
}
colnames(RESULT3)=c("Model","Parameter","Estimate","Actual")
RESULT3=as.data.frame(RESULT3)
RESULT3 %>%
ggplot() +
geom_point(aes(x=Parameter,y=Estimate,color=as.factor(Model)),size=2) +
geom_line(aes(x=Parameter,y=Actual),linetype="dashed",size=1.25,alpha=0.4) +
theme_minimal() +
facet_grid(as.factor(Model)~.) +
guides(color=FALSE)
DATA=mpg #Fuel economy data from ggplot2
DATA2=DATA[,c("year","displ","cyl","drv","cty","hwy","fl","class")]
head(DATA2)
## # A tibble: 6 × 8
## year displ cyl drv cty hwy fl class
## <int> <dbl> <int> <chr> <int> <int> <chr> <chr>
## 1 1999 1.8 4 f 18 29 p compact
## 2 1999 1.8 4 f 21 29 p compact
## 3 2008 2 4 f 20 31 p compact
## 4 2008 2 4 f 21 30 p compact
## 5 1999 2.8 6 f 16 26 p compact
## 6 1999 2.8 6 f 18 26 p compact
y=DATA2$hwy
X=model_matrix(DATA2,hwy~.*.)[,-1]
var.names=names(X)
dim(X)
## [1] 234 114
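The formula hwy~.*. asks model_matrix() (from the modelr package) for all main effects plus all two-way interactions, which is how 7 predictors become 114 columns. A quick peek at the expanded column names (a small addition):

head(var.names,10) #First few columns of the expanded design matrix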
set.seed(216)
cvmod.0=cv.glmnet(y=y,x=as.matrix(X),alpha=0)
set.seed(216)
cvmod.25=cv.glmnet(y=y,x=as.matrix(X),alpha=0.25)
set.seed(216)
cvmod.5=cv.glmnet(y=y,x=as.matrix(X),alpha=0.5)
set.seed(216)
cvmod.75=cv.glmnet(y=y,x=as.matrix(X),alpha=0.75)
set.seed(216)
cvmod.1=cv.glmnet(y=y,x=as.matrix(X),alpha=1)
CV.0.ERROR=cvmod.0$cvm[which(cvmod.0$lambda==cvmod.0$lambda.1se)]
CV.25.ERROR=cvmod.25$cvm[which(cvmod.25$lambda==cvmod.25$lambda.1se)]
CV.5.ERROR=cvmod.5$cvm[which(cvmod.5$lambda==cvmod.5$lambda.1se)]
CV.75.ERROR=cvmod.75$cvm[which(cvmod.75$lambda==cvmod.75$lambda.1se)]
CV.1.ERROR=cvmod.1$cvm[which(cvmod.1$lambda==cvmod.1$lambda.1se)]
MOD.RESULT=tibble(alpha=c(0,0.25,0.5,0.75,1),
lambda=c(cvmod.0$lambda.1se,cvmod.25$lambda.1se,
cvmod.5$lambda.1se,cvmod.75$lambda.1se,
cvmod.1$lambda.1se),
CV.Error=c(CV.0.ERROR,CV.25.ERROR,CV.5.ERROR,
CV.75.ERROR,CV.1.ERROR))
print(MOD.RESULT)
## # A tibble: 5 × 3
## alpha lambda CV.Error
## <dbl> <dbl> <dbl>
## 1 0 1.31 1.61
## 2 0.25 0.416 1.47
## 3 0.5 0.208 1.49
## 4 0.75 0.139 1.51
## 5 1 0.0948 1.51
best.alpha=MOD.RESULT$alpha[which.min(MOD.RESULT$CV.Error)]
best.lambda=MOD.RESULT$lambda[which.min(MOD.RESULT$CV.Error)]
best.mod=glmnet(y=y,x=as.matrix(X),nlambda=1,lambda=best.lambda,alpha=best.alpha)
best.coef=as_tibble(as.matrix(coef(best.mod)))
best.coef2=best.coef %>%
mutate(Parameter=c("Int",var.names)) %>%
rename(Estimate=s0) %>%
select(Parameter,Estimate)
nonzero.best.coef=best.coef2 %>%
filter(Estimate!=0)
print(nonzero.best.coef,n=1e3)
## # A tibble: 39 × 2
## Parameter Estimate
## <chr> <dbl>
## 1 Int -91.8
## 2 year 0.0512
## 3 cyl -0.0917
## 4 drvf 0.0868
## 5 cty 0.417
## 6 fle -0.530
## 7 classmidsize 0.140
## 8 classminivan -0.224
## 9 classpickup -0.799
## 10 classsuv -0.653
## 11 year:cyl -0.0000326
## 12 year:drvf 0.0000273
## 13 year:cty 0.000208
## 14 year:fle -0.000249
## 15 year:classmidsize 0.0000507
## 16 year:classminivan -0.000135
## 17 year:classpickup -0.000404
## 18 year:classsuv -0.000315
## 19 displ:drvr 0.0802
## 20 displ:fle -0.0531
## 21 displ:classmidsize 0.0196
## 22 displ:classpickup -0.0551
## 23 displ:classsuv -0.125
## 24 cyl:fle -0.0723
## 25 cyl:flr -0.0486
## 26 cyl:classcompact 0.0353
## 27 cyl:classpickup -0.0481
## 28 cyl:classsuv -0.0936
## 29 drvf:cty 0.0576
## 30 drvr:cty 0.0249
## 31 drvf:fld 2.89
## 32 drvr:flp 0.0581
## 33 drvf:classminivan -0.289
## 34 cty:fle -0.00719
## 35 cty:classminivan -0.0105
## 36 cty:classpickup -0.0615
## 37 flp:classcompact 0.0932
## 38 flr:classmidsize 0.0923
## 39 flp:classsubcompact -0.547
DATA2$hwy.hat=predict(best.mod,newx=as.matrix(X))
ggplot(DATA2) +
geom_point(aes(x=hwy,y=hwy.hat),color="lightskyblue2") +
geom_abline(intercept=0,slope=1,linetype="dashed") +
theme_minimal() +
ylab("Predicted Highway MPG") +
xlab("Actual Highway MPG")
ggplot(DATA2) +
geom_histogram(aes(x=hwy-hwy.hat),fill="lightskyblue2") +
theme_minimal() +
xlab("Residuals") +
ylab("Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
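As a rough numeric summary to go with the plots (a small addition; note this is an in-sample error, computed on the same data the model was tuned on):

sqrt(mean((DATA2$hwy-DATA2$hwy.hat)^2)) #In-sample RMSE of the penalized fit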
Part=Participation #Swiss labor-force participation data from the Ecdat package
dim(Part)
## [1] 872 7
head(Part)
## lfp lnnlinc age educ nyc noc foreign
## 1 no 10.78750 3.0 8 1 1 no
## 2 yes 10.52425 4.5 8 0 1 no
## 3 no 10.96858 4.6 9 0 0 no
## 4 no 11.10500 3.1 11 2 0 no
## 5 no 11.10847 4.4 12 0 2 no
## 6 yes 11.02825 4.2 12 0 1 no
Part$lfp=ifelse(Part$lfp=="yes",1,0)
Part$foreign=ifelse(Part$foreign=="yes",1,0)
set.seed(216)
cvmod.0=cv.glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),alpha=0,
family="binomial",type.measure="class")
set.seed(216)
cvmod.25=cv.glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),alpha=0.25,
family="binomial",type.measure="class")
set.seed(216)
cvmod.5=cv.glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),alpha=0.5,
family="binomial",type.measure="class")
set.seed(216)
cvmod.75=cv.glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),alpha=0.75,
family="binomial",type.measure="class")
set.seed(216)
cvmod.1=cv.glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),alpha=1,
family="binomial",type.measure="class")
CV.0.ERROR=cvmod.0$cvm[which(cvmod.0$lambda==cvmod.0$lambda.1se)]
CV.25.ERROR=cvmod.25$cvm[which(cvmod.25$lambda==cvmod.25$lambda.1se)]
CV.5.ERROR=cvmod.5$cvm[which(cvmod.5$lambda==cvmod.5$lambda.1se)]
CV.75.ERROR=cvmod.75$cvm[which(cvmod.75$lambda==cvmod.75$lambda.1se)]
CV.1.ERROR=cvmod.1$cvm[which(cvmod.1$lambda==cvmod.1$lambda.1se)]
MOD.RESULT=tibble(alpha=c(0,0.25,0.5,0.75,1),
lambda=c(cvmod.0$lambda.1se,cvmod.25$lambda.1se,
cvmod.5$lambda.1se,cvmod.75$lambda.1se,
cvmod.1$lambda.1se),
CV.Error=c(CV.0.ERROR,CV.25.ERROR,CV.5.ERROR,
CV.75.ERROR,CV.1.ERROR))
print(MOD.RESULT)
## # A tibble: 5 × 3
## alpha lambda CV.Error
## <dbl> <dbl> <dbl>
## 1 0 0.329 0.346
## 2 0.25 0.0718 0.339
## 3 0.5 0.0521 0.342
## 4 0.75 0.0381 0.341
## 5 1 0.0314 0.341
best.alpha=MOD.RESULT$alpha[which.min(MOD.RESULT$CV.Error)]
best.lambda=MOD.RESULT$lambda[which.min(MOD.RESULT$CV.Error)]
best.mod=glmnet(y=as.factor(Part$lfp),x=as.matrix(Part[,-1]),
nlambda=1,lambda=best.lambda,alpha=best.alpha,
family="binomial")
best.coef=as.matrix(coef(best.mod))
best.coef
## s0
## (Intercept) 5.365197629
## lnnlinc -0.443171930
## age -0.203613098
## educ 0.000000000
## nyc -0.592670562
## noc 0.001456132
## foreign 0.801230320
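Notice that the penalty shrinks the educ coefficient exactly to zero, effectively dropping it from the model. Besides hard class labels, we can also pull out fitted probabilities; a small aside (type="response" returns the estimated probability that lfp=1):

prob.hat=predict(best.mod,newx=as.matrix(Part[,-1]),type="response")
head(prob.hat) #Fitted probabilities of labor-force participation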
Part$Predict=predict(best.mod,newx=as.matrix(Part[,-1]),type="class")
Part$lfp=ifelse(Part$lfp==1,"Yes","No")
Part$Predict=ifelse(Part$Predict=="1","Yes","No")
table(Part[,c("lfp","Predict")])
## Predict
## lfp No Yes
## No 394 77
## Yes 210 191
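From the table above, the model classifies (394+191)/872, or roughly 67%, of the observations correctly (a quick follow-up computation):

conf.tab=table(Part[,c("lfp","Predict")])
sum(diag(conf.tab))/sum(conf.tab) #Overall classification accuracy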