Today, we will work with daily water temperature and air temperature
data observed for 31 rivers in Spain. The goal of this tutorial is to
identify the best model for predicting the maximum water temperature
given the maximum air temperature. In the preview below, W
represents the daily maximum water temperature and A
represents the daily maximum air temperature. The data contains almost a
full year of data for each of the 31 different rivers.
## # A tibble: 6 × 8
## JULIAN_DAY YEAR L W A TIME MONTH DAY
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2003 103 14.2 21.2 1 1 1
## 2 2 2003 103 14.4 16.8 2 1 2
## 3 3 2003 103 14.4 15.4 3 1 3
## 4 4 2003 103 10.9 10.8 4 1 4
## 5 5 2003 103 10.8 11.7 5 1 5
## 6 6 2003 103 10.7 12.4 6 1 6
\[y=d+\frac{a}{1+e^{(b-cx)}}\]
set.seed(216)
EXAMPLE=tibble(
x=rnorm(10000,mean=0,sd=5),
y=7+12/(1+exp(4-1*x))
)
ggplot(data=EXAMPLE) +
geom_point(aes(x=x,y=y)) +
theme_minimal()
Please write a function to calculate mean squared error for the nonlinear logistic model.
logistic.model=function(COEF,DATA){
pred=COEF[1]+COEF[2]/(1+exp(COEF[3]-COEF[4]*DATA$A))
}
MSE.logistic=function(COEF,DATA){
error=DATA$W-logistic.model(DATA=DATA,COEF=COEF)
sq.error=error^2
mse=mean(sq.error,na.rm=T)
return(mse)
}
logistic.mod=optim(
par=c(min(TRAIN$W,na.rm=T), #d
max(TRAIN$W,na.rm=T)-min(TRAIN$W,na.rm=T), #a
median(TRAIN$A,na.rm=T), #b
1), #c #Smart Starting Values
fn=MSE.logistic, #Function to Minimize
DATA=TRAIN #Required Argument
)
print(logistic.mod)
## $par
## [1] 12.282190 36.386633 5.026564 0.136390
##
## $value
## [1] 13.50391
##
## $counts
## function gradient
## 501 NA
##
## $convergence
## [1] 1
##
## $message
## NULL
TRAIN6=TRAIN5 %>% mutate(logpred=logistic.model(COEF=logistic.mod$par,DATA=TRAIN5),
logres=W-logpred)
TEST6=TEST5 %>% mutate(logpred=logistic.model(COEF=logistic.mod$par,DATA=TEST5),
logres=W-logpred)
TEST6 %>%
select(L,A,W,linpred,poly2pred,poly3pred,poly4pred,logpred)%>%
gather(linpred:logpred,key="Model",value="Pred",factor_key=T) %>%
ggplot() +
geom_point(aes(x=A,y=W),color="gray") +
theme_minimal() +
geom_line(aes(x=A,y=Pred,color=Model),size=2)
TEST6 %>%
select(L,A,W,linpred,poly2pred,poly3pred,poly4pred,logpred)%>%
gather(linpred:logpred,key="Model",value="Pred",factor_key=T) %>%
ggplot() +
geom_point(aes(x=W,y=Pred,color=Model)) +
geom_abline(a=0,b=1,color="black",size=2) +
xlab("Maximum Water Temperature") +
ylab("Predicted Water Temperature") +
theme_minimal()
TEST6 %>%
select(L,A,W,TIME,linres,poly2res,poly3res,poly4res,logres)%>%
gather(linres:logres,key="Model",value="Res",factor_key=T) %>%
ggplot() +
geom_line(aes(x=TIME,y=Res),color="lightskyblue2") +
geom_hline(yintercept=0,color="black",linetype=2,size=1.5) +
xlab("Time") +
ylab("Residual") +
theme_dark() +
facet_grid(Model~.)
TEST6 %>%
select(L,A,W,TIME,linpred,logpred) %>%
gather(linpred:logpred,key="Model",value="Pred",factor_key=T) %>%
ggplot() +
geom_point(aes(x=A,y=W),alpha=0.2) +
geom_line(aes(x=A,y=Pred,color=Model),size=2) +
theme_minimal()+facet_grid(L~.)
TEST6 %>%
select(L,A,W,TIME,linres,logres) %>%
gather(linres:logres,key="Model",value="Res",factor_key=T) %>%
ggplot() +
geom_point(aes(x=A,y=Res,color=Model)) +
geom_hline(yintercept=0) +
theme_minimal()+facet_grid(L~.)
TEST6 %>%
select(L,A,W,TIME,linres,logres) %>%
gather(linres:logres,key="Model",value="Res",factor_key=T) %>%
ggplot() +
geom_point(aes(x=TIME,y=Res,color=Model)) +
geom_hline(yintercept=0) +
theme_minimal()+facet_grid(L~.)
bias.func=function(res){
bias=mean(res,na.rm=T)
return(bias)
}
mae.func=function(res){
mae=mean(abs(res),na.rm=T)
return(mae)
}
rmse.func=function(res){
mse=mean(res^2,na.rm=T)
rmse=sqrt(mse)
return(rmse)
}
Please use apply function to calculate Bias, MAE and
rMSE for the models.
ex.res.mat=TEST6 %>% select(linres,poly2res,poly3res,poly4res,logres)
apply(ex.res.mat,2,bias.func)
## linres poly2res poly3res poly4res logres
## 0.9534126 0.9742415 0.9903951 0.9920042 0.2613184
apply(ex.res.mat,2,mae.func)
## linres poly2res poly3res poly4res logres
## 2.750323 2.732399 2.706833 2.715366 3.135313
apply(ex.res.mat,2,rmse.func)
## linres poly2res poly3res poly4res logres
## 3.351594 3.344867 3.328889 3.338710 3.711664
SUMM1=TEST6 %>%
select(L,A,W,TIME,linres,poly2res,poly3res,poly4res,logres)%>%
rename(Linear=linres,`Poly(2)`=poly2res,`Poly(3)`=poly3res,`Poly(4)`=poly4res,Logistic=logres)%>%
gather(Linear:Logistic,key="Model",value="Residual",factor_key=T)
SUMM2= SUMM1 %>%
group_by(Model) %>%
summarize(MB=bias.func(Residual),
MAE=mae.func(Residual),
RMSE=rmse.func(Residual),
.groups='drop')
print(SUMM2)
## # A tibble: 5 × 4
## Model MB MAE RMSE
## <fct> <dbl> <dbl> <dbl>
## 1 Linear 0.953 2.75 3.35
## 2 Poly(2) 0.974 2.73 3.34
## 3 Poly(3) 0.990 2.71 3.33
## 4 Poly(4) 0.992 2.72 3.34
## 5 Logistic 0.261 3.14 3.71