Introduction

Today, we will work with daily water temperature and air temperature data observed for 31 rivers in Spain. The goal of this tutorial is to identify the best model for predicting the maximum water temperature given the maximum air temperature. In the preview below, W represents the daily maximum water temperature and A represents the daily maximum air temperature. The data contains almost a full year of data for each of the 31 different rivers.

## # A tibble: 6 × 8
##   JULIAN_DAY  YEAR     L     W     A  TIME MONTH   DAY
##        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1          1  2003   103  14.2  21.2     1     1     1
## 2          2  2003   103  14.4  16.8     2     1     2
## 3          3  2003   103  14.4  15.4     3     1     3
## 4          4  2003   103  10.9  10.8     4     1     4
## 5          5  2003   103  10.8  11.7     5     1     5
## 6          6  2003   103  10.7  12.4     6     1     6

Part 4: Nonlinear Logistic Model

\[y=d+\frac{a}{1+e^{(b-cx)}}\]

Chunk 1: Logistic Model Investigation

Chunk 2: Essential Functions for Nonlinear Logistic Model

logistic.model=function(COEF,DATA){
  pred=COEF[1]+COEF[2]/(1+exp(COEF[3]-COEF[4]*DATA$A))
}
MSE.logistic=function(COEF,DATA){
  error=DATA$W-logistic.model(DATA=DATA,COEF=COEF)
  sq.error=error^2
  mse=mean(sq.error,na.rm=T)
  return(mse)
}

Chunk 3: Estimate Parameters for Nonlinear Logistic Model

logistic.mod=optim(
  par=c(min(TRAIN$W,na.rm=T),
        max(TRAIN$W,na.rm=T)-min(TRAIN$W,na.rm=T),
        median(TRAIN$A,na.rm=T),
        1),           #Smart Starting Values
  fn=MSE.logistic,    #Function to Minimize
  DATA=TRAIN          #Required Argument
)
print(logistic.mod)
## $par
## [1] 12.282190 36.386633  5.026564  0.136390
## 
## $value
## [1] 13.50391
## 
## $counts
## function gradient 
##      501       NA 
## 
## $convergence
## [1] 1
## 
## $message
## NULL

Chunk 4: Obtain Predictions and Residuals

TRAIN6=TRAIN5 %>% mutate(logpred=logistic.model(COEF=logistic.mod$par,DATA=TRAIN5),
                         logres=W-logpred)
TEST6=TEST5 %>% mutate(logpred=logistic.model(COEF=logistic.mod$par,DATA=TEST5),
                         logres=W-logpred)

Part 5: Evaluation by Visualization

Chunk 1: Plotting Models

TEST6 %>%
  select(L,A,W,linpred,poly2pred,poly3pred,poly4pred,logpred)%>%
  gather(linpred:logpred,key="Model",value="Pred",factor_key=T) %>%
    ggplot() + 
      geom_point(aes(x=A,y=W),color="gray") + 
      theme_minimal() +
      geom_line(aes(x=A,y=Pred,color=Model),size=2)

Chunk 2: Predicted Versus Actual Max Water Temperature

TEST6 %>%
  select(L,A,W,linpred,poly2pred,poly3pred,poly4pred,logpred)%>%
  gather(linpred:logpred,key="Model",value="Pred",factor_key=T) %>%
    ggplot() + 
    geom_point(aes(x=W,y=Pred,color=Model)) + 
    geom_abline(a=0,b=1,color="black",size=2) +
    xlab("Maximum Water Temperature") +
    ylab("Predicted Water Temperature") +
    theme_minimal()

Chunk 3: Plotting Residuals Versus Time

TEST6 %>%
  select(L,A,W,TIME,linres,poly2res,poly3res,poly4res,logres)%>%
  gather(linres:logres,key="Model",value="Res",factor_key=T) %>%
    ggplot() + 
      geom_line(aes(x=TIME,y=Res),color="lightskyblue2") + 
      geom_hline(yintercept=0,color="black",linetype=2,size=1.5) +
      xlab("Time") +
      ylab("Residual") +
      theme_dark() +
      facet_grid(Model~.)

Chunk 4: Evaluating The Location-Specific Error

TEST6 %>%
  select(L,A,W,TIME,linpred,logpred) %>%
  gather(linpred:logpred,key="Model",value="Pred",factor_key=T) %>%
    ggplot() +
      geom_point(aes(x=A,y=W),alpha=0.2) +
      geom_line(aes(x=A,y=Pred,color=Model),size=2) +
      theme_minimal()+facet_grid(L~.)

TEST6 %>%
  select(L,A,W,TIME,linres,logres) %>%
  gather(linres:logres,key="Model",value="Res",factor_key=T) %>%
  ggplot() +
  geom_point(aes(x=A,y=Res,color=Model)) +
  geom_hline(yintercept=0) +
  theme_minimal()+facet_grid(L~.)

TEST6 %>%
  select(L,A,W,TIME,linres,logres) %>%
  gather(linres:logres,key="Model",value="Res",factor_key=T) %>%
  ggplot() +
  geom_point(aes(x=TIME,y=Res,color=Model)) +
  geom_hline(yintercept=0) +
  theme_minimal()+facet_grid(L~.)

Part 6: Evaluation by Numerical Summary

Chunk 1: Functions Required For Evaluating Prediction

bias.func=function(res){
  bias=mean(res,na.rm=T)
  return(bias)
}

mae.func=function(res){
  mae=mean(abs(res),na.rm=T)
  return(mae)
}

rmse.func=function(res){
  mse=mean(res^2,na.rm=T)
  rmse=sqrt(mse)
  return(rmse)
}

Chunk 2: Checking Functions

ex.res=TEST6$linres
c(bias.func(ex.res),mae.func(ex.res),rmse.func(ex.res))
## [1] 0.9534126 2.7503233 3.3515944
ex.res.mat=TEST6 %>% select(linres,poly2res,poly3res,poly4res,logres)
apply(ex.res.mat,2,bias.func)
##    linres  poly2res  poly3res  poly4res    logres 
## 0.9534126 0.9742415 0.9903951 0.9920042 0.2613184
apply(ex.res.mat,2,mae.func)
##   linres poly2res poly3res poly4res   logres 
## 2.750323 2.732399 2.706833 2.715366 3.135313
apply(ex.res.mat,2,rmse.func)
##   linres poly2res poly3res poly4res   logres 
## 3.351594 3.344867 3.328889 3.338710 3.711664

Chunk 3: Get Table Quickly

SUMM1=TEST6 %>%
  select(L,A,W,TIME,linres,poly2res,poly3res,poly4res,logres)%>%
  rename(Linear=linres,`Poly(2)`=poly2res,`Poly(3)`=poly3res,`Poly(4)`=poly4res,Logistic=logres)%>%
  gather(Linear:Logistic,key="Model",value="Residual",factor_key=T)
SUMM2= SUMM1 %>% 
  group_by(Model) %>%
  summarize(MB=bias.func(Residual),
            MAE=mae.func(Residual),
            RMSE=rmse.func(Residual))
print(SUMM2)
## # A tibble: 5 × 4
##   Model       MB   MAE  RMSE
##   <fct>    <dbl> <dbl> <dbl>
## 1 Linear   0.953  2.75  3.35
## 2 Poly(2)  0.974  2.73  3.34
## 3 Poly(3)  0.990  2.71  3.33
## 4 Poly(4)  0.992  2.72  3.34
## 5 Logistic 0.261  3.14  3.71