We will continue our work with daily water temperature and air
temperature data observed for 31 rivers in Spain. In the preview below,
W
represents the daily maximum water temperature and
A
represents the daily maximum air temperature. The data
contains almost a full year of data for each of the 31 different
rivers.
## # A tibble: 6 × 8
## JULIAN_DAY YEAR L W A TIME MONTH DAY
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2003 103 14.2 21.2 1 1 1
## 2 2 2003 103 14.4 16.8 2 1 2
## 3 3 2003 103 14.4 15.4 3 1 3
## 4 4 2003 103 10.9 10.8 4 1 4
## 5 5 2003 103 10.8 11.7 5 1 5
## 6 6 2003 103 10.7 12.4 6 1 6
Using the data, we seek to identify the best model for predicting the maximum water temperature given the maximum air temperature. Previously, we randomly selected 3 rivers to act as a test set. All models were evaluated based on the randomly selected test set. In this tutorial, we explore approaches that ensure that all data is used for both model training and model testing.
In this tutorial, we apply helpful functions in the
purrr
, modelr
, and broom
packages. See the following links for helpful articles on performing
cross-validation within the tidyverse: Link
1, Link
2, and Link
3.
NEST.DATA = DATA %>% group_by(L) %>% nest()
head(NEST.DATA)
## # A tibble: 6 × 2
## # Groups: L [6]
## L data
## <dbl> <list>
## 1 103 <tibble [365 × 7]>
## 2 105 <tibble [366 × 7]>
## 3 107 <tibble [365 × 7]>
## 4 111 <tibble [365 × 7]>
## 5 112 <tibble [366 × 7]>
## 6 118 <tibble [366 × 7]>
filter()
with unnest()
To Split DataNEST.DATA %>% filter(L==103) %>% unnest() %>% glimpse()
## Rows: 365
## Columns: 8
## Groups: L [1]
## $ L <dbl> 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103,…
## $ JULIAN_DAY <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ YEAR <dbl> 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003,…
## $ W <dbl> 14.2, 14.4, 14.4, 10.9, 10.8, 10.7, 10.3, 10.1, 9.8, 10.5, …
## $ A <dbl> 21.2, 16.8, 15.4, 10.8, 11.7, 12.4, 13.2, 11.8, 5.1, 3.5, 5…
## $ TIME <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ MONTH <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ DAY <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
NEST.DATA %>% filter(L!=103) %>% unnest() %>% glimpse()
## Rows: 10,972
## Columns: 8
## Groups: L [30]
## $ L <dbl> 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105,…
## $ JULIAN_DAY <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ YEAR <dbl> 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008,…
## $ W <dbl> 7.9, 7.8, 8.7, 9.0, 9.4, 10.4, 9.9, 9.8, 10.3, NA, 9.5, 9.5…
## $ A <dbl> 14.0, 15.0, 12.4, 14.6, 18.0, 18.2, 12.0, 14.2, 16.3, NA, 1…
## $ TIME <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ MONTH <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ DAY <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
DATA2=DATA
DATA2$linpred=NA
TEST = NEST.DATA %>% filter(L==103) %>% unnest()
TRAIN = NEST.DATA %>% filter(L!=103) %>% unnest()
linmod=lm(W~A,data=TRAIN)
linmodpred=predict(linmod,newdata=TEST)
DATA2$linpred[which(DATA2$L==103)]=linmodpred
tail(DATA2)
## # A tibble: 6 × 9
## JULIAN_DAY YEAR L W A TIME MONTH DAY linpred
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 361 2008 920 6.3 4 361 12 26 NA
## 2 362 2008 920 6.5 8 362 12 27 NA
## 3 363 2008 920 7 8.5 363 12 28 NA
## 4 364 2008 920 7.4 6 364 12 29 NA
## 5 365 2008 920 7.8 11 365 12 30 NA
## 6 366 2008 920 8.7 11 366 12 31 NA
DATA2=DATA
DATA2$linpred=NA
for(k in unique(DATA2$L)){
TEST = NEST.DATA %>% filter(L==k) %>% unnest()
TRAIN = NEST.DATA %>% filter(L!=k) %>% unnest()
linmod=lm(W~A,data=TRAIN)
linmodpred=predict(linmod,newdata=TEST)
DATA2$linpred[which(DATA2$L==k)]=linmodpred
}
RMSE.func=function(actual,predict){
mse=mean((actual-predict)^2,na.rm=T)
rmse=sqrt(mse)
return(rmse)
}
#RMSE.func(actual=DATA2$W,predict=DATA2$linpred)
polymodel=lm(W~poly(A,4)+poly(JULIAN_DAY,3),data=na.omit(DATA))
tidy(polymodel)
## # A tibble: 8 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 16.2 0.0273 595. 0
## 2 poly(A, 4)1 328. 4.36 75.3 0
## 3 poly(A, 4)2 49.0 2.80 17.5 1.62e-67
## 4 poly(A, 4)3 2.85 2.78 1.02 3.06e- 1
## 5 poly(A, 4)4 -3.62 2.72 -1.33 1.84e- 1
## 6 poly(JULIAN_DAY, 3)1 46.0 2.78 16.6 8.85e-61
## 7 poly(JULIAN_DAY, 3)2 -226. 4.31 -52.5 0
## 8 poly(JULIAN_DAY, 3)3 -59.3 2.89 -20.5 8.66e-92
glance(polymodel)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.797 0.797 2.71 5525. 0 7 -23804. 47626. 47691.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
DATA3=na.omit(DATA) %>% crossv_kfold(10)
DATA3
## # A tibble: 10 × 3
## train test .id
## <named list> <named list> <chr>
## 1 <resample [8,871 x 8]> <resample [986 x 8]> 01
## 2 <resample [8,871 x 8]> <resample [986 x 8]> 02
## 3 <resample [8,871 x 8]> <resample [986 x 8]> 03
## 4 <resample [8,871 x 8]> <resample [986 x 8]> 04
## 5 <resample [8,871 x 8]> <resample [986 x 8]> 05
## 6 <resample [8,871 x 8]> <resample [986 x 8]> 06
## 7 <resample [8,871 x 8]> <resample [986 x 8]> 07
## 8 <resample [8,872 x 8]> <resample [985 x 8]> 08
## 9 <resample [8,872 x 8]> <resample [985 x 8]> 09
## 10 <resample [8,872 x 8]> <resample [985 x 8]> 10
train.model.func=function(data,i,j){
mod=lm(W~poly(A,i)+poly(JULIAN_DAY,j),data=data)
return(mod)
}
i=4
j=3
DATA4=DATA3 %>%
mutate(tr.model=map(train,train.model.func,i=i,j=j))
DATA4
## # A tibble: 10 × 4
## train test .id tr.model
## <named list> <named list> <chr> <named list>
## 1 <resample [8,871 x 8]> <resample [986 x 8]> 01 <lm>
## 2 <resample [8,871 x 8]> <resample [986 x 8]> 02 <lm>
## 3 <resample [8,871 x 8]> <resample [986 x 8]> 03 <lm>
## 4 <resample [8,871 x 8]> <resample [986 x 8]> 04 <lm>
## 5 <resample [8,871 x 8]> <resample [986 x 8]> 05 <lm>
## 6 <resample [8,871 x 8]> <resample [986 x 8]> 06 <lm>
## 7 <resample [8,871 x 8]> <resample [986 x 8]> 07 <lm>
## 8 <resample [8,872 x 8]> <resample [985 x 8]> 08 <lm>
## 9 <resample [8,872 x 8]> <resample [985 x 8]> 09 <lm>
## 10 <resample [8,872 x 8]> <resample [985 x 8]> 10 <lm>
DATA4.PREDICT = DATA4 %>%
mutate(predict=map2(test,tr.model,~augment(.y,newdata=.x))) %>%
select(predict) %>%
unnest(cols = c(predict))
head(DATA4.PREDICT)
## # A tibble: 6 × 10
## JULIAN_DAY YEAR L W A TIME MONTH DAY .fitted .resid
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 2003 103 14.4 16.8 2 1 2 10.5 3.92
## 2 9 2003 103 9.8 5.1 9 1 9 7.24 2.56
## 3 27 2003 103 10.1 17.2 27 1 27 11.6 -1.49
## 4 33 2003 103 12.4 15 33 2 2 11.1 1.33
## 5 59 2003 103 11.9 19 59 2 28 13.6 -1.73
## 6 77 2003 103 12.7 18.4 77 3 18 14.2 -1.49
RMSE.func(actual=DATA4.PREDICT$W,predict=DATA4.PREDICT$.fitted)
## [1] 2.710072
max_i = 10
max_j = 10
rmse_results = matrix(NA,max_i,max_j)
for(i in 1:max_i){
for(j in 1:max_j){
DATA4=DATA3 %>%
mutate(tr.model=map(train,train.model.func,i=i,j=j))
DATA4.PREDICT = DATA4 %>%
mutate(predict=map2(test,tr.model,~augment(.y,newdata=.x))) %>%
select(predict) %>%
unnest(cols = c(predict))
rmse_results[i,j] = RMSE.func(actual=DATA4.PREDICT$W,predict=DATA4.PREDICT$.fitted)
}
}
which(rmse_results==min(rmse_results), arr.ind = T)
## row col
## [1,] 3 10
set.seed(123)
x=rnorm(100)
y=x-2*x^2+rnorm (100)
sim=tibble(predictor=x,response=y)
In this data set, what is n and what is p? Write out the model used to generate the data in equation form.
Here we have that n=100 and p=2, the model used is \[ Y = X - 2X^2 + \epsilon \]
ggplot(data=sim) +
geom_point(aes(x=predictor,y=response))
The data obviously suggests a curved (non-linear) relationship.
\(Y = \beta_0 + \beta_1X + \epsilon\)
\(Y = \beta_0 + \beta_1X + \beta_2X^2 + \epsilon\)
\(Y = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 + \epsilon\)
\(Y = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 + \beta_4X^4 + \epsilon\)
max_i = 4
rmse_ex = rep(NA,max_i)
train.model.func2 = function(data,i){
mod=lm(response~poly(predictor,i),data=data)
return(mod)
}
for(i in 1:max_i){
sim_final = sim %>%
crossv_kfold(k=5) %>%
mutate(tr.model=map(train,train.model.func2,i=i)) %>%
mutate(predict=map2(test,tr.model,~augment(.y,newdata=.x))) %>%
select(predict) %>%
unnest(cols = c(predict))
rmse_ex[i] = RMSE.func(actual=sim_final$response,predict=sim_final$.fitted)
}
which.min(rmse_ex)
## [1] 2
We may see that the 5-Fold estimate for the test RMSE is minimum for
the second model, this is not surprising since we know the true relation
between x
and y
is quadratic.