Introduction

In this tutorial, we will try to understand some fundamental control structures used in statistical programming. In the beginning, we will separately analyze different control structures.

Part 1: Loops

Chunk 1: Correlation Matrix for Variables in Cigar from Ecdat

head(Cigar)
##   state year price  pop  pop16  cpi      ndi sales pimin
## 1     1   63  28.6 3383 2236.5 30.6 1558.305  93.9  26.1
## 2     1   64  29.8 3431 2276.7 31.0 1684.073  95.4  27.5
## 3     1   65  29.8 3486 2327.5 31.5 1809.842  98.5  28.9
## 4     1   66  31.5 3524 2369.7 32.4 1915.160  96.4  29.5
## 5     1   67  31.6 3533 2393.7 33.4 2023.546  95.5  29.6
## 6     1   68  35.6 3522 2405.2 34.8 2202.486  88.4  32.0
print(round(cor(Cigar),3))
##        state   year  price    pop  pop16    cpi    ndi  sales  pimin
## state  1.000  0.000 -0.010 -0.094 -0.096  0.000 -0.036 -0.104  0.000
## year   0.000  1.000  0.918  0.082  0.118  0.979  0.936 -0.182  0.920
## price -0.010  0.918  1.000  0.108  0.137  0.949  0.944 -0.311  0.987
## pop   -0.094  0.082  0.108  1.000  0.998  0.082  0.159 -0.112  0.077
## pop16 -0.096  0.118  0.137  0.998  1.000  0.115  0.194 -0.112  0.106
## cpi    0.000  0.979  0.949  0.082  0.115  1.000  0.953 -0.210  0.953
## ndi   -0.036  0.936  0.944  0.159  0.194  0.953  1.000 -0.184  0.938
## sales -0.104 -0.182 -0.311 -0.112 -0.112 -0.210 -0.184  1.000 -0.277
## pimin  0.000  0.920  0.987  0.077  0.106  0.953  0.938 -0.277  1.000

Chunk 2: Using a Double Loop to Create a Correlation Matrix

CORR.CIGAR1=matrix(NA,9,9) #Empty Matrix Initialized
rownames(CORR.CIGAR1)=names(Cigar)
colnames(CORR.CIGAR1)=names(Cigar)

seq_along(names(Cigar))
## [1] 1 2 3 4 5 6 7 8 9
for(j in seq_along(names(Cigar))){
  for(k in seq_along(names(Cigar))){
    CORR.CIGAR1[j,k]=cor(Cigar[,j],Cigar[,k])
  }
}
print(round(CORR.CIGAR1,3))
##        state   year  price    pop  pop16    cpi    ndi  sales  pimin
## state  1.000  0.000 -0.010 -0.094 -0.096  0.000 -0.036 -0.104  0.000
## year   0.000  1.000  0.918  0.082  0.118  0.979  0.936 -0.182  0.920
## price -0.010  0.918  1.000  0.108  0.137  0.949  0.944 -0.311  0.987
## pop   -0.094  0.082  0.108  1.000  0.998  0.082  0.159 -0.112  0.077
## pop16 -0.096  0.118  0.137  0.998  1.000  0.115  0.194 -0.112  0.106
## cpi    0.000  0.979  0.949  0.082  0.115  1.000  0.953 -0.210  0.953
## ndi   -0.036  0.936  0.944  0.159  0.194  0.953  1.000 -0.184  0.938
## sales -0.104 -0.182 -0.311 -0.112 -0.112 -0.210 -0.184  1.000 -0.277
## pimin  0.000  0.920  0.987  0.077  0.106  0.953  0.938 -0.277  1.000
Cigar_num = Cigar[,-1]
CORR.CIGAR2=matrix(NA,8,8)
rownames(CORR.CIGAR2)=names(Cigar_num)
colnames(CORR.CIGAR2)=names(Cigar_num)
# Please Replace BLANK with appropriate values to complete the task.
for(j in seq_along(names(Cigar_num))){
  for(k in seq_along(names(Cigar_num))){
    CORR.CIGAR2[j,k]=cor(Cigar_num[,j],Cigar_num[,k])
  }
}
print(round(CORR.CIGAR2,3))
##         year  price    pop  pop16    cpi    ndi  sales  pimin
## year   1.000  0.918  0.082  0.118  0.979  0.936 -0.182  0.920
## price  0.918  1.000  0.108  0.137  0.949  0.944 -0.311  0.987
## pop    0.082  0.108  1.000  0.998  0.082  0.159 -0.112  0.077
## pop16  0.118  0.137  0.998  1.000  0.115  0.194 -0.112  0.106
## cpi    0.979  0.949  0.082  0.115  1.000  0.953 -0.210  0.953
## ndi    0.936  0.944  0.159  0.194  0.953  1.000 -0.184  0.938
## sales -0.182 -0.311 -0.112 -0.112 -0.210 -0.184  1.000 -0.277
## pimin  0.920  0.987  0.077  0.106  0.953  0.938 -0.277  1.000

Chunk 3: Correlation Matrix for Variables in HI from Ecdat

HealthInsurance=HI
head(HealthInsurance)
##   whrswk hhi whi hhi2  education  race hispanic experience kidslt6 kids618
## 1      0  no  no   no 13-15years white       no       13.0       2       1
## 2     50  no yes   no 13-15years white       no       24.0       0       1
## 3     40 yes  no  yes    12years white       no       43.0       0       0
## 4     40  no yes  yes 13-15years white       no       17.0       0       1
## 5      0 yes  no  yes  9-11years white       no       44.5       0       0
## 6     40 yes yes  yes    12years white       no       32.0       0       0
##    husby       region   wght
## 1 11.960 northcentral 214986
## 2  1.200 northcentral 210119
## 3 31.275 northcentral 219955
## 4  9.000 northcentral 210317
## 5  0.000 northcentral 219955
## 6 15.690 northcentral 208148
#?HI
#print(round(cor(HealthInsurance),3)) #Try this Code

Chunk 4: Using a Double Loop to Compute 5-Number Summary for Numeric Variables

var.names = names(HealthInsurance)

FiveSum.HI = matrix(NA,length(var.names),5)
rownames(FiveSum.HI) = var.names
colnames(FiveSum.HI) = c("Min","Q1","Q2","Q3","Max")

for(VAR in seq_along(var.names)){
  if(is.numeric(HealthInsurance[,VAR])){
    MIN=min(HealthInsurance[,VAR])
    Q1=quantile(HealthInsurance[,VAR],0.25)
    Q2=median(HealthInsurance[,VAR],0.5)
    Q3=quantile(HealthInsurance[,VAR],0.75)
    MAX=max(HealthInsurance[,VAR])
    FiveSum.HI[VAR,]=c(MIN,Q1,Q2,Q3,MAX)
  } else {
    cat("Variable",var.names[VAR],"is not numeric\n")
  }
}
## Variable hhi is not numeric
## Variable whi is not numeric
## Variable hhi2 is not numeric
## Variable education is not numeric
## Variable race is not numeric
## Variable hispanic is not numeric
## Variable region is not numeric
FiveSum.HI = na.omit(FiveSum.HI)
print(FiveSum.HI)
##             Min       Q1       Q2       Q3         Max
## whrswk        0     0.00     35.0     40.0      90.000
## experience   -1    14.00     21.0     31.0      51.000
## kidslt6       0     0.00      0.0      1.0       5.000
## kids618       0     0.00      0.0      1.0       8.000
## husby         0     7.50     25.0     40.0     183.719
## wght       5182 99136.25 154814.5 204013.2 1136869.000
## attr(,"na.action")
##       hhi       whi      hhi2 education      race  hispanic    region 
##         2         3         4         5         6         7        12 
## attr(,"class")
## [1] "omit"
FiveSum.HI2 = NULL
Numeric.names = NULL
for(VAR in seq_along(var.names)){
  if(is.numeric(HealthInsurance[,VAR])){
    MIN=min(HealthInsurance[,VAR])
    Q1=quantile(HealthInsurance[,VAR],0.25)
    Q2=median(HealthInsurance[,VAR],0.5)
    Q3=quantile(HealthInsurance[,VAR],0.75)
    MAX=max(HealthInsurance[,VAR])
    FiveSum.HI2=rbind(FiveSum.HI2,c(MIN,Q1,Q2,Q3,MAX))
    Numeric.names=c(Numeric.names,var.names[VAR])
    print(Numeric.names)
  } 
}
## [1] "whrswk"
## [1] "whrswk"     "experience"
## [1] "whrswk"     "experience" "kidslt6"   
## [1] "whrswk"     "experience" "kidslt6"    "kids618"   
## [1] "whrswk"     "experience" "kidslt6"    "kids618"    "husby"     
## [1] "whrswk"     "experience" "kidslt6"    "kids618"    "husby"     
## [6] "wght"
rownames(FiveSum.HI2) = Numeric.names
colnames(FiveSum.HI2) = c("Min","Q1","Q2","Q3","Max")
print(FiveSum.HI2)
##             Min       Q1       Q2       Q3         Max
## whrswk        0     0.00     35.0     40.0      90.000
## experience   -1    14.00     21.0     31.0      51.000
## kidslt6       0     0.00      0.0      1.0       5.000
## kids618       0     0.00      0.0      1.0       8.000
## husby         0     7.50     25.0     40.0     183.719
## wght       5182 99136.25 154814.5 204013.2 1136869.000

Excercise 1

The following code creates a new data frame HI_Num that only contains numeric columns in HealthInsurance.

HI_Num = HealthInsurance %>%
  select(c(1,8,9,10,11,13))

Using a Double Loop to Create a Correlation Matrix for numeric variables in HealthInsurance.

Data = HI_Num
d = dim(Data)[2]
CORR = matrix(NA,d,d) #Empty Matrix Initialized
rownames(CORR)=names(Data)
colnames(CORR)=names(Data)

for(j in seq_along(names(Data))){
  for(k in seq_along(names(Data))){
    CORR[j,k]=cor(Data[,j],Data[,k])
  }
}

print(round(CORR,3))
##            whrswk experience kidslt6 kids618  husby   wght
## whrswk      1.000     -0.216  -0.139  -0.033  0.020  0.011
## experience -0.216      1.000  -0.450  -0.193 -0.131  0.009
## kidslt6    -0.139     -0.450   1.000  -0.011  0.023 -0.030
## kids618    -0.033     -0.193  -0.011   1.000  0.086 -0.036
## husby       0.020     -0.131   0.023   0.086  1.000  0.047
## wght        0.011      0.009  -0.030  -0.036  0.047  1.000

Excercise 2

Write a loop that loops over the columns of data Wages and reports the mean of the column if it is numeric and the total number of unique values if it’s a character vector. (Hint: unique(), length())

for(i in 1:ncol(Wages)){
  if (is.numeric(Wages[,i])){
    cat("Mean of ",names(Wages)[i], " is ",mean(Wages[,i],na.rm=T),"\n")
  } else {
    cat("Number of unique values in ",names(Wages)[i], " is ",length(unique(Wages[,i])),"\n")
  }
}
## Mean of  exp  is  19.85378 
## Mean of  wks  is  46.81152 
## Number of unique values in  bluecol  is  2 
## Mean of  ind  is  0.3954382 
## Number of unique values in  south  is  2 
## Number of unique values in  smsa  is  2 
## Number of unique values in  married  is  2 
## Number of unique values in  sex  is  2 
## Number of unique values in  union  is  2 
## Mean of  ed  is  12.84538 
## Number of unique values in  black  is  2 
## Mean of  lwage  is  6.676346

Part 2: Simple Random Sampling

Chunk 1: Sampling from Known Distributions

# Scores of 100 students from a population distribution N(82,2)
x1=rnorm(100,mean=82,sd=2)
ggplot()+geom_histogram(aes(x1))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Number of times 5 appears when rolling a fair die 10 times
x2=rbinom(100,size=10,prob=1/6)
ggplot()+geom_bar(aes(x2))

Chunk 2: Experiment for Flipping Coins

BLANK = 0.6
prop=rep(NA,1000)
for(k in 1:1000){
  set.seed(k)
  x=sample(c("H","T"),size=k,replace=T,prob=c(BLANK,1-BLANK)) #Replace BLANK with Head probability
  prop[k]=mean(x=="H")
}
ggplot() + 
  geom_line(aes(x=1:1000,y=prop),alpha=0.5) + 
  geom_hline(yintercept=BLANK,linetype="dashed",color="red",size=2) +
  theme_minimal()