Introduction

In this tutorial, we will try to understand some fundamental control structures used in statistical programming. We begin by analyzing each control structure separately.

Part 1: Loops

Chunk 1: Correlation Matrix for Variables in Cigar from Ecdat

# Preview the first rows of the Cigar data.
head(Cigar)
# Correlation matrix of all columns of Cigar, rounded to three decimals.
print(round(cor(Cigar), digits = 3))

Chunk 2: Using a Double Loop to Create a Correlation Matrix

# Correlation matrix of every column of Cigar, filled in one cell at a time by
# a double loop. The matrix is sized from the data rather than hard-coded.
CORR.CIGAR1 <- matrix(
  NA_real_,
  nrow = ncol(Cigar), ncol = ncol(Cigar),
  dimnames = list(names(Cigar), names(Cigar))
)

# Show the index sequence that the loops below iterate over.
seq_along(names(Cigar))

for (j in seq_along(names(Cigar))) {
  for (k in seq_along(names(Cigar))) {
    # [[ ]] extracts the column as a plain vector.
    CORR.CIGAR1[j, k] <- cor(Cigar[[j]], Cigar[[k]])
  }
}
print(round(CORR.CIGAR1, 3))

# Same computation after dropping the first column of Cigar.
Cigar_num <- Cigar[, -1]
CORR.CIGAR2 <- matrix(
  NA_real_,
  nrow = ncol(Cigar_num), ncol = ncol(Cigar_num),
  dimnames = list(names(Cigar_num), names(Cigar_num))
)
for (j in seq_len(ncol(Cigar_num))) {
  for (k in seq_len(ncol(Cigar_num))) {
    CORR.CIGAR2[j, k] <- cor(Cigar_num[[j]], Cigar_num[[k]])
  }
}
print(round(CORR.CIGAR2, 3))

Chunk 3: Correlation Matrix for Variables in HI from Ecdat

# Work with the HI data under a more descriptive name.
HealthInsurance <- HI
head(HealthInsurance)
# ?HI  # uncomment to open the help page for the dataset
# print(round(cor(HealthInsurance), 3))  # Try this code

Chunk 4: Using a Double Loop to Compute 5-Number Summary for Numeric Variables

# Five-number summary (min, Q1, median, Q3, max) of every numeric column of
# HealthInsurance, computed with explicit loops in two different ways.
var.names <- names(HealthInsurance)

# Approach 1: one row per variable in a preallocated matrix.
# c() combines the variable name with the statistics, so every cell is stored
# as character. Rows for non-numeric variables keep NA statistics and are
# dropped by na.omit() before printing.
FiveSum.HI <- matrix(NA, length(var.names), 6)
colnames(FiveSum.HI) <- c("Variable", "Min", "Q1", "Q2", "Q3", "Max")

for (VAR in seq_along(var.names)) {
  if (is.numeric(HealthInsurance[[VAR]])) {
    MIN <- min(HealthInsurance[[VAR]])
    Q1 <- quantile(HealthInsurance[[VAR]], 0.25)
    # median() takes no probability; a second positional argument would be
    # interpreted as na.rm.
    Q2 <- median(HealthInsurance[[VAR]])
    Q3 <- quantile(HealthInsurance[[VAR]], 0.75)
    MAX <- max(HealthInsurance[[VAR]])
    FiveSum.HI[VAR, ] <- c(var.names[VAR], MIN, Q1, Q2, Q3, MAX)
  } else {
    cat("Variable", var.names[VAR], "is not numeric\n")
    FiveSum.HI[VAR, ] <- c(var.names[VAR], rep(NA, 5))
  }
}
print(as_tibble(na.omit(FiveSum.HI)))

# Approach 2: keep only the numeric variables so the statistics stay numeric.
# The result matrix is sized up front instead of being grown with rbind().
Numeric.names <- var.names[vapply(HealthInsurance, is.numeric, logical(1))]
FiveSum.HI2 <- matrix(
  NA_real_,
  nrow = length(Numeric.names), ncol = 5,
  dimnames = list(NULL, c("Min", "Q1", "Q2", "Q3", "Max"))
)
for (VAR in seq_along(Numeric.names)) {
  col_values <- HealthInsurance[[Numeric.names[VAR]]]
  MIN <- min(col_values)
  # names = FALSE keeps the "25%" / "75%" labels out of the result.
  Q1 <- quantile(col_values, 0.25, names = FALSE)
  Q2 <- median(col_values)
  Q3 <- quantile(col_values, 0.75, names = FALSE)
  MAX <- max(col_values)
  FiveSum.HI2[VAR, ] <- c(MIN, Q1, Q2, Q3, MAX)
}
FiveSum.HI3 <- as_tibble(
  data.frame(Variable = Numeric.names, FiveSum.HI2, stringsAsFactors = FALSE)
)
print(FiveSum.HI3)

Exercise 1

The following code creates a new data frame HI_Num that only contains numeric columns in HealthInsurance.

# Keep columns 1, 8, 9, 10, 11 and 13 of HealthInsurance (selected by position).
HI_Num <- select(HealthInsurance, c(1, 8, 9, 10, 11, 13))

Use a double loop to create a correlation matrix for the numeric variables in HealthInsurance.

# Correlation matrix of the columns of HI_Num, filled in by a double loop.
Data <- HI_Num
d <- ncol(Data)
CORR <- matrix(NA_real_, d, d, dimnames = list(names(Data), names(Data)))

# seq_len(d) is empty when d is 0, whereas 1:d would iterate over c(1, 0).
for (j in seq_len(d)) {
  for (k in seq_len(d)) {
    # [[ ]] extracts the column as a plain vector.
    CORR[j, k] <- cor(Data[[j]], Data[[k]])
  }
}
print(round(CORR, 3))

Exercise 2

Write a loop over the columns of the Wages data that reports the mean of each numeric column and, for every other column, the number of unique values.

head(Wages, 3)
##   exp wks bluecol ind south smsa married  sex union ed black   lwage
## 1   3  32      no   0   yes   no     yes male    no  9    no 5.56068
## 2   4  43      no   0   yes   no     yes male    no  9    no 5.72031
## 3   5  40      no   0   yes   no     yes male    no  9    no 5.99645
# Report the mean of each numeric column of Wages and the number of distinct
# values of every other column. seq_len() gives an empty loop for a data frame
# with no columns, and TRUE is used because T can be reassigned.
for (i in seq_len(ncol(Wages))) {
  if (is.numeric(Wages[[i]])) {
    cat("Mean of ", names(Wages)[i], " is ", mean(Wages[[i]], na.rm = TRUE), "\n")
  } else {
    cat("Number of unique values in ", names(Wages)[i], " is ", length(unique(Wages[[i]])), "\n")
  }
}
## Mean of  exp  is  19.85378 
## Mean of  wks  is  46.81152 
## Number of unique values in  bluecol  is  2 
## Mean of  ind  is  0.3954382 
## Number of unique values in  south  is  2 
## Number of unique values in  smsa  is  2 
## Number of unique values in  married  is  2 
## Number of unique values in  sex  is  2 
## Number of unique values in  union  is  2 
## Mean of  ed  is  12.84538 
## Number of unique values in  black  is  2 
## Mean of  lwage  is  6.676346

Exercise 3

Use the ifelse function to answer Exercise 2.

# ifelse() is vectorised over its test, so the test needs one value per column.
# is.numeric(Wages) tests the data frame as a whole and is FALSE, which is why
# a single ifelse() call on Wages returned length(unique(Wages)), i.e. 12,
# instead of one result per column.
is_num <- vapply(Wages, is.numeric, logical(1))
col_mean <- vapply(
  Wages,
  function(column) if (is.numeric(column)) mean(column, na.rm = TRUE) else NA_real_,
  numeric(1)
)
n_unique <- vapply(Wages, function(column) length(unique(column)), integer(1))

# Mean for numeric columns, number of distinct values for all other columns.
summary <- ifelse(is_num, col_mean, n_unique)
summary
## Named vector with one entry per column of Wages: the column mean where the
## column is numeric, otherwise its number of unique values.

Exercise 4

Write a function that calculates the five-number summary, and use it to generate the five-number summary for each numeric variable in HealthInsurance.

#

Part 2: Simple Random Sampling

Chunk 1: Sampling from Known Distributions

# Simulated scores of 100 students: draws from a normal distribution with
# mean 82 and standard deviation 2.
x1 <- rnorm(n = 100, mean = 82, sd = 2)
ggplot() +
  geom_histogram(aes(x = x1))

# 100 repetitions of: number of times a 5 appears in 10 rolls of a fair die.
x2 <- rbinom(n = 100, size = 10, prob = 1 / 6)
ggplot() +
  geom_bar(aes(x = x2))

Chunk 2: Experiment for Flipping Coins

# Proportion of heads in k flips of a fair coin, for k = 1, ..., 1000.
prop <- rep(NA_real_, 1000)
for (k in seq_along(prop)) {
  # Reseeding with k makes the k-th experiment reproducible on its own.
  set.seed(k)
  # TRUE instead of T: T is an ordinary variable that can be reassigned.
  x <- sample(c("H", "T"), size = k, replace = TRUE, prob = c(0.5, 0.5))
  prop[k] <- mean(x == "H")
}
# Observed proportions against the number of flips, with the true
# probability 0.5 drawn as a dashed reference line.
ggplot() +
  geom_line(aes(x = 1:1000, y = prop), alpha = 0.5) +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red", size = 2) +
  theme_minimal()