In this tutorial, we will try to understand some fundamental control structures used in statistical programming. In the beginning, we will separately analyze different control structures.
Cigar from
Ecdathead(Cigar)
## state year price pop pop16 cpi ndi sales pimin
## 1 1 63 28.6 3383 2236.5 30.6 1558.305 93.9 26.1
## 2 1 64 29.8 3431 2276.7 31.0 1684.073 95.4 27.5
## 3 1 65 29.8 3486 2327.5 31.5 1809.842 98.5 28.9
## 4 1 66 31.5 3524 2369.7 32.4 1915.160 96.4 29.5
## 5 1 67 31.6 3533 2393.7 33.4 2023.546 95.5 29.6
## 6 1 68 35.6 3522 2405.2 34.8 2202.486 88.4 32.0
print(round(cor(Cigar),3))
## state year price pop pop16 cpi ndi sales pimin
## state 1.000 0.000 -0.010 -0.094 -0.096 0.000 -0.036 -0.104 0.000
## year 0.000 1.000 0.918 0.082 0.118 0.979 0.936 -0.182 0.920
## price -0.010 0.918 1.000 0.108 0.137 0.949 0.944 -0.311 0.987
## pop -0.094 0.082 0.108 1.000 0.998 0.082 0.159 -0.112 0.077
## pop16 -0.096 0.118 0.137 0.998 1.000 0.115 0.194 -0.112 0.106
## cpi 0.000 0.979 0.949 0.082 0.115 1.000 0.953 -0.210 0.953
## ndi -0.036 0.936 0.944 0.159 0.194 0.953 1.000 -0.184 0.938
## sales -0.104 -0.182 -0.311 -0.112 -0.112 -0.210 -0.184 1.000 -0.277
## pimin 0.000 0.920 0.987 0.077 0.106 0.953 0.938 -0.277 1.000
CORR.CIGAR1=matrix(NA,9,9) #Empty Matrix Initialized
rownames(CORR.CIGAR1)=names(Cigar)
colnames(CORR.CIGAR1)=names(Cigar)
seq_along(names(Cigar))
## [1] 1 2 3 4 5 6 7 8 9
for(j in seq_along(names(Cigar))){
for(k in seq_along(names(Cigar))){
CORR.CIGAR1[j,k]=cor(Cigar[,j],Cigar[,k])
}
}
print(round(CORR.CIGAR1,3))
## state year price pop pop16 cpi ndi sales pimin
## state 1.000 0.000 -0.010 -0.094 -0.096 0.000 -0.036 -0.104 0.000
## year 0.000 1.000 0.918 0.082 0.118 0.979 0.936 -0.182 0.920
## price -0.010 0.918 1.000 0.108 0.137 0.949 0.944 -0.311 0.987
## pop -0.094 0.082 0.108 1.000 0.998 0.082 0.159 -0.112 0.077
## pop16 -0.096 0.118 0.137 0.998 1.000 0.115 0.194 -0.112 0.106
## cpi 0.000 0.979 0.949 0.082 0.115 1.000 0.953 -0.210 0.953
## ndi -0.036 0.936 0.944 0.159 0.194 0.953 1.000 -0.184 0.938
## sales -0.104 -0.182 -0.311 -0.112 -0.112 -0.210 -0.184 1.000 -0.277
## pimin 0.000 0.920 0.987 0.077 0.106 0.953 0.938 -0.277 1.000
Cigar_num = Cigar[,-1]
CORR.CIGAR2=matrix(NA,8,8)
rownames(CORR.CIGAR2)=names(Cigar_num)
colnames(CORR.CIGAR2)=names(Cigar_num)
# Please Replace BLANK with appropriate values to complete the task.
for(j in seq_along(names(Cigar_num))){
for(k in seq_along(names(Cigar_num))){
CORR.CIGAR2[j,k]=cor(Cigar_num[,j],Cigar_num[,k])
}
}
print(round(CORR.CIGAR2,3))
## year price pop pop16 cpi ndi sales pimin
## year 1.000 0.918 0.082 0.118 0.979 0.936 -0.182 0.920
## price 0.918 1.000 0.108 0.137 0.949 0.944 -0.311 0.987
## pop 0.082 0.108 1.000 0.998 0.082 0.159 -0.112 0.077
## pop16 0.118 0.137 0.998 1.000 0.115 0.194 -0.112 0.106
## cpi 0.979 0.949 0.082 0.115 1.000 0.953 -0.210 0.953
## ndi 0.936 0.944 0.159 0.194 0.953 1.000 -0.184 0.938
## sales -0.182 -0.311 -0.112 -0.112 -0.210 -0.184 1.000 -0.277
## pimin 0.920 0.987 0.077 0.106 0.953 0.938 -0.277 1.000
HI from
EcdatHealthInsurance=HI
head(HealthInsurance)
## whrswk hhi whi hhi2 education race hispanic experience kidslt6 kids618
## 1 0 no no no 13-15years white no 13.0 2 1
## 2 50 no yes no 13-15years white no 24.0 0 1
## 3 40 yes no yes 12years white no 43.0 0 0
## 4 40 no yes yes 13-15years white no 17.0 0 1
## 5 0 yes no yes 9-11years white no 44.5 0 0
## 6 40 yes yes yes 12years white no 32.0 0 0
## husby region wght
## 1 11.960 northcentral 214986
## 2 1.200 northcentral 210119
## 3 31.275 northcentral 219955
## 4 9.000 northcentral 210317
## 5 0.000 northcentral 219955
## 6 15.690 northcentral 208148
#?HI
#print(round(cor(HealthInsurance),3)) #Try this Code
var.names = names(HealthInsurance)
FiveSum.HI = matrix(NA,length(var.names),5)
rownames(FiveSum.HI) = var.names
colnames(FiveSum.HI) = c("Min","Q1","Q2","Q3","Max")
for(VAR in seq_along(var.names)){
if(is.numeric(HealthInsurance[,VAR])){
MIN=min(HealthInsurance[,VAR])
Q1=quantile(HealthInsurance[,VAR],0.25)
Q2=median(HealthInsurance[,VAR],0.5)
Q3=quantile(HealthInsurance[,VAR],0.75)
MAX=max(HealthInsurance[,VAR])
FiveSum.HI[VAR,]=c(MIN,Q1,Q2,Q3,MAX)
} else {
cat("Variable",var.names[VAR],"is not numeric\n")
}
}
## Variable hhi is not numeric
## Variable whi is not numeric
## Variable hhi2 is not numeric
## Variable education is not numeric
## Variable race is not numeric
## Variable hispanic is not numeric
## Variable region is not numeric
FiveSum.HI = na.omit(FiveSum.HI)
print(FiveSum.HI)
## Min Q1 Q2 Q3 Max
## whrswk 0 0.00 35.0 40.0 90.000
## experience -1 14.00 21.0 31.0 51.000
## kidslt6 0 0.00 0.0 1.0 5.000
## kids618 0 0.00 0.0 1.0 8.000
## husby 0 7.50 25.0 40.0 183.719
## wght 5182 99136.25 154814.5 204013.2 1136869.000
## attr(,"na.action")
## hhi whi hhi2 education race hispanic region
## 2 3 4 5 6 7 12
## attr(,"class")
## [1] "omit"
FiveSum.HI2 = NULL
Numeric.names = NULL
for(VAR in seq_along(var.names)){
if(is.numeric(HealthInsurance[,VAR])){
MIN=min(HealthInsurance[,VAR])
Q1=quantile(HealthInsurance[,VAR],0.25)
Q2=median(HealthInsurance[,VAR],0.5)
Q3=quantile(HealthInsurance[,VAR],0.75)
MAX=max(HealthInsurance[,VAR])
FiveSum.HI2=rbind(FiveSum.HI2,c(MIN,Q1,Q2,Q3,MAX))
Numeric.names=c(Numeric.names,var.names[VAR])
print(Numeric.names)
}
}
## [1] "whrswk"
## [1] "whrswk" "experience"
## [1] "whrswk" "experience" "kidslt6"
## [1] "whrswk" "experience" "kidslt6" "kids618"
## [1] "whrswk" "experience" "kidslt6" "kids618" "husby"
## [1] "whrswk" "experience" "kidslt6" "kids618" "husby"
## [6] "wght"
rownames(FiveSum.HI2) = Numeric.names
colnames(FiveSum.HI2) = c("Min","Q1","Q2","Q3","Max")
print(FiveSum.HI2)
## Min Q1 Q2 Q3 Max
## whrswk 0 0.00 35.0 40.0 90.000
## experience -1 14.00 21.0 31.0 51.000
## kidslt6 0 0.00 0.0 1.0 5.000
## kids618 0 0.00 0.0 1.0 8.000
## husby 0 7.50 25.0 40.0 183.719
## wght 5182 99136.25 154814.5 204013.2 1136869.000
The following code creates a new data frame HI_Num that
only contains numeric columns in HealthInsurance.
HI_Num = HealthInsurance %>%
select(c(1,8,9,10,11,13))
Using a Double Loop to Create a Correlation Matrix for numeric
variables in HealthInsurance.
Data = HI_Num
d = dim(Data)[2]
CORR = matrix(NA,d,d) #Empty Matrix Initialized
rownames(CORR)=names(Data)
colnames(CORR)=names(Data)
for(j in seq_along(names(Data))){
for(k in seq_along(names(Data))){
CORR[j,k]=cor(Data[,j],Data[,k])
}
}
print(round(CORR,3))
## whrswk experience kidslt6 kids618 husby wght
## whrswk 1.000 -0.216 -0.139 -0.033 0.020 0.011
## experience -0.216 1.000 -0.450 -0.193 -0.131 0.009
## kidslt6 -0.139 -0.450 1.000 -0.011 0.023 -0.030
## kids618 -0.033 -0.193 -0.011 1.000 0.086 -0.036
## husby 0.020 -0.131 0.023 0.086 1.000 0.047
## wght 0.011 0.009 -0.030 -0.036 0.047 1.000
Write a loop that loops over the columns of data Wages
and reports the mean of the column if it is numeric and the total number
of unique values if it’s a character vector. (Hint:
unique(), length())
for(i in 1:ncol(Wages)){
if (is.numeric(Wages[,i])){
cat("Mean of ",names(Wages)[i], " is ",mean(Wages[,i],na.rm=T),"\n")
} else {
cat("Number of unique values in ",names(Wages)[i], " is ",length(unique(Wages[,i])),"\n")
}
}
## Mean of exp is 19.85378
## Mean of wks is 46.81152
## Number of unique values in bluecol is 2
## Mean of ind is 0.3954382
## Number of unique values in south is 2
## Number of unique values in smsa is 2
## Number of unique values in married is 2
## Number of unique values in sex is 2
## Number of unique values in union is 2
## Mean of ed is 12.84538
## Number of unique values in black is 2
## Mean of lwage is 6.676346
# Scores of 100 students from a population distribution N(82,2)
x1=rnorm(100,mean=82,sd=2)
ggplot()+geom_histogram(aes(x1))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
# Number of times 5 appears when rolling a fair die 10 times
x2=rbinom(100,size=10,prob=1/6)
ggplot()+geom_bar(aes(x2))
BLANK = 0.6
prop=rep(NA,1000)
for(k in 1:1000){
set.seed(k)
x=sample(c("H","T"),size=k,replace=T,prob=c(BLANK,1-BLANK)) #Replace BLANK with Head probability
prop[k]=mean(x=="H")
}
ggplot() +
geom_line(aes(x=1:1000,y=prop),alpha=0.5) +
geom_hline(yintercept=BLANK,linetype="dashed",color="red",size=2) +
theme_minimal()