In this tutorial, we will try to understand some fundamental control structures used in statistical programming. In the beginning, we will separately analyze different control structures.
The `Cigar` data set from the `Ecdat` package.
# Preview the data and print the built-in correlation matrix for reference.
head(Cigar)
print(round(cor(Cigar), 3))

# Rebuild the correlation matrix by hand with a double loop.
# The matrix is sized and labelled from the data itself rather than from a
# hard-coded column count, so the code keeps working if the columns change.
cigar_vars <- names(Cigar)
n_cigar_vars <- length(cigar_vars)
CORR.CIGAR1 <- matrix(
  NA_real_,
  nrow = n_cigar_vars,
  ncol = n_cigar_vars,
  dimnames = list(cigar_vars, cigar_vars)
) # Empty matrix initialized

# Show the index sequence the loops below iterate over.
seq_along(cigar_vars)

for (j in seq_along(cigar_vars)) {
  for (k in seq_along(cigar_vars)) {
    # `[[` always extracts the column as a plain vector.
    CORR.CIGAR1[j, k] <- cor(Cigar[[j]], Cigar[[k]])
  }
}
print(round(CORR.CIGAR1, 3))
# Same double-loop correlation matrix, this time without the first column.
Cigar_num <- Cigar[, -1]

# Size and label the result from Cigar_num instead of hard-coding 8.
num_vars <- names(Cigar_num)
CORR.CIGAR2 <- matrix(
  NA_real_,
  nrow = length(num_vars),
  ncol = length(num_vars),
  dimnames = list(num_vars, num_vars)
)

# seq_along() yields an empty sequence when there are no columns,
# whereas 1:0 would iterate over c(1, 0).
for (j in seq_along(num_vars)) {
  for (k in seq_along(num_vars)) {
    CORR.CIGAR2[j, k] <- cor(Cigar_num[[j]], Cigar_num[[k]])
  }
}
print(round(CORR.CIGAR2, 3))
The `HI` data set from the `Ecdat` package.
# Work with the HI data under a more descriptive name and preview its
# first rows.
HealthInsurance <- HI
head(x = HealthInsurance)
# ?HI
# print(round(cor(HealthInsurance), 3)) # Try this Code
# Five-number summary (min, Q1, median, Q3, max) for every variable in
# HealthInsurance, collected in a matrix with one row per variable.
# Non-numeric variables are reported and their row is filled with NA.
var.names <- names(HealthInsurance)
FiveSum.HI <- matrix(NA, length(var.names), 6)
colnames(FiveSum.HI) <- c("Variable", "Min", "Q1", "Q2", "Q3", "Max")

for (VAR in seq_along(var.names)) {
  column <- HealthInsurance[[VAR]]
  if (is.numeric(column)) {
    # median() has no probability argument: its second positional argument
    # is na.rm, so the earlier median(x, 0.5) silently meant na.rm = TRUE.
    five_numbers <- c(
      min(column),
      quantile(column, 0.25),
      median(column),
      quantile(column, 0.75),
      max(column)
    )
    # Mixing the name with the numbers coerces the whole row to character.
    FiveSum.HI[VAR, ] <- c(var.names[VAR], five_numbers)
  } else {
    cat("Variable", var.names[VAR], "is not numeric\n")
    FiveSum.HI[VAR, ] <- c(var.names[VAR], rep(NA, 5))
  }
}

# na.omit() drops the rows that contain NA, i.e. the non-numeric variables.
# as_tibble() replaces the deprecated as.tibble().
print(as_tibble(na.omit(FiveSum.HI)))
# Second version: keep only the numeric variables, so the summary values
# stay numeric instead of being coerced to character.
#
# The numeric columns are identified up front and the result matrix is
# preallocated; growing a matrix with rbind() inside a loop copies it on
# every iteration and leaves it without usable column names.
is_numeric_col <- vapply(HealthInsurance, is.numeric, logical(1))
Numeric.names <- var.names[is_numeric_col]

FiveSum.HI2 <- matrix(
  NA_real_,
  nrow = length(Numeric.names),
  ncol = 5,
  dimnames = list(NULL, c("Min", "Q1", "Q2", "Q3", "Max"))
)

for (VAR in seq_along(Numeric.names)) {
  column <- HealthInsurance[[Numeric.names[VAR]]]
  # median() takes no probability argument (its second argument is na.rm).
  FiveSum.HI2[VAR, ] <- c(
    min(column),
    quantile(column, 0.25),
    median(column),
    quantile(column, 0.75),
    max(column)
  )
}

# Combine the variable names with the numeric summaries; as_tibble()
# replaces the deprecated as.tibble().
FiveSum.HI3 <- as_tibble(
  data.frame(
    Variable = Numeric.names,
    FiveSum.HI2,
    stringsAsFactors = FALSE
  )
)
names(FiveSum.HI3) <- c("Variable", "Min", "Q1", "Q2", "Q3", "Max")
print(FiveSum.HI3)
The following code creates a new data frame, `HI_Num`, that contains only the numeric columns of `HealthInsurance`.
# Keep only the numeric columns of HealthInsurance. Selecting by type
# instead of by hard-coded positions (previously 1, 8, 9, 10, 11, 13) keeps
# the selection correct if the column order changes.
# NOTE(review): where() needs a dplyr/tidyselect version that provides it.
HI_Num <- HealthInsurance %>%
  select(where(is.numeric))
Using a double loop to create a correlation matrix for the numeric variables in `HealthInsurance`.
# Generic double-loop correlation matrix: works for any data set assigned
# to `Data` whose columns are all numeric.
Data <- HI_Num
d <- ncol(Data)
CORR <- matrix(NA_real_, d, d) # Empty matrix initialized
rownames(CORR) <- names(Data)
colnames(CORR) <- names(Data)

# seq_len(d) is empty when d is 0, whereas 1:d would iterate over c(1, 0).
for (j in seq_len(d)) {
  for (k in seq_len(d)) {
    # `[[` returns a plain vector for both data frames and tibbles.
    CORR[j, k] <- cor(Data[[j]], Data[[k]])
  }
}
print(round(CORR, 3))
Write a loop that iterates over the columns of the `Wages` data and reports the mean of the column if it is numeric, and the number of unique values if it is not (for example, a character or factor column).
head(Wages, 3)
## exp wks bluecol ind south smsa married sex union ed black lwage
## 1 3 32 no 0 yes no yes male no 9 no 5.56068
## 2 4 43 no 0 yes no yes male no 9 no 5.72031
## 3 5 40 no 0 yes no yes male no 9 no 5.99645

# For each column of Wages: report the mean if the column is numeric,
# otherwise report how many distinct values it contains.
# seq_len() is safe for a zero-column data frame; TRUE is used instead of
# T because T is an ordinary variable that can be reassigned.
for (i in seq_len(ncol(Wages))) {
  column <- Wages[[i]]
  if (is.numeric(column)) {
    cat("Mean of ", names(Wages)[i], " is ", mean(column, na.rm = TRUE), "\n")
  } else {
    cat(
      "Number of unique values in ", names(Wages)[i], " is ",
      length(unique(column)), "\n"
    )
  }
}
## Mean of exp is 19.85378
## Mean of wks is 46.81152
## Number of unique values in bluecol is 2
## Mean of ind is 0.3954382
## Number of unique values in south is 2
## Number of unique values in smsa is 2
## Number of unique values in married is 2
## Number of unique values in sex is 2
## Number of unique values in union is 2
## Mean of ed is 12.84538
## Number of unique values in black is 2
## Mean of lwage is 6.676346
Use the `ifelse` function to answer exercise 2.
# ifelse() must be applied column by column. Testing is.numeric() on the
# whole data frame is always FALSE, so the previous one-liner only returned
# length(unique(Wages)), i.e. the number of columns (12), and never
# computed a mean.
summary <- vapply(
  Wages,
  function(column) {
    # With a scalar test, ifelse() evaluates only the branch it needs, so
    # mean() is never called on a non-numeric column.
    ifelse(
      is.numeric(column),
      mean(column, na.rm = TRUE),
      length(unique(column))
    )
  },
  numeric(1)
)
summary
# Result: a named numeric vector with one entry per column of Wages
# (the mean for numeric columns, the count of unique values otherwise).
Writing a function to calculate the five summary statistics and generating them for the numeric variables in `HealthInsurance`.
#
# Simulate scores of 100 students: normal draws with mean 82 and sd 2,
# shown as a histogram.
x1 <- rnorm(n = 100, mean = 82, sd = 2)
ggplot() +
  geom_histogram(aes(x1))

# Simulate 100 repetitions of "how many times does 5 appear in 10 rolls of
# a fair die": binomial draws with 10 trials and success probability 1/6,
# shown as a bar chart.
x2 <- rbinom(n = 100, size = 10, prob = 1 / 6)
ggplot() +
  geom_bar(aes(x2))
# Proportion of heads among k tosses of a fair coin, for k = 1, ..., 1000,
# plotted against k with a dashed reference line at 0.5.
n_max <- 1000
prop <- rep(NA, n_max)

for (k in seq_len(n_max)) {
  # Seeding with k makes every sample reproducible from run to run.
  set.seed(k)
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  x <- sample(c("H", "T"), size = k, replace = TRUE, prob = c(0.5, 0.5))
  prop[k] <- mean(x == "H")
}

# NOTE(review): recent ggplot2 releases prefer `linewidth` over `size` for
# line geoms; `size` is kept here to match the version this file targets.
ggplot() +
  geom_line(aes(x = seq_len(n_max), y = prop), alpha = 0.5) +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red", size = 2) +
  theme_minimal()