如何表示二进制 t 统计量?
How to represent the binary t-statistic?
题目是这样的:
读取文件diabetes.csv。有两个变量称为 BMI 和结果。变量结果只有两个值:0 和 1。对两个结果值的 BMI 标准差相同的假设进行非参数双样本检验
bmi <- diabetes$BMI
bmi
outcome <- diabetes$Outcome
outcome
n <- length(bmi)
# tstat
tstat <- ???
# Describe the population and draw synthetic samples
f1 <- function()
{
x <- c(bmi, outcome)
x <- sample(x)
m1 <- sd(x[1:n])
m2 <- sd(x[(n+1):length(x)])
return(m1 - m2)
}
# Create sampling distribution
sdist <- replicate(10000, f1())
plot(density(sdist))
# Gap
gap <- abs(mean(sdist) - tstat)
abline(v = mean(sdist) + c(-1,1) * gap, col = "dark orange")
s1 <- sdist[sdist <(mean(sdist - gap)) | sdist >(mean(sdist + gap))]
pvalue <- length(s1) / length(sdist)
pvalue
数据位于某个名为 "diabetes" 的数据集中。我的问题是如何表示 "t-statistic" 因为结果是二进制的?
使用此代码:
# Sort the table diabetes on accending order of Outcome to separate the BMI
# values with outcome = 0 and BMI values with outcome = 1
diabetes = diabetes[order(diabetes$Outcome),]
View(diabetes)
# Find the number of values with outcome = 0
n = length(which(diabetes$Outcome == 0))
# Find total number of rows
l = length(diabetes$BMI)
# Find BMI values to create the sample later on
g = diabetes$BMI
# Create function to take the values of BMI and shuffle it every time and
# to find the difference between the standard deviations
f1 = function()
{
x = sample(g)
z = abs(sd(x[1:n]) - sd(x[(n+1):l]))
return(z)
}
# Replicate the function several times
dist = replicate(100000,f1())
# Plot density of distribution
plot(density(dist))
polygon(density(dist),col="green")
diabetes0 = diabetes[diabetes$Outcome == 0,]
diabetes1 = diabetes[diabetes$Outcome == 1,]
View(diabetes0)
View(diabetes1)
# Find the difference between standard deviation of BMI when outcome = 0 and
# when outcome = 1
tstat = abs(sd(diabetes0$BMI) - sd(diabetes1$BMI))
tstat
abline(v=tstat)
rside = dist[dist>tstat]
pvalue = length(rside)/length(dist)
pvalue
题目是这样的:
读取文件diabetes.csv。有两个变量称为 BMI 和结果。变量结果只有两个值:0 和 1。对两个结果值的 BMI 标准差相同的假设进行非参数双样本检验
bmi <- diabetes$BMI
bmi
outcome <- diabetes$Outcome
outcome
n <- length(bmi)
# tstat
tstat <- ???
# Describe the population and draw synthetic samples
f1 <- function()
{
x <- c(bmi, outcome)
x <- sample(x)
m1 <- sd(x[1:n])
m2 <- sd(x[(n+1):length(x)])
return(m1 - m2)
}
# Create sampling distribution
sdist <- replicate(10000, f1())
plot(density(sdist))
# Gap
gap <- abs(mean(sdist) - tstat)
abline(v = mean(sdist) + c(-1,1) * gap, col = "dark orange")
s1 <- sdist[sdist <(mean(sdist - gap)) | sdist >(mean(sdist + gap))]
pvalue <- length(s1) / length(sdist)
pvalue
数据位于某个名为 "diabetes" 的数据集中。我的问题是如何表示 "t-statistic" 因为结果是二进制的?
使用此代码:
# Sort the table diabetes on accending order of Outcome to separate the BMI
# values with outcome = 0 and BMI values with outcome = 1
diabetes = diabetes[order(diabetes$Outcome),]
View(diabetes)
# Find the number of values with outcome = 0
n = length(which(diabetes$Outcome == 0))
# Find total number of rows
l = length(diabetes$BMI)
# Find BMI values to create the sample later on
g = diabetes$BMI
# Create function to take the values of BMI and shuffle it every time and
# to find the difference between the standard deviations
f1 = function()
{
x = sample(g)
z = abs(sd(x[1:n]) - sd(x[(n+1):l]))
return(z)
}
# Replicate the function several times
dist = replicate(100000,f1())
# Plot density of distribution
plot(density(dist))
polygon(density(dist),col="green")
diabetes0 = diabetes[diabetes$Outcome == 0,]
diabetes1 = diabetes[diabetes$Outcome == 1,]
View(diabetes0)
View(diabetes1)
# Find the difference between standard deviation of BMI when outcome = 0 and
# when outcome = 1
tstat = abs(sd(diabetes0$BMI) - sd(diabetes1$BMI))
tstat
abline(v=tstat)
rside = dist[dist>tstat]
pvalue = length(rside)/length(dist)
pvalue