根据上一列中的条件查看数据框中的下一列
looking at the next column in a data frame based on a condition in previous column
假设我有一个如下所示的数据框,其中包含类别 (Low、Medium、High) 以及与每个类别相对应的值(位于其右侧的一列)。因此,在第一行中,0.7 和 0.25 是 High 的几率,0.99 是 Medium 的几率,0.11 是 Low 的几率。
col1 col2 col3 col4 col5 col6 col7 col8
1 High 0.7 Medium 0.99 High 0.25 Low 0.11
2 Low 0.9 Low 0.19 Low 0.29 Low 0.49
3 High 0.6 High 0.16 Low 0.46 Medium 0.63
4 High 0.8 Low 0.71 Low 0.28 High 0.20
我想找出一行中每个 class 的计数,以及每个 class 的平均值。
我能够通过使用具有低、中和高条件的 rowSums 找到出现次数最多的 class。 但是要查看下一列的值,我只能使用 for 循环,如下所示,
# Walk over the first seven columns of row i; whenever a cell holds the label
# "Medium", add the value stored in the column to its right to the running sum.
for (j in 1:7) {
  if (df[i, j] != "Medium") next
  chancemedium <- chancemedium + df[i, j + 1]
}
有没有更有效的方法来做到这一点?
下面是我的版本,它使用 rowSums 来查找计数,并使用迭代 for 循环来查看下一个值。
# Example data: odd-numbered columns hold a class label and the column
# immediately to the right holds the chance attached to that label.
col1 <- c("High", "Low", "High", "High")
col2 <- c(0.7, 0.9, 0.6, 0.8)
col3 <- c("High", "Low", "High", "Low")
col4 <- c(0.7, 0.19, 0.16, 0.71)
col5 <- c("High", "Low", "Low", "Low")
col6 <- c(0.71, 0.29, 0.46, 0.28)
col7 <- c("Low", "Low", "Low", "High")
col8 <- c(0.11, 0.49, 0.63, 0.20)
# stringsAsFactors = FALSE keeps the labels as plain character vectors on
# every R version, so they can be compared and combined directly.
df <- data.frame(col1, col2, col3, col4, col5, col6, col7, col8,
                 stringsAsFactors = FALSE)

# The order of the classes matters: if two classes end up with exactly the same
# summed chance in a count tie, the class listed last wins
# (High over Medium over Low).
classes <- c("Low", "Medium", "High")
key_cols <- seq(1, 7, by = 2)  # label columns
val_cols <- key_cols + 1       # chance column to the right of each label

# Result columns: the winning class of each row and its mean chance.
df$finalclass <- NA_character_
df$finalchance <- NA_real_

for (i in seq_len(nrow(df))) {
  keys <- unlist(df[i, key_cols], use.names = FALSE)
  vals <- unlist(df[i, val_cols], use.names = FALSE)

  # Per class: number of occurrences in this row and the sum of its chances.
  counts <- vapply(classes, function(cl) sum(keys == cl), numeric(1))
  chances <- vapply(classes, function(cl) sum(vals[keys == cl]), numeric(1))
  highestcount <- max(counts)

  if (sum(counts == highestcount) > 1) {
    # tie case: several classes share the highest count, so the class with the
    # largest summed chance decides.
    winner <- max(which(chances == max(chances)))
  } else {
    # no-tie case: exactly one class reaches the highest count.
    winner <- which.max(counts)
  }

  df[i, "finalclass"] <- classes[winner]
  df[i, "finalchance"] <- chances[[winner]] / highestcount
}
假设列以 "key/value" 对出现,将数据集 ("df") 子集化为值 ('df1') 和键 ('df2') 数据集。
# Values live in the even-numbered columns, keys (class labels) in the
# odd-numbered ones; keep each set as its own data frame.
df1 <- df[, seq(from = 2, to = ncol(df), by = 2), drop = FALSE]
df2 <- df[, seq(from = 1, to = ncol(df), by = 2), drop = FALSE]
要获取每一行中每个 class("High"、"Low"、"Medium")的 "count",我们可以使用 apply 并指定 MARGIN=1。通过将一行中的 class 元素转换为 "factor" 并指定级别 (levels),我们甚至可以得到该行中未出现的级别的计数(即 0)。
# For every row of the key columns, tabulate how often each class occurs.
# Fixing the factor levels keeps a zero entry for classes absent from a row.
t(apply(
  as.matrix(df2),
  MARGIN = 1,
  FUN = function(keys) table(factor(keys, levels = c("High", "Low", "Medium")))
))
# High Low Medium
#1 2 1 1
#2 0 4 0
#3 2 1 1
#4 2 2 0
或者可以使用 qdapTools 包中的一个方便的函数 (mtabulate) 来完成。
# mtabulate() is provided by the third-party qdapTools package.
library(qdapTools)
# Transpose df2 so that each original row becomes one column before handing it
# to mtabulate(); the output shown below has one row of class counts per
# original row.
mtabulate(as.data.frame(t(df2)))
# High Low Medium
#1 2 1 1
#2 0 4 0
#3 2 1 1
#4 2 2 0
为了按行求出不同 class 的 "mean" 值,我们可以用 sapply 遍历数据集 ("df1") 的每一行,并在每一行上使用分组聚合函数 tapply。
# For each row, average the values in df1 grouped by the class labels found at
# the same positions in df2. simplify = FALSE makes the result a list for every
# input; by default sapply() would collapse it into a matrix whenever all rows
# happen to contain the same set of classes.
sapply(seq_len(nrow(df1)), function(i) {
  tapply(unlist(df1[i, ]), unlist(df2[i, ]), FUN = mean)
}, simplify = FALSE)
#[[1]]
# High Low Medium
# 0.475 0.110 0.990
#[[2]]
# Low
#0.4675
#[[3]]
# High Low Medium
# 0.38 0.46 0.63
#[[4]]
# High Low
#0.500 0.495
或者我们可以用 ave 将 "df1" 中的每个元素替换为它所在分组(同一行、同一 class)的平均值。
# Replace every value by the mean of its group, where a group is defined by the
# class label at the same position in df2 together with the row index.
ave(
  as.matrix(df1),
  as.matrix(df2),
  row(df2),
  FUN = mean
)
# col2 col4 col6 col8
#1 0.4750 0.9900 0.4750 0.1100
#2 0.4675 0.4675 0.4675 0.4675
#3 0.3800 0.3800 0.4600 0.6300
#4 0.5000 0.4950 0.4950 0.5000
数据
# Sample data: odd-numbered columns hold class labels, and each even-numbered
# column holds the value belonging to the label on its left.
df <- structure(
  list(
    col1 = c("High", "Low", "High", "High"),
    col2 = c(0.7, 0.9, 0.6, 0.8),
    col3 = c("Medium", "Low", "High", "Low"),
    col4 = c(0.99, 0.19, 0.16, 0.71),
    col5 = c("High", "Low", "Low", "Low"),
    col6 = c(0.25, 0.29, 0.46, 0.28),
    col7 = c("Low", "Low", "Medium", "High"),
    col8 = c(0.11, 0.49, 0.63, 0.2)
  ),
  class = "data.frame",
  row.names = c("1", "2", "3", "4")
)
假设我有一个如下所示的数据框,其中包含类别 (Low、Medium、High) 以及与每个类别相对应的值(位于其右侧的一列)。因此,在第一行中,0.7 和 0.25 是 High 的几率,0.99 是 Medium 的几率,0.11 是 Low 的几率。
col1 col2 col3 col4 col5 col6 col7 col8
1 High 0.7 Medium 0.99 High 0.25 Low 0.11
2 Low 0.9 Low 0.19 Low 0.29 Low 0.49
3 High 0.6 High 0.16 Low 0.46 Medium 0.63
4 High 0.8 Low 0.71 Low 0.28 High 0.20
我想找出一行中每个 class 的计数,以及每个 class 的平均值。
我能够通过使用具有低、中和高条件的 rowSums 找到出现次数最多的 class。 但是要查看下一列的值,我只能使用 for 循环,如下所示,
# Walk over the first seven columns of row i; whenever a cell holds the label
# "Medium", add the value stored in the column to its right to the running sum.
for (j in 1:7) {
  if (df[i, j] != "Medium") next
  chancemedium <- chancemedium + df[i, j + 1]
}
有没有更有效的方法来做到这一点?
下面是我的版本,它使用 rowSums 来查找计数,并使用迭代 for 循环来查看下一个值。
# Example data: odd-numbered columns hold a class label and the column
# immediately to the right holds the chance attached to that label.
col1 <- c("High", "Low", "High", "High")
col2 <- c(0.7, 0.9, 0.6, 0.8)
col3 <- c("High", "Low", "High", "Low")
col4 <- c(0.7, 0.19, 0.16, 0.71)
col5 <- c("High", "Low", "Low", "Low")
col6 <- c(0.71, 0.29, 0.46, 0.28)
col7 <- c("Low", "Low", "Low", "High")
col8 <- c(0.11, 0.49, 0.63, 0.20)
# stringsAsFactors = FALSE keeps the labels as plain character vectors on
# every R version, so they can be compared and combined directly.
df <- data.frame(col1, col2, col3, col4, col5, col6, col7, col8,
                 stringsAsFactors = FALSE)

# The order of the classes matters: if two classes end up with exactly the same
# summed chance in a count tie, the class listed last wins
# (High over Medium over Low).
classes <- c("Low", "Medium", "High")
key_cols <- seq(1, 7, by = 2)  # label columns
val_cols <- key_cols + 1       # chance column to the right of each label

# Result columns: the winning class of each row and its mean chance.
df$finalclass <- NA_character_
df$finalchance <- NA_real_

for (i in seq_len(nrow(df))) {
  keys <- unlist(df[i, key_cols], use.names = FALSE)
  vals <- unlist(df[i, val_cols], use.names = FALSE)

  # Per class: number of occurrences in this row and the sum of its chances.
  counts <- vapply(classes, function(cl) sum(keys == cl), numeric(1))
  chances <- vapply(classes, function(cl) sum(vals[keys == cl]), numeric(1))
  highestcount <- max(counts)

  if (sum(counts == highestcount) > 1) {
    # tie case: several classes share the highest count, so the class with the
    # largest summed chance decides.
    winner <- max(which(chances == max(chances)))
  } else {
    # no-tie case: exactly one class reaches the highest count.
    winner <- which.max(counts)
  }

  df[i, "finalclass"] <- classes[winner]
  df[i, "finalchance"] <- chances[[winner]] / highestcount
}
假设列以 "key/value" 对出现,将数据集 ("df") 子集化为值 ('df1') 和键 ('df2') 数据集。
# Values live in the even-numbered columns, keys (class labels) in the
# odd-numbered ones; keep each set as its own data frame.
df1 <- df[, seq(from = 2, to = ncol(df), by = 2), drop = FALSE]
df2 <- df[, seq(from = 1, to = ncol(df), by = 2), drop = FALSE]
要获取每一行中每个 class("High"、"Low"、"Medium")的 "count",我们可以使用 apply 并指定 MARGIN=1。通过将一行中的 class 元素转换为 "factor" 并指定级别 (levels),我们甚至可以得到该行中未出现的级别的计数(即 0)。
# For every row of the key columns, tabulate how often each class occurs.
# Fixing the factor levels keeps a zero entry for classes absent from a row.
t(apply(
  as.matrix(df2),
  MARGIN = 1,
  FUN = function(keys) table(factor(keys, levels = c("High", "Low", "Medium")))
))
# High Low Medium
#1 2 1 1
#2 0 4 0
#3 2 1 1
#4 2 2 0
或者可以使用 qdapTools 包中的一个方便的函数 (mtabulate) 来完成。
# mtabulate() is provided by the third-party qdapTools package.
library(qdapTools)
# Transpose df2 so that each original row becomes one column before handing it
# to mtabulate(); the output shown below has one row of class counts per
# original row.
mtabulate(as.data.frame(t(df2)))
# High Low Medium
#1 2 1 1
#2 0 4 0
#3 2 1 1
#4 2 2 0
为了按行求出不同 class 的 "mean" 值,我们可以用 sapply 遍历数据集 ("df1") 的每一行,并在每一行上使用分组聚合函数 tapply。
# For each row, average the values in df1 grouped by the class labels found at
# the same positions in df2. simplify = FALSE makes the result a list for every
# input; by default sapply() would collapse it into a matrix whenever all rows
# happen to contain the same set of classes.
sapply(seq_len(nrow(df1)), function(i) {
  tapply(unlist(df1[i, ]), unlist(df2[i, ]), FUN = mean)
}, simplify = FALSE)
#[[1]]
# High Low Medium
# 0.475 0.110 0.990
#[[2]]
# Low
#0.4675
#[[3]]
# High Low Medium
# 0.38 0.46 0.63
#[[4]]
# High Low
#0.500 0.495
或者我们可以用 ave 将 "df1" 中的每个元素替换为它所在分组(同一行、同一 class)的平均值。
# Replace every value by the mean of its group, where a group is defined by the
# class label at the same position in df2 together with the row index.
ave(
  as.matrix(df1),
  as.matrix(df2),
  row(df2),
  FUN = mean
)
# col2 col4 col6 col8
#1 0.4750 0.9900 0.4750 0.1100
#2 0.4675 0.4675 0.4675 0.4675
#3 0.3800 0.3800 0.4600 0.6300
#4 0.5000 0.4950 0.4950 0.5000
数据
# Sample data: odd-numbered columns hold class labels, and each even-numbered
# column holds the value belonging to the label on its left.
df <- structure(
  list(
    col1 = c("High", "Low", "High", "High"),
    col2 = c(0.7, 0.9, 0.6, 0.8),
    col3 = c("Medium", "Low", "High", "Low"),
    col4 = c(0.99, 0.19, 0.16, 0.71),
    col5 = c("High", "Low", "Low", "Low"),
    col6 = c(0.25, 0.29, 0.46, 0.28),
    col7 = c("Low", "Low", "Medium", "High"),
    col8 = c(0.11, 0.49, 0.63, 0.2)
  ),
  class = "data.frame",
  row.names = c("1", "2", "3", "4")
)