根据上一列中的条件查看数据框中的下一列

Question

假设我有一个如下所示的数据框，其中包含 class（Low、Medium、High）以及与每个 class 对应的值。例如，在第一行中，0.7 和 0.25 是 High 的几率，0.99 是 Medium 的几率，0.11 是 Low 的几率。

    col1 col2   col3 col4 col5 col6   col7 col8 
1   High  0.7 Medium 0.99 High 0.25    Low 0.11 
2    Low  0.9    Low 0.19  Low 0.29    Low 0.49 
3   High  0.6   High 0.16  Low 0.46 Medium 0.63 
4   High  0.8    Low 0.71  Low 0.28   High 0.20

我想找出一行中每个 class 的计数，以及每个 class 的平均值。

我能够通过使用具有低、中和高条件的 rowSums 找到出现次数最多的 class。 但是要查看下一列的值，我只能使用 for 循环，如下所示，

# Walk across row i; whenever a cell holds "Medium", add the value stored in
# the column immediately to its right to the running total.
for (k in seq_len(7)) {
  if (df[i, k] != "Medium") {
    next
  }
  chancemedium <- chancemedium + df[i, k + 1]
}

有没有更有效的方法来做到这一点？

下面是我的版本，它使用 rowSums 来查找计数，并使用迭代 for 循环来查看下一个值。

# Example data: odd-numbered columns hold class labels and each even-numbered
# column holds the chance belonging to the label immediately to its left.
col1 <- c("High", "Low", "High", "High")
col2 <- c(0.7, 0.9, 0.6, 0.8)
col3 <- c("High", "Low", "High", "Low")
col4 <- c(0.7, 0.19, 0.16, 0.71)
col5 <- c("High", "Low", "Low", "Low")
col6 <- c(0.71, 0.29, 0.46, 0.28)
col7 <- c("Low", "Low", "Low", "High")
col8 <- c(0.11, 0.49, 0.63, 0.20)

df <- data.frame(col1, col2, col3, col4, col5, col6, col7, col8)

# Positions of the label/value columns, taken before the result columns are
# appended so that the results are never mistaken for input pairs.
label_cols <- seq(1, ncol(df) - 1, by = 2)
value_cols <- label_cols + 1

# Order matters: when several classes qualify, the one listed last wins
# (High over Medium over Low).
classes <- c("Low", "Medium", "High")

# Preallocate the result columns; every row is filled in by the loop below.
df$finalclass <- NA_character_
df$finalchance <- NA_real_

for (i in seq_len(nrow(df))) {
  labels <- vapply(df[i, label_cols], as.character, character(1),
                   USE.NAMES = FALSE)
  values <- unlist(df[i, value_cols], use.names = FALSE)

  # Per-class occurrence count and summed chance for this row.
  counts <- vapply(classes, function(cl) sum(labels == cl), numeric(1))
  chances <- vapply(classes, function(cl) sum(values[labels == cl]),
                    numeric(1))
  highestcount <- max(counts)

  if (sum(counts == highestcount) > 1) {
    # Tie on the count: the class with the largest summed chance wins.
    candidates <- which(chances == max(chances))
  } else {
    # No tie: the single most frequent class wins.
    candidates <- which(counts == highestcount)
  }
  winner <- max(candidates)

  df[i, "finalclass"] <- classes[winner]
  # Summed chance of the winning class divided by the highest count.
  df[i, "finalchance"] <- chances[[winner]] / highestcount
}

Answer 1

假设列以 "key/value" 对出现，将数据集 ("df") 子集化为值 ('df1') 和键 ('df2') 数据集。

# Keys (class labels) sit in the odd columns, values in the even columns.
df2 <- df[seq(from = 1, to = ncol(df), by = 2)]
df1 <- df[seq(from = 2, to = ncol(df), by = 2)]

要获取每一行中每个 class（"High"、"Low"、"Medium"）的 "count"，我们可以使用 apply 并指定 MARGIN=1。通过将一行中的 class 元素转换为 "factor" 并指定级别，我们甚至可以获得该行中缺失级别的计数（计为 0）。

 # Tabulate each row against a fixed set of levels so that classes absent
 # from a row still show up with a count of 0.
 class_levels <- c('High', 'Low', 'Medium')
 t(apply(df2, 1, function(labels) {
   table(factor(labels, levels = class_levels))
 }))
 #  High Low Medium
 #1    2   1      1
 #2    0   4      0
 #3    2   1      1
 #4    2   2      0

或者可以使用 qdapTools 中的一个方便的函数 (mtabulate) 来完成。

 library(qdapTools)
 # Transpose first so that every original row is passed to mtabulate as a
 # column of the data frame.
 transposed_keys <- as.data.frame(t(df2))
 mtabulate(transposed_keys)
 #  High Low Medium
 #1    2   1      1
 #2    0   4      0
 #3    2   1      1
 #4    2   2      0

为了按行查找不同 class 的 "mean" 值，我们可以遍历数据集 ("df1") 的行 (sapply) 和使用聚合函数 (tapply)。

# For every row, average the values (df1) grouped by their class label (df2).
# simplify = FALSE keeps the result a list even when every row happens to
# contain the same set of classes; without it sapply would silently return a
# matrix in that case.
sapply(
  seq_len(nrow(df1)),
  function(i) tapply(unlist(df1[i, ]), unlist(df2[i, ]), FUN = mean),
  simplify = FALSE
)
#[[1]]
# High    Low Medium 
# 0.475  0.110  0.990 

#[[2]]
# Low 
#0.4675 

#[[3]]
# High    Low Medium 
#  0.38   0.46   0.63 

#[[4]]
# High   Low 
#0.500 0.495

或者我们可以用 ave 将 "df1" 中的每个元素替换为它所在行中同一 class 的分组平均值

# Group by (class label, row index) and replace each value with its group mean.
value_matrix <- as.matrix(df1)
label_matrix <- as.matrix(df2)
ave(value_matrix, label_matrix, row(label_matrix))
#   col2   col4   col6   col8
#1 0.4750 0.9900 0.4750 0.1100
#2 0.4675 0.4675 0.4675 0.4675
#3 0.3800 0.3800 0.4600 0.6300
#4 0.5000 0.4950 0.4950 0.5000

数据

# Example data used by the answer: odd columns are class labels, even columns
# are the values paired with them. Column names come from the list itself.
df <- structure(
  list(
    col1 = c("High", "Low", "High", "High"),
    col2 = c(0.7, 0.9, 0.6, 0.8),
    col3 = c("Medium", "Low", "High", "Low"),
    col4 = c(0.99, 0.19, 0.16, 0.71),
    col5 = c("High", "Low", "Low", "Low"),
    col6 = c(0.25, 0.29, 0.46, 0.28),
    col7 = c("Low", "Low", "Medium", "High"),
    col8 = c(0.11, 0.49, 0.63, 0.2)
  ),
  class = "data.frame",
  row.names = c("1", "2", "3", "4")
)

根据上一列中的条件查看数据框中的下一列

looking at the next column in a data frame based on a condition in previous column

r

rstudio

数据