如果值存在于具有匹配 ID 的其他矩阵的行中的任何位置,则用“1”填充单元格——R

If value is present anywhere in row of other matrix with matching ID, populate cell with "1" -- R

我正在尝试创建一个矩阵以最终 运行 评分者间的可靠性。我正在尝试根据字符串是否存在于第二个或第三个矩阵中具有匹配 ID 的行中,用 TRUE 和 FALSE(或 1/0)填充矩阵。我在底部包含了它应该是什么样子。

下面是我的可重现示例,包括两个现有矩阵和我迄今为止为生成我想要的最终矩阵所做的尝试。我能够做到这一点,我可以确认我正在选择与我想要匹配的矩阵相匹配的正确列(请参阅列名称中包含“m1”的所有列中带有“m1”的输出) .我还没有想出如何进入下一阶段正确匹配 m1.mat 和最终矩阵 reliability.ex 之间的 id 列。在 excel 中,这类似于 VLOOKUP,但是当我在 R 中搜索 VLOOKUP 等价物时,我只得到 join/merge 函数,我 认为 会为我需要的东西工作,但也许我错了。我尝试在 excel 中完成所有这些,但最终卡住了,如果可能的话,宁愿在 R 中使用它。

require(stringr)

set.seed(327)
ids <- sample(1:1000, 5)

m.cols <- c("id", "IP1", "IP2", "IP3", "IP4", "IP5")
m1.mat <- matrix(data=NA, nrow=5, ncol=6)
colnames(m1.mat) <- m.cols
m1.mat[1,] <- c(ids[1], "abc", "ghi", NA, NA, NA)
m1.mat[2,] <- c(ids[2], "def", NA, NA, NA, NA)
m1.mat[3,] <- c(ids[3], "mno", "jkl", NA, NA, NA)
m1.mat[4,] <- c(ids[4], "ghi", "abc", NA, NA, NA)
m1.mat[5,] <- c(ids[5], "abc", "def", "ghi", "jkl", "mno")

m2.mat <- matrix(data=NA, nrow=5, ncol=6)
colnames(m2.mat) <- m.cols
m2.mat[1,] <- c(ids[1], "def", "ghi", NA, NA, NA)
m2.mat[2,] <- c(ids[2], "def", "mno", NA, NA, NA)
m2.mat[3,] <- c(ids[3], "mno", "jkl", "abc", NA, NA)
m2.mat[4,] <- c(ids[4], "ghi", "abc", NA, NA, NA)
m2.mat[5,] <- c(ids[5], "abc", "def", "ghi", "jkl", "mno")


reliability.ex <- matrix(data=NA, nrow=5, ncol=11)
ex.cols <- c("id", "abc_m1", "abc_m2", "def_m1", "def_m2", "ghi_m1", "ghi_m2", "jkl_m1", "jkl_m2", "mno_m1", "mno_m2")
colnames(reliability.ex) <- ex.cols
reliability.ex[,1] <- ids
ip.indx <- grepl('m1', colnames(reliability.ex))


for (i in 1:nrow(reliability.ex)) {
  for(j in 1:ncol(reliability.ex)) {
    if (grepl("m1", colnames(reliability.ex)[j])==TRUE) {
            reliability.ex[i,j] <- "m1"
    }
  }
}

下面是基于上述代码的矩阵:

> m1.mat
     id    IP1   IP2   IP3   IP4   IP5  
[1,] "345" "abc" "ghi" NA    NA    NA   
[2,] "615" "def" NA    NA    NA    NA   
[3,] "478" "mno" "jkl" NA    NA    NA   
[4,] "792" "ghi" "abc" NA    NA    NA   
[5,] "881" "abc" "def" "ghi" "jkl" "no"

> m2.mat
     id    IP1   IP2   IP3   IP4   IP5  
[1,] "345" "def" "ghi" NA    NA    NA   
[2,] "615" "def" "mno" NA    NA    NA   
[3,] "478" "mno" "jkl" "abc" NA    NA   
[4,] "792" "ghi" "abc" NA    NA    NA   
[5,] "881" "abc" "def" "ghi" "jkl" "mno" 

> reliability.ex
         id    abc_m1 abc_m2 def_m1 def_m2 ghi_m1 ghi_m2 jkl_m1 jkl_m2 mno_m1 mno_m2
    [1,] "345" "m1"   NA     "m1"   NA     "m1"   NA     "m1"   NA     "m1"   NA    
    [2,] "615" "m1"   NA     "m1"   NA     "m1"   NA     "m1"   NA     "m1"   NA    
    [3,] "478" "m1"   NA     "m1"   NA     "m1"   NA     "m1"   NA     "m1"   NA    
    [4,] "792" "m1"   NA     "m1"   NA     "m1"   NA     "m1"   NA     "m1"   NA    
    [5,] "881" "m1"   NA     "m1"   NA     "m1"   NA     "m1"   NA     "m1"   NA  

这就是我希望能够生成的内容,而不是当前命名的内容 reliability.ex:

> reliability.desired
     id    abc_m1 abc_m2 def_m1 def_m2 ghi_m1 ghi_m2 jkl_m1 jkl_m2 mno_m1 mno_m2
[1,] "345" "1"    "0"    "0"    "1"    "1"    "1"    "0"    "0"    "0"    "0"   
[2,] "615" "0"    "0"    "1"    "1"    "0"    "0"    "0"    "0"    "0"    "1"   
[3,] "478" "0"    "1"    "0"    "0"    "0"    "0"    "1"    "1"    "1"    "1"   
[4,] "792" "1"    "1"    "0"    "0"    "1"    "1"    "0"    "0"    "0"    "0"   
[5,] "881" "1"    "1"    "1"    "1"    "1"    "1"    "1"    "1"    "1"    "1"   

感谢任何帮助!我还在研究 R.

library(dplyr)
library(tidyr)
make_rmat <- function(mat) {
    mat %>% 
        data.frame() %>% 
        pivot_longer(!id) %>% 
        pivot_wider(!name, names_from=value, values_fn = list) %>% 
        select(!`NA`) %>% 
        unnest(!id) %>% 
        mutate(across(!id, ~ifelse(is.na(.x), 0, 1)))
}
reliability.desired <- merge(make_rmat(m1.mat), make_rmat(m2.mat), by="id",suffixes=c("_m1","_m2"))

您可以使用 outer 方法(outer 计算矩阵的外积,并且可以(滥用)用于将函数置换应用于两个列表的元素,与mapply() 只能连续这样做)。

首先sort将所有unique值转化为向量u。然后定义一个 Vectorized 函数 FUN,它检查 uany 是否在特定的矩阵单元格中。将 FUN 放在 outerlapply 两个矩阵之上。 aperm将得到的数组适当转置,计算rowSums,即u的具体值是否在整个矩阵中。最后只是做一些 colnames 的事情和 cbind 的事情。

u <- sort(unique(c(m1.mat[,-1], m2.mat[,-1])))
FUN <- Vectorize(\(m, u) any(u %in% m))
rr <- lapply(list(m1=m1.mat, m2=m2.mat), \(m) 
       `colnames<-`(rowSums(
         aperm(outer(m[,-1], u, FUN), c(1, 3, 2)), 
         dim=2), u))
rr <- do.call(cbind, 
              Map(\(x, y) 
                   `colnames<-`(x, paste0(colnames(x), '_', y)), rr, names(rr)))
rr <- cbind(id=m1.mat[,1], rr[, order(colnames(rr))])
rr
#   id    abc_m1 abc_m2 def_m1 def_m2 ghi_m1 ghi_m2 jkl_m1 jkl_m2 mno_m1 mno_m2
# [1,] "345" "1"    "0"    "0"    "1"    "1"    "1"    "0"    "0"    "0"    "0"   
# [2,] "615" "0"    "0"    "1"    "1"    "0"    "0"    "0"    "0"    "0"    "1"   
# [3,] "478" "0"    "1"    "0"    "0"    "0"    "0"    "1"    "1"    "1"    "1"   
# [4,] "792" "1"    "1"    "0"    "0"    "1"    "1"    "0"    "0"    "0"    "0"   
# [5,] "881" "1"    "1"    "1"    "1"    "1"    "1"    "1"    "1"    "1"    "1"  

注意:我使用了新的 R4.1.* \(x) 符号来定义函数。如果没有,请使用 function(x) 代替。

首先,最好将每个案例的 ids 放在行名中,以免将 ID 与实际编码混淆。然后,您可以在不使用 stringr 或 dplyr 的情况下循环它:


set.seed(327)
ids <- sample(1:1000, 5)

m.cols <- c("IP1", "IP2", "IP3", "IP4", "IP5")
m1.mat <- matrix(data=NA, nrow=5, ncol=5)
colnames(m1.mat) <- m.cols
rownames(m1.mat) <- ids ## Store id's in the rownames
m1.mat[1,] <- c("abc", "ghi", NA, NA, NA)
m1.mat[2,] <- c("def", NA, NA, NA, NA)
m1.mat[3,] <- c("mno", "jkl", NA, NA, NA)
m1.mat[4,] <- c("ghi", "abc", NA, NA, NA)
m1.mat[5,] <- c("abc", "def", "ghi", "jkl", "mno")

m2.mat <- matrix(data=NA, nrow=5, ncol=5)
colnames(m2.mat) <- m.cols
rownames(m2.mat) <- ids ## Store id's in the rownames
m2.mat[1,] <- c("def", "ghi", NA, NA, NA)
m2.mat[2,] <- c("def", "mno", NA, NA, NA)
m2.mat[3,] <- c("mno", "jkl", "abc", NA, NA)
m2.mat[4,] <- c("ghi", "abc", NA, NA, NA)
m2.mat[5,] <- c("abc", "def", "ghi", "jkl", "mno")

possible = unique(c(m1.mat,m2.mat))   ## Get all possible values
possible = possible[!is.na(possible)] ## NA is not a valid field

reliability.ex <- matrix(data=0, nrow=length(ids), ncol=length(possible)*2)
rownames(reliability.ex) = ids ## Store id's in the rownames
colnames(reliability.ex) = 1:(length(possible)*2)

c0 = 1
for(c in possible){
  colnames(reliability.ex)[c0]=paste(c,"m1",sep="_")
  colnames(reliability.ex)[c0+1]=paste(c,"m2",sep="_")
  for(i in as.character(ids)){ ## Use the rownames to match codings
    if(c %in% m1.mat[i,]){reliability.ex[i,c0]=1}
    if(c %in% m2.mat[i,]){reliability.ex[i,c0+1]=1}
  }
  c0=c0+2
}

但是,在计算评分者间的可靠性时,最好不要单独存储 m1 和 m2 的值,而是已经存储协议。在上面 for 循环的最里面,代码测试了这种情况下两个矩阵中是否存在元素。您可以很容易地计算出此时此代码的一致性 (agreement = c %in% m1.mat[i,] == c %in% m2.mat[i,])。这可能值得为后续步骤做(和存储)。 否则,您将不得不再次遍历该矩阵以计算协议。

我的意思是:

agree <- matrix(data=FALSE, nrow=length(ids), ncol=length(possible))
rownames(agree) = ids
colnames(agree) = possible

for(c in possible){
  for(i in as.character(ids)){
    if(c %in% m1.mat[i,]==c %in% m2.mat[i,]){agree[i,c]=TRUE}
  }
}

这将产生输出:

> agree  ## Cross table
      abc   def   mno  ghi  jkl
345 FALSE FALSE  TRUE TRUE TRUE
615  TRUE  TRUE FALSE TRUE TRUE
478 FALSE  TRUE  TRUE TRUE TRUE
792  TRUE  TRUE  TRUE TRUE TRUE
881  TRUE  TRUE  TRUE TRUE TRUE
> sum(agree)/length(agree)  ## Holsti Agreement
[1] 0.84