在 R 中遍历 2 个数据集,将一个数据集中所有行的值与另一个数据集的某一列进行匹配

Loop over 2 datasets in R to match the values of all rows from one dataset with only one column of another dataset

我正在尝试在 R 中编写一个循环,以对名为 datasetA 和 datasetB 的两个数据集执行一些迭代。

datasetA 有 600 个条目,datasetB 有 200,000 个条目。 对于 datasetA 中的每个条目,我要执行以下操作:

如果两个数据集中 V2 的值相等,则计算 ppm:

(datasetA$V3 - datasetB$V3) / datasetA$V3 * 1000000

如果 |ppm| < 10,则将 ppm 值写入 datasetB$V4 列,并将 datasetA$V1 中对应的名称写入 datasetB 的 V1 列。

假设这是 datasetA,有 600 个条目:

# Five-row sample of datasetA (the real table is said to hold 600 entries).
# read.table() assigns the default column names V1, V2 and V3.
datasetA <- read.table(
  header = FALSE,
  text = 'Alex    1   50.00042
John    1   60.000423
Janine    3   88.000123
Aline    3   117
Mark    2    79.9999'
)

DatasetA

这是一个包含 200000 个条目的 datasetB 示例:

# Eight-row sample of datasetB (the real table is said to hold 200,000
# entries). V1 and V4 are all-NA placeholders that the matching step fills in.
datasetB <- read.table(
  header = FALSE,
  text = 'NA    1   50.0001    NA
NA    1   50.00032    NA
NA    2   70    NA
NA    2   80    NA
NA    3   88.0004    NA
NA    3   100    NA
NA    3   101    NA
NA    2    102    NA'
)

DatasetB

最后的 table 应该是这样的:

# Expected result as given in the question: datasetB with V1 (name) and
# V4 (ppm) filled in for the rows that found a match, NA elsewhere.
datasetC <- read.table(
  header = FALSE,
  text = 'Alex    1   50.0001    6.459945
Alex    1   50.00032    2.059983
NA    2   70    NA
Mark    2   80    -1.25
Janine    3   88.0004    -3.14772
NA    3   100    NA
NA    3   101    NA
NA    2    102    NA'
)

The final table should look like this

试试这个:我是 R 菜鸟,但如果这对你有用,请告诉我。

library(data.table)

# Rebuild the sample inputs from the question.
datasetA <- read.table(text = 'Alex    1   50.00042
John    1   60.000423
                      Janine    3   88.000123
                      Aline    3   117
                      Mark    2    79.9999')
datasetB <- read.table(text = 'NA    1   50.0001    NA
NA    1   50.00032    NA
NA    2   70    NA
NA    2   80    NA
NA    3   88.0004    NA
NA    3   100    NA
NA    3   101    NA
NA    2    102    NA')

# Work on data.tables and swap the generic V1, V2, ... names for descriptive
# ones so the join below is easier to follow.
datasetA <- as.data.table(datasetA)
setnames(datasetA, c("Name", "ID", "ValueA"))
datasetB <- as.data.table(datasetB)
setnames(datasetB, c("V1", "ID", "ValueB", "V4"))

# Row number of each datasetB entry; used to map matches back onto their rows.
datasetB[, key := .I]

# Left join: pair every datasetB row with each datasetA row sharing its ID.
datasetC <- merge(datasetB, datasetA, by = "ID", all.x = TRUE)

# ppm for each pair, then keep only the pairs within +/- 10 ppm together with
# the columns needed to write the match back.
datasetC[, ppm := 1000000 * (ValueA - ValueB) / ValueA]
datasetC <- datasetC[abs(ppm) < 10, .(key, Name, ppm)]

# Keyed join on the row number: every datasetB row is kept, and rows without
# a match get NA for Name and ppm.
setkey(datasetB, key)
setkey(datasetC, key)
datasetB <- datasetC[datasetB]

# Back to the V1..V4 layout requested in the question.
datasetB <- datasetB[, .(V1 = Name, V2 = ID, V3 = ValueB, V4 = ppm)]

以下答案似乎符合问题的要求,但其中 2 个计算值(最后一列 V4)与问题给出的预期结果不一致。

# Distinct V2 groups present in datasetA, in ascending order.
AV2 <- sort(unique(datasetA$V2))

# For each group, pair every datasetA row with every datasetB row of that
# group and keep the pairs whose ppm lies within +/- 10.
res <- lapply(AV2, function(grp) {
  rows_a <- datasetA[datasetA$V2 == grp, ]
  rows_b <- datasetB[datasetB$V2 == grp, ]
  pairs <- merge(rows_a, rows_b, by = "V2")
  ppm <- ((pairs$V3.x - pairs$V3.y) / pairs$V3.x) * 1000000
  keep <- abs(ppm) < 10
  # Name from datasetA, shared V2, and datasetB's V3, followed by the ppm.
  cbind(pairs[keep, c("V1.x", "V2", "V3.y")], ppm = ppm[keep])
})
res <- do.call(rbind, res)
names(res) <- paste0("V", 1:4)
row.names(res) <- NULL

# Attach the matches to every datasetB row (all.y keeps unmatched rows as NA),
# then keep the matched name and ppm next to datasetB's own V2 and V3.
final <- merge(res, datasetB, by = c("V2", "V3"), all.y = TRUE)
final <- final[c("V1.x", "V2", "V3", "V4.x")]
names(final) <- paste0("V", 1:4)

final
#      V1 V2        V3        V4
#1   Alex  1  50.00010  6.399946
#2   Alex  1  50.00032  1.999983
#3   <NA>  2  70.00000        NA
#4   Mark  2  80.00000 -1.250002
#5   <NA>  2 102.00000        NA
#6 Janine  3  88.00040 -3.147723
#7   <NA>  3 100.00000        NA
#8   <NA>  3 101.00000        NA
# Fill V1 (name) and V4 (ppm) of a copy of datasetB from datasetA.
# A datasetB row is matched by a datasetA row when both share V2 and
# |(A$V3 - B$V3) / A$V3 * 1e6| < 10. If several datasetA rows match the same
# datasetB row, the last one wins.
data <- datasetB

# V1 and V4 arrive as all-NA placeholder columns; give them their final types
# up front so names and ppm values can be stored without surprises.
data$V1 <- as.character(data$V1)
data$V4 <- as.numeric(data$V4)

# Loop over datasetA only (sized from the data, not hard-coded) and compare
# each of its rows against all datasetB rows at once.
for (i in seq_len(nrow(datasetA))) {
  ppm <- (datasetA$V3[i] - datasetB$V3) / datasetA$V3[i] * 10^6
  # which() drops NA comparisons instead of aborting on them.
  hit <- which(datasetB$V2 == datasetA$V2[i] & abs(ppm) < 10)
  data$V1[hit] <- as.character(datasetA$V1[i])
  data$V4[hit] <- ppm[hit]
}
data

如果我没理解错的话,问题要求的是一种带有复杂条件的联接。这可以使用 data.table 来实现:
library(data.table)

# Convert both tables to data.table by reference.
setDT(datasetA)
setDT(datasetB)

# For every row of datasetB, look at ALL datasetA rows sharing its V2 and take
# the first one within +/- 10 ppm; rows without such a match get NA.
# Inside j, bare V1 / V3 are datasetA's columns and i.V3 is datasetB's.
# ppm is relative to datasetA's V3, as specified in the question.
datasetA[datasetB, {
  ppm <- (V3 - i.V3) / V3 * 1e6
  hit <- which(abs(ppm) < 10)[1L]  # NA when nothing is within tolerance
  list(V1 = as.character(V1)[hit], V3 = i.V3, V4 = ppm[hit])
}, on = "V2", by = .EACHI, allow.cartesian = TRUE][, .(V1, V2, V3, V4)]
#        V1 V2        V3        V4
# 1:   Alex  1  50.00010  6.399946
# 2:   Alex  1  50.00032  1.999983
# 3:   <NA>  2  70.00000        NA
# 4:   Mark  2  80.00000 -1.250002
# 5: Janine  3  88.00040 -3.147723
# 6:   <NA>  3 100.00000        NA
# 7:   <NA>  3 101.00000        NA
# 8:   <NA>  2 102.00000        NA

这是一种替代方法,它通过 update join 更新 datasetB:

library(data.table)

# Convert both tables to data.table by reference.
setDT(datasetA)
setDT(datasetB)

# All (datasetA, datasetB) pairs sharing V2, reduced to those within
# +/- 10 ppm. Inside j, bare V1 / V3 are datasetA's columns and i.V3 is
# datasetB's; ppm is relative to datasetA's V3, as specified in the question.
# allow.cartesian is needed because V2 repeats in both tables, so the
# intermediate result can exceed nrow(datasetA) + nrow(datasetB).
tmp <- datasetA[
  datasetB,
  .(name = as.character(V1), V2, b_V3 = i.V3, ppm = (V3 - i.V3) / V3 * 1e6),
  on = "V2",
  allow.cartesian = TRUE
][abs(ppm) < 10]

# V1 and V4 are all-NA placeholder columns; give them their final types before
# values are assigned into a subset of rows.
datasetB[, `:=`(V1 = as.character(V1), V4 = as.double(V4))]

# Update join: write name and ppm into the datasetB rows identified by
# (V2, V3); all other rows keep NA.
datasetB[tmp, on = .(V2, V3 = b_V3), `:=`(V1 = i.name, V4 = i.ppm)][]
#        V1 V2        V3        V4
# 1:   Alex  1  50.00010  6.399946
# 2:   Alex  1  50.00032  1.999983
# 3:   <NA>  2  70.00000        NA
# 4:   Mark  2  80.00000 -1.250002
# 5: Janine  3  88.00040 -3.147723
# 6:   <NA>  3 100.00000        NA
# 7:   <NA>  3 101.00000        NA
# 8:   <NA>  2 102.00000        NA