循环 R 中的 2 个数据集,以匹配一个数据集中所有行的值与另一个数据集的仅一列
loop over 2 datasets in R to match the value of all rows from one dateset with only one column of another dateset
我正在尝试在 R 中编写一个循环,以对名为 datasetA
和 datasetB
的两个数据集执行一些迭代。
datasetA
有 600 个条目,datasetB
有 200,000 个条目。
对于 datasetA
中的每个条目,我要执行以下操作:
如果两个数据集中V2
的值相等,
然后计算 ppm:
(datasetA$V3 - datasetB$V3) / datasetA$V3 * 1000000
如果 |ppm| < 10,则将 ppm 值写入 datasetB 的 V4 列,并将 datasetA$V1 中对应的名称写入 datasetB 的 V1 列。
假设这是 datasetA
,有 600 个条目:
# Five-row sample of datasetA (the question says the real table has 600 rows).
# read.table() assigns the default column names V1, V2 and V3.
datasetA <- read.table(
  text = "Alex 1 50.00042
John 1 60.000423
Janine 3 88.000123
Aline 3 117
Mark 2 79.9999"
)
DatasetA
这是一个包含 200000 个条目的 datasetB
示例:
# Eight-row sample of datasetB (the question says the real table has 200,000
# rows). V1 and V4 start out entirely NA; V2 and V3 carry the data.
datasetB <- read.table(
  text = "NA 1 50.0001 NA
NA 1 50.00032 NA
NA 2 70 NA
NA 2 80 NA
NA 3 88.0004 NA
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA"
)
DatasetB
最后的 table 应该是这样的:
# Result table the asker expects: datasetB with V1 (name) and V4 (ppm) filled
# in wherever a datasetA row matches.
# NOTE(review): the first two V4 values (6.459945, 2.059983) do not follow from
# (A$V3 - B$V3) / A$V3 * 1e6 with the sample datasetA shown in the question,
# which gives about 6.399946 and 1.999983 -- confirm with the asker.
datasetC <- read.table(
  text = "Alex 1 50.0001 6.459945
Alex 1 50.00032 2.059983
NA 2 70 NA
Mark 2 80 -1.25
Janine 3 88.0004 -3.14772
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA"
)
The final table should look like this
试试这个:我是 R 菜鸟,但如果这对你有用,请告诉我。
library(data.table)

# Example inputs, read with read.table()'s default V1..Vn column names.
datasetA <- read.table(
  text = "Alex 1 50.00042
John 1 60.000423
Janine 3 88.000123
Aline 3 117
Mark 2 79.9999"
)
datasetB <- read.table(
  text = "NA 1 50.0001 NA
NA 1 50.00032 NA
NA 2 70 NA
NA 2 80 NA
NA 3 88.0004 NA
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA"
)

# Descriptive column names: "ID" is the join column, ValueA / ValueB are the
# two numbers compared in the ppm formula.
names(datasetA) <- c("Name", "ID", "ValueA")
names(datasetB) <- c("V1", "ID", "ValueB", "V4")

# Number the datasetB rows so each one can be found again after the joins.
datasetB[["key"]] <- seq_len(nrow(datasetB))

# Convert both inputs to data.table before joining.
datasetB <- as.data.table(datasetB)
datasetA <- as.data.table(datasetA)

# Left join: every datasetB row paired with every datasetA row of the same ID.
datasetC <- merge(datasetB, datasetA, by = "ID", all.x = TRUE)

# Relative difference in parts per million, with datasetA's value as reference.
datasetC[, ppm := 1000000 * (ValueA - ValueB) / ValueA]

# Keep only the pairs inside the +/- 10 ppm window, and only the columns that
# are needed to write the result back.
datasetC <- datasetC[abs(ppm) < 10, .(key, Name, ppm)]

# Attach the surviving matches to the full datasetB via the row key; rows
# without a match get NA for Name and ppm.
setkey(datasetC, key)
setkey(datasetB, key)
datasetB <- datasetC[datasetB]

# Restore the V1..V4 layout requested in the question.
datasetB <- datasetB[, .(V1 = Name, V2 = ID, V3 = ValueB, V4 = ppm)]
以下答案似乎符合问题的要求,但最后一列 V4 中有 2 个计算值我未能得到与问题中相同的结果。
# Distinct V2 values present in datasetA, in ascending order.
AV2 <- sort(unique(datasetA$V2))

# For each V2 value: pair up the datasetA and datasetB rows sharing it, compute
# ppm for every pair and keep the pairs with |ppm| < 10.
res <- lapply(AV2, function(grp) {
  rows_a <- datasetA[datasetA$V2 == grp, ]
  rows_b <- datasetB[datasetB$V2 == grp, ]
  pairs <- merge(rows_a, rows_b, by = "V2")
  ppm <- ((pairs$V3.x - pairs$V3.y) / pairs$V3.x) * 1000000
  keep <- abs(ppm) < 10
  # Columns 2, 1, 5 of the merge are V1.x (name), V2 and V3.y (datasetB value).
  cbind(pairs[keep, c(2, 1, 5)], ppm = ppm[keep])
})

# Stack the per-group results into one table named V1..V4.
res <- do.call(rbind, res)
names(res) <- paste0("V", 1:4)
row.names(res) <- NULL

# Right join onto datasetB so unmatched rows are kept (with NA), then pick the
# name, V2, V3 and ppm columns in that order.
final <- merge(
  res, datasetB,
  by = c("V2", "V3"),
  all.y = TRUE
)[c(3, 1, 2, 4)]
names(final) <- paste0("V", 1:4)
final
#      V1 V2        V3        V4
#1   Alex  1  50.00010  6.399946
#2   Alex  1  50.00032  1.999983
#3   <NA>  2  70.00000        NA
#4   Mark  2  80.00000 -1.250002
#5   <NA>  2 102.00000        NA
#6 Janine  3  88.00040 -3.147723
#7   <NA>  3 100.00000        NA
#8   <NA>  3 101.00000        NA
# Fill V1 (name) and V4 (ppm) of a copy of datasetB from datasetA.
#
# For each datasetA row, every datasetB row with the same V2 and
# |ppm| < 10 receives that row's name and ppm, where
#   ppm = (A$V3 - B$V3) / A$V3 * 1e6.
# If several datasetA rows qualify for the same datasetB row, the last one
# wins (same as the original nested loop).
#
# The loop runs over the actual number of datasetA rows instead of the
# hard-coded 1:5 / 1:8, and the scan over datasetB is vectorised, so the
# code also works for the full-size tables.
data <- datasetB
for (i in seq_len(nrow(datasetA))) {
  # ppm of this datasetA row against every datasetB row at once.
  ppm <- (datasetA$V3[i] - datasetB$V3) / datasetA$V3[i] * 1e6
  # which() drops NA comparisons, so rows with missing values never match.
  hit <- which(datasetA$V2[i] == datasetB$V2 & abs(ppm) < 10)
  if (length(hit) > 0) {
    # as.character() keeps the name readable even if V1 was read as a factor.
    data[hit, 1] <- as.character(datasetA[[1L]][i])
    data[hit, 4] <- ppm[hit]
  }
}
data
如果我没理解错的话,问题要求的是带有复杂条件的联接。这可以使用 data.table 来实现:
library(data.table)
setDT(datasetA)
setDT(datasetB)

# Conditional join: for every row of datasetB (by = .EACHI) look at ALL
# datasetA rows with the same V2 and report the first one whose ppm lies
# inside the +/- 10 window. Rows without such a partner get NA in V1 and V4.
#
# Two changes from the earlier version of this snippet:
#  * ppm is divided by datasetA's V3, as the question's formula requires
#    (it used to be divided by datasetB's V3);
#  * mult = "first" is gone, because it tested only the first datasetA row of
#    each V2 group and missed matches further down the group.
datasetA[datasetB, on = "V2", {
  # Unprefixed V1 / V3 are datasetA's columns; i.V3 is the datasetB value.
  ppm <- (V3 - i.V3) / V3 * 1E6
  # Index of the first in-tolerance datasetA row, or NA when there is none.
  hit <- which(abs(ppm) < 10)[1L]
  list(V1 = as.character(V1)[hit], V3 = i.V3, V4 = ppm[hit])
}, by = .EACHI][, .(V1, V2, V3, V4)]
# Expected output for the example data:
#        V1 V2        V3        V4
# 1:   Alex  1  50.00010  6.399946
# 2:   Alex  1  50.00032  1.999983
# 3:   <NA>  2  70.00000        NA
# 4:   Mark  2  80.00000 -1.250002
# 5: Janine  3  88.00040 -3.147723
# 6:   <NA>  3 100.00000        NA
# 7:   <NA>  3 101.00000        NA
# 8:   <NA>  2 102.00000        NA
这是一种替代方法,它通过 update join 来更新 datasetB:
library(data.table)
setDT(datasetA)
setDT(datasetB)

# Step 1: pair every datasetB row with every datasetA row of the same V2,
# compute ppm and keep only the pairs with |ppm| < 10.
# In the joined table V1 / V3 come from datasetA and i.V1 / i.V3 from datasetB.
# ppm is divided by datasetA's V3, as the question's formula requires
# (it used to be divided by datasetB's V3).
tmp <- datasetA[datasetB, on = "V2"][
  , V4 := (V3 - i.V3) / V3 * 1E6][abs(V4) < 10][, i.V1 := NULL]

# Step 2: V1 and V4 of datasetB are all-NA logical columns; give them the
# types of the values about to be written.
datasetB[, `:=`(V1 = as.character(V1), V4 = as.double(V4))]

# Step 3: update join -- copy name and ppm into the matching datasetB rows,
# identified by V2 and the original datasetB value (column i.V3 of tmp).
datasetB[tmp, on = .(V2, V3 = i.V3), `:=`(V1 = i.V1, V4 = i.V4)][]
# Expected output for the example data:
#        V1 V2        V3        V4
# 1:   Alex  1  50.00010  6.399946
# 2:   Alex  1  50.00032  1.999983
# 3:   <NA>  2  70.00000        NA
# 4:   Mark  2  80.00000 -1.250002
# 5: Janine  3  88.00040 -3.147723
# 6:   <NA>  3 100.00000        NA
# 7:   <NA>  3 101.00000        NA
# 8:   <NA>  2 102.00000        NA
我正在尝试在 R 中编写一个循环,以对名为 datasetA
和 datasetB
的两个数据集执行一些迭代。
datasetA
有 600 个条目,datasetB
有 200,000 个条目。
对于 datasetA
中的每个条目,我要执行以下操作:
如果两个数据集中V2
的值相等,
然后计算 ppm:
(datasetA$V3 - datasetB$V3) / datasetA$V3 * 1000000
如果 |ppm| < 10,则将 ppm 值写入 datasetB 的 V4 列,并将 datasetA$V1 中对应的名称写入 datasetB 的 V1 列。
假设这是 datasetA
,有 600 个条目:
# Five-row sample of datasetA (the question says the real table has 600 rows).
# read.table() assigns the default column names V1, V2 and V3.
datasetA <- read.table(
  text = "Alex 1 50.00042
John 1 60.000423
Janine 3 88.000123
Aline 3 117
Mark 2 79.9999"
)
DatasetA
这是一个包含 200000 个条目的 datasetB
示例:
# Eight-row sample of datasetB (the question says the real table has 200,000
# rows). V1 and V4 start out entirely NA; V2 and V3 carry the data.
datasetB <- read.table(
  text = "NA 1 50.0001 NA
NA 1 50.00032 NA
NA 2 70 NA
NA 2 80 NA
NA 3 88.0004 NA
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA"
)
DatasetB
最后的 table 应该是这样的:
# Result table the asker expects: datasetB with V1 (name) and V4 (ppm) filled
# in wherever a datasetA row matches.
# NOTE(review): the first two V4 values (6.459945, 2.059983) do not follow from
# (A$V3 - B$V3) / A$V3 * 1e6 with the sample datasetA shown in the question,
# which gives about 6.399946 and 1.999983 -- confirm with the asker.
datasetC <- read.table(
  text = "Alex 1 50.0001 6.459945
Alex 1 50.00032 2.059983
NA 2 70 NA
Mark 2 80 -1.25
Janine 3 88.0004 -3.14772
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA"
)
The final table should look like this
试试这个:我是 R 菜鸟,但如果这对你有用,请告诉我。
library(data.table)

# Example inputs, read with read.table()'s default V1..Vn column names.
datasetA <- read.table(
  text = "Alex 1 50.00042
John 1 60.000423
Janine 3 88.000123
Aline 3 117
Mark 2 79.9999"
)
datasetB <- read.table(
  text = "NA 1 50.0001 NA
NA 1 50.00032 NA
NA 2 70 NA
NA 2 80 NA
NA 3 88.0004 NA
NA 3 100 NA
NA 3 101 NA
NA 2 102 NA"
)

# Descriptive column names: "ID" is the join column, ValueA / ValueB are the
# two numbers compared in the ppm formula.
names(datasetA) <- c("Name", "ID", "ValueA")
names(datasetB) <- c("V1", "ID", "ValueB", "V4")

# Number the datasetB rows so each one can be found again after the joins.
datasetB[["key"]] <- seq_len(nrow(datasetB))

# Convert both inputs to data.table before joining.
datasetB <- as.data.table(datasetB)
datasetA <- as.data.table(datasetA)

# Left join: every datasetB row paired with every datasetA row of the same ID.
datasetC <- merge(datasetB, datasetA, by = "ID", all.x = TRUE)

# Relative difference in parts per million, with datasetA's value as reference.
datasetC[, ppm := 1000000 * (ValueA - ValueB) / ValueA]

# Keep only the pairs inside the +/- 10 ppm window, and only the columns that
# are needed to write the result back.
datasetC <- datasetC[abs(ppm) < 10, .(key, Name, ppm)]

# Attach the surviving matches to the full datasetB via the row key; rows
# without a match get NA for Name and ppm.
setkey(datasetC, key)
setkey(datasetB, key)
datasetB <- datasetC[datasetB]

# Restore the V1..V4 layout requested in the question.
datasetB <- datasetB[, .(V1 = Name, V2 = ID, V3 = ValueB, V4 = ppm)]
以下答案似乎符合问题的要求,但最后一列 V4 中有 2 个计算值我未能得到与问题中相同的结果。
# Distinct V2 values present in datasetA, in ascending order.
AV2 <- sort(unique(datasetA$V2))

# For each V2 value: pair up the datasetA and datasetB rows sharing it, compute
# ppm for every pair and keep the pairs with |ppm| < 10.
res <- lapply(AV2, function(grp) {
  rows_a <- datasetA[datasetA$V2 == grp, ]
  rows_b <- datasetB[datasetB$V2 == grp, ]
  pairs <- merge(rows_a, rows_b, by = "V2")
  ppm <- ((pairs$V3.x - pairs$V3.y) / pairs$V3.x) * 1000000
  keep <- abs(ppm) < 10
  # Columns 2, 1, 5 of the merge are V1.x (name), V2 and V3.y (datasetB value).
  cbind(pairs[keep, c(2, 1, 5)], ppm = ppm[keep])
})

# Stack the per-group results into one table named V1..V4.
res <- do.call(rbind, res)
names(res) <- paste0("V", 1:4)
row.names(res) <- NULL

# Right join onto datasetB so unmatched rows are kept (with NA), then pick the
# name, V2, V3 and ppm columns in that order.
final <- merge(
  res, datasetB,
  by = c("V2", "V3"),
  all.y = TRUE
)[c(3, 1, 2, 4)]
names(final) <- paste0("V", 1:4)
final
#      V1 V2        V3        V4
#1   Alex  1  50.00010  6.399946
#2   Alex  1  50.00032  1.999983
#3   <NA>  2  70.00000        NA
#4   Mark  2  80.00000 -1.250002
#5   <NA>  2 102.00000        NA
#6 Janine  3  88.00040 -3.147723
#7   <NA>  3 100.00000        NA
#8   <NA>  3 101.00000        NA
# Fill V1 (name) and V4 (ppm) of a copy of datasetB from datasetA.
#
# For each datasetA row, every datasetB row with the same V2 and
# |ppm| < 10 receives that row's name and ppm, where
#   ppm = (A$V3 - B$V3) / A$V3 * 1e6.
# If several datasetA rows qualify for the same datasetB row, the last one
# wins (same as the original nested loop).
#
# The loop runs over the actual number of datasetA rows instead of the
# hard-coded 1:5 / 1:8, and the scan over datasetB is vectorised, so the
# code also works for the full-size tables.
data <- datasetB
for (i in seq_len(nrow(datasetA))) {
  # ppm of this datasetA row against every datasetB row at once.
  ppm <- (datasetA$V3[i] - datasetB$V3) / datasetA$V3[i] * 1e6
  # which() drops NA comparisons, so rows with missing values never match.
  hit <- which(datasetA$V2[i] == datasetB$V2 & abs(ppm) < 10)
  if (length(hit) > 0) {
    # as.character() keeps the name readable even if V1 was read as a factor.
    data[hit, 1] <- as.character(datasetA[[1L]][i])
    data[hit, 4] <- ppm[hit]
  }
}
data
如果我没理解错的话,问题要求的是带有复杂条件的联接。这可以使用 data.table 来实现:
library(data.table)
setDT(datasetA)
setDT(datasetB)

# Conditional join: for every row of datasetB (by = .EACHI) look at ALL
# datasetA rows with the same V2 and report the first one whose ppm lies
# inside the +/- 10 window. Rows without such a partner get NA in V1 and V4.
#
# Two changes from the earlier version of this snippet:
#  * ppm is divided by datasetA's V3, as the question's formula requires
#    (it used to be divided by datasetB's V3);
#  * mult = "first" is gone, because it tested only the first datasetA row of
#    each V2 group and missed matches further down the group.
datasetA[datasetB, on = "V2", {
  # Unprefixed V1 / V3 are datasetA's columns; i.V3 is the datasetB value.
  ppm <- (V3 - i.V3) / V3 * 1E6
  # Index of the first in-tolerance datasetA row, or NA when there is none.
  hit <- which(abs(ppm) < 10)[1L]
  list(V1 = as.character(V1)[hit], V3 = i.V3, V4 = ppm[hit])
}, by = .EACHI][, .(V1, V2, V3, V4)]
# Expected output for the example data:
#        V1 V2        V3        V4
# 1:   Alex  1  50.00010  6.399946
# 2:   Alex  1  50.00032  1.999983
# 3:   <NA>  2  70.00000        NA
# 4:   Mark  2  80.00000 -1.250002
# 5: Janine  3  88.00040 -3.147723
# 6:   <NA>  3 100.00000        NA
# 7:   <NA>  3 101.00000        NA
# 8:   <NA>  2 102.00000        NA
这是一种替代方法,它通过 update join 来更新 datasetB:
library(data.table)
setDT(datasetA)
setDT(datasetB)

# Step 1: pair every datasetB row with every datasetA row of the same V2,
# compute ppm and keep only the pairs with |ppm| < 10.
# In the joined table V1 / V3 come from datasetA and i.V1 / i.V3 from datasetB.
# ppm is divided by datasetA's V3, as the question's formula requires
# (it used to be divided by datasetB's V3).
tmp <- datasetA[datasetB, on = "V2"][
  , V4 := (V3 - i.V3) / V3 * 1E6][abs(V4) < 10][, i.V1 := NULL]

# Step 2: V1 and V4 of datasetB are all-NA logical columns; give them the
# types of the values about to be written.
datasetB[, `:=`(V1 = as.character(V1), V4 = as.double(V4))]

# Step 3: update join -- copy name and ppm into the matching datasetB rows,
# identified by V2 and the original datasetB value (column i.V3 of tmp).
datasetB[tmp, on = .(V2, V3 = i.V3), `:=`(V1 = i.V1, V4 = i.V4)][]
# Expected output for the example data:
#        V1 V2        V3        V4
# 1:   Alex  1  50.00010  6.399946
# 2:   Alex  1  50.00032  1.999983
# 3:   <NA>  2  70.00000        NA
# 4:   Mark  2  80.00000 -1.250002
# 5: Janine  3  88.00040 -3.147723
# 6:   <NA>  3 100.00000        NA
# 7:   <NA>  3 101.00000        NA
# 8:   <NA>  2 102.00000        NA