如何根据 R 中的中值随机 select 和绑定数据列?
How to randomly select and bind data columns based on their median values in R?
我有两个宽格式数据框。每列都是各种维基百科文章的页面点击时间序列。
set.seed(123)
library(tidyr)
time = as.Date('2009-01-01') + 0:9
wiki_1 <- data.frame(
W = sample(1:1000,10,replace = T),
X = sample(1:100,10,replace = T),
Y = sample(1:10,10,replace = T),
Z = sample(1:10,10, replace = T)
)
wiki_2 <- data.frame(
A = sample(500:1000,10,replace = T),
B = sample(90:100,10,replace = T),
C = sample(1:10,10,replace = T),
D = sample(1:10,10,replace = T)
)
我想将第一个数据集 (wiki_1
) 中的一列与第二个数据集 (wiki_2
) 中的 n 列合并。但是这个选择应该基于 wiki_2
中列的中值与 wiki_1
中列的中值的接近程度,例如按数量级。
在此示例中,对于 n = 2,Y 应与 C 和 D 匹配,因为它们的中值非常接近。
median(wiki_1$Y) # 7
median(wiki_2$C) # 6
median(wiki_2$D) # 4.5
我不确定如何实施中值差异标准来获得所需的结果。
此外,如果能够从 wiki_2
中满足标准的列中随机抽样会很有用,因为我的真实数据集有更多的列。
这是我目前正在使用的:
df <- zoo(cbind(subset(wiki_1,select="Y"),
subset(wiki_2,select=c("C","D"))),time)
我想这就是您想要的。我在 wiki_2
中添加了一列,以便允许 2 个以上的匹配项显示匹配列的随机选择。
set.seed(123)
library(tidyr)
time = as.Date('2009-01-01') + 0:9
wiki_1 <- data.frame(
W = sample(1:1000,10,replace = T),
X = sample(1:100,10,replace = T),
Y = sample(1:10,10,replace = T),
Z = sample(1:10,10, replace = T)
)
wiki_2 <- data.frame(
A = sample(500:1000,10,replace = T),
B = sample(90:100,10,replace = T),
C = sample(1:10,10,replace = T),
D = sample(1:10,10,replace = T),
E = sample(1:20,10,replace = T)
)
selectColsByMedian <- function(df1, df2, ref_v, n_v, cutoff_v) {
#' Select Columns By Median
#' @description Select any number of columns from a test data.frame whose median value is
#' close to the median value of a specified column from a reference data.frame. "Close to"
#' is determined as the absolute value of the difference in medians being less thant he specified cutoff.
#' Outputs a new data.frame containing the reference data.frame's test column and all matching columns
#' from the test data.frame
#' @param df1 reference data.frame
#' @param df2 test data.frame
#' @param ref_v column from reference data.frame to test against
#' @param n_v number of columns from df2 to select
#' @param cutoff_v value to use to determine if test columns' medians are close enough
#' @return data.frame with 1 column from df1 and matching columns from df2
## Get median of ref
med_v <- median(df1[,ref_v], na.rm = T)
## Get other medians
otherMed_v <- apply(wiki_2, 2, function(x) median(x, na.rm = T))
## Get differences
medDiff_v <- sapply(otherMed_v, function(x) abs(med_v - x))
## Get whoever is within range (and order them)
inRange_v <- sort(medDiff_v[medDiff_v < cutoff_v])
inRangeCols_v <- names(inRange_v)
## Select random sample, if needed
if (length(inRangeCols_v) > n_v){
whichRandom_v <- sample(1:length(inRangeCols_v), size = n_v, replace = F)
} else {
whichRandom_v <- 1:length(inRangeCols_v)
}
finalCols_v <- inRangeCols_v[whichRandom_v]
## Final output
out_df <- cbind(df1[,ref_v], df2[,finalCols_v])
colnames(out_df) <- c(ref_v, finalCols_v)
## Return
return(out_df)
} # selectColsByMedian
### 3 matching columns, select 2
match3pick2_df <- selectColsByMedian(df1 = wiki_1, df2 = wiki_2, ref_v = "Y", n_v = 2, cutoff_v = 12)
match3pick2_df2 <- selectColsByMedian(df1 = wiki_1, df2 = wiki_2, ref_v = "Y", n_v = 2, cutoff_v = 12)
### 2 matching columns, select 2
match2pick2_df <- selectColsByMedian(df1 = wiki_1, df2 = wiki_2, ref_v = "Y", n_v = 2, cutoff_v = 10)
这是我的解决方案,我在 wiki_2
中添加了更多列以允许进行子集化(但它在 ncols(wiki_1) == ncols(wiki_2)
.
时有效
set.seed(123)
wiki_1 <- data.frame(
W = sample(1:1000,10,replace = T),
X = sample(1:100,10,replace = T),
Y = sample(1:10,10,replace = T),
Z = sample(1:10,10, replace = T)
)
wiki_2 <- data.frame(
A = sample(500:1000,100,replace = T),
B = sample(90:100,100,replace = T),
C = sample(1:10,100,replace = T),
D = sample(1:10,100,replace = T)
)
combineMedianComp <- function(data1, data2, col, n){
if(nrow(data1) > nrow(data2)) stop("Rows in 'data2' need to be greater or equal to rows in 'data1'")
medRef <- median(data1[[col]], na.rm = T, ) # median of desired column
medComp <- sapply(data2, function(x){abs(medRef - median(x, na.rm = T))}) # vector with medians for each columns in data2 ('wiki_2')
cols <- names(sort(medComp)[seq_len(n)]) # sort this vector in ascending order, select top n
d2 <- data2[, c(cols)] # select columns in data2 that have medians closest to 'medRef'
d2 <- d2[sample(seq_len(nrow(d2)), size = nrow(data1), replace = F), ] # subset column as to match those in data1
# merge data
res <- do.call(cbind, list(data1[col], d2))
return(res)
}
combineMedianComp(data1 = wiki_1, data2 = wiki_2, col = "Y", n = 2)
你可以这样做:
time = as.Date('2009-01-01') + 0:9
close_median <- function(df1, df2, to_match = NULL){
# get median
m <- median(df1[[to_match]])
# get difference of median from other data
mat_cols <- apply(df2, 2, function(x) abs(m - median(x)))
# get top 2 matched column
cols <- sort(names(sort(v)[1:2]))
return(cbind(df1[to_match], df2[cols], row.names=time))
}
close_median(wiki_1, wiki_2, 'Y')
Y C D
2009-01-01 8 9 10
2009-01-02 7 8 1
2009-01-03 1 7 7
2009-01-04 10 3 10
2009-01-05 2 1 1
2009-01-06 3 10 3
2009-01-07 6 2 3
2009-01-08 5 8 10
2009-01-09 3 8 5
2009-01-10 10 8 3
我有两个宽格式数据框。每列都是各种维基百科文章的页面点击时间序列。
set.seed(123)
library(tidyr)
time = as.Date('2009-01-01') + 0:9
wiki_1 <- data.frame(
W = sample(1:1000,10,replace = T),
X = sample(1:100,10,replace = T),
Y = sample(1:10,10,replace = T),
Z = sample(1:10,10, replace = T)
)
wiki_2 <- data.frame(
A = sample(500:1000,10,replace = T),
B = sample(90:100,10,replace = T),
C = sample(1:10,10,replace = T),
D = sample(1:10,10,replace = T)
)
我想将第一个数据集 (wiki_1
) 中的一列与第二个数据集 (wiki_2
) 中的 n 列合并。但是这个选择应该基于 wiki_2
中列的中值与 wiki_1
中列的中值的接近程度,例如按数量级。
在此示例中,对于 n = 2,Y 应与 C 和 D 匹配,因为它们的中值非常接近。
median(wiki_1$Y) # 7
median(wiki_2$C) # 6
median(wiki_2$D) # 4.5
我不确定如何实施中值差异标准来获得所需的结果。
此外,如果能够从 wiki_2
中满足标准的列中随机抽样会很有用,因为我的真实数据集有更多的列。
这是我目前正在使用的:
df <- zoo(cbind(subset(wiki_1,select="Y"),
subset(wiki_2,select=c("C","D"))),time)
我想这就是您想要的。我在 wiki_2
中添加了一列,以便允许 2 个以上的匹配项显示匹配列的随机选择。
set.seed(123)
library(tidyr)
time = as.Date('2009-01-01') + 0:9
wiki_1 <- data.frame(
W = sample(1:1000,10,replace = T),
X = sample(1:100,10,replace = T),
Y = sample(1:10,10,replace = T),
Z = sample(1:10,10, replace = T)
)
wiki_2 <- data.frame(
A = sample(500:1000,10,replace = T),
B = sample(90:100,10,replace = T),
C = sample(1:10,10,replace = T),
D = sample(1:10,10,replace = T),
E = sample(1:20,10,replace = T)
)
selectColsByMedian <- function(df1, df2, ref_v, n_v, cutoff_v) {
#' Select Columns By Median
#' @description Select any number of columns from a test data.frame whose median value is
#' close to the median value of a specified column from a reference data.frame. "Close to"
#' is determined as the absolute value of the difference in medians being less thant he specified cutoff.
#' Outputs a new data.frame containing the reference data.frame's test column and all matching columns
#' from the test data.frame
#' @param df1 reference data.frame
#' @param df2 test data.frame
#' @param ref_v column from reference data.frame to test against
#' @param n_v number of columns from df2 to select
#' @param cutoff_v value to use to determine if test columns' medians are close enough
#' @return data.frame with 1 column from df1 and matching columns from df2
## Get median of ref
med_v <- median(df1[,ref_v], na.rm = T)
## Get other medians
otherMed_v <- apply(wiki_2, 2, function(x) median(x, na.rm = T))
## Get differences
medDiff_v <- sapply(otherMed_v, function(x) abs(med_v - x))
## Get whoever is within range (and order them)
inRange_v <- sort(medDiff_v[medDiff_v < cutoff_v])
inRangeCols_v <- names(inRange_v)
## Select random sample, if needed
if (length(inRangeCols_v) > n_v){
whichRandom_v <- sample(1:length(inRangeCols_v), size = n_v, replace = F)
} else {
whichRandom_v <- 1:length(inRangeCols_v)
}
finalCols_v <- inRangeCols_v[whichRandom_v]
## Final output
out_df <- cbind(df1[,ref_v], df2[,finalCols_v])
colnames(out_df) <- c(ref_v, finalCols_v)
## Return
return(out_df)
} # selectColsByMedian
### 3 matching columns, select 2
match3pick2_df <- selectColsByMedian(df1 = wiki_1, df2 = wiki_2, ref_v = "Y", n_v = 2, cutoff_v = 12)
match3pick2_df2 <- selectColsByMedian(df1 = wiki_1, df2 = wiki_2, ref_v = "Y", n_v = 2, cutoff_v = 12)
### 2 matching columns, select 2
match2pick2_df <- selectColsByMedian(df1 = wiki_1, df2 = wiki_2, ref_v = "Y", n_v = 2, cutoff_v = 10)
这是我的解决方案,我在 wiki_2
中添加了更多列以允许进行子集化(但它在 ncols(wiki_1) == ncols(wiki_2)
.
set.seed(123)
wiki_1 <- data.frame(
W = sample(1:1000,10,replace = T),
X = sample(1:100,10,replace = T),
Y = sample(1:10,10,replace = T),
Z = sample(1:10,10, replace = T)
)
wiki_2 <- data.frame(
A = sample(500:1000,100,replace = T),
B = sample(90:100,100,replace = T),
C = sample(1:10,100,replace = T),
D = sample(1:10,100,replace = T)
)
combineMedianComp <- function(data1, data2, col, n){
if(nrow(data1) > nrow(data2)) stop("Rows in 'data2' need to be greater or equal to rows in 'data1'")
medRef <- median(data1[[col]], na.rm = T, ) # median of desired column
medComp <- sapply(data2, function(x){abs(medRef - median(x, na.rm = T))}) # vector with medians for each columns in data2 ('wiki_2')
cols <- names(sort(medComp)[seq_len(n)]) # sort this vector in ascending order, select top n
d2 <- data2[, c(cols)] # select columns in data2 that have medians closest to 'medRef'
d2 <- d2[sample(seq_len(nrow(d2)), size = nrow(data1), replace = F), ] # subset column as to match those in data1
# merge data
res <- do.call(cbind, list(data1[col], d2))
return(res)
}
combineMedianComp(data1 = wiki_1, data2 = wiki_2, col = "Y", n = 2)
你可以这样做:
time = as.Date('2009-01-01') + 0:9
close_median <- function(df1, df2, to_match = NULL){
# get median
m <- median(df1[[to_match]])
# get difference of median from other data
mat_cols <- apply(df2, 2, function(x) abs(m - median(x)))
# get top 2 matched column
cols <- sort(names(sort(v)[1:2]))
return(cbind(df1[to_match], df2[cols], row.names=time))
}
close_median(wiki_1, wiki_2, 'Y')
Y C D
2009-01-01 8 9 10
2009-01-02 7 8 1
2009-01-03 1 7 7
2009-01-04 10 3 10
2009-01-05 2 1 1
2009-01-06 3 10 3
2009-01-07 6 2 3
2009-01-08 5 8 10
2009-01-09 3 8 5
2009-01-10 10 8 3