基于 R 中指定列值的采样

sampling based on specified column values in R

我有这样一个数据,其中 Average 是 X、Y 和 Z 的平均值。

head(df)
ID  X   Y   Z   Average
A   2   2   5   3
A   4   3   2   3
A   4   3   2   3
B   5   3   1   3
B   3   4   2   3
B   1   5   3   3
C   5   3   1   3
C   2   3   4   3
C   5   3   1   3
D   2   3   4   3
D   3   2   4   3
D   3   2   4   3
E   5   3   1   3
E   4   3   2   3
E   3   4   2   3

要重现这一点,我们可以使用

df <- data.frame(ID = c("A", "A", "A", "B", "B", "B", "C", "C", "C", "D", "D", "D", "E", "E", "E"),
                     X = c(2L, 4L, 4L, 5L, 3L,1L, 5L, 2L, 5L, 2L, 3L, 3L, 5L, 4L, 3L),
                     Y = c(2L, 3L, 3L, 3L,4L, 5L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 4L), 
                     Z = c(5L, 2L, 2L,1L, 2L, 3L, 1L, 4L, 1L, 4L, 4L, 4L, 1L, 2L, 2L), 
                     Average = c(3L,3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L))

据此,我想为每个 ID 提取一个观察值,这样我们就不会得到相同(尽可能多)的 X、Y 和 Z 组合值。我试过

library(dplyr)
df %>% sample_n(size = nrow(.), replace = FALSE) %>% distinct(ID, .keep_all = T)

但是,在更大的数据集上,我发现 X、Y、Z 的组合重复太多。在可能的范围内,我需要输出具有相等或接近相等的案例表示(即组合X, Y, Y) 像这样:

   ID   X   Y   Z   Average
    A   2   2   5   3
    B   5   3   1   3
    C   2   3   4   3
    D   3   2   4   3
    E   4   3   2   3

这看起来很可疑,但试试这个:

library(dplyr)
df %>% add_count(X, Y, Z) %>%
    group_by(ID) %>%
    top_n(-1, n) %>%
    arrange(runif(n)) %>%
    select(-n) %>%
    slice(1)
# # A tibble: 5 x 5
# # Groups:   ID [5]
#       ID     X     Y     Z Average
#   <fctr> <int> <int> <int>   <int>
# 1      A     2     2     5       3
# 2      B     1     5     3       3
# 3      C     2     3     4       3
# 4      D     3     2     4       3
# 5      E     3     4     2       3

这会从每个 ID 中选择最不常见的 XYZ 组合,如果平局则随机选择。非常常见的 XYZ 组合可能会完全丢失。

这是一种方法。内联解释。请注意,可以根据行之间所需的 "similarity"/"duplication" 概念来调整实现。

# get the sample data from the original post
dat <- data.frame(
  ID = c("A","A","A","B", "B", "B", "C", "C", "C", "D", "D", "D", "E", "E", "E"),
  X = c(2L, 4L, 4L, 5L, 3L,1L, 5L, 2L, 5L, 2L, 3L, 3L, 5L, 4L, 3L),
  Y = c(2L, 3L, 3L, 3L,4L, 5L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 4L),
  Z = c(5L, 2L, 2L,1L, 2L, 3L, 1L, 4L, 1L, 4L, 4L, 4L, 1L, 2L, 2L),
  Average = c(3L,3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L))

定义一个函数,在给定 id 的情况下对数据帧的一行进行采样(假设存在 $ID 列):

# function to get a randomly sampled row from `df` with `df$ID==id`
id_sample <- function(df, id){
  df <- df[df$ID==id, ]
  return(df[sample(1:nrow(df), size=1), ])
}

定义一个循环遍历每个 id 的函数,如果样本 "too similar" 到数据框中已有的行,则拒绝该样本:

make_sample_df <- function(dat, threshold){

  # initialize empty data frame to fill with row samples
  out <- dat[NULL, ]

  # get the unique id's to loop over
  ids <- unique(dat$ID)

  for (id in ids){

    # grab a sample
    id_row <- id_sample(dat, id)

    # see how many of its elements have column-duplicates (zero for first id)
    n_dupes <- sum(apply(out, 1, function(row){
      sum(row[1]==id_row$X, row[2]==id_row$Y, row[3]==id_row$Z)}))

    # keep getting samps if the number of duplicates is higher than threshold
    while (n_dupes > threshold){

      id_row <- id_sample(dat, id)

      n_dupes <- sum(apply(out, 1, function(row){
        sum(row[1]==id_row$X, row[2]==id_row$Y, row[3]==id_row$Z)}))
    }

    # once we have a suitable row for `id`, add it to the output df
    out <- rbind(out, id_row)
  }

  return(out)
}

现在将函数应用于 OP 的数据并检查:

# rows at most 1 of whose values appear in another row (at same column)     
set.seed(6933)
make_sample_df(dat, threshold=1)

## ID X Y Z Average
## A  4 3 2       3
## B  1 5 3       3
## C  2 3 4       3
## D  3 2 4       3
## E  5 3 1       3

根据您的需要,您可以尝试不同的 threshold 值,但请注意,如果您对 threshold 过于严格,while 循环可能会使 运行 永远,所以你可能想在那里放一些逃生舱口。

您还可以根据不同的变体调整此策略,例如您关心的是在行内而不是列内重复值。

希望对你有帮助~~

一个可能的解决方案是 excluding 已为上一列采样的值。

library(dplyr)
df %>% group_by(ID) %>%
  summarise(XX = sample(unique(X),1),
            YY = sample(unique(Y[Y!=XX]),1),
            ZZ = sample(unique(Z[Z!=XX & Z!=YY]),1),
            Average = mean(Average))
# # A tibble: 5 x 5
#    ID        XX    YY    ZZ Average
#   <fctr> <int> <int> <int>   <dbl>
# 1 A          4     2     1    3.00
# 2 B          5     4     2    3.00
# 3 C          5     3     1    3.00
# 4 D          2     3     2    3.00
# 5 E          5     4     2    3.00

为了让逻辑更清晰,我更改了列名。

我只想补充左边的答案。我修改了代码以允许样本大小,而不依赖于名为 ID 或 X、Y、Z 或其中任何一个的列

id_sample <- function(df, id, field, sampleSize) {
  df = df %>%
   filter(!!as.symbol(field) == id)
  return(df[sample(1:nrow(df), size = sampleSize, replace = TRUE),])
}

  make_sample_df <- function(dat, sampleSize, field) {
   # initialize empty data frame to fill with row samples
  out <- dat[NULL,]
  # get the unique id's to loop over
  ids <- unique(dat[[field]])
   for (id in ids) {
    # grab a sample
    id_row <- id_sample(dat, id, field,sampleSize)
   out <- rbind(out, id_row)
     }
  return(out)
  }

然后我如何使用它

   sample_df = make_sample_df(df, 20, "ColumnToSampleOn")

其中 df 是我的原始数据框,20 是我想要在“ColumnToSampleOn”中找到的每个唯一值的行数