在 R 中的大数据上重新排列数据框的多行

Rearranging multiple rows of a dataframe on big data in R

我是 R 的新手。我有一个数据框 test,如下所示:

PMID    # id
LID
STAT
MH
RN
OT
PST     # cue
LID
STAT
MH
PMID    # id
OT
PST     # cue
LID
DEP
RN
PMID    # id
PST     # cue

我希望它看起来像这样:

PMID    # id
LID
STAT
MH
RN
OT
PST     # cue
PMID    # id
LID
STAT
MH
OT
PST     # cue
PMID    # id
LID
DEP
RN
PST     # cue

基本上,我希望 PMID 之后的条目针对特定的 PMID,第一个 PMID 就是这种情况。然而,在第一个 PMID 之后,PMID 随机位于其条目之间。但是,每个 PMID 都以 PST 结尾,因此我希望将第一个之后的后续 PMID 移动到前一个 PST 位置之后的位置。我有两个数据帧,其中包含每个 PMID 和 PST 的索引位置。例如,对于 PMID,df a_new 包含

1 
11
17

对于 PST,df b 包含

7
13
18

这是我尝试过的方法,但是因为我有超过 2400 万行,它运行了几个小时也没有完成,当我停止它时,我的数据框没有任何改变:

# Attempted fix: walk every row and, for each out-of-place PMID, splice it
# in directly after the previous record's PST.  This is O(n^2): rbind()
# copies the entire ~24M-row data frame on every insertion, which is why it
# never finished.  The vectorized answers below avoid per-row rbind entirely.
for (i in 1:nrow(test))
{    
  if (i %in% a_new$X1) # if it's a PMID
  {
    entry <- match(i, a_new$X1) # find entry index of PMID
    if (entry != 1) # as long as not first row from a_new (that's corrected)
    {
      # NOTE(review): b is indexed by the raw row number i here, but b holds
      # one PST position per record -- this presumably should be indexed by
      # the record number (e.g. b[entry - 1, 1]); verify against how b was
      # built.
      r <- b[i, 1] # row of PST
      # NOTE(review): test[entry, 1] selects by entry (an index into a_new),
      # not by the PMID's actual row i -- looks like a second bug in this
      # attempt; confirm which row was meant to be moved.
      test <- rbind(test[1:r, ], test[entry, 1], test[-(1:r), ])
      test <- test[-c(i+1), ] # remove duplicate PMID
    }
  }
}

如您所见,rbind 在这种情况下效率极低。请指教

这里是使用which的索引方法。

# get positions of PST, the final value
# Every record ends with a "PST" row; find all of them.
endSpot <- which(temp == "PST")
# A "PMID" belongs on the row immediately after each "PST".  The last
# "PST" closes the final record, so there is nothing after it to fix.
startSpot <- (endSpot + 1)[-length(endSpot)]
# Rows where "PMID" currently sits; the first record is already correct,
# so skip its "PMID".
PMIDSpot <- which(temp == "PMID")[-1]

现在,使用这些索引交换行

# Swap each misplaced "PMID" row with the row that should hold it (the
# position right after the previous record's "PST") in one vectorized
# assignment -- no loop, no rbind.
temp[c(startSpot, PMIDSpot), ] <- temp[c(PMIDSpot, startSpot), ]

返回结果如下(我添加了一个名为 count 的行位置变量,用来跟踪每行原来的位置)。

temp
     V1 count
1  PMID     1
2   LID     2
3  STAT     3
4    MH     4
5    RN     5
6    OT     6
7   PST     7
8  PMID    11
9  STAT     9
10   MH    10
11  LID     8
12   OT    12
13  PST    13
14 PMID    17
15  DEP    15
16   RN    16
17  LID    14
18  PST    18

数据

# Example input: one column of record tags plus a row-position tracker
# (count) so the reordering can be inspected afterwards.
temp <- data.frame(
  V1 = c("PMID", "LID", "STAT", "MH", "RN", "OT",
         "PST", "LID", "STAT", "MH", "PMID", "OT",
         "PST", "LID", "DEP", "RN", "PMID", "PST"),
  count = 1:18,
  stringsAsFactors = FALSE
)

这里是使用 data.table 的答案。

library(data.table)

# Build the example as a data.table; fread() parses the inline string as a
# one-column file whose header row names the column "Origcol".
dat <- fread("Origcol
             PMID
             LID
             STAT
             MH
             RN
             OT
             PST     
             LID
             STAT
             MH
             PMID    
             OT
             PST     
             LID
             DEP
             RN
             PMID   
             PST")

# Remember the incoming row order so the final sort can be inspected.
dat[, old_order := seq_len(.N)]

# "PST" closes every record, so the "PST" positions delimit the groups.
# Prepend 0 so diff() yields the size of each group, including the first.
pst_index <- c(0, which(dat$Origcol == "PST"))

# One group id per record: group k covers rows
# (pst_index[k] + 1) .. pst_index[k + 1].  rep() with a `times` vector
# builds the whole column in one vectorized call, replacing the original
# per-group unlist(lapply(...)) round trip.
dat[, grp := rep(seq_len(length(pst_index) - 1L), diff(pst_index))]

# Encode the desired within-record ordering as factor levels:
# "PMID" first, "PST" last.
dat[, Origcol := factor(Origcol, levels = c("PMID", "LID", "STAT", 
                                            "MH", "RN", "OT", 
                                            "DEP", "PST"))]

# Sort by record, then by the factor order inside each record.
dat[order(grp, Origcol)]

结果:

    Origcol old_order grp
 1:    PMID         1   1
 2:     LID         2   1
 3:    STAT         3   1
 4:      MH         4   1
 5:      RN         5   1
 6:      OT         6   1
 7:     PST         7   1
 8:    PMID        11   2
 9:     LID         8   2
10:    STAT         9   2
11:      MH        10   2
12:      OT        12   2
13:     PST        13   2
14:    PMID        17   3
15:     LID        14   3
16:      RN        16   3
17:     DEP        15   3
18:     PST        18   3

这样做的好处是 data.table 通过引用进行大量操作,一旦按比例放大,应该会很快。你说你有 1400 万行,让我们试试看。生成一些该大小的合成数据:

# Seed: a single correctly ordered record.
dat_big <- data.table(Origcol = c("PMID", "LID", "STAT", "MH", "RN", "OT", "PST"))

# 10,000 records with the six tags shuffled, each terminated by "PST",
# mimicking the randomly placed PMIDs in the real data.
dat_big_add <- rbindlist(lapply(seq_len(10000),
                                function(x) data.table(Origcol = c(sample(c("PMID", "LID", "STAT",
                                                                            "MH", "RN", "OT")),
                                                                   "PST"))))

# Stack 20 copies of the shuffled block behind the seed record, then
# replicate the result 10 times for ~14 million rows.  rep(list(...), n)
# replaces the 20 (and 10) hand-typed arguments of the original.
dat_big <- rbindlist(c(list(dat_big), rep(list(dat_big_add), 20L)))

dat <- rbindlist(rep(list(dat_big), 10L))

我们现在有:

          Origcol
       1:    PMID
       2:     LID
       3:    STAT
       4:      MH
       5:      RN
      ---        
14000066:    STAT
14000067:      MH
14000068:      OT
14000069:    PMID
14000070:     PST

应用与上面相同的代码:

# Same pipeline as before, applied to the ~14M-row table.
# Remember the incoming row order.
dat[, old_order := seq_len(.N)]

# "PST" positions delimit the records; prepend 0 so diff() gives sizes.
pst_index <- c(0, which(dat$Origcol == "PST"))

# Vectorized group-id column: rep() with a `times` vector replaces the
# original per-group unlist(lapply(...)) construction.
dat[, grp := rep(seq_len(length(pst_index) - 1L), diff(pst_index))]

# Factor levels encode the desired within-record order.
dat[, Origcol := factor(Origcol, levels = c("PMID", "LID", "STAT", 
                                            "MH", "RN", "OT", 
                                            "DEP", "PST"))]

# Sort by record, then by tag order inside each record.
dat[order(grp, Origcol)]

我们现在得到:

          Origcol old_order     grp
       1:    PMID         1       1
       2:     LID         2       1
       3:    STAT         3       1
       4:      MH         4       1
       5:      RN         5       1
      ---                          
14000066:    STAT  14000066 2000010
14000067:      MH  14000067 2000010
14000068:      RN  14000064 2000010
14000069:      OT  14000068 2000010
14000070:     PST  14000070 2000010

需要多长时间?

library(microbenchmark)
# Time 10 repetitions of the full data.table pipeline on the ~14M-row table.
microbenchmark(
  "data.table" = {
    # Remember the incoming row order.
    dat[, old_order := 1:.N]
    # "PST" positions delimit the records; prepend 0 for group starts.
    pst_index <- c(0, which(dat$Origcol == "PST"))
    # One group id per record.
    dat[, grp := unlist(lapply(1:(length(pst_index)-1), 
                               function(x) rep(x, 
                                               times = (pst_index[x+1] - pst_index[x]))))]
    # Factor levels encode the desired within-record order.
    dat[, Origcol := factor(Origcol, levels = c("PMID", "LID", "STAT", 
                                                "MH", "RN", "OT", 
                                                "DEP", "PST"))]
    # Sort by record, then by tag order.
    dat[order(grp, Origcol)]
  }, 
  times = 10)

需要:

Unit: seconds
       expr      min       lq     mean  median       uq      max neval
 data.table 5.755276 5.813267 6.059665 5.87151 6.034506 7.310169    10

1400 万行不到 10 秒。生成测试数据需要更长的时间。