如何拆分数据框的列并重塑它?

How to split the columns of a data frame and reshape it?

我正在尝试重组数据,把包含多个值的列展开,以得到下面列出的期望输出。我用 tidyr::spread() 和 tidyr::gather() 做了几次尝试,都没有成功。有什么想法吗?

# Example input: one row per name; `count` packs "year=value" pairs
# separated by "; ".
dat <- data.frame(
  name = c("a", "b", "c"),
  count = c(
    "2003=22; 2004=32",
    "2003=34; 2005=45",
    "2005=32; 2006=67"
  )
)


name            count
a               2003=22; 2004=32
b               2003=34; 2005=45
c               2005=32; 2006=67

期望的输出:

name    2003    2004    2005    2006    
a        22     32      NA      NA
b        34     NA      45      NA      
c        NA     NA      32      67

这是一个使用 base R 和 reshape2 的解决方案:

# Split each `count` string at the ";" into its first and second
# "year=value" pair. The replacement must be written "\\1" / "\\2":
# a single backslash ("\1") is an octal escape for a control character,
# not a regex backreference.
ting1 <- data.frame(get1 = gsub("(.*);(.*)", "\\1", dat$count))
ting1 <- cbind(name = dat$name, ting1)

ting2 <- data.frame(get1 = gsub("(.*);(.*)", "\\2", dat$count))
ting2 <- cbind(name = dat$name, ting2)

# Stack both halves into long form: one row per (name, "year=value") pair.
df <- rbind(ting1, ting2)

# Split each pair at "="; trimws() drops the space left after the ";".
df$years <- trimws(gsub("(.*)=(.*)", "\\1", df$get1))
df$values <- gsub("(.*)=(.*)", "\\2", df$get1)

library(reshape2)

# Cast to wide form: one column per year, NA where a name has no value.
outdf <- dcast(df, name ~ years, value.var = "values")
outdf
# name 2003 2004 2005 2006
# 1    a   22   32 <NA> <NA>
# 2    b   34 <NA>   45 <NA>
# 3    c <NA> <NA>   32   67

可能有更聪明、更简洁的方法,但这个方法有效:

library(dplyr)  # provides select(); tidyr alone does not export it
library(tidyr)

dat %>%
  # Split the two "year=value" pairs into their own columns.
  separate(count, sep = "; ", into = c("c1", "c2")) %>%
  # Stack c1/c2 into long form: one row per (name, pair).
  gather(Var, Val, -name) %>%
  # Split each pair into its year and its value.
  separate(Val, sep = "=", into = c("year", "value")) %>%
  # The c1/c2 marker is no longer needed.
  select(-Var) %>%
  # Back to wide form: one column per year.
  spread(year, value)

  name 2003 2004 2005 2006
1    a   22   32 <NA> <NA>
2    b   34 <NA>   45 <NA>
3    c <NA> <NA>   32   67

请注意,这会产生 "wide"(宽)格式的数据;执行 spread 之前的 "long"(长)格式数据可能更容易处理。

这里有一个方法使用 extract + bind_rows + spread -

library(dplyr)  # bind_rows(), select()
library(tidyr)  # extract(), spread(), %>%

dat %>%
  # Capture both "year=value" pairs. The POSIX class is written
  # [[:digit:]]: a bare [:digit:] is only valid inside a bracket
  # expression and is rejected or misread by POSIX/PCRE engines.
  extract(count, c("year1", "value1", "year2", "value2"),
          regex = "([[:digit:]]+)=([[:digit:]]+);.([[:digit:]]+)=([[:digit:]]+)") %>%
  # Stack the first and second pair into one long (name, year, value) table.
  {bind_rows(
    select(., name, year = year1, value = value1),
    select(., name, year = year2, value = value2)
  )} %>%
  # One column per year.
  spread(year, value)

  name 2003 2004 2005 2006
1    a   22   32 <NA> <NA>
2    b   34 <NA>   45 <NA>
3    c <NA> <NA>   32   67

这是使用 extract + spread + spread 的另一种方法。这可能看起来不那么冗长,但我觉得上面的方法更可靠,因为这里的第一个 spread 在某些特定情况下可能会失败。

library(tidyr)  # extract(), spread(), %>%

# NOTE(review): in `dat` the year 2005 occurs both as year1 (row c) and as
# year2 (row b), so both spread() calls below produce a column named `2005`.
# Verify the result before relying on this chained form.
dat %>%
  # Capture both "year=value" pairs. The POSIX class is written
  # [[:digit:]]: a bare [:digit:] is only valid inside a bracket
  # expression and is rejected or misread by POSIX/PCRE engines.
  extract(count, c("year1", "value1", "year2", "value2"),
          regex = "([[:digit:]]+)=([[:digit:]]+);.([[:digit:]]+)=([[:digit:]]+)") %>%
  # Widen the first pair, then the second pair.
  spread(year1, value1) %>%
  spread(year2, value2)

  name 2003 2004 2005 2006
1    a   22   32 <NA> <NA>
2    b   34 <NA>   45 <NA>
3    c <NA> <NA> <NA>   67

如果您想要一种不依赖任何包的 base R 方法,可以基于 strsplit() 和 reshape() 来实现:

# Make sure both columns are character (they are factors when
# stringsAsFactors = TRUE).
dat[] <- lapply(dat, as.character)

# Split every `count` string into its "year=value" pairs, then every pair
# into a (year, value) row. lengths() repeats each name once per pair, so
# any number of rows and any number of pairs per row is handled.
pairs <- strsplit(dat$count, "; ", fixed = TRUE)
key_value <- do.call(rbind, strsplit(unlist(pairs), "=", fixed = TRUE))

# Long form with default column names: X1 = name, X2 = year, X3 = value.
DF <- data.frame(cbind(rep(dat$name, lengths(pairs)), key_value))

# Reshape into wide format: one "X3.<year>" column per distinct year.
DF <- reshape(DF, timevar = "X2", idvar = "X1", direction = "wide")

# Coerce the year columns to numeric (as.character() first in case the
# columns are factors).
DF[, -1] <- lapply(DF[, -1], function(x) as.numeric(as.character(x)))

# Desired column names (optional): strip the "X3." prefix added by reshape().
names(DF) <- c(names(dat)[1], sub("^X[0-9]+\\.", "", names(DF)[-1]))

> DF
  name 2003 2004 2005 2006
1    a   22   32   NA   NA
3    b   34   NA   45   NA
5    c   NA   NA   32   67

您可以使用 tidyr 中的 separate_rows 和 separate:

library(tidyr)

# Innermost call first: one row per "year=value" pair, then split each pair
# into key/val, then widen to one column per key.
spread(
  separate(
    separate_rows(dat, count, sep = "; "),
    count,
    sep = "=", into = c("key", "val")
  ),
  key, val
)
#   name 2003 2004 2005 2006
# 1    a   22   32 <NA> <NA>
# 2    b   34 <NA>   45 <NA>
# 3    c <NA> <NA>   32   67

另一种解决方案是将这些字符串解析为 tibble,然后对结果执行 unnest:

library(tidyverse)
# Turn each `count` string into R source for a one-row tibble and evaluate it:
# "2003=22; 2004=32" -> "tibble(Y2003=22, Y2004=32)". The "Y" prefix is
# added so each year becomes an argument name starting with a letter.
# NOTE(review): eval(parse(text = ...)) executes whatever text is in `count`;
# only use this on trusted data.
dat %>%
  mutate(count = map(count, ~  gsub("; ",", Y",.) %>%
                       paste0("tibble(Y",.,")") %>%
                       {eval(parse(text= .))})) %>%
  # Expand the list-column of tibbles into regular columns.
  unnest %>%
  # Drop the first character (the "Y" prefix) from every column name except
  # the first column.
  rename_at(-1,~str_sub(.,2))
#   name 2003 2004 2005 2006
# 1    a   22   32   NA   NA
# 2    b   34   NA   45   NA
# 3    c   NA   NA   32   67