不按 id 进行 melt(熔化)而直接堆叠列

Stacking columns without melting by id

我使用 rjson 导入了一个 json 文件并将其转换为 data.frame,但所有数据都是横向分布的,列名包含关键信息。

# Parse the JSON file (rjson's fromJSON, per the text above) into an R list.
stations <- fromJSON(file = "station_information.json")
# stations[3] (single bracket) keeps the third element wrapped in a list;
# as.data.frame() then spreads the nested values across columns, which is
# what produces the 2 x 5985 wide frame shown below.
test <- as.data.frame(stations[3])

这看起来像:

> dim(test)
[1]    2 5985

> test[1:27]
  data.stations.station_id data.stations.name data.stations.short_name
1                       72   W 52 St & 11 Ave                  6926.01
2                       72   W 52 St & 11 Ave                  6926.01
  data.stations.lat data.stations.lon data.stations.region_id
1          40.76727         -73.99393                      71
2          40.76727         -73.99393                      71
  data.stations.rental_methods data.stations.capacity
1                          KEY                     39
2                   CREDITCARD                     39
  data.stations.eightd_has_key_dispenser data.stations.station_id.1
1                                  FALSE                         79
2                                  FALSE                         79
      data.stations.name.1 data.stations.short_name.1 data.stations.lat.1
1 Franklin St & W Broadway                    5430.08            40.71912
2 Franklin St & W Broadway                    5430.08            40.71912
  data.stations.lon.1 data.stations.region_id.1 data.stations.rental_methods.1
1           -74.00667                        71                            KEY
2           -74.00667                        71                     CREDITCARD
  data.stations.capacity.1 data.stations.eightd_has_key_dispenser.1
1                       33                                    FALSE
2                       33                                    FALSE
  data.stations.station_id.2   data.stations.name.2 data.stations.short_name.2
1                         82 St James Pl & Pearl St                    5167.06
2                         82 St James Pl & Pearl St                    5167.06
  data.stations.lat.2 data.stations.lon.2 data.stations.region_id.2
1            40.71117           -74.00017                        71
2            40.71117           -74.00017                        71
  data.stations.rental_methods.2 data.stations.capacity.2
1                            KEY                       27
2                     CREDITCARD                       27
  data.stations.eightd_has_key_dispenser.2
1                                    FALSE
2                                    FALSE

如您所见,这无法通过简单的转置 t() 或 melt() 来解决。我想知道我在导入或转换为 data.frame 时做错了什么,以致得到了一个被横向拉伸的数据框:本应成为行的内容,变成了附加在列名后面的索引。

我已经尝试了这两种方法,但我得到的是相同的拉伸数据:

# Attempt 1: build a data.frame from each top-level list element and let plyr
# row-bind the pieces.
plyr::ldply(stations, data.frame)

# Attempt 2: the same idea in base R, keeping strings as character.
do.call(rbind, lapply(stations, data.frame, stringsAsFactors=FALSE))

最后,我希望输出是把每 9 列 "cut"(切)下来并堆叠到前 9 列的下方 - 这样最终得到 665 行和 9 列(5985 / 9 = 665)。如有任何建议,我们将不胜感激。

注意:我是直接从这个链接获取 JSON 的(它不是一个大文件)

这是前 27 列的可重现示例,应将其重塑为 3 行 x 9 列的数据框:

> dput(df)
structure(list(data.stations.station_id = structure(c(1L, 1L), class = "factor", .Label = "72"), 
    data.stations.name = structure(c(1L, 1L), class = "factor", .Label = "W 52 St & 11 Ave"), 
    data.stations.short_name = structure(c(1L, 1L), class = "factor", .Label = "6926.01"), 
    data.stations.lat = c(40.76727216, 40.76727216), data.stations.lon = c(-73.99392888, 
    -73.99392888), data.stations.region_id = c(71, 71), data.stations.rental_methods = structure(c(2L, 
    1L), .Label = c("CREDITCARD", "KEY"), class = "factor"), 
    data.stations.capacity = c(39, 39), data.stations.eightd_has_key_dispenser = c(FALSE, 
    FALSE), data.stations.station_id.1 = structure(c(1L, 1L), class = "factor", .Label = "79"), 
    data.stations.name.1 = structure(c(1L, 1L), class = "factor", .Label = "Franklin St & W Broadway"), 
    data.stations.short_name.1 = structure(c(1L, 1L), class = "factor", .Label = "5430.08"), 
    data.stations.lat.1 = c(40.71911552, 40.71911552), data.stations.lon.1 = c(-74.00666661, 
    -74.00666661), data.stations.region_id.1 = c(71, 71), data.stations.rental_methods.1 = structure(c(2L, 
    1L), .Label = c("CREDITCARD", "KEY"), class = "factor"), 
    data.stations.capacity.1 = c(33, 33), data.stations.eightd_has_key_dispenser.1 = c(FALSE, 
    FALSE), data.stations.station_id.2 = structure(c(1L, 1L), class = "factor", .Label = "82"), 
    data.stations.name.2 = structure(c(1L, 1L), class = "factor", .Label = "St James Pl & Pearl St"), 
    data.stations.short_name.2 = structure(c(1L, 1L), class = "factor", .Label = "5167.06"), 
    data.stations.lat.2 = c(40.71117416, 40.71117416), data.stations.lon.2 = c(-74.00016545, 
    -74.00016545), data.stations.region_id.2 = c(71, 71), data.stations.rental_methods.2 = structure(c(2L, 
    1L), .Label = c("CREDITCARD", "KEY"), class = "factor"), 
    data.stations.capacity.2 = c(27, 27), data.stations.eightd_has_key_dispenser.2 = c(FALSE, 
    FALSE)), .Names = c("data.stations.station_id", "data.stations.name", 
"data.stations.short_name", "data.stations.lat", "data.stations.lon", 
"data.stations.region_id", "data.stations.rental_methods", "data.stations.capacity", 
"data.stations.eightd_has_key_dispenser", "data.stations.station_id.1", 
"data.stations.name.1", "data.stations.short_name.1", "data.stations.lat.1", 
"data.stations.lon.1", "data.stations.region_id.1", "data.stations.rental_methods.1", 
"data.stations.capacity.1", "data.stations.eightd_has_key_dispenser.1", 
"data.stations.station_id.2", "data.stations.name.2", "data.stations.short_name.2", 
"data.stations.lat.2", "data.stations.lon.2", "data.stations.region_id.2", 
"data.stations.rental_methods.2", "data.stations.capacity.2", 
"data.stations.eightd_has_key_dispenser.2"), row.names = c(NA, 
-2L), class = "data.frame")

所以输出结构应该是这样的(显然实际的值不是 NA)。每一行对应原始数据框列名后面附加的一个索引号:

> output
  data.stations.station_id data.stations.name data.stations.short_name
1                       NA                 NA                       NA
2                       NA                 NA                       NA
3                       NA                 NA                       NA
  data.stations.lat data.stations.lon data.stations.region_id
1                NA                NA                      NA
2                NA                NA                      NA
3                NA                NA                      NA
  data.stations.rental_methods data.stations.capacity
1                           NA                     NA
2                           NA                     NA
3                           NA                     NA
  data.stations.eightd_has_key_dispenser
1                                     NA
2                                     NA
3                                     NA

我会尝试:

library(data.table)
# Every station occupies `cols_per_station` consecutive columns. Label each
# column position with a zero-based station index so that split() returns one
# vector of column positions per station.
cols_per_station <- 9L
station_idx <- (seq_along(df) - 1L) %/% cols_per_station
# df[cols] (list-style indexing) always returns a data.frame, even when a chunk
# holds a single column; df[, cols] would drop to a bare vector in that case.
# use.names = FALSE stacks the chunks by position, because the suffixed column
# names (.1, .2, ...) differ between chunks.
rbindlist(
  lapply(split(seq_along(df), station_idx), function(cols) df[cols]),
  use.names = FALSE
)
##    data.stations.station_id       data.stations.name data.stations.short_name data.stations.lat
## 1:                       72         W 52 St & 11 Ave                  6926.01          40.76727
## 2:                       72         W 52 St & 11 Ave                  6926.01          40.76727
## 3:                       79 Franklin St & W Broadway                  5430.08          40.71912
## 4:                       79 Franklin St & W Broadway                  5430.08          40.71912
## 5:                       82   St James Pl & Pearl St                  5167.06          40.71117
## 6:                       82   St James Pl & Pearl St                  5167.06          40.71117
##    data.stations.lon data.stations.region_id data.stations.rental_methods
## 1:         -73.99393                      71                          KEY
## 2:         -73.99393                      71                   CREDITCARD
## 3:         -74.00667                      71                          KEY
## 4:         -74.00667                      71                   CREDITCARD
## 5:         -74.00017                      71                          KEY
## 6:         -74.00017                      71                   CREDITCARD
##    data.stations.capacity data.stations.eightd_has_key_dispenser
## 1:                     39                                  FALSE
## 2:                     39                                  FALSE
## 3:                     33                                  FALSE
## 4:                     33                                  FALSE
## 5:                     27                                  FALSE
## 6:                     27                                  FALSE

也就是说,先创建一个由 data.frame 组成的 list,其中每个 data.frame 有 9 列,然后用 rbind 把它们合并起来。这样就不会出现转换为 matrix 时的数据类型强制转换问题。

这导致 6 行 x 9 列 data.table。不确定要使用什么规则来删除行以最终只有 3 行....


但我认为您正在尝试解决一个不存在的问题。尝试像这样读取您的数据:

library(jsonlite)
# jsonlite's fromJSON() is given the feed URL directly. Judging by the output
# below, the station records come back as a regular data frame with one row
# per station.
x <- fromJSON("https://gbfs.citibikenyc.com/gbfs/en/station_information.json")
# The stations table sits inside the third top-level element of the result.
head(x[[3]]$stations)
##   station_id                          name short_name      lat       lon region_id
## 1         72              W 52 St & 11 Ave    6926.01 40.76727 -73.99393        71
## 2         79      Franklin St & W Broadway    5430.08 40.71912 -74.00667        71
## 3         82        St James Pl & Pearl St    5167.06 40.71117 -74.00017        71
## 4         83 Atlantic Ave & Fort Greene Pl    4354.07 40.68383 -73.97632        71
## 5        116               W 17 St & 8 Ave    6148.02 40.74178 -74.00150        71
## 6        119      Park Ave & St Edwards St    4700.06 40.69609 -73.97803        71
##    rental_methods capacity eightd_has_key_dispenser
## 1 KEY, CREDITCARD       39                    FALSE
## 2 KEY, CREDITCARD       33                    FALSE
## 3 KEY, CREDITCARD       27                    FALSE
## 4 KEY, CREDITCARD       62                    FALSE
## 5 KEY, CREDITCARD       39                    FALSE
## 6 KEY, CREDITCARD       19                    FALSE


# Check the shape of the stations table (rows, columns).
dim(x[[3]]$stations)
# [1] 665   9

您可以使用矩阵,但要确保所有因子列都是字符,即

# Convert factor columns to character first: unlist() on a mix of factors and
# other types would otherwise return the factors' integer codes instead of
# their labels.
ind <- vapply(df, is.factor, logical(1))
df[ind] <- lapply(df[ind], as.character)

# unlist() walks a data frame column by column, so flattening both rows at once
# interleaves the row-1 and row-2 values of every field and the 9-wide matrix
# rows no longer line up with stations. Flatten a single row instead: in the
# example data the two rows differ only in rental_methods, so row 1 carries one
# complete record per station.
final_df <- as.data.frame(matrix(unlist(df[1, ]), ncol = 9, byrow = TRUE))

final_df
#  V1                       V2      V3          V4           V5 V6  V7 V8    V9
#1 72         W 52 St & 11 Ave 6926.01 40.76727216 -73.99392888 71 KEY 39 FALSE
#2 79 Franklin St & W Broadway 5430.08 40.71911552 -74.00666661 71 KEY 33 FALSE
#3 82   St James Pl & Pearl St 5167.06 40.71117416 -74.00016545 71 KEY 27 FALSE

另一方面,正如@A5C1D2H2I1M1N2O1R2T1 指出的那样,您可能正在寻找这个:

# t(df) coerces the data frame to a character matrix with one column per
# original row; c() flattens it column by column, i.e. all values of row 1
# followed by all values of row 2. Refilling by row, nine values at a time,
# yields one output row per station for each original row.
as.data.frame(matrix(c(t(df)), ncol = 9, byrow = TRUE))
#  V1                       V2      V3       V4        V5 V6         V7 V8    V9
#1 72         W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71        KEY 39 FALSE
#2 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71        KEY 33 FALSE
#3 82   St James Pl & Pearl St 5167.06 40.71117 -74.00017 71        KEY 27 FALSE
#4 72         W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71 CREDITCARD 39 FALSE
#5 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71 CREDITCARD 33 FALSE
#6 82   St James Pl & Pearl St 5167.06 40.71117 -74.00017 71 CREDITCARD 27 FALSE