按 id 不熔化堆叠列
Stacking columns without melting by id
我使用 rjson
导入了一个 json 文件并将其转换为 data.frame,但所有数据都是横向分布的,列名包含关键信息。
stations <- fromJSON(file = "station_information.json")
test <- as.data.frame(stations[3])
这看起来像:
> dim(test)
[1] 2 5985
> test[1:27]
data.stations.station_id data.stations.name data.stations.short_name
1 72 W 52 St & 11 Ave 6926.01
2 72 W 52 St & 11 Ave 6926.01
data.stations.lat data.stations.lon data.stations.region_id
1 40.76727 -73.99393 71
2 40.76727 -73.99393 71
data.stations.rental_methods data.stations.capacity
1 KEY 39
2 CREDITCARD 39
data.stations.eightd_has_key_dispenser data.stations.station_id.1
1 FALSE 79
2 FALSE 79
data.stations.name.1 data.stations.short_name.1 data.stations.lat.1
1 Franklin St & W Broadway 5430.08 40.71912
2 Franklin St & W Broadway 5430.08 40.71912
data.stations.lon.1 data.stations.region_id.1 data.stations.rental_methods.1
1 -74.00667 71 KEY
2 -74.00667 71 CREDITCARD
data.stations.capacity.1 data.stations.eightd_has_key_dispenser.1
1 33 FALSE
2 33 FALSE
data.stations.station_id.2 data.stations.name.2 data.stations.short_name.2
1 82 St James Pl & Pearl St 5167.06
2 82 St James Pl & Pearl St 5167.06
data.stations.lat.2 data.stations.lon.2 data.stations.region_id.2
1 40.71117 -74.00017 71
2 40.71117 -74.00017 71
data.stations.rental_methods.2 data.stations.capacity.2
1 KEY 27
2 CREDITCARD 27
data.stations.eightd_has_key_dispenser.2
1 FALSE
2 FALSE
如您所见,这无法通过简单的转置 t()
或 melt()
解决方案来解决。我想知道我在导入或转换为 data.frame 时做错了什么,这给我留下了一个数据框,该数据框拉伸了应该是行附加到列名的索引。
我已经尝试了这两种方法,但我得到的是相同的拉伸数据:
plyr::ldply(stations, data.frame)
do.call(rbind, lapply(stations, data.frame, stringsAsFactors=FALSE))
最后我希望我的输出看起来像每 9 列 "cut" 并堆叠到前 9 列 - 这样我就剩下 655 行和 9 列 如有任何建议,我们将不胜感激。
注意:我直接从这个link中获取JSON(它不是一个大文件)
这是前 27 列的可重现示例,应将其重塑为 9 x 3 数据框:
> dput(df)
structure(list(data.stations.station_id = structure(c(1L, 1L), class = "factor", .Label = "72"),
data.stations.name = structure(c(1L, 1L), class = "factor", .Label = "W 52 St & 11 Ave"),
data.stations.short_name = structure(c(1L, 1L), class = "factor", .Label = "6926.01"),
data.stations.lat = c(40.76727216, 40.76727216), data.stations.lon = c(-73.99392888,
-73.99392888), data.stations.region_id = c(71, 71), data.stations.rental_methods = structure(c(2L,
1L), .Label = c("CREDITCARD", "KEY"), class = "factor"),
data.stations.capacity = c(39, 39), data.stations.eightd_has_key_dispenser = c(FALSE,
FALSE), data.stations.station_id.1 = structure(c(1L, 1L), class = "factor", .Label = "79"),
data.stations.name.1 = structure(c(1L, 1L), class = "factor", .Label = "Franklin St & W Broadway"),
data.stations.short_name.1 = structure(c(1L, 1L), class = "factor", .Label = "5430.08"),
data.stations.lat.1 = c(40.71911552, 40.71911552), data.stations.lon.1 = c(-74.00666661,
-74.00666661), data.stations.region_id.1 = c(71, 71), data.stations.rental_methods.1 = structure(c(2L,
1L), .Label = c("CREDITCARD", "KEY"), class = "factor"),
data.stations.capacity.1 = c(33, 33), data.stations.eightd_has_key_dispenser.1 = c(FALSE,
FALSE), data.stations.station_id.2 = structure(c(1L, 1L), class = "factor", .Label = "82"),
data.stations.name.2 = structure(c(1L, 1L), class = "factor", .Label = "St James Pl & Pearl St"),
data.stations.short_name.2 = structure(c(1L, 1L), class = "factor", .Label = "5167.06"),
data.stations.lat.2 = c(40.71117416, 40.71117416), data.stations.lon.2 = c(-74.00016545,
-74.00016545), data.stations.region_id.2 = c(71, 71), data.stations.rental_methods.2 = structure(c(2L,
1L), .Label = c("CREDITCARD", "KEY"), class = "factor"),
data.stations.capacity.2 = c(27, 27), data.stations.eightd_has_key_dispenser.2 = c(FALSE,
FALSE)), .Names = c("data.stations.station_id", "data.stations.name",
"data.stations.short_name", "data.stations.lat", "data.stations.lon",
"data.stations.region_id", "data.stations.rental_methods", "data.stations.capacity",
"data.stations.eightd_has_key_dispenser", "data.stations.station_id.1",
"data.stations.name.1", "data.stations.short_name.1", "data.stations.lat.1",
"data.stations.lon.1", "data.stations.region_id.1", "data.stations.rental_methods.1",
"data.stations.capacity.1", "data.stations.eightd_has_key_dispenser.1",
"data.stations.station_id.2", "data.stations.name.2", "data.stations.short_name.2",
"data.stations.lat.2", "data.stations.lon.2", "data.stations.region_id.2",
"data.stations.rental_methods.2", "data.stations.capacity.2",
"data.stations.eightd_has_key_dispenser.2"), row.names = c(NA,
-2L), class = "data.frame")
所以输出结构应该是这样的(显然值不是 NA)。每行代表原始数据框列名的附加索引号
> output
data.stations.station_id data.stations.name data.stations.short_name
1 NA NA NA
2 NA NA NA
3 NA NA NA
data.stations.lat data.stations.lon data.stations.region_id
1 NA NA NA
2 NA NA NA
3 NA NA NA
data.stations.rental_methods data.stations.capacity
1 NA NA
2 NA NA
3 NA NA
data.stations.eightd_has_key_dispenser
1 NA
2 NA
3 NA
我会尝试:
library(data.table)
rbindlist(lapply(split(seq_along(df), c(0, (seq_along(df)%/%9)[-length(df)])),
function(x) df[, x]), use.names = FALSE)
## data.stations.station_id data.stations.name data.stations.short_name data.stations.lat
## 1: 72 W 52 St & 11 Ave 6926.01 40.76727
## 2: 72 W 52 St & 11 Ave 6926.01 40.76727
## 3: 79 Franklin St & W Broadway 5430.08 40.71912
## 4: 79 Franklin St & W Broadway 5430.08 40.71912
## 5: 82 St James Pl & Pearl St 5167.06 40.71117
## 6: 82 St James Pl & Pearl St 5167.06 40.71117
## data.stations.lon data.stations.region_id data.stations.rental_methods
## 1: -73.99393 71 KEY
## 2: -73.99393 71 CREDITCARD
## 3: -74.00667 71 KEY
## 4: -74.00667 71 CREDITCARD
## 5: -74.00017 71 KEY
## 6: -74.00017 71 CREDITCARD
## data.stations.capacity data.stations.eightd_has_key_dispenser
## 1: 39 FALSE
## 2: 39 FALSE
## 3: 33 FALSE
## 4: 33 FALSE
## 5: 27 FALSE
## 6: 27 FALSE
也就是说,创建一个 list
的 data.frame
,每列有 9 列,然后创建 rbind
个。这样,在转换为 matrix
.
时就不会出现数据强制问题
这导致 6 行 x 9 列 data.table
。不确定要使用什么规则来删除行以最终只有 3 行....
但我认为您正在尝试解决一个不存在的问题。尝试像这样读取您的数据:
library(jsonlite)
x <- fromJSON("https://gbfs.citibikenyc.com/gbfs/en/station_information.json")
head(x[[3]]$stations)
## station_id name short_name lat lon region_id
## 1 72 W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71
## 2 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71
## 3 82 St James Pl & Pearl St 5167.06 40.71117 -74.00017 71
## 4 83 Atlantic Ave & Fort Greene Pl 4354.07 40.68383 -73.97632 71
## 5 116 W 17 St & 8 Ave 6148.02 40.74178 -74.00150 71
## 6 119 Park Ave & St Edwards St 4700.06 40.69609 -73.97803 71
## rental_methods capacity eightd_has_key_dispenser
## 1 KEY, CREDITCARD 39 FALSE
## 2 KEY, CREDITCARD 33 FALSE
## 3 KEY, CREDITCARD 27 FALSE
## 4 KEY, CREDITCARD 62 FALSE
## 5 KEY, CREDITCARD 39 FALSE
## 6 KEY, CREDITCARD 19 FALSE
dim(x[[3]]$stations)
# [1] 665 9
您可以使用矩阵,但要确保所有因子列都是字符,即
ind <- sapply(df, is.factor)
df[ind] <- lapply(df[ind], as.character)
final_df <- as.data.frame(matrix(unlist(df), ncol = 9, byrow = TRUE))
final_df[c(TRUE, FALSE),]
# V1 V2 V3 V4 V5 V6 V7 V8 V9
#1 72 72 W 52 St & 11 Ave W 52 St & 11 Ave 6926.01 6926.01 40.76727216 40.76727216 -73.99392888
#3 79 79 Franklin St & W Broadway Franklin St & W Broadway 5430.08 5430.08 40.71911552 40.71911552 -74.00666661
#5 82 82 St James Pl & Pearl St St James Pl & Pearl St 5167.06 5167.06 40.71117416 40.71117416 -74.00016545
另一方面,正如@A5C1D2H2I1M1N2O1R2T1 指出的那样,您可能正在寻找这个:
as.data.frame(matrix(c(t(df)), ncol = 9, byrow = TRUE))
# V1 V2 V3 V4 V5 V6 V7 V8 V9
#1 72 W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71 KEY 39 FALSE
#2 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71 KEY 33 FALSE
#3 82 St James Pl & Pearl St 5167.06 40.71117 -74.00017 71 KEY 27 FALSE
#4 72 W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71 CREDITCARD 39 FALSE
#5 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71 CREDITCARD 33 FALSE
#6 82 St James Pl & Pearl St 5167.06 40.71117 -74.00017 71 CREDITCARD 27 FALSE
我使用 rjson
导入了一个 json 文件并将其转换为 data.frame,但所有数据都是横向分布的,列名包含关键信息。
stations <- fromJSON(file = "station_information.json")
test <- as.data.frame(stations[3])
这看起来像:
> dim(test)
[1] 2 5985
> test[1:27]
data.stations.station_id data.stations.name data.stations.short_name
1 72 W 52 St & 11 Ave 6926.01
2 72 W 52 St & 11 Ave 6926.01
data.stations.lat data.stations.lon data.stations.region_id
1 40.76727 -73.99393 71
2 40.76727 -73.99393 71
data.stations.rental_methods data.stations.capacity
1 KEY 39
2 CREDITCARD 39
data.stations.eightd_has_key_dispenser data.stations.station_id.1
1 FALSE 79
2 FALSE 79
data.stations.name.1 data.stations.short_name.1 data.stations.lat.1
1 Franklin St & W Broadway 5430.08 40.71912
2 Franklin St & W Broadway 5430.08 40.71912
data.stations.lon.1 data.stations.region_id.1 data.stations.rental_methods.1
1 -74.00667 71 KEY
2 -74.00667 71 CREDITCARD
data.stations.capacity.1 data.stations.eightd_has_key_dispenser.1
1 33 FALSE
2 33 FALSE
data.stations.station_id.2 data.stations.name.2 data.stations.short_name.2
1 82 St James Pl & Pearl St 5167.06
2 82 St James Pl & Pearl St 5167.06
data.stations.lat.2 data.stations.lon.2 data.stations.region_id.2
1 40.71117 -74.00017 71
2 40.71117 -74.00017 71
data.stations.rental_methods.2 data.stations.capacity.2
1 KEY 27
2 CREDITCARD 27
data.stations.eightd_has_key_dispenser.2
1 FALSE
2 FALSE
如您所见,这无法通过简单的转置 t()
或 melt()
解决方案来解决。我想知道我在导入或转换为 data.frame 时做错了什么,这给我留下了一个数据框,该数据框拉伸了应该是行附加到列名的索引。
我已经尝试了这两种方法,但我得到的是相同的拉伸数据:
plyr::ldply(stations, data.frame)
do.call(rbind, lapply(stations, data.frame, stringsAsFactors=FALSE))
最后我希望我的输出看起来像每 9 列 "cut" 并堆叠到前 9 列 - 这样我就剩下 655 行和 9 列 如有任何建议,我们将不胜感激。
注意:我直接从这个link中获取JSON(它不是一个大文件)
这是前 27 列的可重现示例,应将其重塑为 9 x 3 数据框:
> dput(df)
structure(list(data.stations.station_id = structure(c(1L, 1L), class = "factor", .Label = "72"),
data.stations.name = structure(c(1L, 1L), class = "factor", .Label = "W 52 St & 11 Ave"),
data.stations.short_name = structure(c(1L, 1L), class = "factor", .Label = "6926.01"),
data.stations.lat = c(40.76727216, 40.76727216), data.stations.lon = c(-73.99392888,
-73.99392888), data.stations.region_id = c(71, 71), data.stations.rental_methods = structure(c(2L,
1L), .Label = c("CREDITCARD", "KEY"), class = "factor"),
data.stations.capacity = c(39, 39), data.stations.eightd_has_key_dispenser = c(FALSE,
FALSE), data.stations.station_id.1 = structure(c(1L, 1L), class = "factor", .Label = "79"),
data.stations.name.1 = structure(c(1L, 1L), class = "factor", .Label = "Franklin St & W Broadway"),
data.stations.short_name.1 = structure(c(1L, 1L), class = "factor", .Label = "5430.08"),
data.stations.lat.1 = c(40.71911552, 40.71911552), data.stations.lon.1 = c(-74.00666661,
-74.00666661), data.stations.region_id.1 = c(71, 71), data.stations.rental_methods.1 = structure(c(2L,
1L), .Label = c("CREDITCARD", "KEY"), class = "factor"),
data.stations.capacity.1 = c(33, 33), data.stations.eightd_has_key_dispenser.1 = c(FALSE,
FALSE), data.stations.station_id.2 = structure(c(1L, 1L), class = "factor", .Label = "82"),
data.stations.name.2 = structure(c(1L, 1L), class = "factor", .Label = "St James Pl & Pearl St"),
data.stations.short_name.2 = structure(c(1L, 1L), class = "factor", .Label = "5167.06"),
data.stations.lat.2 = c(40.71117416, 40.71117416), data.stations.lon.2 = c(-74.00016545,
-74.00016545), data.stations.region_id.2 = c(71, 71), data.stations.rental_methods.2 = structure(c(2L,
1L), .Label = c("CREDITCARD", "KEY"), class = "factor"),
data.stations.capacity.2 = c(27, 27), data.stations.eightd_has_key_dispenser.2 = c(FALSE,
FALSE)), .Names = c("data.stations.station_id", "data.stations.name",
"data.stations.short_name", "data.stations.lat", "data.stations.lon",
"data.stations.region_id", "data.stations.rental_methods", "data.stations.capacity",
"data.stations.eightd_has_key_dispenser", "data.stations.station_id.1",
"data.stations.name.1", "data.stations.short_name.1", "data.stations.lat.1",
"data.stations.lon.1", "data.stations.region_id.1", "data.stations.rental_methods.1",
"data.stations.capacity.1", "data.stations.eightd_has_key_dispenser.1",
"data.stations.station_id.2", "data.stations.name.2", "data.stations.short_name.2",
"data.stations.lat.2", "data.stations.lon.2", "data.stations.region_id.2",
"data.stations.rental_methods.2", "data.stations.capacity.2",
"data.stations.eightd_has_key_dispenser.2"), row.names = c(NA,
-2L), class = "data.frame")
所以输出结构应该是这样的(显然值不是 NA)。每行代表原始数据框列名的附加索引号
> output
data.stations.station_id data.stations.name data.stations.short_name
1 NA NA NA
2 NA NA NA
3 NA NA NA
data.stations.lat data.stations.lon data.stations.region_id
1 NA NA NA
2 NA NA NA
3 NA NA NA
data.stations.rental_methods data.stations.capacity
1 NA NA
2 NA NA
3 NA NA
data.stations.eightd_has_key_dispenser
1 NA
2 NA
3 NA
我会尝试:
library(data.table)
rbindlist(lapply(split(seq_along(df), c(0, (seq_along(df)%/%9)[-length(df)])),
function(x) df[, x]), use.names = FALSE)
## data.stations.station_id data.stations.name data.stations.short_name data.stations.lat
## 1: 72 W 52 St & 11 Ave 6926.01 40.76727
## 2: 72 W 52 St & 11 Ave 6926.01 40.76727
## 3: 79 Franklin St & W Broadway 5430.08 40.71912
## 4: 79 Franklin St & W Broadway 5430.08 40.71912
## 5: 82 St James Pl & Pearl St 5167.06 40.71117
## 6: 82 St James Pl & Pearl St 5167.06 40.71117
## data.stations.lon data.stations.region_id data.stations.rental_methods
## 1: -73.99393 71 KEY
## 2: -73.99393 71 CREDITCARD
## 3: -74.00667 71 KEY
## 4: -74.00667 71 CREDITCARD
## 5: -74.00017 71 KEY
## 6: -74.00017 71 CREDITCARD
## data.stations.capacity data.stations.eightd_has_key_dispenser
## 1: 39 FALSE
## 2: 39 FALSE
## 3: 33 FALSE
## 4: 33 FALSE
## 5: 27 FALSE
## 6: 27 FALSE
也就是说,创建一个 list
的 data.frame
,每列有 9 列,然后创建 rbind
个。这样,在转换为 matrix
.
这导致 6 行 x 9 列 data.table
。不确定要使用什么规则来删除行以最终只有 3 行....
但我认为您正在尝试解决一个不存在的问题。尝试像这样读取您的数据:
library(jsonlite)
x <- fromJSON("https://gbfs.citibikenyc.com/gbfs/en/station_information.json")
head(x[[3]]$stations)
## station_id name short_name lat lon region_id
## 1 72 W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71
## 2 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71
## 3 82 St James Pl & Pearl St 5167.06 40.71117 -74.00017 71
## 4 83 Atlantic Ave & Fort Greene Pl 4354.07 40.68383 -73.97632 71
## 5 116 W 17 St & 8 Ave 6148.02 40.74178 -74.00150 71
## 6 119 Park Ave & St Edwards St 4700.06 40.69609 -73.97803 71
## rental_methods capacity eightd_has_key_dispenser
## 1 KEY, CREDITCARD 39 FALSE
## 2 KEY, CREDITCARD 33 FALSE
## 3 KEY, CREDITCARD 27 FALSE
## 4 KEY, CREDITCARD 62 FALSE
## 5 KEY, CREDITCARD 39 FALSE
## 6 KEY, CREDITCARD 19 FALSE
dim(x[[3]]$stations)
# [1] 665 9
您可以使用矩阵,但要确保所有因子列都是字符,即
ind <- sapply(df, is.factor)
df[ind] <- lapply(df[ind], as.character)
final_df <- as.data.frame(matrix(unlist(df), ncol = 9, byrow = TRUE))
final_df[c(TRUE, FALSE),]
# V1 V2 V3 V4 V5 V6 V7 V8 V9
#1 72 72 W 52 St & 11 Ave W 52 St & 11 Ave 6926.01 6926.01 40.76727216 40.76727216 -73.99392888
#3 79 79 Franklin St & W Broadway Franklin St & W Broadway 5430.08 5430.08 40.71911552 40.71911552 -74.00666661
#5 82 82 St James Pl & Pearl St St James Pl & Pearl St 5167.06 5167.06 40.71117416 40.71117416 -74.00016545
另一方面,正如@A5C1D2H2I1M1N2O1R2T1 指出的那样,您可能正在寻找这个:
as.data.frame(matrix(c(t(df)), ncol = 9, byrow = TRUE))
# V1 V2 V3 V4 V5 V6 V7 V8 V9
#1 72 W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71 KEY 39 FALSE
#2 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71 KEY 33 FALSE
#3 82 St James Pl & Pearl St 5167.06 40.71117 -74.00017 71 KEY 27 FALSE
#4 72 W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71 CREDITCARD 39 FALSE
#5 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71 CREDITCARD 33 FALSE
#6 82 St James Pl & Pearl St 5167.06 40.71117 -74.00017 71 CREDITCARD 27 FALSE