Spread 和 dcast 未正确对齐
Spread and dcast not aligning correctly
我正在重新查看一些现在无法正常工作的代码，但弄不清楚为什么在使用 spread 或 dcast 时，变量没有像预期那样与日期对齐。所有软件包都是最新的。
请注意，使用 spread 时，tmean 单独占据了一行，而没有与 1996-apr 对齐。dcast 也会出现同样的情况。
这是一个例子:
dput:
library(reshape2)
library(tidyr)
dat <- structure(list(gridNumber = c(266783L, 266783L, 266783L, 266783L,
266783L, 266783L, 266783L, 266783L, 266783L), fips = c(9005L,
9005L, 9005L, 9005L, 9005L, 9005L, 9005L, 9005L, 9005L), cropArea = c(0,
0, 0, 0, 0, 0, 0, 0, 0), state = structure(c(8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L), .Label = c("AK", "AL", "AR", "AS", "AZ",
"CA", "CO", "CT", "DC", "DE", "FL", "GA", "GU", "HI", "IA", "ID",
"IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO",
"MP", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY",
"OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN", "TX", "UM",
"UT", "VA", "VI", "VT", "WA", "WI", "WV", "WY"), class = "factor"),
county_name = c("Litchfield County", "Litchfield County",
"Litchfield County", "Litchfield County", "Litchfield County",
"Litchfield County", "Litchfield County", "Litchfield County",
"Litchfield County"), long = c(-73.4583333333292, -73.4583333333292,
-73.4583333333292, -73.458333332921, -73.4583333333292, -73.4583333333292,
-73.4583333333292, -73.458333332921, -73.4583333333292),
lat = c(42.0416666666681, 42.0416666666681, 42.0416666666681,
42.041666666606, 42.0416666666681, 42.0416666666681, 42.0416666666681,
42.041666666606, 42.0416666666681), year = c(1996L, 1996L,
1996L, 1996L, 1996L, 1996L, 1996L, 1996L, 1996L), element = c("tmin",
"tmax", "ppt", "tmean", "tmin", "tmax", "ppt", "tmean", "tmin"
), month = c("apr", "apr", "apr", "apr", "aug", "aug", "aug",
"aug", "dec"), value = c(32.8099999785423, 53.2939998626709,
6.64250644805908, 43.0519997596741, 56.552000617981, 76.55,
2.51968, 66.5509994506836, 26.1320000171661)), .Names = c("gridNumber",
"fips", "cropArea", "state", "county_name", "long", "lat", "year",
"element", "month", "value"), class = c("tbl_df", "data.frame"
), row.names = c(NA, -9L))
spread 和 dcast 的输出：
> spread(dat, element, value)
Source: local data frame [5 x 13]
gridNumber fips cropArea state county_name long lat year month ppt tmax tmean tmin
(int) (int) (dbl) (fctr) (chr) (dbl) (dbl) (int) (chr) (dbl) (dbl) (dbl) (dbl)
1 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 apr 6.642506 53.294 NA 32.810
2 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 aug 2.519680 76.550 NA 56.552
3 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 dec NA NA NA 26.132
4 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 apr NA NA 43.052 NA
5 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 aug NA NA 66.551 NA
> dcast(dat, gridNumber + fips + cropArea + state + county_name + long + lat + year + month ~ element, value.var = "value")
gridNumber fips cropArea state county_name long lat year month ppt tmax tmean tmin
1 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 apr 6.642506 53.294 NA 32.810
2 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 aug 2.519680 76.550 NA 56.552
3 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 dec NA NA NA 26.132
4 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 apr NA NA 43.052 NA
5 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 aug NA NA 66.551 NA
您的问题与 lat 和 long 值的精度有关：
> unique(dat$long)
# [1] -73.45833 -73.45833
> unique(dat$lat)
# [1] 42.04167 42.04167
## Notice two 'unique' values that are 'printed' the same
如果您仔细查看 structure 中的数据，您会发现 lat 和 long 中的值略有不同。
例如，对 lat/long 值进行舍入即可消除这种精度“误差”：
dat$lat <- round(dat$lat, 4)
dat$long <- round(dat$long, 4)
spread(dat, element, value)
## now tmean is aligned
#gridNumber fips cropArea state county_name long lat year month ppt tmax tmean tmin
#(int) (int) (dbl) (fctr) (chr) (dbl) (dbl) (int) (chr) (dbl) (dbl) (dbl) (dbl)
#1 266783 9005 0 CT Litchfield County -73.4583 42.0417 1996 apr 6.642506 53.294 43.052 32.810
#2 266783 9005 0 CT Litchfield County -73.4583 42.0417 1996 aug 2.519680 76.550 66.551 56.552
#3 266783 9005 0 CT Litchfield County -73.4583 42.0417 1996 dec NA NA NA 26.132
我正在重新查看一些现在无法正常工作的代码，但弄不清楚为什么在使用 spread 或 dcast 时，变量没有像预期那样与日期对齐。所有软件包都是最新的。
请注意，使用 spread 时，tmean 单独占据了一行，而没有与 1996-apr 对齐。dcast 也会出现同样的情况。
这是一个例子:
dput:
library(reshape2)
library(tidyr)
dat <- structure(list(gridNumber = c(266783L, 266783L, 266783L, 266783L,
266783L, 266783L, 266783L, 266783L, 266783L), fips = c(9005L,
9005L, 9005L, 9005L, 9005L, 9005L, 9005L, 9005L, 9005L), cropArea = c(0,
0, 0, 0, 0, 0, 0, 0, 0), state = structure(c(8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L), .Label = c("AK", "AL", "AR", "AS", "AZ",
"CA", "CO", "CT", "DC", "DE", "FL", "GA", "GU", "HI", "IA", "ID",
"IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO",
"MP", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY",
"OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN", "TX", "UM",
"UT", "VA", "VI", "VT", "WA", "WI", "WV", "WY"), class = "factor"),
county_name = c("Litchfield County", "Litchfield County",
"Litchfield County", "Litchfield County", "Litchfield County",
"Litchfield County", "Litchfield County", "Litchfield County",
"Litchfield County"), long = c(-73.4583333333292, -73.4583333333292,
-73.4583333333292, -73.458333332921, -73.4583333333292, -73.4583333333292,
-73.4583333333292, -73.458333332921, -73.4583333333292),
lat = c(42.0416666666681, 42.0416666666681, 42.0416666666681,
42.041666666606, 42.0416666666681, 42.0416666666681, 42.0416666666681,
42.041666666606, 42.0416666666681), year = c(1996L, 1996L,
1996L, 1996L, 1996L, 1996L, 1996L, 1996L, 1996L), element = c("tmin",
"tmax", "ppt", "tmean", "tmin", "tmax", "ppt", "tmean", "tmin"
), month = c("apr", "apr", "apr", "apr", "aug", "aug", "aug",
"aug", "dec"), value = c(32.8099999785423, 53.2939998626709,
6.64250644805908, 43.0519997596741, 56.552000617981, 76.55,
2.51968, 66.5509994506836, 26.1320000171661)), .Names = c("gridNumber",
"fips", "cropArea", "state", "county_name", "long", "lat", "year",
"element", "month", "value"), class = c("tbl_df", "data.frame"
), row.names = c(NA, -9L))
spread 和 dcast 的输出：
> spread(dat, element, value)
Source: local data frame [5 x 13]
gridNumber fips cropArea state county_name long lat year month ppt tmax tmean tmin
(int) (int) (dbl) (fctr) (chr) (dbl) (dbl) (int) (chr) (dbl) (dbl) (dbl) (dbl)
1 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 apr 6.642506 53.294 NA 32.810
2 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 aug 2.519680 76.550 NA 56.552
3 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 dec NA NA NA 26.132
4 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 apr NA NA 43.052 NA
5 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 aug NA NA 66.551 NA
> dcast(dat, gridNumber + fips + cropArea + state + county_name + long + lat + year + month ~ element, value.var = "value")
gridNumber fips cropArea state county_name long lat year month ppt tmax tmean tmin
1 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 apr 6.642506 53.294 NA 32.810
2 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 aug 2.519680 76.550 NA 56.552
3 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 dec NA NA NA 26.132
4 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 apr NA NA 43.052 NA
5 266783 9005 0 CT Litchfield County -73.45833 42.04167 1996 aug NA NA 66.551 NA
您的问题与 lat 和 long 值的精度有关：
> unique(dat$long)
# [1] -73.45833 -73.45833
> unique(dat$lat)
# [1] 42.04167 42.04167
## Notice two 'unique' values that are 'printed' the same
如果您仔细查看 structure 中的数据，您会发现 lat 和 long 中的值略有不同。
例如，对 lat/long 值进行舍入即可消除这种精度“误差”：
dat$lat <- round(dat$lat, 4)
dat$long <- round(dat$long, 4)
spread(dat, element, value)
## now tmean is aligned
#gridNumber fips cropArea state county_name long lat year month ppt tmax tmean tmin
#(int) (int) (dbl) (fctr) (chr) (dbl) (dbl) (int) (chr) (dbl) (dbl) (dbl) (dbl)
#1 266783 9005 0 CT Litchfield County -73.4583 42.0417 1996 apr 6.642506 53.294 43.052 32.810
#2 266783 9005 0 CT Litchfield County -73.4583 42.0417 1996 aug 2.519680 76.550 66.551 56.552
#3 266783 9005 0 CT Litchfield County -73.4583 42.0417 1996 dec NA NA NA 26.132