计算相似字符串之间的差异
Calculating the differences between similar strings
我有这个数据框:
# A tibble: 8 x 7
Date_Time_GMT_3 LoggerID_SiteCode value month year day time
<dttm> <chr> <dbl> <chr> <dbl> <int> <Period>
1 2021-08-01 12:00:00 X20819740_X3WR_S_Statio~ 19.9 Aug 2021 1 12H 0M 0S
2 2021-08-01 12:00:00 X21092860_X3WR_U_Compare 19.3 Aug 2021 1 12H 0M 0S
3 2021-08-01 12:00:00 X20676906_X1WR_S_Statio~ 18.8 Aug 2021 1 1H 45M 0S
4 2021-08-01 12:00:00 X21092863_X1WR_U_Compare NA Aug 2021 1 1H 45M 0S
5 2021-08-01 12:00:00 X20817726_14WR_S_Statio~ 19.9 Aug 2021 1 2H 15M 0S
6 2021-08-01 12:00:00 X21092877_14WR_U_Compare 19.3 Aug 2021 1 2H 15M 0S
7 2021-08-01 12:00:00 X20819740_X3WR_S_Statio~ 17.2 Sept 2021 1 1H 45M 0S
8 2021-08-01 12:00:00 X21092860_X3WR_U_Compare 17.0 Sept 2021 1 1H 45M 0S
我的个人数据框有点大,上面的代码只显示了前几行。
我正在尝试计算包含相同 SiteCode
的 LoggerID_SiteCode
字符串的值差异,例如当它们都包含 X3WR
时,需要计算 X20819740_X3WR_S_Stationary 和 X21092860_X3WR_U_Compare
之间的差异。但重要的是,计算这两个值的差异时,它们必须是同一日期/时间的值。有人知道怎么做吗?
数据
# Reproducible example data: eight logger readings forming four
# Stationary/Compare pairs. The S4 class "Period" used for `time` must be
# available before this runs (it looks like lubridate's Period class).
df <- local({
  n_rows <- 8L

  # Timestamp column, spelled out field by field as a POSIXlt object; every
  # row is 2021-08-01 12:00:00 with time zone "EST".
  date_time <- structure(
    list(
      sec = rep(0, n_rows),
      min = rep(0L, n_rows),
      hour = rep(12L, n_rows),
      mday = rep(1L, n_rows),
      mon = rep(7L, n_rows),    # POSIXlt months are 0-based: 7 is August
      year = rep(121, n_rows),  # years since 1900; double, as in the dput
      wday = rep(0L, n_rows),
      yday = rep(212L, n_rows),
      isdst = rep(0L, n_rows),
      zone = rep("EST", n_rows),
      gmtoff = rep(NA_integer_, n_rows)
    ),
    tzone = "EST",
    class = c("POSIXlt", "POSIXt")
  )

  # Logger labels: the middle field (X3WR, X1WR, 14WR) is the site code.
  logger_site <- c(
    "X20819740_X3WR_S_Stationary", "X21092860_X3WR_U_Compare",
    "X20676906_X1WR_S_Stationary", "X21092863_X1WR_U_Compare",
    "X20817726_14WR_S_Stationary", "X21092877_14WR_U_Compare",
    "X20819740_X3WR_S_Stationary", "X21092860_X3WR_U_Compare"
  )

  # Time of day per reading; only the hour and minute slots are non-zero.
  time_of_day <- new(
    "Period",
    .Data = rep(0, n_rows),
    year = rep(0, n_rows),
    month = rep(0, n_rows),
    day = rep(0, n_rows),
    hour = c(12, 12, 1, 1, 2, 2, 1, 1),
    minute = c(0, 0, 45, 45, 15, 15, 45, 45)
  )

  structure(
    list(
      Date_Time_GMT_3 = date_time,
      LoggerID_SiteCode = logger_site,
      value = c(19.948, 19.321, 18.806, NA, 19.948, 19.300, 17.201, 17.012),
      month = rep(c("Aug", "Sept"), times = c(6L, 2L)),
      year = rep(2021, n_rows),
      day = rep(1L, n_rows),
      time = time_of_day
    ),
    row.names = c(NA, -n_rows),
    class = c("tbl_df", "tbl", "data.frame")
  )
})
您可以使用 group_by
和 substr
。注意 diff
函数没有 na.rm 参数(传入 na.rm = TRUE 会被忽略);如果不想让结果中出现 NA,可以在 summarise 之后加上 filter(!is.na(diff))
。
# For each site and timestamp, take the difference between consecutive
# readings in row order (second row minus first row of each pair).
df %>%
  # Characters 11-14 of the logger label hold the site code (e.g. "X3WR");
  # this relies on the ID before the first "_" being exactly 9 characters.
  mutate(SiteCode = substr(LoggerID_SiteCode, 11, 14)) %>%
  # Readings are paired only when they share both the site and the date/time.
  group_by(month, year, day, time, SiteCode) %>%
  summarise(diff = diff(value))
# A tibble: 4 x 6
# Groups: month, year, day, time [4]
month year day time SiteCode diff
<chr> <dbl> <int> <Period> <chr> <dbl>
1 Aug 2021 1 1H 45M 0S X1WR NA
2 Aug 2021 1 2H 15M 0S 14WR -0.648
3 Aug 2021 1 12H 0M 0S X3WR -0.627
4 Sept 2021 1 1H 45M 0S X3WR -0.189
我有这个数据框:
# A tibble: 8 x 7
Date_Time_GMT_3 LoggerID_SiteCode value month year day time
<dttm> <chr> <dbl> <chr> <dbl> <int> <Period>
1 2021-08-01 12:00:00 X20819740_X3WR_S_Statio~ 19.9 Aug 2021 1 12H 0M 0S
2 2021-08-01 12:00:00 X21092860_X3WR_U_Compare 19.3 Aug 2021 1 12H 0M 0S
3 2021-08-01 12:00:00 X20676906_X1WR_S_Statio~ 18.8 Aug 2021 1 1H 45M 0S
4 2021-08-01 12:00:00 X21092863_X1WR_U_Compare NA Aug 2021 1 1H 45M 0S
5 2021-08-01 12:00:00 X20817726_14WR_S_Statio~ 19.9 Aug 2021 1 2H 15M 0S
6 2021-08-01 12:00:00 X21092877_14WR_U_Compare 19.3 Aug 2021 1 2H 15M 0S
7 2021-08-01 12:00:00 X20819740_X3WR_S_Statio~ 17.2 Sept 2021 1 1H 45M 0S
8 2021-08-01 12:00:00 X21092860_X3WR_U_Compare 17.0 Sept 2021 1 1H 45M 0S
我的个人数据框有点大,上面的代码只显示了前几行。
我正在尝试计算包含相同 SiteCode
的 LoggerID_SiteCode
字符串的值差异,例如当它们都包含 X3WR
时,需要计算 X20819740_X3WR_S_Stationary 和 X21092860_X3WR_U_Compare
之间的差异。但重要的是,计算这两个值的差异时,它们必须是同一日期/时间的值。有人知道怎么做吗?
数据
# Reproducible example data: eight logger readings forming four
# Stationary/Compare pairs. The S4 class "Period" used for `time` must be
# available before this runs (it looks like lubridate's Period class).
df <- local({
  n_rows <- 8L

  # Timestamp column, spelled out field by field as a POSIXlt object; every
  # row is 2021-08-01 12:00:00 with time zone "EST".
  date_time <- structure(
    list(
      sec = rep(0, n_rows),
      min = rep(0L, n_rows),
      hour = rep(12L, n_rows),
      mday = rep(1L, n_rows),
      mon = rep(7L, n_rows),    # POSIXlt months are 0-based: 7 is August
      year = rep(121, n_rows),  # years since 1900; double, as in the dput
      wday = rep(0L, n_rows),
      yday = rep(212L, n_rows),
      isdst = rep(0L, n_rows),
      zone = rep("EST", n_rows),
      gmtoff = rep(NA_integer_, n_rows)
    ),
    tzone = "EST",
    class = c("POSIXlt", "POSIXt")
  )

  # Logger labels: the middle field (X3WR, X1WR, 14WR) is the site code.
  logger_site <- c(
    "X20819740_X3WR_S_Stationary", "X21092860_X3WR_U_Compare",
    "X20676906_X1WR_S_Stationary", "X21092863_X1WR_U_Compare",
    "X20817726_14WR_S_Stationary", "X21092877_14WR_U_Compare",
    "X20819740_X3WR_S_Stationary", "X21092860_X3WR_U_Compare"
  )

  # Time of day per reading; only the hour and minute slots are non-zero.
  time_of_day <- new(
    "Period",
    .Data = rep(0, n_rows),
    year = rep(0, n_rows),
    month = rep(0, n_rows),
    day = rep(0, n_rows),
    hour = c(12, 12, 1, 1, 2, 2, 1, 1),
    minute = c(0, 0, 45, 45, 15, 15, 45, 45)
  )

  structure(
    list(
      Date_Time_GMT_3 = date_time,
      LoggerID_SiteCode = logger_site,
      value = c(19.948, 19.321, 18.806, NA, 19.948, 19.300, 17.201, 17.012),
      month = rep(c("Aug", "Sept"), times = c(6L, 2L)),
      year = rep(2021, n_rows),
      day = rep(1L, n_rows),
      time = time_of_day
    ),
    row.names = c(NA, -n_rows),
    class = c("tbl_df", "tbl", "data.frame")
  )
})
您可以使用 group_by
和 substr
。注意 diff
函数没有 na.rm 参数(传入 na.rm = TRUE 会被忽略);如果不想让结果中出现 NA,可以在 summarise 之后加上 filter(!is.na(diff))
。
# For each site and timestamp, take the difference between consecutive
# readings in row order (second row minus first row of each pair).
df %>%
  # Characters 11-14 of the logger label hold the site code (e.g. "X3WR");
  # this relies on the ID before the first "_" being exactly 9 characters.
  mutate(SiteCode = substr(LoggerID_SiteCode, 11, 14)) %>%
  # Readings are paired only when they share both the site and the date/time.
  group_by(month, year, day, time, SiteCode) %>%
  summarise(diff = diff(value))
# A tibble: 4 x 6
# Groups: month, year, day, time [4]
month year day time SiteCode diff
<chr> <dbl> <int> <Period> <chr> <dbl>
1 Aug 2021 1 1H 45M 0S X1WR NA
2 Aug 2021 1 2H 15M 0S 14WR -0.648
3 Aug 2021 1 12H 0M 0S X3WR -0.627
4 Sept 2021 1 1H 45M 0S X3WR -0.189