计算相似字符串之间的差异

Calculating the differences between similar strings

我有这个数据框:

# A tibble: 8 x 7
  Date_Time_GMT_3     LoggerID_SiteCode        value month  year   day time     
  <dttm>              <chr>                    <dbl> <chr> <dbl> <int> <Period> 
1 2021-08-01 12:00:00 X20819740_X3WR_S_Statio~  19.9 Aug    2021     1 12H 0M 0S
2 2021-08-01 12:00:00 X21092860_X3WR_U_Compare  19.3 Aug    2021     1 12H 0M 0S
3 2021-08-01 12:00:00 X20676906_X1WR_S_Statio~  18.8 Aug    2021     1 1H 45M 0S
4 2021-08-01 12:00:00 X21092863_X1WR_U_Compare  NA   Aug    2021     1 1H 45M 0S
5 2021-08-01 12:00:00 X20817726_14WR_S_Statio~  19.9 Aug    2021     1 2H 15M 0S
6 2021-08-01 12:00:00 X21092877_14WR_U_Compare  19.3 Aug    2021     1 2H 15M 0S
7 2021-08-01 12:00:00 X20819740_X3WR_S_Statio~  17.2 Sept   2021     1 1H 45M 0S
8 2021-08-01 12:00:00 X21092860_X3WR_U_Compare  17.0 Sept   2021     1 1H 45M 0S

我的实际数据框要大得多,上面的输出只显示了前几行。

我正在尝试计算 LoggerID_SiteCode 字符串中包含相同 SiteCode 的行之间 value 的差异。例如,当两行都包含 X3WR 时,就计算 X20819740_X3WR_S_Stationary 和 X21092860_X3WR_U_Compare 之间的差异。但重要的是,用来计算差异的两个值必须出现在相同的日期/时间。有人知道怎么做吗?

数据

# Example data: dput-style reconstruction of the 8-row tibble printed above.
# NOTE(review): the `time` column is built with new("Period", ...), which is
# presumably lubridate's S4 Period class -- that package must be loaded first.
df <- structure(
  list(
    # POSIXlt broken-down time; every row is 2021-08-01 12:00:00 in zone "EST".
    Date_Time_GMT_3 = structure(
      list(
        sec = rep(0, 8),
        min = rep(0L, 8),
        hour = rep(12L, 8),
        mday = rep(1L, 8),
        mon = rep(7L, 8),
        # The original literal mixes integer and double entries, so the
        # vector is a double; keep it that way.
        year = rep(121, 8),
        wday = rep(0L, 8),
        yday = rep(212L, 8),
        isdst = rep(0L, 8),
        zone = rep("EST", 8),
        gmtoff = rep(NA_integer_, 8)
      ),
      tzone = "EST",
      class = c("POSIXlt", "POSIXt")
    ),
    LoggerID_SiteCode = c(
      "X20819740_X3WR_S_Stationary",
      "X21092860_X3WR_U_Compare",
      "X20676906_X1WR_S_Stationary",
      "X21092863_X1WR_U_Compare",
      "X20817726_14WR_S_Stationary",
      "X21092877_14WR_U_Compare",
      "X20819740_X3WR_S_Stationary",
      "X21092860_X3WR_U_Compare"
    ),
    value = c(19.948, 19.321, 18.806, NA, 19.948, 19.3, 17.201, 17.012),
    month = rep(c("Aug", "Sept"), times = c(6, 2)),
    year = rep(2021, 8),
    day = rep(1L, 8),
    # Time of day as a Period: only the hour and minute slots are non-zero.
    time = new(
      "Period",
      .Data = rep(0, 8),
      year = rep(0, 8),
      month = rep(0, 8),
      day = rep(0, 8),
      hour = c(12, 12, 1, 1, 2, 2, 1, 1),
      minute = c(0, 0, 45, 45, 15, 15, 45, 45)
    )
  ),
  row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame")
)

您可以使用 group_by 和 substr。注意:base R 的 diff() 并没有 na.rm 参数(传入 na.rm = TRUE 会被直接忽略);如果不想让 NA 出现在结果中,可以在 summarise 之后加上 filter(!is.na(diff))。

# Difference in `value` between loggers that share a site code and were
# recorded at the same month / year / day / time.
#
# The site code is taken as the second "_"-delimited field of
# LoggerID_SiteCode (e.g. "X20819740_X3WR_S_Stationary" -> "X3WR"). This
# replaces the fixed-position substr(LoggerID_SiteCode, 11, 14), which only
# works when the logger ID is exactly 9 characters long and the site code is
# exactly 4. Strings that do not match the pattern are left unchanged by sub().
df %>%
  group_by(month, year, day, time,
           SiteCode = sub("^[^_]*_([^_]*)_.*$", "\\1", LoggerID_SiteCode)) %>%
  # diff() is "later row minus earlier row" within each group; in this data
  # the Stationary row precedes the Compare row, so the result is
  # Compare - Stationary. A missing value in either row yields NA.
  summarise(diff = diff(value))

# A tibble: 4 x 6
# Groups:   month, year, day, time [4]
  month  year   day time      SiteCode   diff
  <chr> <dbl> <int> <Period>  <chr>     <dbl>
1 Aug    2021     1 1H 45M 0S X1WR     NA    
2 Aug    2021     1 2H 15M 0S 14WR     -0.648
3 Aug    2021     1 12H 0M 0S X3WR     -0.627
4 Sept   2021     1 1H 45M 0S X3WR     -0.189