使用 cut 为一个时间变量创建 24 个类别

Use cut to create 24 categories for a time variable

我在这里导入数据并对其做了一些预处理(问题很可能不在这一部分,修复也多半不需要改动这里):


# Bin definitions for grouping HHMM clock times by hour of day.
# 25 break points (0, 100, ..., 2400) delimit exactly 24 one-hour bins, and
# the 24 labels 0..23 name the hour at which each bin starts. The previous
# definition carried a 26th break at 2500 and a 25th label, i.e. one bin more
# than the 24 hours in a day.
lab_var_num <- 0:23
times_var <- seq(0, 2400, by = 100)

# Load the combined log file.
all_files_ls <- read_csv(
  "~/Desktop/bioinformatic_work/log_parse_files/sorted_by_habitat/all_trap/all_files_la_selva_log.csv"
)
# Keep only readings with 10 < temp < 50. Everything else, including rows
# whose temp is NA, is dropped; the rejected rows are not stored anywhere in
# this step.
all_files_ls <- subset(all_files_ls, temp > 10 & temp < 50)

# Parse the date column, which holds text in month/day/year form, into
# POSIXct so it can be compared against date boundaries further down.
# Every value has to follow the %m/%d/%Y layout; an entry that does not match
# the format is parsed as NA.
all_files_ls$date <- as.POSIXct(all_files_ls$date, format = "%m/%d/%Y")
# Split the data by season using fixed calendar windows for 2015 and 2016
# (presumably `_w` = wet and `_s` = dry -- confirm). Rows dated outside these
# two years end up in neither data set.
# May 1 - Dec 31 of each year.
all_files_ls_w <- subset(
  all_files_ls,
  (date >= "2015-05-01" & date <= "2015-12-31") |
    (date >= "2016-05-01" & date <= "2016-12-31")
)
# Jan 1 - Apr 30 of each year.
all_files_ls_s <- subset(
  all_files_ls,
  (date >= "2015-01-01" & date <= "2015-4-30") |
    (date >= "2016-01-01" & date <= "2016-04-30")
)

# Split each season's data by the `canopy_understory` code, accepting either
# letter case ("c"/"C" and "u"/"U"). Rows with any other code, or with a
# missing code, go to neither data frame.

all_files_ls_s_c <- subset(all_files_ls_s, canopy_understory %in% c("c", "C"))
all_files_ls_s_u <- subset(all_files_ls_s, canopy_understory %in% c("u", "U"))

all_files_ls_w_c <- subset(all_files_ls_w, canopy_understory %in% c("c", "C"))
all_files_ls_w_u <- subset(all_files_ls_w, canopy_understory %in% c("u", "U"))

# Summarise temperature per hour-of-day bin.
#
# The readings are binned BEFORE grouping. Grouping on the raw clock time
# first yields one summary row per distinct time stamp (1800, 1801, 1803, ...),
# and relabelling those rows with cut() afterwards cannot merge them -- which
# is why far more than 24 rows came back.
#
# `right = FALSE` makes every bin closed on the left, e.g. [1800, 1900), so
# 1800 and 1801 land in the same bin and an on-the-hour reading is labelled
# with its own hour. `include.lowest = TRUE` then closes the last bin on the
# right so the final break value is still binned.
#
# data:   data frame with a `time` column (clock time as HHMM) and a `temp`
#         column.
# breaks: bin edges passed to cut(); defaults to `times_var`.
# labels: one label per bin; defaults to `lab_var_num`.
# Returns one row per bin that contains data (times outside the breaks form
# an NA group), with columns `time` (factor), `standard_deviation` and `mean`.
summarise_temp_by_hour <- function(data,
                                   breaks = times_var,
                                   labels = lab_var_num) {
  data %>%
    group_by(
      time = cut(
        # as.character() first so that a factor column yields its values
        # rather than its integer codes.
        as.numeric(as.character(time)),
        breaks = breaks,
        labels = labels,
        include.lowest = TRUE,
        right = FALSE
      )
    ) %>%
    summarise(
      standard_deviation = sd(temp, na.rm = TRUE),
      mean = mean(temp, na.rm = TRUE)
    )
}

all_files_ls_s_c_summ <- summarise_temp_by_hour(all_files_ls_s_c)
all_files_ls_s_u_summ <- summarise_temp_by_hour(all_files_ls_s_u)
all_files_ls_w_c_summ <- summarise_temp_by_hour(all_files_ls_w_c)
all_files_ls_w_u_summ <- summarise_temp_by_hour(all_files_ls_w_u)

当我检查 cut 函数产生的数据时,我得到的类别比我想要的 24 个类别多得多。


  trap        serial_no                           file_name canopy_understory       date  time  temp humidity
1  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28   600  20.1     <NA>
2  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28   800  25.5     <NA>
3  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1000  29.0     <NA>
4  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1200  28.0     <NA>
5  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1400  28.5     <NA>
6  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1601  27.5     <NA>
7  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1803  25.5     <NA>
8  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  2001  23.5     <NA>
9  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  2200  22.5     <NA>
10 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-29   000  21.5     <NA>
11  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0159  23.6     <NA>
12  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0359  24.1     <NA>
13  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0559  24.1     <NA>
14  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0759  24.6     <NA>
15  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0959  24.6     <NA>
16  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1159  26.1     <NA>
17  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1359  26.6     <NA>
18  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1559  25.6     <NA>
19  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1759  24.1     <NA>
20 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1959  24.1     <NA>




我先用 group_by 按时间点计算汇总统计量,然后尝试用 cut 把某个整点附近的每个数据点都归到该整点。例如,时间为 1801 的记录应该与 1800 的记录归为一组。但 group_by 只会把 "time" 完全相同的数据点放在一起,而我想要的是把相邻的时间点合并到同一组。

我不明白为什么我会得到 58 个类别,而我期望得到 24 个类别。

与其把 data.frame 拆分成多个单独的数据框、再对它们分别执行相同的操作,不如直接按多个变量分组。您可以使用 lubridate::month 从每个日期中提取数字形式的月份(在 base R 中可以使用 strptime(df$date, '%Y-%m-%d')$mon + 1),然后用 ifelse 创建一个新的分组变量,而不必使用带有重复标签的 cut(在 R >= 3.4.0 中这会报错)。所有分组变量设置好之后,汇总就既简单又符合 DRY(不重复)原则:


# Summarise temperature per canopy/understory code, season and hour bin in a
# single pass, instead of splitting `df` into separate data frames first.
df %>%
  mutate(
    # Numeric month of each date: months 1-4 become "s", months 5-12 "w".
    season = ifelse(lubridate::month(date) < 5, 's', 'w'),
    # Bin `time` on the breaks 0, 100, ..., 2400; `include.lowest = TRUE`
    # closes the first bin on the left so that a time of 0 is kept.
    hour = cut(time, seq(0, 2400, 100), include.lowest = TRUE)
  ) %>%
  group_by(canopy_understory, season, hour) %>%
  summarise(
    # Mean and standard deviation of temp within each group.
    temp_mean = mean(temp),
    temp_sd = sd(temp)
  )

#> # A tibble: 20 x 5
#> # Groups: canopy_understory, season [?]
#>    canopy_understory season              hour temp_mean temp_sd
#>               <fctr>  <chr>            <fctr>     <dbl>   <dbl>
#>  1                 c      w           [0,100]      21.5      NA
#>  2                 c      w         (500,600]      20.1      NA
#>  3                 c      w         (700,800]      25.5      NA
#>  4                 c      w       (900,1e+03]      29.0      NA
#>  5                 c      w (1.1e+03,1.2e+03]      28.0      NA
#>  6                 c      w (1.3e+03,1.4e+03]      28.5      NA
#>  7                 c      w (1.6e+03,1.7e+03]      27.5      NA
#>  8                 c      w (1.8e+03,1.9e+03]      25.5      NA
#>  9                 c      w   (2e+03,2.1e+03]      23.5      NA
#> 10                 c      w (2.1e+03,2.2e+03]      22.5      NA
#> 11                 u      s         (100,200]      23.6      NA
#> 12                 u      s         (300,400]      24.1      NA
#> 13                 u      s         (500,600]      24.1      NA
#> 14                 u      s         (700,800]      24.6      NA
#> 15                 u      s       (900,1e+03]      24.6      NA
#> 16                 u      s (1.1e+03,1.2e+03]      26.1      NA
#> 17                 u      s (1.3e+03,1.4e+03]      26.6      NA
#> 18                 u      s (1.5e+03,1.6e+03]      25.6      NA
#> 19                 u      s (1.7e+03,1.8e+03]      24.1      NA
#> 20                 u      s   (1.9e+03,2e+03]      24.1      NA

样本数据的标准差为 NA,因为每组中只有一个观测值;在更大的数据集上,这段代码应该能给出有效的标准差。


# Sample data used by the pipeline above: a 20-row data.frame written as a
# structure() literal (looks like dput() output). Columns: trap, serial_no,
# file_name, canopy_understory, date, time, temp, humidity.
# Note that `date` is stored as a factor of "YYYY-MM-DD" strings, `time` is an
# integer clock time, and `humidity` is a factor whose only level is the
# literal string "<NA>" (not a real missing value).
df <- structure(list(trap = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("LS_trap_10c", 
    "LS_trap_10u"), class = "factor"), serial_no = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L), .Label = c("7C000000395C1641", "9F00000039641541"
    ), class = "factor"), file_name = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
    ), .Label = c("trap10c_7C000000395C1641_150809.csv", "trap10u_9F00000039641541_160110.csv"
    ), class = "factor"), canopy_understory = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L), .Label = c("c", "u"), class = "factor"), date = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L), .Label = c("2015-05-28", "2015-05-29", "2016-01-01"
    ), class = "factor"), time = c(600L, 800L, 1000L, 1200L, 1400L, 
    1601L, 1803L, 2001L, 2200L, 0L, 159L, 359L, 559L, 759L, 959L, 
    1159L, 1359L, 1559L, 1759L, 1959L), temp = c(20.1, 25.5, 29, 
    28, 28.5, 27.5, 25.5, 23.5, 22.5, 21.5, 23.6, 24.1, 24.1, 24.6, 
    24.6, 26.1, 26.6, 25.6, 24.1, 24.1), humidity = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L), .Label = "<NA>", class = "factor")), .Names = c("trap", 
    "serial_no", "file_name", "canopy_understory", "date", "time", 
    "temp", "humidity"), class = "data.frame", row.names = c("1", 
    "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", 
    "14", "15", "16", "17", "18", "19", "20"))