Use cut to create 24 categories for a time variable

Here I import my data and do some manipulation of it (this is probably not where the issue/fix lies).

The first two lines set up my cut parameters.

lab_var_num <- 0:24
times_var <- c(0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500)


library(readr)
library(dplyr)

all_files_ls <- read_csv("~/Desktop/bioinformatic_work/log_parse_files/sorted_by_habitat/all_trap/all_files_la_selva_log.csv")
# Eliminate bad data and capture it in a separate dataframe: the "bad" data is kept in all_files_ls_bad
all_files_ls_bad <- subset(all_files_ls, all_files_ls$temp < 10 | all_files_ls$temp > 50)
all_files_ls <- subset(all_files_ls, all_files_ls$temp > 10 & all_files_ls$temp < 50)

# Convert our character data to date data, then change to the POSIXct data type.
# all_dates <- strptime(all_files_ls$date, format = "%m/%d/%Y")
# The data needs to be in a consistent %m/%d/%Y format before you can coerce it
# into POSIXct (or any other date type), otherwise it will spit out errors.

all_files_ls$date <- strptime(all_files_ls$date, format = "%m/%d/%Y")
all_files_ls$date <- as.POSIXct(all_files_ls$date)
# Create wet and dry season data sets.
all_files_ls_w <- subset(all_files_ls, date >= "2015-05-01" & date <= "2015-12-31" | date >= "2016-05-01" & date <= "2016-12-31")
all_files_ls_s <- subset(all_files_ls, date >= "2015-01-01" & date <= "2015-04-30" | date >= "2016-01-01" & date <= "2016-04-30")


# Subset into canopy and understory dataframes.

all_files_ls_s_c <- subset(all_files_ls_s, canopy_understory == "c" | canopy_understory == "C")
all_files_ls_s_u <- subset(all_files_ls_s, canopy_understory == "u" | canopy_understory == "U")

all_files_ls_w_c <- subset(all_files_ls_w, canopy_understory == "c" | canopy_understory == "C")
all_files_ls_w_u <- subset(all_files_ls_w, canopy_understory == "u" | canopy_understory == "U")

all_files_ls_s_c_summ <- all_files_ls_s_c %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))
all_files_ls_s_u_summ <- all_files_ls_s_u %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))
all_files_ls_w_c_summ <- all_files_ls_w_c %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))
all_files_ls_w_u_summ <- all_files_ls_w_u %>% group_by(time) %>% summarise(standard_deviation = sd(temp, na.rm = TRUE), mean = mean(temp, na.rm = TRUE))

Here is my cut function:

all_files_ls_s_c_summ$time <- cut(as.numeric(all_files_ls_s_c_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
all_files_ls_s_u_summ$time <- cut(as.numeric(all_files_ls_s_u_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
all_files_ls_w_c_summ$time <- cut(as.numeric(all_files_ls_w_c_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
all_files_ls_w_u_summ$time <- cut(as.numeric(all_files_ls_w_u_summ$time), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)

When I check the data that the cut function produces, I get far more categories than the 24 I want.

Here is some sample data:

  trap        serial_no                           file_name canopy_understory       date  time  temp humidity
1  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28   600  20.1     <NA>
2  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28   800  25.5     <NA>
3  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1000  29.0     <NA>
4  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1200  28.0     <NA>
5  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1400  28.5     <NA>
6  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1601  27.5     <NA>
7  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  1803  25.5     <NA>
8  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  2001  23.5     <NA>
9  LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-28  2200  22.5     <NA>
10 LS_trap_10c 7C000000395C1641 trap10c_7C000000395C1641_150809.csv                 c 2015-05-29   000  21.5     <NA>
11  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0159  23.6     <NA>
12  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0359  24.1     <NA>
13  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0559  24.1     <NA>
14  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0759  24.6     <NA>
15  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  0959  24.6     <NA>
16  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1159  26.1     <NA>
17  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1359  26.6     <NA>
18  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1559  25.6     <NA>
19  LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1759  24.1     <NA>
20 LS_trap_10u 9F00000039641541 trap10u_9F00000039641541_160110.csv                 u 2016-01-01  1959  24.1     <NA>

This sample data may be problematic, because I can't provide a large enough snapshot of the dataset (it is too big), and its high variability may be where the problem lies.

Here is one of the data frames that cut produces:

"","time","standard_deviation","mean"
"1","0",0.864956566100052,23.5574468085106
"2","0",1.14440510857225,22.81103515625
"3","0",0.984904980117555,22.2286831812256
"4","0",1.08678357585325,22.3990654205607
"5","1",1.05145037946718,22.0769704433498
"6","1",1.12960402993109,22.3836754643206
"7","2",1.03725039998279,21.7559322033898
"8","2",1.1068790873174,21.9357894736842
"9","3",1.12097157902533,21.6717980295567
"10","3",1.19621923944834,22.00751953125
"11","4",1.07458677721861,21.4380704041721
"12","4",1.13677253853809,21.6116959064328
"13","5",1.17900504899409,21.4315270935961
"14","5",1.28653071505367,21.79990234375
"15","6",1.20354620166699,21.9286831812256
"16","6",1.31676108631382,22.2322429906542
"17","7",1.86260704732764,23.7655596555966
"18","7",1.77861521566506,24.20419921875
"19","8",2.46883855937697,25.7301298701299
"20","8",2.46920498327612,26.1562427071179
"21","9",2.68395795782085,27.1479115479115
"22","0",0.949097628789142,23.3553191489362
"23","9",2.79945910162021,27.6413533834586
"24","10",2.79930128034239,27.7833981841764
"25","10",2.90435941493285,28.4350606394708
"26","11",2.79704441144441,28.2748466257669
"27","11",2.84178392019108,28.8
"28","12",2.88487423989003,28.5626131953428
"29","12",3.09977843678832,29.2737596471885
"30","13",2.78609514613334,28.6300613496933
"31","13",2.9274394403559,29.0124410933082
"32","14",2.46471466241151,28.0413748378729
"33","14",2.64014509330527,28.5502750275027
"34","15",2.24926437332819,27.1096296296296
"35","15",2.3886068967475,27.4907634307257
"36","16",1.9467999768684,26.0171875
"37","16",1.96854340222531,26.4749174917492
"38","17",1.43673026552318,24.7727385377943
"39","17",1.49178257598373,25.1431279620853
"40","18",1.23662593572858,24.0101694915254
"41","18",1.36276616154878,24.3736434108527
"42","19",1.07197213445298,23.5255266418835
"43","1",0.99431780638411,23.0787234042553
"44","19",1.13453791853054,23.854174573055
"45","20",1.01855291267246,23.1731421121252
"46","20",1.10799364301127,23.4543743078627
"47","21",0.998989468534969,22.9889714993804
"48","21",1.0452391633029,23.2751423149905
"49","22",0.993841145023006,22.6971316818774
"50","22",1.08423014353774,22.9405524861878
"51","23",1.01856406998964,22.517843866171
"52","2",1.03074836073784,22.8872340425532
"53","3",1.10188636506543,22.7382978723404
"54","4",1.11782711780932,22.5787234042553
"55","5",1.06571756649915,22.6106382978723
"56","6",1.16909794681656,23.8127659574468
"57","7",1.28653814110936,26.2702127659574
"58","8",1.39470055539637,28.0787234042553

I'm using group_by to get summary data for each time point, and then trying to use cut so that every data point near a particular time is assigned to that time; so if the time is 1801, it gets grouped together with 1800. The group_by function just puts together every data point that has the same "time", whereas what I'm trying to accomplish is to group nearby time points together.
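
For example, in isolation cut seems to do exactly the binning I describe. Here is a toy check using the times_var and lab_var_num defined above (made-up times, not my real data):

# Toy check of the binning I'm after, using the breaks/labels defined above
# (made-up times, not my real data).
cut(c(0, 159, 1801, 2359), breaks = times_var, labels = lab_var_num, include.lowest = TRUE)
# 0 falls in [0,100], 159 in (100,200], 1801 in (1800,1900] and 2359 in (2300,2400],
# so the labels come back as 0, 1, 18 and 23.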

I don't understand why I'm getting 58 categories when I expect to get 24.

Instead of saving parts of the data.frame as separate objects and performing the same operations on each, you can group by multiple variables. You can use lubridate::month to extract the month from each date as a number (in base R you can use strptime(df$date, '%Y-%m-%d')$mon + 1), which lets you create a new season grouping variable with a simple ifelse instead of a cut with duplicated labels (which throws an error in R >= 3.4.0). Once all of the grouping variables are set up, the summarising is simple and DRY.
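
As an aside, a minimal base R sketch of that month-to-season step might look like this (my sketch, assuming df$date holds "%Y-%m-%d" values, as in the dput data at the bottom):

# Base R sketch of the month/season grouping variable (assumes "%Y-%m-%d" dates).
mon <- strptime(as.character(df$date), '%Y-%m-%d')$mon + 1   # numeric month, 1-12
season <- ifelse(mon < 5, 's', 'w')                          # 's' = Jan-Apr, 'w' = May-Dec

Putting it all together with dplyr: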

library(dplyr)

df %>% group_by(canopy_understory,    # Group by canopy/understory factor
                # Extract numeric month from date. If less than 5, make `season` "s" else "w", and group by it.
                season = ifelse(lubridate::month(date) < 5, 's', 'w'), 
                # Cut time by 0,100,200,...,2400, and group by the factor returned.
                hour = cut(time, seq(0, 2400, 100), include.lowest = TRUE)) %>% 
    summarise(temp_mean = mean(temp),    # For each group, calc mean and sd of temp.
              temp_sd = sd(temp))

#> # A tibble: 20 x 5
#> # Groups: canopy_understory, season [?]
#>    canopy_understory season              hour temp_mean temp_sd
#>               <fctr>  <chr>            <fctr>     <dbl>   <dbl>
#>  1                 c      w           [0,100]      21.5      NA
#>  2                 c      w         (500,600]      20.1      NA
#>  3                 c      w         (700,800]      25.5      NA
#>  4                 c      w       (900,1e+03]      29.0      NA
#>  5                 c      w (1.1e+03,1.2e+03]      28.0      NA
#>  6                 c      w (1.3e+03,1.4e+03]      28.5      NA
#>  7                 c      w (1.6e+03,1.7e+03]      27.5      NA
#>  8                 c      w (1.8e+03,1.9e+03]      25.5      NA
#>  9                 c      w   (2e+03,2.1e+03]      23.5      NA
#> 10                 c      w (2.1e+03,2.2e+03]      22.5      NA
#> 11                 u      s         (100,200]      23.6      NA
#> 12                 u      s         (300,400]      24.1      NA
#> 13                 u      s         (500,600]      24.1      NA
#> 14                 u      s         (700,800]      24.6      NA
#> 15                 u      s       (900,1e+03]      24.6      NA
#> 16                 u      s (1.1e+03,1.2e+03]      26.1      NA
#> 17                 u      s (1.3e+03,1.4e+03]      26.6      NA
#> 18                 u      s (1.5e+03,1.6e+03]      25.6      NA
#> 19                 u      s (1.7e+03,1.8e+03]      24.1      NA
#> 20                 u      s   (1.9e+03,2e+03]      24.1      NA

The standard deviations are NA for the sample data because there is only one observation in each group, but it should work on the larger data.
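
If you would rather see hour numbers (0-23) in the hour column than interval notation, the same cut call can take a labels argument. A small, untested variation on the code above (assuming, as there, that times fall between 0 and 2400):

library(dplyr)

# Same grouping as above, but label the 24 hourly bins 0-23 instead of showing
# the interval notation (untested sketch).
df %>% group_by(canopy_understory,
                season = ifelse(lubridate::month(date) < 5, 's', 'w'),
                hour = cut(time, seq(0, 2400, 100), labels = 0:23, include.lowest = TRUE)) %>% 
    summarise(temp_mean = mean(temp),
              temp_sd = sd(temp))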


Data

df <- structure(list(trap = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("LS_trap_10c", 
    "LS_trap_10u"), class = "factor"), serial_no = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L), .Label = c("7C000000395C1641", "9F00000039641541"
    ), class = "factor"), file_name = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
    ), .Label = c("trap10c_7C000000395C1641_150809.csv", "trap10u_9F00000039641541_160110.csv"
    ), class = "factor"), canopy_understory = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L), .Label = c("c", "u"), class = "factor"), date = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L), .Label = c("2015-05-28", "2015-05-29", "2016-01-01"
    ), class = "factor"), time = c(600L, 800L, 1000L, 1200L, 1400L, 
    1601L, 1803L, 2001L, 2200L, 0L, 159L, 359L, 559L, 759L, 959L, 
    1159L, 1359L, 1559L, 1759L, 1959L), temp = c(20.1, 25.5, 29, 
    28, 28.5, 27.5, 25.5, 23.5, 22.5, 21.5, 23.6, 24.1, 24.1, 24.6, 
    24.6, 26.1, 26.6, 25.6, 24.1, 24.1), humidity = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L), .Label = "<NA>", class = "factor")), .Names = c("trap", 
    "serial_no", "file_name", "canopy_understory", "date", "time", 
    "temp", "humidity"), class = "data.frame", row.names = c("1", 
    "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", 
    "14", "15", "16", "17", "18", "19", "20"))