如何重新格式化数据框以计算 R 中的基本值(平均值、SD、SE 等)?

How to reformat dataframe in order to calculate basic values (mean, SD, SE, etc.) in R?

我有一个名为 'my_data' 的数据框,看起来像这样:

Injury         BL.Time.Delay Acute.Time.Delay Chronic.Time.Delay Acute.Area.Def Chronic.Area.Def BL.BBB Acute.BBB Chronic.BBB
1  Moderate          0.35             1.10               0.60           1.84            0.150     21        11          18
2    Severe          0.42             1.47               0.86           3.04            0.420     21         3          14
3  Moderate          0.45             1.02               0.65           1.80            0.150     21        11          18
4  Moderate          0.42             0.97               0.70           1.76            0.000     21         8          17
5    Severe          0.40             1.55               0.80           3.12            0.370     21         4          11
6  Moderate          0.37             0.96               0.65           1.65            0.240     21         9          14
7    Severe          0.32             1.64               0.75           3.34            0.400     21         2          12
8    Severe          0.40             1.44               0.90           3.24            0.298     21         1           9
9  Moderate          0.47             1.01               0.82           1.77            0.140     21         7          17
10 Moderate          0.41             0.90               0.67           1.51            0.190     21         9          15
11 Moderate          0.38             0.86               0.57           1.78            0.100     21        10          17
12   Severe          0.39             1.59               0.70           3.27            0.360     21         2          12

我想将伤害因素(Mod率和严重)格式化为列,每列都有其相关的测量值(时间延迟、面积不足等)。还有三个时间点(BL、急性、慢性),我也想将其纳入我的最终数据框。

我 运行 此代码用于基本统计数据,但未按伤害组分组(Mod评级与严重)

updated_df <- sapply(my_data[, c(2:9)], function(my_data) c("Mean"= mean(my_data,na.rm=TRUE),
                          "Stand dev" = sd(my_data),
                          "Stand Err" = sd(my_data)/sqrt(length(my_data)),
                         "Median" = median(my_data),
                         "CoeffofVariation" = sd(my_data)/mean(my_data,na.rm=TRUE),
                         "Minimum" = min(my_data),
                         "Maximun" = max(my_data),
                         "Upper Quantile" = quantile(my_data,.75),
                         "LowerQuartile" = quantile(my_data,.25),
                         "n" = length(my_data)
)
)

如何 运行 上面显示的统计数据并按伤害类型(Mod vs Sev)分组并考虑时间点?

在当前设置下,一个选项是 split 通过 'Injury Type',用 lapply 循环 list 并应用代码

lst1 <- split(my_data, my_data$Injury, drop = TRUE)
lst2 <- lapply(lst1, function(dat) 
     sapply(dat[, 2:9], function(my_data1)  c("Mean"= mean(my_data1,na.rm=TRUE),
                      "Stand dev" = sd(my_data1),
                      "Stand Err" = sd(my_data1)/sqrt(length(my_data1)),
                     "Median" = median(my_data1),
                     "CoeffofVariation" = sd(my_data1)/mean(my_data1,na.rm=TRUE),
                     "Minimum" = min(my_data1),
                     "Maximun" = max(my_data1),
                     "Upper Quantile" = quantile(my_data1,.75),
                     "LowerQuartile" = quantile(my_data1,.25),
                     "n" = length(my_data1))))

-输出

lst2
#$Moderate
#                       BL.Time.Delay Acute.Time.Delay Chronic.Time.Delay Acute.Area.Def Chronic.Area.Def BL.BBB
#    Mean                  0.40714286       0.97428571         0.66571429     1.73000000       0.13857143     21
#    Stand dev             0.04347961       0.07955232         0.08059304     0.11313708       0.07515064      0
#    Stand Err             0.01643375       0.03006795         0.03046131     0.04276180       0.02840427      0
#    Median                0.41000000       0.97000000         0.65000000     1.77000000       0.15000000     21
#    CoeffofVariation      0.10679202       0.08165194         0.12106251     0.06539716       0.54232422      0
#    Minimum               0.35000000       0.86000000         0.57000000     1.51000000       0.00000000     21
#    Maximun               0.47000000       1.10000000         0.82000000     1.84000000       0.24000000     21
#    Upper Quantile.75%    0.43500000       1.01500000         0.68500000     1.79000000       0.17000000     21
#    LowerQuartile.25%     0.37500000       0.93000000         0.62500000     1.70500000       0.12000000     21
#    n                     7.00000000       7.00000000         7.00000000     7.00000000       7.00000000      7
#                        Acute.BBB Chronic.BBB
#    Mean                9.2857143  16.5714286
#    Stand dev           1.4960265   1.5118579
#    Stand Err           0.5654449   0.5714286
#    Median              9.0000000  17.0000000
#    CoeffofVariation    0.1611105   0.0912328
#    Minimum             7.0000000  14.0000000
#    Maximun            11.0000000  18.0000000
#    Upper Quantile.75% 10.5000000  17.5000000
#    LowerQuartile.25%   8.5000000  16.0000000
#    n                   7.0000000   7.0000000

#    $Severe
#                       BL.Time.Delay Acute.Time.Delay Chronic.Time.Delay Acute.Area.Def Chronic.Area.Def BL.BBB
#    Mean                  0.38600000       1.53800000         0.80200000     3.20200000       0.36960000     21
#    Stand dev             0.03847077       0.08288546         0.08074652     0.12049896       0.04659184      0
#    Stand Err             0.01720465       0.03706751         0.03611094     0.05388877       0.02083651      0
#    Median                0.40000000       1.55000000         0.80000000     3.24000000       0.37000000     21
#    CoeffofVariation      0.09966520       0.05389172         0.10068144     0.03763241       0.12606019      0
#    Minimum               0.32000000       1.44000000         0.70000000     3.04000000       0.29800000     21
#    Maximun               0.42000000       1.64000000         0.90000000     3.34000000       0.42000000     21
#    Upper Quantile.75%    0.40000000       1.59000000         0.86000000     3.27000000       0.40000000     21
#    LowerQuartile.25%     0.39000000       1.47000000         0.75000000     3.12000000       0.36000000     21
#    n                     5.00000000       5.00000000         5.00000000     5.00000000       5.00000000      5
#                       Acute.BBB Chronic.BBB
#    Mean               2.4000000  11.6000000
#    Stand dev          1.1401754   1.8165902
#    Stand Err          0.5099020   0.8124038
#    Median             2.0000000  12.0000000
#    CoeffofVariation   0.4750731   0.1566026
#    Minimum            1.0000000   9.0000000
#    Maximun            4.0000000  14.0000000
#    Upper Quantile.75% 3.0000000  12.0000000
#    LowerQuartile.25%  2.0000000  11.0000000
#    n                  5.0000000   5.0000000

我们可以通过 rbinding

转换为单个数据集
out <- do.call(rbind, unname(Map(function(x, y) 
 `row.names<-`(transform(data.frame(Injury = x, y, 
    stringsAsFactors = FALSE), rn = row.names(y)), NULL),  names(lst2), lst2)))

数据

my_data <- structure(list(Injury = c("Moderate", "Severe", "Moderate", "Moderate", 
"Severe", "Moderate", "Severe", "Severe", "Moderate", "Moderate", 
"Moderate", "Severe"), BL.Time.Delay = c(0.35, 0.42, 0.45, 0.42, 
0.4, 0.37, 0.32, 0.4, 0.47, 0.41, 0.38, 0.39), Acute.Time.Delay = c(1.1, 
1.47, 1.02, 0.97, 1.55, 0.96, 1.64, 1.44, 1.01, 0.9, 0.86, 1.59
), Chronic.Time.Delay = c(0.6, 0.86, 0.65, 0.7, 0.8, 0.65, 0.75, 
0.9, 0.82, 0.67, 0.57, 0.7), Acute.Area.Def = c(1.84, 3.04, 1.8, 
1.76, 3.12, 1.65, 3.34, 3.24, 1.77, 1.51, 1.78, 3.27), Chronic.Area.Def = c(0.15, 
0.42, 0.15, 0, 0.37, 0.24, 0.4, 0.298, 0.14, 0.19, 0.1, 0.36), 
    BL.BBB = c(21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 
    21L, 21L), Acute.BBB = c(11L, 3L, 11L, 8L, 4L, 9L, 2L, 1L, 
    7L, 9L, 10L, 2L), Chronic.BBB = c(18L, 14L, 18L, 17L, 11L, 
    14L, 12L, 9L, 17L, 15L, 17L, 12L)), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))

这样的事情应该可行,pivot_widerpivot_longer 足够灵活,可以帮助您使用更多分组变量来做到这一点

df %>%
  mutate(id = seq_along(Injury)) %>%
  pivot_longer(BL.Time.Delay:Chronic.BBB, names_to = "var", values_to = "value") %>%
  pivot_wider(id_cols = c(id, var), names_from = Injury, values_from = value) %>%
  group_by(var) %>%
  summarise(Moderate = mean(Moderate, na.rm = T),
            Severe = mean(Severe, na.rm = T))

或者,这将为您提供更长格式的输出

df %>%
  mutate(id = seq_along(Injury)) %>%
  pivot_longer(BL.Time.Delay:Chronic.BBB, names_to = "var", values_to = "value") %>%
  # pivot_wider(id_cols = c(id, var), names_from = Injury, values_from = value) %>%
  group_by(var, Injury) %>%
  summarise(mean = mean(value, na.rm = T))