如何计算 R 中最后两个值之间的差值?
How can I calculate the difference between the last two values in R?
我的数据大致如下所示,我想为每个参与者(`v001`)计算一个变量,其值为最后两个可用测量值(来自 `lnslope1` 到 `lnslope9`)之间的差值。每个受试者至少有两个测量值。
我的问题是:
我如何在 R 中执行此操作?我已经了解过 `diff` 函数,但不确定它是否适用于这里。我是否必须先将数据重组为长格式才能进行此计算?这是数据:
# Example data (dput output): a data.frame with 8 rows, an ID column v001 and
# nine numeric columns lnslope1 ... lnslope9 that contain NAs; lnslope1 and
# lnslope3 are NA in every row.
# The expression is not assigned here; assign its result (e.g. to `test`)
# before running code that refers to the data by name.
structure(list(v001 = c(10002, 10004, 10005, 10006, 10007, 10011,
10012, 10018), lnslope1 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), lnslope2 = c(NA, NA,
0.313091787977149, 0.800960043896479, NA, NA, 0, 0.246092484299754
), lnslope3 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), lnslope4 = c(NA, 0.218445030532656,
NA, NA, NA, NA, 0.505548566665147, NA), lnslope5 = c(0.0507723253734231,
NA, -0.0361572285993463, NA, -0.133531392624523, -0.0824189464154196,
NA, -0.186877373329815), lnslope6 = c(0.606135803570316, NA,
NA, NA, -0.0408887702539783, 0.304548524450922, NA, 0.099090902644231
), lnslope7 = c(0.192160005794242, NA, NA, 1.37147927533475,
NA, 0.485507815781701, NA, 0.0307716586667537), lnslope8 = c(0.10951852580649,
NA, NA, 1.53234783071453, 0.145860850410924, 0.604821224703469,
NA, 0.0692660582117757), lnslope9 = c(0.374693449441411, NA,
NA, 0.996237878364571, NA, 0.852777326151829, NA, 0.0299842570512681
)), .Names = c("v001", "lnslope1", "lnslope2", "lnslope3", "lnslope4",
"lnslope5", "lnslope6", "lnslope7", "lnslope8", "lnslope9"), row.names = c(NA,
8L), class = "data.frame")
这是一种使用自定义函数配合 `apply` 的迂回方式(其中 `test` 是您的数据)。我喜欢这种方式,因为每个步骤都有明确的定义:
# Difference between the last two non-NA values of a vector.
#
# Args:
#   row: a numeric vector (e.g. one row of measurements); may contain NAs.
#
# Returns:
#   The last non-NA value minus the second-to-last non-NA value, or
#   NA_real_ when fewer than two non-NA values are present.
find_difference <- function(row) {
  # Keep only the observed (non-NA) values, in their original order
  observed <- row[!is.na(row)]
  n_obs <- length(observed)
  # A difference needs at least two observations. Return a numeric NA
  # (not a logical NA) so numeric input never yields a logical result.
  if (n_obs < 2L) {
    return(NA_real_)
  }
  observed[n_obs] - observed[n_obs - 1L]
}
# Apply find_difference to every row (MARGIN = 1).
# The measurement columns are selected by name rather than by position, so
# neither the ID column nor the newly added `diff` column is ever passed to
# find_difference, even if this statement is run a second time.
test$diff <- apply(
  test[grep("^lnslope", names(test))],
  MARGIN = 1,
  FUN = find_difference
)
结果:
v001 lnslope1 lnslope2 lnslope3 lnslope4 lnslope5 lnslope6 lnslope7 lnslope8 lnslope9 diff
1 10002 NA NA NA NA 0.05077233 0.60613580 0.19216001 0.10951853 0.37469345 0.2651749
2 10004 NA NA NA 0.2184450 NA NA NA NA NA NA
3 10005 NA 0.3130918 NA NA -0.03615723 NA NA NA NA -0.3492490
4 10006 NA 0.8009600 NA NA NA NA 1.37147928 1.53234783 0.99623788 -0.5361100
5 10007 NA NA NA NA -0.13353139 -0.04088877 NA 0.14586085 NA 0.1867496
6 10011 NA NA NA NA -0.08241895 0.30454852 0.48550782 0.60482122 0.85277733 0.2479561
7 10012 NA 0.0000000 NA 0.5055486 NA NA NA NA NA 0.5055486
8 10018 NA 0.2460925 NA NA -0.18687737 0.09909090 0.03077166 0.06926606 0.02998426 -0.0392818
考虑以下基本 R 方法:使用 `by`(按 `v001` 分组对数据框进行切片)、`tail(..., 2)`(返回每组的最后两行),以及 `sapply` 和 `diff()`(遍历各列以计算行间差值)。
数据 (v001组出现5次的随机数据)
# Reproducible random example: each of the 8 v001 values is repeated 5 times
# (40 rows), followed by nine columns of uniform random numbers.
set.seed(101)
df <- data.frame(
  v001 = rep(
    c(10002, 10004, 10005, 10006, 10007, 10011, 10012, 10018),
    times = 5
  )
)
# One 40 x 9 matrix of uniform draws, filled column by column
df[paste0("lnslope", seq_len(9))] <- matrix(runif(40 * 9), nrow = 40, ncol = 9)
# Last 16 rows (the last two rows of each v001)
tail(df, n = 16)
# v001 lnslope1 lnslope2 lnslope3 lnslope4 lnslope5 lnslope6 lnslope7 lnslope8 lnslope9
# 25 10002 0.92331888 0.84324054 0.71833452 0.87557727 0.359044717 0.4376301 0.44279478 0.225614349 0.76226362
# 26 10004 0.79571976 0.71550340 0.33939503 0.16735989 0.008593605 0.1863236 0.89727380 0.561463483 0.92876406
# 27 10005 0.07121255 0.01908119 0.08122143 0.46907169 0.235711577 0.9454490 0.05384848 0.768000195 0.83164735
# 28 10006 0.38940777 0.30508025 0.03723433 0.65222974 0.106234733 0.1453450 0.66423805 0.633419973 0.17826814
# 29 10007 0.40645122 0.88294798 0.77308879 0.03446071 0.611033974 0.7789505 0.82934498 0.343626148 0.64859692
# 30 10011 0.65935508 0.94134682 0.99508226 0.43545232 0.204697003 0.8133834 0.75196439 0.198905340 0.67138895
# 31 10012 0.42334715 0.23441801 0.14658643 0.14976436 0.214610423 0.2201670 0.03444567 0.002378824 0.88840185
# 32 10018 0.32098445 0.93666583 0.03984487 0.45890584 0.016491745 0.3165820 0.64558797 0.849219944 0.18576634
# 33 10002 0.19773073 0.56670198 0.56567790 0.61886157 0.328194365 0.2641242 0.96276149 0.678308439 0.49316356
# 34 10004 0.16317009 0.84279040 0.88858587 0.95629334 0.269595276 0.5218472 0.36051843 0.203191312 0.13070689
# 35 10005 0.52331108 0.82130118 0.87072166 0.10063420 0.913872405 0.2031901 0.97042921 0.983532294 0.48476033
# 36 10006 0.91347865 0.27997285 0.98174943 0.22771539 0.417871804 0.6531599 0.49181493 0.752674565 0.60324084
# 37 10007 0.20677272 0.04730114 0.87974690 0.55501376 0.690565649 0.2772224 0.93198232 0.393392928 0.89320133
# 38 10011 0.81428302 0.22494063 0.51040174 0.77102964 0.900012892 0.4086745 0.60127222 0.090300324 0.64120345
# 39 10012 0.02016720 0.67309265 0.33437535 0.47966908 0.207708929 0.8367247 0.35175022 0.598136016 0.16957180
# 40 10018 0.92480441 0.95893086 0.61333531 0.88103731 0.461033160 0.4350532 0.08774474 0.676346199 0.01014118
输出
# For each v001 group, take its last two rows and compute, column by column,
# the difference between them (last row minus the row before it).
# vapply(..., numeric(1)) replaces sapply so that a group with fewer than two
# rows raises an error instead of silently yielding a list that rbind would
# turn into a malformed matrix.
diff_list <- by(df, df$v001, FUN = function(d) {
  last_two <- tail(d, 2)[-1]  # drop the first column (v001)
  vapply(last_two, diff, numeric(1))
})
# Stack the per-group named vectors into a matrix, one row per v001 value
diff_matrix <- do.call(rbind, diff_list)
diff_matrix
# lnslope1 lnslope2 lnslope3 lnslope4 lnslope5 lnslope6 lnslope7 lnslope8 lnslope9
# 10002 -0.7255881 -0.27653857 -0.1526566 -0.2567157 -0.030850352 -0.1735058 0.5199667 0.45269409 -0.2691001
# 10004 -0.6325497 0.12728700 0.5491908 0.7889335 0.261001671 0.3355236 -0.5367554 -0.35827217 -0.7980572
# 10005 0.4520985 0.80221999 0.7895002 -0.3684375 0.678160829 -0.7422589 0.9165807 0.21553210 -0.3468870
# 10006 0.5240709 -0.02510740 0.9445151 -0.4245144 0.311637071 0.5078149 -0.1724231 0.11925459 0.4249727
# 10007 -0.1996785 -0.83564685 0.1066581 0.5205530 0.079531675 -0.5017281 0.1026373 0.04976678 0.2446044
# 10011 0.1549279 -0.71640619 -0.4846805 0.3355773 0.695315889 -0.4047089 -0.1506922 -0.10860502 -0.0301855
# 10012 -0.4031799 0.43867464 0.1877889 0.3299047 -0.006901494 0.6165577 0.3173046 0.59575719 -0.7188300
# 10018 0.6038200 0.02226503 0.5734904 0.4221315 0.444541415 0.1184712 -0.5578432 -0.17287374 -0.175625
我的数据大致如下所示,我想为每个参与者(`v001`)计算一个变量,其值为最后两个可用测量值(来自 `lnslope1` 到 `lnslope9`)之间的差值。每个受试者至少有两个测量值。
我的问题是:
我如何在 R 中执行此操作?我已经了解过 `diff` 函数,但不确定它是否适用于这里。我是否必须先将数据重组为长格式才能进行此计算?这是数据:
# Example data (dput output): a data.frame with 8 rows, an ID column v001 and
# nine numeric columns lnslope1 ... lnslope9 that contain NAs; lnslope1 and
# lnslope3 are NA in every row.
# The expression is not assigned here; assign its result (e.g. to `test`)
# before running code that refers to the data by name.
structure(list(v001 = c(10002, 10004, 10005, 10006, 10007, 10011,
10012, 10018), lnslope1 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), lnslope2 = c(NA, NA,
0.313091787977149, 0.800960043896479, NA, NA, 0, 0.246092484299754
), lnslope3 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), lnslope4 = c(NA, 0.218445030532656,
NA, NA, NA, NA, 0.505548566665147, NA), lnslope5 = c(0.0507723253734231,
NA, -0.0361572285993463, NA, -0.133531392624523, -0.0824189464154196,
NA, -0.186877373329815), lnslope6 = c(0.606135803570316, NA,
NA, NA, -0.0408887702539783, 0.304548524450922, NA, 0.099090902644231
), lnslope7 = c(0.192160005794242, NA, NA, 1.37147927533475,
NA, 0.485507815781701, NA, 0.0307716586667537), lnslope8 = c(0.10951852580649,
NA, NA, 1.53234783071453, 0.145860850410924, 0.604821224703469,
NA, 0.0692660582117757), lnslope9 = c(0.374693449441411, NA,
NA, 0.996237878364571, NA, 0.852777326151829, NA, 0.0299842570512681
)), .Names = c("v001", "lnslope1", "lnslope2", "lnslope3", "lnslope4",
"lnslope5", "lnslope6", "lnslope7", "lnslope8", "lnslope9"), row.names = c(NA,
8L), class = "data.frame")
这是一种使用自定义函数配合 `apply` 的迂回方式(其中 `test` 是您的数据)。我喜欢这种方式,因为每个步骤都有明确的定义:
# Difference between the last two non-NA values of a vector.
#
# Args:
#   row: a numeric vector (e.g. one row of measurements); may contain NAs.
#
# Returns:
#   The last non-NA value minus the second-to-last non-NA value, or
#   NA_real_ when fewer than two non-NA values are present.
find_difference <- function(row) {
  # Keep only the observed (non-NA) values, in their original order
  observed <- row[!is.na(row)]
  n_obs <- length(observed)
  # A difference needs at least two observations. Return a numeric NA
  # (not a logical NA) so numeric input never yields a logical result.
  if (n_obs < 2L) {
    return(NA_real_)
  }
  observed[n_obs] - observed[n_obs - 1L]
}
# Apply find_difference to every row (MARGIN = 1).
# The measurement columns are selected by name rather than by position, so
# neither the ID column nor the newly added `diff` column is ever passed to
# find_difference, even if this statement is run a second time.
test$diff <- apply(
  test[grep("^lnslope", names(test))],
  MARGIN = 1,
  FUN = find_difference
)
结果:
v001 lnslope1 lnslope2 lnslope3 lnslope4 lnslope5 lnslope6 lnslope7 lnslope8 lnslope9 diff
1 10002 NA NA NA NA 0.05077233 0.60613580 0.19216001 0.10951853 0.37469345 0.2651749
2 10004 NA NA NA 0.2184450 NA NA NA NA NA NA
3 10005 NA 0.3130918 NA NA -0.03615723 NA NA NA NA -0.3492490
4 10006 NA 0.8009600 NA NA NA NA 1.37147928 1.53234783 0.99623788 -0.5361100
5 10007 NA NA NA NA -0.13353139 -0.04088877 NA 0.14586085 NA 0.1867496
6 10011 NA NA NA NA -0.08241895 0.30454852 0.48550782 0.60482122 0.85277733 0.2479561
7 10012 NA 0.0000000 NA 0.5055486 NA NA NA NA NA 0.5055486
8 10018 NA 0.2460925 NA NA -0.18687737 0.09909090 0.03077166 0.06926606 0.02998426 -0.0392818
考虑以下基本 R 方法:使用 `by`(按 `v001` 分组对数据框进行切片)、`tail(..., 2)`(返回每组的最后两行),以及 `sapply` 和 `diff()`(遍历各列以计算行间差值)。
数据 (v001组出现5次的随机数据)
# Reproducible random example: each of the 8 v001 values is repeated 5 times
# (40 rows), followed by nine columns of uniform random numbers.
set.seed(101)
df <- data.frame(
  v001 = rep(
    c(10002, 10004, 10005, 10006, 10007, 10011, 10012, 10018),
    times = 5
  )
)
# One 40 x 9 matrix of uniform draws, filled column by column
df[paste0("lnslope", seq_len(9))] <- matrix(runif(40 * 9), nrow = 40, ncol = 9)
# Last 16 rows (the last two rows of each v001)
tail(df, n = 16)
# v001 lnslope1 lnslope2 lnslope3 lnslope4 lnslope5 lnslope6 lnslope7 lnslope8 lnslope9
# 25 10002 0.92331888 0.84324054 0.71833452 0.87557727 0.359044717 0.4376301 0.44279478 0.225614349 0.76226362
# 26 10004 0.79571976 0.71550340 0.33939503 0.16735989 0.008593605 0.1863236 0.89727380 0.561463483 0.92876406
# 27 10005 0.07121255 0.01908119 0.08122143 0.46907169 0.235711577 0.9454490 0.05384848 0.768000195 0.83164735
# 28 10006 0.38940777 0.30508025 0.03723433 0.65222974 0.106234733 0.1453450 0.66423805 0.633419973 0.17826814
# 29 10007 0.40645122 0.88294798 0.77308879 0.03446071 0.611033974 0.7789505 0.82934498 0.343626148 0.64859692
# 30 10011 0.65935508 0.94134682 0.99508226 0.43545232 0.204697003 0.8133834 0.75196439 0.198905340 0.67138895
# 31 10012 0.42334715 0.23441801 0.14658643 0.14976436 0.214610423 0.2201670 0.03444567 0.002378824 0.88840185
# 32 10018 0.32098445 0.93666583 0.03984487 0.45890584 0.016491745 0.3165820 0.64558797 0.849219944 0.18576634
# 33 10002 0.19773073 0.56670198 0.56567790 0.61886157 0.328194365 0.2641242 0.96276149 0.678308439 0.49316356
# 34 10004 0.16317009 0.84279040 0.88858587 0.95629334 0.269595276 0.5218472 0.36051843 0.203191312 0.13070689
# 35 10005 0.52331108 0.82130118 0.87072166 0.10063420 0.913872405 0.2031901 0.97042921 0.983532294 0.48476033
# 36 10006 0.91347865 0.27997285 0.98174943 0.22771539 0.417871804 0.6531599 0.49181493 0.752674565 0.60324084
# 37 10007 0.20677272 0.04730114 0.87974690 0.55501376 0.690565649 0.2772224 0.93198232 0.393392928 0.89320133
# 38 10011 0.81428302 0.22494063 0.51040174 0.77102964 0.900012892 0.4086745 0.60127222 0.090300324 0.64120345
# 39 10012 0.02016720 0.67309265 0.33437535 0.47966908 0.207708929 0.8367247 0.35175022 0.598136016 0.16957180
# 40 10018 0.92480441 0.95893086 0.61333531 0.88103731 0.461033160 0.4350532 0.08774474 0.676346199 0.01014118
输出
# For each v001 group, take its last two rows and compute, column by column,
# the difference between them (last row minus the row before it).
# vapply(..., numeric(1)) replaces sapply so that a group with fewer than two
# rows raises an error instead of silently yielding a list that rbind would
# turn into a malformed matrix.
diff_list <- by(df, df$v001, FUN = function(d) {
  last_two <- tail(d, 2)[-1]  # drop the first column (v001)
  vapply(last_two, diff, numeric(1))
})
# Stack the per-group named vectors into a matrix, one row per v001 value
diff_matrix <- do.call(rbind, diff_list)
diff_matrix
# lnslope1 lnslope2 lnslope3 lnslope4 lnslope5 lnslope6 lnslope7 lnslope8 lnslope9
# 10002 -0.7255881 -0.27653857 -0.1526566 -0.2567157 -0.030850352 -0.1735058 0.5199667 0.45269409 -0.2691001
# 10004 -0.6325497 0.12728700 0.5491908 0.7889335 0.261001671 0.3355236 -0.5367554 -0.35827217 -0.7980572
# 10005 0.4520985 0.80221999 0.7895002 -0.3684375 0.678160829 -0.7422589 0.9165807 0.21553210 -0.3468870
# 10006 0.5240709 -0.02510740 0.9445151 -0.4245144 0.311637071 0.5078149 -0.1724231 0.11925459 0.4249727
# 10007 -0.1996785 -0.83564685 0.1066581 0.5205530 0.079531675 -0.5017281 0.1026373 0.04976678 0.2446044
# 10011 0.1549279 -0.71640619 -0.4846805 0.3355773 0.695315889 -0.4047089 -0.1506922 -0.10860502 -0.0301855
# 10012 -0.4031799 0.43867464 0.1877889 0.3299047 -0.006901494 0.6165577 0.3173046 0.59575719 -0.7188300
# 10018 0.6038200 0.02226503 0.5734904 0.4221315 0.444541415 0.1184712 -0.5578432 -0.17287374 -0.175625