如何 运行 作用于矩阵的子集
How to run function on subsets of matrix
我有一个包含数据子集的 data.frame。这些子集由存储在名为 MASTStation 的列中的 ID 标识。这是我的数据中的一个示例。
> Dummy4
Key MASTStation Longitude Latitude z Pressure Temperature Salinity SigmaT SigmaTheta PAR ChlAFluor
1 1 CH-A01 -168.497 65.998 0 0 1.80255 32.53150 26.00931 26.00931 35.844 5.6362
2 2 CH-A01 -168.497 65.998 -1 1 1.80255 32.53150 26.00931 26.00932 35.844 5.6362
3 3 CH-A01 -168.497 65.998 -2 2 1.80255 32.53150 26.00942 26.00943 35.844 5.5869
4 4 CH-A01 -168.497 65.998 -3 3 1.80190 32.53240 26.00935 26.00936 21.383 5.8863
5 5 CH-A01 -168.497 65.998 -4 4 1.80220 32.53250 26.00929 26.00931 16.383 5.9729
6 782 CH-H02 -167.048 69.493 0 0 7.71730 31.10890 24.26078 24.26079 25.697 1.5285
7 783 CH-H02 -167.048 69.493 -1 1 7.71730 31.10890 24.26078 24.26081 25.697 1.5285
8 784 CH-H02 -167.048 69.493 -2 2 7.71730 31.10890 24.25936 24.25939 25.697 1.5127
9 785 CH-H02 -167.048 69.493 -3 3 7.72100 31.10885 24.26016 24.26021 22.246 1.4385
10 786 CH-H02 -167.048 69.493 -4 4 7.72125 31.11020 24.26036 24.26042 17.901 1.4320
我有一个函数要应用到这些子集中的每一个,所以我尝试像这样使用 by():
by(Dummy4,list(Dummy4[,2]),calculate.mld,Dummy4[,9],Dummy4[,5])
其中returns以下错误和traceback():
Error in FUN(data[x, , drop = FALSE], ...) :
sigma and z vectors must have equal length
11 stop("sigma and z vectors must have equal length")
10 FUN(data[x, , drop = FALSE], ...)
9 FUN(X[[1L]], ...)
8 lapply(X = split(X, group), FUN = FUN, ...)
7 tapply(seq_len(10L), list(structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L), .Label = c("CH-A01", "CH-H02"), class = "factor")),
function (x)
FUN(data[x, , drop = FALSE], ...), simplify = TRUE)
6 eval(expr, envir, enclos)
5 eval(substitute(tapply(seq_len(nd), IND, FUNx, simplify = simplify)),
data)
4 structure(eval(substitute(tapply(seq_len(nd), IND, FUNx, simplify = simplify)),
data), call = match.call(), class = "by")
3 by.data.frame(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[,
9], Dummy4[, 5])
2 by(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[, 9], Dummy4[,
5])
1 as.matrix(by(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[,
9], Dummy4[, 5]))
所以问题出在我的功能上。但是,当我根据子集将矩阵拆分为两个单独的矩阵,并且 运行 每个矩阵上的函数时,没有问题。这是我的功能:
> calculate.mld
function(sigma, z, deltaSigma = 0.125)
{
# check that the number of sigma and z values are equal
if (length(sigma) != length(z))
{
stop('sigma and z vectors must have equal length')
}
# remove the na's
keepers <- !(is.na(z) | is.na(sigma))
z <- z[keepers]
sigma <- sigma[keepers]
# return an NA and a warning if there aren't at least two
# numeric values
if (length(sigma) < 2)
{
warning('fewer than two valid sigma and depth-value combinations entered, NA returned')
NA
} else {
# I use negative depths to be consistent with the Scripps database
if (all(z >= 0))
{
z <- z * -1
pos <- TRUE
} else {
pos <- FALSE
}
# be sure the data are sorted by depths
ord <- order(z, decreasing = TRUE)
z <- z[ord]
sigma <- sigma[ord]
#z <- sort(z, decreasing = TRUE)
#sigma <- sigma[order(z, decreasing = TRUE)]
# Manuscript uses a z of 10 m as the initial sigmaRef, but we will
# take the closest value in case the 10-m measurement is missing.
minDepth <- which(abs(z + 10) == min(abs(z + 10)))
minDepth <- ifelse(length(minDepth) > 1, minDepth[2], minDepth)
sigmaRef <- sigma[minDepth]
sigma <- sigma[minDepth:length(sigma)]
z <- z[minDepth:length(z)]
diffs <- abs(sigma - sigmaRef)
# if sigma never changes by at least deltaSigma, return the lowest depth
# Otherwise proceed
if (max(diffs, na.rm = TRUE) >= deltaSigma)
{
# the uniform region, if present, occurs where the change between
# any two points is <= deltaSigma * 1/10, and the change in sigma over
# the profile has not yet exceeded deltaSigma
uniformRegion <- (abs(sigma[2:length(sigma)] -
sigma[1:(length(sigma) - 1)]) <=
(deltaSigma / 10)) &
(diffs[2:length(diffs)] < deltaSigma)
if (any(uniformRegion))
{
sigmaRefPos <- max(which(uniformRegion))
# change sigmaRef to the base of the uniform region
sigmaRef <- sigma[sigmaRefPos]
# calculate change from the new reference
reachedDeltaSigma <- abs(sigma[(sigmaRefPos + 1):length(sigma)] - sigmaRef) >= deltaSigma
# if any deeper measurements of sigma reach or exceed deltaSigma,
# linearly interpolate between the nearest points to find
# mixed-layer depth
if (any(reachedDeltaSigma))
{
pair <- min(which(reachedDeltaSigma)) + sigmaRefPos - 1
linmod <- lm(z[c(pair, pair + 1)] ~ sigma[c(pair, pair + 1)])
mld <- as.vector(linmod$coefficients[1] +
linmod$coefficients[2] *
(sigmaRef + deltaSigma))
} else {
# otherwise, mixed-layer depth is the deepest point
mld <- min(z)
}
} else {
# if there is no uniform region, just linearly interpolate mld
pair <- min(which(diffs >= deltaSigma)) - 1
linmod <- lm(z[c(pair, pair + 1)] ~ sigma[c(pair, pair + 1)])
mld <- as.vector(linmod$coefficients[1] +
linmod$coefficients[2] *
(sigmaRef + deltaSigma))
}
} else {
mld <- min(z)
}
if (pos) mld <- abs(mld)
mld
}
}
我不明白为什么在 运行使用 by() 函数时两个变量列的长度不再相等。我确定这是一个足够简单的解决方案,但我没有看到它。有什么想法吗?
我是 R 的初学者,所以请多多包涵。
这就是您可以在此处使用 by
的方式。您需要一个接受 data.frame 作为参数的函数:
by(Dummy4, list(Dummy4$MASTStation), function(df) calculate.mld(df$SigmaT, df$z))
#: CH-A01
#[1] -4
#-----------------------------------------------------------------------------------------------------------
#: CH-H02
#[1] -4
我更喜欢使用包 data.table:
library(data.table)
setDT(Dummy4)
Dummy4[, .(MLD = calculate.mld(SigmaT, z)), by = MASTStation]
# MASTStation MLD
#1: CH-A01 -4
#2: CH-H02 -4
可重现数据(使用 dput
创建):
Dummy4 <- structure(list(Key = c(1L, 2L, 3L, 4L, 5L, 782L, 783L, 784L,
785L, 786L), MASTStation = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L), .Label = c("CH-A01", "CH-H02"), class = "factor"),
Longitude = c(-168.497, -168.497, -168.497, -168.497, -168.497,
-167.048, -167.048, -167.048, -167.048, -167.048), Latitude = c(65.998,
65.998, 65.998, 65.998, 65.998, 69.493, 69.493, 69.493, 69.493,
69.493), z = c(0L, -1L, -2L, -3L, -4L, 0L, -1L, -2L, -3L,
-4L), Pressure = c(0L, 1L, 2L, 3L, 4L, 0L, 1L, 2L, 3L, 4L
), Temperature = c(1.80255, 1.80255, 1.80255, 1.8019, 1.8022,
7.7173, 7.7173, 7.7173, 7.721, 7.72125), Salinity = c(32.5315,
32.5315, 32.5315, 32.5324, 32.5325, 31.1089, 31.1089, 31.1089,
31.10885, 31.1102), SigmaT = c(26.00931, 26.00931, 26.00942,
26.00935, 26.00929, 24.26078, 24.26078, 24.25936, 24.26016,
24.26036), SigmaTheta = c(26.00931, 26.00932, 26.00943, 26.00936,
26.00931, 24.26079, 24.26081, 24.25939, 24.26021, 24.26042
), PAR = c(35.844, 35.844, 35.844, 21.383, 16.383, 25.697,
25.697, 25.697, 22.246, 17.901), ChlAFluor = c(5.6362, 5.6362,
5.5869, 5.8863, 5.9729, 1.5285, 1.5285, 1.5127, 1.4385, 1.432
)), .Names = c("Key", "MASTStation", "Longitude", "Latitude",
"z", "Pressure", "Temperature", "Salinity", "SigmaT", "SigmaTheta",
"PAR", "ChlAFluor"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))
我有一个包含数据子集的 data.frame。这些子集由存储在名为 MASTStation 的列中的 ID 标识。这是我的数据中的一个示例。
> Dummy4
Key MASTStation Longitude Latitude z Pressure Temperature Salinity SigmaT SigmaTheta PAR ChlAFluor
1 1 CH-A01 -168.497 65.998 0 0 1.80255 32.53150 26.00931 26.00931 35.844 5.6362
2 2 CH-A01 -168.497 65.998 -1 1 1.80255 32.53150 26.00931 26.00932 35.844 5.6362
3 3 CH-A01 -168.497 65.998 -2 2 1.80255 32.53150 26.00942 26.00943 35.844 5.5869
4 4 CH-A01 -168.497 65.998 -3 3 1.80190 32.53240 26.00935 26.00936 21.383 5.8863
5 5 CH-A01 -168.497 65.998 -4 4 1.80220 32.53250 26.00929 26.00931 16.383 5.9729
6 782 CH-H02 -167.048 69.493 0 0 7.71730 31.10890 24.26078 24.26079 25.697 1.5285
7 783 CH-H02 -167.048 69.493 -1 1 7.71730 31.10890 24.26078 24.26081 25.697 1.5285
8 784 CH-H02 -167.048 69.493 -2 2 7.71730 31.10890 24.25936 24.25939 25.697 1.5127
9 785 CH-H02 -167.048 69.493 -3 3 7.72100 31.10885 24.26016 24.26021 22.246 1.4385
10 786 CH-H02 -167.048 69.493 -4 4 7.72125 31.11020 24.26036 24.26042 17.901 1.4320
我有一个函数要应用到这些子集中的每一个,所以我尝试像这样使用 by():
by(Dummy4,list(Dummy4[,2]),calculate.mld,Dummy4[,9],Dummy4[,5])
其中returns以下错误和traceback():
Error in FUN(data[x, , drop = FALSE], ...) :
sigma and z vectors must have equal length
11 stop("sigma and z vectors must have equal length")
10 FUN(data[x, , drop = FALSE], ...)
9 FUN(X[[1L]], ...)
8 lapply(X = split(X, group), FUN = FUN, ...)
7 tapply(seq_len(10L), list(structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L), .Label = c("CH-A01", "CH-H02"), class = "factor")),
function (x)
FUN(data[x, , drop = FALSE], ...), simplify = TRUE)
6 eval(expr, envir, enclos)
5 eval(substitute(tapply(seq_len(nd), IND, FUNx, simplify = simplify)),
data)
4 structure(eval(substitute(tapply(seq_len(nd), IND, FUNx, simplify = simplify)),
data), call = match.call(), class = "by")
3 by.data.frame(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[,
9], Dummy4[, 5])
2 by(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[, 9], Dummy4[,
5])
1 as.matrix(by(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[,
9], Dummy4[, 5]))
所以问题出在我的功能上。但是,当我根据子集将矩阵拆分为两个单独的矩阵,并且 运行 每个矩阵上的函数时,没有问题。这是我的功能:
> calculate.mld
function(sigma, z, deltaSigma = 0.125)
{
# check that the number of sigma and z values are equal
if (length(sigma) != length(z))
{
stop('sigma and z vectors must have equal length')
}
# remove the na's
keepers <- !(is.na(z) | is.na(sigma))
z <- z[keepers]
sigma <- sigma[keepers]
# return an NA and a warning if there aren't at least two
# numeric values
if (length(sigma) < 2)
{
warning('fewer than two valid sigma and depth-value combinations entered, NA returned')
NA
} else {
# I use negative depths to be consistent with the Scripps database
if (all(z >= 0))
{
z <- z * -1
pos <- TRUE
} else {
pos <- FALSE
}
# be sure the data are sorted by depths
ord <- order(z, decreasing = TRUE)
z <- z[ord]
sigma <- sigma[ord]
#z <- sort(z, decreasing = TRUE)
#sigma <- sigma[order(z, decreasing = TRUE)]
# Manuscript uses a z of 10 m as the initial sigmaRef, but we will
# take the closest value in case the 10-m measurement is missing.
minDepth <- which(abs(z + 10) == min(abs(z + 10)))
minDepth <- ifelse(length(minDepth) > 1, minDepth[2], minDepth)
sigmaRef <- sigma[minDepth]
sigma <- sigma[minDepth:length(sigma)]
z <- z[minDepth:length(z)]
diffs <- abs(sigma - sigmaRef)
# if sigma never changes by at least deltaSigma, return the lowest depth
# Otherwise proceed
if (max(diffs, na.rm = TRUE) >= deltaSigma)
{
# the uniform region, if present, occurs where the change between
# any two points is <= deltaSigma * 1/10, and the change in sigma over
# the profile has not yet exceeded deltaSigma
uniformRegion <- (abs(sigma[2:length(sigma)] -
sigma[1:(length(sigma) - 1)]) <=
(deltaSigma / 10)) &
(diffs[2:length(diffs)] < deltaSigma)
if (any(uniformRegion))
{
sigmaRefPos <- max(which(uniformRegion))
# change sigmaRef to the base of the uniform region
sigmaRef <- sigma[sigmaRefPos]
# calculate change from the new reference
reachedDeltaSigma <- abs(sigma[(sigmaRefPos + 1):length(sigma)] - sigmaRef) >= deltaSigma
# if any deeper measurements of sigma reach or exceed deltaSigma,
# linearly interpolate between the nearest points to find
# mixed-layer depth
if (any(reachedDeltaSigma))
{
pair <- min(which(reachedDeltaSigma)) + sigmaRefPos - 1
linmod <- lm(z[c(pair, pair + 1)] ~ sigma[c(pair, pair + 1)])
mld <- as.vector(linmod$coefficients[1] +
linmod$coefficients[2] *
(sigmaRef + deltaSigma))
} else {
# otherwise, mixed-layer depth is the deepest point
mld <- min(z)
}
} else {
# if there is no uniform region, just linearly interpolate mld
pair <- min(which(diffs >= deltaSigma)) - 1
linmod <- lm(z[c(pair, pair + 1)] ~ sigma[c(pair, pair + 1)])
mld <- as.vector(linmod$coefficients[1] +
linmod$coefficients[2] *
(sigmaRef + deltaSigma))
}
} else {
mld <- min(z)
}
if (pos) mld <- abs(mld)
mld
}
}
我不明白为什么在 运行使用 by() 函数时两个变量列的长度不再相等。我确定这是一个足够简单的解决方案,但我没有看到它。有什么想法吗?
我是 R 的初学者,所以请多多包涵。
这就是您可以在此处使用 by
的方式。您需要一个接受 data.frame 作为参数的函数:
by(Dummy4, list(Dummy4$MASTStation), function(df) calculate.mld(df$SigmaT, df$z))
#: CH-A01
#[1] -4
#-----------------------------------------------------------------------------------------------------------
#: CH-H02
#[1] -4
我更喜欢使用包 data.table:
library(data.table)
setDT(Dummy4)
Dummy4[, .(MLD = calculate.mld(SigmaT, z)), by = MASTStation]
# MASTStation MLD
#1: CH-A01 -4
#2: CH-H02 -4
可重现数据(使用 dput
创建):
Dummy4 <- structure(list(Key = c(1L, 2L, 3L, 4L, 5L, 782L, 783L, 784L,
785L, 786L), MASTStation = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L), .Label = c("CH-A01", "CH-H02"), class = "factor"),
Longitude = c(-168.497, -168.497, -168.497, -168.497, -168.497,
-167.048, -167.048, -167.048, -167.048, -167.048), Latitude = c(65.998,
65.998, 65.998, 65.998, 65.998, 69.493, 69.493, 69.493, 69.493,
69.493), z = c(0L, -1L, -2L, -3L, -4L, 0L, -1L, -2L, -3L,
-4L), Pressure = c(0L, 1L, 2L, 3L, 4L, 0L, 1L, 2L, 3L, 4L
), Temperature = c(1.80255, 1.80255, 1.80255, 1.8019, 1.8022,
7.7173, 7.7173, 7.7173, 7.721, 7.72125), Salinity = c(32.5315,
32.5315, 32.5315, 32.5324, 32.5325, 31.1089, 31.1089, 31.1089,
31.10885, 31.1102), SigmaT = c(26.00931, 26.00931, 26.00942,
26.00935, 26.00929, 24.26078, 24.26078, 24.25936, 24.26016,
24.26036), SigmaTheta = c(26.00931, 26.00932, 26.00943, 26.00936,
26.00931, 24.26079, 24.26081, 24.25939, 24.26021, 24.26042
), PAR = c(35.844, 35.844, 35.844, 21.383, 16.383, 25.697,
25.697, 25.697, 22.246, 17.901), ChlAFluor = c(5.6362, 5.6362,
5.5869, 5.8863, 5.9729, 1.5285, 1.5285, 1.5127, 1.4385, 1.432
)), .Names = c("Key", "MASTStation", "Longitude", "Latitude",
"z", "Pressure", "Temperature", "Salinity", "SigmaT", "SigmaTheta",
"PAR", "ChlAFluor"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))