独立性测试中的 Rfast 分段错误
Rfast segmentation fault on independence test
我在使用 R 的 Rfast 包中的 G2 检验函数(g2Test)时遇到了问题:即使在我看来输入参数是正确的,它仍然报出分段错误(segmentation fault)。
更具体地说,我能够运行 manual page 中的示例代码:
# Example taken from the g2Test manual page: simulate categorical data
# coded 0 .. nvalues - 1 and run the test on columns 1, 2 and 3.
nvalues <- 3      # distinct values per variable
nvars <- 10       # number of columns
nsamples <- 5000  # number of rows
data <- matrix(
  sample(0:(nvalues - 1), size = nvars * nsamples, replace = TRUE),
  nrow = nsamples,
  ncol = nvars
)
# Distinct-value count for every column; the call below passes a literal
# vector instead of this object.
dc <- rep(nvalues, times = nvars)
res <- g2Test(data, 1, 2, 3, c(3, 3, 3))
但我无法用自己的数据运行它。函数 g2Test 的输入依次为:一个数值矩阵;三个整数参数,分别指定待检验的两列和作为条件的列(在示例中,我们检验的是在以第三列为条件时,第一列与第二列之间的依赖性);以及一个向量,其中包含每一列的唯一值个数。
我的代码遵循相同的原则,只是数据是从 ALARM 的 csv 文件中读取的:
library(readr)
library(Rfast)
# Read the ALARM data set.
path <- "datasets/alarm.csv"
dataset <- read.csv(path)

# Names of the two variables under test (c1, c2) and of the conditioning
# variables (s).
c1 <- "PVS"
c2 <- "ACO2"
s <- c("VALV", "VLNG", "VTUB", "VMCH")

# Column positions of c1 and c2; match() yields NA for an absent name.
n <- colnames(dataset)
col_c1 <- match(c1, n)
col_c2 <- match(c2, n)

# uni holds the number of distinct values per selected column, in the order
# c1, c2, then the entries of s; cols_c3 holds the positions of s.
# The conditioning part is skipped when the first entry of s is "()".
cols_c3 <- c()
uni <- c(nrow(unique(dataset[c1])), nrow(unique(dataset[c2])))
if (s[1] != "()") {
  cols_c3 <- match(s, n)
  uni <- c(
    uni,
    vapply(
      s,
      function(v) nrow(unique(dataset[v])),
      integer(1),
      USE.NAMES = FALSE
    )
  )
}

# Recode every column as the integer codes of its factor levels, then turn
# the data frame into a matrix without column names.
# NOTE(review): factor codes start at 1, not 0 - confirm what g2Test expects.
dataset[] <- lapply(dataset, function(column) unclass(as.factor(column)))
ds <- as.matrix(dataset)
colnames(ds) <- NULL

# Run the G2 test with the original column positions.
res <- g2Test(data = ds, x = col_c1, y = col_c2, cs = cols_c3, dc = uni)
但它会导致分段错误
*** caught segfault ***
address 0x1f103f96a, cause 'memory not mapped'
Traceback:
1: g2Test(ds, col_c1, col_c2, cols_c3, uni)
Possible actions:
1: abort (with core dump, if enabled)
2: normal R exit
3: exit R without saving workspace
4: exit R saving workspace
如果我只以一个变量而不是多个变量为条件,也会发生同样的情况。
我真的不明白为什么会这样,因为在我看来我的情况与参考文献中的示例相同,只是数据不同。非常感谢任何调试此问题的帮助,请告诉我是否需要指定更多信息。
首先,很抱歉我错过了您最初包含的数据!
好吧,我真希望自己能早点意识到这一点(想必您也是……)。列必须是连续的,并且取值必须从零开始。这是什么意思呢?您必须重新排列各列,使 col_c1 成为第一列,col_c2 成为第二列,依此类推;同时必须将所有值减一(因为当前的最小值是 1)。
这是我所做的(以及我如何检查它):
# The data set has no "PVS" column; "PVSAT" is assumed to be the intended one.
c1 <- "PVSAT"
# c1 <- "PVS"
# Likewise there is no "ACO2" column; "ARTCO2" is assumed instead.
c2 <- "ARTCO2"
# c2 <- "ACO2"
# None of the original conditioning names exist as columns either.
# Assumed mapping: VALV -> VENTALV, VLNG -> VENTLUNG, VTUB -> VENTTUBE,
# VMCH -> VENTMACH
s <- c("VENTALV", "VENTLUNG", "VENTTUBE", "VENTMACH")
# s <- c("VALV", "VLNG", "VTUB", "VMCH")
下一个块与您写的完全一样:
# Column positions of c1 and c2; match() yields NA for an absent name.
n <- colnames(dataset)
col_c1 <- match(c1, n)
col_c2 <- match(c2, n)

# uni holds the number of distinct values per selected column, in the order
# c1, c2, then the entries of s; cols_c3 holds the positions of s.
# The conditioning part is skipped when the first entry of s is "()".
cols_c3 <- c()
uni <- c(nrow(unique(dataset[c1])), nrow(unique(dataset[c2])))
if (s[1] != "()") {
  cols_c3 <- match(s, n)
  uni <- c(
    uni,
    vapply(
      s,
      function(v) nrow(unique(dataset[v])),
      integer(1),
      USE.NAMES = FALSE
    )
  )
}

# Recode every column as the integer codes of its factor levels, then turn
# the data frame into a matrix (column names are kept at this point).
dataset[] <- lapply(dataset, function(column) unclass(as.factor(column)))
ds <- as.matrix(dataset)
这是我将最小值设置为零的地方:
# Per-column summaries used to validate the recoding. vapply() over
# seq_len() keeps the results unnamed and type-stable and also works for a
# matrix without columns (1:ncol(ds) would give c(1, 0) in that case).
col_idx <- seq_len(ncol(ds))
count_unique <- function(j) length(unique(ds[, j]))
col_min <- function(j) min(ds[, j])

# look at the number of unique values before changing, as a means of validation
vapply(col_idx, count_unique, integer(1))
# look at the minimum, as a means of validation
vapply(col_idx, col_min, numeric(1))

# the minimum value must be zero: the factor codes start at 1, so shift by one
ds <- ds - 1

# check: minima and distinct-value counts after the shift
vapply(col_idx, col_min, numeric(1))
vapply(col_idx, count_unique, integer(1))
# Fail fast here instead of handing unexpected codes to g2Test.
stopifnot(all(vapply(col_idx, col_min, numeric(1)) == 0))
# looked as expected
接下来,我重新排列了列。我在删除名称之前执行了此操作,因此我可以使用这些名称来确保顺序正确。
# Move the tested and the conditioning columns to the front of the matrix
# so that they occupy consecutive positions starting at column 1.
# catch names before and after
n2 <- dimnames(ds)
# some of the results from this:
# [[2]]
# [1] "HISTORY" "CVP" "PCWP" "HYPOVOLEMIA"

# match() returned NA for any name that is not a column. sort() would drop
# such an NA silently and negative indexing would fail with an unrelated
# message, so stop with a clear error instead.
if (anyNA(c(col_c1, col_c2, cols_c3))) {
  stop(
    "c1, c2 and every entry of s must be column names of the data",
    call. = FALSE
  )
}

# Leading columns: c1, c2, then the conditioning columns in ascending order
# of their original position (not in the order given in s).
front <- c(col_c1, col_c2, sort(cols_c3))
# create the list of column indices other than those getting called in g2Test
tellMe <- setdiff(seq_len(ncol(ds)), front)
# rearrange using the indices
ds <- ds[, c(front, tellMe)]
# check it
(n3 <- dimnames(ds))
# some of the results from this
# [[2]]
# [1] "PVSAT" "ARTCO2" "VENTMACH" "VENTTUBE"
剩下的就是删除列名(就像您所做的那样),然后调用函数。不过,由于列的索引已经改变,您原来的对象(col_c1、col_c2、cols_c3 和 uni)在这里不能再直接使用。
colnames(ds) <- NULL
# running the G2 test
# The pre-reordering objects no longer describe the matrix:
# res <- g2Test(ds, col_c1, col_c2, sort(cols_c3), uni)
# After the reordering the tested variables are columns 1 and 2 and the
# conditioning variables follow directly, so derive the indices and the
# distinct-value counts from the matrix instead of hard-coding them.
n_cond <- sum(!is.na(cols_c3))  # sort() dropped unmatched (NA) entries
cs <- 2L + seq_len(n_cond)
dc <- vapply(
  seq_len(2L + n_cond),
  function(j) length(unique(ds[, j])),
  integer(1)
)
res2 <- g2Test(ds, 1, 2, cs, dc)
# For the data used here this is
# g2Test(ds, 1, 2, c(3, 4, 5, 6), c(3, 3, 4, 4, 4, 4)), which gave:
# $statistic
# [1] 19.78506
#
# $df
# [1] 1024
#
我在使用 R 的 Rfast 包中的 G2 检验函数(g2Test)时遇到了问题:即使在我看来输入参数是正确的,它仍然报出分段错误(segmentation fault)。
更具体地说,我能够运行 manual page 中的示例代码:
nvalues <- 3
# Rest of the g2Test manual-page example; nvalues is assigned immediately
# before this snippet. Simulates categorical data coded 0 .. nvalues - 1
# and runs the test on columns 1, 2 and 3.
nvars <- 10       # number of columns
nsamples <- 5000  # number of rows
data <- matrix(
  sample(0:(nvalues - 1), size = nvars * nsamples, replace = TRUE),
  nrow = nsamples,
  ncol = nvars
)
# Distinct-value count for every column; the call below passes a literal
# vector instead of this object.
dc <- rep(nvalues, times = nvars)
res <- g2Test(data, 1, 2, 3, c(3, 3, 3))
但我无法用自己的数据运行它。函数 g2Test 的输入依次为:一个数值矩阵;三个整数参数,分别指定待检验的两列和作为条件的列(在示例中,我们检验的是在以第三列为条件时,第一列与第二列之间的依赖性);以及一个向量,其中包含每一列的唯一值个数。
我的代码遵循相同的原则,只是数据是从 ALARM 的 csv 文件中读取的:
library(readr)
library(Rfast)
# Read the ALARM data set.
path <- "datasets/alarm.csv"
dataset <- read.csv(path)

# Names of the two variables under test (c1, c2) and of the conditioning
# variables (s).
c1 <- "PVS"
c2 <- "ACO2"
s <- c("VALV", "VLNG", "VTUB", "VMCH")

# Column positions of c1 and c2; match() yields NA for an absent name.
n <- colnames(dataset)
col_c1 <- match(c1, n)
col_c2 <- match(c2, n)

# uni holds the number of distinct values per selected column, in the order
# c1, c2, then the entries of s; cols_c3 holds the positions of s.
# The conditioning part is skipped when the first entry of s is "()".
cols_c3 <- c()
uni <- c(nrow(unique(dataset[c1])), nrow(unique(dataset[c2])))
if (s[1] != "()") {
  cols_c3 <- match(s, n)
  uni <- c(
    uni,
    vapply(
      s,
      function(v) nrow(unique(dataset[v])),
      integer(1),
      USE.NAMES = FALSE
    )
  )
}

# Recode every column as the integer codes of its factor levels, then turn
# the data frame into a matrix without column names.
# NOTE(review): factor codes start at 1, not 0 - confirm what g2Test expects.
dataset[] <- lapply(dataset, function(column) unclass(as.factor(column)))
ds <- as.matrix(dataset)
colnames(ds) <- NULL

# Run the G2 test with the original column positions.
res <- g2Test(data = ds, x = col_c1, y = col_c2, cs = cols_c3, dc = uni)
但它会导致分段错误
*** caught segfault ***
address 0x1f103f96a, cause 'memory not mapped'
Traceback:
1: g2Test(ds, col_c1, col_c2, cols_c3, uni)
Possible actions:
1: abort (with core dump, if enabled)
2: normal R exit
3: exit R without saving workspace
4: exit R saving workspace
如果我只以一个变量而不是多个变量为条件,也会发生同样的情况。
我真的不明白为什么会这样,因为在我看来我的情况与参考文献中的示例相同,只是数据不同。非常感谢任何调试此问题的帮助,请告诉我是否需要指定更多信息。
首先,很抱歉我错过了您最初包含的数据!
好吧,我真希望自己能早点意识到这一点(想必您也是……)。列必须是连续的,并且取值必须从零开始。这是什么意思呢?您必须重新排列各列,使 col_c1 成为第一列,col_c2 成为第二列,依此类推;同时必须将所有值减一(因为当前的最小值是 1)。
这是我所做的(以及我如何检查它):
# The data set has no "PVS" column; "PVSAT" is assumed to be the intended one.
c1 <- "PVSAT"
# c1 <- "PVS"
# Likewise there is no "ACO2" column; "ARTCO2" is assumed instead.
c2 <- "ARTCO2"
# c2 <- "ACO2"
# None of the original conditioning names exist as columns either.
# Assumed mapping: VALV -> VENTALV, VLNG -> VENTLUNG, VTUB -> VENTTUBE,
# VMCH -> VENTMACH
s <- c("VENTALV", "VENTLUNG", "VENTTUBE", "VENTMACH")
# s <- c("VALV", "VLNG", "VTUB", "VMCH")
下一个块与您写的完全一样:
# Column positions of c1 and c2; match() yields NA for an absent name.
n <- colnames(dataset)
col_c1 <- match(c1, n)
col_c2 <- match(c2, n)

# uni holds the number of distinct values per selected column, in the order
# c1, c2, then the entries of s; cols_c3 holds the positions of s.
# The conditioning part is skipped when the first entry of s is "()".
cols_c3 <- c()
uni <- c(nrow(unique(dataset[c1])), nrow(unique(dataset[c2])))
if (s[1] != "()") {
  cols_c3 <- match(s, n)
  uni <- c(
    uni,
    vapply(
      s,
      function(v) nrow(unique(dataset[v])),
      integer(1),
      USE.NAMES = FALSE
    )
  )
}

# Recode every column as the integer codes of its factor levels, then turn
# the data frame into a matrix (column names are kept at this point).
dataset[] <- lapply(dataset, function(column) unclass(as.factor(column)))
ds <- as.matrix(dataset)
这是我将最小值设置为零的地方:
# Per-column summaries used to validate the recoding. vapply() over
# seq_len() keeps the results unnamed and type-stable and also works for a
# matrix without columns (1:ncol(ds) would give c(1, 0) in that case).
col_idx <- seq_len(ncol(ds))
count_unique <- function(j) length(unique(ds[, j]))
col_min <- function(j) min(ds[, j])

# look at the number of unique values before changing, as a means of validation
vapply(col_idx, count_unique, integer(1))
# look at the minimum, as a means of validation
vapply(col_idx, col_min, numeric(1))

# the minimum value must be zero: the factor codes start at 1, so shift by one
ds <- ds - 1

# check: minima and distinct-value counts after the shift
vapply(col_idx, col_min, numeric(1))
vapply(col_idx, count_unique, integer(1))
# Fail fast here instead of handing unexpected codes to g2Test.
stopifnot(all(vapply(col_idx, col_min, numeric(1)) == 0))
# looked as expected
接下来,我重新排列了列。我在删除名称之前执行了此操作,因此我可以使用这些名称来确保顺序正确。
# Move the tested and the conditioning columns to the front of the matrix
# so that they occupy consecutive positions starting at column 1.
# catch names before and after
n2 <- dimnames(ds)
# some of the results from this:
# [[2]]
# [1] "HISTORY" "CVP" "PCWP" "HYPOVOLEMIA"

# match() returned NA for any name that is not a column. sort() would drop
# such an NA silently and negative indexing would fail with an unrelated
# message, so stop with a clear error instead.
if (anyNA(c(col_c1, col_c2, cols_c3))) {
  stop(
    "c1, c2 and every entry of s must be column names of the data",
    call. = FALSE
  )
}

# Leading columns: c1, c2, then the conditioning columns in ascending order
# of their original position (not in the order given in s).
front <- c(col_c1, col_c2, sort(cols_c3))
# create the list of column indices other than those getting called in g2Test
tellMe <- setdiff(seq_len(ncol(ds)), front)
# rearrange using the indices
ds <- ds[, c(front, tellMe)]
# check it
(n3 <- dimnames(ds))
# some of the results from this
# [[2]]
# [1] "PVSAT" "ARTCO2" "VENTMACH" "VENTTUBE"
剩下的就是删除列名(就像您所做的那样),然后调用函数。不过,由于列的索引已经改变,您原来的对象(col_c1、col_c2、cols_c3 和 uni)在这里不能再直接使用。
colnames(ds) <- NULL
# running the G2 test
# The pre-reordering objects no longer describe the matrix:
# res <- g2Test(ds, col_c1, col_c2, sort(cols_c3), uni)
# After the reordering the tested variables are columns 1 and 2 and the
# conditioning variables follow directly, so derive the indices and the
# distinct-value counts from the matrix instead of hard-coding them.
n_cond <- sum(!is.na(cols_c3))  # sort() dropped unmatched (NA) entries
cs <- 2L + seq_len(n_cond)
dc <- vapply(
  seq_len(2L + n_cond),
  function(j) length(unique(ds[, j])),
  integer(1)
)
res2 <- g2Test(ds, 1, 2, cs, dc)
# For the data used here this is
# g2Test(ds, 1, 2, c(3, 4, 5, 6), c(3, 3, 4, 4, 4, 4)), which gave:
# $statistic
# [1] 19.78506
#
# $df
# [1] 1024
#