R:(选择性地)修剪循环的结果
R: (Selectively) Trimming the Results of a Loop
我正在使用 R 编程语言。我正在学习如何用循环反复执行一个过程(例如生成一些随机数据并拟合不同的决策树)。在上一个问题中,我学习了如何生成随机数据,拟合不同的决策树并记录它们的准确性:
library(caret)
library(rpart)

# Generate data: three numeric predictors, a two-level label, and a running
# row index (group_1) that is only used to derive the target classes.
a <- rnorm(1000, 10, 10)
b <- rnorm(1000, 10, 5)
c <- rnorm(1000, 5, 10)
group <- sample(LETTERS[1:2], 1000, replace = TRUE, prob = c(0.5, 0.5))
group_1 <- 1:1000

# Put data into a frame.
d <- data.frame(a, b, c, group, group_1)
d$group <- as.factor(d$group)
e <- d  # copy of the frame before group_2 is added; not used below

# Five candidate values for each of the three cut points.
vec1 <- sample(200:300, 5)
vec2 <- sample(400:500, 5)
vec3 <- sample(700:800, 5)

# One row per (Var1, Var2, Var3) combination; Accuracy is filled in below.
df <- expand.grid(vec1, vec2, vec3)
df$Accuracy <- NA

# The resampling setup never changes, so build it once outside the loop:
# 2-fold cross-validation, run a single time.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 1
)

# Iterate over the grid rows themselves so that every accuracy is stored next
# to the cut points that produced it. (expand.grid() varies its first argument
# fastest, so a separate counter driven by nested loops with vec3 innermost
# would not line up with the rows of df.)
for (z in seq_len(nrow(df))) {
  cuts <- c(df$Var1[z], df$Var2[z], df$Var3[z])

  # Label rows 0-3 by the segment of group_1 they fall in. findInterval()
  # uses [lower, upper) intervals, so a row whose index equals a cut point
  # starts the next segment instead of falling through to class 3.
  d$group_2 <- as.factor(findInterval(d$group_1, cuts))

  # Drop group_1 from the modelling data: group_2 is derived from it.
  model_data <- d[, setdiff(names(d), "group_1")]

  TreeFit <- train(
    group_2 ~ .,
    data = model_data,
    method = "rpart",
    trControl = fitControl
  )

  pred <- predict(TreeFit, model_data)

  # confusionMatrix() expects predictions as `data` and truth as `reference`.
  con <- confusionMatrix(data = pred, reference = d$group_2)

  # Update results into the table.
  df$Accuracy[z] <- con$overall[["Accuracy"]]
}

# View the final results.
head(df)
数据框“df”包含最终结果。我担心的是:如果让这个循环迭代很多次,“df”会变得非常大。假设我只想保留“df”中按 df$Accuracy 降序排列的“前 20 行”。我可以按如下方式完成此操作:
# Sort "df" by descending "Accuracy" (rows with an NA accuracy sort last).
df_sort <- df[order(-df$Accuracy), ]
# Keep the first 20 rows. head() returns fewer rows when "df" has fewer than
# 20, whereas indexing with 1:20 would pad the result with all-NA rows.
df_final <- head(df_sort, 20)
但我担心计算机内存的限制可能会阻止创建“df”(对于大量迭代)。
我的问题:有没有办法阻止“df”达到超过 20 行?例如
- 填充“df”的前 20 行
- 如果第 21 行的精度不高于前 20 行中的最小精度,则丢弃第 21 行
- 如果第21行的精度大于前20行中最小的精度,则保留第21行并删除精度最小的行
这样,“df”的大小永远不会超过 20 行。
有人可以告诉我怎么做吗?
谢谢
您可以像这样实现逻辑:
library(caret)
library(rpart)

# Generate data: three numeric predictors, a two-level label, and a running
# row index (group_1) that is only used to derive the target classes.
a <- rnorm(1000, 10, 10)
b <- rnorm(1000, 10, 5)
c <- rnorm(1000, 5, 10)
group <- sample(LETTERS[1:2], 1000, replace = TRUE, prob = c(0.5, 0.5))
group_1 <- 1:1000

# Put data into a frame.
d <- data.frame(a, b, c, group, group_1)
d$group <- as.factor(d$group)
e <- d  # copy of the frame before group_2 is added; not used below

# Five candidate values for each of the three cut points.
vec1 <- sample(200:300, 5)
vec2 <- sample(400:500, 5)
vec3 <- sample(700:800, 5)

z <- 0  # number of models fitted so far

# Maximum number of rows kept in `result`.
top_n <- 20
# Best rows seen so far; stays NULL until the first model is fitted and never
# holds more than top_n rows once an iteration has finished.
result <- NULL

# The resampling setup never changes, so build it once outside the loops:
# 2-fold cross-validation, run a single time.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 1
)

for (i in seq_along(vec1)) {
  for (j in seq_along(vec2)) {
    for (k in seq_along(vec3)) {
      # Label rows 0-3 by the segment of group_1 they fall in. findInterval()
      # uses [lower, upper) intervals, so a row whose index equals a cut
      # point starts the next segment instead of falling through to class 3.
      d$group_2 <- as.factor(
        findInterval(d$group_1, c(vec1[i], vec2[j], vec3[k]))
      )

      # Drop group_1 from the modelling data: group_2 is derived from it.
      model_data <- d[, setdiff(names(d), "group_1")]

      TreeFit <- train(
        group_2 ~ .,
        data = model_data,
        method = "rpart",
        trControl = fitControl
      )

      pred <- predict(TreeFit, model_data)

      # confusionMatrix() expects predictions as `data`, truth as `reference`.
      con <- confusionMatrix(data = pred, reference = d$group_2)

      z <- z + 1

      # Record the cut points actually used for this fit; vec3 is indexed by
      # k, the loop variable that selected it.
      new_df <- data.frame(
        vec1 = vec1[i],
        vec2 = vec2[j],
        vec3 = vec3[k],
        Accuracy = con$overall[1]
      )

      # Append, sort by descending accuracy, and trim back to top_n rows.
      # The new row is appended last and rbind(NULL, x) is just x, so on a
      # tie with the current worst row the older row is the one that is kept.
      result <- rbind(result, new_df)
      result <- result[order(-result$Accuracy), ]
      result <- head(result, top_n)
    }
  }
}
这应该返回类似于这样的输出:
# Show the retained rows. The commented lines below are the output of one
# sample run; the data and cut points are drawn at random, so the values
# differ from run to run.
# NOTE(review): in this sample each vec2 value appears with exactly one vec3
# value (402 with 706, 414 with 779). That is the pattern produced when the
# vec3 column is filled using the vec2 loop index, so a run that records vec3
# with its own index should show mixed pairs instead.
# NOTE(review): the last sample row (0.364) is listed below rows with 0.348,
# i.e. the sample is not fully sorted by Accuracy.
result
# vec1 vec2 vec3 Accuracy
#Accuracy2 258 402 706 0.376
#Accuracy4 258 402 706 0.376
#Accuracy9 200 402 706 0.376
#Accuracy15 214 402 706 0.376
#Accuracy16 236 402 706 0.376
#Accuracy18 207 402 706 0.376
#Accuracy11 258 414 779 0.364
#Accuracy12 200 414 779 0.364
#Accuracy6 214 414 779 0.364
#Accuracy13 236 414 779 0.364
#Accuracy10 200 402 706 0.360
#Accuracy17 214 402 706 0.360
#Accuracy3 236 402 706 0.360
#Accuracy5 207 402 706 0.360
#Accuracy19 258 414 779 0.348
#Accuracy8 200 414 779 0.348
#Accuracy7 214 414 779 0.348
#Accuracy14 236 414 779 0.348
#Accuracy 207 414 779 0.348
#Accuracy1 207 414 779 0.364
我正在使用 R 编程语言。我正在学习如何用循环反复执行一个过程(例如生成一些随机数据并拟合不同的决策树)。在上一个问题中,我学习了如何生成随机数据,拟合不同的决策树并记录它们的准确性:
library(caret)
library(rpart)

# Generate data: three numeric predictors, a two-level label, and a running
# row index (group_1) that is only used to derive the target classes.
a <- rnorm(1000, 10, 10)
b <- rnorm(1000, 10, 5)
c <- rnorm(1000, 5, 10)
group <- sample(LETTERS[1:2], 1000, replace = TRUE, prob = c(0.5, 0.5))
group_1 <- 1:1000

# Put data into a frame.
d <- data.frame(a, b, c, group, group_1)
d$group <- as.factor(d$group)
e <- d  # copy of the frame before group_2 is added; not used below

# Five candidate values for each of the three cut points.
vec1 <- sample(200:300, 5)
vec2 <- sample(400:500, 5)
vec3 <- sample(700:800, 5)

# One row per (Var1, Var2, Var3) combination; Accuracy is filled in below.
df <- expand.grid(vec1, vec2, vec3)
df$Accuracy <- NA

# The resampling setup never changes, so build it once outside the loop:
# 2-fold cross-validation, run a single time.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 1
)

# Iterate over the grid rows themselves so that every accuracy is stored next
# to the cut points that produced it. (expand.grid() varies its first argument
# fastest, so a separate counter driven by nested loops with vec3 innermost
# would not line up with the rows of df.)
for (z in seq_len(nrow(df))) {
  cuts <- c(df$Var1[z], df$Var2[z], df$Var3[z])

  # Label rows 0-3 by the segment of group_1 they fall in. findInterval()
  # uses [lower, upper) intervals, so a row whose index equals a cut point
  # starts the next segment instead of falling through to class 3.
  d$group_2 <- as.factor(findInterval(d$group_1, cuts))

  # Drop group_1 from the modelling data: group_2 is derived from it.
  model_data <- d[, setdiff(names(d), "group_1")]

  TreeFit <- train(
    group_2 ~ .,
    data = model_data,
    method = "rpart",
    trControl = fitControl
  )

  pred <- predict(TreeFit, model_data)

  # confusionMatrix() expects predictions as `data` and truth as `reference`.
  con <- confusionMatrix(data = pred, reference = d$group_2)

  # Update results into the table.
  df$Accuracy[z] <- con$overall[["Accuracy"]]
}

# View the final results.
head(df)
数据框“df”包含最终结果。我担心的是:如果让这个循环迭代很多次,“df”会变得非常大。假设我只想保留“df”中按 df$Accuracy 降序排列的“前 20 行”。我可以按如下方式完成此操作:
# Sort "df" by descending "Accuracy" (rows with an NA accuracy sort last).
df_sort <- df[order(-df$Accuracy), ]
# Keep the first 20 rows. head() returns fewer rows when "df" has fewer than
# 20, whereas indexing with 1:20 would pad the result with all-NA rows.
df_final <- head(df_sort, 20)
但我担心计算机内存的限制可能会阻止创建“df”(对于大量迭代)。
我的问题:有没有办法阻止“df”达到超过 20 行?例如
- 填充“df”的前 20 行
- 如果第 21 行的精度不高于前 20 行中的最小精度,则丢弃第 21 行
- 如果第21行的精度大于前20行中最小的精度,则保留第21行并删除精度最小的行
这样,“df”的大小永远不会超过 20 行。
有人可以告诉我怎么做吗?
谢谢
您可以像这样实现逻辑:
library(caret)
library(rpart)

# Generate data: three numeric predictors, a two-level label, and a running
# row index (group_1) that is only used to derive the target classes.
a <- rnorm(1000, 10, 10)
b <- rnorm(1000, 10, 5)
c <- rnorm(1000, 5, 10)
group <- sample(LETTERS[1:2], 1000, replace = TRUE, prob = c(0.5, 0.5))
group_1 <- 1:1000

# Put data into a frame.
d <- data.frame(a, b, c, group, group_1)
d$group <- as.factor(d$group)
e <- d  # copy of the frame before group_2 is added; not used below

# Five candidate values for each of the three cut points.
vec1 <- sample(200:300, 5)
vec2 <- sample(400:500, 5)
vec3 <- sample(700:800, 5)

z <- 0  # number of models fitted so far

# Maximum number of rows kept in `result`.
top_n <- 20
# Best rows seen so far; stays NULL until the first model is fitted and never
# holds more than top_n rows once an iteration has finished.
result <- NULL

# The resampling setup never changes, so build it once outside the loops:
# 2-fold cross-validation, run a single time.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 2,
  repeats = 1
)

for (i in seq_along(vec1)) {
  for (j in seq_along(vec2)) {
    for (k in seq_along(vec3)) {
      # Label rows 0-3 by the segment of group_1 they fall in. findInterval()
      # uses [lower, upper) intervals, so a row whose index equals a cut
      # point starts the next segment instead of falling through to class 3.
      d$group_2 <- as.factor(
        findInterval(d$group_1, c(vec1[i], vec2[j], vec3[k]))
      )

      # Drop group_1 from the modelling data: group_2 is derived from it.
      model_data <- d[, setdiff(names(d), "group_1")]

      TreeFit <- train(
        group_2 ~ .,
        data = model_data,
        method = "rpart",
        trControl = fitControl
      )

      pred <- predict(TreeFit, model_data)

      # confusionMatrix() expects predictions as `data`, truth as `reference`.
      con <- confusionMatrix(data = pred, reference = d$group_2)

      z <- z + 1

      # Record the cut points actually used for this fit; vec3 is indexed by
      # k, the loop variable that selected it.
      new_df <- data.frame(
        vec1 = vec1[i],
        vec2 = vec2[j],
        vec3 = vec3[k],
        Accuracy = con$overall[1]
      )

      # Append, sort by descending accuracy, and trim back to top_n rows.
      # The new row is appended last and rbind(NULL, x) is just x, so on a
      # tie with the current worst row the older row is the one that is kept.
      result <- rbind(result, new_df)
      result <- result[order(-result$Accuracy), ]
      result <- head(result, top_n)
    }
  }
}
这应该返回类似于这样的输出:
# Show the retained rows. The commented lines below are the output of one
# sample run; the data and cut points are drawn at random, so the values
# differ from run to run.
# NOTE(review): in this sample each vec2 value appears with exactly one vec3
# value (402 with 706, 414 with 779). That is the pattern produced when the
# vec3 column is filled using the vec2 loop index, so a run that records vec3
# with its own index should show mixed pairs instead.
# NOTE(review): the last sample row (0.364) is listed below rows with 0.348,
# i.e. the sample is not fully sorted by Accuracy.
result
# vec1 vec2 vec3 Accuracy
#Accuracy2 258 402 706 0.376
#Accuracy4 258 402 706 0.376
#Accuracy9 200 402 706 0.376
#Accuracy15 214 402 706 0.376
#Accuracy16 236 402 706 0.376
#Accuracy18 207 402 706 0.376
#Accuracy11 258 414 779 0.364
#Accuracy12 200 414 779 0.364
#Accuracy6 214 414 779 0.364
#Accuracy13 236 414 779 0.364
#Accuracy10 200 402 706 0.360
#Accuracy17 214 402 706 0.360
#Accuracy3 236 402 706 0.360
#Accuracy5 207 402 706 0.360
#Accuracy19 258 414 779 0.348
#Accuracy8 200 414 779 0.348
#Accuracy7 214 414 779 0.348
#Accuracy14 236 414 779 0.348
#Accuracy 207 414 779 0.348
#Accuracy1 207 414 779 0.364