data.table:对于每一行,根据另一张表生成随机值
data table: for each row generate random values from other table
我想以观测表 (table_obs) 为基础,为模拟表 (table_simul) 中的每一行随机生成航班号。
# Example data for the simulation. data.table() comes from the data.table
# package, which has to be attached before the constructors below can run.
library(data.table)

# Rows to simulate: one (date, city) pair per row; a flight is to be added.
table_simul <- data.table(
  date_f = c("2020-01-01", "2020-01-02", "2020-01-03", "2020-01-03"),
  city = c("Paris", "Paris", "London", "Berlin")
)

# Observed flights per city, each with a numeric weight.
table_obs <- data.table(
  city = c("Paris", "Paris", "Paris", "London", "London", "Berlin"),
  flight = c(1, 2, 7, 4, 5, 14),
  weight = c(0.33, 0.33, 0.33, 0.5, 0.5, 1)
)
已用数据:
Table simul:
date_f city
2020-01-01 Paris
2020-01-02 Paris
2020-01-03 London
2020-01-03 Berlin
Table obs
---------
city flight weight
Paris 1 0.33
Paris 2 0.33
Paris 7 0.33
London 4 0.5
London 5 0.5
Berlin 14 1
预期结果:
date_f city flight
2020-01-01 Paris 2
2020-01-02 Paris 2
2020-01-03 London 4
2020-01-03 Berlin 14
我想使用 data.table 包,因为数据量很大。
这是我尝试过但没有奏效的方法:
# Attempted helper: takes a city in `param_city` and returns one sampled flight.
# NOTE(review): as written the draw does not depend on `param_city`; see the
# comments on the individual lines.
get_flight_sample <- function(param_city){
# The filtered subset is computed but never assigned or used, so this line
# has no effect on the result.
table_simul[city==param_city]
# Draws one value from ALL flights in table_obs, weighted by ALL weights;
# nothing restricts the draw to the requested city. The assignment is the
# last expression, so the drawn value is what the function returns.
res <- sample(table_obs$flight,1, replace=T, prob = table_obs$weight)
}
# No `by` is given, so get_flight_sample() is called once with the whole
# `city` column and a single flight is drawn for the entire table instead of
# one draw per row.
res <- table_simul[,.(flight = get_flight_sample(city))]
这样的事情会有帮助吗?
# Simulate 1000 rows in a single vectorised step instead of growing a data
# frame row by row through `<<-` inside sapply().
# Each column is drawn independently (with replacement), so the city, flight
# and weight of a simulated row need not come from the same row of table_obs.
n_sim <- 1000L
date_pool <- seq(as.Date("2021-01-01"), as.Date("2022-12-31"), by = "day")
simul_df <- data.frame(
  # Dates are stored as character strings, like date_f in table_simul.
  date_f = as.character(sample(date_pool, n_sim, replace = TRUE)),
  city = sample(table_obs$city, n_sim, replace = TRUE),
  # flight and weight keep their numeric type (no cbind() coercion to text).
  flight = sample(table_obs$flight, n_sim, replace = TRUE),
  weight = sample(table_obs$weight, n_sim, replace = TRUE),
  stringsAsFactors = FALSE
)
最后你会得到这样的东西:
date_f city flight weight
1 2022-06-10 Berlin 7 1
2 2022-12-21 Paris 14 0.33
3 2021-08-12 Berlin 14 1
4 2022-08-30 Berlin 4 0.5
5 2022-06-30 Paris 4 1
6 2021-09-25 Paris 1 0.33
7 2022-08-01 Paris 1 0.5
8 2021-04-17 London 14 1
9 2021-06-11 London 2 0.33
10 2022-11-28 London 14 0.5
您将获得一个数据框,之后您可以将其转换为 data.table。
最佳
也许可以先用一个辅助函数从 table_obs 中按城市抽样,
然后将结果与 table_simul 连接 (join):
# Draw a single element from `i`.
#
# `i`   : vector of candidate values.
# `...` : further arguments forwarded to sample(), e.g. `prob`.
#
# Returns one element of `i`. A length-one `i` is returned as is, because
# sample() would interpret a single number n as the range 1:n.
f <- function(i, ...) {
  if (length(i) != 1) {
    return(sample(i, size = 1, ...))
  }
  i
}
# Fix the seed so the weighted draw below is reproducible.
set.seed(42)
# One draw per city: within each city group, pick a single flight using the
# group's weights as sampling probabilities.
tmp <- table_obs[, list(flight = f(flight, prob = weight)), by = "city"]
# Look up every table_simul row's city in tmp and add the matched flight as a
# new column by reference. tmp holds one flight per city, so all rows that
# share a city receive the same flight.
table_simul[, flight := tmp[table_simul, on = "city"][["flight"]]]
table_simul
# date_f city flight
#1: 2020-01-01 Paris 1
#2: 2020-01-02 Paris 1
#3: 2020-01-03 London 4
#4: 2020-01-03 Berlin 14
我想以观测表 (table_obs) 为基础,为模拟表 (table_simul) 中的每一行随机生成航班号。
# Example data for the simulation. data.table() comes from the data.table
# package, which has to be attached before the constructors below can run.
library(data.table)

# Rows to simulate: one (date, city) pair per row; a flight is to be added.
table_simul <- data.table(
  date_f = c("2020-01-01", "2020-01-02", "2020-01-03", "2020-01-03"),
  city = c("Paris", "Paris", "London", "Berlin")
)

# Observed flights per city, each with a numeric weight.
table_obs <- data.table(
  city = c("Paris", "Paris", "Paris", "London", "London", "Berlin"),
  flight = c(1, 2, 7, 4, 5, 14),
  weight = c(0.33, 0.33, 0.33, 0.5, 0.5, 1)
)
已用数据:
Table simul:
date_f city
2020-01-01 Paris
2020-01-02 Paris
2020-01-03 London
2020-01-03 Berlin
Table obs
---------
city flight weight
Paris 1 0.33
Paris 2 0.33
Paris 7 0.33
London 4 0.5
London 5 0.5
Berlin 14 1
预期结果:
date_f city flight
2020-01-01 Paris 2
2020-01-02 Paris 2
2020-01-03 London 4
2020-01-03 Berlin 14
我想使用 data.table 包,因为数据量很大。这是我尝试过但没有奏效的方法:
# Attempted helper: takes a city in `param_city` and returns one sampled flight.
# NOTE(review): as written the draw does not depend on `param_city`; see the
# comments on the individual lines.
get_flight_sample <- function(param_city){
# The filtered subset is computed but never assigned or used, so this line
# has no effect on the result.
table_simul[city==param_city]
# Draws one value from ALL flights in table_obs, weighted by ALL weights;
# nothing restricts the draw to the requested city. The assignment is the
# last expression, so the drawn value is what the function returns.
res <- sample(table_obs$flight,1, replace=T, prob = table_obs$weight)
}
# No `by` is given, so get_flight_sample() is called once with the whole
# `city` column and a single flight is drawn for the entire table instead of
# one draw per row.
res <- table_simul[,.(flight = get_flight_sample(city))]
这样的事情会有帮助吗?
# Simulate 1000 rows in a single vectorised step instead of growing a data
# frame row by row through `<<-` inside sapply().
# Each column is drawn independently (with replacement), so the city, flight
# and weight of a simulated row need not come from the same row of table_obs.
n_sim <- 1000L
date_pool <- seq(as.Date("2021-01-01"), as.Date("2022-12-31"), by = "day")
simul_df <- data.frame(
  # Dates are stored as character strings, like date_f in table_simul.
  date_f = as.character(sample(date_pool, n_sim, replace = TRUE)),
  city = sample(table_obs$city, n_sim, replace = TRUE),
  # flight and weight keep their numeric type (no cbind() coercion to text).
  flight = sample(table_obs$flight, n_sim, replace = TRUE),
  weight = sample(table_obs$weight, n_sim, replace = TRUE),
  stringsAsFactors = FALSE
)
最后你会得到这样的东西:
date_f city flight weight
1 2022-06-10 Berlin 7 1
2 2022-12-21 Paris 14 0.33
3 2021-08-12 Berlin 14 1
4 2022-08-30 Berlin 4 0.5
5 2022-06-30 Paris 4 1
6 2021-09-25 Paris 1 0.33
7 2022-08-01 Paris 1 0.5
8 2021-04-17 London 14 1
9 2021-06-11 London 2 0.33
10 2022-11-28 London 14 0.5
您将获得一个数据框,之后您可以将其转换为 data.table。 最佳
也许可以先用一个辅助函数从 table_obs 中按城市抽样,
然后将结果与 table_simul 连接 (join):
# Draw a single element from `i`.
#
# `i`   : vector of candidate values.
# `...` : further arguments forwarded to sample(), e.g. `prob`.
#
# Returns one element of `i`. A length-one `i` is returned as is, because
# sample() would interpret a single number n as the range 1:n.
f <- function(i, ...) {
  if (length(i) != 1) {
    return(sample(i, size = 1, ...))
  }
  i
}
# Fix the seed so the weighted draw below is reproducible.
set.seed(42)
# One draw per city: within each city group, pick a single flight using the
# group's weights as sampling probabilities.
tmp <- table_obs[, list(flight = f(flight, prob = weight)), by = "city"]
# Look up every table_simul row's city in tmp and add the matched flight as a
# new column by reference. tmp holds one flight per city, so all rows that
# share a city receive the same flight.
table_simul[, flight := tmp[table_simul, on = "city"][["flight"]]]
table_simul
# date_f city flight
#1: 2020-01-01 Paris 1
#2: 2020-01-02 Paris 1
#3: 2020-01-03 London 4
#4: 2020-01-03 Berlin 14