潜在客户之间的R距离

R distance between potential clients

我有一份关于公司的数据 table,另一份关于银行的数据。我已经设法为每个银行和每个公司的每个城市生成经度和纬度。我想做的是为每家公司找到公司和银行之间的平均距离。

例如,假设我有以下数据集(我实际上有 400 多家银行和几千家公司):

Data_firm <- data.frame(
  Firm = c("A", "B"),
  Postal_firm = c("20246", "67720"),
  Longfirm = c("9.2","7.8"),
  Latfirm = c("42.6", "48.7")
  )

Data_bank <- data.frame(
  Bank = c("AB", "AC"),
  Postal_bank = c("50670", "88290"),
  Longbank = c("-1.2","6.8"),
  Latbank = c("48.7", "48.0"),
  Assets = c("100", "200"))

我想在 Data_firm 中添加一列,其中包含公司与系统中所有银行之间的平均距离(我使用距离 harvestine 计算),另一列包含按银行规模加权的平均距离(但我的问题是第一步)

提前致谢,

可能的解决方案:

library(tidyverse)
library(geosphere)

Data_firm <- data.frame(
  Firm = c("A", "B", "C"),
  Postal_firm = c("20246", "67720", "77720"),
  Longfirm = c("9.2","7.8", "8.1"),
  Latfirm = c("42.6", "48.7", "50")
)

Data_bank <- data.frame(
  Bank = c("AB", "AC"),
  Postal_bank = c("50670", "88290"),
  Longbank = c("-1.2","6.8"),
  Latbank = c("48.7", "48.0"),
  Assets = c("100", "200"))

# There are 2 banks and 3 firms

Data_firm %>% 
  inner_join(Data_bank, by=character()) %>% 
  mutate(across(starts_with(c("Lat","Long")), as.numeric)) %>% 
  rowwise() %>% 
  mutate(dist = distm(c(Longbank, Latbank), c(Longfirm, Latfirm),
    fun = distHaversine))

#>   Firm Postal_firm Longfirm Latfirm Bank Postal_bank Longbank Latbank Assets
#> 1    A       20246      9.2    42.6   AB       50670     -1.2    48.7    100
#> 2    A       20246      9.2    42.6   AC       88290      6.8    48.0    200
#> 3    B       67720      7.8    48.7   AB       50670     -1.2    48.7    100
#> 4    B       67720      7.8    48.7   AC       88290      6.8    48.0    200
#> 5    C       77720      8.1    50.0   AB       50670     -1.2    48.7    100
#> 6    C       77720      8.1    50.0   AC       88290      6.8    48.0    200
#>        dist
#> 1 1054789.9
#> 2  629728.5
#> 3  660855.4
#> 4  107446.8
#> 5  689276.5
#> 6  242027.5

这里有一个数据table/geosphere方法

library(data.table)
library(geosphere)
setDT(Data_firm);setDT(Data_bank)
#create data.table of all combinations of bank and firm
ans <- CJ(firm = Data_firm$Firm,bank = Data_bank$Bank)
# join in the coordinates
ans[Data_firm, `:=`(lon_f = i.Longfirm, lat_f = i.Latfirm), on = .(firm = Firm)]
ans[Data_bank, `:=`(lon_b = i.Longbank, lat_b = i.Latbank, assets = i.Assets), on = .(bank = Bank)]
# set coordinates to numeric
cols <- grep("lat|lon|ass", names(ans), value = TRUE)
ans[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
# calculate rowwise distance between firm and bank
ans[, firm_to_bank := distHaversine(matrix(c(lon_f, lat_f), ncol = 2),
                                    matrix(c(lon_b, lat_b), ncol = 2))]
#    firm bank lon_f lat_f lon_b lat_b assets firm_to_bank
# 1:    A   AB   9.2  42.6  -1.2  48.7    100    1054789.9
# 2:    A   AC   9.2  42.6   6.8  48.0    200     629728.5
# 3:    B   AB   7.8  48.7  -1.2  48.7    100     660855.4
# 4:    B   AC   7.8  48.7   6.8  48.0    200     107446.8

# calculate average distance to bank by firm
ans[, avg_dist_to_bank := mean(firm_to_bank), by = .(firm)]
ans[, wavg_dist_to_bank := weighted.mean(firm_to_bank, assets), by = .(firm)]
#    firm bank lon_f lat_f lon_b lat_b assets firm_to_bank avg_dist_to_bank wavg_dist_to_bank
# 1:    A   AB   9.2  42.6  -1.2  48.7    100    1054789.9         842259.2          771415.6
# 2:    A   AC   9.2  42.6   6.8  48.0    200     629728.5         842259.2          771415.6
# 3:    B   AB   7.8  48.7  -1.2  48.7    100     660855.4         384151.1          291916.3
# 4:    B   AC   7.8  48.7   6.8  48.0    200     107446.8         384151.1          291916.3