R:对多个 m 值调用 combn 函数并定义输出变量的名称

R: implement combn function with several m and define output variables

我正在使用以下代码获取名称以 "form" 开头的变量的所有可能组合 (m=2) 的平均值。

# Positions of the columns whose names begin with "form".
k <- grep("^form", colnames(data))
# Row means for every pair of those columns, one result column per pair.
combined <- combn(data[, k], m = 2, FUN = rowMeans)
# Name each result column by concatenating the names of its two source columns.
colnames(combined) <- combn(names(data[, k]), m = 2, FUN = paste0, collapse = "")
# Append the pairwise means to the original data.
data <- cbind(data, combined)

数据集 "data" 如下:

structure(list(id = c(5309039, 5284969, 5300279, 5270289, 5259957, 
5267086, 5173196, 5057536, 5246135, 5255558, 5241070, 5280194, 
5112387, 444459, 5054590, 5048412, 5296390, 5093742, 5293520), 
    form13 = c(1300.81321145176, 1130.23869905075, 1292.03253463863, 
    1358.23586808642, 1250.66417156907, 1388.37813595599, 1277.89625553694, 
    1242.17552321015, 1275.95068420011, 1449.97932094858, 1494.93158409261, 
    1183.72005024492, 1319.72081010904, 1153.43556746197, 1451.47500658524, 
    1502.05308533551, 1641.66472289938, 1407.07852441646, 1444.3815517771
    ), form12 = c(1329.6, 1104.4, 1272, 1322.8, 1195.5, 1487.4, 
    1195.6, 1258, 1256.4, 1455, 1524, 1170, 1291.4, 1224.6, 1414, 
    1606, 1765.2, 1441, 1406.8), form11 = c(1325.578, 1201.752, 
    1346.42, 1424.884, 1328.03, 1367.262, 1294.928, 1278.99, 
    1330.482, 1493.54, 1524.19, 1242.21, 1379.522, 1178.458, 
    1438.37, 1475.15, 1611.236, 1426.11, 1431.014), form10 = c(1056.7264, 
    940.4956, 1076.29, 1149.9412, 1059.028, 1095.8536, 1027.9564, 
    1012.996, 1061.3296, 1214.386, 1243.156, 978.472, 1107.3616, 
    918.6304, 1162.6, 1197.124, 1324.8628, 1151.092, 1155.6952
    ), form9 = c(1265.95883621535, 1104.13796282321, 1292.61038190038, 
    1391.60226122629, 1269.10247448997, 1319.10781736395, 1226.47462059388, 
    1205.80097696249, 1272.24391797013, 1476.61400008329, 1514.11964245256, 
    1157.70450530205, 1334.62450699242, 1072.96302932, 1408.41424685422, 
    1453.98138963552, 1619.24856353662, 1393.1329826012, 1399.25113387699
    ), form8 = c(1482.14960970768, 1302.96011430734, 1455.11530997823, 
    1507.60187999797, 1403.62372119021, 1590.3115445541, 1392.70107590683, 
    1422.72772811208, 1440.68241714823, 1606.14610155669, 1656.53381495283, 
    1357.47229571355, 1476.63693689195, 1356.28387443873, 1567.80354390345, 
    1697.01564123702, 1829.93948069795, 1581.30521692185, 1561.45650301116
    ), form7 = c(1444.56088362196, 1256.09569669502, 1416.12716131828, 
    1471.33068319787, 1361.97012558123, 1558.32178921338, 1350.4820727773, 
    1382.06304580259, 1400.94715403591, 1574.97601740197, 1627.97203596215, 
    1313.42968513872, 1438.7628489193, 1312.17974558614, 1534.64866852904, 
    1670.54939207752, 1810.35399499291, 1548.84925168016, 1527.97307493173
    ), form6 = c(1199.39256844313, 1030.51525282711, 1173.91406615889, 
    1223.38008553142, 1125.38576782367, 1301.32988998026, 1115.09171006788, 
    1143.39035787661, 1160.31177216137, 1316.25318375141, 1363.74113364133, 
    1081.8903116367, 1194.19714454337, 1080.77028284113, 1280.11720270038, 
    1401.89327051093, 1527.16747332837, 1292.84186767351, 1274.13542778885
    ), form5 = c(1297.78687926793, 1159.12885718351, 1290.6491699916, 
    1344.46508388198, 1257.02131246849, 1368.96738018114, 1239.89545043121, 
    1250.12098970015, 1277.57642224122, 1419.04226152712, 1455.58342941928, 
    1202.60322079507, 1313.15664462902, 1177.98531965952, 1380.99558290387, 
    1461.37241431927, 1574.8610783177, 1384.16870680163, 1375.22939662201
    ), form4 = c(1335.97776730397, 1108.36308048125, 1324.2608292059, 
    1412.60257966574, 1269.05887158687, 1452.82443206729, 1240.94583733479, 
    1257.73161635649, 1302.80120256198, 1535.02507407783, 1595.00938916382, 
    1179.7286135352, 1361.20807332313, 1139.31698950533, 1472.56938122075, 
    1604.51232282192, 1790.81013902909, 1477.77823673001, 1463.10387273464
    ), form3 = c(1354.228, 1167.277, 1385.695, 1504.159, 1357.93, 
    1417.162, 1307.953, 1283.89, 1361.632, 1607.815, 1654.09, 
    1228.36, 1435.672, 1132.108, 1524.52, 1580.05, 1785.511, 
    1506.01, 1513.414), form2 = c(2275.7324829005, 1960.23260237236, 
    2259.163108513, 2384.94888103794, 2181.57337654262, 2442.86896126772, 
    2142.36120747078, 2165.7494001933, 2228.9072421228, 2562.48497832825, 
    2650.8148703194, 2057.68931533889, 2311.5302827576, 2002.33637794664, 
    2471.44922673607, 2664.88828208925, 2945.12448823488, 2479.00498842122, 
    2457.73611045874), form1 = c(1180.88828860349, 1056.82591443514, 
    1162.17101167316, 1198.5102427986, 1126.52065872992, 1255.77452231775, 
    1118.95833314255, 1139.74737411054, 1152.17835587263, 1266.73762443072, 
    1301.62370599969, 1094.56758356167, 1177.07157336578, 1093.7447765967, 
    1240.19104186727, 1329.65141749175, 1421.68162869499, 1249.53896489237, 
    1235.79664943772)), row.names = c(NA, -19L), class = c("tbl_df", 
"tbl", "data.frame"))
> 

这段代码运行良好。现在我想对它进行扩展,使其能对 m 从 2 到 8 的所有可能组合都进行计算。我已经尝试了以下代码,但它不起作用。

# Intended range of combination sizes.
x<-2:8
k=which(grepl("^form",colnames(data)))
# NOTE: combn() receives a vector for m here (seq_along(x) is 1:7, not 2:8);
# this call stops with "length(m) == 1L is not TRUE".
combined <- combn(data[,k], seq_along(x), FUN = rowMeans)
# Same problem: m must be a single number.
colnames(combined) <- combn(names(data[,k]), seq_along(x), paste0, collapse="")
data <- cbind(data, combined)

因为我收到以下错误:

> x<-2:8
> k=which(grepl("^form",colnames(data)))
> combined <- combn(data[,k], seq_along(x), FUN = rowMeans)
**Error in combn(data[, k], seq_along(x), FUN = rowMeans) : 
  length(m) == 1L is not TRUE**
> colnames(combined) <- combn(names(data[,k]), seq_along(x), paste0, collapse="")
**Error in combn(names(data[, k]), seq_along(x), paste0, collapse = "") : 
  length(m) == 1L is not TRUE**
> data <- cbind(data, combined)

我哪里错了?

此外,我想在所有生成的变量的名称中添加以下前缀 "comb_"。我该如何修改上面的代码?

谢谢!

函数 combn 的参数 m(每个组合中的元素个数)一次只能取一个值,所以需要用 lapply 对各个 m 进行迭代,最后用 do.call(cbind, ...) 把结果合并起来:

首先我们定义一个函数,针对给定的 x(即 combn 的 m)计算所有组合:

# Build a matrix of row means for every size-x combination of DATA's columns.
#
# Args:
#   x:      Single number, how many columns go into each combination
#           (passed to combn() as m).
#   DATA:   Data frame whose columns are combined.
#   prefix: String prepended to every generated column name. Defaults to ""
#           (no prefix); pass "comb_" to get names such as "comb_form1form2".
#
# Returns:
#   The result of combn(DATA, x, FUN = rowMeans), with each column named by
#   the prefix followed by the concatenated names of the combined columns.
func <- function(x, DATA, prefix = "") {
  mat <- combn(DATA, x, FUN = rowMeans)
  colnames(mat) <- paste0(
    prefix,
    combn(names(DATA), x, FUN = paste0, collapse = "")
  )
  mat
}

然后我们迭代:

# Positions of the columns whose names begin with "form".
k <- grep("^form", colnames(data))
# Call func once per combination size (2 to 8) and bind the resulting
# matrices column-wise into one matrix.
combined <- do.call(cbind, lapply(2:8, func, DATA = data[, k]))

如果你熟悉purrr,你也可以这样做:

library(purrr)
library(dplyr)

# Convert each per-m matrix with as_tibble() (as.tibble() is deprecated in
# tibble) and bind the pieces column-wise.
combined <- 2:8 %>%
  map(~ as_tibble(func(.x, DATA = data[, k]))) %>%
  bind_cols()

原因很简单:combn 一次只接受一个 m。只需使用 sapply(或返回类型更稳定的 lapply)遍历 m 即可。为了一步得到列名,我们可以使用 `colnames<-`():`colnames<-`(x, names) 实际上与 colnames(x) <- names 相同,但优点是一切都在右侧(RHS)完成。列名中的 "form" 前缀可以用 gsub 删除。

# Column positions of form13 ... form1 in `data` (column 1 is `id`).
k <- 2:14
# One matrix of row means per combination size m. lapply() always returns a
# list, whereas sapply() may simplify its result depending on the input.
combined.2.lst <- lapply(2:8, function(m) {
  `colnames<-`(
    combn(data[, k], m, rowMeans),
    combn(names(data[, k]), m, function(x) {
      # Drop the "form" part and join the remaining numbers, e.g. "comb.13.12".
      paste0("comb.", paste0(gsub("form", "", x), collapse = "."))
    })
  )
})

这会得到一个列表,然后可以用 cbind 将其中的矩阵按列合并。

# Column-bind every per-m matrix into one wide matrix, then check its size.
combined.2 <- do.call(what = "cbind", args = combined.2.lst)
dim(combined.2)
# [1]   19 7085

结果

# Preview the first five rows of a few columns taken from different
# combination sizes; the printed output follows.
combined.2[1:5, c(1, 50, 100, 500, 1000, 5000)]  # example columns
#      comb.13.12 comb.9.1 comb.13.10.9 comb.13.10.2.1 comb.9.5.4.3 comb.13.7.6.5.4.3.2
# [1,]   1315.207 1223.424     1207.833       1453.540     1313.488            1458.356
# [2,]   1117.319 1080.482     1058.291       1271.948     1134.727            1258.836
# [3,]   1282.016 1227.391     1220.311       1447.414     1323.304            1448.835
# [4,]   1340.518 1295.056     1299.926       1522.909     1413.207            1528.446
# [5,]   1223.082 1197.812     1192.932       1404.447     1288.278            1400.515

最后只需执行 cbind(data, combined.2) 即可。

您需要使用 lapply() 或 sapply() 对 m <- 2:8 进行迭代。我试图保留您代码的主要结构,并只做最少的改动使其正常工作:

# Combination sizes to evaluate; combn() accepts only one m per call.
m <- 2:8
# Positions of the columns whose names begin with "form".
k <- grep("^form", colnames(data))
# Row means for every combination, computed per size and bound column-wise.
combined <- do.call(
  cbind,
  lapply(m, function(size) combn(data[, k], size, FUN = rowMeans))
)
# Column names in the same order as the columns above, carrying the requested
# "comb_" prefix. lapply() keeps the per-size results a list before unlist().
colnames(combined) <- paste0(
  "comb_",
  unlist(lapply(m, function(size) {
    combn(names(data[, k]), size, FUN = paste0, collapse = "")
  }))
)
data <- cbind(data, combined)