如何在整洁的世界中折叠行或 data.table 在传播数据长 -> 宽后
How to collapse rows in the tidy world or with data.table after spreading data long -> wide
我在呼叫中心工作,此数据是我们自动拨号系统的所有相关数据。从 2019 年开始,我们公司开展了多项活动,以向会员提供更多可能的处方,而不是 1 个。
我们现在 运行 的情况是,会员 X 可能有 6 种处方,我们正在尝试就这些处方与他们联系,但这些处方通常是重复的。收集所有这些数据的部门会查看 IDCARD_MBR_ID 和 TARGET_DOC_ 列的唯一值,而不会注意处方药名称。
我的目标是删除重复的处方药。并重新填充任何空出的 space
药物的顺序也不总是相同的。一个简短的示例可能如下所示:
H0000000001 (member data columns) Target1, Target2
H0000000001 (same member data) Target3, Target1, Target2
我想看的是:
H0000000001 (member data columns) Target1, Target2,Target3
我已经通过收集处理了我的数据,删除了重复项,将我的目标重新编号为 1-6(这样我可以在找到 >3 时创建一个新行)并将其展开。我的问题是我的传播..我得到了这样的东西:
structure(list(IDCARD_MBR_ID = c("H0000000001", "H0000000001",
"H0000000001", "H0000000002", "H0000000002", "H0000000002", "H0000000003",
"H0000000003", "H0000000003", "H0000000004", "H0000000004", "H0000000004",
"H0000000005", "H0000000005", "H0000000005", "H0000000006", "H0000000006",
"H0000000006", "H0000000007", "H0000000007", "H0000000007", "H0000000008",
"H0000000008", "H0000000008", "H0000000009", "H0000000009", "H0000000009",
"H0000000010", "H0000000010", "H0000000010", "H0000000011", "H0000000011",
"H0000000011", "H0000000012", "H0000000012", "H0000000012"),
SDR_PERSON_ID = c("ID000000001", "ID000000001", "ID000000001",
"ID000000002", "ID000000002", "ID000000002", "ID000000003",
"ID000000003", "ID000000003", "ID000000004", "ID000000004",
"ID000000004", "ID000000005", "ID000000005", "ID000000005",
"ID000000006", "ID000000006", "ID000000006", "ID000000007",
"ID000000007", "ID000000007", "ID000000008", "ID000000008",
"ID000000008", "ID000000009", "ID000000009", "ID000000009",
"ID000000010", "ID000000010", "ID000000010", "ID000000011",
"ID000000011", "ID000000011", "ID000000012", "ID000000012",
"ID000000012"), FIRST_NAME = c("First1", "First1", "First1",
"First2", "First2", "First2", "First3", "First3", "First3",
"First4", "First4", "First4", "First5", "First5", "First5",
"First6", "First6", "First6", "First7", "First7", "First7",
"First8", "First8", "First8", "First9", "First9", "First9",
"First10", "First10", "First10", "First11", "First11", "First11",
"First12", "First12", "First12"), LAST_NAME = c("Last1",
"Last1", "Last1", "Last2", "Last2", "Last2", "Last3", "Last3",
"Last3", "Last4", "Last4", "Last4", "Last5", "Last5", "Last5",
"Last6", "Last6", "Last6", "Last7", "Last7", "Last7", "Last8",
"Last8", "Last8", "Last9", "Last9", "Last9", "Last10", "Last10",
"Last10", "Last11", "Last11", "Last11", "Last12", "Last12",
"Last12"), GENDER = c("F", "F", "F", "M", "M", "M", "M",
"M", "M", "F", "F", "F", "F", "F", "F", "F", "F", "F", "M",
"M", "M", "F", "F", "F", "F", "F", "F", "M", "M", "M", "F",
"F", "F", "F", "F", "F"), DOB = c("1901-01-01", "1901-01-01",
"1901-01-01", "1901-01-02", "1901-01-02", "1901-01-02", "1901-01-03",
"1901-01-03", "1901-01-03", "1901-01-04", "1901-01-04", "1901-01-04",
"1901-01-05", "1901-01-05", "1901-01-05", "1901-01-06", "1901-01-06",
"1901-01-06", "1901-01-07", "1901-01-07", "1901-01-07", "1901-01-08",
"1901-01-08", "1901-01-08", "1901-01-09", "1901-01-09", "1901-01-09",
"1901-01-10", "1901-01-10", "1901-01-10", "1901-01-11", "1901-01-11",
"1901-01-11", "1901-01-12", "1901-01-12", "1901-01-12"),
ADDRESS1 = c("123 Testing Way", "123 Testing Way", "123 Testing Way",
"456 Testing Way", "456 Testing Way", "456 Testing Way",
"789 Testing Way", "789 Testing Way", "789 Testing Way",
"1122 Testing Way", "1122 Testing Way", "1122 Testing Way",
"1455 Testing Way", "1455 Testing Way", "1455 Testing Way",
"1788 Testing Way", "1788 Testing Way", "1788 Testing Way",
"2121 Testing Way", "2121 Testing Way", "2121 Testing Way",
"2454 Testing Way", "2454 Testing Way", "2454 Testing Way",
"2787 Testing Way", "2787 Testing Way", "2787 Testing Way",
"3120 Testing Way", "3120 Testing Way", "3120 Testing Way",
"3453 Testing Way", "3453 Testing Way", "3453 Testing Way",
"3786 Testing Way", "3786 Testing Way", "3786 Testing Way"
), ADDRESS2 = c(NA, NA, NA, NA, NA, NA, "APT 302", "APT 302",
"APT 302", "APT 225", "APT 225", "APT 225", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), CITY = c("CLARKSVILLE", "CLARKSVILLE",
"CLARKSVILLE", "MONROE", "MONROE", "MONROE", "LOS ANGELES",
"LOS ANGELES", "LOS ANGELES", "TULSA", "TULSA", "TULSA",
"RED BUD", "RED BUD", "RED BUD", "MISSION VIEJO", "MISSION VIEJO",
"MISSION VIEJO", "MILLCREEK", "MILLCREEK", "MILLCREEK", "COLUMBIA",
"COLUMBIA", "COLUMBIA", "HOUSTON", "HOUSTON", "HOUSTON",
"APOPKA", "APOPKA", "APOPKA", "SEALY", "SEALY", "SEALY",
"SEWARD", "SEWARD", "SEWARD"), ST = c("IN", "IN", "IN", "MI",
"MI", "MI", "CA", "CA", "CA", "OK", "OK", "OK", "IL", "IL",
"IL", "CA", "CA", "CA", "UT", "UT", "UT", "MO", "MO", "MO",
"TX", "TX", "TX", "FL", "FL", "FL", "TX", "TX", "TX", "NE",
"NE", "NE"), ZIP = c(47129L, 47129L, 47129L, 48162L, 48162L,
48162L, 90028L, 90028L, 90028L, 74136L, 74136L, 74136L, 62278L,
62278L, 62278L, 92692L, 92692L, 92692L, 84124L, 84124L, 84124L,
65201L, 65201L, 65201L, 77057L, 77057L, 77057L, 32703L, 32703L,
32703L, 77474L, 77474L, 77474L, 68434L, 68434L, 68434L),
EMAIL_ADDRESS = c("Test-mail@test.com", "Test-mail@test.com",
"Test-mail@test.com", NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Test-mail@test.com", "Test-mail@test.com", "Test-mail@test.com",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Test-mail@test.com",
"Test-mail@test.com", "Test-mail@test.com", NA, NA, NA, NA,
NA, NA), PHONE1 = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L,
9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L), ARG_CLIENT_NUM = c(6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L), FORMULARYID = c(19430L, 19430L, 19431L,
19431L, 19431L, 19430L, 19430L, 19430L, 19430L, 19430L, 19430L,
19430L, 19430L, 19430L, 19430L, 19431L, 19431L, 19431L, 19430L,
19430L, 19431L, 19431L, 19431L, 19430L, 19430L, 19430L, 19431L,
19430L, 19430L, 19430L, 19430L, 19430L, 19430L, 19430L, 19430L,
19431L), CAMPAIGN_TYPE = c("PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD"), LIS_INDICATOR = c("N",
"N", "N", "N", "N", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "N", "N", "N", "N", "Y", "Y", "N", "N", "N", "N", "N",
"N", "N", "Y", "Y", "N", "Y", "Y", "Y", "Y", "Y", "N"), MONTH = c("February / DIALER",
"February / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "February / DIALER",
"February / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER"), Calltype1 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype2 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype3 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), TARGET_DOC_1 = c("A000000000025001",
NA, NA, "A000000000026001", NA, NA, "A000000000027001", NA,
NA, "A000000000028001", NA, NA, "A000000000029001", NA, NA,
"A000000000030001", NA, NA, "A000000000031001", NA, NA, "A000000000032001",
NA, NA, "A000000000033001", NA, NA, "A000000000034001", NA,
NA, "A000000000035001", NA, NA, "A000000000036001", NA, NA
), TARGET_DOC_2 = c(NA, "A000000000013001", NA, NA, "A000000000014001",
NA, NA, "A000000000015001", NA, NA, "A000000000016001", NA,
NA, "A000000000017001", NA, NA, "A000000000018001", NA, NA,
"A000000000019001", NA, NA, "A000000000020001", NA, NA, "A000000000021001",
NA, NA, "A000000000022001", NA, NA, "A000000000023001", NA,
NA, "A000000000024001", NA), TARGET_DOC_3 = c(NA, NA, "A000000000001001",
NA, NA, "A000000000002001", NA, NA, "A000000000003001", NA,
NA, "A000000000004001", NA, NA, "A000000000005001", NA, NA,
"A000000000006001", NA, NA, "A000000000007001", NA, NA, "A000000000008001",
NA, NA, "A000000000009001", NA, NA, "A000000000010001", NA,
NA, "A000000000011001", NA, NA, "A000000000012001"), TARGET_DRUG_ID_1 = c(57237014401,
NA, NA, 65862058301, NA, NA, 57237014301, NA, NA, 68180075701,
NA, NA, 65862058401, NA, NA, 55111033901, NA, NA, 65862058201,
NA, NA, 65862058405, NA, NA, 65862058601, NA, NA, 68180075801,
NA, NA, 65862058601, NA, NA, 68180076001, NA, NA), TARGET_DRUG_ID_2 = c(NA,
57237014401, NA, NA, 65862058301, NA, NA, 57237014301, NA,
NA, 68180075701, NA, NA, 65862058401, NA, NA, 55111033901,
NA, NA, 65862058201, NA, NA, 65862058405, NA, NA, 65862058601,
NA, NA, 68180075801, NA, NA, 65862058601, NA, NA, 68180076001,
NA), TARGET_DRUG_ID_3 = c(NA, NA, 65862058701, NA, NA, 65862058701,
NA, NA, 55111034001, NA, NA, 65862058601, NA, NA, 65862058701,
NA, NA, 55111034001, NA, NA, 65862058401, NA, NA, 65862058405,
NA, NA, 68180047301, NA, NA, 68180075801, NA, NA, 65862058701,
NA, NA, 65862058301), TARGET_NAME_1 = c("Target11", NA, NA,
"Target12", NA, NA, "Target13", NA, NA, "Target14", NA, NA,
"Target15", NA, NA, "Target16", NA, NA, "Target17", NA, NA,
"Target18", NA, NA, "Target19", NA, NA, "Target20", NA, NA,
"Target21", NA, NA, "Target22", NA, NA), TARGET_NAME_2 = c(NA,
"Target21", NA, NA, "Target22", NA, NA, "Target23", NA, NA,
"Target24", NA, NA, "Target25", NA, NA, "Target26", NA, NA,
"Target27", NA, NA, "Target28", NA, NA, "Target29", NA, NA,
"Target30", NA, NA, "Target31", NA, NA, "Target32", NA),
TARGET_NAME_3 = c(NA, NA, "Target31", NA, NA, "Target32",
NA, NA, "Target33", NA, NA, "Target34", NA, NA, "Target35",
NA, NA, "Target36", NA, NA, "Target37", NA, NA, "Target38",
NA, NA, "Target39", NA, NA, "Target40", NA, NA, "Target41",
NA, NA, "Target42")), class = "data.frame", row.names = c(NA,
-36L))}
每组目标都在他们自己的线上..我不知道如何折叠它们并删除 NA,这样我最终得到的结果如下所示:
structure(list(IDCARD_MBR_ID = c("H0000000001", "H0000000002",
"H0000000003", "H0000000004", "H0000000005", "H0000000006", "H0000000007",
"H0000000008", "H0000000009", "H0000000010", "H0000000011", "H0000000012"
), SDR_PERSON_ID = c("ID000000001", "ID000000002", "ID000000003",
"ID000000004", "ID000000005", "ID000000006", "ID000000007", "ID000000008",
"ID000000009", "ID000000010", "ID000000011", "ID000000012"),
FIRST_NAME = c("First1", "First2", "First3", "First4", "First5",
"First6", "First7", "First8", "First9", "First10", "First11",
"First12"), LAST_NAME = c("Last1", "Last2", "Last3", "Last4",
"Last5", "Last6", "Last7", "Last8", "Last9", "Last10", "Last11",
"Last12"), GENDER = c("F", "M", "M", "F", "F", "F", "M",
"F", "F", "M", "F", "F"), DOB = c("1/1/1901", "1/2/1901",
"1/3/1901", "1/4/1901", "1/5/1901", "1/6/1901", "1/7/1901",
"1/8/1901", "1/9/1901", "1/10/1901", "1/11/1901", "1/12/1901"
), ADDRESS1 = c("123 Testing Way", "456 Testing Way", "789 Testing Way",
"1122 Testing Way", "1455 Testing Way", "1788 Testing Way",
"2121 Testing Way", "2454 Testing Way", "2787 Testing Way",
"3120 Testing Way", "3453 Testing Way", "3786 Testing Way"
), ADDRESS2 = c(NA, NA, "APT 302", "APT 225", NA, NA, NA,
NA, NA, NA, NA, NA), CITY = c("CLARKSVILLE", "MONROE", "LOS ANGELES",
"TULSA", "RED BUD", "MISSION VIEJO", "MILLCREEK", "COLUMBIA",
"HOUSTON", "APOPKA", "SEALY", "SEWARD"), ST = c("IN", "MI",
"CA", "OK", "IL", "CA", "UT", "MO", "TX", "FL", "TX", "NE"
), ZIP = c(47129L, 48162L, 90028L, 74136L, 62278L, 92692L,
84124L, 65201L, 77057L, 32703L, 77474L, 68434L), EMAIL_ADDRESS = c("Test-mail@test.com",
NA, NA, NA, "Test-mail@test.com", NA, NA, NA, NA, "Test-mail@test.com",
NA, NA), PHONE1 = 1:12, ARG_CLIENT_NUM = c(6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), FORMULARYID = c(19430L,
19431L, 19430L, 19430L, 19430L, 19431L, 19430L, 19431L, 19430L,
19430L, 19430L, 19430L), CAMPAIGN_TYPE = c("PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD"), LIS_INDICATOR = c("N",
"N", "Y", "Y", "Y", "N", "Y", "N", "N", "Y", "Y", "Y"), MONTH = c("February / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER"), Calltype1 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype2 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype3 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), TARGET_DOC_1 = c("A000000000025001",
"A000000000026001", "A000000000027001", "A000000000028001",
"A000000000029001", "A000000000030001", "A000000000031001",
"A000000000032001", "A000000000033001", "A000000000034001",
"A000000000035001", "A000000000036001"), TARGET_DOC_2 = c("A000000000013001",
"A000000000014001", "A000000000015001", "A000000000016001",
"A000000000017001", "A000000000018001", "A000000000019001",
"A000000000020001", "A000000000021001", "A000000000022001",
"A000000000023001", "A000000000024001"), TARGET_DOC_3 = c("A000000000001001",
"A000000000002001", "A000000000003001", "A000000000004001",
"A000000000005001", "A000000000006001", "A000000000007001",
"A000000000008001", "A000000000009001", "A000000000010001",
"A000000000011001", "A000000000012001"), TARGET_DRUG_ID_1 = c(57237014401,
65862058301, 57237014301, 68180075701, 65862058401, 55111033901,
65862058201, 65862058405, 65862058601, 68180075801, 65862058601,
68180076001), TARGET_DRUG_ID_2 = c(57237014401, 65862058301,
57237014301, 68180075701, 65862058401, 55111033901, 65862058201,
65862058405, 65862058601, 68180075801, 65862058601, 68180076001
), TARGET_DRUG_ID_3 = c(65862058701, 65862058701, 55111034001,
65862058601, 65862058701, 55111034001, 65862058401, 65862058405,
68180047301, 68180075801, 65862058701, 65862058301), TARGET_NAME_1 = c("Target11",
"Target12", "Target13", "Target14", "Target15", "Target16",
"Target17", "Target18", "Target19", "Target20", "Target21",
"Target22"), TARGET_NAME_2 = c("Target21", "Target22", "Target23",
"Target24", "Target25", "Target26", "Target27", "Target28",
"Target29", "Target30", "Target31", "Target32"), TARGET_NAME_3 = c("Target31",
"Target32", "Target33", "Target34", "Target35", "Target36",
"Target37", "Target38", "Target39", "Target40", "Target41",
"Target42")), class = "data.frame", row.names = c(NA, -12L
))
我一直在看类似的问题,比如这个:
折叠重复的行
Collapse duplicated rows with different values in few columns using R
我最大的问题是我的列数太多了(出于必要.. 所有行都需要保持在一起以保持完整性)我很快就迷路了。
编辑 1:添加 'head(12)' 的 DF(请注意 DF 非常宽且难以粘贴以保持格式。这是受此问题影响的列)...另外,不确定为什么这个测试数据有些不适用,有些不适用。在我的真实数据中,它们是真正的缺失值不适用。
Calltype3 TARGET_DOC_1 TARGET_DOC_2 TARGET_DOC_3 TARGET_DRUG_ID_1 TARGET_DRUG_ID_2 TARGET_DRUG_ID_3
1 Cost Savings A000000000025001 <NA> <NA> 57237014401 NA NA
2 Cost Savings <NA> A000000000013001 <NA> NA 57237014401 NA
3 Cost Savings <NA> <NA> A000000000001001 NA NA 65862058701
4 Cost Savings A000000000026001 <NA> <NA> 65862058301 NA NA
5 Cost Savings <NA> A000000000014001 <NA> NA 65862058301 NA
6 Cost Savings <NA> <NA> A000000000002001 NA NA 65862058701
7 Cost Savings A000000000027001 <NA> <NA> 57237014301 NA NA
8 Cost Savings <NA> A000000000015001 <NA> NA 57237014301 NA
9 Cost Savings <NA> <NA> A000000000003001 NA NA 55111034001
10 Cost Savings A000000000028001 <NA> <NA> 68180075701 NA NA
11 Cost Savings <NA> A000000000016001 <NA> NA 68180075701 NA
12 Cost Savings <NA> <NA> A000000000004001 NA NA 65862058601
TARGET_NAME_1 TARGET_NAME_2 TARGET_NAME_3
1 Target11 <NA> <NA>
2 <NA> Target21 <NA>
3 <NA> <NA> Target31
4 Target12 <NA> <NA>
5 <NA> Target22 <NA>
6 <NA> <NA> Target32
7 Target13 <NA> <NA>
8 <NA> Target23 <NA>
9 <NA> <NA> Target33
10 Target14 <NA> <NA>
11 <NA> Target24 <NA>
12 <NA> <NA> Target34
编辑 4:根据 Humpelstielzchen 的建议,我 运行 这样做:
我的价差 DF = spread1
spread2 <- spread1 %>%
group_by(IDCARD_MBR_ID) %>%
mutate(MONTH = MONTH[row_number()==1],
FORMULARYID = FORMULARYID[row_number()==1],
LIS_INDICATOR = LIS_INDICATOR[row_number() ==1]) %>%
group_by_at(vars(-starts_with("TARGET"))) %>%
summarise_at(vars(starts_with("TARGET")), list(. = ~.[which(!is.na(.))[1]]))
它似乎工作正常;我有一个超过 100 万行的数据框,这帮助我减少到 ~600k;我需要一点时间来验证准确性,但看起来很棒!
试试这个:
library(dplyr)
df1 %>%
group_by(IDCARD_MBR_ID) %>%
mutate(MONTH = MONTH[row_number() == 1],
FORMULARYID = FORMULARYID[row_number() == 1],
LIS_INDICATOR = LIS_INDICATOR[row_number() == 1]) %>%
group_by_at(vars(-starts_with("TARGET"))) %>%
summarise_at(vars(starts_with("TARGET")), list(. = ~.[which(!is.na(.))[1]])) -> res
> res[,c("IDCARD_MBR_ID", "TARGET_NAME_1_.", "TARGET_NAME_2_.", "TARGET_NAME_3_." )]
# A tibble: 12 x 4
IDCARD_MBR_ID TARGET_NAME_1_. TARGET_NAME_2_. TARGET_NAME_3_.
<chr> <chr> <chr> <chr>
1 H0000000001 Target11 Target21 Target31
2 H0000000002 Target12 Target22 Target32
3 H0000000003 Target13 Target23 Target33
4 H0000000004 Target14 Target24 Target34
5 H0000000005 Target15 Target25 Target35
6 H0000000006 Target16 Target26 Target36
7 H0000000007 Target17 Target27 Target37
8 H0000000008 Target18 Target28 Target38
9 H0000000009 Target19 Target29 Target39
10 H0000000010 Target20 Target30 Target40
11 H0000000011 Target21 Target31 Target41
12 H0000000012 Target22 Target32 Target42
我不得不平衡一些分组变量(MONTH、FORMULARYID、LIS_INDICATOR),因为它们在同一个 ID 内不同。我只是为每个 ID 选择了第一个,您可以根据自己的喜好进行调整。
我在呼叫中心工作,此数据是我们自动拨号系统的所有相关数据。从 2019 年开始,我们公司开展了多项活动,以向会员提供更多可能的处方,而不是 1 个。
我们现在 运行 的情况是,会员 X 可能有 6 种处方,我们正在尝试就这些处方与他们联系,但这些处方通常是重复的。收集所有这些数据的部门会查看 IDCARD_MBR_ID 和 TARGET_DOC_ 列的唯一值,而不会注意处方药名称。
我的目标是删除重复的处方药。并重新填充任何空出的 space
药物的顺序也不总是相同的。一个简短的示例可能如下所示:
H0000000001 (member data columns) Target1, Target2
H0000000001 (same member data) Target3, Target1, Target2
我想看的是:
H0000000001 (member data columns) Target1, Target2,Target3
我已经通过收集处理了我的数据,删除了重复项,将我的目标重新编号为 1-6(这样我可以在找到 >3 时创建一个新行)并将其展开。我的问题是我的传播..我得到了这样的东西:
structure(list(IDCARD_MBR_ID = c("H0000000001", "H0000000001",
"H0000000001", "H0000000002", "H0000000002", "H0000000002", "H0000000003",
"H0000000003", "H0000000003", "H0000000004", "H0000000004", "H0000000004",
"H0000000005", "H0000000005", "H0000000005", "H0000000006", "H0000000006",
"H0000000006", "H0000000007", "H0000000007", "H0000000007", "H0000000008",
"H0000000008", "H0000000008", "H0000000009", "H0000000009", "H0000000009",
"H0000000010", "H0000000010", "H0000000010", "H0000000011", "H0000000011",
"H0000000011", "H0000000012", "H0000000012", "H0000000012"),
SDR_PERSON_ID = c("ID000000001", "ID000000001", "ID000000001",
"ID000000002", "ID000000002", "ID000000002", "ID000000003",
"ID000000003", "ID000000003", "ID000000004", "ID000000004",
"ID000000004", "ID000000005", "ID000000005", "ID000000005",
"ID000000006", "ID000000006", "ID000000006", "ID000000007",
"ID000000007", "ID000000007", "ID000000008", "ID000000008",
"ID000000008", "ID000000009", "ID000000009", "ID000000009",
"ID000000010", "ID000000010", "ID000000010", "ID000000011",
"ID000000011", "ID000000011", "ID000000012", "ID000000012",
"ID000000012"), FIRST_NAME = c("First1", "First1", "First1",
"First2", "First2", "First2", "First3", "First3", "First3",
"First4", "First4", "First4", "First5", "First5", "First5",
"First6", "First6", "First6", "First7", "First7", "First7",
"First8", "First8", "First8", "First9", "First9", "First9",
"First10", "First10", "First10", "First11", "First11", "First11",
"First12", "First12", "First12"), LAST_NAME = c("Last1",
"Last1", "Last1", "Last2", "Last2", "Last2", "Last3", "Last3",
"Last3", "Last4", "Last4", "Last4", "Last5", "Last5", "Last5",
"Last6", "Last6", "Last6", "Last7", "Last7", "Last7", "Last8",
"Last8", "Last8", "Last9", "Last9", "Last9", "Last10", "Last10",
"Last10", "Last11", "Last11", "Last11", "Last12", "Last12",
"Last12"), GENDER = c("F", "F", "F", "M", "M", "M", "M",
"M", "M", "F", "F", "F", "F", "F", "F", "F", "F", "F", "M",
"M", "M", "F", "F", "F", "F", "F", "F", "M", "M", "M", "F",
"F", "F", "F", "F", "F"), DOB = c("1901-01-01", "1901-01-01",
"1901-01-01", "1901-01-02", "1901-01-02", "1901-01-02", "1901-01-03",
"1901-01-03", "1901-01-03", "1901-01-04", "1901-01-04", "1901-01-04",
"1901-01-05", "1901-01-05", "1901-01-05", "1901-01-06", "1901-01-06",
"1901-01-06", "1901-01-07", "1901-01-07", "1901-01-07", "1901-01-08",
"1901-01-08", "1901-01-08", "1901-01-09", "1901-01-09", "1901-01-09",
"1901-01-10", "1901-01-10", "1901-01-10", "1901-01-11", "1901-01-11",
"1901-01-11", "1901-01-12", "1901-01-12", "1901-01-12"),
ADDRESS1 = c("123 Testing Way", "123 Testing Way", "123 Testing Way",
"456 Testing Way", "456 Testing Way", "456 Testing Way",
"789 Testing Way", "789 Testing Way", "789 Testing Way",
"1122 Testing Way", "1122 Testing Way", "1122 Testing Way",
"1455 Testing Way", "1455 Testing Way", "1455 Testing Way",
"1788 Testing Way", "1788 Testing Way", "1788 Testing Way",
"2121 Testing Way", "2121 Testing Way", "2121 Testing Way",
"2454 Testing Way", "2454 Testing Way", "2454 Testing Way",
"2787 Testing Way", "2787 Testing Way", "2787 Testing Way",
"3120 Testing Way", "3120 Testing Way", "3120 Testing Way",
"3453 Testing Way", "3453 Testing Way", "3453 Testing Way",
"3786 Testing Way", "3786 Testing Way", "3786 Testing Way"
), ADDRESS2 = c(NA, NA, NA, NA, NA, NA, "APT 302", "APT 302",
"APT 302", "APT 225", "APT 225", "APT 225", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), CITY = c("CLARKSVILLE", "CLARKSVILLE",
"CLARKSVILLE", "MONROE", "MONROE", "MONROE", "LOS ANGELES",
"LOS ANGELES", "LOS ANGELES", "TULSA", "TULSA", "TULSA",
"RED BUD", "RED BUD", "RED BUD", "MISSION VIEJO", "MISSION VIEJO",
"MISSION VIEJO", "MILLCREEK", "MILLCREEK", "MILLCREEK", "COLUMBIA",
"COLUMBIA", "COLUMBIA", "HOUSTON", "HOUSTON", "HOUSTON",
"APOPKA", "APOPKA", "APOPKA", "SEALY", "SEALY", "SEALY",
"SEWARD", "SEWARD", "SEWARD"), ST = c("IN", "IN", "IN", "MI",
"MI", "MI", "CA", "CA", "CA", "OK", "OK", "OK", "IL", "IL",
"IL", "CA", "CA", "CA", "UT", "UT", "UT", "MO", "MO", "MO",
"TX", "TX", "TX", "FL", "FL", "FL", "TX", "TX", "TX", "NE",
"NE", "NE"), ZIP = c(47129L, 47129L, 47129L, 48162L, 48162L,
48162L, 90028L, 90028L, 90028L, 74136L, 74136L, 74136L, 62278L,
62278L, 62278L, 92692L, 92692L, 92692L, 84124L, 84124L, 84124L,
65201L, 65201L, 65201L, 77057L, 77057L, 77057L, 32703L, 32703L,
32703L, 77474L, 77474L, 77474L, 68434L, 68434L, 68434L),
EMAIL_ADDRESS = c("Test-mail@test.com", "Test-mail@test.com",
"Test-mail@test.com", NA, NA, NA, NA, NA, NA, NA, NA, NA,
"Test-mail@test.com", "Test-mail@test.com", "Test-mail@test.com",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Test-mail@test.com",
"Test-mail@test.com", "Test-mail@test.com", NA, NA, NA, NA,
NA, NA), PHONE1 = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L,
9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L), ARG_CLIENT_NUM = c(6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L), FORMULARYID = c(19430L, 19430L, 19431L,
19431L, 19431L, 19430L, 19430L, 19430L, 19430L, 19430L, 19430L,
19430L, 19430L, 19430L, 19430L, 19431L, 19431L, 19431L, 19430L,
19430L, 19431L, 19431L, 19431L, 19430L, 19430L, 19430L, 19431L,
19430L, 19430L, 19430L, 19430L, 19430L, 19430L, 19430L, 19430L,
19431L), CAMPAIGN_TYPE = c("PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD"), LIS_INDICATOR = c("N",
"N", "N", "N", "N", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "N", "N", "N", "N", "Y", "Y", "N", "N", "N", "N", "N",
"N", "N", "Y", "Y", "N", "Y", "Y", "Y", "Y", "Y", "N"), MONTH = c("February / DIALER",
"February / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "February / DIALER",
"February / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER"), Calltype1 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype2 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype3 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), TARGET_DOC_1 = c("A000000000025001",
NA, NA, "A000000000026001", NA, NA, "A000000000027001", NA,
NA, "A000000000028001", NA, NA, "A000000000029001", NA, NA,
"A000000000030001", NA, NA, "A000000000031001", NA, NA, "A000000000032001",
NA, NA, "A000000000033001", NA, NA, "A000000000034001", NA,
NA, "A000000000035001", NA, NA, "A000000000036001", NA, NA
), TARGET_DOC_2 = c(NA, "A000000000013001", NA, NA, "A000000000014001",
NA, NA, "A000000000015001", NA, NA, "A000000000016001", NA,
NA, "A000000000017001", NA, NA, "A000000000018001", NA, NA,
"A000000000019001", NA, NA, "A000000000020001", NA, NA, "A000000000021001",
NA, NA, "A000000000022001", NA, NA, "A000000000023001", NA,
NA, "A000000000024001", NA), TARGET_DOC_3 = c(NA, NA, "A000000000001001",
NA, NA, "A000000000002001", NA, NA, "A000000000003001", NA,
NA, "A000000000004001", NA, NA, "A000000000005001", NA, NA,
"A000000000006001", NA, NA, "A000000000007001", NA, NA, "A000000000008001",
NA, NA, "A000000000009001", NA, NA, "A000000000010001", NA,
NA, "A000000000011001", NA, NA, "A000000000012001"), TARGET_DRUG_ID_1 = c(57237014401,
NA, NA, 65862058301, NA, NA, 57237014301, NA, NA, 68180075701,
NA, NA, 65862058401, NA, NA, 55111033901, NA, NA, 65862058201,
NA, NA, 65862058405, NA, NA, 65862058601, NA, NA, 68180075801,
NA, NA, 65862058601, NA, NA, 68180076001, NA, NA), TARGET_DRUG_ID_2 = c(NA,
57237014401, NA, NA, 65862058301, NA, NA, 57237014301, NA,
NA, 68180075701, NA, NA, 65862058401, NA, NA, 55111033901,
NA, NA, 65862058201, NA, NA, 65862058405, NA, NA, 65862058601,
NA, NA, 68180075801, NA, NA, 65862058601, NA, NA, 68180076001,
NA), TARGET_DRUG_ID_3 = c(NA, NA, 65862058701, NA, NA, 65862058701,
NA, NA, 55111034001, NA, NA, 65862058601, NA, NA, 65862058701,
NA, NA, 55111034001, NA, NA, 65862058401, NA, NA, 65862058405,
NA, NA, 68180047301, NA, NA, 68180075801, NA, NA, 65862058701,
NA, NA, 65862058301), TARGET_NAME_1 = c("Target11", NA, NA,
"Target12", NA, NA, "Target13", NA, NA, "Target14", NA, NA,
"Target15", NA, NA, "Target16", NA, NA, "Target17", NA, NA,
"Target18", NA, NA, "Target19", NA, NA, "Target20", NA, NA,
"Target21", NA, NA, "Target22", NA, NA), TARGET_NAME_2 = c(NA,
"Target21", NA, NA, "Target22", NA, NA, "Target23", NA, NA,
"Target24", NA, NA, "Target25", NA, NA, "Target26", NA, NA,
"Target27", NA, NA, "Target28", NA, NA, "Target29", NA, NA,
"Target30", NA, NA, "Target31", NA, NA, "Target32", NA),
TARGET_NAME_3 = c(NA, NA, "Target31", NA, NA, "Target32",
NA, NA, "Target33", NA, NA, "Target34", NA, NA, "Target35",
NA, NA, "Target36", NA, NA, "Target37", NA, NA, "Target38",
NA, NA, "Target39", NA, NA, "Target40", NA, NA, "Target41",
NA, NA, "Target42")), class = "data.frame", row.names = c(NA,
-36L))}
每组目标都在他们自己的线上..我不知道如何折叠它们并删除 NA,这样我最终得到的结果如下所示:
structure(list(IDCARD_MBR_ID = c("H0000000001", "H0000000002",
"H0000000003", "H0000000004", "H0000000005", "H0000000006", "H0000000007",
"H0000000008", "H0000000009", "H0000000010", "H0000000011", "H0000000012"
), SDR_PERSON_ID = c("ID000000001", "ID000000002", "ID000000003",
"ID000000004", "ID000000005", "ID000000006", "ID000000007", "ID000000008",
"ID000000009", "ID000000010", "ID000000011", "ID000000012"),
FIRST_NAME = c("First1", "First2", "First3", "First4", "First5",
"First6", "First7", "First8", "First9", "First10", "First11",
"First12"), LAST_NAME = c("Last1", "Last2", "Last3", "Last4",
"Last5", "Last6", "Last7", "Last8", "Last9", "Last10", "Last11",
"Last12"), GENDER = c("F", "M", "M", "F", "F", "F", "M",
"F", "F", "M", "F", "F"), DOB = c("1/1/1901", "1/2/1901",
"1/3/1901", "1/4/1901", "1/5/1901", "1/6/1901", "1/7/1901",
"1/8/1901", "1/9/1901", "1/10/1901", "1/11/1901", "1/12/1901"
), ADDRESS1 = c("123 Testing Way", "456 Testing Way", "789 Testing Way",
"1122 Testing Way", "1455 Testing Way", "1788 Testing Way",
"2121 Testing Way", "2454 Testing Way", "2787 Testing Way",
"3120 Testing Way", "3453 Testing Way", "3786 Testing Way"
), ADDRESS2 = c(NA, NA, "APT 302", "APT 225", NA, NA, NA,
NA, NA, NA, NA, NA), CITY = c("CLARKSVILLE", "MONROE", "LOS ANGELES",
"TULSA", "RED BUD", "MISSION VIEJO", "MILLCREEK", "COLUMBIA",
"HOUSTON", "APOPKA", "SEALY", "SEWARD"), ST = c("IN", "MI",
"CA", "OK", "IL", "CA", "UT", "MO", "TX", "FL", "TX", "NE"
), ZIP = c(47129L, 48162L, 90028L, 74136L, 62278L, 92692L,
84124L, 65201L, 77057L, 32703L, 77474L, 68434L), EMAIL_ADDRESS = c("Test-mail@test.com",
NA, NA, NA, "Test-mail@test.com", NA, NA, NA, NA, "Test-mail@test.com",
NA, NA), PHONE1 = 1:12, ARG_CLIENT_NUM = c(6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), FORMULARYID = c(19430L,
19431L, 19430L, 19430L, 19430L, 19431L, 19430L, 19431L, 19430L,
19430L, 19430L, 19430L), CAMPAIGN_TYPE = c("PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD",
"PDP_Lean MAPD", "PDP_Lean MAPD", "PDP_Lean MAPD"), LIS_INDICATOR = c("N",
"N", "Y", "Y", "Y", "N", "Y", "N", "N", "Y", "Y", "Y"), MONTH = c("February / DIALER",
"January / DIALER", "February / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER", "January / DIALER",
"January / DIALER", "January / DIALER"), Calltype1 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype2 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), Calltype3 = c("Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings", "Cost Savings",
"Cost Savings", "Cost Savings", "Cost Savings"), TARGET_DOC_1 = c("A000000000025001",
"A000000000026001", "A000000000027001", "A000000000028001",
"A000000000029001", "A000000000030001", "A000000000031001",
"A000000000032001", "A000000000033001", "A000000000034001",
"A000000000035001", "A000000000036001"), TARGET_DOC_2 = c("A000000000013001",
"A000000000014001", "A000000000015001", "A000000000016001",
"A000000000017001", "A000000000018001", "A000000000019001",
"A000000000020001", "A000000000021001", "A000000000022001",
"A000000000023001", "A000000000024001"), TARGET_DOC_3 = c("A000000000001001",
"A000000000002001", "A000000000003001", "A000000000004001",
"A000000000005001", "A000000000006001", "A000000000007001",
"A000000000008001", "A000000000009001", "A000000000010001",
"A000000000011001", "A000000000012001"), TARGET_DRUG_ID_1 = c(57237014401,
65862058301, 57237014301, 68180075701, 65862058401, 55111033901,
65862058201, 65862058405, 65862058601, 68180075801, 65862058601,
68180076001), TARGET_DRUG_ID_2 = c(57237014401, 65862058301,
57237014301, 68180075701, 65862058401, 55111033901, 65862058201,
65862058405, 65862058601, 68180075801, 65862058601, 68180076001
), TARGET_DRUG_ID_3 = c(65862058701, 65862058701, 55111034001,
65862058601, 65862058701, 55111034001, 65862058401, 65862058405,
68180047301, 68180075801, 65862058701, 65862058301), TARGET_NAME_1 = c("Target11",
"Target12", "Target13", "Target14", "Target15", "Target16",
"Target17", "Target18", "Target19", "Target20", "Target21",
"Target22"), TARGET_NAME_2 = c("Target21", "Target22", "Target23",
"Target24", "Target25", "Target26", "Target27", "Target28",
"Target29", "Target30", "Target31", "Target32"), TARGET_NAME_3 = c("Target31",
"Target32", "Target33", "Target34", "Target35", "Target36",
"Target37", "Target38", "Target39", "Target40", "Target41",
"Target42")), class = "data.frame", row.names = c(NA, -12L
))
我一直在看类似的问题,比如这个: 折叠重复的行 Collapse duplicated rows with different values in few columns using R
我最大的问题是我的列数太多了(出于必要.. 所有行都需要保持在一起以保持完整性)我很快就迷路了。
编辑 1:添加 'head(12)' 的 DF(请注意 DF 非常宽且难以粘贴以保持格式。这是受此问题影响的列)...另外,不确定为什么这个测试数据有些不适用,有些不适用。在我的真实数据中,它们是真正的缺失值不适用。
Calltype3 TARGET_DOC_1 TARGET_DOC_2 TARGET_DOC_3 TARGET_DRUG_ID_1 TARGET_DRUG_ID_2 TARGET_DRUG_ID_3
1 Cost Savings A000000000025001 <NA> <NA> 57237014401 NA NA
2 Cost Savings <NA> A000000000013001 <NA> NA 57237014401 NA
3 Cost Savings <NA> <NA> A000000000001001 NA NA 65862058701
4 Cost Savings A000000000026001 <NA> <NA> 65862058301 NA NA
5 Cost Savings <NA> A000000000014001 <NA> NA 65862058301 NA
6 Cost Savings <NA> <NA> A000000000002001 NA NA 65862058701
7 Cost Savings A000000000027001 <NA> <NA> 57237014301 NA NA
8 Cost Savings <NA> A000000000015001 <NA> NA 57237014301 NA
9 Cost Savings <NA> <NA> A000000000003001 NA NA 55111034001
10 Cost Savings A000000000028001 <NA> <NA> 68180075701 NA NA
11 Cost Savings <NA> A000000000016001 <NA> NA 68180075701 NA
12 Cost Savings <NA> <NA> A000000000004001 NA NA 65862058601
TARGET_NAME_1 TARGET_NAME_2 TARGET_NAME_3
1 Target11 <NA> <NA>
2 <NA> Target21 <NA>
3 <NA> <NA> Target31
4 Target12 <NA> <NA>
5 <NA> Target22 <NA>
6 <NA> <NA> Target32
7 Target13 <NA> <NA>
8 <NA> Target23 <NA>
9 <NA> <NA> Target33
10 Target14 <NA> <NA>
11 <NA> Target24 <NA>
12 <NA> <NA> Target34
编辑 4:根据 Humpelstielzchen 的建议,我 运行 这样做: 我的价差 DF = spread1
spread2 <- spread1 %>%
group_by(IDCARD_MBR_ID) %>%
mutate(MONTH = MONTH[row_number()==1],
FORMULARYID = FORMULARYID[row_number()==1],
LIS_INDICATOR = LIS_INDICATOR[row_number() ==1]) %>%
group_by_at(vars(-starts_with("TARGET"))) %>%
summarise_at(vars(starts_with("TARGET")), list(. = ~.[which(!is.na(.))[1]]))
它似乎工作正常;我有一个超过 100 万行的数据框,这帮助我减少到 ~600k;我需要一点时间来验证准确性,但看起来很棒!
试试这个:
library(dplyr)
df1 %>%
group_by(IDCARD_MBR_ID) %>%
mutate(MONTH = MONTH[row_number() == 1],
FORMULARYID = FORMULARYID[row_number() == 1],
LIS_INDICATOR = LIS_INDICATOR[row_number() == 1]) %>%
group_by_at(vars(-starts_with("TARGET"))) %>%
summarise_at(vars(starts_with("TARGET")), list(. = ~.[which(!is.na(.))[1]])) -> res
> res[,c("IDCARD_MBR_ID", "TARGET_NAME_1_.", "TARGET_NAME_2_.", "TARGET_NAME_3_." )]
# A tibble: 12 x 4
IDCARD_MBR_ID TARGET_NAME_1_. TARGET_NAME_2_. TARGET_NAME_3_.
<chr> <chr> <chr> <chr>
1 H0000000001 Target11 Target21 Target31
2 H0000000002 Target12 Target22 Target32
3 H0000000003 Target13 Target23 Target33
4 H0000000004 Target14 Target24 Target34
5 H0000000005 Target15 Target25 Target35
6 H0000000006 Target16 Target26 Target36
7 H0000000007 Target17 Target27 Target37
8 H0000000008 Target18 Target28 Target38
9 H0000000009 Target19 Target29 Target39
10 H0000000010 Target20 Target30 Target40
11 H0000000011 Target21 Target31 Target41
12 H0000000012 Target22 Target32 Target42
我不得不平衡一些分组变量(MONTH、FORMULARYID、LIS_INDICATOR),因为它们在同一个 ID 内不同。我只是为每个 ID 选择了第一个,您可以根据自己的喜好进行调整。