重塑不随时间变化的文件

Question

我想重塑文件，但重塑的典型用法不适用于我的问题（我不认为）。我可以手动完成此操作，但它很容易出错，而且我不会学习如何使用 R 来完成此操作。所以在阅读了 Reshape 和 Tidyr 之后，我决定来这里。

我有一份电力公司的档案以及它们运营所在的县。我希望能够重塑它，使其成为一个县列表以及其中运营的电力公司。我还有一个变量，用于确定我是否在另一个名为“InData”的数据集中拥有每个实用程序的数据。然后我会在这个基础上确定每个县的缺失程度是多少。

列名是： UtilityName, InData, County1,County2,County3,County4... County12

是否有一种简单的方法来重塑它，或者我是否需要创建某种遍历每一列的循环？

这是数据示例：>

dput(utility_county)
structure(list(UtilityName = c("Alder Mutual Light Co Inc", "Avista Corporation", 
"Asotin County PUD", "Benton County PUD", "Blaine City Light", 
"Centralia City Light", "Chelan County PUD", "City of Cashmere", 
"City of Chewelah, Electric Department", "City of Cheney", "City of Coulee Dam Light Dept", 
"City of Ellensburg", "City of McCleary", "City of Milton", "City of Richland", 
"City of Sumas", "Clallam County PUD", "Clark County PUD", "Clearwater Power", 
"Columbia Rural Electric", "Cowlitz County PUD", "Douglas County PUD", 
"Elmhurst Power & Light Co", "Ferry County PUD", "Franklin County PUD", 
"Grant County PUD", "Grays Harbor County PUD", "Inland Power & Light", 
"Jefferson County PUD", "Kittitas County PUD", "Klickitat County PUD", 
"Kootenai Electric Cooperative Inc", "Lakeview Light & Power", 
"Lewis County PUD", "Mason County PUD1", "Mason County PUD3", 
"Modern Electric Water Company", "Nespelem Valley Electric Cooperative", 
"Ohop Mutual Light Co", "Okanogan PUD", "Orcas Power and Light Coop", 
"Pacific County PUD", "Pacific Power", "Parkland Light & Power", 
"Pend Oreille PUD", "Peninsula Light Company", "Port Angeles City Light", 
"Puget Sound Energy", "Seattle City Light", "Skamania PUD", "Snohomish County PUD", 
"Tacoma Public Utilities", "Tanner Electric Cooperative", "Town of Eatonville", 
"Town of Ruston", "Town of Steilacoom", "Vera Water & Power", 
"Wahkiakum County PUD", "Whatcom County PUD", "Big Bend Electric", 
"Northern Lights Inc"), InData = c(0L, 1L, 0L, 1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 1L), UtilityType = c("Coop", "Private", "Public", 
"Public", "Public", "Public", "Public", "Public", "Public", "Public", 
"Public", "Public", "Public", "Public", "Public", "Public", "Public", 
"Public", "Coop", "Coop", "Public", "Public", "Coop", "Public", 
"Public", "Public", "Public", "Coop", "Public", "Public", "Public", 
"Coop", "Coop", "Public", "Public", "Public", "Public", "Coop", 
"Coop", "Public", "Coop", "Public", "Private", "Coop", "Public", 
"Coop", "Public", "Private", "Public", "Public", "Public", "Public", 
"Coop", "Public", "Public", "Public", "Coop", "Public", "Public", 
"Coop", "Coop"), County1 = c("Pierce", "Whitman", "Asotin", "Benton", 
"Whatcom", "Lewis", "Chelan", "Chelan", "Stevens", "Spokane", 
"Grant", "Kittitas", "Grays Harbor", "Fulton County", "Benton County", 
"Whatcom", "Clallam", "Clark", "Asotin  ", "Walla Walla", "Cowlitz", 
"Douglas", "Pierce", "Ferry", "Franklin", "Grant", "Grays Harbor", 
"Spokane", "Jefferson", "Kittitas", "Klickitat", "Spokane", "Grant", 
"Lewis", "Mason", "Mason", "Spokane", "Okanogan", "Pierce", "Okanogan", 
"San Juan", "Pacific", "Yakima", "Pierce", "Pend Oreille", "Pierce", 
"Clallam", "Island", "King", "Skamania", "Snohomish", "Pierce", 
"King", "Pierce", "Pierce", "Pierce", "Spokane", "Wahkiakum", 
"Whatcom", "Adams", "Pend Oreille"), County2 = c("", "Spokane", 
"", "", "", "", "", "", "", "", "Douglas", "", "", "", "", "", 
"", "", "Whitman", "Columbia", "", "", "", "Okanogan", "", "", 
"", "Whitman", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "Garfield", "", "", "", "", "King", "", "", "Island", 
"Mason", "Pierce", "", "", "", "", "", "", "", ""), County3 = c("", 
"Stevens", "", "", "", "", "", "", "", "", "Okanogan", "", "", 
"", "", "", "", "", "Garfield", "", "", "", "", "", "", "", "", 
"Lincoln", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "Columbia", "", "", "", "", "Kitsap", "", "", "", "Grays Harbor", 
"", "", "", "", "", "", "", "", ""), County4 = c("", "Lincoln", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "Stevens", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "Walla Walla", "", "", 
"", "", "Kittitas", "", "", "", "Lewis", "", "", "", "", "", 
"", "", "", ""), County5 = c("", "Asotin", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "Garfield", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "Benton", "", "", "", "", "Pierce", "", "", 
"", "", "", "", "", "", "", "", "", "", ""), County6 = c("", 
"Adams", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "Adams", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "Kittitas", 
"", "", "", "", "Skagit", "", "", "", "", "", "", "", "", "", 
"", "", "", ""), County7 = c("", "Ferry", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "Columbia", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "Thurston", "", "", "", 
"", "", "", "", "", "", "", "", "", ""), County8 = c("", "Franklin", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "Pend Orielle", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"Whatcom", "", "", "", "", "", "", "", "", "", "", "", "", ""
), County9 = c("", "Grant", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"Grant", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", ""), County10 = c("", "Pend Oreille", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "Asotin", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "")), row.names = c(NA, 61L
), class = "data.frame")

Answer 1

使用 reshape.

dat[] <- lapply(dat, \(x) {x[x == ''] <- NA;x})  ## encode missings first

res <- reshape(dat, varying=4:13, sep='', direction='long', timevar='County', 
               v.names='FOO')
head(res)
#                    tilityName InData UtilityType County     FOO id
# 1.1 Alder Mutual Light Co Inc      0        Coop      1  Pierce  1
# 2.1        Avista Corporation      1     Private      1 Whitman  2
# 3.1         Asotin County PUD      0      Public      1  Asotin  3
# 4.1         Benton County PUD      1      Public      1  Benton  4
# 5.1         Blaine City Light      0      Public      1 Whatcom  5
# 6.1      Centralia City Light      0      Public      1   Lewis  6

dim(res)
# [1] 610   6

Answer 2

这是 tidyr 的方法：

library(dplyr);library(tidyr)
utility_county %>% 
   pivot_longer(cols = -c(UtilityName:UtilityType),values_to = "County") %>%
   filter(County != "") %>%
   group_by(County) %>%
   summarise(Utilities = paste(UtilityName, collapse = ", "),
             Missing = n() - sum(InData))
## A tibble: 43 x 3
#   County          Utilities                                                    Missing
#   <chr>           <chr>                                                          <int>
# 1 "Adams"         Avista Corporation, Inland Power & Light, Big Bend Electric        1
# 2 "Asotin"        Avista Corporation, Asotin County PUD, Inland Power & Light        1
# 3 "Asotin  "      Clearwater Power                                                   0
# 4 "Benton"        Benton County PUD, Pacific Power                                   0
# 5 "Benton County" City of Richland                                                   1
# 6 "Chelan"        Chelan County PUD, City of Cashmere                                2
# 7 "Clallam"       Clallam County PUD, Port Angeles City Light                        1
# 8 "Clark"         Clark County PUD                                                   0
# 9 "Columbia"      Columbia Rural Electric, Inland Power & Light, Pacific Power       1
#10 "Cowlitz"       Cowlitz County PUD                                                 0
## … with 33 more rows

请注意，您的数据有 "Asotin "，但如果需要，您可以提前修复。

Answer 3

utility_county_long = 
  reshape(utility_county, 
         direction = 'long', 
         varying = grep('County\d+', names(utility_county), value = T), 
         timevar = 'county_id', 
         v.names = 'county_name', 
         idvar = grep('County\d+', names(utility_county), value = T, invert=T))

rownames(utility_county_long) = NULL # reshape concatenates the idvars into rownames 
                                     # but we don't need them

# make empties explicit NAs
utility_county_long[utility_county_long$county_name == "", 'county_name'] = NA

# remove white spaces
utility_county_long = transform(utility_county_long,
          UtilityName = trimws(UtilityName, 'both'), 
          county_name = trimws(county_name))

head(utility_county_long, 10)
counts = aggregate(list(N=utility_county_long$UtilityName), 
                   list(county=utility_county_long$county_name),
                   length)

# extra: add pasted-together utility names as in @Ian Campbell's answer
counts$utilities = aggregate(list(utilities = utility_county_long$UtilityName), 
                             list(county = utility_county_long$county_name), 
                   \(x) paste(x, collapse = ", "))$utilities


counts[, c("county", "N")]
          county  N
1          Adams  3
2         Asotin  4
3         Benton  2
4  Benton County  1
5         Chelan  2
6        Clallam  2
7          Clark  1
8       Columbia  3
9        Cowlitz  1
10       Douglas  2
11         Ferry  2
12      Franklin  2
13 Fulton County  1
14      Garfield  3
15         Grant  5
16  Grays Harbor  3
17        Island  2
18     Jefferson  1
19          King  3
20        Kitsap  1
21      Kittitas  4
22     Klickitat  1
23         Lewis  3
24       Lincoln  2
25         Mason  3
26      Okanogan  4
27       Pacific  1
28  Pend Oreille  3
29  Pend Orielle  1
30        Pierce 11
31      San Juan  1
32        Skagit  1
33      Skamania  1
34     Snohomish  1
35       Spokane  6
36       Stevens  3
37      Thurston  1
38     Wahkiakum  1
39   Walla Walla  2
40       Whatcom  4
41       Whitman  3
42        Yakima  1

重塑不随时间变化的文件

Reshaping a file that isn't time varying

r

reshape

dplyr

tidyr