使用 mutate 和 gsub 清除数字中的逗号 - 表格通过 rvest 抓取

mutate with gsub to clean out commas from a number - table scraped with rvest

我正在练习网页抓取和数据清理,并且有一个从维基百科抓取的表格。我正在尝试用 mutate 为该表格创建一个新列,该列去掉现有列中的逗号并返回数字。但我得到的只是一列 NA。

这是我的输出:

> library(dplyr)
> library(rvest)
> 
> pg <- read_html("https://en.wikipedia.org/wiki/Rugby_World_Cup")
> rugby <- pg %>% html_table(., fill = T)
> 
> rugby_table <- rugby[[3]]
>
> rugby_table
# A tibble: 9 x 8
   Year `Host(s)`                              `Total attend­ance` Matches `Avg attend­ance` `% change in avg att.` `Stadium capacity` `Attend­ance as % o~
  <int> <chr>                                  <chr>              <chr>   <chr>            <chr>                  <chr>              <chr>              
1  1987 Australia New Zealand                  604,500            32      20,156           —                      1,006,350          60%                
2  1991 England France Ireland Scotland  Wales 1,007,760          32      31,493           +56%                   1,212,800          79%                
3  1995 South Africa                           1,100,000          32      34,375           +9%                    1,423,850          77%                
4  1999 Wales                                  1,750,000          41      42,683           +24%                   2,104,500          83%                
5  2003 Australia                              1,837,547          48      38,282           –10%                   2,208,529          83%                
6  2007 France                                 2,263,223          48      47,150           +23%                   2,470,660          92%                
7  2011 New Zealand                            1,477,294          48      30,777           –35%                   1,732,000          85%                
8  2015 England                                2,477,805          48      51,621           +68%                   2,600,741          95%                
9  2019 Japan                                  1,698,528          45†     37,745           –27%                   1,811,866          90%                
> 
> rugby_table2 <- rugby %>%
+   .[[3]] %>%
+   tbl_df %>%
+   mutate(Attendance=as.numeric(gsub("[^0-9.-]+","",'Total attendance')))
>    
> rugby_table2
    # A tibble: 9 x 9
       Year `Host(s)`                              `Total attend­ance` Matches `Avg attend­ance` `% change in avg~ `Stadium capaci~ `Attend­ance as~ Attendance
      <int> <chr>                                  <chr>              <chr>   <chr>            <chr>             <chr>            <chr>                <dbl>
    1  1987 Australia New Zealand                  604,500            32      20,156           —                 1,006,350        60%                     NA
    2  1991 England France Ireland Scotland  Wales 1,007,760          32      31,493           +56%              1,212,800        79%                     NA
    3  1995 South Africa                           1,100,000          32      34,375           +9%               1,423,850        77%                     NA
    4  1999 Wales                                  1,750,000          41      42,683           +24%              2,104,500        83%                     NA
    5  2003 Australia                              1,837,547          48      38,282           –10%              2,208,529        83%                     NA
    6  2007 France                                 2,263,223          48      47,150           +23%              2,470,660        92%                     NA
    7  2011 New Zealand                            1,477,294          48      30,777           –35%              1,732,000        85%                     NA
    8  2015 England                                2,477,805          48      51,621           +68%              2,600,741        95%                     NA
    9  2019 Japan                                  1,698,528          45†     37,745           –27%              1,811,866        90%                     NA

有什么想法吗?

gsub 函数会替换所提供模式的所有匹配项。如果您想用 gsub 删除所有逗号,正确的语法是

# Writing the column as 'Total attendance' (single quotes) hands gsub() a
# literal string, so as.numeric() yields NA for every row. Refer to the column
# itself with backticks instead.
# The scraped header contains an invisible soft hyphen (U+00AD), so strip it
# from the column names first; otherwise `Total attendance` is not found.
rugby_table2 <- rugby %>%
  .[[3]] %>%
  as_tibble() %>%
  rename_with(function(nm) gsub("\u00ad", "", nm, fixed = TRUE)) %>%
  mutate(Attendance = as.numeric(gsub(",", "", `Total attendance`)))

编辑:

# Reproducible copy of the scraped table (dput() output), one column per entry.
# All columns except Year are character, exactly as scraped.
rugby_table <- structure(
  list(
    Year = c(1987L, 1991L, 1995L, 1999L, 2003L, 2007L, 2011L, 2015L, 2019L),
    `Host(s)` = c(
      "AustraliaNewZealand", "EnglandFranceIrelandScotlandWales",
      "SouthAfrica", "Wales", "Australia", "France", "NewZealand",
      "England", "Japan"
    ),
    `Total attendance` = c(
      "604,500", "1,007,760", "1,100,000", "1,750,000", "1,837,547",
      "2,263,223", "1,477,294", "2,477,805", "1,698,528"
    ),
    Matches = c("32", "32", "32", "41", "48", "48", "48", "48", "45+"),
    `Avg attendance` = c(
      "20,156", "31,493", "34,375", "42,683", "38,282", "47,150",
      "30,777", "51,621", "37,745"
    ),
    `% change in avg att` = c(
      "—", "56%", "9%", "24%", "–10%", "23%", "–35%", "68%", "–27%"
    ),
    `Stadium capacity` = c(
      "1,006,350", "1,212,800", "1,423,850", "2,104,500", "2,208,529",
      "2,470,660", "1,732,000", "2,600,741", "1,811,866"
    ),
    `Attendance as % o~` = c(
      "60%", "79%", "77%", "83%", "83%", "92%", "85%", "95%", "90%"
    )
  ),
  row.names = c(NA, -9L),
  class = c("tbl_df", "tbl", "data.frame")
)

library(dplyr)

# Drop the thousands separators, convert to numeric, and show only the result.
rugby_table %>%
  mutate(
    Attendance = `Total attendance` %>%
      gsub(pattern = ",", replacement = "") %>%
      as.numeric()
  ) %>%
  select(Attendance)
#> # A tibble: 9 x 1
#>   Attendance
#>        <dbl>
#> 1     604500
#> 2    1007760
#> 3    1100000
#> 4    1750000
#> 5    1837547
#> 6    2263223
#> 7    1477294
#> 8    2477805
#> 9    1698528

这里的困难在于 gsub 把 'Total attendance' 解释为字符串,而不是列名。我的自然反应是使用反引号而不是单引号,但随后我收到一条消息说找不到该对象。我不确定这里的问题是什么,但你可以使用 across 解决它:

# Pick the attendance column by partial name instead of its exact name, clean
# it, then unpack the one-column data frame that across() returns so that
# Attendance ends up as a plain numeric vector.
rugby_table2 <- rugby_table %>%
  mutate(
    Attendance = across(
      contains("Total"),
      function(col) as.numeric(gsub(",", "", col))
    )
  ) %>%
  mutate(Attendance = Attendance[[1]])

rugby_table2$Attendance
#> [1]  604500 1007760 1100000 1750000 1837547 2263223 1477294 2477805 1698528

编辑

Ronak Shah 已经确定了问题所在,即从网页传来的名称中有一个不可见的字符,这意味着无法识别该列。所以另一种解决方案是:

# The scraped headers carry an invisible soft hyphen (U+00AD), which is why
# `Total attendance` could not be found. Strip it from every column name
# instead of retyping a single name by position, so the other affected
# headers (e.g. the average attendance column) are repaired as well.
names(rugby_table) <- gsub("\u00ad", "", names(rugby_table), fixed = TRUE)
rugby_table2 <- rugby_table %>%
  mutate(Attendance = as.numeric(gsub(",", "", `Total attendance`)))

rugby_table2$Attendance
#> [1]  604500 1007760 1100000 1750000 1837547 2263223 1477294 2477805 1698528