读取 csv 文件时的特殊字符

Special characters when reading a csv file

我正在 Windows 机器上将多个 csv 文件作为数据框列表读入 R。

     #' Read every matching file in a directory into a named list of data frames.
     #'
     #' @param path Directory to search.
     #' @param pattern Regular expression (not a glob) handed to `dir()` to
     #'   select the files to read.
     #' @return A list with one data frame per file, named by the file name
     #'   without its extension. Column names are cleaned with
     #'   `janitor::make_clean_names()` and fully empty rows/columns are dropped.
     create_lstdf_csv <- function(path, pattern = "*.csv") {
       files <- dir(path = path, pattern = pattern)
       lstdf <- purrr::map(files, function(x) {
         df <- vroom::vroom(
           file = file.path(path, x),
           .name_repair = ~ janitor::make_clean_names(.),
           # The argument is `trim_ws` and it belongs to vroom(), not to map().
           # Per the vroom docs it only trims ASCII spaces and tabs, so
           # non-breaking spaces (U+00A0) are left in place.
           trim_ws = TRUE
         )
         # Apply remove_empty() to each data frame; passing it as an extra
         # argument to an identity map never evaluated it.
         janitor::remove_empty(df, which = c("rows", "cols"))
       })
       stats::setNames(lstdf, tools::file_path_sans_ext(files))
     }

数据框中的某些列含有一些不间断空格(\xa0)。即使将 vroom 函数的 trimws 设置为 TRUE,它也没有删除前导和尾随的空白字符。

   <chr>          
 1 "CTLA4"        
 2 "PDCD1"        
 3  NA            
 4  NA            
 5 "CXCR3"        
 6  NA            
 7 "\xa0KLRK1"    
 8 "\xa0NCR3\xa0" 
 9 "\xa0NCR2"     
10 "IL-12A/IL-12B" 

当我使用 gsub("\xA0", " ", df$gene, perl = TRUE) 时,会遇到下面的错误;即使在转换为 UTF-8 编码之后,错误依然相同。

Error in gsub("\xA0", " ", df$gene, perl = TRUE) : 
  input string 7 is invalid UTF-8

有没有办法在将文件读入数据框列表时避免此错误?


数据

# dput() output of the example data: a one-column tibble (`gene`, 145 rows).
# NOTE(review): the "<U+00A0>" substrings look like a non-breaking space as
# printed by dput() -- confirm whether the real data holds this literal text
# or the U+00A0 character itself.
structure(list(gene = c("CTLA4", "PDCD1", NA, NA, "CXCR3", NA, 
"<U+00A0>KLRK1", "<U+00A0>NCR3<U+00A0>", "<U+00A0>NCR2", "IL-12A/IL-12B", 
"IL18R1 and IL18RAP", "<U+00A0>KLRK1", "IFNG", NA, "<U+00A0>KLRK1", 
"<U+00A0>KLRK1", "CXCR (gene group)", "CTLA4", "CTLA4", "PDCD1<U+00A0>", 
"HAVCR2", "CD28", "CD28", "CTLA4", "CTLA4", "CTLA4", "CTLA4", 
"PDCD1<U+00A0>", "PDCD1<U+00A0>", "PDCD1<U+00A0>", "PDCD1<U+00A0>", 
"CD80", "CD80", "LAG3", "LAG3", "<U+00A0>HAVCR2", "<U+00A0>HAVCR2", 
"<U+00A0>HAVCR2", "TNFRSF9", "TNFRSF9", "TNFRSF18", "TNFRSF18", 
"CD40", "CD40", "TNFRSF4", NA, NA, NA, NA, "TLR2", NA, NA, "<U+00A0>KLRK1", 
"<U+00A0>KLRK1", "CCR6", NA, "PDCD1<U+00A0>", "CCR4", "CCR4", 
"ITGAE", "TNFRSF9", "CSF1R", "CCR4", "CCR4", "CCR2", "CD40", 
"TNFRSF17", "TNFRSF13B", "FLT3", "CSF2RA", "CD40", "TNFRSF14", 
"IL12RB1 and IL12RB2", "IL12RB1 and IL12RB2", "IL18R1 and IL18RAP", 
"IL18R1 and IL18RAP", "IL18R1 and IL18RAP", NA, "TIGIT", "TMIGD2", 
"ICOS", "CD27", "TNFRSF14", "TNFRSF14", "TNFRSF14", "TNFRSF14", 
"<U+00A0>HAVCR2", "<U+00A0>HAVCR2", "LAG3", "LAG3", "TIGIT", 
"TIGIT", "TIGIT", "TIGIT", "TIGIT", "TIGIT", "TMIGD2", "TMIGD2", 
"ICOS", "ICOS", "CD27", "CD27", "TNFRSF9", "TNFRSF9", "TNFRSF18", 
"TNFRSF18", "TNFRSF4", "TNFRSF4", "CD40", "CD40", "TNFRSF14", 
"TNFRSF14", "FAS", "CD28", "CTLA4", "PDCD1<U+00A0>", "CD28", 
"CD28", "CD28", "CD28", "CTLA4", "CTLA4", "CTLA4", "CTLA4", "PDCD1<U+00A0>", 
"PDCD1<U+00A0>", NA, "CD40", "PDCD1<U+00A0>", "CTLA4", "CD28", 
"IL6R", "EPHA4", "THY1", "PDCD1<U+00A0>", "CD28", "CD28", "CTLA4", 
"CTLA4", "PDCD1<U+00A0>", "<U+00A0>HAVCR2", "LAG3", "TIGIT", 
"TIGIT", NA)), row.names = c(NA, -145L), class = c("tbl_df", 
"tbl", "data.frame"))

这应该适合你:

# Strip literal "<...>" tags such as "<U+00A0>" from `gene`, keeping whatever
# capture group 3 matched after the tag. The replacement must be written
# "\\3": inside an R string "\3" is an octal escape for a control character,
# not a regex backreference.
df %>% 
  mutate(clean_gene = gsub("<([[:alpha:]][[:alnum:]]*)(.[^>]*)>([.^<]*)", "\\3", gene))

注意 clean_gene 列:

gene               clean_gene        
   <chr>              <chr>             
 1 IL-12A/IL-12B      IL-12A/IL-12B     
 2 IL18R1 and IL18RAP IL18R1 and IL18RAP
 3 <U+00A0>KLRK1      KLRK1             
 4 IFNG               IFNG              
 5 NA                 NA                
 6 <U+00A0>KLRK1      KLRK1             
 7 <U+00A0>KLRK1      KLRK1            

编辑:

要应用于 data.frame 列表:

library(purrr)
library(dplyr)

# Apply the same tag-stripping to the `gene` column of every data frame in
# the list. The replacement is "\\3" (backreference to group 3); "\3" would be
# parsed by R as an octal escape and insert a control character.
list_of_dfs <- list_of_dfs %>% 
  map(~mutate(., gene = gsub("<([[:alpha:]][[:alnum:]]*)(.[^>]*)>([.^<]*)", "\\3", gene)))