将txt文件加载到R中并根据其他数据框替换一些值

Loading txt file into R and replace some value based on other data frame

我有一个包含特定格式结构的大型 txt 文件。我的目标是使用 readLines 在 R 中加载文本,我想根据我的 df 数据框用新值替换每条记录的权重值。我不想更改 .txt 数据结构格式或解析 .txt 文件。最终输出应具有与原始 .txt (writeLines()) 完全相同的结构。我如何读取它并更新值?谢谢

这是我的参考数据框

# Reference table: one row per (house, individual) pair with its new weight.
df <- tibble::tibble(
  House_id   = c(18105265, 18117631, 18121405, 71811763, 71811763),
  id         = c("Mab", "Maa", "Mab", "Maa", "Mab"),
  new_weight = c(4567, 3367, 4500, 2455, 2872)
)

这是我的 .txt 文件的一小部分:

H18105265_0
R1_0
Mab_3416311514210525745_W923650.80
T1_0
T2_0
T3_0
V64_0_2_010_ab171900171959
H18117631_0
R1_0
Maa_1240111711220682016_W123650.80
T1_0
V74_0_1_010_aa081200081259_aa081600081859_aa082100095659_aa095700101159_aa101300105059
H18121405_0
R1_0
Mab_2467211713110643835_W923650.80
T1_0
T2_0
V62_0_1_010_090500092459_100500101059_101100101659_140700140859_141100141359
H71811763_0
R1_0
Maa_5325411210120486554_W923650.80
Mab_5325411210110485554_W723650.80
T1_0
T2_0
T3_0
T4_0

这是第一条个人记录(House_id = 18105265)的期望输出:根据 df 中的新值,将 Mab_3416311514210525745_W923650.80 更新为 Mab_3416311514210525745_W4567:

H18105265_0
R1_0
Mab_3416311514210525745_W4567
T1_0
T2_0
T3_0
V64_0_2_010_ab171900171959

您需要遍历用 readLines 读取文本文件后得到的各行。您可以使用 hpatt = 'H[0-9]+_0' 作为正则表达式,从以 H 开头的行中解析出 House_id,然后用 stringr 包来处理这些行:

# Walk over every line of the file; `lines` is the character vector returned
# by readLines() and `hpatt` the house-line pattern defined above.
for (i in seq_along(lines)) {
  line <- lines[[i]]

  # A line shaped like 'H[number]_0' opens a new house record.
  if (stringr::str_detect(line, hpatt)) {
    # Remember the house id (the digits after the leading 'H') so that the
    # weight lines that follow can be attributed to this house.
    h_id <- stringr::str_remove(
      stringr::str_extract(line, "H[0-9]+"),
      "^H"
    )
  }

在第二部分中,您可以把原始权重替换为从您的 tibble(我在这里将其命名为 replacetibble)中查到的权重。我使用的正则表达式是 mpatt = '^[a-zA-Z]+_[0-9]+_W[0-9.]+$',它查找形如 [纯字母名称]_[数字]_W[带小数的数字] 的字符串:

  # A line shaped like '[letters]_[digits]_W[number]' carries a weight.
  if (stringr::str_detect(line, mpatt)) {
    # The individual's id is everything before the first underscore. It gets
    # its own name so it cannot be confused with the `id` column used below.
    person_id <- stringr::str_split(line, "_")[[1]][[1]]
    # Look up the new weight for this house / individual pair. `replacetibble`
    # is the reference table with columns House_id, id and new_weight.
    is_match <- replacetibble$House_id == as.numeric(h_id) &
      replacetibble$id == person_id
    wt <- replacetibble$new_weight[which(is_match)]
    # Lines without an entry in the reference table are left untouched.
    if (length(wt) > 0) {
      # Avoid scientific notation so the weight is written as plain digits.
      new_wt <- format(wt[[1]], scientific = FALSE, trim = TRUE)
      # Swap only the trailing weight; anchoring at the end of the line means
      # a 'W' elsewhere in the line cannot be hit by mistake.
      lines[[i]] <- stringr::str_replace(
        line,
        "W[0-9.]+$",
        paste0("W", new_wt)
      )
    }
  }
}

Stringr (cheat sheet here) 一般在操作字符串方面还是比较厉害的

我将加载和保存部分留给你。

编辑 - 添加 id 以查找以区分非唯一 House_id。

这是一种方法,我读取数据,加入 df 中的更新权重,然后使用新权重在以“M”开头的行上创建更新值。

library(tidyverse)
read_fwf("txt_sample.txt", col_positions = fwf_empty("txt_sample.txt")) %>% # edit suggested by DanG

  # House lines ("H<digits>_...") yield the numeric house id and individual
  # lines ("M.._...") yield the id before the first underscore; every other
  # line gets NA. Extracting by pattern avoids assuming fixed widths.
  mutate(
    House_id = as.numeric(str_extract(X1, "(?<=^H)[0-9]+")),
    id = str_extract(X1, "^M[^_]*")
  ) %>%
  # Carry each house id down to the lines that belong to that house.
  fill(House_id) %>%
  # Only individual lines listed in df receive a (non-NA) new weight.
  left_join(df, by = c("House_id", "id")) %>%

  # Make a new string using the updated weight. Lines without a new weight,
  # including individuals missing from df, keep their existing string.
  mutate(X1_new = if_else(
    !is.na(new_weight),
    paste0(word(X1, end = 2, sep = "_"), "_W", new_weight),
    X1
  )) %>%

  pull(X1_new) %>%
  writeLines()

输出

H18105265_0
R1_0
Mab_3416311514210525745_W4567
T1_0
T2_0
T3_0
V64_0_2_010_ab171900171959
H18117631_0
R1_0
Maa_1240111711220682016_W3367
T1_0
V74_0_1_010_aa081200081259_aa081600081859_aa082100095659_aa095700101159_aa101300105059
H18121405_0
R1_0
Mab_2467211713110643835_W4500
T1_0
T2_0
V62_0_1_010_090500092459_100500101059_101100101659_140700140859_141100141359
H71811763_0
R1_0
Maa_5325411210120486554_W2455
Mab_5325411210110485554_W2872
T1_0
T2_0
T3_0
T4_0

我试图将每一步都放在一个新对象中,以便更好地理解发生了什么。如果您不清楚任何正则表达式,请随时询问。

id不限位数,个人id只限于以“Ma(任意字符)_”开头,可以很容易地扩展,因此一个房屋ID可以包含任意数量的个人。

library(tidyverse)
# Reference table: one row per (house, individual) pair with its new weight.
df <- tibble::tibble(
  House_id   = c(18105265, 18117631, 18121405, 71811763, 71811763),
  id         = c("Mab", "Maa", "Mab", "Maa", "Mab"),
  new_weight = c(4567, 3367, 4500, 2455, 2872)
)

# load the raw text, one element per line
dat <- readLines("test.txt")

# wrap the lines in a one-column tibble
dat2 <- tibble::tibble(X = dat)

# number the rows, then keep only house-ID lines and individual-ID lines
dat3 <- dat2 %>%
  rowid_to_column() %>%
  filter(grepl(pattern = "H[0-9]+_0|^Ma._[0-9]+", X))
dat3
#> # A tibble: 9 × 2
#>   rowid X                                 
#>   <int> <chr>                             
#> 1     1 H18105265_0                       
#> 2     3 Mab_3416311514210525745_W923650.80
#> 3     8 H18117631_0                       
#> 4    10 Maa_1240111711220682016_W123650.80
#> 5    13 H18121405_0                       
#> 6    15 Mab_2467211713110643835_W923650.80
#> 7    19 H71811763_0                       
#> 8    21 Maa_5325411210120486554_W923650.80
#> 9    22 Mab_5325411210110485554_W723650.80


# flag the house lines and number the houses consecutively, so that every
# individual line carries the running number of the house above it
dat4 <- dat3 %>%
  mutate(
    house1 = grepl(pattern = "H[0-9]+_0", X),
    house2 = cumsum(house1)
  )
dat4
#> # A tibble: 9 × 4
#>   rowid X                                  house1 house2
#>   <int> <chr>                              <lgl>   <int>
#> 1     1 H18105265_0                        TRUE        1
#> 2     3 Mab_3416311514210525745_W923650.80 FALSE       1
#> 3     8 H18117631_0                        TRUE        2
#> 4    10 Maa_1240111711220682016_W123650.80 FALSE       2
#> 5    13 H18121405_0                        TRUE        3
#> 6    15 Mab_2467211713110643835_W923650.80 FALSE       3
#> 7    19 H71811763_0                        TRUE        4
#> 8    21 Maa_5325411210120486554_W923650.80 FALSE       4
#> 9    22 Mab_5325411210110485554_W723650.80 FALSE       4


# lookup table: running house number -> raw house-ID line
dat4b <- dat4 %>%
  filter(house1) %>%
  transmute(house_id = X, house2)
dat4b
#> # A tibble: 4 × 2
#>   house_id    house2
#>   <chr>        <int>
#> 1 H18105265_0      1
#> 2 H18117631_0      2
#> 3 H18121405_0      3
#> 4 H71811763_0      4


# put each individual line next to the numeric ID of its house
dat5 <- dat4 %>%
  filter(grepl(pattern = "^Ma._[0-9]+", X)) %>%
  left_join(dat4b, by = "house2") %>%
  transmute(
    rowid,
    house_id = as.numeric(gsub("^H|_0", "", house_id)),
    prefix = gsub(pattern = "_.+", replacement = "", x = X),
    X
  )
dat5
#> # A tibble: 5 × 4
#>   rowid house_id prefix X                                 
#>   <int>    <dbl> <chr>  <chr>                             
#> 1     3 18105265 Mab    Mab_3416311514210525745_W923650.80
#> 2    10 18117631 Maa    Maa_1240111711220682016_W123650.80
#> 3    15 18121405 Mab    Mab_2467211713110643835_W923650.80
#> 4    21 71811763 Maa    Maa_5325411210120486554_W923650.80
#> 5    22 71811763 Mab    Mab_5325411210110485554_W723650.80


# attach the new weights from df to every individual line
dat6 <- dat5 %>%
  left_join(df, by = c(house_id = "House_id", prefix = "id"))
dat6
#> # A tibble: 5 × 5
#>   rowid house_id prefix X                                  new_weight
#>   <int>    <dbl> <chr>  <chr>                                   <dbl>
#> 1     3 18105265 Mab    Mab_3416311514210525745_W923650.80       4567
#> 2    10 18117631 Maa    Maa_1240111711220682016_W123650.80       3367
#> 3    15 18121405 Mab    Mab_2467211713110643835_W923650.80       4500
#> 4    21 71811763 Maa    Maa_5325411210120486554_W923650.80       2455
#> 5    22 71811763 Mab    Mab_5325411210110485554_W723650.80       2872


# generate the new ids
dat7 <- dat6 %>%
  mutate(
    # everything in front of the "W", i.e. the line without its old weight
    Y = gsub(pattern = "(?=W).+", replacement = "", x = X, perl = TRUE),
    # individuals without an entry in df keep their original line instead of
    # being rewritten to "...WNA"
    X_new = if_else(is.na(new_weight), X, paste0(Y, "W", new_weight))
  ) %>%
  select(rowid, X_new)
dat7
#> # A tibble: 5 × 2
#>   rowid X_new                        
#>   <int> <chr>                        
#> 1     3 Mab_3416311514210525745_W4567
#> 2    10 Maa_1240111711220682016_W3367
#> 3    15 Mab_2467211713110643835_W4500
#> 4    21 Maa_5325411210120486554_W2455
#> 5    22 Mab_5325411210110485554_W2872


# overwrite the affected lines of the raw text with their updated versions
dat <- replace(dat, dat7$rowid, dat7$X_new)
dat
#>  [1] "H18105265_0"                                                                           
#>  [2] "R1_0"                                                                                  
#>  [3] "Mab_3416311514210525745_W4567"                                                         
#>  [4] "T1_0"                                                                                  
#>  [5] "T2_0"                                                                                  
#>  [6] "T3_0"                                                                                  
#>  [7] "V64_0_2_010_ab171900171959"                                                            
#>  [8] "H18117631_0"                                                                           
#>  [9] "R1_0"                                                                                  
#> [10] "Maa_1240111711220682016_W3367"                                                         
#> [11] "T1_0"                                                                                  
#> [12] "V74_0_1_010_aa081200081259_aa081600081859_aa082100095659_aa095700101159_aa101300105059"
#> [13] "H18121405_0"                                                                           
#> [14] "R1_0"                                                                                  
#> [15] "Mab_2467211713110643835_W4500"                                                         
#> [16] "T1_0"                                                                                  
#> [17] "T2_0"                                                                                  
#> [18] "V62_0_1_010_090500092459_100500101059_101100101659_140700140859_141100141359"          
#> [19] "H71811763_0"                                                                           
#> [20] "R1_0"                                                                                  
#> [21] "Maa_5325411210120486554_W2455"                                                         
#> [22] "Mab_5325411210110485554_W2872"                                                         
#> [23] "T1_0"                                                                                  
#> [24] "T2_0"                                                                                  
#> [25] "T3_0"                                                                                  
#> [26] "T4_0"


# write back the updated data
# writeLines(...)

您可以尝试以下基础 R(base R)代码

# Read the file as a single string, cut it into one chunk per house, swap the
# weights inside every chunk, and print the re-assembled text.
writeLines(
  paste(
    unlist(
      lapply(
        unlist(
          strsplit(
            readChar("test.txt", file.info("test.txt")$size),
            # A chunk boundary is the newline between a line ending in a digit
            # and the next line starting with "H". Backslashes are doubled
            # because "\d" is not a valid escape in an R string literal.
            "(?<=\\d)\\n(?=H)",
            perl = TRUE
          )
        ),
        function(x) {
          with(
            df,
            Reduce(
              # Replace one old "W..." string by its new counterpart.
              function(x, ps) sub(ps[[1]], ps[[2]], x),
              asplit(rbind(
                # Row 1: the weight strings currently present in this chunk.
                unlist(regmatches(x, gregexpr("W.*(?=\\n)", x, perl = TRUE))),
                # Row 2: the new weights of the df rows whose house and id
                # occur in this chunk. The two rows are paired by position,
                # so df must list a house's individuals in file order.
                paste0(
                  "W",
                  new_weight[vapply(
                    sprintf("H%s.*%s_\\d+_W", House_id, id),
                    grepl,
                    logical(1),
                    x
                  )]
                )
              ), 2),
              init = x
            )
          )
        }
      )
    ),
    # strsplit() consumed the newline between chunks, so put it back here.
    collapse = "\n"
  )
)

这给出了

H18105265_0
R1_0
Mab_3416311514210525745_W4567
T1_0
T2_0
T3_0
V64_0_2_010_ab171900171959
H18117631_0
R1_0
Maa_1240111711220682016_W3367
T1_0
V74_0_1_010_aa081200081259_aa081600081859_aa082100095659_aa095700101159_aa101300105059
H18121405_0
R1_0
Mab_2467211713110643835_W4500
T1_0
T2_0
V62_0_1_010_090500092459_100500101059_101100101659_140700140859_141100141359
H71811763_0
R1_0
Maa_5325411210120486554_W2455
Mab_5325411210110485554_W2872
T1_0
T2_0
T3_0
T4_0

分解代码

  • 我们先用下面的代码把长字符串分成更小的块
      # One chunk per house: split at the newline that sits between a line
      # ending in a digit and the next line starting with "H". Backslashes
      # are doubled because "\d" is not a valid escape in an R string.
      unlist(
        strsplit(
          readChar("test.txt", file.info("test.txt")$size),
          "(?<=\\d)\\n(?=H)",
          perl = TRUE
        )
      )
  • 对于每个块中的子字符串,我们找到匹配的House_id + id,并将权重部分,例如Wxxxxxx替换为对应的new_weight
        # `x` is one chunk (all lines of a single house) as one string.
        with(
          df,
          Reduce(
            # Replace one old "W..." string by its new counterpart.
            function(x, ps) sub(ps[[1]], ps[[2]], x),
            asplit(
              rbind(
              # Row 1: the weight strings currently present in this chunk.
              unlist(regmatches(x, gregexpr("W.*(?=\\n)", x, perl = TRUE))),
              # Row 2: the new weights of the df rows whose house and id
              # occur in this chunk (paired with row 1 by position).
              paste0(
                "W",
                new_weight[vapply(
                  sprintf("H%s.*%s_\\d+_W", House_id, id),
                  grepl,
                  logical(1),
                  x
                )]
              )
            ), 2),
            init = x
          )
        )

注意最后一个块有两个不同的匹配id,我们使用Reduce迭代替换权重

这是一个 dplyr 解决方案:它使用了 left_join(),但除此之外完全依赖向量化操作;对于大型数据集,向量化操作比循环高效得多。

虽然代码可能显得冗长,但这只是一种格式选择:为了清楚起见,我使用

foo(
  arg_1 = bar,
  arg_2 = baz,
  # ...
  arg_n = qux
) 

而不是单行的 foo(bar, baz, qux)。同样为了清楚起见,我将在下文的“详情”部分详细解释下面这一行:

    # Map each row to its house ID.
    House_id = data[row_number()[target][cumsum(target)]],

(详见下文“详情”部分。)

解决方案

给定一个文件,如 subset.txt 转载于此

H18105265_0
R1_0
Mab_3416311514210525745_W923650.80
T1_0
T2_0
T3_0
V64_0_2_010_ab171900171959
H18117631_0
R1_0
Maa_1240111711220682016_W123650.80
T1_0
V74_0_1_010_aa081200081259_aa081600081859_aa082100095659_aa095700101159_aa101300105059
H18121405_0
R1_0
Mab_2467211713110643835_W923650.80
T1_0
T2_0
V62_0_1_010_090500092459_100500101059_101100101659_140700140859_141100141359
H71811763_0
R1_0
Maa_5325411210120486554_W923650.80
Mab_5325411210110485554_W723650.80
T1_0
T2_0
T3_0
T4_0

和此处转载的 df 等参考数据集

# Reference table: one row per (house, individual) pair with its new weight.
df <- tibble::tibble(
  House_id   = c(18105265, 18117631, 18121405, 71811763, 71811763),
  id         = c("Mab", "Maa", "Mab", "Maa", "Mab"),
  new_weight = c(4567, 3367, 4500, 2455, 2872)
)

以下解决方案

# For manipulating data.
library(dplyr)


# ...
# Code to generate your reference 'df'.
# ...



# Specify the filepath.
text_filepath <- "subset.txt"

# Define the textual pattern for each data item we want, where the relevant
# values are divided into their own capture groups. Backslashes are doubled
# because an R string literal needs "\\d" to hand the regex engine a "\d".
regex_house_id <- "(H)(\\d+)(_)(\\d)"
regex_weighted_label <- "(M[a-z]{2,})(_)(\\d+)(_W)(\\d+(\\.\\d+)?)"



# Read the textual data (into a dataframe).
data.frame(data = readLines(text_filepath)) %>%

  # Transform the textual data.
  mutate(
    # Target (TRUE) the identifying row (house ID) for each (contiguous) group.
    target = grepl(
      # Use the textual pattern for house IDs.
      pattern = regex_house_id,
      x = data
    ),

    # Map each row to its house ID.
    House_id = data[row_number()[target][cumsum(target)]],

    # Extract the underlying numeric ID from the house ID.
    House_id = gsub(
      pattern = regex_house_id,
      # The numeric ID is in the 2nd capture group.
      replacement = "\\2",
      x = House_id
    ),

    # Treat the numeric ID as a number.
    House_id = as.numeric(House_id),



    # Target (TRUE) the weighted labels.
    target = grepl(
      # Use the textual pattern for weighted labels.
      pattern = regex_weighted_label,
      x = data
    ),

    # Extract the ID from (only) the weighted labels.
    id = if_else(
      target,
      gsub(
        pattern = regex_weighted_label,
        # The ID is in the 1st capture group.
        replacement = "\\1",
        x = data
      ),
      # For any data that is NOT a weighted label, give it a blank (NA) ID.
      NA_character_
    ),

    # Extract from (only) the weighted labels everything else but the weight.
    rest = if_else(
      target,
      gsub(
        pattern = regex_weighted_label,
        # Everything is in the 2nd, 3rd, and 4th capture groups; ignoring the ID
        # (1st) and the weight (5th).
        replacement = "\\2\\3\\4",
        x = data
      ),
      # For any data that is NOT a weighted label, make it blank (NA) for
      # everything else.
      NA_character_
    )
  ) %>%

  # Link (JOIN) each weighted label to its new weight; with blanks (NAs) for
  # nonmatches.
  left_join(df, by = c("House_id", "id")) %>%

  # Replace (only) the weighted labels, with their updated values.
  mutate(
    data = if_else(
      # A weighted label without an entry in 'df' has no new weight (NA); it
      # is left alone rather than being rewritten to "..._WNA".
      target & !is.na(new_weight),
      # Generate the updated value by splicing together the original components
      # with the new weight.
      paste0(id, rest, new_weight),
      # For data that is NOT a weighted label, leave it unchanged.
      data
    )
  ) %>%

  # Extract the column of updated values.
  .$data %>%

  # Overwrite the original text with the updated values.
  writeLines(con = text_filepath)

将转换您的文本数据并更新原始文件。

结果

原始文件(此处subset.txt)现在将包含更新信息:

H18105265_0
R1_0
Mab_3416311514210525745_W4567
T1_0
T2_0
T3_0
V64_0_2_010_ab171900171959
H18117631_0
R1_0
Maa_1240111711220682016_W3367
T1_0
V74_0_1_010_aa081200081259_aa081600081859_aa082100095659_aa095700101159_aa101300105059
H18121405_0
R1_0
Mab_2467211713110643835_W4500
T1_0
T2_0
V62_0_1_010_090500092459_100500101059_101100101659_140700140859_141100141359
H71811763_0
R1_0
Maa_5325411210120486554_W2455
Mab_5325411210110485554_W2872
T1_0
T2_0
T3_0
T4_0

详情

正则表达式

文本操作仅依赖正则表达式的基本功能:grepl()(用于识别匹配)和 gsub()(用于提取组成部分)。我们把文本模式 regex_house_id 和 regex_weighted_label 各自拆分为若干组成部分,并用捕获组加以区分:

#      The "H" prefix.      The "_" separator.
#                  | |      | |
regex_house_id <- "(H)(\\d+)(_)(\\d)"
#                     |    |   |   |
#  The digits following "H".   The "0" suffix (or any digit).
#                                The digits after the 'id'.
#   The 'id': "M" then 2 small letters.   |    |    The weight (possibly a decimal).
#                          |          |   |    |    |              |
regex_weighted_label <-   "(M[a-z]{2,})(_)(\\d+)(_W)(\\d+(\\.\\d+)?)"
#                                      | |      |  |
#                       The "_" separator.      The "_" separator and "W" prefix before weight.

我们可以使用 grepl(pattern = regex_weighted_label, x = my_strings) 检查向量 my_strings 中的哪些字符串匹配加权标签的格式(如 "Mab_3416311514210525745_W923650.80")。

我们还可以使用 gsub(pattern = regex_weighted_label, replacement = "\\5", my_labels) 从该格式的标签向量 my_labels 中提取权重(第 5 个捕获组)。

映射

在第一个 mutate() 语句中找到,行

    # Map each row to its house ID.
    House_id = data[row_number()[target][cumsum(target)]],

可能看起来很神秘。但它只是一个经典的算术技巧(@mnist 在其回答中也用到了),用来把连续的值按组编号。

代码 cumsum(target) 扫描 target 列,该列(在工作流的这一步)是逻辑值(TRUE FALSE FALSE ...),指示某一文本行是(TRUE)否(FALSE)为房屋 ID(如 "H18105265_0")。遇到 TRUE(数值为 1)时,累计总数加一;遇到 FALSE(数值为 0)时,总数保持不变。

由于文本 data

# |-------------- Group 1 ---------------| |----------- Group 2 ------------| |------------ ...
  "H18105265_0" "R1_0" ...                 "H18117631_0" "R1_0" ...           "H18121405_0" ...

给出了逻辑向量 target

# |-------------- Group 1 ---------------| |----------- Group 2 ------------| |--------- ...
  TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE ...

这些值(TRUE 和 FALSE)被强制转换为数字(1 和 0)

# |-------------- Group 1 ---------------| |----------- Group 2 ------------| |--------- ...
  1    0     0     0     0     0     0     1    0     0     0     0     0     1    0     ...

由此得到 cumsum() 的结果

# |-------------- Group 1 ---------------| |----------- Group 2 ------------| |--------- ...
  1    1     1     1     1     1     1     2    2     2     2     2     2     3    3     ...  

注意,现在我们已经将每一行映射到了它的“组号”。关于 cumsum(target) 就说这么多。

接下来看 row_number()[target]!实际上,row_number() 只是给每个位置(行)编上“索引”

# |-------------- Group 1 ---------------| |----------- Group 2 ------------| |--------- ...
  1             2      ...                 8             9      ...           13         ...

(即 data 列或任何其他列中的位置):

# |-------------- Group 1 ---------------| |----------- Group 2 ------------| |------------ ...
  "H18105265_0" "R1_0" ...                 "H18117631_0" "R1_0" ...           "H18121405_0" ...

所以,用 target 对这些索引取下标
# |-------------- Group 1 ---------------| |----------- Group 2 ------------| |--------- ...
  TRUE           FALSE ...                  TRUE          FALSE ...           TRUE       ...

仅选择具有房屋 ID 的位置:

# |-------------- Group 1 ---------------| |----------- Group 2 ------------| |--------- ...
  1                                         8                                 13         ...

因此,如果我们取 row_number()[target] 的结果
# House ID: 1st 2nd 3rd ...
# Position:
            1   8   13  ... 

并用 cumsum(target) 对其取下标

# |-------------- Group 1 ---------------| |----------- Group 2 ------------| |--------- ...
  1    1     1     1     1     1     1     2    2     2     2     2     2     3    3     ...

我们将每一行映射到其房屋 ID 的位置(在 data 中):

# |-------------- Group 1 ---------------| |----------- Group 2 ------------| |--------- ...
  1    1     1     1     1     1     1     8    8     8     8     8     8     13   13    ...

这是 row_number()[target][cumsum(target)] 的结果。

最后,当我们用它的房屋 ID 的这些(重复的)位置下标 data 时,我们得到 House_id

# |----------------- Group 1 -----------------| |----------------- Group 2 -----------------| |-------------------------- ...
  "H18105265_0" "H18105265_0" ... "H18105265_0" "H18117631_0" "H18117631_0" ... "H18117631_0" "H18121405_0" "H18121405_0" ...

其中 data 中的每个值都映射到其组的房屋 ID。

借助这个 House_id 列

House_id = data[row_number()[target][cumsum(target)]]

(它与我们的 data 列并排),我们可以将 df 中的 id 映射(left_join())到它们对应的文本 data 上。