如何在循环中按条件选取字符串中的某一部分 [r]
How to select string pattern with conditions in loop [r]
我想从 R 数据框的某些行里选取字符串中的一部分,需要一些帮助。下面我模拟了一份虚拟数据 (floyd) 来说明。
数据框的第一行每列只有一个单词(它其实是数字,但我把所有数字都当作字符/单词处理),而第 2 到 4 行的每个单元格里有多个单词。我想根据命名向量 cool_floyd_position 中给出的位置,选取每行各单元格中对应位置的那个数字。
# NB: this attempt now uses base R only; stringr is no longer required for it.
# some scenario data
# stringsAsFactors = FALSE keeps the cells as character on R < 4.0 as well, so
# they can be overwritten with words that are not existing factor levels.
floyd <- data.frame(
  people = c("roger", "david", "rick", "nick"),
  spec1 = c("1", "3 5 75 101", "3 65 85", "12 2"),
  spec2 = c("45", "75 101 85 12", "45 65 8", "45 87"),
  spec3 = c("1", "3 5 75 101", "75 98 5", "65 32"),
  stringsAsFactors = FALSE
)
# tweak my data: use the names as row names and drop the column
rownames(floyd) <- floyd$people
floyd$people <- NULL
# ppl of interest
cool_floyd <- rownames(floyd)[2:4]
# ppl string position criteria (word index to keep, named by person)
cool_floyd_position <- c(2, 3, 1)
names(cool_floyd_position) <- c("david", "rick", "nick")
# my solution attempt (fixed)
# For each person of interest, split every cell of that person's row on spaces
# and keep only the word at that person's position. The position is looked up
# by name, so it does not depend on the two vectors being in the same order.
for (person in cool_floyd) {
  word_pos <- cool_floyd_position[[person]]
  # the row as a plain character vector, one element per column
  cells <- unlist(floyd[person, ], use.names = FALSE)
  words <- strsplit(cells, " ", fixed = TRUE)
  # note the comma in floyd[person, ]: without it the index selects columns
  floyd[person, ] <- vapply(words, function(w) w[word_pos], character(1))
}
我希望 floyd 数据框最终如下所示:david 所在行的每一列都选取第二个词,rick 所在行的每一列都选取第三个词,nick 所在行的每一列都选取第一个词(roger 所在的行必须保持原样)。
# Desired result: people as row names, each cell reduced to the selected word.
my_target_df <- data.frame(
  spec1 = c("1", "5", "85", "12"),
  spec2 = c("45", "101", "8", "45"),
  spec3 = c("1", "5", "5", "65"),
  row.names = c("roger", "david", "rick", "nick")
)
非常感谢!
您可以组合使用 sapply 来遍历数据框的各列,并用 mapply 从每列中提取第 n 个单词(word),即:
library(stringr)
# NOTE(review): `df` is not defined in this snippet; presumably it is the
# original data frame with `people` still as its first column - confirm.
# Row 1 (minus the first column) is kept unchanged. For the remaining rows,
# each column is passed to mapply(), which calls word() on every cell with the
# matching element of `cool_floyd_position` as the word index.
df1 <- rbind(df[1,-1], sapply(df[-1,-1], function(i) mapply(word, i, cool_floyd_position)))
# label the rows with the values of the `people` column
rownames(df1) <- df$people
df1
# spec1 spec2 spec3
#roger 1 45 1
#david 5 101 5
#rick 85 8 5
#nick 12 45 65
此解决方案的唯一缺点是 people 显示为行名,而不是单独的一列。有很多方法可以把它变成一列,例如:
# Turn the row names into a regular `people` column and show it first.
df1$people <- rownames(df1)
rownames(df1) <- NULL
# `1:ncol(df1)-1` parses as `(1:ncol(df1)) - 1`, i.e. it starts at 0, and only
# gave the right columns because a 0 index is silently dropped. Spell out the
# intended range instead: last column first, then columns 1..(n - 1).
df1[c(ncol(df1), seq_len(ncol(df1) - 1))]
# people spec1 spec2 spec3
#1 roger 1 45 1
#2 david 5 101 5
#3 rick 85 8 5
#4 nick 12 45 65
这是另一个使用 mapply 的方案:
library(stringr)
# make sure every column is character rather than factor
floyd[] <- lapply(floyd, as.character)
# after transposing, each person named in 'c1' becomes one column
per_person <- as.data.frame(t(floyd)[, names(c1)], stringsAsFactors = FALSE)
# for each of those columns, take the word at the position given by the
# matching element of 'c1'
picked <- mapply(word, per_person, c1)
# transpose back so the people are rows again and overwrite their rows
floyd[names(c1), ] <- t(picked)
floyd
# spec1 spec2 spec3
#roger 1 45 1
#david 5 101 5
#rick 85 8 5
#nick 12 45 65
其中
# short alias for the position vector; the mapply snippet above reads `c1`, so
# this assignment has to be run before that snippet
c1 <- cool_floyd_position #just to avoid typing
Tidyverse 解决方案:
library(stringi) # you have this installed if you have stringr
library(tidyverse)
# Select one space-separated word from every element of `x`.
#
# who: a single name to look up in `lkp`.
# x:   character vector whose elements are space-separated words.
# lkp: named vector mapping a name to the word position to keep.
#
# Returns `x` unchanged when `who` is not a name of `lkp`; otherwise a
# character vector holding, for each element of `x`, the word at position
# `lkp[[who]]`.
pick_pos <- function(who, x, lkp) {
  if (!(who %in% names(lkp))) {
    return(x)
  }
  word_pos <- lkp[[who]]
  map_chr(x, function(cell) stri_split_fixed(cell, " ")[[1]][word_pos])
}
# Move the row names into a `people` column and coerce every column to
# character (needed when the columns are factors). Grouping by `people` makes
# `people` a single value inside each group, which pick_pos() looks up in
# `cool_floyd_position`. funs() is deprecated, so across() with a function or
# lambda is used instead; across() leaves the grouping column untouched.
rownames_to_column(floyd, "people") %>%
  mutate(across(everything(), as.character)) %>%
  group_by(people) %>%
  mutate(
    across(everything(), ~ pick_pos(people, .x, cool_floyd_position))
  ) %>%
  ungroup() %>%
  data.frame() %>%
  column_to_rownames("people")
我想从 R 数据框的某些行里选取字符串中的一部分,需要一些帮助。下面我模拟了一份虚拟数据 (floyd) 来说明。
数据框的第一行每列只有一个单词(它其实是数字,但我把所有数字都当作字符/单词处理),而第 2 到 4 行的每个单元格里有多个单词。我想根据命名向量 cool_floyd_position 中给出的位置,选取每行各单元格中对应位置的那个数字。
# NB: this attempt now uses base R only; stringr is no longer required for it.
# some scenario data
# stringsAsFactors = FALSE keeps the cells as character on R < 4.0 as well, so
# they can be overwritten with words that are not existing factor levels.
floyd <- data.frame(
  people = c("roger", "david", "rick", "nick"),
  spec1 = c("1", "3 5 75 101", "3 65 85", "12 2"),
  spec2 = c("45", "75 101 85 12", "45 65 8", "45 87"),
  spec3 = c("1", "3 5 75 101", "75 98 5", "65 32"),
  stringsAsFactors = FALSE
)
# tweak my data: use the names as row names and drop the column
rownames(floyd) <- floyd$people
floyd$people <- NULL
# ppl of interest
cool_floyd <- rownames(floyd)[2:4]
# ppl string position criteria (word index to keep, named by person)
cool_floyd_position <- c(2, 3, 1)
names(cool_floyd_position) <- c("david", "rick", "nick")
# my solution attempt (fixed)
# For each person of interest, split every cell of that person's row on spaces
# and keep only the word at that person's position. The position is looked up
# by name, so it does not depend on the two vectors being in the same order.
for (person in cool_floyd) {
  word_pos <- cool_floyd_position[[person]]
  # the row as a plain character vector, one element per column
  cells <- unlist(floyd[person, ], use.names = FALSE)
  words <- strsplit(cells, " ", fixed = TRUE)
  # note the comma in floyd[person, ]: without it the index selects columns
  floyd[person, ] <- vapply(words, function(w) w[word_pos], character(1))
}
我希望 floyd 数据框最终如下所示:david 所在行的每一列都选取第二个词,rick 所在行的每一列都选取第三个词,nick 所在行的每一列都选取第一个词(roger 所在的行必须保持原样)。
# Desired result: people as row names, each cell reduced to the selected word.
my_target_df <- data.frame(
  spec1 = c("1", "5", "85", "12"),
  spec2 = c("45", "101", "8", "45"),
  spec3 = c("1", "5", "5", "65"),
  row.names = c("roger", "david", "rick", "nick")
)
非常感谢!
您可以组合使用 sapply 来遍历数据框的各列,并用 mapply 从每列中提取第 n 个单词(word),即:
library(stringr)
# NOTE(review): `df` is not defined in this snippet; presumably it is the
# original data frame with `people` still as its first column - confirm.
# Row 1 (minus the first column) is kept unchanged. For the remaining rows,
# each column is passed to mapply(), which calls word() on every cell with the
# matching element of `cool_floyd_position` as the word index.
df1 <- rbind(df[1,-1], sapply(df[-1,-1], function(i) mapply(word, i, cool_floyd_position)))
# label the rows with the values of the `people` column
rownames(df1) <- df$people
df1
# spec1 spec2 spec3
#roger 1 45 1
#david 5 101 5
#rick 85 8 5
#nick 12 45 65
此解决方案的唯一缺点是 people 显示为行名,而不是单独的一列。有很多方法可以把它变成一列,例如:
# Turn the row names into a regular `people` column and show it first.
df1$people <- rownames(df1)
rownames(df1) <- NULL
# `1:ncol(df1)-1` parses as `(1:ncol(df1)) - 1`, i.e. it starts at 0, and only
# gave the right columns because a 0 index is silently dropped. Spell out the
# intended range instead: last column first, then columns 1..(n - 1).
df1[c(ncol(df1), seq_len(ncol(df1) - 1))]
# people spec1 spec2 spec3
#1 roger 1 45 1
#2 david 5 101 5
#3 rick 85 8 5
#4 nick 12 45 65
这是另一个使用 mapply 的方案:
library(stringr)
# make sure every column is character rather than factor
floyd[] <- lapply(floyd, as.character)
# after transposing, each person named in 'c1' becomes one column
per_person <- as.data.frame(t(floyd)[, names(c1)], stringsAsFactors = FALSE)
# for each of those columns, take the word at the position given by the
# matching element of 'c1'
picked <- mapply(word, per_person, c1)
# transpose back so the people are rows again and overwrite their rows
floyd[names(c1), ] <- t(picked)
floyd
# spec1 spec2 spec3
#roger 1 45 1
#david 5 101 5
#rick 85 8 5
#nick 12 45 65
其中
# short alias for the position vector; the mapply snippet above reads `c1`, so
# this assignment has to be run before that snippet
c1 <- cool_floyd_position #just to avoid typing
Tidyverse 解决方案:
library(stringi) # you have this installed if you have stringr
library(tidyverse)
# Select one space-separated word from every element of `x`.
#
# who: a single name to look up in `lkp`.
# x:   character vector whose elements are space-separated words.
# lkp: named vector mapping a name to the word position to keep.
#
# Returns `x` unchanged when `who` is not a name of `lkp`; otherwise a
# character vector holding, for each element of `x`, the word at position
# `lkp[[who]]`.
pick_pos <- function(who, x, lkp) {
  if (!(who %in% names(lkp))) {
    return(x)
  }
  word_pos <- lkp[[who]]
  map_chr(x, function(cell) stri_split_fixed(cell, " ")[[1]][word_pos])
}
# Move the row names into a `people` column and coerce every column to
# character (needed when the columns are factors). Grouping by `people` makes
# `people` a single value inside each group, which pick_pos() looks up in
# `cool_floyd_position`. funs() is deprecated, so across() with a function or
# lambda is used instead; across() leaves the grouping column untouched.
rownames_to_column(floyd, "people") %>%
  mutate(across(everything(), as.character)) %>%
  group_by(people) %>%
  mutate(
    across(everything(), ~ pick_pos(people, .x, cool_floyd_position))
  ) %>%
  ungroup() %>%
  data.frame() %>%
  column_to_rownames("people")