在 R 中创建新列 - 从其他列中提取常规字符

Create New Column in R - Extract regular characters from other column

我在这个帖子中有一个与原始发帖人非常相似的任务: Create new column in dataframe based on partial string matching other column

但是在 TEST 下有 10 个不同的条件。原始线程中有一个建议如何针对 >3 条件进行编码,但我无法理解如何将其应用于我的数据。

我想创建一个名为 DISTANCE 的列,用于从测试中提取距离。因此,对于名称中包含“0.10m”的任何测试,我希望能够在距离列中包含“0-10m”。如果名称中为“0.20m”,我希望它在 DISTANCE 列中为“0-20m”,依此类推。

PLAYER      SEX     TEST        VALUE             
Player 1    Female    ICE_0.10m    2.100000
Player 1    Female    ICE_0.20m    3.475000
Player 1    Female    ICE_10.20m    1.375000
Player 1    Female    ICE_20.30m    1.246000
Player 1    Female    ICE_0.30m    4.721000
Player 1    Female    ICE_Vel_0.10m    4.761905
Player 1    Female    ICE_Vel_0.20m    5.755396
Player 1    Female    ICE_Vel_10.20m    7.272727
Player 1    Female    ICE_Vel_20.30m    8.025682
Player 1    Female    ICE_Vel_0.30m    6.354586
Player 1    Female    OFF_0.10m    1.983000
Player 1    Female    OFF_0.20m    3.380000
Player 1    Female    OFF_10.20m    1.397000
Player 1    Female    OFF_20.30m    1.380000
Player 1    Female    OFF_0.30m    4.760000
Player 1    Female    OFF_Vel_0.10m    5.042864
Player 1    Female    OFF_Vel_0.20m    5.917160
Player 1    Female    OFF_Vel_10.20m    7.158196
Player 1    Female    OFF_Vel_20.30m    7.246377
Player 1    Female    OFF_Vel_0.30m    6.302521

我试过了,但没用:

# Attempted nested ifelse() chain that maps a pattern found in SpeedLong$Tag
# to a distance label, with "20-30m" as the final fallback.
# NOTE(review): four `ifelse(` calls are opened but only two are closed at the
# end of the last line, so the expression is incomplete and the console keeps
# showing the `+` continuation prompt.
# NOTE(review): the sample table names the column TEST while this code reads
# `Tag` -- confirm which column SpeedLong actually has.
SpeedLong$Distance <- ifelse(grepl("0.10m", SpeedLong$Tag, ignore.case = T), "0-10m",
# NOTE(review): grepl() does an unanchored regex match and `.` matches any
# character, so "0.20m" also matches values such as "ICE_10.20m".
ifelse(grepl("0.20m", SpeedLong$Tag, ignore.case = T), "0-20m",
ifelse(grepl("0.30m", SpeedLong$Tag, ignore.case = T), "0-30m",
# NOTE(review): this repeats the "0.10m" test already handled by the first
# branch, so it can never be the branch that decides the result.
ifelse(grepl("0.10m", SpeedLong$Tag, ignore.case = T), "0-10m", "20-30m"))

使用该代码时我没有收到错误消息,但控制台中显示的代码以 + 号结尾,我猜这意味着代码不完整?我不确定 ifelse 和 grepl 是否是解决此问题的最佳方法,因此欢迎提出其他建议!

与其嵌套 ifelse,更好的选择是用正则表达式提取匹配的子字符串,并把 . 替换为 -。这里我们先匹配直到 _ 为止的任意字符 (.*_),再把第一组数字 ([0-9]+) 捕获为一个分组 ((...)),然后是一个点 (\. —— 点是匹配任意字符的元字符,因此用 \ 转义以取其字面值;在 R 的字符串中反斜杠本身还需要再写一次,即 "\\."),接着在另一个捕获组中捕获第二组数字;replacement 部分则使用这两个捕获组的反向引用 (\1-\2,在 R 字符串中写作 "\\1-\\2")。

library(dplyr)
library(stringr)
# Derive DISTANCE from TEST: strip everything up to the "_" that precedes the
# first number, and turn the "." between the two numbers into "-"
# (e.g. "ICE_Vel_10.20m" -> "10-20m"; the trailing "m" is left untouched).
# Backslashes are doubled because the R string literal consumes one level of
# escaping before the regex engine sees it: "\\." is the regex \. (a literal
# dot) and "\\1" / "\\2" are the backreferences \1 / \2.
df1 %>% 
    mutate(DISTANCE = str_replace(TEST, ".*_([0-9]+)\\.([0-9]+)", "\\1-\\2"))
#     PLAYER    SEX           TEST    VALUE DISTANCE
#1  Player 1 Female      ICE_0.10m 2.100000    0-10m
#2  Player 1 Female      ICE_0.20m 3.475000    0-20m
#3  Player 1 Female     ICE_10.20m 1.375000   10-20m
#4  Player 1 Female     ICE_20.30m 1.246000   20-30m
#5  Player 1 Female      ICE_0.30m 4.721000    0-30m
#6  Player 1 Female  ICE_Vel_0.10m 4.761905    0-10m
#7  Player 1 Female  ICE_Vel_0.20m 5.755396    0-20m
#8  Player 1 Female ICE_Vel_10.20m 7.272727   10-20m
#9  Player 1 Female ICE_Vel_20.30m 8.025682   20-30m
#10 Player 1 Female  ICE_Vel_0.30m 6.354586    0-30m
#11 Player 1 Female      OFF_0.10m 1.983000    0-10m
#12 Player 1 Female      OFF_0.20m 3.380000    0-20m
#13 Player 1 Female     OFF_10.20m 1.397000   10-20m
#14 Player 1 Female     OFF_20.30m 1.380000   20-30m
#15 Player 1 Female      OFF_0.30m 4.760000    0-30m
#16 Player 1 Female  OFF_Vel_0.10m 5.042864    0-10m
#17 Player 1 Female  OFF_Vel_0.20m 5.917160    0-20m
#18 Player 1 Female OFF_Vel_10.20m 7.158196   10-20m
#19 Player 1 Female OFF_Vel_20.30m 7.246377   20-30m
#20 Player 1 Female  OFF_Vel_0.30m 6.302521    0-30m

或使用base R

# Same transformation with base R only. The backslashes must be doubled inside
# an R string literal: "\\." is a literal dot for the regex engine, and
# "\\1-\\2" refers to the two captured groups of digits.
df1$DISTANCE <- sub(".*_([0-9]+)\\.([0-9]+)", "\\1-\\2", df1$TEST)

数据

# Example data: one player, 4 test families x 5 distance splits = 20 rows.
# TEST names are the family prefix followed by the distance split, in the
# same order as the table shown in the question.
test_prefix <- c("ICE_", "ICE_Vel_", "OFF_", "OFF_Vel_")
split_label <- c("0.10m", "0.20m", "10.20m", "20.30m", "0.30m")

df1 <- data.frame(
  PLAYER = rep("Player 1", 20L),
  SEX = rep("Female", 20L),
  TEST = paste0(
    rep(test_prefix, each = length(split_label)),
    rep(split_label, times = length(test_prefix))
  ),
  VALUE = c(
    2.1, 3.475, 1.375, 1.246, 4.721,
    4.761905, 5.755396, 7.272727, 8.025682, 6.354586,
    1.983, 3.38, 1.397, 1.38, 4.76,
    5.042864, 5.91716, 7.158196, 7.246377, 6.302521
  ),
  stringsAsFactors = FALSE
)