在 R 中创建新列 - 从其他列中提取常规字符
Create New Column in R - Extract regular characters from other column
我在这个帖子中有一个与原始发帖人非常相似的任务:
Create new column in dataframe based on partial string matching other column
但是在 TEST 下有 10 个不同的条件。原始线程中有一个建议如何针对 >3 条件进行编码,但我无法理解如何将其应用于我的数据。
我想创建一个名为 DISTANCE 的列,用于从测试中提取距离。因此,对于名称中包含“0.10m”的任何测试,我希望能够在距离列中包含“0-10m”。如果名称中为“0.20m”,我希望它在 DISTANCE 列中为“0-20m”,依此类推。
PLAYER SEX TEST VALUE
Player 1 Female ICE_0.10m 2.100000
Player 1 Female ICE_0.20m 3.475000
Player 1 Female ICE_10.20m 1.375000
Player 1 Female ICE_20.30m 1.246000
Player 1 Female ICE_0.30m 4.721000
Player 1 Female ICE_Vel_0.10m 4.761905
Player 1 Female ICE_Vel_0.20m 5.755396
Player 1 Female ICE_Vel_10.20m 7.272727
Player 1 Female ICE_Vel_20.30m 8.025682
Player 1 Female ICE_Vel_0.30m 6.354586
Player 1 Female OFF_0.10m 1.983000
Player 1 Female OFF_0.20m 3.380000
Player 1 Female OFF_10.20m 1.397000
Player 1 Female OFF_20.30m 1.380000
Player 1 Female OFF_0.30m 4.760000
Player 1 Female OFF_Vel_0.10m 5.042864
Player 1 Female OFF_Vel_0.20m 5.917160
Player 1 Female OFF_Vel_10.20m 7.158196
Player 1 Female OFF_Vel_20.30m 7.246377
Player 1 Female OFF_Vel_0.30m 6.302521
我试过了,但没用:
# NOTE(review): four ifelse( calls are opened below but only two are closed on
# the last line, so the expression is incomplete and the console waits with "+".
# The fourth condition repeats the "0.10m" test from the first line, and because
# grepl() matches substrings, "0.20m" also matches tags such as "ICE_10.20m".
SpeedLong$Distance <- ifelse(grepl("0.10m", SpeedLong$Tag, ignore.case = T), "0-10m",
ifelse(grepl("0.20m", SpeedLong$Tag, ignore.case = T), "0-20m",
ifelse(grepl("0.30m", SpeedLong$Tag, ignore.case = T), "0-30m",
ifelse(grepl("0.10m", SpeedLong$Tag, ignore.case = T), "0-10m", "20-30m"))
使用该代码我没有收到错误消息,但控制台中显示的代码以 + 号结尾,我猜这意味着代码不完整?我不知道 ifelse 和 grepl 是否是解决此问题的最佳方法,因此欢迎提出其他建议!
与其嵌套 ifelse,更好的选择是用正则表达式提取匹配的子字符串,并将 . 更改为 - 。在这里,我们先匹配直到 _ 为止的字符 (.*),将第一组数字 ([0-9]+) 捕获为一组 ((...)),然后是点 (\. —— 点是匹配任何字符的元字符,因此我们用 \ 将其转义以获取字面值),接着在另一个捕获组中捕获第二组数字;replacement 中使用捕获组的反向引用(\1、\2)。注意:在 R 的字符串字面量中反斜杠本身需要再转义一次,因此代码里要写成 "\\." 和 "\\1-\\2"。
library(dplyr)
library(stringr)
# Build DISTANCE from TEST: strip the prefix through the underscore that
# precedes the digits, then join the two captured digit runs with "-".
# Backslashes are doubled because an R string literal consumes one level of
# escaping: "\\." is the regex \. (a literal dot) and "\\1"/"\\2" are the
# backreferences to the two capture groups.
df1 %>%
  mutate(DISTANCE = str_replace(TEST, ".*_([0-9]+)\\.([0-9]+)", "\\1-\\2"))
# PLAYER SEX TEST VALUE DISTANCE
#1 Player 1 Female ICE_0.10m 2.100000 0-10m
#2 Player 1 Female ICE_0.20m 3.475000 0-20m
#3 Player 1 Female ICE_10.20m 1.375000 10-20m
#4 Player 1 Female ICE_20.30m 1.246000 20-30m
#5 Player 1 Female ICE_0.30m 4.721000 0-30m
#6 Player 1 Female ICE_Vel_0.10m 4.761905 0-10m
#7 Player 1 Female ICE_Vel_0.20m 5.755396 0-20m
#8 Player 1 Female ICE_Vel_10.20m 7.272727 10-20m
#9 Player 1 Female ICE_Vel_20.30m 8.025682 20-30m
#10 Player 1 Female ICE_Vel_0.30m 6.354586 0-30m
#11 Player 1 Female OFF_0.10m 1.983000 0-10m
#12 Player 1 Female OFF_0.20m 3.380000 0-20m
#13 Player 1 Female OFF_10.20m 1.397000 10-20m
#14 Player 1 Female OFF_20.30m 1.380000 20-30m
#15 Player 1 Female OFF_0.30m 4.760000 0-30m
#16 Player 1 Female OFF_Vel_0.10m 5.042864 0-10m
#17 Player 1 Female OFF_Vel_0.20m 5.917160 0-20m
#18 Player 1 Female OFF_Vel_10.20m 7.158196 10-20m
#19 Player 1 Female OFF_Vel_20.30m 7.246377 20-30m
#20 Player 1 Female OFF_Vel_0.30m 6.302521 0-30m
或使用base R
# Base R equivalent: remove the prefix through the underscore that precedes the
# digits and join the two captured digit runs with "-". Backslashes are doubled
# because R string literals consume one level of escaping ("\\." is the regex
# \., and "\\1"/"\\2" are the capture-group backreferences).
df1$DISTANCE <- sub(".*_([0-9]+)\\.([0-9]+)", "\\1-\\2", df1$TEST)
数据
# Reproducible example data: 20 rows for a single player, one row per TEST
# label, with character columns and default (automatic) row names.
df1 <- data.frame(
  PLAYER = rep("Player 1", 20),
  SEX = rep("Female", 20),
  TEST = c(
    "ICE_0.10m", "ICE_0.20m", "ICE_10.20m", "ICE_20.30m", "ICE_0.30m",
    "ICE_Vel_0.10m", "ICE_Vel_0.20m", "ICE_Vel_10.20m", "ICE_Vel_20.30m",
    "ICE_Vel_0.30m",
    "OFF_0.10m", "OFF_0.20m", "OFF_10.20m", "OFF_20.30m", "OFF_0.30m",
    "OFF_Vel_0.10m", "OFF_Vel_0.20m", "OFF_Vel_10.20m", "OFF_Vel_20.30m",
    "OFF_Vel_0.30m"
  ),
  VALUE = c(
    2.1, 3.475, 1.375, 1.246, 4.721,
    4.761905, 5.755396, 7.272727, 8.025682, 6.354586,
    1.983, 3.38, 1.397, 1.38, 4.76,
    5.042864, 5.91716, 7.158196, 7.246377, 6.302521
  ),
  stringsAsFactors = FALSE
)
我在这个帖子中有一个与原始发帖人非常相似的任务: Create new column in dataframe based on partial string matching other column
但是在 TEST 下有 10 个不同的条件。原始线程中有一个建议如何针对 >3 条件进行编码,但我无法理解如何将其应用于我的数据。
我想创建一个名为 DISTANCE 的列,用于从测试中提取距离。因此,对于名称中包含“0.10m”的任何测试,我希望能够在距离列中包含“0-10m”。如果名称中为“0.20m”,我希望它在 DISTANCE 列中为“0-20m”,依此类推。
PLAYER SEX TEST VALUE
Player 1 Female ICE_0.10m 2.100000
Player 1 Female ICE_0.20m 3.475000
Player 1 Female ICE_10.20m 1.375000
Player 1 Female ICE_20.30m 1.246000
Player 1 Female ICE_0.30m 4.721000
Player 1 Female ICE_Vel_0.10m 4.761905
Player 1 Female ICE_Vel_0.20m 5.755396
Player 1 Female ICE_Vel_10.20m 7.272727
Player 1 Female ICE_Vel_20.30m 8.025682
Player 1 Female ICE_Vel_0.30m 6.354586
Player 1 Female OFF_0.10m 1.983000
Player 1 Female OFF_0.20m 3.380000
Player 1 Female OFF_10.20m 1.397000
Player 1 Female OFF_20.30m 1.380000
Player 1 Female OFF_0.30m 4.760000
Player 1 Female OFF_Vel_0.10m 5.042864
Player 1 Female OFF_Vel_0.20m 5.917160
Player 1 Female OFF_Vel_10.20m 7.158196
Player 1 Female OFF_Vel_20.30m 7.246377
Player 1 Female OFF_Vel_0.30m 6.302521
我试过了,但没用:
# NOTE(review): four ifelse( calls are opened below but only two are closed on
# the last line, so the expression is incomplete and the console waits with "+".
# The fourth condition repeats the "0.10m" test from the first line, and because
# grepl() matches substrings, "0.20m" also matches tags such as "ICE_10.20m".
SpeedLong$Distance <- ifelse(grepl("0.10m", SpeedLong$Tag, ignore.case = T), "0-10m",
ifelse(grepl("0.20m", SpeedLong$Tag, ignore.case = T), "0-20m",
ifelse(grepl("0.30m", SpeedLong$Tag, ignore.case = T), "0-30m",
ifelse(grepl("0.10m", SpeedLong$Tag, ignore.case = T), "0-10m", "20-30m"))
使用该代码我没有收到错误消息,但控制台中显示的代码以 + 号结尾,我猜这意味着代码不完整?我不知道 ifelse 和 grepl 是否是解决此问题的最佳方法,因此欢迎提出其他建议!
与其嵌套 ifelse,更好的选择是用正则表达式提取匹配的子字符串,并将 . 更改为 - 。在这里,我们先匹配直到 _ 为止的字符 (.*),将第一组数字 ([0-9]+) 捕获为一组 ((...)),然后是点 (\. —— 点是匹配任何字符的元字符,因此我们用 \ 将其转义以获取字面值),接着在另一个捕获组中捕获第二组数字;replacement 中使用捕获组的反向引用(\1、\2)。注意:在 R 的字符串字面量中反斜杠本身需要再转义一次,因此代码里要写成 "\\." 和 "\\1-\\2"。
library(dplyr)
library(stringr)
# Build DISTANCE from TEST: strip the prefix through the underscore that
# precedes the digits, then join the two captured digit runs with "-".
# Backslashes are doubled because an R string literal consumes one level of
# escaping: "\\." is the regex \. (a literal dot) and "\\1"/"\\2" are the
# backreferences to the two capture groups.
df1 %>%
  mutate(DISTANCE = str_replace(TEST, ".*_([0-9]+)\\.([0-9]+)", "\\1-\\2"))
# PLAYER SEX TEST VALUE DISTANCE
#1 Player 1 Female ICE_0.10m 2.100000 0-10m
#2 Player 1 Female ICE_0.20m 3.475000 0-20m
#3 Player 1 Female ICE_10.20m 1.375000 10-20m
#4 Player 1 Female ICE_20.30m 1.246000 20-30m
#5 Player 1 Female ICE_0.30m 4.721000 0-30m
#6 Player 1 Female ICE_Vel_0.10m 4.761905 0-10m
#7 Player 1 Female ICE_Vel_0.20m 5.755396 0-20m
#8 Player 1 Female ICE_Vel_10.20m 7.272727 10-20m
#9 Player 1 Female ICE_Vel_20.30m 8.025682 20-30m
#10 Player 1 Female ICE_Vel_0.30m 6.354586 0-30m
#11 Player 1 Female OFF_0.10m 1.983000 0-10m
#12 Player 1 Female OFF_0.20m 3.380000 0-20m
#13 Player 1 Female OFF_10.20m 1.397000 10-20m
#14 Player 1 Female OFF_20.30m 1.380000 20-30m
#15 Player 1 Female OFF_0.30m 4.760000 0-30m
#16 Player 1 Female OFF_Vel_0.10m 5.042864 0-10m
#17 Player 1 Female OFF_Vel_0.20m 5.917160 0-20m
#18 Player 1 Female OFF_Vel_10.20m 7.158196 10-20m
#19 Player 1 Female OFF_Vel_20.30m 7.246377 20-30m
#20 Player 1 Female OFF_Vel_0.30m 6.302521 0-30m
或使用base R
# Base R equivalent: remove the prefix through the underscore that precedes the
# digits and join the two captured digit runs with "-". Backslashes are doubled
# because R string literals consume one level of escaping ("\\." is the regex
# \., and "\\1"/"\\2" are the capture-group backreferences).
df1$DISTANCE <- sub(".*_([0-9]+)\\.([0-9]+)", "\\1-\\2", df1$TEST)
数据
# Reproducible example data: 20 rows for a single player, one row per TEST
# label, with character columns and default (automatic) row names.
df1 <- data.frame(
  PLAYER = rep("Player 1", 20),
  SEX = rep("Female", 20),
  TEST = c(
    "ICE_0.10m", "ICE_0.20m", "ICE_10.20m", "ICE_20.30m", "ICE_0.30m",
    "ICE_Vel_0.10m", "ICE_Vel_0.20m", "ICE_Vel_10.20m", "ICE_Vel_20.30m",
    "ICE_Vel_0.30m",
    "OFF_0.10m", "OFF_0.20m", "OFF_10.20m", "OFF_20.30m", "OFF_0.30m",
    "OFF_Vel_0.10m", "OFF_Vel_0.20m", "OFF_Vel_10.20m", "OFF_Vel_20.30m",
    "OFF_Vel_0.30m"
  ),
  VALUE = c(
    2.1, 3.475, 1.375, 1.246, 4.721,
    4.761905, 5.755396, 7.272727, 8.025682, 6.354586,
    1.983, 3.38, 1.397, 1.38, 4.76,
    5.042864, 5.91716, 7.158196, 7.246377, 6.302521
  ),
  stringsAsFactors = FALSE
)