如何从 R 公式字符串中删除系数
How to remove coefficients from an R formula string
我有如下字符串:
b="+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
b1 ="+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"
我正在尝试将其拆分为以下内容:
b: Variables Coefficient
sl_1__1_1_1 1
from_st_1_4_1_1_1 5
into_st_4_1_1_1_1 -70
sl_1__1_1_2 -1
b1: Variables Coefficient
sh_8_6_1_1_1 1
sdp_8_6_1_1_1 -1000
我目前使用的 strsplit 函数无法提取多于一位数的系数(即 1000 系数)。
如有任何帮助,我们将不胜感激。
这是使用 strsplit 的一种方法:
b <- "+fg_1+5*ug1_1-7*tg_4" # original example string
# Split on the variable names (including any leading "*"); the pieces left
# over are the signed coefficients. A bare "+" stands for an implicit 1.
Coefficient <- as.numeric(sub(
  "\\+$", "1",
  strsplit(b, "\\**[a-z0-9]+_\\d", perl = TRUE)[[1]]
))
# Split on sign + one digit + optional "*"; the pieces left over are the
# variable names (the first one keeps its leading "+", removed by sub()).
Variable <- sub("\\+", "", strsplit(b, "[+-]\\d\\**", perl = TRUE)[[1]])
data.frame(Variable, Coefficient)
# Variable Coefficient
#1 fg_1 1
#2 ug1_1 5
#3 tg_4 -7
\**[a-z0-9]+_\d 的解释:
- \** 匹配一个可选的星号:结尾的 * 表示重复 0 次或多次。也可以同样容易地用 ? 来表示可选:\*?
- [a-z0-9]+ 匹配任意小写字母或数字 0 到 9([a-z0-9]),+ 表示一次或多次
- _ 后跟一个下划线
- \d 后跟一个数字字符
编辑:更新新的示例字符串,其模式略有不同,末尾可能有常量(例如,下例中的 +50):
# new sample strings
b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
#b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2+50"
#b <- "+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"
#b <- "+fg_1+5*ug1_1-7*tg_4" # 1st sample string from original question
# Split on sign + optional digits + optional "*"; what remains are the
# variable names (plus a leading empty string, dropped on the next line).
Variable <- strsplit(b, "[+-][0-9]*\\**", perl = TRUE)[[1]]
Variable <- Variable[Variable != ""]
# Split on the variable names; what remains are the signed coefficients.
# A bare trailing sign means an implicit coefficient, so append "1" to it.
Coefficient <- as.numeric(sub(
  "([+-]$)", "\\11",
  strsplit(b, "(?<=[+-])\\D+.*?(?=[+-]|$)|(?<=\\d)\\*.*?(?=[+-]|$)",
           perl = TRUE)[[1]]
))
# handle possible constant at end of string:
# one more coefficient than variables means the last number is a constant
if (length(Coefficient) == length(Variable) + 1L) {
  df <- data.frame(Variable = c(Variable, "constant"), Coefficient)
} else {
  df <- data.frame(Variable, Coefficient)
}
df
# Variable Coefficient
#1 sl_1__1_1_1 1
#2 from_st_1_4_1_1_1 5
#3 into_st_4_1_1_1_1 -70
#4 sl_1__1_1_2 -1
这是另一个
# sample mixing implicit (sign-only) and explicit coefficients
b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
# sample with a four-digit coefficient
b1 <- "+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"
# same terms as b, followed by a trailing constant (+50)
b2 <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2+50"
第一步,为了让生活更轻松,是在没有给出系数时添加 1,然后使用我在评论中提到的正则表达式。
# Wherever a sign is directly followed by a non-digit (a variable name),
# insert an explicit coefficient of 1: "+v" -> "+1+v", "-v" -> "-1+v".
# The printed result below is for `b`; NOTE this overwrites `b`.
(b <- gsub("([+-])(\\D)", "\\11+\\2", b))
# [1] "+1+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-1+sl_1__1_1_2"
# Pull out every optionally signed run of word characters; the tokens
# alternate coefficient, variable, coefficient, variable, ...
(bb <- regmatches(b, gregexpr("[+-]?\\w+", text = b))[[1]])
# [1] "+1" "+sl_1__1_1_1" "+5"
# [4] "from_st_1_4_1_1_1" "-70" "into_st_4_1_1_1_1"
# [7] "-1" "+sl_1__1_1_2"
然后做一些最后的重新排列和格式化
# Lay the tokens out as (coefficient, variable) rows, then swap the columns.
# drop = FALSE keeps a single-term result as a one-row matrix.
(bb <- data.frame(matrix(bb, ncol = 2, byrow = TRUE)[, 2:1, drop = FALSE]))
# X1 X2
# 1 +sl_1__1_1_1 +1
# 2 from_st_1_4_1_1_1 +5
# 3 into_st_4_1_1_1_1 -70
# 4 +sl_1__1_1_2 -1
within(bb, {
  # strip the sign that was captured together with some variable names
  X1 <- gsub("[+-]", "", as.character(X1))
  # as.character() first so a factor column converts by label, not by code
  X2 <- as.numeric(as.character(X2))
})
# X1 X2
# 1 sl_1__1_1_1 1
# 2 from_st_1_4_1_1_1 5
# 3 into_st_4_1_1_1_1 -70
# 4 sl_1__1_1_2 -1
如果你把它放到一个函数中会更容易使用。我还添加了对尾随常数项的检查
#' Split a signed linear-expression string into variables and coefficients
#'
#' @param x A single string of signed terms, e.g.
#'   `"+a_1+5*b_2-70*c_3+50"`. Every term starts with `+` or `-`; a term
#'   without a number has an implicit coefficient of 1; a trailing bare
#'   number is reported as the variable `"constant"`.
#' @return A data frame with a character column `Variable` and a numeric
#'   column `Coefficient`, one row per term.
f <- function(x) {
  ## check constant: tag a trailing bare number so it pairs up like a term
  x <- gsub("([+-]\\d+)$", "\\1*constant", x)
  ## give sign-only terms an explicit coefficient of 1 ("-v" -> "-1+v")
  x <- gsub("([+-])(\\D)", "\\11+\\2", x)
  ## tokens now alternate coefficient, variable, coefficient, variable, ...
  x <- regmatches(x, gregexpr("[+-]?\\w+", text = x))[[1]]
  ## drop = FALSE keeps a one-term input as a 1 x 2 matrix instead of a vector
  x <- data.frame(matrix(x, ncol = 2, byrow = TRUE)[, 2:1, drop = FALSE],
                  stringsAsFactors = FALSE)
  x[, 2] <- as.numeric(x[, 2])
  ## strip the sign that was captured together with some variable names
  x[, 1] <- gsub("[+-]", "", x[, 1])
  setNames(x, c("Variable", "Coefficient"))
}
f(b)
# Variable Coefficient
# 1 sl_1__1_1_1 1
# 2 from_st_1_4_1_1_1 5
# 3 into_st_4_1_1_1_1 -70
# 4 sl_1__1_1_2 -1
f(b1)
# Variable Coefficient
# 1 sh_8_6_1_1_1 1
# 2 sdp_8_6_1_1_1 -1000
f(b2)
# Variable Coefficient
# 1 sl_1__1_1_1 1
# 2 from_st_1_4_1_1_1 5
# 3 into_st_4_1_1_1_1 -70
# 4 sl_1__1_1_2 -1
# 5 constant 50
我有如下字符串:
b="+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
b1 ="+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"
我正在尝试将其拆分为以下内容:
b: Variables Coefficient
sl_1__1_1_1 1
from_st_1_4_1_1_1 5
into_st_4_1_1_1_1 -70
sl_1__1_1_2 -1
b1: Variables Coefficient
sh_8_6_1_1_1 1
sdp_8_6_1_1_1 -1000
我目前使用的 strsplit 函数无法提取多于一位数的系数(即 1000 系数)。
如有任何帮助,我们将不胜感激。
这是使用 strsplit 的一种方法:
b <- "+fg_1+5*ug1_1-7*tg_4" # original example string
# Split on the variable names (including any leading "*"); the pieces left
# over are the signed coefficients. A bare "+" stands for an implicit 1.
Coefficient <- as.numeric(sub(
  "\\+$", "1",
  strsplit(b, "\\**[a-z0-9]+_\\d", perl = TRUE)[[1]]
))
# Split on sign + one digit + optional "*"; the pieces left over are the
# variable names (the first one keeps its leading "+", removed by sub()).
Variable <- sub("\\+", "", strsplit(b, "[+-]\\d\\**", perl = TRUE)[[1]])
data.frame(Variable, Coefficient)
# Variable Coefficient
#1 fg_1 1
#2 ug1_1 5
#3 tg_4 -7
\**[a-z0-9]+_\d 的解释:
- \** 匹配一个可选的星号:结尾的 * 表示重复 0 次或多次。也可以同样容易地用 ? 来表示可选:\*?
- [a-z0-9]+ 匹配任意小写字母或数字 0 到 9([a-z0-9]),+ 表示一次或多次
- _ 后跟一个下划线
- \d 后跟一个数字字符
编辑:更新新的示例字符串,其模式略有不同,末尾可能有常量(例如,下例中的 +50):
# new sample strings
b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
#b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2+50"
#b <- "+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"
#b <- "+fg_1+5*ug1_1-7*tg_4" # 1st sample string from original question
# Split on sign + optional digits + optional "*"; what remains are the
# variable names (plus a leading empty string, dropped on the next line).
Variable <- strsplit(b, "[+-][0-9]*\\**", perl = TRUE)[[1]]
Variable <- Variable[Variable != ""]
# Split on the variable names; what remains are the signed coefficients.
# A bare trailing sign means an implicit coefficient, so append "1" to it.
Coefficient <- as.numeric(sub(
  "([+-]$)", "\\11",
  strsplit(b, "(?<=[+-])\\D+.*?(?=[+-]|$)|(?<=\\d)\\*.*?(?=[+-]|$)",
           perl = TRUE)[[1]]
))
# handle possible constant at end of string:
# one more coefficient than variables means the last number is a constant
if (length(Coefficient) == length(Variable) + 1L) {
  df <- data.frame(Variable = c(Variable, "constant"), Coefficient)
} else {
  df <- data.frame(Variable, Coefficient)
}
df
# Variable Coefficient
#1 sl_1__1_1_1 1
#2 from_st_1_4_1_1_1 5
#3 into_st_4_1_1_1_1 -70
#4 sl_1__1_1_2 -1
这是另一个
# sample mixing implicit (sign-only) and explicit coefficients
b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
# sample with a four-digit coefficient
b1 <- "+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"
# same terms as b, followed by a trailing constant (+50)
b2 <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2+50"
第一步,为了让生活更轻松,是在没有给出系数时添加 1,然后使用我在评论中提到的正则表达式。
# Wherever a sign is directly followed by a non-digit (a variable name),
# insert an explicit coefficient of 1: "+v" -> "+1+v", "-v" -> "-1+v".
# The printed result below is for `b`; NOTE this overwrites `b`.
(b <- gsub("([+-])(\\D)", "\\11+\\2", b))
# [1] "+1+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-1+sl_1__1_1_2"
# Pull out every optionally signed run of word characters; the tokens
# alternate coefficient, variable, coefficient, variable, ...
(bb <- regmatches(b, gregexpr("[+-]?\\w+", text = b))[[1]])
# [1] "+1" "+sl_1__1_1_1" "+5"
# [4] "from_st_1_4_1_1_1" "-70" "into_st_4_1_1_1_1"
# [7] "-1" "+sl_1__1_1_2"
然后做一些最后的重新排列和格式化
# Lay the tokens out as (coefficient, variable) rows, then swap the columns.
# drop = FALSE keeps a single-term result as a one-row matrix.
(bb <- data.frame(matrix(bb, ncol = 2, byrow = TRUE)[, 2:1, drop = FALSE]))
# X1 X2
# 1 +sl_1__1_1_1 +1
# 2 from_st_1_4_1_1_1 +5
# 3 into_st_4_1_1_1_1 -70
# 4 +sl_1__1_1_2 -1
within(bb, {
  # strip the sign that was captured together with some variable names
  X1 <- gsub("[+-]", "", as.character(X1))
  # as.character() first so a factor column converts by label, not by code
  X2 <- as.numeric(as.character(X2))
})
# X1 X2
# 1 sl_1__1_1_1 1
# 2 from_st_1_4_1_1_1 5
# 3 into_st_4_1_1_1_1 -70
# 4 sl_1__1_1_2 -1
如果你把它放到一个函数中会更容易使用。我还添加了对尾随常数项的检查
#' Split a signed linear-expression string into variables and coefficients
#'
#' @param x A single string of signed terms, e.g.
#'   `"+a_1+5*b_2-70*c_3+50"`. Every term starts with `+` or `-`; a term
#'   without a number has an implicit coefficient of 1; a trailing bare
#'   number is reported as the variable `"constant"`.
#' @return A data frame with a character column `Variable` and a numeric
#'   column `Coefficient`, one row per term.
f <- function(x) {
  ## check constant: tag a trailing bare number so it pairs up like a term
  x <- gsub("([+-]\\d+)$", "\\1*constant", x)
  ## give sign-only terms an explicit coefficient of 1 ("-v" -> "-1+v")
  x <- gsub("([+-])(\\D)", "\\11+\\2", x)
  ## tokens now alternate coefficient, variable, coefficient, variable, ...
  x <- regmatches(x, gregexpr("[+-]?\\w+", text = x))[[1]]
  ## drop = FALSE keeps a one-term input as a 1 x 2 matrix instead of a vector
  x <- data.frame(matrix(x, ncol = 2, byrow = TRUE)[, 2:1, drop = FALSE],
                  stringsAsFactors = FALSE)
  x[, 2] <- as.numeric(x[, 2])
  ## strip the sign that was captured together with some variable names
  x[, 1] <- gsub("[+-]", "", x[, 1])
  setNames(x, c("Variable", "Coefficient"))
}
f(b)
# Variable Coefficient
# 1 sl_1__1_1_1 1
# 2 from_st_1_4_1_1_1 5
# 3 into_st_4_1_1_1_1 -70
# 4 sl_1__1_1_2 -1
f(b1)
# Variable Coefficient
# 1 sh_8_6_1_1_1 1
# 2 sdp_8_6_1_1_1 -1000
f(b2)
# Variable Coefficient
# 1 sl_1__1_1_1 1
# 2 from_st_1_4_1_1_1 5
# 3 into_st_4_1_1_1_1 -70
# 4 sl_1__1_1_2 -1
# 5 constant 50