从 R 中的数据框的列中删除文本字符串中的重复数字
Remove duplicated numbers in text string from a column of a dataframe in R
我有这个例子:
# Example data frame: 67 rows (row names 1234..1300) with a numeric column
# `PdivR` and a character column `Regression` holding formula-like labels in
# which every term is written twice (e.g. "TLC~8+8+10+10").
df <- data.frame(
  PdivR = c(
    0.93, 0.92, 0.97, 1.07, 1.08, 1.02, 0.95, 0.92, 1.19, 0.99,
    0.92, 1.02, 1.02, 0.88, 1.13, 0.97, 0.86, 1.06, 0.98, 0.97,
    1, 1.01, 0.96, 1.01, 1.02, 1.04, 0.98, 1.06, 1.05, 0.97,
    1.13, 0.97, 0.87, 1.06, 0.94, 1.03, 1.11, 1.11, 0.97, 0.94,
    1.05, 1, 0.94, 1.02, 1.04, 0.87, 1.17, 0.98, 0.95, 1.04,
    0.96, 0.99, 1.07, 1.04, 0.95, 1.01, 0.9, 1.09, 0.91, 1.02,
    1.08, 1.04, 1.1, 0.99, 0.84, 1.04, 1.05
  ),
  # The labels come in three consecutive runs of 15, 32 and 20 rows.
  Regression = rep(
    c("TLC~8+8+10+10", "TLC~9+9+10+10", "TLC~7+7+8+8+9+9"),
    times = c(15L, 32L, 20L)
  ),
  row.names = 1234:1300,
  # Keep `Regression` as character on R < 4.0 as well.
  stringsAsFactors = FALSE
)
可以在dataframe中显示
head(df)
PdivR Regression
1234 0.93 TLC~8+8+10+10
1235 0.92 TLC~8+8+10+10
1236 0.97 TLC~8+8+10+10
1237 1.07 TLC~8+8+10+10
1238 1.08 TLC~8+8+10+10
1239 1.02 TLC~8+8+10+10
如果我们把注意力放在 `Regression` 列上,会发现其中的数字是重复的(例如 `TLC~8+8+10+10`)。
如何以简洁优雅的方式删除这些重复的数字,使 `Regression` 列的内容正确?在前面的示例中,正确的值应为 `TLC~8+10`。
我们可以使用
# Collapse runs of adjacent, identical numeric terms in each label, e.g.
# "TLC~8+8+10+10" -> "TLC~8+10" and "TLC~7+7+8+8+9+9" -> "TLC~7+8+9".
#
# Backslashes must be doubled inside an R string literal: "\d" and "\+" are
# not valid escapes (the code would not parse) and "\1" would be the control
# character U+0001 rather than a backreference.
#
# Pattern: a whole number (\\b...), followed by one or more "+<same number>"
# repeats. The trailing \\b stops "8" from matching the start of "80", and the
# leading \\b stops it from matching the tail of "18". Only adjacent repeats
# are removed; "8+10+8" is left unchanged.
df$Regression <- gsub(
  "\\b(\\d+)(?:\\+\\1\\b)+",
  "\\1",
  df$Regression,
  perl = TRUE
)
或者也可以使用strsplit
# Alternative without backreferences: split each label into its response and
# terms, drop repeated terms, and paste the pieces back together in the compact
# "response~term+term" form (e.g. "TLC~8+8+10+10" -> "TLC~8+10").
#
# Split only on the formula separators "~" and "+", so that "." or "_" inside
# a name or number is preserved.
regression_parts <- strsplit(df$Regression, "[~+]")

# Entries without any term (NA, "", or a bare response) are left untouched.
has_terms <- lengths(regression_parts) > 1L

# vapply() guarantees one string per row, including for an empty data frame.
df$Regression[has_terms] <- vapply(
  regression_parts[has_terms],
  function(parts) {
    # parts[1] is the response; deduplicate among the terms only.
    paste0(parts[1], "~", paste(unique(parts[-1]), collapse = "+"))
  },
  character(1)
)
我有这个例子:
# Example data frame: 67 rows (row names 1234..1300) with a numeric column
# `PdivR` and a character column `Regression` holding formula-like labels in
# which every term is written twice (e.g. "TLC~8+8+10+10").
df <- data.frame(
  PdivR = c(
    0.93, 0.92, 0.97, 1.07, 1.08, 1.02, 0.95, 0.92, 1.19, 0.99,
    0.92, 1.02, 1.02, 0.88, 1.13, 0.97, 0.86, 1.06, 0.98, 0.97,
    1, 1.01, 0.96, 1.01, 1.02, 1.04, 0.98, 1.06, 1.05, 0.97,
    1.13, 0.97, 0.87, 1.06, 0.94, 1.03, 1.11, 1.11, 0.97, 0.94,
    1.05, 1, 0.94, 1.02, 1.04, 0.87, 1.17, 0.98, 0.95, 1.04,
    0.96, 0.99, 1.07, 1.04, 0.95, 1.01, 0.9, 1.09, 0.91, 1.02,
    1.08, 1.04, 1.1, 0.99, 0.84, 1.04, 1.05
  ),
  # The labels come in three consecutive runs of 15, 32 and 20 rows.
  Regression = rep(
    c("TLC~8+8+10+10", "TLC~9+9+10+10", "TLC~7+7+8+8+9+9"),
    times = c(15L, 32L, 20L)
  ),
  row.names = 1234:1300,
  # Keep `Regression` as character on R < 4.0 as well.
  stringsAsFactors = FALSE
)
可以在dataframe中显示
head(df)
PdivR Regression
1234 0.93 TLC~8+8+10+10
1235 0.92 TLC~8+8+10+10
1236 0.97 TLC~8+8+10+10
1237 1.07 TLC~8+8+10+10
1238 1.08 TLC~8+8+10+10
1239 1.02 TLC~8+8+10+10
如果我们把注意力放在 `Regression` 列上,会发现其中的数字是重复的(例如 `TLC~8+8+10+10`)。
如何以简洁优雅的方式删除这些重复的数字,使 `Regression` 列的内容正确?在前面的示例中,正确的值应为 `TLC~8+10`。
我们可以使用
# Collapse runs of adjacent, identical numeric terms in each label, e.g.
# "TLC~8+8+10+10" -> "TLC~8+10" and "TLC~7+7+8+8+9+9" -> "TLC~7+8+9".
#
# Backslashes must be doubled inside an R string literal: "\d" and "\+" are
# not valid escapes (the code would not parse) and "\1" would be the control
# character U+0001 rather than a backreference.
#
# Pattern: a whole number (\\b...), followed by one or more "+<same number>"
# repeats. The trailing \\b stops "8" from matching the start of "80", and the
# leading \\b stops it from matching the tail of "18". Only adjacent repeats
# are removed; "8+10+8" is left unchanged.
df$Regression <- gsub(
  "\\b(\\d+)(?:\\+\\1\\b)+",
  "\\1",
  df$Regression,
  perl = TRUE
)
或者也可以使用strsplit
# Alternative without backreferences: split each label into its response and
# terms, drop repeated terms, and paste the pieces back together in the compact
# "response~term+term" form (e.g. "TLC~8+8+10+10" -> "TLC~8+10").
#
# Split only on the formula separators "~" and "+", so that "." or "_" inside
# a name or number is preserved.
regression_parts <- strsplit(df$Regression, "[~+]")

# Entries without any term (NA, "", or a bare response) are left untouched.
has_terms <- lengths(regression_parts) > 1L

# vapply() guarantees one string per row, including for an empty data frame.
df$Regression[has_terms] <- vapply(
  regression_parts[has_terms],
  function(parts) {
    # parts[1] is the response; deduplicate among the terms only.
    paste0(parts[1], "~", paste(unique(parts[-1]), collapse = "+"))
  },
  character(1)
)