将逗号转换为点和数字,但仅限于一定数量的变量
Convert commas to points and to numeric, but only in a certain number of variables
所以我有一个看起来像这样的 df,其中数值拆分为逗号而不是点,并且它们被归类为字符。
# Example data from the question: every value is a character string.
# var0 is free text containing commas; the numeric-looking values use ","
# as the decimal mark and "." as the thousands separator (e.g. "1.920,0").
var0 <- c("There, are commas", "in the text, string", "as,well", "how, can", "i", "fix, this", "thank you")
var1 <- c("50,0", "72,0", "960,0", "1.920,0", "50,0", "50,0", "960,0")
var2 <- c("40,0", "742,0", "9460,0", "1.920,0", "50,0", "50,0", "960,0")
var3<- c("40,0", "72,0", "90,0", "1,30", "50,0", "50,0", "960,0")
# The next line is an elision placeholder (presumably standing for var4 to
# var95); it is not runnable R.
...
var96 <- c("40,0", "742,0", "9460,0", "1.920,0", "50,0", "50,0", "960,0")
# cbind() first builds a character matrix from the vectors, which
# data.frame() then turns into columns. Only var0 to var3 are included here.
df <- data.frame(cbind(var0, var1, var2, var3))
我知道如何使用 gsub 手动转换每个变量,但正如您在下面看到的,我有大约 96 个这样的变量。除此之外,我还有其他变量,包括不需要转换逗号的文本字符串和因子级别。
对此有什么建议吗?
谢谢
tidyverse 包很适合这类事情。
library(tidyverse)
df <- df %>%
  # Convert only the columns whose names match the regular expression; text
  # columns such as var0 are left untouched, so their "." and "," survive.
  # NOTE: "var[1-3]" matches any name containing var1, var2 or var3 (so also
  # e.g. var10); adjust it to your own column names.
  mutate_at(
    .vars = vars(matches("var[1-3]")),
    .funs = function(x) {
      # First, remove the points (thousands separators) b/c otherwise you'll
      # end up with, e.g., "1.920.0". fixed = TRUE treats "." literally, so
      # no regex escaping is needed.
      without_points <- gsub(".", "", x, fixed = TRUE)
      # Next, replace the decimal comma with a point and convert to numeric.
      as.numeric(gsub(",", ".", without_points, fixed = TRUE))
    }
  )
请注意,在 mutate_at 调用中,我假设只有“var0”列包含您要保留的文本,并把列名与正则表达式“var[1-3]”匹配的列转换为数值数据(用点代替逗号作为小数点)。您需要根据自己的情况调整正则表达式。
下面这个函数仅当向量中出现的所有字符都是数字 0-9、点和逗号时,才会删除所有的点(千位分隔符)并把逗号替换为小数点;否则原样返回该向量。
# Convert a vector of European-formatted number strings to numeric.
#
# A vector is converted only when every value consists solely of digits,
# "." (thousands separator) and "," (decimal mark); any other vector is
# returned unchanged, so free-text columns are safe to pass in.
#
# x: a vector, typically a data frame column (character or factor).
#
# Returns a numeric vector of the same length as x when x is convertible,
# otherwise x itself. Empty strings and NA become NA; a malformed value such
# as "1,2,3" becomes NA with a coercion warning.
commas2dots <- function(x) {
  # Anything that is not text (e.g. an already numeric column) is left alone;
  # stripping the "." from 1.5 would silently turn it into 15.
  if (!is.character(x) && !is.factor(x)) {
    return(x)
  }
  values <- as.character(x)
  # Inside a bracket expression "." is literal, so no escaping is needed.
  if (any(grepl("[^.,[:digit:]]", values))) {
    return(x)
  }
  # Drop the thousands separators, then turn the decimal comma into a point.
  # as.numeric() keeps one result per input element, so the output always
  # has the same length as x.
  no_grouping <- gsub(".", "", values, fixed = TRUE)
  as.numeric(sub(",", ".", no_grouping, fixed = TRUE))
}
# Apply commas2dots() to every column of df. lapply() returns a list with one
# element per column; the commented lines below show the printed result.
lapply(df, commas2dots)
#$var0
#[1] "There, are commas" "in the text, string"
#[3] "as,well" "how, can"
#[5] "i" "fix, this"
#[7] "thank you"
#
#$var1
#[1] 50 72 960 1920 50 50 960
#
#$var2
#[1] 40 742 9460 1920 50 50 960
#
#$var3
#[1] 40.0 72.0 90.0 1.3 50.0 50.0 960.0
#
#$var96
#[1] 40 742 9460 1920 50 50 960
要更改 data.frame 的列:
# Assigning into df[] replaces the columns with the converted list elements
# while df itself stays a data frame. The commented lines below show the
# printed result.
df[] <- lapply(df, commas2dots)
df
# var0 var1 var2 var3 var96
#1 There, are commas 50 40 40.0 40
#2 in the text, string 72 742 72.0 742
#3 as,well 960 9460 90.0 9460
#4 how, can 1920 1920 1.3 1920
#5 i 50 50 50.0 50
#6 fix, this 50 50 50.0 50
#7 thank you 960 960 960.0 960
数据
# Sample data: one free-text column (var0) plus number-like strings that use
# "," as the decimal mark and "." as the thousands separator.
var0 <- c(
  "There, are commas", "in the text, string", "as,well", "how, can",
  "i", "fix, this", "thank you"
)
var1 <- c("50,0", "72,0", "960,0", "1.920,0", "50,0", "50,0", "960,0")
var2 <- c("40,0", "742,0", "9460,0", "1.920,0", "50,0", "50,0", "960,0")
var3 <- c("40,0", "72,0", "90,0", "1,30", "50,0", "50,0", "960,0")
# var96 holds exactly the same values as var2.
var96 <- var2
df <- data.frame(
  var0 = var0,
  var1 = var1,
  var2 = var2,
  var3 = var3,
  var96 = var96
)
所以我有一个看起来像这样的 df,其中数值拆分为逗号而不是点,并且它们被归类为字符。
# Example data from the question: every value is a character string.
# var0 is free text containing commas; the numeric-looking values use ","
# as the decimal mark and "." as the thousands separator (e.g. "1.920,0").
var0 <- c("There, are commas", "in the text, string", "as,well", "how, can", "i", "fix, this", "thank you")
var1 <- c("50,0", "72,0", "960,0", "1.920,0", "50,0", "50,0", "960,0")
var2 <- c("40,0", "742,0", "9460,0", "1.920,0", "50,0", "50,0", "960,0")
var3<- c("40,0", "72,0", "90,0", "1,30", "50,0", "50,0", "960,0")
# The next line is an elision placeholder (presumably standing for var4 to
# var95); it is not runnable R.
...
var96 <- c("40,0", "742,0", "9460,0", "1.920,0", "50,0", "50,0", "960,0")
# cbind() first builds a character matrix from the vectors, which
# data.frame() then turns into columns. Only var0 to var3 are included here.
df <- data.frame(cbind(var0, var1, var2, var3))
我知道如何使用 gsub 手动转换每个变量,但正如您在下面看到的,我有大约 96 个这样的变量。除此之外,我还有其他变量,包括不需要转换逗号的文本字符串和因子级别。
对此有什么建议吗?
谢谢
tidyverse 包很适合这类事情。
library(tidyverse)
df <- df %>%
  # Convert only the columns whose names match the regular expression; text
  # columns such as var0 are left untouched, so their "." and "," survive.
  # NOTE: "var[1-3]" matches any name containing var1, var2 or var3 (so also
  # e.g. var10); adjust it to your own column names.
  mutate_at(
    .vars = vars(matches("var[1-3]")),
    .funs = function(x) {
      # First, remove the points (thousands separators) b/c otherwise you'll
      # end up with, e.g., "1.920.0". fixed = TRUE treats "." literally, so
      # no regex escaping is needed.
      without_points <- gsub(".", "", x, fixed = TRUE)
      # Next, replace the decimal comma with a point and convert to numeric.
      as.numeric(gsub(",", ".", without_points, fixed = TRUE))
    }
  )
请注意,在 mutate_at 调用中,我假设只有“var0”列包含您要保留的文本,并把列名与正则表达式“var[1-3]”匹配的列转换为数值数据(用点代替逗号作为小数点)。您需要根据自己的情况调整正则表达式。
下面这个函数仅当向量中出现的所有字符都是数字 0-9、点和逗号时,才会删除所有的点(千位分隔符)并把逗号替换为小数点;否则原样返回该向量。
# Convert a vector of European-formatted number strings to numeric.
#
# A vector is converted only when every value consists solely of digits,
# "." (thousands separator) and "," (decimal mark); any other vector is
# returned unchanged, so free-text columns are safe to pass in.
#
# x: a vector, typically a data frame column (character or factor).
#
# Returns a numeric vector of the same length as x when x is convertible,
# otherwise x itself. Empty strings and NA become NA; a malformed value such
# as "1,2,3" becomes NA with a coercion warning.
commas2dots <- function(x) {
  # Anything that is not text (e.g. an already numeric column) is left alone;
  # stripping the "." from 1.5 would silently turn it into 15.
  if (!is.character(x) && !is.factor(x)) {
    return(x)
  }
  values <- as.character(x)
  # Inside a bracket expression "." is literal, so no escaping is needed.
  if (any(grepl("[^.,[:digit:]]", values))) {
    return(x)
  }
  # Drop the thousands separators, then turn the decimal comma into a point.
  # as.numeric() keeps one result per input element, so the output always
  # has the same length as x.
  no_grouping <- gsub(".", "", values, fixed = TRUE)
  as.numeric(sub(",", ".", no_grouping, fixed = TRUE))
}
# Apply commas2dots() to every column of df. lapply() returns a list with one
# element per column; the commented lines below show the printed result.
lapply(df, commas2dots)
#$var0
#[1] "There, are commas" "in the text, string"
#[3] "as,well" "how, can"
#[5] "i" "fix, this"
#[7] "thank you"
#
#$var1
#[1] 50 72 960 1920 50 50 960
#
#$var2
#[1] 40 742 9460 1920 50 50 960
#
#$var3
#[1] 40.0 72.0 90.0 1.3 50.0 50.0 960.0
#
#$var96
#[1] 40 742 9460 1920 50 50 960
要更改 data.frame 的列:
# Assigning into df[] replaces the columns with the converted list elements
# while df itself stays a data frame. The commented lines below show the
# printed result.
df[] <- lapply(df, commas2dots)
df
# var0 var1 var2 var3 var96
#1 There, are commas 50 40 40.0 40
#2 in the text, string 72 742 72.0 742
#3 as,well 960 9460 90.0 9460
#4 how, can 1920 1920 1.3 1920
#5 i 50 50 50.0 50
#6 fix, this 50 50 50.0 50
#7 thank you 960 960 960.0 960
数据
# Sample data: one free-text column (var0) plus number-like strings that use
# "," as the decimal mark and "." as the thousands separator.
var0 <- c(
  "There, are commas", "in the text, string", "as,well", "how, can",
  "i", "fix, this", "thank you"
)
var1 <- c("50,0", "72,0", "960,0", "1.920,0", "50,0", "50,0", "960,0")
var2 <- c("40,0", "742,0", "9460,0", "1.920,0", "50,0", "50,0", "960,0")
var3 <- c("40,0", "72,0", "90,0", "1,30", "50,0", "50,0", "960,0")
# var96 holds exactly the same values as var2.
var96 <- var2
df <- data.frame(
  var0 = var0,
  var1 = var1,
  var2 = var2,
  var3 = var3,
  var96 = var96
)