尝试用数值替换数据框中的字符值,错误“无效因子级别,NA 生成”
Attempting to replace character value in dataframe with numeric value , Error " invalid factor level, NA generated"
我正在尝试进行一些预处理,并希望将 classe
因子值 {A,B,C,D,E}
转换为 {1,2,3,4,5}
。
classe
列的类型是factor
,我已经提供了所有步骤,见下:
#get the data
training <- read.table("http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
training_df <- data.frame(training,stringsAsFactors=FALSE)
#split to training & test sets
inTrain <- createDataPartition(y=training$classe, p=0.75, list=FALSE)
training_data <- training[inTrain,]
testing_data <- training[-inTrain,]
#subset based on columns of interest, based on previous studies
training_data_subset <- subset(training_data, select=c("avg_roll_belt","var_roll_belt","var_total_accel_belt","amplitude_roll_belt","max_roll_belt","var_roll_belt",
"var_accel_arm","magnet_arm_x","magnet_arm_y","magnet_arm_z","accel_dumbbell_y","accel_dumbbell_z","magnet_dumbbell_x","gyros_dumbbell_x",
"gyros_dumbbell_y","gyros_dumbbell_z","pitch_forearm","gyros_forearm_x","gyros_forearm_y","classe"))
#see which columns are factors, the training_data_subset#classe feature is a factor
sapply(training_data_subset, class)
#sapply output
avg_roll_belt var_roll_belt var_total_accel_belt amplitude_roll_belt max_roll_belt
"numeric" "numeric" "numeric" "numeric" "numeric"
var_roll_belt.1 var_accel_arm magnet_arm_x magnet_arm_y magnet_arm_z
"numeric" "numeric" "integer" "integer" "integer"
accel_dumbbell_y accel_dumbbell_z magnet_dumbbell_x gyros_dumbbell_x gyros_dumbbell_y
"integer" "integer" "integer" "numeric" "numeric"
gyros_dumbbell_z pitch_forearm gyros_forearm_x gyros_forearm_y classe
"numeric" "numeric" "numeric" "numeric" "factor"
我创建了一个试图替换 A=1,B=2,C=3,D=4,E=5 的函数,见下文:
factorsToNumeric <- function(data)
{
data_numeric <- data
data_numeric$classe <-as.numeric(factor(toupper(as.character(data_numeric$classe))))
#loop through the data frame based on replace values
for(i in 1:nrow(data_numeric))
{
if ((data_numeric[i,]$classe == "A") || (data_numeric[i,]$classe == "a"))
{data_numeric[i,]$classe <- "1"}
else if ((data_numeric[i,]$classe == "B") || (data_numeric[i,]$classe == "b"))
{data_numeric[i,]$classe <- "2"}
else if ((data_numeric[i,]$classe == "C") || (data_numeric[i,]$classe == "c"))
{data_numeric[i,]$classe <- "3"}
else if ((data_numeric[i,]$classe == "D") || (data_numeric[i,]$classe == "d"))
{data_numeric[i,]$classe <- "4"}
else if ((data_numeric[i,]$classe == "E") || (data_numeric[i,]$classe == "e"))
{data_numeric[i,]$classe <- "5"}
else
{
#do nothing
}
}
return (data_numeric)
}
但是,我得到这个错误:
training_data_subset_numeric <- factorsToNumeric(training_data_subset)
错误:
Warning messages:
1: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
2: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
3: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
4: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
5: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
6: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
7: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
8: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
9: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
进一步检查显示列 "classe" 的 class 已转换为 "numeric":
sapply(training_data_subset_numeric, class)
avg_roll_belt var_roll_belt var_total_accel_belt amplitude_roll_belt max_roll_belt
"numeric" "numeric" "numeric" "numeric" "numeric"
var_roll_belt.1 var_accel_arm magnet_arm_x magnet_arm_y magnet_arm_z
"numeric" "numeric" "integer" "integer" "integer"
accel_dumbbell_y accel_dumbbell_z magnet_dumbbell_x gyros_dumbbell_x gyros_dumbbell_y
"integer" "integer" "integer" "numeric" "numeric"
gyros_dumbbell_z pitch_forearm gyros_forearm_x gyros_forearm_y classe
"numeric" "numeric" "numeric" "numeric" "numeric"
然而,head函数确认了上面的错误&所有值A、B、C、D、E都被错误地替换为NA
。
如果要转换 training_data_subset
的 classe
列,则无需定义自己的函数。您可以使用 LETTERS
向量:
sapply(training_data_subset[,'classe'], function(x) which(LETTERS==x))
因素不是那样起作用的。您不能像其他数据类型那样使用简单的 <-
赋值来更改值。有几种不同的方法可以改变一个因素。这是使用 levels<-
替换函数的一种方法。
这是您的海量数据中的一个示例,但阅读起来花费的时间太长了 :) 对于此数据来说很容易,因为级别已经按正确的顺序排列。
set.seed(2)
x <- sample(training$classe, 20)
x
# [1] A D C A E E A E B C C A D A B E E A B A
# Levels: A B C D E
levels(x) <- 1:5
x
# [1] 1 4 3 1 5 5 1 5 2 3 3 1 4 1 2 5 5 1 2 1
# Levels: 1 2 3 4 5
所以不需要你的功能。你可以简单地做
levels(training$classe) <- 1:5
我们可以看到新列的 str
显示了更改后的值
str(training$classe)
# Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
请注意,对于这种简单的情况,as.integer(training$classe)
也适用。虽然大多数时候不会那么容易。
我正在尝试进行一些预处理,并希望将 classe
因子值 {A,B,C,D,E}
转换为 {1,2,3,4,5}
。
classe
列的类型是factor
,我已经提供了所有步骤,见下:
#get the data
training <- read.table("http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
training_df <- data.frame(training,stringsAsFactors=FALSE)
#split to training & test sets
inTrain <- createDataPartition(y=training$classe, p=0.75, list=FALSE)
training_data <- training[inTrain,]
testing_data <- training[-inTrain,]
#subset based on columns of interest, based on previous studies
training_data_subset <- subset(training_data, select=c("avg_roll_belt","var_roll_belt","var_total_accel_belt","amplitude_roll_belt","max_roll_belt","var_roll_belt",
"var_accel_arm","magnet_arm_x","magnet_arm_y","magnet_arm_z","accel_dumbbell_y","accel_dumbbell_z","magnet_dumbbell_x","gyros_dumbbell_x",
"gyros_dumbbell_y","gyros_dumbbell_z","pitch_forearm","gyros_forearm_x","gyros_forearm_y","classe"))
#see which columns are factors, the training_data_subset#classe feature is a factor
sapply(training_data_subset, class)
#sapply output
avg_roll_belt var_roll_belt var_total_accel_belt amplitude_roll_belt max_roll_belt
"numeric" "numeric" "numeric" "numeric" "numeric"
var_roll_belt.1 var_accel_arm magnet_arm_x magnet_arm_y magnet_arm_z
"numeric" "numeric" "integer" "integer" "integer"
accel_dumbbell_y accel_dumbbell_z magnet_dumbbell_x gyros_dumbbell_x gyros_dumbbell_y
"integer" "integer" "integer" "numeric" "numeric"
gyros_dumbbell_z pitch_forearm gyros_forearm_x gyros_forearm_y classe
"numeric" "numeric" "numeric" "numeric" "factor"
我创建了一个试图替换 A=1,B=2,C=3,D=4,E=5 的函数,见下文:
factorsToNumeric <- function(data)
{
data_numeric <- data
data_numeric$classe <-as.numeric(factor(toupper(as.character(data_numeric$classe))))
#loop through the data frame based on replace values
for(i in 1:nrow(data_numeric))
{
if ((data_numeric[i,]$classe == "A") || (data_numeric[i,]$classe == "a"))
{data_numeric[i,]$classe <- "1"}
else if ((data_numeric[i,]$classe == "B") || (data_numeric[i,]$classe == "b"))
{data_numeric[i,]$classe <- "2"}
else if ((data_numeric[i,]$classe == "C") || (data_numeric[i,]$classe == "c"))
{data_numeric[i,]$classe <- "3"}
else if ((data_numeric[i,]$classe == "D") || (data_numeric[i,]$classe == "d"))
{data_numeric[i,]$classe <- "4"}
else if ((data_numeric[i,]$classe == "E") || (data_numeric[i,]$classe == "e"))
{data_numeric[i,]$classe <- "5"}
else
{
#do nothing
}
}
return (data_numeric)
}
但是,我得到这个错误:
training_data_subset_numeric <- factorsToNumeric(training_data_subset)
错误:
Warning messages:
1: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
2: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
3: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
4: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
5: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
6: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
7: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
8: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
9: In `[<-.factor`(`*tmp*`, iseq, value = "1") :
invalid factor level, NA generated
进一步检查显示列 "classe" 的 class 已转换为 "numeric":
sapply(training_data_subset_numeric, class)
avg_roll_belt var_roll_belt var_total_accel_belt amplitude_roll_belt max_roll_belt
"numeric" "numeric" "numeric" "numeric" "numeric"
var_roll_belt.1 var_accel_arm magnet_arm_x magnet_arm_y magnet_arm_z
"numeric" "numeric" "integer" "integer" "integer"
accel_dumbbell_y accel_dumbbell_z magnet_dumbbell_x gyros_dumbbell_x gyros_dumbbell_y
"integer" "integer" "integer" "numeric" "numeric"
gyros_dumbbell_z pitch_forearm gyros_forearm_x gyros_forearm_y classe
"numeric" "numeric" "numeric" "numeric" "numeric"
然而,head函数确认了上面的错误&所有值A、B、C、D、E都被错误地替换为NA
。
如果要转换 training_data_subset
的 classe
列,则无需定义自己的函数。您可以使用 LETTERS
向量:
sapply(training_data_subset[,'classe'], function(x) which(LETTERS==x))
因素不是那样起作用的。您不能像其他数据类型那样使用简单的 <-
赋值来更改值。有几种不同的方法可以改变一个因素。这是使用 levels<-
替换函数的一种方法。
这是您的海量数据中的一个示例,但阅读起来花费的时间太长了 :) 对于此数据来说很容易,因为级别已经按正确的顺序排列。
set.seed(2)
x <- sample(training$classe, 20)
x
# [1] A D C A E E A E B C C A D A B E E A B A
# Levels: A B C D E
levels(x) <- 1:5
x
# [1] 1 4 3 1 5 5 1 5 2 3 3 1 4 1 2 5 5 1 2 1
# Levels: 1 2 3 4 5
所以不需要你的功能。你可以简单地做
levels(training$classe) <- 1:5
我们可以看到新列的 str
显示了更改后的值
str(training$classe)
# Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
请注意,对于这种简单的情况,as.integer(training$classe)
也适用。虽然大多数时候不会那么容易。