创建一个函数以从数据中添加因子列
Create a function to add factor column from data
我打算读取来自跟踪数据的多个数据集。
这些数据集的开头有若干无关的行,我会在读取之前或之后直接把它们跳过。
需要保留的重要结构是 3 列中的 x、y 和 t 数据。
我还需要把 "Label: Caja i" 中的标签添加到一个新列,使其成为 ID 列。每个 i 对应一个实验单元。在完整的数据集中 i 最大到 24,并且不一定按顺序出现。
接下来的问题是如何整理数据集,删除 x、y 和 t headers 并添加 "Caja" 标签。
这是我的数据子集,请注意各实验单元记录之间的空行(NA 行)。
dput(cucu)
structure(list(X = c("Key Images", "Title", "0:00:00:00", "",
"Track", "Label :", "Coords (x,y:px; t:time)", "x", "0", "-2",
"-11", "-5", "2", "5", "4", "2", "6", "6", "6", "6", "6", "6",
"7", "-7", "-29", "-27", "-10", "-2", "5", "7", "7", "8", "13",
"", "Track", "Label :", "Coords (x,y:px; t:time)", "x", "0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "26",
"23", "29", "29", "29", "19", "13", "19", "25", "19", "6", "-1",
"", "Track", "Label :", "Coords (x,y:px; t:time)", "x", "0",
"3", "6", "17"), X.1 = c("", "Time", "0:00:00:00", "", "", "Caja 4",
"", "y", "0", "10", "-11", "2", "17", "29", "25", "12", "-9",
"0", "12", "28", "54", "84", "96", "105", "114", "114", "111",
"112", "111", "116", "120", "120", "127", "", "", "Caja 5", "",
"y", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "44", "27", "12", "1", "0", "2", "3", "2", "-2", "1", "2",
"7", "", "", "Caja 6", "", "y", "0", "14", "32", "38"), X.2 = c("",
"", "", "", "", "", "", "t", "0:00:00:00", "0:00:00:14", "0:00:38:71",
"0:00:38:85", "0:00:39:00", "0:00:39:14", "0:00:39:28", "0:00:39:42",
"0:00:39:57", "0:00:39:71", "0:00:39:85", "0:00:40:00", "0:00:40:14",
"0:00:40:28", "0:00:40:42", "0:00:40:57", "0:00:40:71", "0:00:40:85",
"0:00:41:00", "0:00:41:14", "0:00:41:28", "0:00:41:42", "0:00:41:57",
"0:00:41:71", "0:00:41:85", "", "", "", "", "t", "0:00:00:00",
"0:00:00:14", "0:00:00:28", "0:00:00:42", "0:00:00:57", "0:00:00:71",
"0:00:00:85", "0:00:01:00", "0:00:01:14", "0:00:01:28", "0:00:01:42",
"0:00:01:57", "0:00:01:71", "0:00:40:28", "0:00:40:42", "0:00:40:57",
"0:00:40:71", "0:00:40:85", "0:00:41:00", "0:00:41:14", "0:00:41:28",
"0:00:41:42", "0:00:41:57", "0:00:41:71", "0:00:41:85", "", "",
"", "", "t", "0:00:00:00", "0:00:00:14", "0:00:00:28", "0:00:00:42"
)), .Names = c("X", "X.1", "X.2"), row.names = c(1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 280L, 281L, 282L, 283L, 284L, 285L,
286L, 287L, 288L, 289L, 290L, 291L, 292L, 293L, 294L, 295L, 296L,
297L, 298L, 299L, 300L, 301L, 302L, 303L, 304L, 305L, 306L, 307L,
308L, 309L, 310L, 311L, 312L, 313L, 314L, 315L, 316L, 317L, 318L,
319L, 320L, 590L, 591L, 592L, 593L, 594L, 595L, 596L, 597L, 598L,
599L, 600L, 601L, 602L, 603L, 604L, 605L, 606L, 607L, 608L, 609L,
610L), class = "data.frame")
这是一个伪造的预期输出,其中 "Caja" 作为所提到的每个级别的 ID 列。
dput(df)
structure(list(Caja = c(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6), x = c(-3.07268932779439,
0.484266873196818, 0.917705630758503, 1.03584730496379, 0.838625757843941,
0.647284457113794, 1.85357454343089, 1.33693715439301, 0.293179064796777,
0.00633063048373611, 0.612938289390049, -0.50539082918022, 0.251496480385252,
-1.75717878846605, 1.00013191487416, -1.52759310599809, 0.372782154041601,
-1.43694367534352, 1.02558031999024, 1.35942769142916, -0.442425921092658,
1.08025883023299, 0.715069454457284, -0.479919179648368, -1.02735585729109,
-0.242179335814154, 1.12564595565253, 1.98471585124264, 0.804050335171037,
-0.190691648766175), y = c(-0.897147705248384, 0.226793982297109,
0.10644613224225, 0.99551047056361, -0.526239601986565, 0.75860661193569,
-0.481560713881324, 0.280105671338355, 0.997811189730975, 0.491707670881505,
-0.279446168649783, 0.440627202618172, 0.95354565418694, -1.70769016339517,
-1.53381426766088, -0.0544834794352239, -1.20106285132598, -0.333373160001734,
-1.20248240636362, -1.04874198341736, 0.181810095042002, -0.423574476932895,
-1.62483188394934, 1.21775369607968, 1.14458843271056, 0.373070235147697,
0.409431464558393, -0.213415686981125, -0.347800469559547, 0.185367109684458
), t = c(2.32451959770308, 0.692272863593035, -1.89883997859401,
-0.223727358926319, 0.995802018891691, 0.732823865155816, 0.961381338943591,
0.793294686992325, -1.54254907957044, 0.378897583645236, -0.818413418720775,
-0.502999381649839, 0.103448616784216, -0.455956879543311, -1.33974211593966,
1.33950932984407, 0.281470433388303, -0.670832974337081, -0.654599666494088,
-0.486434674492362, -0.0600488930758214, 0.302971586427514, 1.91331234891505,
-0.638758602719345, 0.975063194257583, -0.544269357921009, -0.129239918745744,
0.167815176992696, 1.64640395321812, -1.26864285133868)), .Names = c("Caja",
"x", "y", "t"), row.names = c(NA, -30L), class = "data.frame")
因为我打算为一堆数据集做这个,我想我需要写一个函数。我认为每个 Caja 的记录之间的空行可能有助于定义每个记录的长度
# Tidy a raw tracking export into a data frame with columns Caja, x, y, t.
#
# data.frame: a data frame with columns named X, X.1 and X.2. Each track is
#   expected to start with a row whose X.1 value contains "Caja" (the track
#   label), followed by an "x"/"y"/"t" header row and then the data rows.
# Returns a data frame with one row per data row of the input:
#   Caja - factor holding the label of the track the row belongs to, with
#          levels in order of first appearance;
#   x, y, t - the values of X, X.1 and X.2 as character vectors (no numeric
#          or time conversion is attempted here).
# Rows that appear before the first label, rows with nothing in X.2, label
# rows and header rows are dropped.
tidy <- function(data.frame) {
  stopifnot(
    is.data.frame(data.frame),
    all(c("X", "X.1", "X.2") %in% names(data.frame))
  )

  x_col <- as.character(data.frame[["X"]])
  y_col <- as.character(data.frame[["X.1"]])
  t_col <- as.character(data.frame[["X.2"]])

  # Label rows, e.g. "Caja 4" "Caja 5" "Caja 6"
  is_label <- grepl("Caja", y_col)
  caja <- y_col[is_label]

  # Running count of labels seen so far = index of the track a row belongs to
  track_id <- cumsum(is_label)

  # Header rows read "x" / "y" / "t"; %in% keeps the test NA-safe
  is_header <- trimws(x_col) %in% "x" & trimws(t_col) %in% "t"

  # A data row sits inside a track and has something in the time column
  is_value <- track_id > 0 & grepl("\\w", t_col) & !is_label & !is_header

  # The parameter shadows the name `data.frame`, so call the constructor
  # through its namespace to keep the intent unambiguous.
  base::data.frame(
    Caja = factor(caja[track_id[is_value]], levels = unique(caja)),
    x = x_col[is_value],
    y = y_col[is_value],
    t = t_col[is_value],
    stringsAsFactors = FALSE
  )
}
# Tidy a raw tracking export into one row per observation plus a Caja label.
#
# df: a data frame assumed to have exactly three columns (x, y and t values
#   stored as text). Each track is expected to start with a row holding its
#   "Caja ..." label in column 2, followed by an "x"/"y"/"t" header row and
#   then the data rows.
# Returns a data frame with columns x, y, t (values unchanged from the input)
# and Caja (character label of the track each row belongs to; NA for rows
# that appear before the first label). Row names are reset to 1..n.
tidy_df <- function(df) {
  # Rows with any word character in column 3: the "t" header rows and values.
  # The backslash must be doubled; "\w" is not a valid escape in an R string.
  val_indx <- grepl("\\w", df[[3]])
  # Rows carrying a Caja label in column 2
  label_indx <- grepl("Caja", df[[2]])
  # Keep only headers, values and labels
  newdf <- df[val_indx | label_indx, ]

  # One label per track, in order of appearance. Labels are deliberately not
  # de-duplicated: a repeated label must keep its own slot so that the running
  # count below still points at the right entry.
  is_label <- grepl("Caja", newdf[[2]])
  labels <- as.character(newdf[[2]][is_label])
  track <- cumsum(is_label)

  # Rows before the first label have track == 0; give them NA rather than
  # letting zero indices silently shorten the label vector.
  caja <- rep(NA_character_, nrow(newdf))
  labelled <- track > 0
  caja[labelled] <- labels[track[labelled]]
  newdf$Caja <- caja

  # Remove non-value rows (labels and headers). A logical mask is used because
  # negative indexing with an empty grep() result would drop every row.
  newdf <- newdf[!grepl("Label|x", newdf[[1]]), ]

  # Not necessary but makes the output look better
  row.names(newdf) <- NULL
  names(newdf) <- c("x", "y", "t", "Caja")
  newdf
}
# Preview the first rows of the tidied sample data `cucu`;
# the console output is reproduced in the comments below.
head(tidy_df(cucu))
# x y t Caja
# 1 0 0 0:00:00:00 Caja 4
# 2 -2 10 0:00:00:14 Caja 4
# 3 -11 -11 0:00:38:71 Caja 4
# 4 -5 2 0:00:38:85 Caja 4
# 5 2 17 0:00:39:00 Caja 4
# 6 5 29 0:00:39:14 Caja 4
我打算读取来自跟踪数据的多个数据集。
这些数据集的开头有若干无关的行,我会在读取之前或之后直接把它们跳过。
需要保留的重要结构是 3 列中的 x、y 和 t 数据。
我还需要把 "Label: Caja i" 中的标签添加到一个新列,使其成为 ID 列。每个 i 对应一个实验单元。在完整的数据集中 i 最大到 24,并且不一定按顺序出现。
接下来的问题是如何整理数据集,删除 x、y 和 t headers 并添加 "Caja" 标签。
这是我的数据子集,请注意各实验单元记录之间的空行(NA 行)。
dput(cucu)
structure(list(X = c("Key Images", "Title", "0:00:00:00", "",
"Track", "Label :", "Coords (x,y:px; t:time)", "x", "0", "-2",
"-11", "-5", "2", "5", "4", "2", "6", "6", "6", "6", "6", "6",
"7", "-7", "-29", "-27", "-10", "-2", "5", "7", "7", "8", "13",
"", "Track", "Label :", "Coords (x,y:px; t:time)", "x", "0",
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "26",
"23", "29", "29", "29", "19", "13", "19", "25", "19", "6", "-1",
"", "Track", "Label :", "Coords (x,y:px; t:time)", "x", "0",
"3", "6", "17"), X.1 = c("", "Time", "0:00:00:00", "", "", "Caja 4",
"", "y", "0", "10", "-11", "2", "17", "29", "25", "12", "-9",
"0", "12", "28", "54", "84", "96", "105", "114", "114", "111",
"112", "111", "116", "120", "120", "127", "", "", "Caja 5", "",
"y", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
"0", "44", "27", "12", "1", "0", "2", "3", "2", "-2", "1", "2",
"7", "", "", "Caja 6", "", "y", "0", "14", "32", "38"), X.2 = c("",
"", "", "", "", "", "", "t", "0:00:00:00", "0:00:00:14", "0:00:38:71",
"0:00:38:85", "0:00:39:00", "0:00:39:14", "0:00:39:28", "0:00:39:42",
"0:00:39:57", "0:00:39:71", "0:00:39:85", "0:00:40:00", "0:00:40:14",
"0:00:40:28", "0:00:40:42", "0:00:40:57", "0:00:40:71", "0:00:40:85",
"0:00:41:00", "0:00:41:14", "0:00:41:28", "0:00:41:42", "0:00:41:57",
"0:00:41:71", "0:00:41:85", "", "", "", "", "t", "0:00:00:00",
"0:00:00:14", "0:00:00:28", "0:00:00:42", "0:00:00:57", "0:00:00:71",
"0:00:00:85", "0:00:01:00", "0:00:01:14", "0:00:01:28", "0:00:01:42",
"0:00:01:57", "0:00:01:71", "0:00:40:28", "0:00:40:42", "0:00:40:57",
"0:00:40:71", "0:00:40:85", "0:00:41:00", "0:00:41:14", "0:00:41:28",
"0:00:41:42", "0:00:41:57", "0:00:41:71", "0:00:41:85", "", "",
"", "", "t", "0:00:00:00", "0:00:00:14", "0:00:00:28", "0:00:00:42"
)), .Names = c("X", "X.1", "X.2"), row.names = c(1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 280L, 281L, 282L, 283L, 284L, 285L,
286L, 287L, 288L, 289L, 290L, 291L, 292L, 293L, 294L, 295L, 296L,
297L, 298L, 299L, 300L, 301L, 302L, 303L, 304L, 305L, 306L, 307L,
308L, 309L, 310L, 311L, 312L, 313L, 314L, 315L, 316L, 317L, 318L,
319L, 320L, 590L, 591L, 592L, 593L, 594L, 595L, 596L, 597L, 598L,
599L, 600L, 601L, 602L, 603L, 604L, 605L, 606L, 607L, 608L, 609L,
610L), class = "data.frame")
这是一个伪造的预期输出,其中 "Caja" 作为所提到的每个级别的 ID 列。
dput(df)
structure(list(Caja = c(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6), x = c(-3.07268932779439,
0.484266873196818, 0.917705630758503, 1.03584730496379, 0.838625757843941,
0.647284457113794, 1.85357454343089, 1.33693715439301, 0.293179064796777,
0.00633063048373611, 0.612938289390049, -0.50539082918022, 0.251496480385252,
-1.75717878846605, 1.00013191487416, -1.52759310599809, 0.372782154041601,
-1.43694367534352, 1.02558031999024, 1.35942769142916, -0.442425921092658,
1.08025883023299, 0.715069454457284, -0.479919179648368, -1.02735585729109,
-0.242179335814154, 1.12564595565253, 1.98471585124264, 0.804050335171037,
-0.190691648766175), y = c(-0.897147705248384, 0.226793982297109,
0.10644613224225, 0.99551047056361, -0.526239601986565, 0.75860661193569,
-0.481560713881324, 0.280105671338355, 0.997811189730975, 0.491707670881505,
-0.279446168649783, 0.440627202618172, 0.95354565418694, -1.70769016339517,
-1.53381426766088, -0.0544834794352239, -1.20106285132598, -0.333373160001734,
-1.20248240636362, -1.04874198341736, 0.181810095042002, -0.423574476932895,
-1.62483188394934, 1.21775369607968, 1.14458843271056, 0.373070235147697,
0.409431464558393, -0.213415686981125, -0.347800469559547, 0.185367109684458
), t = c(2.32451959770308, 0.692272863593035, -1.89883997859401,
-0.223727358926319, 0.995802018891691, 0.732823865155816, 0.961381338943591,
0.793294686992325, -1.54254907957044, 0.378897583645236, -0.818413418720775,
-0.502999381649839, 0.103448616784216, -0.455956879543311, -1.33974211593966,
1.33950932984407, 0.281470433388303, -0.670832974337081, -0.654599666494088,
-0.486434674492362, -0.0600488930758214, 0.302971586427514, 1.91331234891505,
-0.638758602719345, 0.975063194257583, -0.544269357921009, -0.129239918745744,
0.167815176992696, 1.64640395321812, -1.26864285133868)), .Names = c("Caja",
"x", "y", "t"), row.names = c(NA, -30L), class = "data.frame")
因为我打算为一堆数据集做这个,我想我需要写一个函数。我认为每个 Caja 的记录之间的空行可能有助于定义每个记录的长度
# Tidy a raw tracking export into a data frame with columns Caja, x, y, t.
#
# data.frame: a data frame with columns named X, X.1 and X.2. Each track is
#   expected to start with a row whose X.1 value contains "Caja" (the track
#   label), followed by an "x"/"y"/"t" header row and then the data rows.
# Returns a data frame with one row per data row of the input:
#   Caja - factor holding the label of the track the row belongs to, with
#          levels in order of first appearance;
#   x, y, t - the values of X, X.1 and X.2 as character vectors (no numeric
#          or time conversion is attempted here).
# Rows that appear before the first label, rows with nothing in X.2, label
# rows and header rows are dropped.
tidy <- function(data.frame) {
  stopifnot(
    is.data.frame(data.frame),
    all(c("X", "X.1", "X.2") %in% names(data.frame))
  )

  x_col <- as.character(data.frame[["X"]])
  y_col <- as.character(data.frame[["X.1"]])
  t_col <- as.character(data.frame[["X.2"]])

  # Label rows, e.g. "Caja 4" "Caja 5" "Caja 6"
  is_label <- grepl("Caja", y_col)
  caja <- y_col[is_label]

  # Running count of labels seen so far = index of the track a row belongs to
  track_id <- cumsum(is_label)

  # Header rows read "x" / "y" / "t"; %in% keeps the test NA-safe
  is_header <- trimws(x_col) %in% "x" & trimws(t_col) %in% "t"

  # A data row sits inside a track and has something in the time column
  is_value <- track_id > 0 & grepl("\\w", t_col) & !is_label & !is_header

  # The parameter shadows the name `data.frame`, so call the constructor
  # through its namespace to keep the intent unambiguous.
  base::data.frame(
    Caja = factor(caja[track_id[is_value]], levels = unique(caja)),
    x = x_col[is_value],
    y = y_col[is_value],
    t = t_col[is_value],
    stringsAsFactors = FALSE
  )
}
# Tidy a raw tracking export into one row per observation plus a Caja label.
#
# df: a data frame assumed to have exactly three columns (x, y and t values
#   stored as text). Each track is expected to start with a row holding its
#   "Caja ..." label in column 2, followed by an "x"/"y"/"t" header row and
#   then the data rows.
# Returns a data frame with columns x, y, t (values unchanged from the input)
# and Caja (character label of the track each row belongs to; NA for rows
# that appear before the first label). Row names are reset to 1..n.
tidy_df <- function(df) {
  # Rows with any word character in column 3: the "t" header rows and values.
  # The backslash must be doubled; "\w" is not a valid escape in an R string.
  val_indx <- grepl("\\w", df[[3]])
  # Rows carrying a Caja label in column 2
  label_indx <- grepl("Caja", df[[2]])
  # Keep only headers, values and labels
  newdf <- df[val_indx | label_indx, ]

  # One label per track, in order of appearance. Labels are deliberately not
  # de-duplicated: a repeated label must keep its own slot so that the running
  # count below still points at the right entry.
  is_label <- grepl("Caja", newdf[[2]])
  labels <- as.character(newdf[[2]][is_label])
  track <- cumsum(is_label)

  # Rows before the first label have track == 0; give them NA rather than
  # letting zero indices silently shorten the label vector.
  caja <- rep(NA_character_, nrow(newdf))
  labelled <- track > 0
  caja[labelled] <- labels[track[labelled]]
  newdf$Caja <- caja

  # Remove non-value rows (labels and headers). A logical mask is used because
  # negative indexing with an empty grep() result would drop every row.
  newdf <- newdf[!grepl("Label|x", newdf[[1]]), ]

  # Not necessary but makes the output look better
  row.names(newdf) <- NULL
  names(newdf) <- c("x", "y", "t", "Caja")
  newdf
}
# Preview the first rows of the tidied sample data `cucu`;
# the console output is reproduced in the comments below.
head(tidy_df(cucu))
# x y t Caja
# 1 0 0 0:00:00:00 Caja 4
# 2 -2 10 0:00:00:14 Caja 4
# 3 -11 -11 0:00:38:71 Caja 4
# 4 -5 2 0:00:38:85 Caja 4
# 5 2 17 0:00:39:00 Caja 4
# 6 5 29 0:00:39:14 Caja 4