通过按输入数据帧的值索引输出数据帧来复制数据
Copying data by indexing the output dataframe by the input dataframe's value
我有一个已读入数据框的 csv 文件(基本上是 MALLET 主题建模功能的主题输出),如下所示:
函数 dput 给出的形式为:
structure(list(V1 = structure(1:10, .Label = c("file:/C:/mallet/my-data/dickens-greatexpectations.txt",
"file:/C:/mallet/my-data/dickens-olivertwist.txt", "file:/C:/mallet/my-data/emma-austen.txt",
"file:/C:/mallet/my-data/hardy-judetheobscure.txt", "file:/C:/mallet/my-data/hardy-pairofblueyes.txt",
"file:/C:/mallet/my-data/jacob-room-woolf.txt", "file:/C:/mallet/my-data/melville-moby-dick.txt",
"file:/C:/mallet/my-data/pride-and-prejudice.txt", "file:/C:/mallet/my-data/shakespeare-asyoulikieit.txt",
"file:/C:/mallet/my-data/shakespeare-hamlet.txt"), class = "factor"),
V2 = c(9L, 11L, 0L, 5L, 10L, 7L, 19L, 18L, 14L, 1L), V3 = c(0.239748159,
0.309737525, 0.346177616, 0.255417865, 0.247361424, 0.363149847,
0.244273675, 0.359370464, 0.623336798, 0.529540077), V4 = c(8L,
2L, 8L, 8L, 8L, 2L, 6L, 15L, 1L, 14L), V5 = c(0.153314326,
0.180447378, 0.216411342, 0.222792466, 0.204428308, 0.106517068,
0.142443012, 0.225841963, 0.060758836, 0.10636646), V6 = c(12L,
8L, 15L, 2L, 2L, 17L, 3L, 8L, 12L, 3L), V7 = c(0.133753069,
0.119245504, 0.189293205, 0.115636944, 0.125618736, 0.100731465,
0.1118036, 0.214664183, 0.058264033, 0.081965395), V8 = c(2L,
12L, 16L, 12L, 12L, 12L, 17L, 16L, 3L, 12L), V9 = c(0.126641324,
0.106343402, 0.152679976, 0.114001734, 0.092237676, 0.097094801,
0.111000484, 0.116048098, 0.055665281, 0.069469092), V10 = c(13L,
15L, 2L, 4L, 4L, 8L, 4L, 2L, 8L, 15L), V11 = c(0.102613447,
0.088533751, 0.035143082, 0.072097009, 0.087287789, 0.081762956,
0.089448381, 0.040344043, 0.048284823, 0.046398994), V12 = c(15L,
17L, 4L, 15L, 15L, 16L, 12L, 4L, 15L, 8L), V13 = c(0.089197751,
0.061352267, 0.022595325, 0.060847551, 0.067365514, 0.067877511,
0.069480505, 0.01936756, 0.04454262, 0.035159716), V14 = c(16L,
4L, 12L, 16L, 16L, 3L, 2L, 12L, 16L, 17L), V15 = c(0.045022571,
0.043479293, 0.018194247, 0.044751566, 0.05556351, 0.06275312,
0.051052852, 0.012810897, 0.032276507, 0.029392192), V16 = c(17L,
16L, 17L, 17L, 17L, 13L, 15L, 3L, 17L, 16L), V17 = c(0.040017423,
0.036038817, 0.005084651, 0.034467473, 0.04232972, 0.037379122,
0.043670789, 0.003133166, 0.025935551, 0.025842946), V18 = c(4L,
13L, 18L, 3L, 3L, 4L, 13L, 17L, 2L, 2L), V19 = c(0.030276392,
0.033395073, 0.004279347, 0.032379132, 0.02829822, 0.024568146,
0.042713651, 0.002576696, 0.018035343, 0.021258503), V20 = c(3L,
3L, 3L, 13L, 13L, 15L, 8L, 13L, 4L, 4L), V21 = c(0.014817455,
0.010440413, 0.003942243, 0.016755979, 0.014287175, 0.020600876,
0.039325163, 0.001657312, 0.010343035, 0.019631766), V22 = c(6L,
6L, 13L, 10L, 6L, 6L, 16L, 0L, 13L, 13L), V23 = c(0.011443732,
0.006704344, 0.002256723, 0.007614563, 0.010114543, 0.016385652,
0.034396453, 0.001657312, 0.009303534, 0.014603668)), .Names = c("V1",
"V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11",
"V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20",
"V21", "V22", "V23"), class = "data.frame", row.names = c(NA,
-10L))
这里显示的是文本名称、相应的主题编号以及各主题在该文本中所占的百分比。例如在 row1 中,col1 是文本名称,col2 是主题 9,col3 是主题 9 的百分比,col4 是主题 8,col5 是主题 8 的百分比,依此类推。主题没有按特定顺序排列。我想做的是创建一个新的数据框,其中行号表示文本编号,列号表示主题编号,因此 row1 X col1 将是主题 1 在文本 1 中的百分比,依此类推(如果不存在则为 NA)。我的代码是一个简单的嵌套循环:
# Read the topic output from topics.csv. There is no header row, and empty or
# single-space cells are read as NA.
topics <-read.csv("topics.csv", sep = ",", na.strings=c(""," ","NA"), header = FALSE)
row <- nrow(topics)
df <- data.frame(matrix(ncol = 21, nrow = row)) #initialize an empty dataframe of fixed size
# For each text (row i), write the value in column j+1 into the output column
# numbered topics[i,j] + 1.
for (i in 1:nrow(topics)) {
# NOTE(review): j visits every column 2..22, not only the topic-number columns
# (2, 4, ..., 22). On the odd columns topics[i,j] is a percentage, so
# topics[i,j] + 1 is a fractional index between 1 and 2, which R truncates to 1;
# column 1 is then overwritten with the following topic number. This is
# consistent with the stray values reported in column 1 of the output.
# Stepping j by 2, e.g. seq(2, 22, by = 2), would visit only the topic columns.
for (j in 2:22) {
df[i,topics[i,j] + 1] <- topics[i,j+1]
}
}
我在这里所做的是:根据主题编号访问新数据框中对应的列,并把相关的百分比值放入其中。
但它给出的输出是以下形式:
dput 的输出:
structure(list(X1 = c(6, 6, 13, 10, 6, 6, 16, 0.001657312, 13,
13), X2 = c(NA, NA, NA, NA, NA, NA, NA, NA, 0.060758836, 0.529540077
), X3 = c(0.126641324, 0.180447378, 0.035143082, 0.115636944,
0.125618736, 0.106517068, 0.051052852, 0.040344043, 0.018035343,
0.021258503), X4 = c(0.014817455, 0.010440413, 0.003942243, 0.032379132,
0.02829822, 0.06275312, 0.1118036, 0.003133166, 0.055665281,
0.081965395), X5 = c(0.030276392, 0.043479293, 0.022595325, 0.072097009,
0.087287789, 0.024568146, 0.089448381, 0.01936756, 0.010343035,
0.019631766), X6 = c(NA, NA, NA, 0.255417865, NA, NA, NA, NA,
NA, NA), X7 = c(0.011443732, 0.006704344, NA, NA, 0.010114543,
0.016385652, 0.142443012, NA, NA, NA), X8 = c(NA, NA, NA, NA,
NA, 0.363149847, NA, NA, NA, NA), X9 = c(0.153314326, 0.119245504,
0.216411342, 0.222792466, 0.204428308, 0.081762956, 0.039325163,
0.214664183, 0.048284823, 0.035159716), X10 = c(0.239748159,
NA, NA, NA, NA, NA, NA, NA, NA, NA), X11 = c(NA, NA, NA, 0.007614563,
0.247361424, NA, NA, NA, NA, NA), X12 = c(NA, 0.309737525, NA,
NA, NA, NA, NA, NA, NA, NA), X13 = c(0.133753069, 0.106343402,
0.018194247, 0.114001734, 0.092237676, 0.097094801, 0.069480505,
0.012810897, 0.058264033, 0.069469092), X14 = c(0.102613447,
0.033395073, 0.002256723, 0.016755979, 0.014287175, 0.037379122,
0.042713651, 0.001657312, 0.009303534, 0.014603668), X15 = c(NA,
NA, NA, NA, NA, NA, NA, NA, 0.623336798, 0.10636646), X16 = c(0.089197751,
0.088533751, 0.189293205, 0.060847551, 0.067365514, 0.020600876,
0.043670789, 0.225841963, 0.04454262, 0.046398994), X17 = c(0.045022571,
0.036038817, 0.152679976, 0.044751566, 0.05556351, 0.067877511,
0.034396453, 0.116048098, 0.032276507, 0.025842946), X18 = c(0.040017423,
0.061352267, 0.005084651, 0.034467473, 0.04232972, 0.100731465,
0.111000484, 0.002576696, 0.025935551, 0.029392192), X19 = c(NA,
NA, 0.004279347, NA, NA, NA, NA, 0.359370464, NA, NA), X20 = c(NA,
NA, NA, NA, NA, NA, 0.244273675, NA, NA, NA), X21 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("X1", "X2", "X3",
"X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13",
"X14", "X15", "X16", "X17", "X18", "X19", "X20", "X21"), row.names = c(NA,
-10L), class = "data.frame")
问题在于,第 1 列本应存放主题 0 的值,却出现了虚拟值。在原始输入数据框 topics 的第一行中,主题 0 没有相应的条目,因此新数据框的 [1,0+1] 位置应该是 NA,而不是 6.000。
类似地,在第 3 行中,主题 0 有对应的值,却没有出现在输出矩阵中。有人可以帮忙吗?另外,有没有不使用循环就能做到这一点的方法?
我们根据主题编号列创建一个行/列索引('indx1')。删除第一列(即文本名称)之后,主题编号列在数据集中每隔一列出现一次,可以用逻辑索引('indx')选出这些列。由于主题编号从 0 开始,先将其加 1,以便用作列索引。
然后创建一个全为 NA 的 matrix,其行数为 'df1' 的 nrow,列数(ncol)为 'indx1' 中列索引的最大值。将其转换为 data.frame,并使用 'indx1' 把对应位置的 NA 替换为百分比值。
# Columns after the text name alternate: topic number, percentage, topic
# number, ... The logical pattern is recycled across those columns, so `indx`
# picks the topic-number columns and `!indx` picks the percentage columns.
indx <- c(TRUE, FALSE)
# Topic numbers start at 0, so shift them by one to use them as column
# positions. The shift is applied to a copy: 'df1' itself is left untouched,
# so re-running the snippet does not shift the topic numbers a second time.
topic_cols <- df1[-1][indx] + 1
# Two-column (row, column) index. unlist() walks the topic columns one after
# another, so the row numbers are repeated once per topic column.
indx1 <- cbind(
  rep(seq_len(nrow(df1)), times = ncol(topic_cols)),
  unlist(topic_cols)
)
# All-NA numeric data frame, wide enough for the highest topic number seen.
df2 <- as.data.frame(
  matrix(NA_real_, nrow = nrow(df1), ncol = max(indx1[, 2]))
)
# Place each percentage into its (text, topic + 1) cell.
df2[indx1] <- unlist(df1[-1][!indx])
head(df2, 2)
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
#1 NA NA 0.1266413 0.01481746 0.03027639 NA 0.011443732 NA 0.1533143 0.2397482
#2 NA NA 0.1804474 0.01044041 0.04347929 NA 0.006704344 NA 0.1192455 NA
# V11 V12 V13 V14 V15 V16 V17 V18 V19
#1 NA NA 0.1337531 0.10261345 NA 0.08919775 0.04502257 0.04001742 NA
#2 NA 0.3097375 0.1063434 0.03339507 NA 0.08853375 0.03603882 0.06135227 NA
# V20
#1 NA
#2 NA
OP 数据的输出
head(Out,2)
# X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
# 1 6 NA 0.1266413 0.01481746 0.03027639 NA 0.011443732 NA 0.1533143 0.2397482
# 2 6 NA 0.1804474 0.01044041 0.04347929 NA 0.006704344 NA 0.1192455 NA
# X11 X12 X13 X14 X15 X16 X17 X18 X19
#1 NA NA 0.1337531 0.10261345 NA 0.08919775 0.04502257 0.04001742 NA
#2 NA 0.3097375 0.1063434 0.03339507 NA 0.08853375 0.03603882 0.06135227 NA
# X20 X21
#1 NA NA
#2 NA NA
我有一个已读入数据框的 csv 文件(基本上是 MALLET 主题建模功能的主题输出),如下所示:
函数 dput 给出的形式为:
structure(list(V1 = structure(1:10, .Label = c("file:/C:/mallet/my-data/dickens-greatexpectations.txt",
"file:/C:/mallet/my-data/dickens-olivertwist.txt", "file:/C:/mallet/my-data/emma-austen.txt",
"file:/C:/mallet/my-data/hardy-judetheobscure.txt", "file:/C:/mallet/my-data/hardy-pairofblueyes.txt",
"file:/C:/mallet/my-data/jacob-room-woolf.txt", "file:/C:/mallet/my-data/melville-moby-dick.txt",
"file:/C:/mallet/my-data/pride-and-prejudice.txt", "file:/C:/mallet/my-data/shakespeare-asyoulikieit.txt",
"file:/C:/mallet/my-data/shakespeare-hamlet.txt"), class = "factor"),
V2 = c(9L, 11L, 0L, 5L, 10L, 7L, 19L, 18L, 14L, 1L), V3 = c(0.239748159,
0.309737525, 0.346177616, 0.255417865, 0.247361424, 0.363149847,
0.244273675, 0.359370464, 0.623336798, 0.529540077), V4 = c(8L,
2L, 8L, 8L, 8L, 2L, 6L, 15L, 1L, 14L), V5 = c(0.153314326,
0.180447378, 0.216411342, 0.222792466, 0.204428308, 0.106517068,
0.142443012, 0.225841963, 0.060758836, 0.10636646), V6 = c(12L,
8L, 15L, 2L, 2L, 17L, 3L, 8L, 12L, 3L), V7 = c(0.133753069,
0.119245504, 0.189293205, 0.115636944, 0.125618736, 0.100731465,
0.1118036, 0.214664183, 0.058264033, 0.081965395), V8 = c(2L,
12L, 16L, 12L, 12L, 12L, 17L, 16L, 3L, 12L), V9 = c(0.126641324,
0.106343402, 0.152679976, 0.114001734, 0.092237676, 0.097094801,
0.111000484, 0.116048098, 0.055665281, 0.069469092), V10 = c(13L,
15L, 2L, 4L, 4L, 8L, 4L, 2L, 8L, 15L), V11 = c(0.102613447,
0.088533751, 0.035143082, 0.072097009, 0.087287789, 0.081762956,
0.089448381, 0.040344043, 0.048284823, 0.046398994), V12 = c(15L,
17L, 4L, 15L, 15L, 16L, 12L, 4L, 15L, 8L), V13 = c(0.089197751,
0.061352267, 0.022595325, 0.060847551, 0.067365514, 0.067877511,
0.069480505, 0.01936756, 0.04454262, 0.035159716), V14 = c(16L,
4L, 12L, 16L, 16L, 3L, 2L, 12L, 16L, 17L), V15 = c(0.045022571,
0.043479293, 0.018194247, 0.044751566, 0.05556351, 0.06275312,
0.051052852, 0.012810897, 0.032276507, 0.029392192), V16 = c(17L,
16L, 17L, 17L, 17L, 13L, 15L, 3L, 17L, 16L), V17 = c(0.040017423,
0.036038817, 0.005084651, 0.034467473, 0.04232972, 0.037379122,
0.043670789, 0.003133166, 0.025935551, 0.025842946), V18 = c(4L,
13L, 18L, 3L, 3L, 4L, 13L, 17L, 2L, 2L), V19 = c(0.030276392,
0.033395073, 0.004279347, 0.032379132, 0.02829822, 0.024568146,
0.042713651, 0.002576696, 0.018035343, 0.021258503), V20 = c(3L,
3L, 3L, 13L, 13L, 15L, 8L, 13L, 4L, 4L), V21 = c(0.014817455,
0.010440413, 0.003942243, 0.016755979, 0.014287175, 0.020600876,
0.039325163, 0.001657312, 0.010343035, 0.019631766), V22 = c(6L,
6L, 13L, 10L, 6L, 6L, 16L, 0L, 13L, 13L), V23 = c(0.011443732,
0.006704344, 0.002256723, 0.007614563, 0.010114543, 0.016385652,
0.034396453, 0.001657312, 0.009303534, 0.014603668)), .Names = c("V1",
"V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11",
"V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20",
"V21", "V22", "V23"), class = "data.frame", row.names = c(NA,
-10L))
这里显示的是文本名称、相应的主题编号以及各主题在该文本中所占的百分比。例如在 row1 中,col1 是文本名称,col2 是主题 9,col3 是主题 9 的百分比,col4 是主题 8,col5 是主题 8 的百分比,依此类推。主题没有按特定顺序排列。我想做的是创建一个新的数据框,其中行号表示文本编号,列号表示主题编号,因此 row1 X col1 将是主题 1 在文本 1 中的百分比,依此类推(如果不存在则为 NA)。我的代码是一个简单的嵌套循环:
# Read the topic output from topics.csv. There is no header row, and empty or
# single-space cells are read as NA.
topics <-read.csv("topics.csv", sep = ",", na.strings=c(""," ","NA"), header = FALSE)
row <- nrow(topics)
df <- data.frame(matrix(ncol = 21, nrow = row)) #initialize an empty dataframe of fixed size
# For each text (row i), write the value in column j+1 into the output column
# numbered topics[i,j] + 1.
for (i in 1:nrow(topics)) {
# NOTE(review): j visits every column 2..22, not only the topic-number columns
# (2, 4, ..., 22). On the odd columns topics[i,j] is a percentage, so
# topics[i,j] + 1 is a fractional index between 1 and 2, which R truncates to 1;
# column 1 is then overwritten with the following topic number. This is
# consistent with the stray values reported in column 1 of the output.
# Stepping j by 2, e.g. seq(2, 22, by = 2), would visit only the topic columns.
for (j in 2:22) {
df[i,topics[i,j] + 1] <- topics[i,j+1]
}
}
我在这里所做的是:根据主题编号访问新数据框中对应的列,并把相关的百分比值放入其中。但它给出的输出是以下形式:
dput 的输出:
structure(list(X1 = c(6, 6, 13, 10, 6, 6, 16, 0.001657312, 13,
13), X2 = c(NA, NA, NA, NA, NA, NA, NA, NA, 0.060758836, 0.529540077
), X3 = c(0.126641324, 0.180447378, 0.035143082, 0.115636944,
0.125618736, 0.106517068, 0.051052852, 0.040344043, 0.018035343,
0.021258503), X4 = c(0.014817455, 0.010440413, 0.003942243, 0.032379132,
0.02829822, 0.06275312, 0.1118036, 0.003133166, 0.055665281,
0.081965395), X5 = c(0.030276392, 0.043479293, 0.022595325, 0.072097009,
0.087287789, 0.024568146, 0.089448381, 0.01936756, 0.010343035,
0.019631766), X6 = c(NA, NA, NA, 0.255417865, NA, NA, NA, NA,
NA, NA), X7 = c(0.011443732, 0.006704344, NA, NA, 0.010114543,
0.016385652, 0.142443012, NA, NA, NA), X8 = c(NA, NA, NA, NA,
NA, 0.363149847, NA, NA, NA, NA), X9 = c(0.153314326, 0.119245504,
0.216411342, 0.222792466, 0.204428308, 0.081762956, 0.039325163,
0.214664183, 0.048284823, 0.035159716), X10 = c(0.239748159,
NA, NA, NA, NA, NA, NA, NA, NA, NA), X11 = c(NA, NA, NA, 0.007614563,
0.247361424, NA, NA, NA, NA, NA), X12 = c(NA, 0.309737525, NA,
NA, NA, NA, NA, NA, NA, NA), X13 = c(0.133753069, 0.106343402,
0.018194247, 0.114001734, 0.092237676, 0.097094801, 0.069480505,
0.012810897, 0.058264033, 0.069469092), X14 = c(0.102613447,
0.033395073, 0.002256723, 0.016755979, 0.014287175, 0.037379122,
0.042713651, 0.001657312, 0.009303534, 0.014603668), X15 = c(NA,
NA, NA, NA, NA, NA, NA, NA, 0.623336798, 0.10636646), X16 = c(0.089197751,
0.088533751, 0.189293205, 0.060847551, 0.067365514, 0.020600876,
0.043670789, 0.225841963, 0.04454262, 0.046398994), X17 = c(0.045022571,
0.036038817, 0.152679976, 0.044751566, 0.05556351, 0.067877511,
0.034396453, 0.116048098, 0.032276507, 0.025842946), X18 = c(0.040017423,
0.061352267, 0.005084651, 0.034467473, 0.04232972, 0.100731465,
0.111000484, 0.002576696, 0.025935551, 0.029392192), X19 = c(NA,
NA, 0.004279347, NA, NA, NA, NA, 0.359370464, NA, NA), X20 = c(NA,
NA, NA, NA, NA, NA, 0.244273675, NA, NA, NA), X21 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("X1", "X2", "X3",
"X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12", "X13",
"X14", "X15", "X16", "X17", "X18", "X19", "X20", "X21"), row.names = c(NA,
-10L), class = "data.frame")
问题在于,第 1 列本应存放主题 0 的值,却出现了虚拟值。在原始输入数据框 topics 的第一行中,主题 0 没有相应的条目,因此新数据框的 [1,0+1] 位置应该是 NA,而不是 6.000。类似地,在第 3 行中,主题 0 有对应的值,却没有出现在输出矩阵中。有人可以帮忙吗?另外,有没有不使用循环就能做到这一点的方法?
我们根据主题编号列创建一个行/列索引('indx1')。删除第一列(即文本名称)之后,主题编号列在数据集中每隔一列出现一次,可以用逻辑索引('indx')选出这些列。由于主题编号从 0 开始,先将其加 1,以便用作列索引。
然后创建一个全为 NA 的 matrix,其行数为 'df1' 的 nrow,列数(ncol)为 'indx1' 中列索引的最大值。将其转换为 data.frame,并使用 'indx1' 把对应位置的 NA 替换为百分比值。
# Columns after the text name alternate: topic number, percentage, topic
# number, ... The logical pattern is recycled across those columns, so `indx`
# picks the topic-number columns and `!indx` picks the percentage columns.
indx <- c(TRUE, FALSE)
# Topic numbers start at 0, so shift them by one to use them as column
# positions. The shift is applied to a copy: 'df1' itself is left untouched,
# so re-running the snippet does not shift the topic numbers a second time.
topic_cols <- df1[-1][indx] + 1
# Two-column (row, column) index. unlist() walks the topic columns one after
# another, so the row numbers are repeated once per topic column.
indx1 <- cbind(
  rep(seq_len(nrow(df1)), times = ncol(topic_cols)),
  unlist(topic_cols)
)
# All-NA numeric data frame, wide enough for the highest topic number seen.
df2 <- as.data.frame(
  matrix(NA_real_, nrow = nrow(df1), ncol = max(indx1[, 2]))
)
# Place each percentage into its (text, topic + 1) cell.
df2[indx1] <- unlist(df1[-1][!indx])
head(df2, 2)
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
#1 NA NA 0.1266413 0.01481746 0.03027639 NA 0.011443732 NA 0.1533143 0.2397482
#2 NA NA 0.1804474 0.01044041 0.04347929 NA 0.006704344 NA 0.1192455 NA
# V11 V12 V13 V14 V15 V16 V17 V18 V19
#1 NA NA 0.1337531 0.10261345 NA 0.08919775 0.04502257 0.04001742 NA
#2 NA 0.3097375 0.1063434 0.03339507 NA 0.08853375 0.03603882 0.06135227 NA
# V20
#1 NA
#2 NA
OP 数据的输出
head(Out,2)
# X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
# 1 6 NA 0.1266413 0.01481746 0.03027639 NA 0.011443732 NA 0.1533143 0.2397482
# 2 6 NA 0.1804474 0.01044041 0.04347929 NA 0.006704344 NA 0.1192455 NA
# X11 X12 X13 X14 X15 X16 X17 X18 X19
#1 NA NA 0.1337531 0.10261345 NA 0.08919775 0.04502257 0.04001742 NA
#2 NA 0.3097375 0.1063434 0.03339507 NA 0.08853375 0.03603882 0.06135227 NA
# X20 X21
#1 NA NA
#2 NA NA