Parallel-processed sentence generation produces garbled results
I am trying to create a dataset for some neural-network training purposes. Previously I used a for loop to concatenate the pieces and build the sentences, but since that took very long, I reimplemented the sentence generation with foreach. The process is fast and finishes in about 50 seconds. I simply do slot filling on a template and paste the pieces together into a sentence, but the output comes out garbled (misspelled words, unexpected spaces between words, missing words, and so on).
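In isolation, the slot filling is nothing more than paste() dropping numbers into fixed text; a stripped-down single-sentence version of the template used below (one severity label only):

hr <- 40; fl <- 1; month <- 1
paste("About", toString(hr), "soldiers died in the battle (count being severly_low).",
      "Around", toString(fl), "soldiers and civilians went missing. We only have about",
      sample(38:180, 1), "crates which lasts for", toString(month), "months as food supply")

The full parallel version: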
library(foreach)
library(doParallel)
library(tictoc)

tic("Data preparation - parallel mode")
cl <- makeCluster(3)
registerDoParallel(cl)

f_sentences <- c(); sentences <- c()
hr <- 38:180; fl <- 1:5; month <- 1:5
strt <- Sys.time()

# nested parallel loops over hr, fl and month; each (hr, fl) iteration
# appends its accumulated sentences to the same output file
a <- foreach(hr = 38:180, .packages = c('foreach', 'doParallel')) %dopar% {
  foreach(fl = 1:5, .packages = c('foreach', 'doParallel')) %dopar% {
    foreach(month = 1:5, .packages = c('foreach', 'doParallel')) %dopar% {
      if (hr >= 35 & hr <= 44) {
        sentences <- paste("About", toString(hr), "soldiers died in the battle (count being severly_low).",
                           "Around", toString(fl), "soldiers and civilians went missing. We only have about",
                           sample(38:180, 1), "crates which lasts for", toString(month), "months as food supply")
        f_sentences <- c(f_sentences, sentences); outfile <- unname(f_sentences)
      }
      if (hr >= 45 & hr <= 59) {
        sentences <- paste("About", toString(hr), "soldiers died in the battle (count being low).",
                           "Around", toString(fl), "soldiers and civilians went missing. We only have about",
                           sample(38:180, 1), "crates which lasts for", toString(month), "months as food supply")
        f_sentences <- c(f_sentences, sentences); outfile <- unname(f_sentences)
      }
      if (hr >= 60 & hr <= 100) {
        sentences <- paste("About", toString(hr), "soldiers died in the battle (count being medium).",
                           "Around", toString(fl), "soldiers and civilians went missing. We only have about",
                           sample(38:180, 1), "crates which lasts for", toString(month), "months as food supply")
        f_sentences <- c(f_sentences, sentences); outfile <- unname(f_sentences)
      }
      if (hr >= 101 & hr <= 150) {
        sentences <- paste("About", toString(hr), "soldiers died in the battle (count being high).",
                           "Around", toString(fl), "soldiers and civilians went missing. We only have about",
                           sample(38:180, 1), "crates which lasts for", toString(month), "months as food supply")
        f_sentences <- c(f_sentences, sentences); outfile <- unname(f_sentences)
      }
      if (hr >= 151 & hr <= 180) {
        sentences <- paste("About", toString(hr), "soldiers died in the battle (count being severly_high).",
                           "Around", toString(fl), "soldiers and civilians went missing. We only have about",
                           sample(38:180, 1), "crates which lasts for", toString(month), "months as food supply")
        f_sentences <- c(f_sentences, sentences); outfile <- unname(f_sentences)
      }
      return(outfile)
    }
    write.table(outfile, file = "/home/outfile.txt", append = T, row.names = F, col.names = F)
    gc()
  }
}
stopCluster(cl)
toc()
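For comparison only, and not part of the original script: a minimal sketch in which the nesting is expressed with foreach's %:% operator, each worker only returns its sentences, and the combined result is written once from the master process (only one severity label is kept for brevity; the output path is a hypothetical example):

library(foreach)
library(doParallel)

cl <- makeCluster(3)
registerDoParallel(cl)

res <- foreach(hr = 38:180, .combine = 'c') %:%
  foreach(fl = 1:5, .combine = 'c') %:%
  foreach(month = 1:5, .combine = 'c') %dopar% {
    # same slot filling as above, reduced to a single branch
    paste("About", toString(hr), "soldiers died in the battle (count being medium).",
          "Around", toString(fl), "soldiers and civilians went missing. We only have about",
          sample(38:180, 1), "crates which lasts for", toString(month), "months as food supply")
  }

stopCluster(cl)

# one write from the master process instead of appends from the workers
writeLines(res, "/home/outfile_single_writer.txt")   # hypothetical path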
Statistics of the file created this way:
- Number of lines: 427,975
- Split used: word-level split (" ")
- Vocabulary size: 567
library(data.table)   # fread()
library(magrittr)     # %>%

path <- "/home/outfile.txt"
splitting <- " "      # word-level split, as listed above
File <- fread(path, sep = "\n", header = F)[[1]]
corpus <- tolower(File) %>%
  # removePunctuation() %>%
  strsplit(splitting) %>%
  unlist()
vocab <- unique(corpus)
The vocabulary for such simple sentences should be tiny, since the numbers are the only parameters that change here. Looking at the vocab output and checking with grep, I found many garbled words (and some missing words as well), such as wentt, crpply, etc., appearing inside sentences where they should never occur, because the template is fixed.
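As a rough sanity check (my addition; only one severity label is used, so the count is only approximate), regenerating the same parameter grid sequentially and counting the vocabulary the same way gives a baseline far below 567:

# regenerate the grid with a plain for loop and count the distinct word tokens
clean <- character(0)
for (hr in 38:180) for (fl in 1:5) for (month in 1:5) {
  clean <- c(clean, paste("About", toString(hr), "soldiers died in the battle (count being medium).",
                          "Around", toString(fl), "soldiers and civilians went missing. We only have about",
                          sample(38:180, 1), "crates which lasts for", toString(month), "months as food supply"))
}
length(unique(unlist(strsplit(tolower(clean), " "))))
# roughly the fixed template words plus the numbers 1:5 and 38:180, well below the 567 observed above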
Expected sentence
"About 40 soldiers died in the battle (count being severly_low). Around 1 soldiers and civilians went missing. We only have about 146 crates which lasts for 1 months as food supply"
grep -rnw 'outfile.txt' -e 'wentt'
24105:"About 62 soldiers died in the battle (count being medium). Around 2 soldiers and civilians wentt 117 crates which lasts for 1 months as food supply"
grep -rnw 'outfile.txt' -e 'crpply'
76450:"About 73 soldiers died in the battle (count being medium). Around 1 soldiers and civilians went missing. We only have about 133 crpply"
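For completeness, such off-template lines can also be flagged from within R; a small sketch (the regular expression is only my approximation of the template, and the optional quotes account for write.table's default quoting):

lines <- readLines("/home/outfile.txt")
# every well-formed sentence should match this shape
pattern <- '^"?About [0-9]+ soldiers died in the battle \\(count being [a-z_]+\\)\\. Around [0-9]+ soldiers and civilians went missing\\. We only have about [0-9]+ crates which lasts for [0-9]+ months as food supply"?$'
bad <- lines[!grepl(pattern, lines)]
length(bad)  # number of lines deviating from the template
head(bad)    # a few of the garbled ones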
The first few sentences are generated correctly, and the problems only appear after that. What is the reason for this? I am just doing a plain paste() with slot filling. Any help would be appreciated!
The code now runs correctly and there are no more errors. I assume the earlier errors were caused by a malfunction during the previous run. I tested this on other machines with different R versions, and it is still fine.