Google Apps 脚本:用于修复格式错误的管道分隔 csv 文件的 REGEX 运行速度太慢
Google Apps Script: REGEX to fix malformed pipe delimited csv file runs too slowly
我有一个 Google Apps 脚本每天处理这个“csv”文件。
文件越来越大,并且开始超时。
这个以竖线分隔的“csv”文件在某些记录的注释字段中包含回车换行符(\r\n),导致这些记录在真正结束之前就被断开。以下代码会删除记录中间多余的回车换行符,并将数据整理为可用的 csv 格式。有没有更高效的方法来编写这段代码?
这是片段:
/**
 * Rebuilds a pipe-delimited export so that every record sits on a single line,
 * strips single quotes that touch a pipe, and stores the result as a new CSV file.
 *
 * @param {string} csvFileId - Id handed to DriveApp.getFileById to load the raw text.
 * @returns {string} Whatever getId() reports for the newly created file.
 */
function cleanCSV(csvFileId){
  // The incoming file has CRLF breaks inside records; the passes below remove them.
  var rawText = DriveApp.getFileById(csvFileId).getBlob().getDataAsString();
  var cleanedText = rawText
    .replace(/\r\n\d{1,5}\|/g,"~~$&") // tag each real record start (CRLF + 1-5 digits + pipe) with "~~"
    .replace(/\r\n/g, "")             // drop every CRLF, including the tagged ones
    .replace(/~~/g,"\r\n")            // turn each "~~" tag back into a CRLF record separator
    .replace(/'\|/g,"|")              // strip a single quote sitting right before a pipe
    .replace(/\|'/g,"|");             // strip a single quote sitting right after a pipe
  // csvFolderId and csvFileName are not defined in this function; they come from the surrounding script.
  var targetFolder = DriveApp.getFolderById(csvFolderId);
  var createdFile = targetFolder.createFile(csvFileName, cleanedText, MimeType.CSV);
  return createdFile.getId();
}
这是文件的 sample:
前三个 replace 行可以合并为一个:您只需删除所有后面没有紧跟“1 到 5 位数字加 |”的 \r\n,即 .replace(/\r\n(?!\d{1,5}\|)/g,"")。
最后两行 replace 也可以合并为一行,只要使用交替(alternation):.replace(/'\||\|'/g,"|")。注意:交替写法只扫描一遍,遇到 '|'(竖线两侧都有引号)时只会去掉左侧的引号;如果需要与原来两次 replace 的结果完全一致,可以改用 /'\|'?|\|'/g。
使用
/**
 * Repairs a pipe-delimited export whose records are split by stray CRLFs,
 * removes single quotes adjacent to pipes, and saves the result as a CSV file.
 *
 * @param {string} csvFileId - Id passed to DriveApp.getFileById to load the raw text.
 * @returns {string} Id of the file created by createFile, as reported by getId().
 */
function cleanCSV(csvFileId){
  var content = DriveApp.getFileById(csvFileId).getBlob().getDataAsString();
  // A CRLF is kept only when it is followed by 1-5 digits and a pipe (a record start);
  // every other CRLF is deleted so each record ends up on one line.
  var newContent = content.replace(/\r\n(?!\d{1,5}\|)/g,"");
  // Remove one single quote on each side of a pipe in a single pass. The optional quote
  // after the pipe in the first alternative is required: with plain /'\||\|'/ a "'|'"
  // sequence is consumed as "'|" and the quote following the pipe is left in the output.
  var csvContent = newContent.replace(/'\|'?|\|'/g,"|");
  // csvFolderId and csvFileName are not defined in this function; they must be supplied by the surrounding script.
  var sheetId = DriveApp.getFolderById(csvFolderId).createFile(csvFileName, csvContent, MimeType.CSV).getId();
  return sheetId;
}
我有一个 Google Apps 脚本每天处理这个“csv”文件。文件越来越大,脚本开始超时。这个以竖线分隔的“csv”文件在某些记录的注释字段中包含回车换行符(\r\n),导致这些记录在真正结束之前就被断开。以下代码会删除记录中间多余的回车换行符,并将数据整理为可用的 csv 格式。有没有更高效的方法来编写这段代码?
这是片段:
/**
 * Rebuilds a pipe-delimited export so that every record sits on a single line,
 * strips single quotes that touch a pipe, and stores the result as a new CSV file.
 *
 * @param {string} csvFileId - Id handed to DriveApp.getFileById to load the raw text.
 * @returns {string} Whatever getId() reports for the newly created file.
 */
function cleanCSV(csvFileId){
  // The incoming file has CRLF breaks inside records; the passes below remove them.
  var rawText = DriveApp.getFileById(csvFileId).getBlob().getDataAsString();
  var cleanedText = rawText
    .replace(/\r\n\d{1,5}\|/g,"~~$&") // tag each real record start (CRLF + 1-5 digits + pipe) with "~~"
    .replace(/\r\n/g, "")             // drop every CRLF, including the tagged ones
    .replace(/~~/g,"\r\n")            // turn each "~~" tag back into a CRLF record separator
    .replace(/'\|/g,"|")              // strip a single quote sitting right before a pipe
    .replace(/\|'/g,"|");             // strip a single quote sitting right after a pipe
  // csvFolderId and csvFileName are not defined in this function; they come from the surrounding script.
  var targetFolder = DriveApp.getFolderById(csvFolderId);
  var createdFile = targetFolder.createFile(csvFileName, cleanedText, MimeType.CSV);
  return createdFile.getId();
}
这是文件的 sample:
前三个 replace 行可以合并为一个:您只需删除所有后面没有紧跟“1 到 5 位数字加 |”的 \r\n,即 .replace(/\r\n(?!\d{1,5}\|)/g,"")。
最后两行 replace 也可以合并为一行,只要使用交替(alternation):.replace(/'\||\|'/g,"|")。注意:交替写法只扫描一遍,遇到 '|'(竖线两侧都有引号)时只会去掉左侧的引号;如果需要与原来两次 replace 的结果完全一致,可以改用 /'\|'?|\|'/g。
使用
/**
 * Repairs a pipe-delimited export whose records are split by stray CRLFs,
 * removes single quotes adjacent to pipes, and saves the result as a CSV file.
 *
 * @param {string} csvFileId - Id passed to DriveApp.getFileById to load the raw text.
 * @returns {string} Id of the file created by createFile, as reported by getId().
 */
function cleanCSV(csvFileId){
  var content = DriveApp.getFileById(csvFileId).getBlob().getDataAsString();
  // A CRLF is kept only when it is followed by 1-5 digits and a pipe (a record start);
  // every other CRLF is deleted so each record ends up on one line.
  var newContent = content.replace(/\r\n(?!\d{1,5}\|)/g,"");
  // Remove one single quote on each side of a pipe in a single pass. The optional quote
  // after the pipe in the first alternative is required: with plain /'\||\|'/ a "'|'"
  // sequence is consumed as "'|" and the quote following the pipe is left in the output.
  var csvContent = newContent.replace(/'\|'?|\|'/g,"|");
  // csvFolderId and csvFileName are not defined in this function; they must be supplied by the surrounding script.
  var sheetId = DriveApp.getFolderById(csvFolderId).createFile(csvFileName, csvContent, MimeType.CSV).getId();
  return sheetId;
}