读写大文件C++(内存过载)
Reading and Writing Huge Files C++(Memory Overload)
我有一个非常大的文件(55 GB 的 json 数据)。我正在使用 ifstream
读取和另一个 ofstream
写入另一个文件。程序正确运行了一段时间。然后由于大量内存使用而崩溃。
我尝试使用 ignore
和 clear
来清除输入缓冲区。
并且还尝试使用 flush
清除输出缓冲区。
而且文件很大,所以,我希望它快点。
p.s。我在半睡半醒的时候写了 json 解析器。所以请原谅我糟糕的解析器代码。也许那里存在内存泄漏。任何帮助将不胜感激。
小例子
int main()
{
std::ifstream file("aggressive_dedup.json", std::ifstream::in);
std::ofstream outFile("processed.json", std::ofstream::out);
std::string str;
int a;
long long count = 0;
while (std::getline(file, str))
{
JsonParserStateMachine jsonParserStateMachine;
for(char &c : str) jsonParserStateMachine.changeState(c);
//std::cout<<jsonParserStateMachine.getReview();
//This lines just gives a string to the output which is around may be 1000 characters
outFile << jsonParserStateMachine.getReview();
if(++count % 1000 == 0) {
std::cout<<count<<" Processed\n";
outFile.flush();
return 0;
}
}
outFile.close();
return 0;
}
给愿意看完整代码的人
#include <fstream>
#include <string>
#include <iostream>
enum state {
q0, q1, q2, q3, q4, q5, q6, h
};
class KeyValueStore{
std::string *keys;
std::string *values;
int currentKeyPosition;
int currentValuePosition;
int maxLength;
public:
KeyValueStore(const int length) : maxLength(length),currentKeyPosition(0),currentValuePosition(0)
{
this->keys = new std::string[length];
this->values = new std::string[length];
for(int i=0;i<length;i++)
{
this->keys[i] = "";
this->values[i] = "";
}
}
void updateKeyPosition()
{
this->currentKeyPosition = this->currentKeyPosition++%9;
}
void updateValuePosition()
{
this->currentValuePosition = this->currentValuePosition++%9;
}
void putKey(char c)
{
this->keys[currentKeyPosition] += c;
}
void putValue(char c)
{
this->values[currentValuePosition] += c;
}
std::string getValue(std::string key)
{
for(int i=0;i<this->maxLength;i++)
{
if(this->keys[i] == key) return this->values[i];
}
return "";
}
void print()
{
std::cout<<"Keys"<<"\t"<<"Values"<<std::endl;
for(int i=0;i<maxLength;i++)
{
std::cout<<this->keys[i] <<'\t'<<this->values[i]<<std::endl;
}
}
std::string getReview()
{
return std::string("{\"" + this->getValue("reviewText") + "\":\"" + this->getValue("overall") + "\"}");
}
};
class JsonParserStateMachine{
state currentState;
KeyValueStore keyValueStore;
bool inNum;
bool inArray;
public:
JsonParserStateMachine(): keyValueStore(9), currentState(state::q0), inNum(false),inArray(false){}
state getState()
{
return this->currentState;
}
void print()
{
keyValueStore.print();
}
std::string getReview()
{
return keyValueStore.getReview();
}
state changeState(char c)
{
switch(currentState)
{
case state::q0:
if(c == ' ') break;
else if(c == '{') this->currentState = state::q1;
else this->currentState = state::h;
break;
case state::q1:
if(c == ' ') break;
else if(c == '\"') this->currentState = state::q2;
else this->currentState = state::h;
break;
case state::q2:
if(c == '\"'){
this->currentState = state::q3;
this->keyValueStore.updateKeyPosition();
break;
}
else{
this->keyValueStore.putKey(c);
break;
}
case state::q3:
if(c == ':') this->currentState = state::q4;
else if(c == ' ') {
}
else {
//std::cout<<"From Q3"<<std::endl;
this->currentState = state::h;
}break;
case state::q4:
if(c == '\"' || c == '[') {
this->currentState = state::q5;
inArray = c == '[' ? true: false;
}else if(c == ' ') break;
else {
//std::cout<<"Got Here"<<std::endl;
inNum = true;
this->currentState = state::q5;
this->keyValueStore.putValue(c);
}
break;
case state::q5:
if(c == '\"' || c == ']'){
this->currentState = state::q6;
this->keyValueStore.updateValuePosition();
inArray = c == ']'? false: true;
break;
}else if(inNum && c == ',' ){
this->currentState = state::q1;
this->keyValueStore.updateValuePosition();
inNum = false;
}
else{
this->keyValueStore.putValue(c);
break;
}
case state::q6:
if(c == ','){
this->currentState = state::q1;
break;
}else if(c == ' '){
break;
}else{
//std::cout<<"From Q6"<<std::endl;
this->currentState = state::h;
}
}
return this->currentState;
}
};
class Review{
std::string reviewText;
int overall;
std::string summary;
public:
void pusReviewText(std::string reviewText)
{
this->reviewText = reviewText;
}
void putOverall(int overall)
{
this->overall = overall;
}
void putSummary(std::string summary)
{
this->summary = summary;
}
std::string getReviewText()
{
return this->reviewText;
}
int getOverall()
{
return this->overall;
}
std::string getSummary()
{
return this->summary;
}
};
int main()
{
std::ifstream file("aggressive_dedup.json", std::ifstream::in);
std::ofstream outFile("processed.json", std::ofstream::out);
std::string str;
int a;
long long count = 0;
while (std::getline(file, str))
{
JsonParserStateMachine jsonParserStateMachine;
for(char &c : str) jsonParserStateMachine.changeState(c);
//std::cout<<jsonParserStateMachine.getReview();
outFile << jsonParserStateMachine.getReview();
if(++count % 1000 == 0) {
std::cout<<count<<" Processed\n";
outFile.flush();
return 0;
}
}
outFile.close();
return 0;
}
问题出在你的KeyValueStore
class:
KeyValueStore(const int length) : maxLength(length),currentKeyPosition(0),currentValuePosition(0)
{
this->keys = new std::string[length];
this->values = new std::string[length];
...
没有任何东西会删除这些数组。在析构函数中删除它们是简单的解决方法:
~KeyValueStore() {
delete[] this->keys;
delete[] this->values;
}
但是!您真的应该考虑改用 std::vector<std::string>
。或者更好的是,围绕 std::unordered_map<std::string, std::string> instead.
重建整个事物
我有一个非常大的文件(55 GB 的 json 数据)。我正在使用 ifstream
读取和另一个 ofstream
写入另一个文件。程序正确运行了一段时间。然后由于大量内存使用而崩溃。
我尝试使用 ignore
和 clear
来清除输入缓冲区。
并且还尝试使用 flush
清除输出缓冲区。
而且文件很大,所以,我希望它快点。
p.s。我在半睡半醒的时候写了 json 解析器。所以请原谅我糟糕的解析器代码。也许那里存在内存泄漏。任何帮助将不胜感激。
小例子
int main()
{
std::ifstream file("aggressive_dedup.json", std::ifstream::in);
std::ofstream outFile("processed.json", std::ofstream::out);
std::string str;
int a;
long long count = 0;
while (std::getline(file, str))
{
JsonParserStateMachine jsonParserStateMachine;
for(char &c : str) jsonParserStateMachine.changeState(c);
//std::cout<<jsonParserStateMachine.getReview();
//This lines just gives a string to the output which is around may be 1000 characters
outFile << jsonParserStateMachine.getReview();
if(++count % 1000 == 0) {
std::cout<<count<<" Processed\n";
outFile.flush();
return 0;
}
}
outFile.close();
return 0;
}
给愿意看完整代码的人
#include <fstream>
#include <string>
#include <iostream>
enum state {
q0, q1, q2, q3, q4, q5, q6, h
};
class KeyValueStore{
std::string *keys;
std::string *values;
int currentKeyPosition;
int currentValuePosition;
int maxLength;
public:
KeyValueStore(const int length) : maxLength(length),currentKeyPosition(0),currentValuePosition(0)
{
this->keys = new std::string[length];
this->values = new std::string[length];
for(int i=0;i<length;i++)
{
this->keys[i] = "";
this->values[i] = "";
}
}
void updateKeyPosition()
{
this->currentKeyPosition = this->currentKeyPosition++%9;
}
void updateValuePosition()
{
this->currentValuePosition = this->currentValuePosition++%9;
}
void putKey(char c)
{
this->keys[currentKeyPosition] += c;
}
void putValue(char c)
{
this->values[currentValuePosition] += c;
}
std::string getValue(std::string key)
{
for(int i=0;i<this->maxLength;i++)
{
if(this->keys[i] == key) return this->values[i];
}
return "";
}
void print()
{
std::cout<<"Keys"<<"\t"<<"Values"<<std::endl;
for(int i=0;i<maxLength;i++)
{
std::cout<<this->keys[i] <<'\t'<<this->values[i]<<std::endl;
}
}
std::string getReview()
{
return std::string("{\"" + this->getValue("reviewText") + "\":\"" + this->getValue("overall") + "\"}");
}
};
class JsonParserStateMachine{
state currentState;
KeyValueStore keyValueStore;
bool inNum;
bool inArray;
public:
JsonParserStateMachine(): keyValueStore(9), currentState(state::q0), inNum(false),inArray(false){}
state getState()
{
return this->currentState;
}
void print()
{
keyValueStore.print();
}
std::string getReview()
{
return keyValueStore.getReview();
}
state changeState(char c)
{
switch(currentState)
{
case state::q0:
if(c == ' ') break;
else if(c == '{') this->currentState = state::q1;
else this->currentState = state::h;
break;
case state::q1:
if(c == ' ') break;
else if(c == '\"') this->currentState = state::q2;
else this->currentState = state::h;
break;
case state::q2:
if(c == '\"'){
this->currentState = state::q3;
this->keyValueStore.updateKeyPosition();
break;
}
else{
this->keyValueStore.putKey(c);
break;
}
case state::q3:
if(c == ':') this->currentState = state::q4;
else if(c == ' ') {
}
else {
//std::cout<<"From Q3"<<std::endl;
this->currentState = state::h;
}break;
case state::q4:
if(c == '\"' || c == '[') {
this->currentState = state::q5;
inArray = c == '[' ? true: false;
}else if(c == ' ') break;
else {
//std::cout<<"Got Here"<<std::endl;
inNum = true;
this->currentState = state::q5;
this->keyValueStore.putValue(c);
}
break;
case state::q5:
if(c == '\"' || c == ']'){
this->currentState = state::q6;
this->keyValueStore.updateValuePosition();
inArray = c == ']'? false: true;
break;
}else if(inNum && c == ',' ){
this->currentState = state::q1;
this->keyValueStore.updateValuePosition();
inNum = false;
}
else{
this->keyValueStore.putValue(c);
break;
}
case state::q6:
if(c == ','){
this->currentState = state::q1;
break;
}else if(c == ' '){
break;
}else{
//std::cout<<"From Q6"<<std::endl;
this->currentState = state::h;
}
}
return this->currentState;
}
};
class Review{
std::string reviewText;
int overall;
std::string summary;
public:
void pusReviewText(std::string reviewText)
{
this->reviewText = reviewText;
}
void putOverall(int overall)
{
this->overall = overall;
}
void putSummary(std::string summary)
{
this->summary = summary;
}
std::string getReviewText()
{
return this->reviewText;
}
int getOverall()
{
return this->overall;
}
std::string getSummary()
{
return this->summary;
}
};
int main()
{
std::ifstream file("aggressive_dedup.json", std::ifstream::in);
std::ofstream outFile("processed.json", std::ofstream::out);
std::string str;
int a;
long long count = 0;
while (std::getline(file, str))
{
JsonParserStateMachine jsonParserStateMachine;
for(char &c : str) jsonParserStateMachine.changeState(c);
//std::cout<<jsonParserStateMachine.getReview();
outFile << jsonParserStateMachine.getReview();
if(++count % 1000 == 0) {
std::cout<<count<<" Processed\n";
outFile.flush();
return 0;
}
}
outFile.close();
return 0;
}
问题出在你的KeyValueStore
class:
KeyValueStore(const int length) : maxLength(length),currentKeyPosition(0),currentValuePosition(0)
{
this->keys = new std::string[length];
this->values = new std::string[length];
...
没有任何东西会删除这些数组。在析构函数中删除它们是简单的解决方法:
~KeyValueStore() {
delete[] this->keys;
delete[] this->values;
}
但是!您真的应该考虑改用 std::vector<std::string>
。或者更好的是,围绕 std::unordered_map<std::string, std::string> instead.