读写大文件C++(内存过载)

Reading and Writing Huge Files C++(Memory Overload)

我有一个非常大的文件(55 GB 的 json 数据)。我正在使用 ifstream 读取和另一个 ofstream 写入另一个文件。程序正确运行了一段时间。然后由于大量内存使用而崩溃。

我尝试使用 ignoreclear 来清除输入缓冲区。 并且还尝试使用 flush 清除输出缓冲区。

而且文件很大,所以,我希望它快点。

p.s。我在半睡半醒的时候写了 json 解析器。所以请原谅我糟糕的解析器代码。也许那里存在内存泄漏。任何帮助将不胜感激。

小例子

int main() 
{ 
    std::ifstream file("aggressive_dedup.json", std::ifstream::in);
    std::ofstream outFile("processed.json", std::ofstream::out);
    std::string str;
    int a; 
    long long count = 0;


    while (std::getline(file, str))
    {

        JsonParserStateMachine jsonParserStateMachine;
        for(char &c : str) jsonParserStateMachine.changeState(c);
        //std::cout<<jsonParserStateMachine.getReview();
        //This lines just gives a string to the output which is around may be 1000 characters 
        outFile << jsonParserStateMachine.getReview(); 
        if(++count % 1000 == 0) {
            std::cout<<count<<" Processed\n";
            outFile.flush();
            return 0;
        }
    }
    outFile.close();
    return 0;
}

给愿意看完整代码的人

#include <fstream>
#include <string>
#include <iostream>

enum state {
    q0, q1, q2, q3, q4, q5, q6, h
};

class KeyValueStore{
    std::string *keys;
    std::string *values;
    int currentKeyPosition;
    int currentValuePosition;
    int maxLength;
public:
    KeyValueStore(const int length) : maxLength(length),currentKeyPosition(0),currentValuePosition(0)
    {
        this->keys = new std::string[length];
        this->values = new std::string[length];

        for(int i=0;i<length;i++)
        {
            this->keys[i] = "";
            this->values[i] = "";
        }

    }

    void updateKeyPosition()
    {
        this->currentKeyPosition = this->currentKeyPosition++%9;
    }
    void updateValuePosition()
    {
        this->currentValuePosition = this->currentValuePosition++%9;
    }

    void putKey(char c)
    {
        this->keys[currentKeyPosition] += c;
    }
    void putValue(char c)
    {
        this->values[currentValuePosition] += c;
    }


    std::string getValue(std::string key)
    {
        for(int i=0;i<this->maxLength;i++)
        {
            if(this->keys[i] == key) return this->values[i];
        }
        return "";
    }

    void print()
    {
        std::cout<<"Keys"<<"\t"<<"Values"<<std::endl;
        for(int i=0;i<maxLength;i++)
        {
            std::cout<<this->keys[i] <<'\t'<<this->values[i]<<std::endl;
        }
    }


    std::string getReview()
    {
        return std::string("{\"" +  this->getValue("reviewText") + "\":\"" + this->getValue("overall") + "\"}"); 
    }
};



class JsonParserStateMachine{
    state currentState;
    KeyValueStore keyValueStore;
    bool inNum;
    bool inArray;
public:
    JsonParserStateMachine(): keyValueStore(9), currentState(state::q0), inNum(false),inArray(false){}

    state getState()
    {
        return this->currentState;
    }

    void print()
    {
        keyValueStore.print();
    }


    std::string getReview()
    {
        return keyValueStore.getReview();
    }

    state changeState(char c)
    {
        switch(currentState)
        {
            case state::q0:
                if(c == ' ') break;
                else if(c == '{') this->currentState = state::q1;
                else this->currentState = state::h;
                break;
            case state::q1:
                if(c == ' ') break;
                else if(c == '\"') this->currentState = state::q2;
                else this->currentState = state::h;
                break;
            case state::q2:
                if(c == '\"'){
                    this->currentState = state::q3;
                    this->keyValueStore.updateKeyPosition();
                    break;
                }
                else{
                    this->keyValueStore.putKey(c);
                    break;
                } 
            case state::q3:
                if(c == ':') this->currentState = state::q4;
                else if(c == ' ') {

                }
                else {
                    //std::cout<<"From Q3"<<std::endl;
                    this->currentState = state::h;
                }break;
            case state::q4:
                if(c == '\"' || c == '[') {
                    this->currentState = state::q5;
                    inArray = c == '[' ? true: false; 

                }else if(c == ' ') break;
                else {
                    //std::cout<<"Got Here"<<std::endl;
                    inNum = true;
                    this->currentState = state::q5;
                    this->keyValueStore.putValue(c);
                }
                break;
            case state::q5:
                if(c == '\"' || c == ']'){
                    this->currentState = state::q6;
                    this->keyValueStore.updateValuePosition();
                    inArray = c == ']'? false: true;
                    break;
                }else if(inNum && c == ',' ){
                    this->currentState = state::q1;
                    this->keyValueStore.updateValuePosition();
                    inNum = false;
                }
                else{
                    this->keyValueStore.putValue(c);
                    break;
                } 
            case state::q6:
                if(c == ','){
                    this->currentState = state::q1;
                    break;
                }else if(c == ' '){
                    break;
                }else{
                    //std::cout<<"From Q6"<<std::endl;
                    this->currentState = state::h;
                }
        }

        return this->currentState;
    }
};

class Review{

    std::string reviewText;
    int overall;
    std::string summary;
public:
    void pusReviewText(std::string reviewText)
    {
        this->reviewText = reviewText;
    }

    void putOverall(int overall)
    {
        this->overall = overall;
    }


    void putSummary(std::string summary)
    {
        this->summary = summary;
    }

    std::string getReviewText()
    {
        return this->reviewText;
    }
    int getOverall()
    {
        return this->overall;
    }
    std::string getSummary()
    {
        return this->summary;
    }
};

int main() 
{ 
    std::ifstream file("aggressive_dedup.json", std::ifstream::in);
    std::ofstream outFile("processed.json", std::ofstream::out);
    std::string str;
    int a; 
    long long count = 0;


    while (std::getline(file, str))
    {

        JsonParserStateMachine jsonParserStateMachine;
        for(char &c : str) jsonParserStateMachine.changeState(c);
        //std::cout<<jsonParserStateMachine.getReview();
        outFile << jsonParserStateMachine.getReview();
        if(++count % 1000 == 0) {
            std::cout<<count<<" Processed\n";
            outFile.flush();
            return 0;
        }
    }
    outFile.close();
    return 0;
}

问题出在你的KeyValueStore class:

KeyValueStore(const int length) : maxLength(length),currentKeyPosition(0),currentValuePosition(0)
{
    this->keys = new std::string[length];
    this->values = new std::string[length];
    ...

没有任何东西会删除这些数组。在析构函数中删除它们是简单的解决方法:

~KeyValueStore() {
  delete[] this->keys;
  delete[] this->values;
}

但是!您真的应该考虑改用 std::vector<std::string> 。或者更好的是,围绕 std::unordered_map<std::string, std::string> instead.

重建整个事物