Saving large data in vector c++

1.7k Views Asked by At

I am having a huge data in a file that i need to read and do some probabilities on it, so i need to count the number of occurances of each word in the whole file and do some more calculations on it. the files contains 1 million and half records and each record is about 6 strings. I used a vector to save this data but the program crashes after saving about 8000 records. Is there a way of saving this vector on the computer and not on the memory of the program ?!.. or i heard something called symbol table from searching but i couldn't understand what does it mean or how to use it.

any solution to this problem ?

This is the Mainfile

#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <istream>

#include "Tuple.h"
#include "VerbPair.h"
using namespace std;

string filename = "verb-argument-tuples.txt";
vector<Tuple> mytuples;
vector<VerbPair> verbpairs;

vector<Tuple> readTupleFile(string filename)
{
    cout << "Started parsing the file of tuples..." << endl;
    vector<Tuple> mt;
    string temp;
    Tuple t;

    ifstream infile;
    infile.open(filename);
    while(!(infile.eof()))
    {
        getline(infile,temp);
        t.parseTuple(temp);
        mt.push_back(t);
    }

    infile.close();
    cout << "Done with reading tuples file..." << endl;
    return mt;
}

vector<VerbPair> getVerbPairs(vector<Tuple> mytuples)
{
    vector<VerbPair> pairs;
    bool flag = false;
    VerbPair temp;
    for(int i=0;i<mytuples.size();i++)
    {
        flag = false;
        for(int h=0;h<pairs.size();h++)
        {
            if (mytuples[i].verb.compare(pairs[h].verb) == 0)
            {
                pairs[h].freq += mytuples[i].count;
                flag =true;
                break;
            }
        }
        if(! flag)
        {
            temp.verb = mytuples[i].verb;
            temp.freq = mytuples[i].count;
            pairs.push_back(temp);
        }
    }
    return pairs;
}

int numOfLines(string filename)
{
    int numLines = 0;
    string j ="";
    ifstream infile;
    infile.open(filename);

    while(!infile.eof())
    {
        getline(infile,j);
        numLines++;
    }
    infile.close();
    return numLines;
}

void train(string filename)
{
    mytuples = readTupleFile(filename);
    verbpairs = getVerbPairs(mytuples);
}
void store(string filename)
{

}
void load(string filename)
{

}

int main()
{
    cout << "Started Application..." << endl;
    train(filename);
    cout << "Size of verb pairs is " << verbpairs.size() << endl;
}

Tuple.h

#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <istream>
using namespace std;

class Tuple
{
public:
    int count;
    string verb;
    string frame;
    vector<string> args;
private:
    int i;
    int h;
    string p;

public:
    void parseTuple(string s)
    {
        cout << "parsing.... " << s << endl;
        i=0;
        h=0;
        p="";
        while(s[i] != 32 && s[i]!= 9) //that means temp[i] is a number
        {
            h = h*10 + (s[i] - '0');
            i++;
        }
        this->count = h;
        i++;

        // loops for everything but not the space and tab
        while(s[i] != 32 && s[i]!= 9)
        {
            p +=s[i];
            i++;
        }
        this->verb = p;
        i++;

        p="";
        while(s[i] != 32 && s[i]!= 9)
        {
            p +=s[i];
            i++;
        }
        this->frame = p;
        i++;

        p="";
        while(i < s.length())
        {
            while(s[i] != 32 && s[i]!= 9 && i < s.length())
            {
                p += s[i];
                i++;
            }
            this->args.push_back(p);
            i++;
            p="";
        }
    }
};

and VerbPair.h

#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <istream>
using namespace std;

class VerbPair
{
public:
    string verb;
    int freq;
};
3

There are 3 best solutions below

1
On

Since there is duplicate data, why are you using vector. Just use a map<string,int>. Each time you encounter a word, increment the corresponding value in the map.

0
On

Can you try with using reserve function with vector. Since you possibly know that you have large data, you should also use reserve function.

Also, use map in this case, since using map, you will be able to count the number of occurences easily.

For the crash, you will have to show us the code.

0
On

You have alot of shadow variables in you code, like the fact that you declare the filename variable globally and then use it locally three lines later. you do the same with the tuple vector and the verbpair vector.

Perhaps some encapsulation would make your debugging task easier.

Another style issue would be a function like:

vector<VerbPair> getVerbPairs(vector<Tuple> mytuples)
{
    vector<VerbPair> pairs;
    bool flag = false;
    VerbPair temp;
    for(int i=0;i<mytuples.size();i++)
    {
        flag = false;
        for(int h=0;h<pairs.size();h++)
        {
            if (mytuples[i].verb.compare(pairs[h].verb) == 0)
            {
                pairs[h].freq += mytuples[i].count;
                flag =true;
                break;
            }
        }
        if(! flag)
        {
            temp.verb = mytuples[i].verb;
            temp.freq = mytuples[i].count;
            pairs.push_back(temp);
        }
    }
    return pairs;
}

The are a few things that make it hard to debug. first one is the shadow thing, second one is that you don't let the compiler help you.

vector<VerbPair> getVerbPairs(const vector<Tuple>& mytuples)
{
  vector<VerbPair> pairs;
  bool flag = false;
  VerbPair temp;
  for(int i=0;i<mytuples.size();i++)
    {
      flag = false;
      for(int h=0;h<pairs.size();h++)
    {
      if (mytuples[i].verb.compare(pairs[h].verb) == 0)
        {
          pairs[h].freq += mytuples[i].count;
          flag =true;
          break;
        }
    }
      if(! flag)
    {
      temp.verb = mytuples[i].verb;
      temp.freq = mytuples[i].count;
      pairs.push_back(temp);
    }
    }
  return pairs;
}

This way the compiler will tell you if you try to mess around with the mytupes vector.