I have a huge amount of data in a file that I need to read and compute some probabilities on, so I need to count the number of occurrences of each word in the whole file and then do some further calculations with those counts. The file contains about one and a half million records, and each record consists of about 6 strings. I used a vector to store this data, but the program crashes after storing about 8000 records. Is there a way to store this vector on disk rather than in the program's memory? While searching I also came across something called a symbol table, but I couldn't understand what it means or how to use it.
Is there any solution to this problem?
This is the main file:
#include <fstream>
#include <iostream>
#include <istream>
#include <string>
#include <unordered_map>
#include <vector>

#include "Tuple.h"
#include "VerbPair.h"

using namespace std;
// Path of the tuple file that main() hands to train().
std::string filename("verb-argument-tuples.txt");

// Every record parsed from the tuple file; assigned by train().
std::vector<Tuple> mytuples;

// Per-verb frequency totals derived from mytuples; assigned by train().
std::vector<VerbPair> verbpairs;
vector<Tuple> readTupleFile(string filename)
{
cout << "Started parsing the file of tuples..." << endl;
vector<Tuple> mt;
string temp;
Tuple t;
ifstream infile;
infile.open(filename);
while(!(infile.eof()))
{
getline(infile,temp);
t.parseTuple(temp);
mt.push_back(t);
}
infile.close();
cout << "Done with reading tuples file..." << endl;
return mt;
}
vector<VerbPair> getVerbPairs(vector<Tuple> mytuples)
{
vector<VerbPair> pairs;
bool flag = false;
VerbPair temp;
for(int i=0;i<mytuples.size();i++)
{
flag = false;
for(int h=0;h<pairs.size();h++)
{
if (mytuples[i].verb.compare(pairs[h].verb) == 0)
{
pairs[h].freq += mytuples[i].count;
flag =true;
break;
}
}
if(! flag)
{
temp.verb = mytuples[i].verb;
temp.freq = mytuples[i].count;
pairs.push_back(temp);
}
}
return pairs;
}
/**
 * Counts the lines of a text file.
 *
 * A final line without a trailing newline is still counted.
 *
 * @param filename  path of the file to scan.
 * @return the number of lines read; 0 for an empty file or one that
 *         cannot be opened.
 */
int numOfLines(const std::string& filename)
{
    std::ifstream infile(filename);
    int numLines = 0;
    std::string line;
    // Count only reads that succeed. Looping on eof() counts one extra
    // line after the last newline and never ends if the open failed.
    while (std::getline(infile, line))
    {
        ++numLines;
    }
    return numLines;
}
void train(string filename)
{
mytuples = readTupleFile(filename);
verbpairs = getVerbPairs(mytuples);
}
// Not implemented: the body is empty, so calling this does nothing.
// NOTE(review): presumably meant to write the trained data to
// `filename` — confirm the intended behaviour before relying on it.
void store(string filename)
{
}
// Not implemented: the body is empty, so calling this does nothing.
// NOTE(review): presumably meant to read previously stored data back
// from `filename` — confirm the intended behaviour before relying on it.
void load(string filename)
{
}
// Entry point: trains on the default tuple file and reports how many
// distinct verb entries were produced.
int main()
{
    std::cout << "Started Application..." << std::endl;

    train(filename);

    const auto verbCount = verbpairs.size();
    std::cout << "Size of verb pairs is " << verbCount << std::endl;
    return 0;
}
Tuple.h
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <istream>
using namespace std;
/**
 * One record of the verb-argument tuple file.
 *
 * A record is a single line of fields, each followed by one space or tab:
 *     <count> <verb> <frame> [<arg> ...]
 */
class Tuple
{
public:
    int count = 0;                  // first field, parsed as a decimal number
    std::string verb;               // second field
    std::string frame;              // third field
    std::vector<std::string> args;  // every remaining field, in order

    /**
     * Parses one line and overwrites all fields of this object.
     *
     * Fields missing from a short or empty line are left empty (count
     * becomes 0). Two adjacent separators yield an empty field. The
     * count is taken from the leading decimal digits of the first field.
     *
     * @param s  the line to parse, without its trailing newline.
     */
    void parseTuple(const std::string& s)
    {
        // Per-line trace; note that std::endl flushes on every record.
        std::cout << "parsing.... " << s << std::endl;

        const std::string::size_type len = s.length();
        std::string::size_type pos = 0;

        // Returns the text up to the next space/tab (or the end of the
        // line) and steps over exactly one separator. The bound is checked
        // before each character is read, so short lines are safe.
        auto nextField = [&s, &pos, len]() -> std::string {
            std::string field;
            while (pos < len && s[pos] != ' ' && s[pos] != '\t')
            {
                field += s[pos];
                ++pos;
            }
            if (pos < len)
            {
                ++pos;
            }
            return field;
        };

        const std::string countField = nextField();
        int value = 0;
        for (const char c : countField)
        {
            if (c < '0' || c > '9')
            {
                break;  // stop at the first non-digit instead of folding it in
            }
            value = value * 10 + (c - '0');
        }
        count = value;

        verb = nextField();
        frame = nextField();

        // Drop the previous record's arguments. Without this, an object
        // that is reused for several lines keeps accumulating them.
        args.clear();
        while (pos < len)
        {
            args.push_back(nextField());
        }
    }
};
and VerbPair.h
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <istream>
using namespace std;
/**
 * A verb together with its accumulated frequency.
 */
class VerbPair
{
public:
    std::string verb;  // the verb text
    int freq = 0;      // accumulated frequency; starts at zero so that a
                       // default-constructed pair never holds garbage
};
Since there is duplicate data, why are you using a vector? Just use a map<string,int>. Each time you encounter a word, increment the corresponding value in the map.