MemoryError when trying to save a pandas DataFrame to disk using to_hdf()


I am trying to save a relatively big DataFrame (663+ MB of memory usage according to the info() method) to an HDFStore using the to_hdf() method.

But every time I run it, I get a MemoryError.

Hence, I have two questions regarding this:

  1. Is the memory error primarily because I don't have enough RAM (I have 16 GB)?
  2. How can I save the DataFrame given my RAM restriction? Is there a way to append it to the HDFStore in chunks (roughly as in the sketch below)?

I know it is possible to save it in the 'table' format rather than 'fixed', which allows appending, but I have not tried that yet, mostly because I was hoping for a simpler alternative.
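For context, the chunked append I have in mind would look roughly like this (the chunk size is just a guess, and I'm assuming the 'table' format since that is what supports appending):

    import pandas as pd

    chunk_size = 100000  # rows per write; tune this to the available RAM

    with pd.HDFStore('strat_train_data.h5', mode='w') as store:
        for start in range(0, len(strat_train_set), chunk_size):
            chunk = strat_train_set.iloc[start:start + chunk_size]
            # format='table' is the appendable HDF5 layout
            store.append('strat_train_set', chunk, format='table')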

Thanks a lot in advance :)

P.S. I would like to add that I tried to_pickle() and that worked smoothly, so my assumption is that it can't be a physical RAM problem.
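The to_pickle() call that worked was essentially just this one-liner (the file name here is only illustrative):

    strat_train_set.to_pickle('strat_train_data.pkl')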

Error:

MemoryError                               Traceback (most recent call last)
<ipython-input-10-05bb5886160a> in <module>()
      1 train_data = pd.HDFStore('strat_train_data.h5')
----> 2 strat_train_set.to_hdf(train_data, 'strat_train_set')

C:\Users\IQBALSH\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\generic.py in to_hdf(self, path_or_buf, key, **kwargs)
   1280 
   1281         from pandas.io import pytables
-> 1282         return pytables.to_hdf(path_or_buf, key, self, **kwargs)
   1283 
   1284     def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):

C:\Users\IQBALSH\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\pytables.py in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, **kwargs)
    268             f(store)
    269     else:
--> 270         f(path_or_buf)
    271 
    272 

C:\Users\IQBALSH\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\pytables.py in <lambda>(store)
    260         f = lambda store: store.append(key, value, **kwargs)
    261     else:
--> 262         f = lambda store: store.put(key, value, **kwargs)
    263 
    264     path_or_buf = _stringify_path(path_or_buf)

C:\Users\IQBALSH\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\pytables.py in put(self, key, value, format, append, **kwargs)
    869             format = get_option("io.hdf.default_format") or 'fixed'
    870         kwargs = self._validate_format(format, kwargs)
--> 871         self._write_to_group(key, value, append=append, **kwargs)
    872 
    873     def remove(self, key, where=None, start=None, stop=None):

C:\Users\IQBALSH\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\pytables.py in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
   1311 
   1312         # write the object
-> 1313         s.write(obj=value, append=append, complib=complib, **kwargs)
   1314 
   1315         if s.is_table and index:

C:\Users\IQBALSH\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\pytables.py in write(self, obj, **kwargs)
   2890             # I have no idea why, but writing values before items fixed #2299
   2891             blk_items = data.items.take(blk.mgr_locs)
-> 2892             self.write_array('block%d_values' % i, blk.values, items=blk_items)
   2893             self.write_index('block%d_items' % i, blk_items)
   2894 

C:\Users\IQBALSH\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\pytables.py in write_array(self, key, value, items)
   2658             vlarr = self._handle.create_vlarray(self.group, key,
   2659                                                 _tables().ObjectAtom())
-> 2660             vlarr.append(value)
   2661         else:
   2662             if empty_array:

C:\Users\IQBALSH\AppData\Local\Continuum\Anaconda3\lib\site-packages\tables\vlarray.py in append(self, sequence)
    517         atom = self.atom
    518         if not hasattr(atom, 'size'):  # it is a pseudo-atom
--> 519             sequence = atom.toarray(sequence)
    520             statom = atom.base
    521         else:

C:\Users\IQBALSH\AppData\Local\Continuum\Anaconda3\lib\site-packages\tables\atom.py in toarray(self, object_)
   1051 
   1052     def toarray(self, object_):
-> 1053         buffer_ = self._tobuffer(object_)
   1054         array = numpy.ndarray(buffer=buffer_, dtype=self.base.dtype,
   1055                               shape=len(buffer_))

C:\Users\IQBALSH\AppData\Local\Continuum\Anaconda3\lib\site-packages\tables\atom.py in _tobuffer(self, object_)
   1171 
   1172     def _tobuffer(self, object_):
-> 1173         return pickle.dumps(object_, pickle.HIGHEST_PROTOCOL)
   1174 
   1175     def fromarray(self, array):

MemoryError: 