How to use Grid Search CV for extremely large datasets (>10GB) using data sequence custom generator class

64 Views Asked by At

I have a dataset of dim (400000,64,63). I am using this dataset in keras deep learning model. My aim is to perform hyperparameter tuning but the problem is my dataset is very large, so I have to use chunks or batches of dataset. I am using custom dataset generator class for that purpose. Now the problem I am encountering is that Grid Search CV fit function is not working with my sequence generator. I am getting an error- TypeError: fit() missing 1 required positional argument: 'y'. I am not able to understand why it is happening.

This is what I am doing:

import numpy as np
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import LSTM, Dense
from keras.optimizers import Adam,Adamax,Ftrl,Adadelta,Nadam
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from keras.utils import Sequence

# Not actual data but dummy data of same size
class S:
    X=np.random.randn(400000,64,63)
    Y=np.random.randint(2,size=(400000,2))

# Define the custom Sequence class
class MySequence(Sequence):
    def __init__(self, S, batch_size):
        self.data = S.X
        self.targets=S.Y
        self.batch_size = batch_size
        self.indices = np.arange(len(self.data))

    def __len__(self):
        return int(np.ceil(len(self.data) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_X = self.data[batch_indices]
        batch_y = self.targets[batch_indices]
        return batch_X, batch_y
    
    def on_epoch_end(self):
        np.random.shuffle(self.indices)




# Define the Keras model
def create_model(activation='relu', dropout_rate=0.2, optimizer='Ftrl',shapes=None):
    model = Sequential()
    model.add(LSTM(units=64, dropout=dropout_rate, recurrent_dropout=0.2,activation=activation, return_sequences=True, input_shape=shapes[0], stateful=False))
    model.add(LSTM(units=32, dropout=dropout_rate, recurrent_dropout=0.2,activation=activation, return_sequences=False, stateful=False))
    model.add(Dense(shapes[1], activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Define the grid search parameters
param_grid = {'batch_size': [32], 'epochs': [2],  'activation': ['relu'],'optimizer': ['Adam','Ftrl','Adamax', 'Nadam']}
input_shape=(S.X.shape[1], S.X[:,:,1:].shape[2])
output_shape=S.Y.shape[1]                                                                 

# Create the Keras object
seq = MySequence(S, batch_size=32)
model=KerasClassifier(build_fn=create_model, verbose=0, shapes=[input_shape, output_shape])
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=2,n_jobs=-1, error_score="raise")

# Fit the model using the Sequence object
grid.fit(seq)

I am expecting that I should not get memory error while fitting. The grid.fit(seq) now should perform internal looping on the batches of whole dataset and finally perform tuning.

0

There are 0 best solutions below