I am deploying a seq2seq model for text-to-SQL generation and want to be sure I am on the right path

This is my model, which works on batch data, and I want to deploy it to process a single input: a natural-language question. You can find the classes for the embedding layer, encoder, decoder, etc. in this repository: https://github.com/wangpinggl/TREQS/tree/master/LeafNATS/modules


import os
import time
import torch
from torch.autograd import Variable
from seq2sql.model_seq2seq_base import modelSeq2SeqBase
from LeafNATS.data.seq2sql.process_batch_cqa_v1 import process_batch
from LeafNATS.modules.embedding.nats_embedding import natsEmbedding
from LeafNATS.modules.encoder.encoder_rnn import EncoderRNN
from LeafNATS.modules.encoder2decoder.nats_encoder2decoder import natsEncoder2Decoder
from LeafNATS.modules.attention.nats_attention_encoder import AttentionEncoder
from LeafNATS.modules.attention.nats_attention_decoder import AttentionDecoder
from LeafNATS.utils.utils import *

class modelABS(modelSeq2SeqBase):
    def __init__(self, args):
        super().__init__(args=args)
        
    def build_scheduler(self, optimizer):
        '''
        Schedule Learning Rate
        '''
        scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer=optimizer, step_size=self.args.step_size, 
            gamma=self.args.step_decay)
        
        return scheduler
        
    def build_batch(self, batch_id):
        '''
        get batch data
        '''
        output = process_batch(
            batch_id=batch_id, 
            path_=os.path.join('..', 'nats_results'), 
            fkey_=self.args.task, 
            batch_size=self.args.batch_size, 
            vocab2id=self.batch_data['vocab2id'], 
            max_lens=[self.args.src_seq_len, self.args.trg_seq_len])
            
        self.batch_data['ext_id2oov'] = output['ext_id2oov']
        self.batch_data['src_var'] = output['src_var'].to(self.args.device)
        self.batch_data['batch_size'] = self.batch_data['src_var'].size(0)
        self.batch_data['src_seq_len'] = self.batch_data['src_var'].size(1)
        self.batch_data['src_mask_pad'] = output['src_mask_pad'].to(self.args.device)
        
        if self.args.task == 'train' or self.args.task == 'validate':
            self.batch_data['trg_input'] = output['trg_input_var'].to(self.args.device)
            # different from seq2seq models.
            self.batch_data['trg_output'] = output['trg_output_var'].to(self.args.device)
            self.batch_data['trg_seq_len'] = self.batch_data['trg_input'].size(1)
        else:
            self.batch_data['src_mask_unk'] = output['src_mask_unk'].to(self.args.device)
            self.batch_data['src_txt'] = output['src_txt']
            self.batch_data['trg_txt'] = output['trg_txt']
            self.batch_data['trg_seq_len'] = 1
        
    def build_models(self):
        '''
        Build all models.
        In this model, source and target share embeddings.
        '''
        self.train_models['embedding'] = natsEmbedding(
            vocab_size = self.batch_data['vocab_size'],
            emb_dim = self.args.emb_dim,
            share_emb_weight = True
        ).to(self.args.device)
        
        self.train_models['encoder'] = EncoderRNN(
            self.args.emb_dim, self.args.src_hidden_dim,
            self.args.nLayers, 'lstm',
            device = self.args.device
        ).to(self.args.device)
        
        self.train_models['encoder2decoder'] = natsEncoder2Decoder(
            src_hidden_size = self.args.src_hidden_dim,
            trg_hidden_size = self.args.trg_hidden_dim,
            rnn_network = 'lstm',
            device = self.args.device
        ).to(self.args.device)
        
        self.train_models['decoderRNN'] = torch.nn.LSTMCell(
            self.args.emb_dim+self.args.trg_hidden_dim, 
            self.args.trg_hidden_dim
        ).to(self.args.device)
        
        self.train_models['attnEncoder'] = AttentionEncoder(
            self.args.src_hidden_dim,
            self.args.trg_hidden_dim,
            attn_method='luong_general',
            repetition='temporal'
        ).to(self.args.device)
        
        self.train_models['attnDecoder'] = AttentionDecoder(
            self.args.trg_hidden_dim,
            attn_method='luong_general'
        ).to(self.args.device)
        
        self.train_models['wrapDecoder'] = torch.nn.Linear(
            self.args.src_hidden_dim*2+self.args.trg_hidden_dim*2,
            self.args.trg_hidden_dim, bias=True
        ).to(self.args.device)
        
        self.train_models['genPrb'] = torch.nn.Linear(
            self.args.emb_dim+self.args.src_hidden_dim*2+self.args.trg_hidden_dim, 1
        ).to(self.args.device)
        
        # decoder to vocab
        self.train_models['decoder2proj'] = torch.nn.Linear(
            self.args.trg_hidden_dim, self.args.emb_dim, bias=False
        ).to(self.args.device)
                    
    def build_encoder(self):
        '''
        Encoder Pipeline
        self.pipe_data = {
            'encoder': {},
            'decoderA': {},
            'decoderB': {'accu_attn': [], 'last_word': word}}
        '''
        src_emb = self.train_models['embedding'].get_embedding(
            self.batch_data['src_var'])
        src_enc, hidden_encoder = self.train_models['encoder'](src_emb)
        trg_hidden0 = self.train_models['encoder2decoder'](hidden_encoder)        
        # set up pipe_data pass to decoder
        self.pipe_data['encoder'] = {}
        self.pipe_data['encoder']['src_emb'] = src_emb
        self.pipe_data['encoder']['src_enc'] = src_enc        
        self.pipe_data['decoderB'] = {}
        self.pipe_data['decoderB']['hidden'] = trg_hidden0        
        self.pipe_data['decoderB']['h_attn'] = Variable(torch.zeros(
            self.batch_data['batch_size'], self.args.trg_hidden_dim
        )).to(self.args.device)
        self.pipe_data['decoderB']['past_attn'] = Variable(torch.ones(
            self.batch_data['batch_size'], self.batch_data['src_seq_len']
        )/float(self.batch_data['src_seq_len'])).to(self.args.device)
        self.pipe_data['decoderB']['past_dech'] = Variable(torch.zeros(
            1, 1)).to(self.args.device)
        self.pipe_data['decoderB']['accu_attn'] = []
        
        self.pipe_data['decoderFF'] = {}
        self.pipe_data['decoderFF']['h_attn'] = []
        self.pipe_data['decoderFF']['attn'] = []
        self.pipe_data['decoderFF']['genPrb'] = []
        # when training get target embedding at the same time.
        if self.args.task == 'train' or self.args.task == 'validate':
            trg_emb = self.train_models['embedding'].get_embedding(
                self.batch_data['trg_input'])
            self.pipe_data['decoderFF']['trg_seq_emb'] = trg_emb

    def build_decoder_one_step(self, k=0):
        '''
        Decoder one-step
        '''
        # embedding at current decoding step
        if self.args.task == 'train' or self.args.task == 'validate':
            self.pipe_data['decoderA'] = self.pipe_data['decoderB']
            word_emb = self.pipe_data['decoderFF']['trg_seq_emb'][:, k]
        else:
            word_emb = self.train_models['embedding'].get_embedding(
                self.pipe_data['decoderA']['last_word'])
        h_attn = self.pipe_data['decoderA']['h_attn']
        dec_input = torch.cat((word_emb, h_attn), 1)
        hidden = self.pipe_data['decoderA']['hidden']
        past_attn = self.pipe_data['decoderA']['past_attn']
        accu_attn = self.pipe_data['decoderA']['accu_attn']
        past_dech = self.pipe_data['decoderA']['past_dech']
        
        hidden = self.train_models['decoderRNN'](dec_input, hidden)
        ctx_enc, attn, attn_ee = self.train_models['attnEncoder'](
            hidden[0], self.pipe_data['encoder']['src_enc'], 
            past_attn, self.batch_data['src_mask_pad'])
        # temporal attention
        past_attn = past_attn + attn_ee
        # decoder attention
        if k == 0:
            ctx_dec = Variable(torch.zeros(
                self.batch_data['batch_size'], self.args.trg_hidden_dim
            )).to(self.args.device)
        else:
            ctx_dec, _ = self.train_models['attnDecoder'](
                hidden[0], past_dech)
        past_dech = past_dech.transpose(0, 1) # seqL*batch*hidden
        dec_idx = past_dech.size(0)
        if k == 0:
            past_dech = hidden[0].unsqueeze(0) # seqL*batch*hidden
            past_dech = past_dech.transpose(0, 1) # batch*seqL*hidden
        else:
            past_dech = past_dech.contiguous().view(
                -1, self.args.trg_hidden_dim) # (seqL*batch)*hidden
            past_dech = torch.cat((past_dech, hidden[0]), 0) # ((seqL+1)*batch)*hidden
            past_dech = past_dech.view(
                dec_idx+1, self.batch_data['batch_size'], self.args.trg_hidden_dim
            ) # (seqL+1)*batch*hidden
            past_dech = past_dech.transpose(0, 1) # batch*(seqL+1)*hidden
        # wrap up.
        h_attn = self.train_models['wrapDecoder'](torch.cat((ctx_enc, ctx_dec, hidden[0]), 1))
        # pointer generator
        pt_input = torch.cat((word_emb, hidden[0], ctx_enc), 1)
        genPrb = torch.sigmoid(self.train_models['genPrb'](pt_input))
        
        # setup piped_data
        self.pipe_data['decoderB'] = {}
        self.pipe_data['decoderB']['h_attn'] = h_attn
        self.pipe_data['decoderB']['past_attn'] = past_attn
        self.pipe_data['decoderB']['hidden'] = hidden
        self.pipe_data['decoderB']['past_dech'] = past_dech
        self.pipe_data['decoderB']['accu_attn'] = [a for a in accu_attn]
        self.pipe_data['decoderB']['accu_attn'].append(attn)
        
        if self.args.task == 'train' or self.args.task == 'validate':
            self.pipe_data['decoderFF']['h_attn'].append(h_attn)
            self.pipe_data['decoderFF']['attn'].append(attn)
            self.pipe_data['decoderFF']['genPrb'].append(genPrb)
            if k == self.batch_data['trg_seq_len']-1:
                self.pipe_data['decoderFF']['h_attn'] = \
                torch.cat(self.pipe_data['decoderFF']['h_attn'], 0).view(
                    self.batch_data['trg_seq_len'], 
                    self.batch_data['batch_size'], 
                    self.args.trg_hidden_dim).transpose(0,1)
                
                self.pipe_data['decoderFF']['attn'] = \
                torch.cat(self.pipe_data['decoderFF']['attn'], 0).view(
                    self.batch_data['trg_seq_len'], 
                    self.batch_data['batch_size'], 
                    self.args.src_seq_len).transpose(0,1)
                
                self.pipe_data['decoderFF']['genPrb'] = \
                torch.cat(self.pipe_data['decoderFF']['genPrb'], 0).view(
                    self.batch_data['trg_seq_len'], 
                    self.batch_data['batch_size']).transpose(0,1)
        else:
            self.pipe_data['decoderFF']['h_attn'] = h_attn
            self.pipe_data['decoderFF']['attn'] = attn.unsqueeze(0)
            self.pipe_data['decoderFF']['genPrb'] = genPrb

    def build_vocab_distribution(self):
        '''
        Data flow from input to output.
        '''
        trg_out = self.pipe_data['decoderFF']['h_attn']
        trg_out = self.train_models['decoder2proj'](trg_out)
        trg_out = self.train_models['embedding'].get_decode2vocab(trg_out)
        trg_out = trg_out.view(
            self.batch_data['batch_size'], self.batch_data['trg_seq_len'], -1)
        prb = torch.softmax(trg_out, dim=2)
        
        vocab_size = self.batch_data['vocab_size']
        batch_size = self.batch_data['batch_size']
        # trg_seq_len = self.batch_data['trg_seq_len']
        src_seq_len = self.batch_data['src_seq_len']
        
        # pointer-generator calculate index matrix
        pt_idx = Variable(torch.zeros(1, 1, 1)).to(self.args.device)
        pt_idx = pt_idx.repeat(batch_size, src_seq_len, vocab_size)
        pt_idx.scatter_(2, self.batch_data['src_var'].unsqueeze(2), 1.0)
               
        p_gen = self.pipe_data['decoderFF']['genPrb']
        attn_ = self.pipe_data['decoderFF']['attn']
        
        prb_output = p_gen.unsqueeze(2)*prb + \
                     (1.0-p_gen.unsqueeze(2))*torch.bmm(attn_, pt_idx)
                
        return prb_output + 1e-20
    
    def build_pipelines(self):
        '''
        Build pipeline from input to output.
        Output is loss.
        Input is word one-hot encoding.
        '''
        self.build_encoder()
        for k in range(self.args.trg_seq_len):
            self.build_decoder_one_step(k)
        prb = self.build_vocab_distribution()
        
        pad_mask = torch.ones(self.batch_data['vocab_size']).to(self.args.device)
        pad_mask[self.batch_data['vocab2id']['<pad>']] = 0
        self.loss_criterion = torch.nn.NLLLoss(pad_mask).to(self.args.device)
        
        prb = torch.log(prb)
        loss = self.loss_criterion(
            prb.view(-1, self.batch_data['vocab_size']),
            self.batch_data['trg_output'].view(-1))
        
        return loss

This is my attempt to deploy the model for a single natural-language question (the app_worker method below).

Some of the calls below are still missing arguments; I am still working on those.
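
Before the engine code, this is the kind of single-question greedy decoding loop I am aiming for, written against the modelABS pipeline above. It is only a sketch: I assume the trained weights are already loaded into the model, that args.task is set to something other than 'train'/'validate' (so build_decoder_one_step reads 'last_word'), that src_mask_pad marks real tokens with 1, and that '<s>'/'</s>' are the start/stop token names (still to be confirmed against the vocab file):

def greedy_decode(model, ques_id, vocab2id, id2vocab, max_len=100):
    # fake a batch of size 1; the question is unpadded, so the mask is all ones
    device = model.args.device
    model.batch_data['src_var'] = torch.LongTensor([ques_id]).to(device)
    model.batch_data['batch_size'] = 1
    model.batch_data['src_seq_len'] = len(ques_id)
    model.batch_data['src_mask_pad'] = torch.ones(1, len(ques_id)).to(device)
    model.batch_data['trg_seq_len'] = 1  # as build_batch does for non-train tasks
    model.batch_data['vocab_size'] = len(vocab2id)

    model.pipe_data = {}
    model.build_encoder()
    last_word = torch.LongTensor([vocab2id['<s>']]).to(device)  # start token: my assumption
    out_words = []
    for k in range(max_len):  # max_len is a decoding cap I chose, not from the repo
        # outside train/validate, build_decoder_one_step reads 'decoderA',
        # so copy the previous step's 'decoderB' over manually
        model.pipe_data['decoderA'] = model.pipe_data['decoderB']
        model.pipe_data['decoderA']['last_word'] = last_word
        model.build_decoder_one_step(k)
        prb = model.build_vocab_distribution()  # shape: 1 x 1 x vocab_size
        wid = int(prb.view(-1).argmax())
        if id2vocab[wid] == '</s>':  # stop token: my assumption
            break
        out_words.append(id2vocab[wid])
        last_word = torch.LongTensor([wid]).to(device)
    return ' '.join(out_words)

With something like this, app_worker would reduce to tokenizing the question, mapping tokens to ids, and calling greedy_decode on the already-loaded model.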

'''
@author Tian Shi
Please contact [email protected]
'''
import glob
import json
import os
import pickle
import re
import shutil
import tokenize
import time
from pprint import pprint
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import torch
from torch.autograd import Variable
from LeafNATS.modules.embedding.nats_embedding import natsEmbedding
from LeafNATS.modules.encoder.encoder_rnn import EncoderRNN
from LeafNATS.modules.encoder2decoder.nats_encoder2decoder import natsEncoder2Decoder
from LeafNATS.modules.attention.nats_attention_encoder import AttentionEncoder
from LeafNATS.modules.attention.nats_attention_decoder import AttentionDecoder
from LeafNATS.modules.decoder.nats_decoder_pointer_generator import PointerGeneratorDecoder
#from LeafNATS.modules.attention import AttentionSelf
from LeafNATS.utils.utils import *

from LeafNATS.data.utils import *
from LeafNATS.utils.utils import show_progress

#nltk.download('punkt')
class End2EndBase(object):
    '''
    This engine is for end-to-end training of seq2seq models.
    It is a new version of the previous one.
    Our goal is to extend its application to all kinds of language generation tasks.
    '''

    def __init__(self, args=None):
        '''
        Initialize
        '''
        self.args = args
        self.base_models = {}
        self.train_models = {}
        self.batch_data = {}
        self.test_data = {}
        self.global_steps = 0

    def build_vocabulary(self):
        '''
        vocabulary
        '''
        raise NotImplementedError

    def build_models(self):
        '''
        Models:
            self.base_models: models that will not be trained (loaded pre-trained)
                Format: {'name1': model1, 'name2': model2}
            self.train_models: models that will be trained.
                Format: {'name1': model1, 'name2': model2}
        '''
        raise NotImplementedError

    def init_base_model_params(self):
        '''
        Initialize Base Model Parameters.
        self.base_models.
        '''
        raise NotImplementedError

    def build_pipelines(self):
        '''
        Pipelines and loss here.
        '''
        raise NotImplementedError

    def build_optimizer(self, params):
        '''
        define optimizer
        '''
        raise NotImplementedError

    def print_info_train(self):
        '''
        Print additional information on screen.
        '''
        print('NATS Message: ')

    def build_batch(self, batch_id):
        '''
        process batch data.
        '''
        raise NotImplementedError

    def test_worker(self):
        '''
        Used in decoding.
        Users can define their own decoding process.
        You do not have to worry about paths or preparing the input.
        '''
        raise NotImplementedError
    
    def app_worker(self):
        '''
        For application.
        '''
        question = input("ask a question: ")
        quest_tok = word_tokenize(question)
        print("TOK",quest_tok)
        # Read the file containing the vocabulary IDs
        file_path = "./nats_results/model/vocab"
        
        # Now we have mapping words to their vocabulary IDs
        vocab2id, id2vocab = construct_vocab(file_path,
                    max_size=200000,
                    mincount=5)
        '''print("vocab2id",vocab2id)
        print("id2vocab",id2vocab)'''
        #ques_id is the list of word indices
        ques_id = []
        for e in quest_tok:
            if e in vocab2id:
                ques_id.append(vocab2id[e])
        print("the list of word indices: ", ques_id)
        
        input_tensor = torch.tensor(ques_id).unsqueeze(0)
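        # NOTE: words missing from vocab2id are silently dropped above; since
        # the pointer-generator in build_vocab_distribution copies tokens
        # straight from batch_data['src_var'], a dropped OOV word can never be
        # copied. process_batch handles this during training via 'ext_id2oov'.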
        
        #====EMBEDDING==========
        vocab_size = len(vocab2id)  # 2353 for my vocabulary file
        emb_dim = 128
        # Initialize the embedding layer; share_emb_weight=True matches
        # build_models above (get_decode2vocab relies on the shared weights)
        embedding_layer = natsEmbedding(
            vocab_size=vocab_size,
            emb_dim=emb_dim,
            share_emb_weight=True)

        # Access the weights of the embedding layer
        embedding_weights = embedding_layer.embedding.weight

        # Print the shape of the weights tensor
        print("Shape of embedding weights:", embedding_weights.shape)

        #print("Embedding weights:", embedding_weights)
        tensor_embedded = embedding_layer.get_embedding(input_tensor)
        print("tensor_embedded: ", tensor_embedded)
        print(tensor_embedded.size())
        print(input_tensor.size())
        #embedding.get_embedding(input)
        
        #====ENCODER=====================================================================
        
        encoder = EncoderRNN(
            emb_dim=128,
            hidden_size=256,
            nLayers=1,
            rnn_network='lstm',
            bidirectional=True)
        hy_encoder, (ht_encoder, ct_encoder) = encoder(tensor_embedded)
        # bidirectional, so hy_encoder should be (batch, seq_len, 2*256)
        print(hy_encoder.size(), ht_encoder.size(), ct_encoder.size())
        
        #====ENCODER2DECODER=====
        encoder2decoder_instance = natsEncoder2Decoder(
            src_hidden_size=256,  # encoder hidden size (bidirectional output is 2*256=512)
            trg_hidden_size=128,  # decoder hidden size
            rnn_network='lstm')
        decoder_h0, decoder_c0 = encoder2decoder_instance((ht_encoder, ct_encoder))
        
        #====ATTENCODER==========
        # Settings mirror build_models above: 'luong_general' attention with
        # temporal repetition; trg_hidden_size is 128 (not 256) to match
        # encoder2decoder_instance.
        attention_encoder = AttentionEncoder(
            src_hidden_size=256,
            trg_hidden_size=128,
            attn_method='luong_general',
            repetition='temporal',
            src_hidden_doubled=True)  # encoder is bidirectional
        # Mirror build_encoder: past_attn starts uniform over the source
        # sequence, and src_mask_pad is all ones for a single unpadded question.
        src_seq_len = input_tensor.size(1)
        past_attn = torch.ones(1, src_seq_len) / float(src_seq_len)
        src_mask_pad = torch.ones(1, src_seq_len)
        ctx_enc, attn, attn_ee = attention_encoder(
            decoder_h0, hy_encoder, past_attn, src_mask_pad)

        #====ATTDECODER==========
        # Also match the trained model: hidden_size=128 and 'luong_general'.
        attention_decoder = AttentionDecoder(
            hidden_size=128, attn_method='luong_general')
        # Its forward takes (dec_hidden, past_dech), as in build_decoder_one_step,
        # and is only called from the second decoding step on (k > 0), once past
        # decoder hidden states have accumulated, so there is nothing to call yet.
        
        #==========PointerGeneratorDecoder====================
        # Instantiate the PointerGeneratorDecoder
        pointer_generator_decoder = PointerGeneratorDecoder(
            input_size=128,  # Size of the input vector
            src_hidden_size=256,  # Source side hidden size
            trg_hidden_size=128,  # Target side hidden size
            attn_method='luong_general',  # Alignment method
            repetition='temporal',  # Repetition handling method
            pointer_net=True,  # Turn on pointer network
            attn_decoder=True,  # Turn on attention decoder
            rnn_network='lstm',  # Type of RNN network
            device=torch.device("cpu")  # Specify the device (CPU or GPU)
        )
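        # NOTE: modelABS above never uses PointerGeneratorDecoder; it wires up
        # torch.nn.LSTMCell plus AttentionEncoder/AttentionDecoder by hand in
        # build_decoder_one_step, so this instantiation is probably not needed
        # to reproduce the trained model.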
       
    def app2Go(self):
        '''
        For the application.
        Don't overwrite.
        '''
        self.build_vocabulary()
        self.build_models()
        for model_name in self.train_models:
            self.base_models[model_name] = self.train_models[model_name]
        pprint(self.base_models)
        if len(self.base_models) > 0:
            self.init_base_model_params()

        for model_name in self.base_models:
            self.base_models[model_name].eval()
        with torch.no_grad():
            while True:
                self.app_worker()
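
One thing I noticed while writing this: app2Go() builds the modules with build_models() and loads the trained parameters into self.base_models via init_base_model_params(), but app_worker above constructs fresh, randomly initialized modules and never uses those loaded weights. A minimal sketch of what I think the first steps should look like instead (assuming the module names match the ones registered in modelABS.build_models):

        # reuse the already-loaded modules instead of re-instantiating them
        tensor_embedded = self.base_models['embedding'].get_embedding(input_tensor)
        hy_encoder, (ht_encoder, ct_encoder) = self.base_models['encoder'](tensor_embedded)
        decoder_h0, decoder_c0 = self.base_models['encoder2decoder']((ht_encoder, ct_encoder))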