How to pass an inference request of type tritonclient.http to a multi-model endpoint in AWS SageMaker?


Setup: a multi-model endpoint in AWS SageMaker with NVIDIA Triton Inference Server, based on the example notebook here -> https://github.com/aws/amazon-sagemaker-examples/blob/main/inference/nlp/realtime/triton/multi-model/t5_pytorch_python-backend/t5_pytorch_python-backend.ipynb. We construct a request payload using the httpclient module from tritonclient.http, i.e. httpclient.InferenceServerClient.generate_request_body(inputs, outputs=outputs, ...).

The examples I have seen pass a list for both the inputs and the outputs parameter. Are there any examples of passing just one input instead of a list? Also, on the backend, the code that processes this request is the model.py file (code below), and it looks like it only accepts a list of inputs rather than a single input. Is there a way to override that?
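For illustration, here is roughly what I mean by passing a single input (just a sketch under my assumptions: the tensor name "input_ids" is a placeholder, and I still wrap the single InferInput in a one-element list because generate_request_body appears to expect lists):

import numpy as np
import tritonclient.http as httpclient


def get_single_input_payload_binary(token_ids):
    # Hypothetical single-input payload: one tensor only, no attention mask.
    single_input = httpclient.InferInput("input_ids", token_ids.shape, "INT32")
    single_input.set_data_from_numpy(token_ids.astype(np.int32), binary_data=True)

    # The API still takes a list, so the one input is wrapped in a one-element list.
    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        [single_input]
    )
    return request_body, header_length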

import tritonclient.http as httpclient
import numpy as np


def get_text_payload_binary(model_name, text):
    inputs = []
    outputs = []
    # tokenize_text is a helper (defined elsewhere) that returns numpy arrays.
    input_ids, attention_mask = tokenize_text(model_name, text)
    inputs.append(httpclient.InferInput("input_ids", input_ids.shape, "INT32"))
    inputs.append(httpclient.InferInput("attention_mask", attention_mask.shape, "INT32"))

    inputs[0].set_data_from_numpy(input_ids.astype(np.int32), binary_data=True)
    inputs[1].set_data_from_numpy(attention_mask.astype(np.int32), binary_data=True)

    # Request the model's output tensor by name ("output" for t5-small, "logits" otherwise).
    output_name = "output" if model_name == "t5-small" else "logits"
    outputs.append(httpclient.InferRequestedOutput(output_name, binary_data=True))

    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs
    )
    return request_body, header_length
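For context, this is roughly how I send the generated binary payload to the multi-model endpoint (a sketch based on the linked notebook; the endpoint name and target model archive name are placeholders from my setup):

import boto3

runtime = boto3.client("sagemaker-runtime")

request_body, header_length = get_text_payload_binary("t5-small", "translate English to German: Hello")

# The json-header-size part of the content type tells the Triton container
# where the JSON request header ends and the binary tensor data begins.
response = runtime.invoke_endpoint(
    EndpointName="triton-mme-endpoint",  # placeholder endpoint name
    ContentType="application/vnd.sagemaker-triton.binary+json;json-header-size={}".format(header_length),
    Body=request_body,
    TargetModel="t5_pytorch.tar.gz",  # placeholder model archive in the MME S3 prefix
)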

model.py

import numpy as np
import sys
import os
import json
from pathlib import Path

import torch

import triton_python_backend_utils as pb_utils

class TritonPythonModel:
  
    def initialize(self, args):
         ...


    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.
        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest
        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        responses = []
        for request in requests:
            input_ids = pb_utils.get_input_tensor_by_name(request, "input_ids")
            input_ids = input_ids.as_numpy()
            input_ids = torch.as_tensor(input_ids).long().cuda()
            attention_mask = pb_utils.get_input_tensor_by_name(request, "attention_mask")
            attention_mask = attention_mask.as_numpy()
            attention_mask = torch.as_tensor(attention_mask).long().cuda()
            inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
            translation = self.model.generate(**inputs, num_beams=1)
           
            np_translation =  translation.cpu().int().detach().numpy()
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[
                    pb_utils.Tensor(
                        "output",
                        np_translation.astype(self.output_dtype)
                    )
                ]
            )
            responses.append(inference_response)
        return responses