Background

Currently, we access Triton Server through ordinary HTTP API requests.

For a recent Sbert Full model (which returns a vector for every character in addition to the average sentence vector, increasing the transferred payload by roughly 2,567,684 characters), network requests became slow.

On the office network (about 2.5 MB/s), one request takes about 1.2 seconds.

Even over localhost, one request still takes about 0.51 seconds.

That performance is too poor for engineering use.

(All times in this article are averages over 100 consecutive runs.)

To improve this, we switch to the Triton client.

Use the Triton client to access Triton Server

Triton client installation
pip install nvidia-pyindex
pip install tritonclient[all]
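
Once installed, a quick way to confirm that the client can reach the server is a liveness check. This is a minimal sketch; the URL and the model name shansou_sbert_full follow the scripts further below.

import tritonclient.http as httpclient

# Connect to Triton Server over HTTP (default port 8000)
client = httpclient.InferenceServerClient(url="localhost:8000")

# Basic health checks before benchmarking
print("server live: ", client.is_server_live())
print("server ready:", client.is_server_ready())
print("model ready: ", client.is_model_ready("shansou_sbert_full"))
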
The speed of the Triton client in different modes

Example code: github.com/triton-infe…

Below are a few runnable scripts adapted to the Sbert Full model.

SHM: passes the model inputs and outputs through system shared memory.

GRPC: transfers the model inputs and outputs over the gRPC protocol.

cudashm: passes the model inputs and outputs through CUDA shared memory.

http_async: sends the model inputs and outputs with asynchronous HTTP requests.
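
To make the shared-memory modes concrete, here is a minimal, server-free sketch of the mechanism they rely on: the client writes tensors into a named system shared-memory region that the server can map directly, instead of serializing them into the request body. The region name and key are illustrative only.

import numpy as np
import tritonclient.utils.shared_memory as shm

data = np.full(shape=(1, 256), fill_value=1, dtype=np.int64)
byte_size = data.size * data.itemsize

# Create a named system shared-memory region and copy the tensor into it
handle = shm.create_shared_memory_region("demo_data", "/demo_simple", byte_size)
shm.set_shared_memory_region(handle, [data])

# Anyone who maps the same region (e.g. Triton Server) can read the tensor back
roundtrip = shm.get_contents_as_numpy(handle, np.int64, (1, 256))
print(np.array_equal(data, roundtrip))

shm.destroy_shared_memory_region(handle)
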

python3 ./common_sbert_full.py (HTTP mode):

When shared memory is used, the time is about 0.032 seconds

When shared memory is not used, the time is about 0.036 seconds

python3 ./common_sbert_full.py -i GRPC -u localhost:8001 (gRPC mode):

When shared memory is used, the time is about 0.032 seconds

When shared memory is not used, about 0.035 seconds (even slower with deflate or gzip compression, about 0.04 seconds)
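
The compression test relies on the compression_algorithm argument of the gRPC client's infer() call (available in recent tritonclient releases). A minimal sketch, assuming the same model and input layout as common_sbert_full.py below:

import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# Same input layout as common_sbert_full.py: 3 INT64 tensors of shape [1, 256]
inputs = [grpcclient.InferInput(f"INPUT__{i}", [1, 256], "INT64") for i in range(3)]
inputs_data = [np.full(shape=(1, 256), fill_value=v, dtype=np.int64) for v in (1, 1, 0)]
[inputs[i].set_data_from_numpy(inputs_data[i]) for i in range(3)]
outputs = [grpcclient.InferRequestedOutput(f"OUTPUT__{i}") for i in range(2)]

# compression_algorithm can be "deflate" or "gzip"
results = client.infer(model_name="shansou_sbert_full",
                       inputs=inputs,
                       outputs=outputs,
                       compression_algorithm="gzip")
print(results.as_numpy("OUTPUT__0").shape)

Compression reduces the bytes on the wire but adds CPU work on both sides, which likely explains why it was slower over localhost.
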

cudashm_sbert_full.py accesses the Sbert Full model through CUDA shared memory; the time is about 0.0315 seconds (this script is not reproduced below; a sketch is included after the reference code).

http_async_sbert_full.py: with concurrency set to 1 (equivalent to synchronous mode), the time is about 0.035 seconds; with concurrency set to 50 (at 100 the GPU memory is insufficient), the time drops to about 0.014 seconds per request.

Conclusion

System shared memory and CUDA shared memory are the fastest (nearly identical), at about 0.032 seconds.

The Triton client over plain HTTP or gRPC is next (also nearly identical), at about 0.036 seconds.

With HTTP or gRPC, this is more than 13 times faster than the original 0.51-second HTTP request.

With system or CUDA shared memory, it is over 15 times faster.

With asynchronous HTTP at 50-way concurrency, it is over 35 times faster.
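
These ratios follow directly from the measured times against the original 0.51-second localhost request; a quick check:

baseline = 0.51            # original localhost HTTP request, in seconds

print(baseline / 0.036)    # HTTP / gRPC without shared memory     -> ~14.2
print(baseline / 0.032)    # system / CUDA shared memory           -> ~15.9
print(baseline / 0.014)    # asynchronous HTTP, 50-way concurrency -> ~36.4
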

Reference code

common_sbert_full.py

import argparse
import sys
import time

import numpy as np

import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient
import tritonclient.utils as utils
import tritonclient.utils.shared_memory as shm

FLAGS = None

INPUT_NUM = 3
OUTPUT_NUM = 2
LOOP_NUM = 100


def infer_and_validate(use_shared_memory, inputs_data):
    if use_shared_memory:
        byte_size = inputs_data[0].size * inputs_data[0].itemsize
        [inputs[i].set_shared_memory(f"input{i}_data", byte_size) for i in range(INPUT_NUM)]
        [outputs[i].set_shared_memory(f"output{i}_data", outputs_byte_size[i]) for i in range(OUTPUT_NUM)]
    else:
        [inputs[i].set_data_from_numpy(inputs_data[i]) for i in range(INPUT_NUM)]
        [outputs[i].unset_shared_memory() for i in range(OUTPUT_NUM)]

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read the results back (from shared memory or from the response body).
    for i in range(OUTPUT_NUM):
        output = results.get_output(f"OUTPUT__{i}")
        if output is not None:
            if use_shared_memory:
                if protocol == "grpc":
                    output_data = shm.get_contents_as_numpy(
                        shm_op_handles[i],
                        utils.triton_to_np_dtype(output.datatype),
                        output.shape)
                else:
                    output_data = shm.get_contents_as_numpy(
                        shm_op_handles[i],
                        utils.triton_to_np_dtype(output['datatype']),
                        output['shape'])
            else:
                output_data = results.as_numpy(f'OUTPUT__{i}')
        else:
            print(f"OUTPUT__{i} is missing in the response.")
            sys.exit(1)


# Tests whether the same InferInput and InferRequestedOutput objects can be
# reused for different inferences, with and without shared memory.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose',
                        action="store_true",
                        required=False,
                        default=False,
                        help='Enable verbose output')
    parser.add_argument('-i', '--protocol',
                        type=str,
                        required=False,
                        default='HTTP',
                        help='Protocol (HTTP/gRPC) used to communicate with ' +
                        'the inference service. Default is HTTP.')
    parser.add_argument('-u', '--url',
                        type=str,
                        required=False,
                        default='localhost:8000',
                        help='Inference server URL. Default is localhost:8000.')

    FLAGS = parser.parse_args()

    protocol = FLAGS.protocol.lower()

    try:
        if protocol == "grpc":
            # Create gRPC client for communicating with the server
            triton_client = grpcclient.InferenceServerClient(
                url=FLAGS.url, verbose=FLAGS.verbose)
        else:
            # Create HTTP client for communicating with the server
            triton_client = httpclient.InferenceServerClient(
                url=FLAGS.url, verbose=FLAGS.verbose)
    except Exception as e:
        print("client creation failed: " + str(e))
        sys.exit(1)

    # Make sure no shared memory regions are registered with the server.
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()

    # The Sbert Full model takes 3 input tensors of shape [1, 256] (INT64)
    # and returns 2 output tensors: the average sentence vector ([768], FP32)
    # and the per-character vectors ([256, 768], FP32).
    model_name = "shansou_sbert_full"
    model_version = "1"

    input_byte_size = 256 * 8
    outputs_byte_size = [768 * 4, 256 * 768 * 4]

    # Create OUTPUT__0 and OUTPUT__1 regions in shared memory and store the handles
    shm_op_handles = [shm.create_shared_memory_region(f"output{i}_data", f"/output{i}_simple",
                                                      outputs_byte_size[i]) for i in range(OUTPUT_NUM)]
    # Register the output shared-memory regions with Triton Server
    [triton_client.register_system_shared_memory(f"output{i}_data", f"/output{i}_simple",
                                                 outputs_byte_size[i]) for i in range(OUTPUT_NUM)]
    # Create the input regions in shared memory and store the handles
    shm_ip_handles = [shm.create_shared_memory_region(f"input{i}_data", f"/input{i}_simple",
                                                      input_byte_size) for i in range(INPUT_NUM)]

    # Put input data values into shared memory
    # TODO: is this necessary?
    inputs_data = [np.full(shape=(1, 256), fill_value=value, dtype=np.int64) for value in (1, 1, 0)]
    [shm.set_shared_memory_region(shm_ip_handles[i], [inputs_data[i]]) for i in range(INPUT_NUM)]

    # Register the input shared-memory regions with Triton Server
    [triton_client.register_system_shared_memory(f"input{i}_data", f"/input{i}_simple",
                                                 input_byte_size) for i in range(INPUT_NUM)]

    # Build the InferInput / InferRequestedOutput objects for the chosen protocol
    infer_input_f = grpcclient.InferInput if protocol == "grpc" else httpclient.InferInput
    inputs = [infer_input_f(f'INPUT__{i}', [1, 256], "INT64") for i in range(INPUT_NUM)]
    infer_output_f = grpcclient.InferRequestedOutput if protocol == "grpc" else httpclient.InferRequestedOutput
    outputs = [infer_output_f(f'OUTPUT__{i}') for i in range(OUTPUT_NUM)]

    # Total time for LOOP_NUM requests using shared memory
    start = time.perf_counter()
    for _ in range(LOOP_NUM):
        infer_and_validate(True, inputs_data)
    end = time.perf_counter()
    print("infer: ", end - start)

    # Total time for LOOP_NUM requests without shared memory
    start = time.perf_counter()
    for _ in range(LOOP_NUM):
        infer_and_validate(False, inputs_data)
    end = time.perf_counter()
    print("infer: ", end - start)

    triton_client.unregister_system_shared_memory()
    for handle in (shm_ip_handles + shm_op_handles):
        shm.destroy_shared_memory_region(handle)

http_async_sbert_full.py

import argparse
import sys
import time

import numpy as np

import tritonclient.http as httpclient


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose',
                        action="store_true",
                        required=False,
                        default=False,
                        help='Enable verbose output')
    parser.add_argument('-u', '--url',
                        type=str,
                        required=False,
                        default='localhost:8000',
                        help='Inference server URL. Default is localhost:8000.')

    FLAGS = parser.parse_args()

    request_count = 50
    try:
        # Need to specify large enough concurrency to issue all the
        # inference requests to the server in parallel.
        triton_client = httpclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose, concurrency=request_count)
    except Exception as e:
        print("context creation failed: " + str(e))
        sys.exit()

    model_name = 'shansou_sbert_full'

    # Prepare the 3 INT64 inputs of shape [1, 256] and the 2 requested outputs
    INPUT_NUM = 3
    OUTPUT_NUM = 2
    inputs = [httpclient.InferInput(f'INPUT__{i}', [1, 256], "INT64") for i in range(INPUT_NUM)]
    inputs_data = [np.full(shape=(1, 256), fill_value=value, dtype=np.int64) for value in (1, 1, 0)]
    [inputs[i].set_data_from_numpy(inputs_data[i], binary_data=True) for i in range(INPUT_NUM)]
    outputs = [httpclient.InferRequestedOutput(f'OUTPUT__{i}', binary_data=True) for i in range(OUTPUT_NUM)]

    # Issue 2 batches of request_count concurrent requests (100 requests in total)
    start = time.perf_counter()
    for _ in range(2):
        async_requests = [triton_client.async_infer(model_name=model_name, inputs=inputs, outputs=outputs)
                          for i in range(request_count)]
        results = [async_request.get_result().get_response() for async_request in async_requests]
    end = time.perf_counter()
    print("async infer: ", end - start)
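
cudashm_sbert_full.py is not reproduced in this post. The following is a minimal sketch of the CUDA shared-memory path, based on common_sbert_full.py above and the Triton CUDA shared-memory client example; the GPU device id 0 and the single timing loop are assumptions, not the original script.

import time

import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils as utils
import tritonclient.utils.cuda_shared_memory as cudashm

INPUT_NUM = 3
OUTPUT_NUM = 2
LOOP_NUM = 100

triton_client = httpclient.InferenceServerClient(url="localhost:8000")
triton_client.unregister_cuda_shared_memory()

model_name = "shansou_sbert_full"
input_byte_size = 256 * 8
outputs_byte_size = [768 * 4, 256 * 768 * 4]

# Create CUDA shared-memory regions on GPU 0 and register their raw CUDA IPC
# handles with the server
shm_op_handles = [cudashm.create_shared_memory_region(f"output{i}_data", outputs_byte_size[i], 0)
                  for i in range(OUTPUT_NUM)]
[triton_client.register_cuda_shared_memory(f"output{i}_data", cudashm.get_raw_handle(shm_op_handles[i]),
                                           0, outputs_byte_size[i]) for i in range(OUTPUT_NUM)]

shm_ip_handles = [cudashm.create_shared_memory_region(f"input{i}_data", input_byte_size, 0)
                  for i in range(INPUT_NUM)]
inputs_data = [np.full(shape=(1, 256), fill_value=v, dtype=np.int64) for v in (1, 1, 0)]
[cudashm.set_shared_memory_region(shm_ip_handles[i], [inputs_data[i]]) for i in range(INPUT_NUM)]
[triton_client.register_cuda_shared_memory(f"input{i}_data", cudashm.get_raw_handle(shm_ip_handles[i]),
                                           0, input_byte_size) for i in range(INPUT_NUM)]

# Point the request inputs and outputs at the registered regions
inputs = [httpclient.InferInput(f"INPUT__{i}", [1, 256], "INT64") for i in range(INPUT_NUM)]
[inputs[i].set_shared_memory(f"input{i}_data", input_byte_size) for i in range(INPUT_NUM)]
outputs = [httpclient.InferRequestedOutput(f"OUTPUT__{i}") for i in range(OUTPUT_NUM)]
[outputs[i].set_shared_memory(f"output{i}_data", outputs_byte_size[i]) for i in range(OUTPUT_NUM)]

# Total time for LOOP_NUM requests through CUDA shared memory
start = time.perf_counter()
for _ in range(LOOP_NUM):
    results = triton_client.infer(model_name=model_name, inputs=inputs, outputs=outputs)
    for i in range(OUTPUT_NUM):
        output = results.get_output(f"OUTPUT__{i}")
        output_data = cudashm.get_contents_as_numpy(
            shm_op_handles[i], utils.triton_to_np_dtype(output['datatype']), output['shape'])
end = time.perf_counter()
print("cudashm infer: ", end - start)

triton_client.unregister_cuda_shared_memory()
for handle in (shm_ip_handles + shm_op_handles):
    cudashm.destroy_shared_memory_region(handle)

The flow mirrors the system shared-memory script: create a region, register it with the server, point the inputs and outputs at it, then read the outputs back from the region after each inference.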