logo
Browse Source

Add benchmark with qps test

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
main
Jael Gu 2 years ago
parent
commit
24e4837ac7
  1. 52
      benchmark/README.md
  2. 0
      benchmark/performance.py
  3. 4
      benchmark/performance.sh
  4. 131
      benchmark/qps_test.py

52
benchmark/README.md

@ -1,10 +1,13 @@
# Evaluate with Similarity Search
# Evaluation Method
## Introduction
- Model performance (WIP)
- Pipeline speed
Build a image classification system based on similarity search across embeddings.
## Model Performance
The core ideas in `run.py`:
Build an image classification system based on similarity search across embeddings.
The core ideas in `performance.py`:
1. create a new Milvus collection each time
2. extract embeddings using a pretrained model with model name specified by `--model`
3. specify inference method with `--format` in value of `pytorch` or `onnx`
@ -14,9 +17,44 @@ The core ideas in `run.py`:
2. compare final prediction with ground truth
3. calculate percent of correct predictions over all queries
## Example Usage
### Example Usage
```bash
python evaluate.py --model MODEL_NAME --format pytorch
python evaluate.py --model MODEL_NAME --format onnx
# Option 1:
python performance.py --model MODEL_NAME --format pytorch
python performance.py --model MODEL_NAME --format onnx
# Option 2:
chmod +x performance.sh
./performance.sh
```
## Pipeline Speed
QPS test of the embedding pipeline including steps below:
1. load image from path (pipe.input)
2. decode image into arrays (ops.image_decode)
3. generate image embedding (preprocess, model inference, post-process)
There are 3 methods with different pipeline speeds:
- Towhee pipe (regular method)
- Onnxruntime (model inference using onnx at local)
- TritonServe with onnx enabled (request as client)
### Example Usage
Please note that `qps_test.py` uses:
- `localhost:8000`: to connect triton client
- `../towhee.jpeg`: as test image path
```bash
python qps_test.py --model 'resnet50' --pipe --onnx --triton --num 100 --device cuda:0
```
**Args:**
- `--model`: mandatory, string, model name
- `--pipe`: optional, on/off flag to enable qps test for pipe
- `--onnx`: optional, on/off flag to enable qps test for onnx
- `--triton`: optional, on/off flag to enable qps test for triton (please make sure that the triton server is ready)
- `--atol`: optional, float, defaults to 1e-3, absolute tolerance used when checking accuracy against the pipe output
- `--num`: optional, integer, defaults to 100, batch size in each loop (10 loops in total)
- `--device`: optional, string, defaults to 'cpu'

0
benchmark/run.py → benchmark/performance.py

4
benchmark/run.sh → benchmark/performance.sh

@ -3,6 +3,6 @@
for name in beit_base_patch16_224 beit_base_patch16_224_in22k beit_base_patch16_384 beit_large_patch16_224 beit_large_patch16_224_in22k beit_large_patch16_384 beit_large_patch16_512 beitv2_base_patch16_224 beitv2_base_patch16_224_in22k beitv2_large_patch16_224 beitv2_large_patch16_224_in22k cait_m36_384 cait_m48_448 cait_s24_224 cait_xs24_384 convnext_large_in22ft1k convnext_small_384_in22ft1k convnext_tiny_in22k convnext_xlarge_in22ft1k convnext_xlarge_in22k deit3_medium_patch16_224 deit3_small_patch16_384 deit_base_distilled_patch16_384 mixer_b16_224 mixer_b16_224_in21k mixer_b16_224_miil mixer_b16_224_miil_in21k mixer_l16_224 mixer_l16_224_in21k mobilevitv2_175_384_in22ft1k mobilevitv2_200_384_in22ft1k repvgg_b2g4 res2net50_26w_8s resmlp_big_24_distilled_224 seresnextaa101d_32x8d vit_base_patch16_224_in21k vit_base_patch16_384 vit_base_patch8_224 vit_base_patch8_224_in21k vit_giant_patch14_224_clip_laion2b vit_large_patch16_224 vit_large_patch16_224_in21k vit_large_patch16_384 vit_large_patch32_384 vit_large_r50_s32_224 vit_large_r50_s32_384 vit_relpos_base_patch16_clsgap_224 vit_relpos_medium_patch16_224 vit_relpos_small_patch16_224 vit_small_patch32_224 vit_small_patch32_224_in21k vit_small_r26_s32_384 xcit_large_24_p8_224 xcit_large_24_p8_224_dist xcit_large_24_p8_384_dist xcit_nano_12_p16_384_dist xcit_nano_12_p8_224 xcit_nano_12_p8_224_dist xcit_nano_12_p8_384_dist xcit_small_24_p8_224 xcit_tiny_12_p8_224 xcit_tiny_12_p8_384_dist xcit_tiny_24_p8_224 xcit_tiny_24_p8_384_dist
do
echo ***${name}***
python run.py --model ${name} --format pytorch
python run.py --model ${name} --format onnx
python performance.py --model ${name} --format pytorch
python performance.py --model ${name} --format onnx
done

131
benchmark/qps_test.py

@ -0,0 +1,131 @@
"""QPS benchmark for a timm image-embedding pipeline.

Measures throughput (queries per second) of up to three inference paths
for the same model, selected by on/off flags:

  1. ``--pipe``   : a regular Towhee pipe (decode -> embed),
  2. ``--onnx``   : local ONNX Runtime inference on an exported model,
  3. ``--triton`` : a Triton inference server reached as a client
                    (expects the server at ``localhost:8000``).

Each path other than the pipe is also checked for numeric agreement with
the pipe's embedding within ``--atol``.  Every enabled path is timed over
10 batches of ``--num`` images each and the mean QPS is printed.
"""
import towhee  # noqa: F401 -- kept: importing the package initialises towhee
from towhee.dc2 import pipe, ops
from towhee import triton_client
import onnxruntime
import numpy
import torch
from statistics import mean
import time
import argparse
import os
import re  # noqa: F401 -- currently unused; kept to avoid changing the module's import surface
import warnings
import logging  # noqa: F401 -- currently unused; transformers logging is configured below
from transformers import logging as t_logging

# Silence TensorFlow / Python-warning / transformers noise so that only the
# benchmark results appear on stdout.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings("ignore")
t_logging.set_verbosity_error()

parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True, type=str)
parser.add_argument('--pipe', action='store_true')
parser.add_argument('--triton', action='store_true')
parser.add_argument('--onnx', action='store_true')
parser.add_argument('--atol', type=float, default=1e-3)
parser.add_argument('--num', type=int, default=100)
parser.add_argument('--device', type=str, default='cpu')
args = parser.parse_args()

model_name = args.model

# Number of timed batches averaged into each QPS figure.
LOOPS = 10


def _qps(run_batch):
    """Time *run_batch* (one batch of args.num queries) LOOPS times.

    Returns the mean queries-per-second across the loops.
    """
    rates = []
    for _ in range(LOOPS):
        start = time.time()
        run_batch()
        end = time.time()
        rates.append(args.num / (end - start))
    return mean(rates)


def _report_diff(ref, out):
    """Print whether *out* matches *ref* within --atol; otherwise print stats."""
    if numpy.allclose(ref, out, atol=args.atol):
        print('Check accuracy: OK')
    else:
        diff = numpy.abs(ref - out)
        print(f'Check accuracy: atol is larger than {args.atol}.')
        print(f'Maximum absolute difference is {diff.max()}.')
        print(f'Minimum absolute difference is {diff.min()}.')
        print(f'Mean difference is {diff.mean()}.')


# Reference pipeline: load path -> decode to RGB array -> timm embedding.
p = (
    pipe.input('url')
        .map('url', 'img', ops.image_decode.cv2_rgb())
        .map('img', 'vec', ops.image_embedding.timm(model_name=model_name, device=args.device))
        .output('vec')
)

data = '../towhee.jpeg'
# Reference embedding; the triton and onnx outputs are compared against it.
out1 = p(data).get()[0]
print('Pipe: OK')

if args.num and args.pipe:
    print('Pipe qps:', _qps(lambda: p.batch([data] * args.num)))

if args.triton:
    client = triton_client.Client(url='localhost:8000')
    out2 = client(data)[0][0][0]
    print('Triton: OK')
    _report_diff(out1, out2)
    if args.num:
        print('Triton qps:', _qps(lambda: client.batch([data] * args.num)))

if args.onnx:
    op = ops.image_embedding.timm(model_name=model_name, device='cpu').get_op()
    decoder = ops.image_decode.cv2_rgb().get_op()
    op.save_model('onnx', 'test.onnx')
    # Fix: the execution provider used to be hard-coded to CUDA regardless of
    # --device; honour the flag so CPU-only hosts can run the onnx path.
    providers = (['CUDAExecutionProvider'] if args.device != 'cpu'
                 else ['CPUExecutionProvider'])
    sess = onnxruntime.InferenceSession('test.onnx', providers=providers)
    # post_proc runs on the device selected via --device.
    op.device = 'cuda' if args.device != 'cpu' else 'cpu'

    def _onnx_infer():
        """One full local inference: decode -> preprocess -> onnx run -> post-process."""
        img = decoder(data)
        tensor = op.tfms(op.convert_img(img)).unsqueeze(0)
        raw = sess.run(None, input_feed={'input_0': tensor.cpu().detach().numpy()})[0]
        return op.post_proc(torch.from_numpy(raw))

    out3 = _onnx_infer().cpu().detach().numpy()
    print('Onnx: OK')
    _report_diff(out1, out3)
    if args.num:
        def _onnx_batch():
            # No batch API locally: run args.num single inferences per loop.
            for _ in range(args.num):
                _onnx_infer()
        print('Onnx qps:', _qps(_onnx_batch))
Loading…
Cancel
Save