switching to high quality piper tts and added label translations
This commit is contained in:
@@ -0,0 +1,434 @@
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import numpy
|
||||
import psutil
|
||||
from onnx import TensorProto
|
||||
|
||||
"""
|
||||
This profiler tool could run a transformer model and print out the kernel time spent on each Node of the model.
|
||||
Example of profiling of longformer model:
|
||||
python profiler.py --model longformer-base-4096_fp32.onnx --batch_size 1 --sequence_length 4096 --global_length 8 --samples 1000 --thread_num 8 --dummy_inputs longformer --use_gpu
|
||||
Example of importing profile result file from onnxruntime_perf_test:
|
||||
python profiler.py --input profile_2021-10-25_12-02-41.json
|
||||
"""
|
||||
|
||||
|
||||
def parse_arguments(argv=None):
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
"-i",
|
||||
"--input",
|
||||
required=False,
|
||||
type=str,
|
||||
help="Set the input file for reading the profile results",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--model",
|
||||
required=False,
|
||||
type=str,
|
||||
help="onnx model path to run profiling. Required when --input is not specified.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-b",
|
||||
"--batch_size",
|
||||
required=False,
|
||||
type=int,
|
||||
default=1,
|
||||
help="batch size of input",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--sequence_length",
|
||||
required=False,
|
||||
type=int,
|
||||
default=32,
|
||||
help="sequence length of input",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--past_sequence_length",
|
||||
required=False,
|
||||
type=int,
|
||||
default=1,
|
||||
help="past sequence length for gpt2",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--global_length",
|
||||
required=False,
|
||||
type=int,
|
||||
default=1,
|
||||
help="number of global tokens for longformer",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--samples",
|
||||
required=False,
|
||||
type=int,
|
||||
default=1000,
|
||||
help="number of samples to test. Set it large enough to reduce the variance of performance result.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--threshold",
|
||||
required=False,
|
||||
type=float,
|
||||
default=0.01,
|
||||
help="Threshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--thread_num",
|
||||
required=False,
|
||||
type=int,
|
||||
default=-1,
|
||||
help="number of threads to use",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--input_ids_name",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for input IDs, for bert",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--segment_ids_name",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for segment IDs, for bert",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input_mask_name",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="input name for attention mask, for bert",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--dummy_inputs",
|
||||
required=False,
|
||||
default="default",
|
||||
choices=["bert", "gpt2", "longformer", "default"],
|
||||
help="Type of model inputs. The default will create dummy inputs with ones.",
|
||||
)
|
||||
|
||||
parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="use GPU")
|
||||
parser.set_defaults(use_gpu=False)
|
||||
|
||||
parser.add_argument(
|
||||
"--provider",
|
||||
required=False,
|
||||
type=str,
|
||||
default="cuda",
|
||||
help="Execution provider to use",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--basic_optimization",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime",
|
||||
)
|
||||
parser.set_defaults(basic_optimization=False)
|
||||
|
||||
parser.add_argument(
|
||||
"--kernel_time_only",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Only include the kernel time and no fence time",
|
||||
)
|
||||
parser.set_defaults(kernel_time_only=False)
|
||||
|
||||
parser.add_argument("-v", "--verbose", required=False, action="store_true")
|
||||
parser.set_defaults(verbose=False)
|
||||
|
||||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_num, all_inputs):
|
||||
from benchmark_helper import create_onnxruntime_session # noqa: PLC0415
|
||||
|
||||
session = create_onnxruntime_session(
|
||||
onnx_model_path,
|
||||
use_gpu,
|
||||
provider,
|
||||
enable_all_optimization=not basic_optimization,
|
||||
num_threads=thread_num,
|
||||
enable_profiling=True,
|
||||
)
|
||||
|
||||
for inputs in all_inputs:
|
||||
_ = session.run(None, inputs)
|
||||
|
||||
profile_file = session.end_profiling()
|
||||
return profile_file
|
||||
|
||||
|
||||
def get_dim_from_type_proto(dim):
|
||||
return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) == str else None # noqa: E721
|
||||
|
||||
|
||||
def get_shape_from_type_proto(type_proto):
|
||||
return [get_dim_from_type_proto(d) for d in type_proto.tensor_type.shape.dim]
|
||||
|
||||
|
||||
def create_dummy_inputs(onnx_model, batch_size, sequence_length, samples):
|
||||
"""Create dummy inputs for ONNX model.
|
||||
|
||||
Args:
|
||||
onnx_model (OnnxModel): ONNX model
|
||||
batch_size (int): batch size
|
||||
sequence_length (int): sequence length
|
||||
samples (int): number of samples
|
||||
|
||||
Returns:
|
||||
List[Dict]: list of inputs
|
||||
"""
|
||||
dummy_inputs = {}
|
||||
for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
|
||||
shape = get_shape_from_type_proto(graph_input.type)
|
||||
symbol_dims = []
|
||||
for i, dim in enumerate(shape):
|
||||
if isinstance(dim, str):
|
||||
symbol_dims.append(i)
|
||||
|
||||
# allowed symbolic dimensions: batch_size and sequence_length
|
||||
if len(symbol_dims) > 2:
|
||||
return None
|
||||
if len(symbol_dims) > 0:
|
||||
shape[symbol_dims[0]] = batch_size
|
||||
if len(symbol_dims) > 1:
|
||||
shape[symbol_dims[1]] = sequence_length
|
||||
|
||||
elem_type = graph_input.type.tensor_type.elem_type
|
||||
assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
|
||||
data_type = (
|
||||
numpy.float32
|
||||
if elem_type == TensorProto.FLOAT
|
||||
else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
|
||||
)
|
||||
data = numpy.ones(shape, dtype=data_type)
|
||||
dummy_inputs[graph_input.name] = data
|
||||
|
||||
all_inputs = [dummy_inputs for _ in range(samples)]
|
||||
return all_inputs
|
||||
|
||||
|
||||
def create_bert_inputs(
|
||||
onnx_model,
|
||||
batch_size,
|
||||
sequence_length,
|
||||
samples,
|
||||
input_ids_name=None,
|
||||
segment_ids_name=None,
|
||||
input_mask_name=None,
|
||||
):
|
||||
"""Create dummy inputs for BERT model.
|
||||
|
||||
Args:
|
||||
onnx_model (OnnxModel): ONNX model
|
||||
batch_size (int): batch size
|
||||
sequence_length (int): sequence length
|
||||
samples (int): number of samples
|
||||
input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
|
||||
segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
|
||||
input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[Dict]: list of inputs
|
||||
"""
|
||||
from bert_test_data import find_bert_inputs, generate_test_data # noqa: PLC0415
|
||||
|
||||
input_ids, segment_ids, input_mask = find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)
|
||||
all_inputs = generate_test_data(
|
||||
batch_size,
|
||||
sequence_length,
|
||||
test_cases=samples,
|
||||
seed=123,
|
||||
verbose=False,
|
||||
input_ids=input_ids,
|
||||
segment_ids=segment_ids,
|
||||
input_mask=input_mask,
|
||||
random_mask_length=False,
|
||||
)
|
||||
|
||||
return all_inputs
|
||||
|
||||
|
||||
def create_gpt2_inputs(onnx_model, batch_size, sequence_length, past_sequence_length, samples):
|
||||
"""Create dummy inputs for GPT-2 model.
|
||||
|
||||
Args:
|
||||
onnx_model (OnnxModel): ONNX model
|
||||
batch_size (int): batch size
|
||||
sequence_length (int): sequence length
|
||||
past_sequence_length (int): past sequence length
|
||||
samples (int): number of samples
|
||||
|
||||
Raises:
|
||||
RuntimeError: symbolic is not supported. Use the tool convert_to_onnx.py to export ONNX model instead.
|
||||
|
||||
Returns:
|
||||
List[Dict]: list of inputs
|
||||
"""
|
||||
# The symbolic names shall be same as those used in Gpt2Helper.export_onnx(...) function.
|
||||
symbols = {
|
||||
"batch_size": batch_size,
|
||||
"seq_len": sequence_length,
|
||||
"past_seq_len": past_sequence_length,
|
||||
"total_seq_len": sequence_length + past_sequence_length,
|
||||
}
|
||||
|
||||
dummy_inputs = {}
|
||||
for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
|
||||
shape = get_shape_from_type_proto(graph_input.type)
|
||||
for i, dim in enumerate(shape):
|
||||
if isinstance(dim, str):
|
||||
if dim not in symbols:
|
||||
raise RuntimeError(f"symbol is not supported: {dim}")
|
||||
else:
|
||||
shape[i] = symbols[dim]
|
||||
|
||||
elem_type = graph_input.type.tensor_type.elem_type
|
||||
assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
|
||||
data_type = (
|
||||
numpy.float32
|
||||
if elem_type == TensorProto.FLOAT
|
||||
else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
|
||||
)
|
||||
data = numpy.ones(shape, dtype=data_type)
|
||||
dummy_inputs[graph_input.name] = data
|
||||
|
||||
all_inputs = [dummy_inputs for _ in range(samples)]
|
||||
return all_inputs
|
||||
|
||||
|
||||
def create_longformer_inputs(onnx_model, batch_size, sequence_length, global_length, samples):
|
||||
"""Create dummy inputs for Longformer model.
|
||||
|
||||
Args:
|
||||
onnx_model (OnnxModel): ONNX model
|
||||
batch_size (int): batch size
|
||||
sequence_length (int): sequence length
|
||||
global_length (int): number of global tokens
|
||||
samples (int): number of samples
|
||||
|
||||
Raises:
|
||||
RuntimeError: symbolic is not supported. Use the tool convert_longformer_to_onnx.py to export ONNX model instead.
|
||||
|
||||
Returns:
|
||||
List[Dict]: list of inputs
|
||||
"""
|
||||
symbols = {"batch_size": batch_size, "sequence_length": sequence_length}
|
||||
|
||||
dummy_inputs = {}
|
||||
for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
|
||||
shape = get_shape_from_type_proto(graph_input.type)
|
||||
for i, dim in enumerate(shape):
|
||||
if isinstance(dim, str):
|
||||
if dim not in symbols:
|
||||
raise RuntimeError(f"symbol is not supported: {dim}")
|
||||
else:
|
||||
shape[i] = symbols[dim]
|
||||
|
||||
elem_type = graph_input.type.tensor_type.elem_type
|
||||
assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
|
||||
data_type = (
|
||||
numpy.float32
|
||||
if elem_type == TensorProto.FLOAT
|
||||
else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
|
||||
)
|
||||
|
||||
if "global" in graph_input.name:
|
||||
data = numpy.zeros(shape, dtype=data_type)
|
||||
data[:, :global_length] = 1
|
||||
else:
|
||||
data = numpy.ones(shape, dtype=data_type)
|
||||
dummy_inputs[graph_input.name] = data
|
||||
|
||||
all_inputs = [dummy_inputs for _ in range(samples)]
|
||||
return all_inputs
|
||||
|
||||
|
||||
def run(args):
|
||||
num_threads = args.thread_num if args.thread_num > 0 else psutil.cpu_count(logical=False)
|
||||
|
||||
# Set OMP environment variable before importing onnxruntime. Needed for cpu only, and no impact for onnxruntime-gpu package.
|
||||
if "OMP_NUM_THREADS" not in os.environ:
|
||||
os.environ["OMP_NUM_THREADS"] = str(num_threads)
|
||||
|
||||
from onnx import load # noqa: PLC0415
|
||||
from onnx_model import OnnxModel # noqa: PLC0415
|
||||
|
||||
onnx_model = OnnxModel(load(args.model))
|
||||
|
||||
all_inputs = None
|
||||
if args.dummy_inputs == "bert":
|
||||
all_inputs = create_bert_inputs(
|
||||
onnx_model,
|
||||
args.batch_size,
|
||||
args.sequence_length,
|
||||
args.samples,
|
||||
args.input_ids_name,
|
||||
args.segment_ids_name,
|
||||
args.input_mask_name,
|
||||
)
|
||||
elif args.dummy_inputs == "gpt2":
|
||||
all_inputs = create_gpt2_inputs(
|
||||
onnx_model,
|
||||
args.batch_size,
|
||||
args.sequence_length,
|
||||
args.past_sequence_length,
|
||||
args.samples,
|
||||
)
|
||||
elif args.dummy_inputs == "longformer":
|
||||
all_inputs = create_longformer_inputs(
|
||||
onnx_model,
|
||||
args.batch_size,
|
||||
args.sequence_length,
|
||||
args.global_length,
|
||||
args.samples,
|
||||
)
|
||||
else: # default
|
||||
all_inputs = create_dummy_inputs(onnx_model, args.batch_size, args.sequence_length, args.samples)
|
||||
|
||||
profile_file = run_profile(
|
||||
args.model,
|
||||
args.use_gpu,
|
||||
args.provider,
|
||||
args.basic_optimization,
|
||||
args.thread_num,
|
||||
all_inputs,
|
||||
)
|
||||
|
||||
return profile_file
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
arguments = parse_arguments()
|
||||
print("Arguments", arguments)
|
||||
|
||||
from benchmark_helper import setup_logger
|
||||
|
||||
setup_logger(arguments.verbose)
|
||||
|
||||
if not arguments.input:
|
||||
assert arguments.model, "requires either --model to run profiling or --input to read profiling results"
|
||||
profile_file = run(arguments)
|
||||
else:
|
||||
profile_file = arguments.input
|
||||
from profile_result_processor import process_results
|
||||
|
||||
results = process_results(profile_file, arguments)
|
||||
|
||||
for line in results:
|
||||
print(line)
|
||||
Reference in New Issue
Block a user