switch to high-quality piper tts and add label translations
@@ -0,0 +1,2 @@
# from .base_operator import QuantOperatorBase
# from .matmul import MatMulInteger
@@ -0,0 +1,119 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QLinearActivation(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def QuantizeClipRelu(self):  # noqa: N802
        node = self.node
        assert node.op_type == "Relu" or node.op_type == "Clip"

        # When mode is QLinearOps, the output quantization params are calculated based on outputs from
        # activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op.
        # If the input to this node is not quantized, keep this node.
        # If the activation is symmetric, do not quantize the op and simply return.
        if node.input[0] not in self.quantizer.quantized_value_map or self.quantizer.is_activation_symmetric:
            return super().quantize()

        quantized_value = self.quantizer.quantized_value_map[node.input[0]]
        self.quantizer.quantized_value_map[node.output[0]] = quantized_value

    def quantize(self):
        node = self.node
        if node.op_type == "Relu" or node.op_type == "Clip":
            self.QuantizeClipRelu()
            return

        nnapi_sigmoid_option = "extra.Sigmoid.nnapi"
        sigmoid_nnapi_mode = (
            node.op_type == "Sigmoid"
            and nnapi_sigmoid_option in self.quantizer.extra_options
            and self.quantizer.extra_options[nnapi_sigmoid_option]
        )
        use_scale = 1 / 256.0 if sigmoid_nnapi_mode else None
        use_zeropoint = 0 if sigmoid_nnapi_mode else None

        # No assert on op_type as it is controlled by the registry;
        # only try to quantize when quantization parameters are given for it.
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])
        if not data_found or quantized_input_names is None:
            return super().quantize()

        qlinear_activation_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_activation_name = ""
        if node.name:
            qlinear_activation_name = node.name + "_quant"
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qlinear_activation_inputs = [
            quantized_input_names[0],
            scale_names[0],
            zero_point_names[0],
            output_scale_name,
            output_zp_name,
        ]

        qlinear_activation_node = onnx.helper.make_node(
            "QLinear" + node.op_type,
            qlinear_activation_inputs,
            [qlinear_activation_output],
            qlinear_activation_name,
            **kwargs,
        )

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_activation_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        nodes.append(qlinear_activation_node)
        self.quantizer.new_nodes += nodes


class QDQRemovableActivation(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        # If the input to this node is not quantized, keep this node.
        if not self.quantizer.is_tensor_quantized(node.input[0]):
            return

        if (
            not self.quantizer.is_activation_symmetric
            and not self.quantizer.qdq_keep_removable_activations
            and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
        ):
            self.quantizer.remove_node(self.node)
        else:
            self.quantizer.quantize_activation_tensor(node.input[0])

        if not self.disable_qdq_for_node_output:
            self.quantizer.quantize_activation_tensor(node.output[0])
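# A minimal numpy sketch (not part of this file) of why scale=1/256 and
# zero_point=0 fit an NNAPI uint8 Sigmoid output: sigmoid lands in (0, 1),
# and with scale 1/256 the uint8 range [0, 255] covers [0.0, 255/256].
import numpy as np

y = 1.0 / (1.0 + np.exp(-np.linspace(-6.0, 6.0, 5)))  # sample sigmoid outputs
q = np.clip(np.round(y / (1 / 256.0)) + 0, 0, 255)    # quantize: round(y / scale) + zp
assert ((q >= 0) & (q <= 255)).all()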
@@ -0,0 +1,18 @@
from .base_operator import QuantOperatorBase


# Use the quantized tensor as input without DQ.
class QArgMax(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
        if quantized_input_value is None:
            self.quantizer.new_nodes += [node]
            return

        node.input[0] = quantized_input_value.q_name
        self.quantizer.new_nodes += [node]
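# Why no DequantizeLinear is needed here (illustrative check, not part of this
# file): dequantization x = (q - zp) * scale is strictly increasing for
# scale > 0, so ArgMax over the quantized tensor picks the same index.
import numpy as np

q = np.array([3, 200, 117], dtype=np.uint8)
scale, zp = 0.07, 10
assert np.argmax(q) == np.argmax((q.astype(np.float32) - zp) * scale)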
@@ -0,0 +1,73 @@
import onnx
from onnx import onnx_pb as onnx_proto  # noqa: F401

from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase

"""
Quantize Attention
"""


class AttentionQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        return self.quantizer.should_quantize_node(self.node)

    def quantize(self):
        """
        parameter node: Attention node.
        parameter new_nodes_list: List of new nodes created before processing this node.
        return: a list of nodes in topological order that represents the quantized Attention node.
        """
        node = self.node
        assert node.op_type == "Attention"

        # TODO: This is a temporary fix to stop exporting QAttention with the qkv_hidden_sizes
        # attribute. It needs to be removed once QAttention for varied q, k, v sizes
        # is implemented.
        for attr in node.attribute:
            if attr.name == "qkv_hidden_sizes":
                return super().quantize()

        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)
        nodes.extend(nodes_weight)

        if quantized_input_names is None:
            return super().quantize()

        qattention_name = "" if not node.name else node.name + "_quant"

        inputs = []
        inputs.extend(quantized_input_names)
        inputs.extend([node.input[2]])
        inputs.extend(scale_names)
        inputs.extend([node.input[3] if len(node.input) > 3 else ""])
        inputs.extend(zero_point_names)
        inputs.extend([node.input[4] if len(node.input) > 4 else ""])

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, qattention_name, **kwargs)
        nodes.append(qattention_node)

        self.quantizer.new_nodes += nodes
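# Illustrative summary of the QAttention input order assembled above, for a
# node with all optional inputs present (placeholder names, shown only to make
# the interleaving of names, scales, and zero points explicit):
qattention_input_order = [
    "input_quantized",    # quantized_input_names[0]
    "weight_quantized",   # quantized_input_names[1]
    "bias",               # node.input[2], kept in float
    "input_scale",        # scale_names[0]
    "weight_scale",       # scale_names[1]
    "mask_index",         # node.input[3], "" if absent
    "input_zero_point",   # zero_point_names[0]
    "weight_zero_point",  # zero_point_names[1]
    "past",               # node.input[4], "" if absent
]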
@@ -0,0 +1,26 @@
class QuantOperatorBase:
    def __init__(self, onnx_quantizer, onnx_node):
        self.quantizer = onnx_quantizer
        self.node = onnx_node

    def should_quantize(self):
        if not self.quantizer.should_quantize_node(self.node):
            return False

        return self.quantizer.is_float_tensor(self.node.input[0])

    def quantize(self):
        """
        Given a node which does not support quantization, this method checks whether the input to
        this node is quantized and adds a DequantizeLinear node to dequantize this input back to FP32
        parameter node: Current node
        parameter new_nodes_list: List of new nodes created before processing current node
        return: List of new nodes created
        """
        for _, node_input in enumerate(self.node.input):
            dequantize_node = self.quantizer._dequantize_value(node_input)
            if dequantize_node is not None:
                self.quantizer.new_nodes.append(dequantize_node)

        # Append the original node
        self.quantizer.new_nodes.append(self.node)
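# Roughly the kind of node the fallback above re-inserts (hypothetical tensor
# names for illustration; the actual node comes from the quantizer's
# _dequantize_value helper):
import onnx

example_dq = onnx.helper.make_node(
    "DequantizeLinear",
    ["x_quantized", "x_scale", "x_zero_point"],  # quantized input + its params
    ["x_dequantized"],                           # float tensor handed to the op
    "x_DequantizeLinear",
)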
@@ -0,0 +1,72 @@
import onnx
from onnx import onnx_pb as onnx_proto  # noqa: F401

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QLinearBinaryOp(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0, 1])
        if not data_found or quantized_input_names is None:
            return super().quantize()

        qlinear_binary_math_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_binary_math_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qlinear_binary_math_inputs = []
        # Input 0
        qlinear_binary_math_inputs.append(quantized_input_names[0])
        qlinear_binary_math_inputs.append(scale_names[0])
        qlinear_binary_math_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_binary_math_inputs.append(quantized_input_names[1])
        qlinear_binary_math_inputs.append(scale_names[1])
        qlinear_binary_math_inputs.append(zero_point_names[1])

        # Output
        qlinear_binary_math_inputs.append(output_scale_name)
        qlinear_binary_math_inputs.append(output_zp_name)

        qlinear_binary_math_node = onnx.helper.make_node(
            "QLinear" + node.op_type,
            qlinear_binary_math_inputs,
            [qlinear_binary_math_output],
            qlinear_binary_math_name,
            **kwargs,
        )
        nodes.append(qlinear_binary_math_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_binary_math_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes
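# A minimal numpy sketch (simplified; saturation/rounding details vary) of the
# arithmetic a QLinearAdd built from the nine inputs above performs:
import numpy as np

def qlinear_add(qa, sa, za, qb, sb, zb, so, zo):
    # Dequantize both inputs, add in float, requantize with the output params.
    y = (qa.astype(np.float32) - za) * sa + (qb.astype(np.float32) - zb) * sb
    return np.clip(np.round(y / so) + zo, 0, 255).astype(np.uint8)

qa = np.array([0, 128, 255], dtype=np.uint8)
qb = np.array([255, 128, 0], dtype=np.uint8)
print(qlinear_add(qa, 0.02, 128, qb, 0.02, 128, 0.04, 128))  # -> [128 128 128]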
@@ -0,0 +1,62 @@
import onnx

from ..quant_utils import (  # noqa: F401
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    ms_domain,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase  # noqa: F401


class QLinearConcat(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        (
            q_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [*range(len(node.input))])
        if not data_found or q_input_names is None:
            return super().quantize()

        # Create an entry for output quantized value
        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
        quantized_output_value = QuantizedValue(
            node.output[0],
            node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
            output_scale_name,
            output_zp_name,
            quantized_input_value.value_type,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        qnode_name = node.name + "_quant" if node.name else ""

        qlconcat_inputs = [output_scale_name, output_zp_name]
        for i in range(len(q_input_names)):
            qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]])
        qlconcat_node = onnx.helper.make_node(
            "QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs
        )

        self.quantizer.new_nodes += nodes
        self.quantizer.new_nodes += [qlconcat_node]
@@ -0,0 +1,260 @@
import numpy as np
import onnx
from onnx import onnx_pb as onnx_proto

from ..quant_utils import (
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    find_by_name,
    get_mul_node,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class ConvInteger(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def add_bias(self, nodes, scaled_output):
        """
        Given a node, this function handles the bias add by adding a "reshape" node on the bias and an "add" node.
        parameter nodes: new nodes would be appended into nodes
        parameter node: current node (Conv)
        parameter scaled_output: output of quant conv without bias
        parameter output: output of Conv
        parameter bias_name: bias of Conv
        return: the name of output
        """
        node = self.node
        model = self.quantizer.model
        # Add tensors for the shape to be reshaped to
        weight = find_by_name(node.input[1], model.initializer())
        if weight is None:
            raise ValueError(f"Expected {node.input[1]} to be an initializer")

        # Add reshape for correct broadcast
        output = node.output[0]
        reshape_input_data = node.input[2]  # bias of Conv
        reshape_input_shape = output + "_bias_reshape_shape"
        reshape_output = output + "_bias_reshape_output"

        shape = np.ones((len(weight.dims)), dtype=np.int64)
        shape[1] = -1
        init_shape = onnx.helper.make_tensor(
            reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)], shape
        )
        model.add_initializer(init_shape)

        reshape_node = onnx.helper.make_node("Reshape", [reshape_input_data, reshape_input_shape], [reshape_output])
        nodes.append(reshape_node)

        # Add an Add operation for bias
        add_node = onnx.helper.make_node("Add", [scaled_output, reshape_output], [output], output + "_bias_add")
        nodes.append(add_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Conv"
        # Get quantized names from both the activation (input[0]) and the weight (input[1]).
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)
        nodes.extend(nodes_weight)

        conv_integer_output = node.output[0] + "_output_quantized"
        conv_integer_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        conv_integer_node = onnx.helper.make_node(
            "ConvInteger", quantized_input_names + zero_point_names, [conv_integer_output], conv_integer_name, **kwargs
        )
        nodes.append(conv_integer_node)

        # Add a cast operation to cast the ConvInteger output to float.
        onnx_type = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
        cast_op_output = conv_integer_output + "_cast_output"
        cast_node = onnx.helper.make_node(
            "Cast",
            [conv_integer_output],
            [cast_op_output],
            conv_integer_output + "_cast",
            to=onnx_type,  # TODO: FLOAT or FLOAT16
        )
        nodes.append(cast_node)

        # Add a mul operation to multiply the scales of the two inputs.
        assert len(scale_names) == 2
        if conv_integer_name:
            scales_mul_op = conv_integer_name + "_scales_mul"
        else:
            scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"

        scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
        if scales_mul_node is None:
            scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
            nodes.append(scales_mul_node)

        scales_mul_op_output = scales_mul_node.output[0]

        has_bias = len(node.input) == 3
        scaled_output_name = node.output[0] if not has_bias else node.output[0] + "quant_scaled_output"

        # Add a mul operation to multiply the mul_scales_op result with the output of ConvInteger
        # and make the output of this node the same as the output of the original conv node.
        output_scale_mul_op = conv_integer_name + "_output_scale_mul" if conv_integer_name else ""
        nodes.append(
            get_mul_node(
                [cast_op_output, scales_mul_op_output],
                scaled_output_name,
                output_scale_mul_op,
            )
        )

        if has_bias:
            self.add_bias(nodes, scaled_output_name)

        self.quantizer.new_nodes += nodes


class QLinearConv(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Conv"

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])

        if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])
            quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
                node.input[1],
                onnx_proto.TensorProto.INT8,
                0,  # self.quantizer.weight_qType?
            )
            quantized_input_names.append(quant_weight_tuple[0])
            zero_point_names.append(quant_weight_tuple[1])
            scale_names.append(quant_weight_tuple[2])
        else:
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])

            (
                quantized_input_names_weight,
                zero_point_names_weight,
                scale_names_weight,
                nodes_weight,
            ) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
            quantized_input_names.extend(quantized_input_names_weight)
            zero_point_names.extend(zero_point_names_weight)
            scale_names.extend(scale_names_weight)
            nodes.extend(nodes_weight)

        if not data_found or quantized_input_names is None:
            return super().quantize()

        quantized_bias_name = ""
        bias_present = False
        if len(node.input) == 3:
            if self.quantizer.weight_qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
                raise RuntimeError("Quantization to FLOAT8E4M3FN for operator Conv is not supported.")
            quantized_bias_name = self.quantizer.quantize_bias_static(node.input[2], node.input[0], node.input[1])
            bias_present = True

        qlinear_conv_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_conv_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        qlinear_conv_inputs = []
        # Input 0
        qlinear_conv_inputs.append(quantized_input_names[0])
        qlinear_conv_inputs.append(scale_names[0])
        qlinear_conv_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_conv_inputs.append(quantized_input_names[1])
        qlinear_conv_inputs.append(scale_names[1])
        qlinear_conv_inputs.append(zero_point_names[1])

        # Output
        qlinear_conv_inputs.append(output_scale_name)
        qlinear_conv_inputs.append(output_zp_name)

        if bias_present:
            qlinear_conv_inputs.append(quantized_bias_name)

        qlinear_conv_node = onnx.helper.make_node(
            "QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], qlinear_conv_name, **kwargs
        )
        nodes.append(qlinear_conv_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_conv_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes


class QDQConv(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Conv" or node.op_type == "ConvTranspose"

        self.quantizer.quantize_activation_tensor(node.input[0])
        if not self.disable_qdq_for_node_output:
            self.quantizer.quantize_activation_tensor(node.output[0])

        is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
            node.input[1], default_axis=0 if node.op_type == "Conv" else 1
        )
        if is_weight_per_channel:
            self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
        else:
            self.quantizer.quantize_weight_tensor(node.input[1])

        if len(node.input) == 3:
            self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])
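# A small numpy check (not part of this file) of the bias reshape in add_bias
# above: with a 4-D Conv weight, shape becomes [1, -1, 1, 1], so the 1-D bias
# broadcasts per output channel in the final Add.
import numpy as np

conv_out = np.zeros((2, 3, 8, 8), dtype=np.float32)  # N, C, H, W
bias = np.arange(3, dtype=np.float32)                # one value per channel C
shape = np.ones(4, dtype=np.int64)
shape[1] = -1                                        # -> [1, -1, 1, 1]
assert (conv_out + bias.reshape(shape)).shape == conv_out.shape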
@@ -0,0 +1,78 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


# For operators that support 8-bit operations directly and whose output can
# reuse input[0]'s type, zero point, and scale; for example, Transpose, Reshape, etc.
class Direct8BitOp(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        if not self.quantizer.force_quantize_no_input_check:
            # Keep backward compatibility:
            # quantize when input[0] is already quantized; otherwise keep the node as-is.
            quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
            if quantized_input_value is None:
                self.quantizer.new_nodes += [node]
                return

            quantized_output_value = QuantizedValue(
                node.output[0],
                node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
                quantized_input_value.scale_name,
                quantized_input_value.zp_name,
                quantized_input_value.value_type,
            )
            self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

            node.input[0] = quantized_input_value.q_name
            node.output[0] = quantized_output_value.q_name
            self.quantizer.new_nodes += [node]

        else:
            # Force quantize these ops if possible; use the exclude-node list if this is not what you want.
            if not self.quantizer.is_valid_quantize_weight(node.input[0]):
                super().quantize()
                return

            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])
            if quantized_input_names is None:
                return super().quantize()

            # Create an entry for the output quantized value.
            quantized_output_value = QuantizedValue(
                node.output[0],
                node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
                scale_names[0],
                zero_point_names[0],
                QuantizedValueType.Input,
            )
            self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

            node.input[0] = quantized_input_names[0]
            node.output[0] = quantized_output_value.q_name
            nodes.append(node)

            self.quantizer.new_nodes += nodes


class QDQDirect8BitOp(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        if self.quantizer.force_quantize_no_input_check:
            self.quantizer.quantize_activation_tensor(self.node.input[0])
            if not self.disable_qdq_for_node_output:
                self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
        elif self.quantizer.is_tensor_quantized(self.node.input[0]) and not self.disable_qdq_for_node_output:
            self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
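# Why the output can reuse input[0]'s scale and zero point (illustrative
# check, not part of this file): data-movement ops like Transpose commute with
# elementwise dequantization, so the quantization params are unchanged.
import numpy as np

q = np.arange(6, dtype=np.uint8).reshape(2, 3)
scale, zp = 0.1, 128

def dequant(t):
    return (t.astype(np.float32) - zp) * scale

assert np.allclose(dequant(q.T), dequant(q).T)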
@@ -0,0 +1,121 @@
import logging

import onnx
from onnx import onnx_pb as onnx_proto  # noqa: F401

from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase

"""
Quantizes the EmbedLayerNorm fused ONNXRuntime Op.

This Quant operator keeps the input and segment IDs at int32 but will quantize all initializer and
weight inputs associated with the node to uint8.
"""


class EmbedLayerNormalizationQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        return self.quantizer.should_quantize_node(self.node)

    def quantize(self):
        node = self.node
        assert node.op_type == "EmbedLayerNormalization"

        if len(node.output) > 2:
            logging.info(f"Quantization is not applied to {node.name} since it has 3 outputs")
            return super().quantize()

        """
        Pre-quantization EmbedLayerNorm inputs:
        [0] input_ids (int32)
        [1] segment_ids (int32)
        [2] word_embedding (float32)
        [3] position_embedding (float32)
        [4] segment_embedding (float32)
        [5] gamma (float32)
        [6] beta (float32)
        [7] mask (int32) (optional)
        """
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [2, 3, 4, 5, 6])
        if quantized_input_names is None:
            return super().quantize()

        qembed_layer_norm_name = "" if not node.name else node.name + "_quant"

        """
        Quantized Input Tensor List
        [0] input_ids (int32)
        [1] segment_ids (int32)
        [2] word_embedding (uint8)
        [3] position_embedding (uint8)
        [4] segment_embedding (uint8)
        [5] gamma (uint8)
        [6] beta (uint8)
        [7] mask (int32) (optional)
        [8] word_embedding_scale (float)
        [9] position_embedding_scale (float)
        [10] segment_embedding_scale (float)
        [11] gamma_scale (float)
        [12] beta_scale (float)
        [13] word_embedding_zero_point (uint8)
        [14] position_embedding_zero_point (uint8)
        [15] segment_embedding_zero_point (uint8)
        [16] gamma_zero_point (uint8)
        [17] beta_zero_point (uint8)
        """
        inputs = []
        # 'input_ids'
        inputs.extend([node.input[0]])
        # 'segment_ids'
        inputs.extend([node.input[1]])
        # 'word_embedding_quant'
        inputs.extend([quantized_input_names[0]])
        # 'position_embedding_quant'
        inputs.extend([quantized_input_names[1]])
        # 'segment_embedding_quant'
        inputs.extend([quantized_input_names[2]])
        # 'gamma_quant'
        inputs.extend([quantized_input_names[3]])
        # 'beta_quant'
        inputs.extend([quantized_input_names[4]])
        # 'mask' (optional)
        inputs.extend([node.input[7] if len(node.input) > 7 else ""])

        # Add all scales:
        inputs.extend([scale_names[0]])
        inputs.extend([scale_names[1]])
        inputs.extend([scale_names[2]])
        inputs.extend([scale_names[3]])
        inputs.extend([scale_names[4]])

        # Add all zero points:
        inputs.extend([zero_point_names[0]])
        inputs.extend([zero_point_names[1]])
        inputs.extend([zero_point_names[2]])
        inputs.extend([zero_point_names[3]])
        inputs.extend([zero_point_names[4]])

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qembed_layer_norm_node = onnx.helper.make_node(
            "QEmbedLayerNormalization",
            inputs,
            node.output,
            qembed_layer_norm_name,
            **kwargs,
        )
        nodes.append(qembed_layer_norm_node)

        self.quantizer.new_nodes += nodes
@@ -0,0 +1,64 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase

"""
Quantize Gather
"""


class GatherQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        if not self.quantizer.should_quantize_node(self.node):
            return False

        return self.quantizer.is_valid_quantize_weight(self.node.input[0])

    def quantize(self):
        node = self.node
        assert node.op_type == "Gather"

        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])
        if quantized_input_names is None:
            return super().quantize()

        gather_new_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            gather_new_output,
            scale_names[0],
            zero_point_names[0],
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        node.output[0] = gather_new_output
        node.input[0] = quantized_input_names[0]
        nodes.append(node)

        self.quantizer.new_nodes += nodes


class QDQGather(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Gather" or node.op_type == "GatherElements"

        if self.quantizer.is_valid_quantize_weight(node.input[0]) or self.quantizer.force_quantize_no_input_check:
            self.quantizer.quantize_activation_tensor(node.input[0])
            self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
        elif self.quantizer.is_tensor_quantized(node.input[0]):
            self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
@@ -0,0 +1,62 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QGlobalAveragePool(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "GlobalAveragePool"

        # If the input to this node is not quantized, keep this node.
        if node.input[0] not in self.quantizer.quantized_value_map:
            return super().quantize()

        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]

        # Create an entry for the output quantized value.
        (
            data_found,
            output_scale_name_from_parameter,
            output_zp_name_from_parameter,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        # Just use the input scale and zero point if parameters for the output are not specified.
        output_scale_name = output_scale_name_from_parameter if data_found else quantized_input_value.scale_name
        output_zp_name = output_zp_name_from_parameter if data_found else quantized_input_value.zp_name
        quantized_output_value = QuantizedValue(
            node.output[0],
            node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        kwargs["channels_last"] = 0
        qnode_name = node.name + "_quant" if node.name else ""

        qnode = onnx.helper.make_node(
            "QLinear" + node.op_type,
            [
                quantized_input_value.q_name,
                quantized_input_value.scale_name,
                quantized_input_value.zp_name,
                output_scale_name,
                output_zp_name,
            ],
            [quantized_output_value.q_name],
            qnode_name,
            **kwargs,
        )
        self.quantizer.new_nodes += [qnode]
@@ -0,0 +1,172 @@
import logging

import numpy as np  # noqa: F401
import onnx

from ..quant_utils import (
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    find_by_name,  # noqa: F401
    get_mul_node,  # noqa: F401
    ms_domain,
)
from .base_operator import QuantOperatorBase  # noqa: F401
from .matmul import QOpMatMul
from .qdq_base_operator import QDQOperatorBase


def is_B_transposed(gemm_node):  # noqa: N802
    transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"]  # noqa: N806
    if transB_attribute:
        return onnx.helper.get_attribute_value(transB_attribute[0]) > 0

    return False


def get_beta(gemm_node):
    beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
    if beta_attribute:
        return onnx.helper.get_attribute_value(beta_attribute[0])

    return 1.0


def set_default_beta(gemm_node):
    beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
    if beta_attribute:
        beta_attribute[0].f = 1.0

    return 1.0


class QLinearGemm(QOpMatMul):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Gemm"

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])

        if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])
            quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
                node.input[1],
                self.quantizer.weight_qType,
                0 if is_B_transposed(node) else 1,
            )
            quantized_input_names.append(quant_weight_tuple[0])
            zero_point_names.append(quant_weight_tuple[1])
            scale_names.append(quant_weight_tuple[2])
        else:
            # Get quantized names from both the activation (input[0]) and the weight (input[1]).
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])

            (
                quantized_input_names_weight,
                zero_point_names_weight,
                scale_names_weight,
                nodes_weight,
            ) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
            quantized_input_names.extend(quantized_input_names_weight)
            zero_point_names.extend(zero_point_names_weight)
            scale_names.extend(scale_names_weight)
            nodes.extend(nodes_weight)

        if not data_found or quantized_input_names is None:
            return super().quantize()

        quantized_bias_name = ""
        if len(node.input) == 3:
            if not self.quantizer.is_input_a_initializer(node.input[2]):
                return super().quantize()

            # Note: if the quantized type is float 8, the bias is converted into float 16.
            # cublasLtMatMul only supports (b)float16 or float32 bias.
            quantized_bias_name = self.quantizer.quantize_bias_static(
                node.input[2], node.input[0], node.input[1], get_beta(self.node)
            )

        qgemm_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qgemm_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            if attribute.name != "beta":
                kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        # Generate the inputs
        qgemm_inputs = []
        for i in range(2):
            qgemm_inputs.extend([quantized_input_names[i], scale_names[i], zero_point_names[i]])

        qgemm_inputs.extend([quantized_bias_name, output_scale_name, output_zp_name])

        qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], qgemm_name, **kwargs)
        nodes.append(qgemm_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qgemm_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
            node_type=node.op_type,
            node_qtype=self.quantizer.weight_qType,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes


class QDQGemm(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Gemm"

        self.quantizer.quantize_activation_tensor(node.input[0])
        if not self.disable_qdq_for_node_output:
            self.quantizer.quantize_activation_tensor(node.output[0])

        is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
            node.input[1], default_axis=0 if is_B_transposed(node) else 1
        )
        if is_weight_per_channel:
            self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
        else:
            self.quantizer.quantize_weight_tensor(node.input[1])

        if len(node.input) == 3:
            if self.quantizer.is_input_a_initializer(node.input[2]):
                self.quantizer.quantize_bias_tensor(
                    node.name, node.input[2], node.input[0], node.input[1], get_beta(self.node)
                )
                set_default_beta(self.node)
            else:
                logging.warning(
                    f"Bias of Gemm node '{self.node.name}' is not constant. Please exclude this node for better performance."
                )
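# A small sketch (not part of this file) of the per-channel axis choice above:
# Gemm's B carries output channels on axis 1, or on axis 0 when transB=1, and
# per-channel scales must line up with that axis.
import numpy as np

K, N = 4, 3
b = np.zeros((K, N), dtype=np.float32)             # transB=0: N output channels on axis 1
per_channel_scales = np.ones(N, dtype=np.float32)  # one scale per output channel
assert b.shape[1] == per_channel_scales.size
assert b.T.shape[0] == per_channel_scales.size     # transB=1: channels move to axis 0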
@@ -0,0 +1,121 @@
import numpy
import onnx
from onnx import onnx_pb as onnx_proto

from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain  # noqa: F401
from .base_operator import QuantOperatorBase

"""
Quantize LSTM
"""


class LSTMQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        """
        parameter node: LSTM node.
        parameter new_nodes_list: List of new nodes created before processing this node.
        return: a list of nodes in topological order that represents the quantized LSTM node.
        """
        node = self.node
        assert node.op_type == "LSTM"

        if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight(
            node.input[2]
        ):
            super().quantize()
            return

        model = self.quantizer.model
        W = model.get_initializer(node.input[1])  # noqa: N806
        R = model.get_initializer(node.input[2])  # noqa: N806

        if len(W.dims) != 3 or len(R.dims) != 3:
            super().quantize()
            return

        [W_num_dir, W_4_hidden_size, W_input_size] = W.dims  # noqa: N806
        [R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims  # noqa: N806

        if self.quantizer.is_per_channel():
            del W.dims[0]
            del R.dims[0]
            W.dims[0] = W_num_dir * W_4_hidden_size
            R.dims[0] = R_num_dir * R_4_hidden_size

        quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
            node.input[1],
            onnx_proto.TensorProto.INT8,
            0,  # self.quantizer.weight_qType?
        )
        quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
            node.input[2],
            onnx_proto.TensorProto.INT8,
            0,  # self.quantizer.weight_qType?
        )

        W_quant_weight = model.get_initializer(quant_input_weight_tuple[0])  # noqa: N806
        R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0])  # noqa: N806

        W_quant_array = onnx.numpy_helper.to_array(W_quant_weight)  # noqa: N806
        R_quant_array = onnx.numpy_helper.to_array(R_quant_weight)  # noqa: N806

        W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size))  # noqa: N806
        R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size))  # noqa: N806

        W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1))  # noqa: N806
        R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1))  # noqa: N806

        W_quant_transposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0])  # noqa: N806
        R_quant_transposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0])  # noqa: N806

        model.remove_initializers([W_quant_weight, R_quant_weight])
        model.add_initializer(W_quant_transposed)
        model.add_initializer(R_quant_transposed)

        W_quant_zp = model.get_initializer(quant_input_weight_tuple[1])  # noqa: N806
        R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1])  # noqa: N806
        W_quant_scale = model.get_initializer(quant_input_weight_tuple[2])  # noqa: N806
        R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2])  # noqa: N806

        if self.quantizer.is_per_channel():
            W_quant_zp.dims[:] = [W_num_dir, W_4_hidden_size]
            R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size]
            W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size]
            R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size]

        inputs = []
        input_len = len(node.input)
        inputs.extend([node.input[0]])
        inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]])
        inputs.extend([node.input[3] if input_len > 3 else ""])
        inputs.extend([node.input[4] if input_len > 4 else ""])
        inputs.extend([node.input[5] if input_len > 5 else ""])
        inputs.extend([node.input[6] if input_len > 6 else ""])
        inputs.extend([node.input[7] if input_len > 7 else ""])
        inputs.extend(
            [
                quant_input_weight_tuple[2],
                quant_input_weight_tuple[1],
                quant_recurrent_weight_tuple[2],
                quant_recurrent_weight_tuple[1],
            ]
        )

        kwargs = {}
        for attribute in node.attribute:
            if attribute.name == "layout":
                continue
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        quant_lstm_name = "" if not node.name else node.name + "_quant"
        quant_lstm_node = onnx.helper.make_node("DynamicQuantizeLSTM", inputs, node.output, quant_lstm_name, **kwargs)
        self.quantizer.new_nodes.append(quant_lstm_node)

        dequantize_node = self.quantizer._dequantize_value(node.input[0])
        if dequantize_node is not None:
            self.quantizer.new_nodes.append(dequantize_node)
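# A numpy sketch (not part of this file) of the reshape + transpose performed
# above: per-channel quantization sees the weight flattened to 2-D, and the
# result is restored to 3-D and transposed to (num_dir, input, 4*hidden) --
# assumed here to be the layout DynamicQuantizeLSTM consumes.
import numpy

num_dir, four_hidden, input_size = 2, 8, 5
w_flat = numpy.zeros((num_dir * four_hidden, input_size), dtype=numpy.int8)
w3 = w_flat.reshape(num_dir, four_hidden, input_size)
assert numpy.transpose(w3, (0, 2, 1)).shape == (num_dir, input_size, four_hidden)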
@@ -0,0 +1,231 @@
import itertools
import logging

import onnx
from onnx import onnx_pb as onnx_proto

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QOpMatMul(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        if not self.quantizer.should_quantize_node(self.node):
            logging.debug(f"Ignore MatMul {self.node.name}")
            return False

        if (not self.quantizer.is_float_tensor(self.node.input[1])) and (
            not self.quantizer.is_float_tensor(self.node.input[0])
        ):
            logging.info(f"Ignore MatMul due to non float inputs {self.node.name}")
            return False

        # Do not quantize non-constant B matrices for MatMul.
        if self.quantizer.q_matmul_const_b_only:
            if not self.quantizer.find_initializer_in_path(self.node.input[1]):
                logging.info(f"Ignore MatMul due to non constant B: {self.quantizer.graph_scope}[{self.node.name}]")
                return False
        return True


"""
Used when quantize mode is QuantizationMode.IntegerOps.
"""


class MatMulInteger(QOpMatMul):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MatMul"
        # Get quantized names from both the activation (input[0]) and the weight (input[1]).
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)
        nodes.extend(nodes_weight)

        matmul_integer_output = node.output[0] + "_output_quantized"
        matmul_integer_name = node.name + "_quant" if node.name else ""
        matmul_integer_node = onnx.helper.make_node(
            "MatMulInteger",
            quantized_input_names + zero_point_names,
            [matmul_integer_output],
            matmul_integer_name,
        )
        nodes.append(matmul_integer_node)

        # Add a cast operation to cast the MatMulInteger output to float.
        cast_op_output = matmul_integer_output + "_cast_output"
        otype = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
        cast_node = onnx.helper.make_node(
            "Cast",
            [matmul_integer_output],
            [cast_op_output],
            matmul_integer_output + "_cast",
            to=otype,
        )
        nodes.append(cast_node)

        # Add a mul operation to multiply the scales of the two inputs.
        assert len(scale_names) == 2
        scales_mul_op = (
            matmul_integer_name + "_scales_mul"
            if matmul_integer_name
            else scale_names[0] + "_" + scale_names[1] + "_mul"
        )

        scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
        if scales_mul_node is None:
            scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
            nodes.append(scales_mul_node)

        scales_mul_op_output = scales_mul_node.output[0]

        # Add a mul operation to multiply the mul_scales_op result with the output of MatMulInteger
        # and make the output of this node the same as the output of the original matmul node.
        output_scale_mul_op = ""
        if matmul_integer_name:
            output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
        nodes.append(
            get_mul_node(
                [cast_op_output, scales_mul_op_output],
                node.output[0],
                output_scale_mul_op,
            )
        )
        self.quantizer.new_nodes += nodes


"""
Used when quantize mode is QuantizationMode.QLinearOps.
"""


class QLinearMatMul(QOpMatMul):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MatMul"
        # Get quantized names from both the activation (input[0]) and the weight (input[1]).
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)

        nodes.extend(nodes_weight)
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        if not data_found or quantized_input_names is None:
            return super().quantize()

        qlinear_matmul_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_matmul_name = node.name + "_quant" if node.name else ""

        qlinear_matmul_inputs = []
        # Input 0
        qlinear_matmul_inputs.append(quantized_input_names[0])
        qlinear_matmul_inputs.append(scale_names[0])
        qlinear_matmul_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_matmul_inputs.append(quantized_input_names[1])
        qlinear_matmul_inputs.append(scale_names[1])
        qlinear_matmul_inputs.append(zero_point_names[1])
        # Output quantization parameters
        qlinear_matmul_inputs.append(output_scale_name)
        qlinear_matmul_inputs.append(output_zp_name)

        domain = (
            "com.microsoft"
            if self.quantizer.weight_qType
            in {
                onnx_proto.TensorProto.FLOAT8E4M3FN,
                onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
                onnx_proto.TensorProto.FLOAT8E5M2,
                onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
            }
            else ""
        )
        qlinear_matmul_node = onnx.helper.make_node(
            "QLinearMatMul",
            qlinear_matmul_inputs,
            [qlinear_matmul_output],
            qlinear_matmul_name,
            domain=domain,
        )
        nodes.append(qlinear_matmul_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_matmul_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes


class QDQMatMul(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MatMul"

        if self.disable_qdq_for_node_output:
            nodes_to_iterate = node.input
        else:
            nodes_to_iterate = itertools.chain(node.input, node.output)

        for tensor_name in nodes_to_iterate:
            if find_by_name(tensor_name, self.quantizer.model.initializer()):
                is_per_channel, channel_axis = self.quantizer.is_tensor_per_channel(
                    tensor_name, default_axis=1, op_type=node.op_type
                )
                if is_per_channel:
                    self.quantizer.quantize_weight_tensor_per_channel(tensor_name, channel_axis)
                else:
                    self.quantizer.quantize_weight_tensor(tensor_name)
            else:
                self.quantizer.quantize_activation_tensor(tensor_name)
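# A numpy check (not part of this file) of the Cast + Mul reconstruction above:
#   A ~ (qA - zA) * sA and B ~ (qB - zB) * sB imply
#   A @ B = ((qA - zA) @ (qB - zB)) * (sA * sB),
# i.e. the MatMulInteger output, cast to float, times the product of scales.
import numpy as np

rng = np.random.default_rng(0)
qa = rng.integers(0, 256, (2, 4)).astype(np.int32)
qb = rng.integers(0, 256, (4, 3)).astype(np.int32)
sa, za, sb, zb = 0.05, 128, 0.02, 128
int_out = (qa - za) @ (qb - zb)                   # what MatMulInteger computes
approx = int_out.astype(np.float32) * (sa * sb)   # Cast, then Mul by sA*sB
exact = ((qa - za) * sa) @ ((qb - zb) * sb)       # float MatMul on dequantized inputs
assert np.allclose(approx, exact)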
@@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp


class QMaxPool(Direct8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MaxPool"

        # If the opset version is less than 12, go through the normal quantize path.
        if self.quantizer.opset_version < 12:
            super(Direct8BitOp, self).quantize()
            return

        # Direct 8-bit op
        return super().quantize()


class QDQMaxPool(QDQDirect8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MaxPool"

        # If the opset version is less than 12, make no change.
        if self.quantizer.opset_version < 12:
            return

        # Direct 8-bit op
        return super().quantize()
@@ -0,0 +1,40 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

from .qdq_base_operator import QDQOperatorBase


class QDQNormalization(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type in {"InstanceNormalization", "LayerNormalization", "BatchNormalization"}

        # Input
        self.quantizer.quantize_activation_tensor(node.input[0])

        # Scale
        scale_is_initializer = self.quantizer.is_input_a_initializer(node.input[1])
        scale_is_per_channel, scale_channel_axis = self.quantizer.is_tensor_per_channel(
            node.input[1], default_axis=1, op_type=node.op_type
        )

        if scale_is_per_channel:
            self.quantizer.quantize_weight_tensor_per_channel(node.input[1], axis=scale_channel_axis)
        elif scale_is_initializer:
            self.quantizer.quantize_weight_tensor(node.input[1])
        else:
            self.quantizer.quantize_activation_tensor(node.input[1])

        # Bias
        if len(node.input) > 2 and node.input[2]:
            self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])

        # Output
        if not self.disable_qdq_for_node_output:
            for output_name in node.output:
                self.quantizer.quantize_activation_tensor(output_name)
@@ -0,0 +1,172 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from __future__ import annotations

from typing import Any

import numpy as np
import onnx

from ..quant_utils import (
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    quantize_nparray,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QPad(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Pad"

        # The optional 'constant_value' input only exists from opset 11 on.
        # If input[0] is not quantized, do not quantize this node.
        if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
            super().quantize()
            return
        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]

        kwargs = {}
        for attribute in node.attribute:
            kv = attribute_to_kwarg(attribute)
            kwargs.update(kv)

        if "mode" not in kwargs or kwargs["mode"] == b"constant":
            if len(node.input) > 2 and node.input[2] != "":  # There is a third input, 'constant_value'
                zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
                scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
                if zp_tensor is None or scale_tensor is None:
                    super().quantize()
                    return

                padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
                if padding_constant_initializer is not None:
                    zp_array = onnx.numpy_helper.to_array(zp_tensor)
                    zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
                    scale_array = onnx.numpy_helper.to_array(scale_tensor)
                    scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
                    padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
                    quantized_padding_constant_array = quantize_nparray(
                        self.quantizer.activation_qType,
                        padding_constant_array,
                        scale_value,
                        zp_value,
                    )
                    quantized_padding_constant_name = node.input[2] + TENSOR_NAME_QUANT_SUFFIX
                    quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
                        quantized_padding_constant_array,
                        quantized_padding_constant_name,
                    )
                    # Assume this padding constant initializer is only used by this node.
                    self.quantizer.model.remove_initializer(padding_constant_initializer)
                    self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
                    node.input[2] = quantized_padding_constant_name
                else:
                    # TODO: check quantize_inputs after subgraphs are supported
                    pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
                        node,
                        2,
                        self.quantizer.activation_qType,
                        quantized_input_value.scale_name,
                        quantized_input_value.zp_name,
                        initial_type=scale_tensor.data_type,
                    )
                    self.quantizer.new_nodes.extend(pad_value_qnodes)
                    node.input[2] = pad_value_qnodes[0].output[0]
            else:
                # In quantized format, the `zero` before quantization is mapped
                # to quantized_input_value.zp_name. Thus, padding the original
                # tensor with 0 should become padding the quantized tensor with
                # its zero point.
                if len(node.input) == 2:
                    # Feed the quantization zero point to the padding node.
                    node.input.append(quantized_input_value.zp_name)
                else:
                    # Assign the quantization zero point to the padding node.
                    assert node.input[2] == ""
                    node.input[2] = quantized_input_value.zp_name

        # Create an entry for the output quantized value
        quantized_output_value = QuantizedValue(
            node.output[0],
            node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
            quantized_input_value.scale_name,
            quantized_input_value.zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        node.input[0] = quantized_input_value.q_name
        node.output[0] = quantized_output_value.q_name
        self.quantizer.new_nodes += [node]


class QDQPad(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def _get_pad_const_val(self, attrs_dict: dict[str, Any]) -> np.ndarray | None:
        """
        Returns the Pad's constant padding value. Returns `None` if the padding value is
        not constant (i.e., comes from a dynamic input).
        """
        const_val = None
        onnx_tensor_type = self.quantizer.model.get_tensor_type(self.node.input[0])
        if onnx_tensor_type is None:
            return None

        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type.elem_type)
        if self.quantizer.opset_version < 11:
            const_val = np.array(attrs_dict.get("value", 0), dtype=np_dtype)
        elif len(self.node.input) >= 3 and self.node.input[2]:
            const_val = self.quantizer.model.get_constant_value(self.node.input[2])
        else:
            const_val = np.array(0, dtype=np_dtype)

        return const_val

    def _should_quantize_output_same_as_input(self) -> bool:
        """
        Returns True if Pad's output should use the same quantization parameters as input[0].
        """
        attrs_dict = {}
        for attribute in self.node.attribute:
            kv = attribute_to_kwarg(attribute)
            attrs_dict.update(kv)

        pad_mode = attrs_dict.get("mode", b"constant")
        if pad_mode in (b"reflect", b"edge", b"wrap"):
            # These modes pad the output with values that already exist in the input,
            # so the output can be quantized the same as the input.
            return True

        # For 'constant' mode, padding with 0 also allows quantizing the output the same
        # as the input, because our quantization floating-point range always includes 0.
        if pad_mode == b"constant":
            pad_val = self._get_pad_const_val(attrs_dict)
            if pad_val is not None and pad_val.dtype in (np.float32, np.float16):
                return float(pad_val.item()) == 0

        return False

    def quantize(self):
        assert self.node.op_type == "Pad"

        for input_name in self.node.input:
            if input_name:
                self.quantizer.quantize_activation_tensor(input_name)

        if not self.disable_qdq_for_node_output:
            if self._should_quantize_output_same_as_input():
                self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
            else:
                self.quantizer.quantize_activation_tensor(self.node.output[0])
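Both Pad paths above rely on the same affine mapping: a float padding constant lands on the input's quantized grid, and in particular a padding value of 0 lands exactly on the zero point. A uint8 sketch of that mapping (assuming quantize_nparray implements standard affine quantization):

import numpy as np

def affine_quantize_uint8(x, scale, zero_point):
    # q = clip(round(x / scale) + zero_point, 0, 255)
    return np.clip(np.round(x / scale) + zero_point, 0, 255).astype(np.uint8)

print(affine_quantize_uint8(np.float32(0.0), scale=0.02, zero_point=128))  # -> 128, the zero point
print(affine_quantize_uint8(np.float32(1.0), scale=0.02, zero_point=128))  # -> 178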
@@ -0,0 +1,67 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QLinearPool(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        # Only try to quantize when quantization parameters are available for the output.
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])

        # Get quantized input tensor names; quantize the input if needed.
        (
            quantized_input_names,
            input_zero_point_names,
            input_scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        if not data_found or quantized_input_names is None:
            return super().quantize()

        # Create an entry for the output quantized value.
        qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        quantized_output_value = QuantizedValue(
            node.output[0],
            qlinear_output_name,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        # Create a QLinear* pool node for the given type (AveragePool, etc.)
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        qlinear_node_name = node.name + "_quant" if node.name else ""
        qnode = onnx.helper.make_node(
            "QLinear" + node.op_type,
            [
                quantized_input_names[0],
                input_scale_names[0],
                input_zero_point_names[0],
                output_scale_name,
                output_zp_name,
            ],
            [qlinear_output_name],
            qlinear_node_name,
            **kwargs,
        )

        # Add all newly created nodes.
        nodes.append(qnode)
        self.quantizer.new_nodes += nodes
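Unlike MaxPool, averaging produces values that fall between input grid points, which is why QLinearAveragePool takes explicit output scale/zero point and requantizes rather than reusing the input's parameters. Conceptually (a sketch, not the kernel's actual implementation):

import numpy as np

scale_in, zp_in = 0.1, 0
scale_out, zp_out = 0.05, 0

q_in = np.array([10, 11], dtype=np.int32)   # two quantized input values
mean_fp = (q_in - zp_in).mean() * scale_in  # dequantized average: 1.05, not on the 0.1 grid
q_out = np.clip(np.round(mean_fp / scale_out) + zp_out, -128, 127).astype(np.int8)  # 21 on the finer grid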
+22
@@ -0,0 +1,22 @@
import itertools

from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray  # noqa: F401
from .base_operator import QuantOperatorBase  # noqa: F401


class QDQOperatorBase:
    def __init__(self, onnx_quantizer, onnx_node):
        self.quantizer = onnx_quantizer
        self.node = onnx_node
        self.disable_qdq_for_node_output = onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization

    def quantize(self):
        node = self.node

        if self.disable_qdq_for_node_output:
            tensors_to_quantize = node.input
        else:
            tensors_to_quantize = itertools.chain(node.input, node.output)

        for tensor_name in tensors_to_quantize:
            self.quantizer.quantize_activation_tensor(tensor_name)
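Marking a tensor with quantize_activation_tensor means the quantizer will later wrap it in a QuantizeLinear/DequantizeLinear pair. A sketch of the resulting pattern for a tensor "x" (tensor names here are illustrative):

import onnx

q_node = onnx.helper.make_node("QuantizeLinear", ["x", "x_scale", "x_zero_point"], ["x_q"])
dq_node = onnx.helper.make_node("DequantizeLinear", ["x_q", "x_scale", "x_zero_point"], ["x_dq"])
# Consumers of "x" are then rewired to read "x_dq" instead.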
@@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp


class QResize(Direct8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Resize"

        # Before opset 11, fall back to the normal quantization path.
        if self.quantizer.opset_version < 11:
            super(Direct8BitOp, self).quantize()
            return

        # Direct 8-bit op: reuse the input's quantization parameters.
        return super().quantize()


class QDQResize(QDQDirect8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Resize"

        # Before opset 11, just keep this node as-is.
        if self.quantizer.opset_version < 11:
            return

        # Direct 8-bit op: reuse the input's quantization parameters.
        return super().quantize()
@@ -0,0 +1,74 @@
import onnx
import onnx.helper

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QLinearSoftmax(QuantOperatorBase):
    def quantize(self):
        node = self.node
        # Pin the softmax output scale and zero point: softmax outputs always lie in [0, 1].
        if self.quantizer.activation_qType == onnx.onnx_pb.TensorProto.UINT8:
            out_scale = 1 / 256.0
            out_zero_point = 0
        else:
            out_scale = 1 / 256.0
            out_zero_point = -128
        # Only try to quantize when quantization parameters are available for the output.
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0], out_scale, out_zero_point)

        # Get quantized input tensor names; quantize the input if needed.
        (
            quantized_input_names,
            input_zero_point_names,
            input_scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        if not data_found or quantized_input_names is None:
            return super().quantize()

        # Create an entry for the output quantized value.
        qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        quantized_output_value = QuantizedValue(
            node.output[0],
            qlinear_output_name,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        # Create the QLinearSoftmax node.
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        # Give QLinearSoftmax the real opset_version; its default SinceVersion would otherwise be 1.
        kwargs["opset"] = self.quantizer.opset_version
        qlinear_node_name = node.name + "_quant" if node.name else ""
        qnode = onnx.helper.make_node(
            "QLinear" + node.op_type,
            [
                quantized_input_names[0],
                input_scale_names[0],
                input_zero_point_names[0],
                output_scale_name,
                output_zp_name,
            ],
            [qlinear_output_name],
            qlinear_node_name,
            **kwargs,
        )

        # Add all newly created nodes.
        nodes.append(qnode)
        self.quantizer.new_nodes += nodes
        return None
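Because softmax outputs lie in [0, 1], a fixed scale of 1/256 covers the whole range: uint8 uses zero point 0 (grid 0 .. 255/256) and int8 shifts the same grid with zero point -128. A small numeric check (illustrative):

import numpy as np

p = np.array([0.0, 0.5, 0.999], dtype=np.float32)
scale = 1 / 256.0

q_u8 = np.clip(np.round(p / scale) + 0, 0, 255).astype(np.uint8)      # [0, 128, 255] (1.0 saturates)
q_s8 = np.clip(np.round(p / scale) - 128, -128, 127).astype(np.int8)  # [-128, 0, 127]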
@@ -0,0 +1,63 @@
import onnx

from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QSplit(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])
        if quantized_input_names is None:
            return super().quantize()

        quantized_node_name = ""
        if node.name:
            quantized_node_name = node.name + "_quant"
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))

        # Each output simply derives its scale/zero point from the input.
        quantized_output_names = []
        for output_name in node.output:
            quantized_output_name = output_name + "quantized"
            quantized_output_names.append(quantized_output_name)
            q_output = QuantizedValue(
                output_name,
                quantized_output_name,
                scale_names[0],
                zero_point_names[0],
                QuantizedValueType.Input,
            )
            self.quantizer.quantized_value_map[output_name] = q_output

        if len(node.input) > 1:
            quantized_input_names.extend(node.input[1:])
        quantized_node = onnx.helper.make_node(
            node.op_type, quantized_input_names, quantized_output_names, quantized_node_name, **kwargs
        )

        nodes.append(quantized_node)
        self.quantizer.new_nodes += nodes


class QDQSplit(QDQOperatorBase):
    def quantize(self):
        node = self.node
        assert node.op_type == "Split"

        if not self.quantizer.is_tensor_quantized(node.input[0]):
            self.quantizer.quantize_activation_tensor(node.input[0])
        if not self.disable_qdq_for_node_output:
            for output in node.output:
                self.quantizer.quantize_output_same_as_input(output, node.input[0], node.name)
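Split only routes values to different outputs, which is why every output above reuses the input's scale and zero point: splitting commutes with elementwise quantization. A quick numpy check (illustrative):

import numpy as np

scale, zp = 0.01, 0
x = np.random.randn(8).astype(np.float32)

def q(v):
    return np.clip(np.round(v / scale) + zp, -128, 127).astype(np.int8)

left, right = np.split(x, 2)
assert np.array_equal(np.split(q(x), 2)[0], q(left))
assert np.array_equal(np.split(q(x), 2)[1], q(right))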
@@ -0,0 +1,87 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QLinearWhere(QuantOperatorBase):
    def should_quantize(self):
        return True

    def quantize(self):
        node = self.node
        assert node.op_type == "Where"
        if not self.quantizer.force_quantize_no_input_check:
            self.quantizer.new_nodes += [node]
            return
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        (
            q_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [1, 2])
        if not data_found or q_input_names is None:
            return super().quantize()

        qlinear_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_node_name = node.name + "_quant" if node.name else ""

        q_output = QuantizedValue(
            node.output[0],
            qlinear_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qlwhere_inputs = [
            node.input[0],
            q_input_names[0],
            scale_names[0],
            zero_point_names[0],
            q_input_names[1],
            scale_names[1],
            zero_point_names[1],
            output_scale_name,
            output_zp_name,
        ]
        qlwhere_node = onnx.helper.make_node(
            "QLinearWhere", qlwhere_inputs, [qlinear_output], qlinear_node_name, **kwargs
        )

        self.quantizer.new_nodes += nodes
        self.quantizer.new_nodes += [qlwhere_node]


class QDQWhere(QDQOperatorBase):
    def quantize(self):
        node = self.node
        assert node.op_type == "Where"
        if self.quantizer.force_quantize_no_input_check:
            if not self.quantizer.is_tensor_quantized(node.input[1]):
                self.quantizer.quantize_activation_tensor(node.input[1])
            if not self.quantizer.is_tensor_quantized(node.input[2]):
                self.quantizer.quantize_activation_tensor(node.input[2])
            if not self.disable_qdq_for_node_output:
                for output in node.output:
                    self.quantizer.quantize_activation_tensor(output)
        elif (
            self.quantizer.is_tensor_quantized(node.input[1])
            and self.quantizer.is_tensor_quantized(node.input[2])
            and not self.disable_qdq_for_node_output
        ):
            for output in node.output:
                self.quantizer.quantize_activation_tensor(output)
|
||||
Reference in New Issue
Block a user