switch to high-quality piper tts and add label translations
@@ -0,0 +1,2 @@
# from .base_operator import QuantOperatorBase
# from .matmul import MatMulInteger
@@ -0,0 +1,119 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QLinearActivation(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def QuantizeClipRelu(self):  # noqa: N802
        node = self.node
        assert node.op_type == "Relu" or node.op_type == "Clip"

        # When mode is QLinearOps, the output quantization params are calculated based on outputs from
        # activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op.
        # If the input to this node is not quantized, keep this node.
        # If the activation is symmetric, do not quantize the op and simply return.
        if node.input[0] not in self.quantizer.quantized_value_map or self.quantizer.is_activation_symmetric:
            return super().quantize()

        quantized_value = self.quantizer.quantized_value_map[node.input[0]]
        self.quantizer.quantized_value_map[node.output[0]] = quantized_value

    def quantize(self):
        node = self.node
        if node.op_type == "Relu" or node.op_type == "Clip":
            self.QuantizeClipRelu()
            return

        nnapi_sigmoid_option = "extra.Sigmoid.nnapi"
        sigmoid_nnapi_mode = (
            node.op_type == "Sigmoid"
            and nnapi_sigmoid_option in self.quantizer.extra_options
            and self.quantizer.extra_options[nnapi_sigmoid_option]
        )
        use_scale = 1 / 256.0 if sigmoid_nnapi_mode else None
        use_zeropoint = 0 if sigmoid_nnapi_mode else None

        # No assert on op_type as it is controlled by the registry;
        # only try to quantize when quantization parameters are given for it.
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])
        if not data_found or quantized_input_names is None:
            return super().quantize()

        qlinear_activation_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_activation_name = ""
        if node.name:
            qlinear_activation_name = node.name + "_quant"
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qlinear_activation_inputs = [
            quantized_input_names[0],
            scale_names[0],
            zero_point_names[0],
            output_scale_name,
            output_zp_name,
        ]

        qlinear_activation_node = onnx.helper.make_node(
            "QLinear" + node.op_type,
            qlinear_activation_inputs,
            [qlinear_activation_output],
            qlinear_activation_name,
            **kwargs,
        )

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_activation_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        nodes.append(qlinear_activation_node)
        self.quantizer.new_nodes += nodes


class QDQRemovableActivation(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        # If the input to this node is not quantized, keep this node.
        if not self.quantizer.is_tensor_quantized(node.input[0]):
            return

        if (
            not self.quantizer.is_activation_symmetric
            and not self.quantizer.qdq_keep_removable_activations
            and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
        ):
            self.quantizer.remove_node(self.node)
        else:
            self.quantizer.quantize_activation_tensor(node.input[0])

        if not self.disable_qdq_for_node_output:
            self.quantizer.quantize_activation_tensor(node.output[0])
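# A minimal numpy sketch (not part of this file) of why scale=1/256 and
# zero_point=0 fit an NNAPI uint8 Sigmoid output: sigmoid lands in (0, 1),
# and with scale 1/256 the uint8 range [0, 255] covers [0.0, 255/256].
import numpy as np

y = 1.0 / (1.0 + np.exp(-np.linspace(-6.0, 6.0, 5)))  # sample sigmoid outputs
q = np.clip(np.round(y / (1 / 256.0)) + 0, 0, 255)    # quantize: round(y / scale) + zp
assert ((q >= 0) & (q <= 255)).all()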
@@ -0,0 +1,18 @@
from .base_operator import QuantOperatorBase


# Use the quantized tensor as input without DQ.
class QArgMax(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
        if quantized_input_value is None:
            self.quantizer.new_nodes += [node]
            return

        node.input[0] = quantized_input_value.q_name
        self.quantizer.new_nodes += [node]
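# Why no DequantizeLinear is needed here (illustrative check, not part of this
# file): dequantization x = (q - zp) * scale is strictly increasing for
# scale > 0, so ArgMax over the quantized tensor picks the same index.
import numpy as np

q = np.array([3, 200, 117], dtype=np.uint8)
scale, zp = 0.07, 10
assert np.argmax(q) == np.argmax((q.astype(np.float32) - zp) * scale)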
@@ -0,0 +1,73 @@
import onnx
from onnx import onnx_pb as onnx_proto  # noqa: F401

from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase

"""
Quantize Attention
"""


class AttentionQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        return self.quantizer.should_quantize_node(self.node)

    def quantize(self):
        """
        parameter node: Attention node.
        parameter new_nodes_list: List of new nodes created before processing this node.
        return: a list of nodes in topological order that represents the quantized Attention node.
        """
        node = self.node
        assert node.op_type == "Attention"

        # TODO: This is a temporary fix to stop exporting QAttention with the qkv_hidden_sizes
        # attribute. It needs to be removed once QAttention for varied q, k, v sizes
        # is implemented.
        for attr in node.attribute:
            if attr.name == "qkv_hidden_sizes":
                return super().quantize()

        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)
        nodes.extend(nodes_weight)

        if quantized_input_names is None:
            return super().quantize()

        qattention_name = "" if not node.name else node.name + "_quant"

        inputs = []
        inputs.extend(quantized_input_names)
        inputs.extend([node.input[2]])
        inputs.extend(scale_names)
        inputs.extend([node.input[3] if len(node.input) > 3 else ""])
        inputs.extend(zero_point_names)
        inputs.extend([node.input[4] if len(node.input) > 4 else ""])

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, qattention_name, **kwargs)
        nodes.append(qattention_node)

        self.quantizer.new_nodes += nodes
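# Illustrative summary of the QAttention input order assembled above, for a
# node with all optional inputs present (placeholder names, shown only to make
# the interleaving of names, scales, and zero points explicit):
qattention_input_order = [
    "input_quantized",    # quantized_input_names[0]
    "weight_quantized",   # quantized_input_names[1]
    "bias",               # node.input[2], kept in float
    "input_scale",        # scale_names[0]
    "weight_scale",       # scale_names[1]
    "mask_index",         # node.input[3], "" if absent
    "input_zero_point",   # zero_point_names[0]
    "weight_zero_point",  # zero_point_names[1]
    "past",               # node.input[4], "" if absent
]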
@@ -0,0 +1,26 @@
class QuantOperatorBase:
    def __init__(self, onnx_quantizer, onnx_node):
        self.quantizer = onnx_quantizer
        self.node = onnx_node

    def should_quantize(self):
        if not self.quantizer.should_quantize_node(self.node):
            return False

        return self.quantizer.is_float_tensor(self.node.input[0])

    def quantize(self):
        """
        Given a node which does not support quantization, this method checks whether the input to
        this node is quantized and adds a DequantizeLinear node to dequantize this input back to FP32
        parameter node: Current node
        parameter new_nodes_list: List of new nodes created before processing current node
        return: List of new nodes created
        """
        for _, node_input in enumerate(self.node.input):
            dequantize_node = self.quantizer._dequantize_value(node_input)
            if dequantize_node is not None:
                self.quantizer.new_nodes.append(dequantize_node)

        # Append the original node
        self.quantizer.new_nodes.append(self.node)
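# Roughly the kind of node the fallback above re-inserts (hypothetical tensor
# names for illustration; the actual node comes from the quantizer's
# _dequantize_value helper):
import onnx

example_dq = onnx.helper.make_node(
    "DequantizeLinear",
    ["x_quantized", "x_scale", "x_zero_point"],  # quantized input + its params
    ["x_dequantized"],                           # float tensor handed to the op
    "x_DequantizeLinear",
)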
@@ -0,0 +1,72 @@
import onnx
from onnx import onnx_pb as onnx_proto  # noqa: F401

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QLinearBinaryOp(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0, 1])
        if not data_found or quantized_input_names is None:
            return super().quantize()

        qlinear_binary_math_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_binary_math_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qlinear_binary_math_inputs = []
        # Input 0
        qlinear_binary_math_inputs.append(quantized_input_names[0])
        qlinear_binary_math_inputs.append(scale_names[0])
        qlinear_binary_math_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_binary_math_inputs.append(quantized_input_names[1])
        qlinear_binary_math_inputs.append(scale_names[1])
        qlinear_binary_math_inputs.append(zero_point_names[1])

        # Output
        qlinear_binary_math_inputs.append(output_scale_name)
        qlinear_binary_math_inputs.append(output_zp_name)

        qlinear_binary_math_node = onnx.helper.make_node(
            "QLinear" + node.op_type,
            qlinear_binary_math_inputs,
            [qlinear_binary_math_output],
            qlinear_binary_math_name,
            **kwargs,
        )
        nodes.append(qlinear_binary_math_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_binary_math_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes
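# A minimal numpy sketch (simplified; saturation/rounding details vary) of the
# arithmetic a QLinearAdd built from the nine inputs above performs:
import numpy as np

def qlinear_add(qa, sa, za, qb, sb, zb, so, zo):
    # Dequantize both inputs, add in float, requantize with the output params.
    y = (qa.astype(np.float32) - za) * sa + (qb.astype(np.float32) - zb) * sb
    return np.clip(np.round(y / so) + zo, 0, 255).astype(np.uint8)

qa = np.array([0, 128, 255], dtype=np.uint8)
qb = np.array([255, 128, 0], dtype=np.uint8)
print(qlinear_add(qa, 0.02, 128, qb, 0.02, 128, 0.04, 128))  # -> [128 128 128]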
@@ -0,0 +1,62 @@
import onnx

from ..quant_utils import (  # noqa: F401
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    ms_domain,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase  # noqa: F401


class QLinearConcat(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        (
            q_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [*range(len(node.input))])
        if not data_found or q_input_names is None:
            return super().quantize()

        # Create an entry for output quantized value
        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
        quantized_output_value = QuantizedValue(
            node.output[0],
            node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
            output_scale_name,
            output_zp_name,
            quantized_input_value.value_type,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        qnode_name = node.name + "_quant" if node.name else ""

        qlconcat_inputs = [output_scale_name, output_zp_name]
        for i in range(len(q_input_names)):
            qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]])
        qlconcat_node = onnx.helper.make_node(
            "QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs
        )

        self.quantizer.new_nodes += nodes
        self.quantizer.new_nodes += [qlconcat_node]
@@ -0,0 +1,260 @@
import numpy as np
import onnx
from onnx import onnx_pb as onnx_proto

from ..quant_utils import (
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    find_by_name,
    get_mul_node,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class ConvInteger(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def add_bias(self, nodes, scaled_output):
        """
        Given a node, this function handles the bias add by adding a "reshape" node on the bias and an "add" node.
        parameter nodes: new nodes would be appended into nodes
        parameter node: current node (Conv)
        parameter scaled_output: output of quant conv without bias
        parameter output: output of Conv
        parameter bias_name: bias of Conv
        return: the name of output
        """
        node = self.node
        model = self.quantizer.model
        # Add tensors for the shape to be reshaped to
        weight = find_by_name(node.input[1], model.initializer())
        if weight is None:
            raise ValueError(f"Expected {node.input[1]} to be an initializer")

        # Add reshape for correct broadcast
        output = node.output[0]
        reshape_input_data = node.input[2]  # bias of Conv
        reshape_input_shape = output + "_bias_reshape_shape"
        reshape_output = output + "_bias_reshape_output"

        shape = np.ones((len(weight.dims)), dtype=np.int64)
        shape[1] = -1
        init_shape = onnx.helper.make_tensor(
            reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)], shape
        )
        model.add_initializer(init_shape)

        reshape_node = onnx.helper.make_node("Reshape", [reshape_input_data, reshape_input_shape], [reshape_output])
        nodes.append(reshape_node)

        # Add an Add operation for bias
        add_node = onnx.helper.make_node("Add", [scaled_output, reshape_output], [output], output + "_bias_add")
        nodes.append(add_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Conv"
        # Get quantized names from both the activation (input[0]) and the weight (input[1]).
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)
        nodes.extend(nodes_weight)

        conv_integer_output = node.output[0] + "_output_quantized"
        conv_integer_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        conv_integer_node = onnx.helper.make_node(
            "ConvInteger", quantized_input_names + zero_point_names, [conv_integer_output], conv_integer_name, **kwargs
        )
        nodes.append(conv_integer_node)

        # Add a cast operation to cast the ConvInteger output to float.
        onnx_type = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
        cast_op_output = conv_integer_output + "_cast_output"
        cast_node = onnx.helper.make_node(
            "Cast",
            [conv_integer_output],
            [cast_op_output],
            conv_integer_output + "_cast",
            to=onnx_type,  # TODO: FLOAT or FLOAT16
        )
        nodes.append(cast_node)

        # Add a mul operation to multiply the scales of the two inputs.
        assert len(scale_names) == 2
        if conv_integer_name:
            scales_mul_op = conv_integer_name + "_scales_mul"
        else:
            scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"

        scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
        if scales_mul_node is None:
            scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
            nodes.append(scales_mul_node)

        scales_mul_op_output = scales_mul_node.output[0]

        has_bias = len(node.input) == 3
        scaled_output_name = node.output[0] if not has_bias else node.output[0] + "quant_scaled_output"

        # Add a mul operation to multiply the mul_scales_op result with the output of ConvInteger
        # and make the output of this node the same as the output of the original conv node.
        output_scale_mul_op = conv_integer_name + "_output_scale_mul" if conv_integer_name else ""
        nodes.append(
            get_mul_node(
                [cast_op_output, scales_mul_op_output],
                scaled_output_name,
                output_scale_mul_op,
            )
        )

        if has_bias:
            self.add_bias(nodes, scaled_output_name)

        self.quantizer.new_nodes += nodes


class QLinearConv(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Conv"

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])

        if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])
            quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
                node.input[1],
                onnx_proto.TensorProto.INT8,
                0,  # self.quantizer.weight_qType?
            )
            quantized_input_names.append(quant_weight_tuple[0])
            zero_point_names.append(quant_weight_tuple[1])
            scale_names.append(quant_weight_tuple[2])
        else:
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])

            (
                quantized_input_names_weight,
                zero_point_names_weight,
                scale_names_weight,
                nodes_weight,
            ) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
            quantized_input_names.extend(quantized_input_names_weight)
            zero_point_names.extend(zero_point_names_weight)
            scale_names.extend(scale_names_weight)
            nodes.extend(nodes_weight)

        if not data_found or quantized_input_names is None:
            return super().quantize()

        quantized_bias_name = ""
        bias_present = False
        if len(node.input) == 3:
            if self.quantizer.weight_qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
                raise RuntimeError("Quantization to FLOAT8E4M3FN for operator Conv is not supported.")
            quantized_bias_name = self.quantizer.quantize_bias_static(node.input[2], node.input[0], node.input[1])
            bias_present = True

        qlinear_conv_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_conv_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        qlinear_conv_inputs = []
        # Input 0
        qlinear_conv_inputs.append(quantized_input_names[0])
        qlinear_conv_inputs.append(scale_names[0])
        qlinear_conv_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_conv_inputs.append(quantized_input_names[1])
        qlinear_conv_inputs.append(scale_names[1])
        qlinear_conv_inputs.append(zero_point_names[1])

        # Output
        qlinear_conv_inputs.append(output_scale_name)
        qlinear_conv_inputs.append(output_zp_name)

        if bias_present:
            qlinear_conv_inputs.append(quantized_bias_name)

        qlinear_conv_node = onnx.helper.make_node(
            "QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], qlinear_conv_name, **kwargs
        )
        nodes.append(qlinear_conv_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_conv_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes


class QDQConv(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Conv" or node.op_type == "ConvTranspose"

        self.quantizer.quantize_activation_tensor(node.input[0])
        if not self.disable_qdq_for_node_output:
            self.quantizer.quantize_activation_tensor(node.output[0])

        is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
            node.input[1], default_axis=0 if node.op_type == "Conv" else 1
        )
        if is_weight_per_channel:
            self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
        else:
            self.quantizer.quantize_weight_tensor(node.input[1])

        if len(node.input) == 3:
            self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])
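# A small numpy check (not part of this file) of the bias reshape in add_bias
# above: with a 4-D Conv weight, shape becomes [1, -1, 1, 1], so the 1-D bias
# broadcasts per output channel in the final Add.
import numpy as np

conv_out = np.zeros((2, 3, 8, 8), dtype=np.float32)  # N, C, H, W
bias = np.arange(3, dtype=np.float32)                # one value per channel C
shape = np.ones(4, dtype=np.int64)
shape[1] = -1                                        # -> [1, -1, 1, 1]
assert (conv_out + bias.reshape(shape)).shape == conv_out.shape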
@@ -0,0 +1,78 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


# For operators that support 8-bit operations directly and whose output can
# reuse input[0]'s type, zero point, and scale; for example, Transpose, Reshape, etc.
class Direct8BitOp(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        if not self.quantizer.force_quantize_no_input_check:
            # Keep backward compatibility:
            # quantize when input[0] is already quantized; otherwise keep the node as-is.
            quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
            if quantized_input_value is None:
                self.quantizer.new_nodes += [node]
                return

            quantized_output_value = QuantizedValue(
                node.output[0],
                node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
                quantized_input_value.scale_name,
                quantized_input_value.zp_name,
                quantized_input_value.value_type,
            )
            self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

            node.input[0] = quantized_input_value.q_name
            node.output[0] = quantized_output_value.q_name
            self.quantizer.new_nodes += [node]

        else:
            # Force quantize these ops if possible; use the exclude-node list if this is not what you want.
            if not self.quantizer.is_valid_quantize_weight(node.input[0]):
                super().quantize()
                return

            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])
            if quantized_input_names is None:
                return super().quantize()

            # Create an entry for the output quantized value.
            quantized_output_value = QuantizedValue(
                node.output[0],
                node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
                scale_names[0],
                zero_point_names[0],
                QuantizedValueType.Input,
            )
            self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

            node.input[0] = quantized_input_names[0]
            node.output[0] = quantized_output_value.q_name
            nodes.append(node)

            self.quantizer.new_nodes += nodes


class QDQDirect8BitOp(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        if self.quantizer.force_quantize_no_input_check:
            self.quantizer.quantize_activation_tensor(self.node.input[0])
            if not self.disable_qdq_for_node_output:
                self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
        elif self.quantizer.is_tensor_quantized(self.node.input[0]) and not self.disable_qdq_for_node_output:
            self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
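# Why the output can reuse input[0]'s scale and zero point (illustrative
# check, not part of this file): data-movement ops like Transpose commute with
# elementwise dequantization, so the quantization params are unchanged.
import numpy as np

q = np.arange(6, dtype=np.uint8).reshape(2, 3)
scale, zp = 0.1, 128

def dequant(t):
    return (t.astype(np.float32) - zp) * scale

assert np.allclose(dequant(q.T), dequant(q).T)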
@@ -0,0 +1,121 @@
import logging

import onnx
from onnx import onnx_pb as onnx_proto  # noqa: F401

from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase

"""
Quantizes the EmbedLayerNorm fused ONNXRuntime Op.

This Quant operator keeps the input and segment IDs at int32 but will quantize all initializer and
weight inputs associated with the node to uint8.
"""


class EmbedLayerNormalizationQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        return self.quantizer.should_quantize_node(self.node)

    def quantize(self):
        node = self.node
        assert node.op_type == "EmbedLayerNormalization"

        if len(node.output) > 2:
            logging.info(f"Quantization is not applied to {node.name} since it has 3 outputs")
            return super().quantize()

        """
        Pre-quantization EmbedLayerNorm inputs:
        [0] input_ids (int32)
        [1] segment_ids (int32)
        [2] word_embedding (float32)
        [3] position_embedding (float32)
        [4] segment_embedding (float32)
        [5] gamma (float32)
        [6] beta (float32)
        [7] mask (int32) (optional)
        """
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [2, 3, 4, 5, 6])
        if quantized_input_names is None:
            return super().quantize()

        qembed_layer_norm_name = "" if not node.name else node.name + "_quant"

        """
        Quantized Input Tensor List
        [0] input_ids (int32)
        [1] segment_ids (int32)
        [2] word_embedding (uint8)
        [3] position_embedding (uint8)
        [4] segment_embedding (uint8)
        [5] gamma (uint8)
        [6] beta (uint8)
        [7] mask (int32) (optional)
        [8] word_embedding_scale (float)
        [9] position_embedding_scale (float)
        [10] segment_embedding_scale (float)
        [11] gamma_scale (float)
        [12] beta_scale (float)
        [13] word_embedding_zero_point (uint8)
        [14] position_embedding_zero_point (uint8)
        [15] segment_embedding_zero_point (uint8)
        [16] gamma_zero_point (uint8)
        [17] beta_zero_point (uint8)
        """
        inputs = []
        # 'input_ids'
        inputs.extend([node.input[0]])
        # 'segment_ids'
        inputs.extend([node.input[1]])
        # 'word_embedding_quant'
        inputs.extend([quantized_input_names[0]])
        # 'position_embedding_quant'
        inputs.extend([quantized_input_names[1]])
        # 'segment_embedding_quant'
        inputs.extend([quantized_input_names[2]])
        # 'gamma_quant'
        inputs.extend([quantized_input_names[3]])
        # 'beta_quant'
        inputs.extend([quantized_input_names[4]])
        # 'mask' (optional)
        inputs.extend([node.input[7] if len(node.input) > 7 else ""])

        # Add all scales:
        inputs.extend([scale_names[0]])
        inputs.extend([scale_names[1]])
        inputs.extend([scale_names[2]])
        inputs.extend([scale_names[3]])
        inputs.extend([scale_names[4]])

        # Add all zero points:
        inputs.extend([zero_point_names[0]])
        inputs.extend([zero_point_names[1]])
        inputs.extend([zero_point_names[2]])
        inputs.extend([zero_point_names[3]])
        inputs.extend([zero_point_names[4]])

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qembed_layer_norm_node = onnx.helper.make_node(
            "QEmbedLayerNormalization",
            inputs,
            node.output,
            qembed_layer_norm_name,
            **kwargs,
        )
        nodes.append(qembed_layer_norm_node)

        self.quantizer.new_nodes += nodes
@@ -0,0 +1,64 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase

"""
Quantize Gather
"""


class GatherQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        if not self.quantizer.should_quantize_node(self.node):
            return False

        return self.quantizer.is_valid_quantize_weight(self.node.input[0])

    def quantize(self):
        node = self.node
        assert node.op_type == "Gather"

        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])
        if quantized_input_names is None:
            return super().quantize()

        gather_new_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            gather_new_output,
            scale_names[0],
            zero_point_names[0],
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        node.output[0] = gather_new_output
        node.input[0] = quantized_input_names[0]
        nodes.append(node)

        self.quantizer.new_nodes += nodes


class QDQGather(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Gather" or node.op_type == "GatherElements"

        if self.quantizer.is_valid_quantize_weight(node.input[0]) or self.quantizer.force_quantize_no_input_check:
            self.quantizer.quantize_activation_tensor(node.input[0])
            self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
        elif self.quantizer.is_tensor_quantized(node.input[0]):
            self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
@@ -0,0 +1,62 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QGlobalAveragePool(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "GlobalAveragePool"

        # If the input to this node is not quantized, keep this node.
        if node.input[0] not in self.quantizer.quantized_value_map:
            return super().quantize()

        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]

        # Create an entry for the output quantized value.
        (
            data_found,
            output_scale_name_from_parameter,
            output_zp_name_from_parameter,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        # Just use the input scale and zero point if parameters for the output are not specified.
        output_scale_name = output_scale_name_from_parameter if data_found else quantized_input_value.scale_name
        output_zp_name = output_zp_name_from_parameter if data_found else quantized_input_value.zp_name
        quantized_output_value = QuantizedValue(
            node.output[0],
            node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        kwargs["channels_last"] = 0
        qnode_name = node.name + "_quant" if node.name else ""

        qnode = onnx.helper.make_node(
            "QLinear" + node.op_type,
            [
                quantized_input_value.q_name,
                quantized_input_value.scale_name,
                quantized_input_value.zp_name,
                output_scale_name,
                output_zp_name,
            ],
            [quantized_output_value.q_name],
            qnode_name,
            **kwargs,
        )
        self.quantizer.new_nodes += [qnode]
@@ -0,0 +1,172 @@
import logging

import numpy as np  # noqa: F401
import onnx

from ..quant_utils import (
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    find_by_name,  # noqa: F401
    get_mul_node,  # noqa: F401
    ms_domain,
)
from .base_operator import QuantOperatorBase  # noqa: F401
from .matmul import QOpMatMul
from .qdq_base_operator import QDQOperatorBase


def is_B_transposed(gemm_node):  # noqa: N802
    transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"]  # noqa: N806
    if transB_attribute:
        return onnx.helper.get_attribute_value(transB_attribute[0]) > 0

    return False


def get_beta(gemm_node):
    beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
    if beta_attribute:
        return onnx.helper.get_attribute_value(beta_attribute[0])

    return 1.0


def set_default_beta(gemm_node):
    beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
    if beta_attribute:
        beta_attribute[0].f = 1.0

    return 1.0


class QLinearGemm(QOpMatMul):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Gemm"

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])

        if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])
            quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
                node.input[1],
                self.quantizer.weight_qType,
                0 if is_B_transposed(node) else 1,
            )
            quantized_input_names.append(quant_weight_tuple[0])
            zero_point_names.append(quant_weight_tuple[1])
            scale_names.append(quant_weight_tuple[2])
        else:
            # Get quantized names from both the activation (input[0]) and the weight (input[1]).
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])

            (
                quantized_input_names_weight,
                zero_point_names_weight,
                scale_names_weight,
                nodes_weight,
            ) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
            quantized_input_names.extend(quantized_input_names_weight)
            zero_point_names.extend(zero_point_names_weight)
            scale_names.extend(scale_names_weight)
            nodes.extend(nodes_weight)

        if not data_found or quantized_input_names is None:
            return super().quantize()

        quantized_bias_name = ""
        if len(node.input) == 3:
            if not self.quantizer.is_input_a_initializer(node.input[2]):
                return super().quantize()

            # Note: if the quantized type is float 8, the bias is converted into float 16.
            # cublasLtMatMul only supports (b)float16 or float32 bias.
            quantized_bias_name = self.quantizer.quantize_bias_static(
                node.input[2], node.input[0], node.input[1], get_beta(self.node)
            )

        qgemm_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qgemm_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            if attribute.name != "beta":
                kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        # Generate the inputs
        qgemm_inputs = []
        for i in range(2):
            qgemm_inputs.extend([quantized_input_names[i], scale_names[i], zero_point_names[i]])

        qgemm_inputs.extend([quantized_bias_name, output_scale_name, output_zp_name])

        qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], qgemm_name, **kwargs)
        nodes.append(qgemm_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qgemm_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
            node_type=node.op_type,
            node_qtype=self.quantizer.weight_qType,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes


class QDQGemm(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Gemm"

        self.quantizer.quantize_activation_tensor(node.input[0])
        if not self.disable_qdq_for_node_output:
            self.quantizer.quantize_activation_tensor(node.output[0])

        is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
            node.input[1], default_axis=0 if is_B_transposed(node) else 1
        )
        if is_weight_per_channel:
            self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
        else:
            self.quantizer.quantize_weight_tensor(node.input[1])

        if len(node.input) == 3:
            if self.quantizer.is_input_a_initializer(node.input[2]):
                self.quantizer.quantize_bias_tensor(
                    node.name, node.input[2], node.input[0], node.input[1], get_beta(self.node)
                )
                set_default_beta(self.node)
            else:
                logging.warning(
                    f"Bias of Gemm node '{self.node.name}' is not constant. Please exclude this node for better performance."
                )
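# A small sketch (not part of this file) of the per-channel axis choice above:
# Gemm's B carries output channels on axis 1, or on axis 0 when transB=1, and
# per-channel scales must line up with that axis.
import numpy as np

K, N = 4, 3
b = np.zeros((K, N), dtype=np.float32)             # transB=0: N output channels on axis 1
per_channel_scales = np.ones(N, dtype=np.float32)  # one scale per output channel
assert b.shape[1] == per_channel_scales.size
assert b.T.shape[0] == per_channel_scales.size     # transB=1: channels move to axis 0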
@@ -0,0 +1,121 @@
import numpy
import onnx
from onnx import onnx_pb as onnx_proto

from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain  # noqa: F401
from .base_operator import QuantOperatorBase

"""
Quantize LSTM
"""


class LSTMQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        """
        parameter node: LSTM node.
        parameter new_nodes_list: List of new nodes created before processing this node.
        return: a list of nodes in topological order that represents the quantized LSTM node.
        """
        node = self.node
        assert node.op_type == "LSTM"

        if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight(
            node.input[2]
        ):
            super().quantize()
            return

        model = self.quantizer.model
        W = model.get_initializer(node.input[1])  # noqa: N806
        R = model.get_initializer(node.input[2])  # noqa: N806

        if len(W.dims) != 3 or len(R.dims) != 3:
            super().quantize()
            return

        [W_num_dir, W_4_hidden_size, W_input_size] = W.dims  # noqa: N806
        [R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims  # noqa: N806

        if self.quantizer.is_per_channel():
            del W.dims[0]
            del R.dims[0]
            W.dims[0] = W_num_dir * W_4_hidden_size
            R.dims[0] = R_num_dir * R_4_hidden_size

        quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
            node.input[1],
            onnx_proto.TensorProto.INT8,
            0,  # self.quantizer.weight_qType?
        )
        quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
            node.input[2],
            onnx_proto.TensorProto.INT8,
            0,  # self.quantizer.weight_qType?
        )

        W_quant_weight = model.get_initializer(quant_input_weight_tuple[0])  # noqa: N806
        R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0])  # noqa: N806

        W_quant_array = onnx.numpy_helper.to_array(W_quant_weight)  # noqa: N806
        R_quant_array = onnx.numpy_helper.to_array(R_quant_weight)  # noqa: N806

        W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size))  # noqa: N806
        R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size))  # noqa: N806

        W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1))  # noqa: N806
        R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1))  # noqa: N806

        W_quant_transposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0])  # noqa: N806
        R_quant_transposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0])  # noqa: N806

        model.remove_initializers([W_quant_weight, R_quant_weight])
        model.add_initializer(W_quant_transposed)
        model.add_initializer(R_quant_transposed)

        W_quant_zp = model.get_initializer(quant_input_weight_tuple[1])  # noqa: N806
        R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1])  # noqa: N806
        W_quant_scale = model.get_initializer(quant_input_weight_tuple[2])  # noqa: N806
        R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2])  # noqa: N806

        if self.quantizer.is_per_channel():
            W_quant_zp.dims[:] = [W_num_dir, W_4_hidden_size]
            R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size]
            W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size]
            R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size]

        inputs = []
        input_len = len(node.input)
        inputs.extend([node.input[0]])
        inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]])
        inputs.extend([node.input[3] if input_len > 3 else ""])
        inputs.extend([node.input[4] if input_len > 4 else ""])
        inputs.extend([node.input[5] if input_len > 5 else ""])
        inputs.extend([node.input[6] if input_len > 6 else ""])
        inputs.extend([node.input[7] if input_len > 7 else ""])
        inputs.extend(
            [
                quant_input_weight_tuple[2],
                quant_input_weight_tuple[1],
                quant_recurrent_weight_tuple[2],
                quant_recurrent_weight_tuple[1],
            ]
        )

        kwargs = {}
        for attribute in node.attribute:
            if attribute.name == "layout":
                continue
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        quant_lstm_name = "" if not node.name else node.name + "_quant"
        quant_lstm_node = onnx.helper.make_node("DynamicQuantizeLSTM", inputs, node.output, quant_lstm_name, **kwargs)
        self.quantizer.new_nodes.append(quant_lstm_node)

        dequantize_node = self.quantizer._dequantize_value(node.input[0])
        if dequantize_node is not None:
            self.quantizer.new_nodes.append(dequantize_node)
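# A numpy sketch (not part of this file) of the reshape + transpose performed
# above: per-channel quantization sees the weight flattened to 2-D, and the
# result is restored to 3-D and transposed to (num_dir, input, 4*hidden) --
# assumed here to be the layout DynamicQuantizeLSTM consumes.
import numpy

num_dir, four_hidden, input_size = 2, 8, 5
w_flat = numpy.zeros((num_dir * four_hidden, input_size), dtype=numpy.int8)
w3 = w_flat.reshape(num_dir, four_hidden, input_size)
assert numpy.transpose(w3, (0, 2, 1)).shape == (num_dir, input_size, four_hidden)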
@@ -0,0 +1,231 @@
import itertools
import logging

import onnx
from onnx import onnx_pb as onnx_proto

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QOpMatMul(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        if not self.quantizer.should_quantize_node(self.node):
            logging.debug(f"Ignore MatMul {self.node.name}")
            return False

        if (not self.quantizer.is_float_tensor(self.node.input[1])) and (
            not self.quantizer.is_float_tensor(self.node.input[0])
        ):
            logging.info(f"Ignore MatMul due to non float inputs {self.node.name}")
            return False

        # Do not quantize non-constant B matrices for MatMul.
        if self.quantizer.q_matmul_const_b_only:
            if not self.quantizer.find_initializer_in_path(self.node.input[1]):
                logging.info(f"Ignore MatMul due to non constant B: {self.quantizer.graph_scope}[{self.node.name}]")
                return False
        return True


"""
Used when quantize mode is QuantizationMode.IntegerOps.
"""


class MatMulInteger(QOpMatMul):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MatMul"
        # Get quantized names from both the activation (input[0]) and the weight (input[1]).
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)
        nodes.extend(nodes_weight)

        matmul_integer_output = node.output[0] + "_output_quantized"
        matmul_integer_name = node.name + "_quant" if node.name else ""
        matmul_integer_node = onnx.helper.make_node(
            "MatMulInteger",
            quantized_input_names + zero_point_names,
            [matmul_integer_output],
            matmul_integer_name,
        )
        nodes.append(matmul_integer_node)

        # Add a cast operation to cast the MatMulInteger output to float.
        cast_op_output = matmul_integer_output + "_cast_output"
        otype = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
        cast_node = onnx.helper.make_node(
            "Cast",
            [matmul_integer_output],
            [cast_op_output],
            matmul_integer_output + "_cast",
            to=otype,
        )
        nodes.append(cast_node)

        # Add a mul operation to multiply the scales of the two inputs.
        assert len(scale_names) == 2
        scales_mul_op = (
            matmul_integer_name + "_scales_mul"
            if matmul_integer_name
            else scale_names[0] + "_" + scale_names[1] + "_mul"
        )

        scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
        if scales_mul_node is None:
            scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
            nodes.append(scales_mul_node)

        scales_mul_op_output = scales_mul_node.output[0]

        # Add a mul operation to multiply the mul_scales_op result with the output of MatMulInteger
        # and make the output of this node the same as the output of the original matmul node.
        output_scale_mul_op = ""
        if matmul_integer_name:
            output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
        nodes.append(
            get_mul_node(
                [cast_op_output, scales_mul_op_output],
                node.output[0],
                output_scale_mul_op,
            )
        )
        self.quantizer.new_nodes += nodes


"""
Used when quantize mode is QuantizationMode.QLinearOps.
"""


class QLinearMatMul(QOpMatMul):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MatMul"
        # Get quantized names from both the activation (input[0]) and the weight (input[1]).
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)

        nodes.extend(nodes_weight)
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        if not data_found or quantized_input_names is None:
            return super().quantize()

        qlinear_matmul_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_matmul_name = node.name + "_quant" if node.name else ""

        qlinear_matmul_inputs = []
        # Input 0
        qlinear_matmul_inputs.append(quantized_input_names[0])
        qlinear_matmul_inputs.append(scale_names[0])
        qlinear_matmul_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_matmul_inputs.append(quantized_input_names[1])
        qlinear_matmul_inputs.append(scale_names[1])
        qlinear_matmul_inputs.append(zero_point_names[1])
        # Output quantization parameters
        qlinear_matmul_inputs.append(output_scale_name)
        qlinear_matmul_inputs.append(output_zp_name)

        domain = (
            "com.microsoft"
            if self.quantizer.weight_qType
            in {
                onnx_proto.TensorProto.FLOAT8E4M3FN,
                onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
                onnx_proto.TensorProto.FLOAT8E5M2,
                onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
            }
            else ""
        )
        qlinear_matmul_node = onnx.helper.make_node(
            "QLinearMatMul",
            qlinear_matmul_inputs,
            [qlinear_matmul_output],
            qlinear_matmul_name,
            domain=domain,
        )
        nodes.append(qlinear_matmul_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_matmul_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes


class QDQMatMul(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MatMul"

        if self.disable_qdq_for_node_output:
            nodes_to_iterate = node.input
        else:
            nodes_to_iterate = itertools.chain(node.input, node.output)

        for tensor_name in nodes_to_iterate:
            if find_by_name(tensor_name, self.quantizer.model.initializer()):
                is_per_channel, channel_axis = self.quantizer.is_tensor_per_channel(
                    tensor_name, default_axis=1, op_type=node.op_type
                )
                if is_per_channel:
                    self.quantizer.quantize_weight_tensor_per_channel(tensor_name, channel_axis)
                else:
                    self.quantizer.quantize_weight_tensor(tensor_name)
            else:
                self.quantizer.quantize_activation_tensor(tensor_name)
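# A numpy check (not part of this file) of the Cast + Mul reconstruction above:
#   A ~ (qA - zA) * sA and B ~ (qB - zB) * sB imply
#   A @ B = ((qA - zA) @ (qB - zB)) * (sA * sB),
# i.e. the MatMulInteger output, cast to float, times the product of scales.
import numpy as np

rng = np.random.default_rng(0)
qa = rng.integers(0, 256, (2, 4)).astype(np.int32)
qb = rng.integers(0, 256, (4, 3)).astype(np.int32)
sa, za, sb, zb = 0.05, 128, 0.02, 128
int_out = (qa - za) @ (qb - zb)                   # what MatMulInteger computes
approx = int_out.astype(np.float32) * (sa * sb)   # Cast, then Mul by sA*sB
exact = ((qa - za) * sa) @ ((qb - zb) * sb)       # float MatMul on dequantized inputs
assert np.allclose(approx, exact)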
@@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp


class QMaxPool(Direct8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MaxPool"

        # If the opset version is less than 12, go through the normal quantize path.
        if self.quantizer.opset_version < 12:
            super(Direct8BitOp, self).quantize()
            return

        # Direct 8-bit op
        return super().quantize()


class QDQMaxPool(QDQDirect8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MaxPool"

        # If the opset version is less than 12, make no change.
        if self.quantizer.opset_version < 12:
            return

        # Direct 8-bit op
        return super().quantize()
@@ -0,0 +1,40 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

from .qdq_base_operator import QDQOperatorBase


class QDQNormalization(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type in {"InstanceNormalization", "LayerNormalization", "BatchNormalization"}

        # Input
        self.quantizer.quantize_activation_tensor(node.input[0])

        # Scale
        scale_is_initializer = self.quantizer.is_input_a_initializer(node.input[1])
        scale_is_per_channel, scale_channel_axis = self.quantizer.is_tensor_per_channel(
            node.input[1], default_axis=1, op_type=node.op_type
        )

        if scale_is_per_channel:
            self.quantizer.quantize_weight_tensor_per_channel(node.input[1], axis=scale_channel_axis)
        elif scale_is_initializer:
            self.quantizer.quantize_weight_tensor(node.input[1])
        else:
            self.quantizer.quantize_activation_tensor(node.input[1])

        # Bias
        if len(node.input) > 2 and node.input[2]:
            self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])

        # Output
        if not self.disable_qdq_for_node_output:
            for output_name in node.output:
                self.quantizer.quantize_activation_tensor(output_name)
@@ -0,0 +1,172 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from __future__ import annotations

from typing import Any

import numpy as np
import onnx

from ..quant_utils import (
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    quantize_nparray,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QPad(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Pad"

        # The optional 'constant_value' input only exists from opset 11 on.
        # If input[0] is not quantized, do not quantize this node.
        if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
            super().quantize()
            return
        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]

        kwargs = {}
        for attribute in node.attribute:
            kv = attribute_to_kwarg(attribute)
            kwargs.update(kv)

        if "mode" not in kwargs or kwargs["mode"] == b"constant":
            if len(node.input) > 2 and node.input[2] != "":  # There is a third input, 'constant_value'
                zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
                scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
                if zp_tensor is None or scale_tensor is None:
                    super().quantize()
                    return

                padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
                if padding_constant_initializer is not None:
                    zp_array = onnx.numpy_helper.to_array(zp_tensor)
                    zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
                    scale_array = onnx.numpy_helper.to_array(scale_tensor)
                    scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
                    padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
                    quantized_padding_constant_array = quantize_nparray(
                        self.quantizer.activation_qType,
                        padding_constant_array,
                        scale_value,
                        zp_value,
                    )
                    quantized_padding_constant_name = node.input[2] + TENSOR_NAME_QUANT_SUFFIX
                    quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
                        quantized_padding_constant_array,
                        quantized_padding_constant_name,
                    )
                    # Assume this padding constant initializer is only used by this node.
                    self.quantizer.model.remove_initializer(padding_constant_initializer)
                    self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
                    node.input[2] = quantized_padding_constant_name
                else:
                    # TODO: check quantize_inputs after subgraphs are supported
                    pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
                        node,
                        2,
                        self.quantizer.activation_qType,
                        quantized_input_value.scale_name,
                        quantized_input_value.zp_name,
                        initial_type=scale_tensor.data_type,
                    )
                    self.quantizer.new_nodes.extend(pad_value_qnodes)
                    node.input[2] = pad_value_qnodes[0].output[0]
            else:
                # In quantized format, the `zero` before quantization is mapped
                # to quantized_input_value.zp_name. Thus, padding the original
                # tensor with 0 should become padding the quantized tensor with
                # its zero point.
                if len(node.input) == 2:
                    # Feed the quantization zero point to the padding node.
                    node.input.append(quantized_input_value.zp_name)
                else:
                    # Assign the quantization zero point to the padding node.
                    assert node.input[2] == ""
                    node.input[2] = quantized_input_value.zp_name

        # Create an entry for the output quantized value
        quantized_output_value = QuantizedValue(
            node.output[0],
            node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
            quantized_input_value.scale_name,
            quantized_input_value.zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        node.input[0] = quantized_input_value.q_name
        node.output[0] = quantized_output_value.q_name
        self.quantizer.new_nodes += [node]


class QDQPad(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def _get_pad_const_val(self, attrs_dict: dict[str, Any]) -> np.ndarray | None:
        """
        Returns the Pad's constant padding value. Returns `None` if the padding value is
        not constant (i.e., comes from a dynamic input).
        """
        const_val = None
        onnx_tensor_type = self.quantizer.model.get_tensor_type(self.node.input[0])
        if onnx_tensor_type is None:
            return None

        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type.elem_type)
        if self.quantizer.opset_version < 11:
            const_val = np.array(attrs_dict.get("value", 0), dtype=np_dtype)
        elif len(self.node.input) >= 3 and self.node.input[2]:
            const_val = self.quantizer.model.get_constant_value(self.node.input[2])
        else:
            const_val = np.array(0, dtype=np_dtype)

        return const_val

    def _should_quantize_output_same_as_input(self) -> bool:
        """
        Returns True if Pad's output should use the same quantization parameters as input[0].
        """
        attrs_dict = {}
        for attribute in self.node.attribute:
            kv = attribute_to_kwarg(attribute)
            attrs_dict.update(kv)

        pad_mode = attrs_dict.get("mode", b"constant")
        if pad_mode in (b"reflect", b"edge", b"wrap"):
            # These modes pad the output with values that already exist in the input,
            # so the output can be quantized the same as the input.
            return True

        # For 'constant' mode, padding with 0 also allows quantizing the output the same
        # as the input, because our quantization floating-point range always includes 0.
        if pad_mode == b"constant":
            pad_val = self._get_pad_const_val(attrs_dict)
            if pad_val is not None and pad_val.dtype in (np.float32, np.float16):
                return float(pad_val.item()) == 0

        return False

    def quantize(self):
        assert self.node.op_type == "Pad"

        for input_name in self.node.input:
            if input_name:
                self.quantizer.quantize_activation_tensor(input_name)

        if not self.disable_qdq_for_node_output:
            if self._should_quantize_output_same_as_input():
                self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
            else:
                self.quantizer.quantize_activation_tensor(self.node.output[0])
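Both Pad paths above rely on the same affine mapping: a float padding constant lands on the input's quantized grid, and in particular a padding value of 0 lands exactly on the zero point. A uint8 sketch of that mapping (assuming quantize_nparray implements standard affine quantization):

import numpy as np

def affine_quantize_uint8(x, scale, zero_point):
    # q = clip(round(x / scale) + zero_point, 0, 255)
    return np.clip(np.round(x / scale) + zero_point, 0, 255).astype(np.uint8)

print(affine_quantize_uint8(np.float32(0.0), scale=0.02, zero_point=128))  # -> 128, the zero point
print(affine_quantize_uint8(np.float32(1.0), scale=0.02, zero_point=128))  # -> 178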
@@ -0,0 +1,67 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QLinearPool(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        # Only try to quantize when quantization parameters are available for the output.
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])

        # Get quantized input tensor names; quantize the input if needed.
        (
            quantized_input_names,
            input_zero_point_names,
            input_scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        if not data_found or quantized_input_names is None:
            return super().quantize()

        # Create an entry for the output quantized value.
        qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        quantized_output_value = QuantizedValue(
            node.output[0],
            qlinear_output_name,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        # Create a QLinear* pool node for the given type (AveragePool, etc.)
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        qlinear_node_name = node.name + "_quant" if node.name else ""
        qnode = onnx.helper.make_node(
            "QLinear" + node.op_type,
            [
                quantized_input_names[0],
                input_scale_names[0],
                input_zero_point_names[0],
                output_scale_name,
                output_zp_name,
            ],
            [qlinear_output_name],
            qlinear_node_name,
            **kwargs,
        )

        # Add all newly created nodes.
        nodes.append(qnode)
        self.quantizer.new_nodes += nodes
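Unlike MaxPool, averaging produces values that fall between input grid points, which is why QLinearAveragePool takes explicit output scale/zero point and requantizes rather than reusing the input's parameters. Conceptually (a sketch, not the kernel's actual implementation):

import numpy as np

scale_in, zp_in = 0.1, 0
scale_out, zp_out = 0.05, 0

q_in = np.array([10, 11], dtype=np.int32)   # two quantized input values
mean_fp = (q_in - zp_in).mean() * scale_in  # dequantized average: 1.05, not on the 0.1 grid
q_out = np.clip(np.round(mean_fp / scale_out) + zp_out, -128, 127).astype(np.int8)  # 21 on the finer grid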
+22
@@ -0,0 +1,22 @@
import itertools

from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray  # noqa: F401
from .base_operator import QuantOperatorBase  # noqa: F401


class QDQOperatorBase:
    def __init__(self, onnx_quantizer, onnx_node):
        self.quantizer = onnx_quantizer
        self.node = onnx_node
        self.disable_qdq_for_node_output = onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization

    def quantize(self):
        node = self.node

        if self.disable_qdq_for_node_output:
            tensors_to_quantize = node.input
        else:
            tensors_to_quantize = itertools.chain(node.input, node.output)

        for tensor_name in tensors_to_quantize:
            self.quantizer.quantize_activation_tensor(tensor_name)
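Marking a tensor with quantize_activation_tensor means the quantizer will later wrap it in a QuantizeLinear/DequantizeLinear pair. A sketch of the resulting pattern for a tensor "x" (tensor names here are illustrative):

import onnx

q_node = onnx.helper.make_node("QuantizeLinear", ["x", "x_scale", "x_zero_point"], ["x_q"])
dq_node = onnx.helper.make_node("DequantizeLinear", ["x_q", "x_scale", "x_zero_point"], ["x_dq"])
# Consumers of "x" are then rewired to read "x_dq" instead.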
@@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp


class QResize(Direct8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Resize"

        # Before opset 11, fall back to the normal quantization path.
        if self.quantizer.opset_version < 11:
            super(Direct8BitOp, self).quantize()
            return

        # Direct 8-bit op: reuse the input's quantization parameters.
        return super().quantize()


class QDQResize(QDQDirect8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Resize"

        # Before opset 11, just keep this node as-is.
        if self.quantizer.opset_version < 11:
            return

        # Direct 8-bit op: reuse the input's quantization parameters.
        return super().quantize()
@@ -0,0 +1,74 @@
import onnx
import onnx.helper

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QLinearSoftmax(QuantOperatorBase):
    def quantize(self):
        node = self.node
        # Pin the softmax output scale and zero point: softmax outputs always lie in [0, 1].
        if self.quantizer.activation_qType == onnx.onnx_pb.TensorProto.UINT8:
            out_scale = 1 / 256.0
            out_zero_point = 0
        else:
            out_scale = 1 / 256.0
            out_zero_point = -128
        # Only try to quantize when quantization parameters are available for the output.
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0], out_scale, out_zero_point)

        # Get quantized input tensor names; quantize the input if needed.
        (
            quantized_input_names,
            input_zero_point_names,
            input_scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        if not data_found or quantized_input_names is None:
            return super().quantize()

        # Create an entry for the output quantized value.
        qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        quantized_output_value = QuantizedValue(
            node.output[0],
            qlinear_output_name,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        # Create the QLinearSoftmax node.
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        # Give QLinearSoftmax the real opset_version; its default SinceVersion would otherwise be 1.
        kwargs["opset"] = self.quantizer.opset_version
        qlinear_node_name = node.name + "_quant" if node.name else ""
        qnode = onnx.helper.make_node(
            "QLinear" + node.op_type,
            [
                quantized_input_names[0],
                input_scale_names[0],
                input_zero_point_names[0],
                output_scale_name,
                output_zp_name,
            ],
            [qlinear_output_name],
            qlinear_node_name,
            **kwargs,
        )

        # Add all newly created nodes.
        nodes.append(qnode)
        self.quantizer.new_nodes += nodes
        return None
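Because softmax outputs lie in [0, 1], a fixed scale of 1/256 covers the whole range: uint8 uses zero point 0 (grid 0 .. 255/256) and int8 shifts the same grid with zero point -128. A small numeric check (illustrative):

import numpy as np

p = np.array([0.0, 0.5, 0.999], dtype=np.float32)
scale = 1 / 256.0

q_u8 = np.clip(np.round(p / scale) + 0, 0, 255).astype(np.uint8)      # [0, 128, 255] (1.0 saturates)
q_s8 = np.clip(np.round(p / scale) - 128, -128, 127).astype(np.int8)  # [-128, 0, 127]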
@@ -0,0 +1,63 @@
import onnx

from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QSplit(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])
        if quantized_input_names is None:
            return super().quantize()

        quantized_node_name = ""
        if node.name:
            quantized_node_name = node.name + "_quant"
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))

        # Each output simply derives its scale/zero point from the input.
        quantized_output_names = []
        for output_name in node.output:
            quantized_output_name = output_name + "quantized"
            quantized_output_names.append(quantized_output_name)
            q_output = QuantizedValue(
                output_name,
                quantized_output_name,
                scale_names[0],
                zero_point_names[0],
                QuantizedValueType.Input,
            )
            self.quantizer.quantized_value_map[output_name] = q_output

        if len(node.input) > 1:
            quantized_input_names.extend(node.input[1:])
        quantized_node = onnx.helper.make_node(
            node.op_type, quantized_input_names, quantized_output_names, quantized_node_name, **kwargs
        )

        nodes.append(quantized_node)
        self.quantizer.new_nodes += nodes


class QDQSplit(QDQOperatorBase):
    def quantize(self):
        node = self.node
        assert node.op_type == "Split"

        if not self.quantizer.is_tensor_quantized(node.input[0]):
            self.quantizer.quantize_activation_tensor(node.input[0])
        if not self.disable_qdq_for_node_output:
            for output in node.output:
                self.quantizer.quantize_output_same_as_input(output, node.input[0], node.name)
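Split only routes values to different outputs, which is why every output above reuses the input's scale and zero point: splitting commutes with elementwise quantization. A quick numpy check (illustrative):

import numpy as np

scale, zp = 0.01, 0
x = np.random.randn(8).astype(np.float32)

def q(v):
    return np.clip(np.round(v / scale) + zp, -128, 127).astype(np.int8)

left, right = np.split(x, 2)
assert np.array_equal(np.split(q(x), 2)[0], q(left))
assert np.array_equal(np.split(q(x), 2)[1], q(right))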
@@ -0,0 +1,87 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QLinearWhere(QuantOperatorBase):
    def should_quantize(self):
        return True

    def quantize(self):
        node = self.node
        assert node.op_type == "Where"
        if not self.quantizer.force_quantize_no_input_check:
            self.quantizer.new_nodes += [node]
            return
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        (
            q_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [1, 2])
        if not data_found or q_input_names is None:
            return super().quantize()

        qlinear_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_node_name = node.name + "_quant" if node.name else ""

        q_output = QuantizedValue(
            node.output[0],
            qlinear_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qlwhere_inputs = [
            node.input[0],
            q_input_names[0],
            scale_names[0],
            zero_point_names[0],
            q_input_names[1],
            scale_names[1],
            zero_point_names[1],
            output_scale_name,
            output_zp_name,
        ]
        qlwhere_node = onnx.helper.make_node(
            "QLinearWhere", qlwhere_inputs, [qlinear_output], qlinear_node_name, **kwargs
        )

        self.quantizer.new_nodes += nodes
        self.quantizer.new_nodes += [qlwhere_node]


class QDQWhere(QDQOperatorBase):
    def quantize(self):
        node = self.node
        assert node.op_type == "Where"
        if self.quantizer.force_quantize_no_input_check:
            if not self.quantizer.is_tensor_quantized(node.input[1]):
                self.quantizer.quantize_activation_tensor(node.input[1])
            if not self.quantizer.is_tensor_quantized(node.input[2]):
                self.quantizer.quantize_activation_tensor(node.input[2])
            if not self.disable_qdq_for_node_output:
                for output in node.output:
                    self.quantizer.quantize_activation_tensor(output)
        elif (
            self.quantizer.is_tensor_quantized(node.input[1])
            and self.quantizer.is_tensor_quantized(node.input[2])
            and not self.disable_qdq_for_node_output
        ):
            for output in node.output:
                self.quantizer.quantize_activation_tensor(output)
|
||||
Reference in New Issue
Block a user