Status of TensorFlow int8 dot / convolution with XLA + CUDA

zaqlnxep · posted 6 months ago in Other

Is it possible to access tensor-core-accelerated int8 dot products / convolutions through the TensorFlow API with XLA compilation?

#30771 indicates that int8 convolution is supported, and #49140 also suggests this functionality exists. However, TensorFlow's convolution ops do not accept int8 arguments. The matmul op does allow int8 x int8 -> int32 via the output_type argument, and I have successfully compiled this path with XLA. However, when I do, I do not get throughput consistent with the int8 tensor cores I expected. For large matmuls ([8192, 8192] inputs) I expected close to 2x the fp16 throughput. Admittedly my benchmark is not very rigorous, but I would like to know whether my expectation of roughly 2x throughput is reasonable, and whether XLA has the functionality to enable it.
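For reference, here is roughly what I mean by the convolution limitation (a minimal sketch assuming TF 2.11 as below; the shapes are arbitrary, and tf.nn.conv2d has no int8 kernel and no output_type argument, so this is expected to be rejected):

import tensorflow as tf

# Arbitrary int8 inputs for a conv, quantized the same way as the matmul examples below
x = tf.cast(tf.random.uniform([1, 32, 32, 64], minval=-127, maxval=127, dtype=tf.int32), tf.int8)
k = tf.cast(tf.random.uniform([3, 3, 64, 64], minval=-127, maxval=127, dtype=tf.int32), tf.int8)

try:
    # Expected to fail on TF 2.11: Conv2D does not accept int8 operands
    y = tf.nn.conv2d(x, k, strides=1, padding="SAME")
except Exception as e:
    print(type(e).__name__, e)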

Below is a minimal XLA-compiled int8 matmul:

import tensorflow as tf

@tf.function(jit_compile=True)
def int8_matmul(x, w):
    return tf.matmul(x, w, output_type=tf.int32)

x = tf.cast(tf.random.uniform([2048, 2048], minval=-127, maxval=127, dtype=tf.int32), tf.int8)
w = tf.cast(tf.random.uniform([2048, 2048], minval=-127, maxval=127, dtype=tf.int32), tf.int8)
y = int8_matmul(x, w)

print(int8_matmul.experimental_get_compiler_ir(x, w)(stage="optimized_hlo"))
# HloModule a_inference_int8_matmul_17__.10, alias_passthrough_params=true, entry_computation_layout={(s8[2048,2048]{1,0},s8[2048,2048]{1,0})->s32[2048,2048]{1,0}}

# ENTRY %a_inference_int8_matmul_17__.10 (arg0.1: s8[2048,2048], arg1.2: s8[2048,2048]) -> s32[2048,2048] {
#   %arg0.1 = s8[2048,2048]{1,0} parameter(0), parameter_replication={false}, metadata={op_name="XLA_Args"}
#   %arg1.2 = s8[2048,2048]{1,0} parameter(1), parameter_replication={false}, metadata={op_name="XLA_Args"}
#   %copy = s8[2048,2048]{0,1} copy(s8[2048,2048]{1,0} %arg1.2), metadata={op_name="XLA_Args"}
#   ROOT %cublas-gemm.1 = s32[2048,2048]{1,0} custom-call(s8[2048,2048]{1,0} %arg0.1, s8[2048,2048]{0,1} %copy), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMulV3" op_name="MatMul" source_file="int8_xla.py" source_line=22}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"precision_config\":{\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"epilogue\":\"DEFAULT\"}"
# }

And here is the "canonical" quantized linear layer I am trying to implement. I think this is a fairly standard sequence of operations for a quantized layer, and I believe TensorRT can fuse the entire thing.

@tf.function(jit_compile=True)
def int8_linear_layer(x, w, b, s):
    # read in x and w as pre-quantized int8 tensors
    y = tf.matmul(x, w, output_type=tf.int32)

    # add bias and apply activation in fp32
    y = tf.cast(y, tf.float32)
    y = y + b
    y = tf.nn.relu(y)

    # quantize and store output as int8
    y = tf.round(y / s)
    y = tf.clip_by_value(y, -128, 127)
    y = tf.cast(y, tf.int8)
    return y

x = tf.cast(tf.random.uniform([2048, 2048], minval=-127, maxval=127, dtype=tf.int32), tf.int8)
w = tf.cast(tf.random.uniform([2048, 2048], minval=-127, maxval=127, dtype=tf.int32), tf.int8)
b = tf.random.normal([2048], dtype=tf.float32)  # bias
s = tf.random.normal([], dtype=tf.float32)  # per-tensor quantization scale for output activation
y = int8_linear_layer(x, w, b, s)

print(int8_linear_layer.experimental_get_compiler_ir(x, w, b, s)(stage="optimized_hlo"))
# HloModule a_inference_int8_linear_layer_65__.32, alias_passthrough_params=true, entry_computation_layout={(s8[2048,2048]{1,0},s8[2048,2048]{1,0},f32[2048]{0},f32[])->s8[2048,2048]{1,0}}

# %fused_computation (param_0.2: f32[], param_1.4: f32[2048], param_2.7: s32[2048,2048]) -> s8[2048,2048] {
#   %constant_2 = f32[] constant(-128), metadata={op_type="Maximum" op_name="clip_by_value" source_file="int8_xla.py" source_line=52}
#   %broadcast.4 = f32[2048,2048]{1,0} broadcast(f32[] %constant_2), dimensions={}, metadata={op_type="Maximum" op_name="clip_by_value" source_file="int8_xla.py" source_line=52}
#   %param_2.7 = s32[2048,2048]{1,0} parameter(2)
#   %convert.1 = f32[2048,2048]{1,0} convert(s32[2048,2048]{1,0} %param_2.7), metadata={op_type="Cast" op_name="Cast" source_file="int8_xla.py" source_line=46}
#   %param_1.4 = f32[2048]{0} parameter(1)
#   %broadcast.3 = f32[2048,2048]{1,0} broadcast(f32[2048]{0} %param_1.4), dimensions={1}, metadata={op_type="AddV2" op_name="add" source_file="int8_xla.py" source_line=47}
#   %add.0 = f32[2048,2048]{1,0} add(f32[2048,2048]{1,0} %convert.1, f32[2048,2048]{1,0} %broadcast.3), metadata={op_type="AddV2" op_name="add" source_file="int8_xla.py" source_line=47}
#   %constant_1 = f32[] constant(0), metadata={op_type="Relu" op_name="Relu" source_file="int8_xla.py" source_line=48}
#   %broadcast.2 = f32[2048,2048]{1,0} broadcast(f32[] %constant_1), dimensions={}, metadata={op_type="Relu" op_name="Relu"}
#   %maximum.0 = f32[2048,2048]{1,0} maximum(f32[2048,2048]{1,0} %add.0, f32[2048,2048]{1,0} %broadcast.2), metadata={op_type="Relu" op_name="Relu"}
#   %param_0.2 = f32[] parameter(0)
#   %broadcast.1 = f32[2048,2048]{1,0} broadcast(f32[] %param_0.2), dimensions={}, metadata={op_type="RealDiv" op_name="truediv" source_file="int8_xla.py" source_line=51}
#   %divide.0 = f32[2048,2048]{1,0} divide(f32[2048,2048]{1,0} %maximum.0, f32[2048,2048]{1,0} %broadcast.1), metadata={op_type="RealDiv" op_name="truediv" source_file="int8_xla.py" source_line=51}
#   %round-nearest-even.0 = f32[2048,2048]{1,0} round-nearest-even(f32[2048,2048]{1,0} %divide.0), metadata={op_type="Round" op_name="Round" source_file="int8_xla.py" source_line=51}
#   %constant_0 = f32[] constant(127), metadata={op_type="Minimum" op_name="clip_by_value/Minimum" source_file="int8_xla.py" source_line=52}
#   %broadcast.0 = f32[2048,2048]{1,0} broadcast(f32[] %constant_0), dimensions={}, metadata={op_type="Minimum" op_name="clip_by_value/Minimum" source_file="int8_xla.py" source_line=52}
#   %clamp.1 = f32[2048,2048]{1,0} clamp(f32[2048,2048]{1,0} %broadcast.4, f32[2048,2048]{1,0} %round-nearest-even.0, f32[2048,2048]{1,0} %broadcast.0), metadata={op_type="Maximum" op_name="clip_by_value" source_file="int8_xla.py" source_line=52}
#   ROOT %convert.0 = s8[2048,2048]{1,0} convert(f32[2048,2048]{1,0} %clamp.1), metadata={op_type="Cast" op_name="Cast_1" source_file="int8_xla.py" source_line=53}
# }

# ENTRY %a_inference_int8_linear_layer_65__.32 (arg0.1: s8[2048,2048], arg1.2: s8[2048,2048], arg2.3: f32[2048], arg3.4: f32[]) -> s8[2048,2048] {
#   %arg3.4 = f32[] parameter(3), parameter_replication={false}, metadata={op_name="XLA_Args"}
#   %arg2.3 = f32[2048]{0} parameter(2), parameter_replication={false}, metadata={op_name="XLA_Args"}
#   %arg0.1 = s8[2048,2048]{1,0} parameter(0), parameter_replication={false}, metadata={op_name="XLA_Args"}
#   %arg1.2 = s8[2048,2048]{1,0} parameter(1), parameter_replication={false}, metadata={op_name="XLA_Args"}
#   %copy = s8[2048,2048]{0,1} copy(s8[2048,2048]{1,0} %arg1.2), metadata={op_name="XLA_Args"}
#   %cublas-gemm.1 = s32[2048,2048]{1,0} custom-call(s8[2048,2048]{1,0} %arg0.1, s8[2048,2048]{0,1} %copy), custom_call_target="__cublas$gemm", metadata={op_type="BatchMatMulV3" op_name="MatMul" source_file="int8_xla.py" source_line=43}, backend_config="{\"alpha_real\":1,\"alpha_imag\":0,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"precision_config\":{\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"epilogue\":\"DEFAULT\"}"
#   ROOT %fusion = s8[2048,2048]{1,0} fusion(f32[] %arg3.4, f32[2048]{0} %arg2.3, s32[2048,2048]{1,0} %cublas-gemm.1), kind=kLoop, calls=%fused_computation, metadata={op_type="Cast" op_name="Cast_1" source_file="int8_xla.py" source_line=53}
# }

System information: Ubuntu 20.04.5 LTS, TF 2.11.0 installed via pip, A100 GPU, CUDA 12.0

enxuqcxy 1#

Hi,

I would like to contribute to this organization, but I do not know how. Could you suggest what I should study? I have studied deep learning and have tried reading through many of the issues, though I have not yet been able to understand how to solve them. So please tell me what I should learn first.

9wbgstp7 2#

Hi SuryanarayanaY, could you please look into this issue? Thank you.

bfrts1fy 3#

Hi @jmc128,

Issue #49140 is still open in our internal tracker. I will check and let you know if there are any updates.

I am curious how you implemented the dot product with XLA. Is it annotated with @tf.function(jit_compile=True)?

Also, please file your issue following the standard template, attached here.

Thanks!

ibps3vxo 4#

Hi SuryanarayanaY. I have updated the post with code examples and system information.

dhxwm5r4 5#

@jmc128,

Thanks for the implementation code. I ran the same test on Colab, and it seems tf.matmul (dot) succeeds, but tf.nn.Conv2D is still unsupported, as shown in the gist.

Could you provide a minimal code snippet for the performance comparison you observed with tf.matmul?

yx2lnoni 6#

I have attached a script that benchmarks the throughput of the linear layer at int8 and fp16 precision. On an A100 it produces the following results:

TOPS/TFLOPS
==========================
dim    |     int8     fp16
--------------------------
256    |     0.29     0.34
512    |     2.33     2.79
1024   |    16.51    19.47
2048   |    96.75   102.14
4096   |   261.43   205.79
8192   |   354.98   263.79
16384  |   367.82   272.62
==========================

The performance question is that the int8 throughput should be close to 2x the fp16 throughput. In my experiments with TensorRT, int8 reaches 2x the fp16 throughput for dims >= 2048, and still at least 1.7x for dims between 256 and 2048. I am hoping someone familiar with XLA internals can comment on whether this level of performance is achievable with XLA. Is there a particular way to express this computation in TensorFlow so that XLA emits better-optimized kernels, or is this currently simply not possible with XLA?
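As a rough sanity check of the gap (the A100 peak figures here are my assumption of the commonly quoted dense tensor-core peaks, not something I measured):

# Back-of-the-envelope check using the dim=8192 numbers from the table above
peak_fp16_tflops = 312.0  # assumed A100 dense fp16 tensor-core peak
peak_int8_tops = 624.0    # assumed A100 dense int8 tensor-core peak

measured_fp16 = 263.79
measured_int8 = 354.98

print("ideal int8/fp16 ratio   :", peak_int8_tops / peak_fp16_tflops)  # 2.0
print("measured int8/fp16 ratio:", measured_int8 / measured_fp16)      # ~1.35

The full benchmark script is below: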

import time
import tensorflow as tf

def int8_linear_layer(x, params):
    # read in x and w as pre-quantized int8 tensors
    y = tf.matmul(x, params["w"], output_type=tf.int32)

    # add bias and apply activation in fp32
    y = tf.cast(y, tf.float32)
    y = y + params["b"]
    y = tf.nn.relu(y)

    # quantize and store output as int8
    y = tf.round(y / params["s"])
    y = tf.clip_by_value(y, -128, 127)
    y = tf.cast(y, tf.int8)
    return y

def fp_linear_layer(x, params):
    return tf.nn.relu(tf.matmul(x, params["w"]) + params["b"])

def linear_layer(x, params):
    if params["w"].dtype == tf.int8:
        return int8_linear_layer(x, params)
    else:
        return fp_linear_layer(x, params)

@tf.function(jit_compile=True)
def network(x, network_params):
    for layer_params in network_params:
        x = linear_layer(x, layer_params)
    return x[0, 0]

def benchmark(dim, dtype, n_layers=32, iters=200):
    """
Benchmarks the throughput of a linear layer (matmul + bias + act (+ quantization))

To drown out data transfer costs and other overheads, each run performs a
forward pass of a network consisting of `n_layers` of linear layers in series

The full network is run for `iters` iterations, and the last 10% of iterations are
used to compute the throughput in TFLOPS/TOPS
"""

    # create input tensor for network
    if dtype == tf.int8:
        x = tf.random.uniform([dim, dim], minval=-127, maxval=127, dtype=tf.int32)
        x = tf.cast(x, tf.int8)
    else:
        x = tf.random.uniform([dim, dim], dtype=dtype)

    # create weight + bias (+ quant scale) tensors for each layer
    network_params = []
    for i in range(n_layers):
        if dtype == tf.int8:
            w = tf.random.uniform([dim, dim], minval=-127, maxval=127, dtype=tf.int32)
            w = tf.cast(w, tf.int8)
            b = tf.random.normal([dim], dtype=tf.float32)
            s = tf.random.normal([], dtype=tf.float32)
            layer_params = {"w": w, "b": b, "s": s}
        else:
            w = tf.random.uniform([dim, dim], dtype=dtype)
            b = tf.random.uniform([dim], dtype=dtype)
            layer_params = {"w": w, "b": b}
        network_params.append(layer_params)

    times = []
    for i in range(iters):
        t0 = time.time()
        y = network(x, network_params).numpy()
        elapsed_time = time.time() - t0  # elapsed time in seconds
        times.append(elapsed_time)

    times = times[-(iters // 10) :]  # discard warmup iters
    avg_time = sum(times) / len(times)
    tflops = (n_layers * 2 * dim ** 3) / avg_time * 1e-12
    return tflops

if __name__ == "__main__":
    benchmark(256, tf.float16)  # not used, just triggers tensorflow/xla initial log dump

    print("\nTOPS/TFLOPS")
    cols = f"{'dim':<6} | {'int8':>8} {'fp16':>8}"
    print("=" * len(cols) + "\n" + cols + "\n" + "-" * len(cols))
    for dim in [256, 512, 1024, 2048, 4096, 8192]:
        int8_tops = benchmark(dim, tf.int8)
        fp16_tflops = benchmark(dim, tf.float16)
        print(f"{dim:<6} | {int8_tops:>8.2f} {fp16_tflops:>8.2f}")
    print("=" * len(cols))
2w3rbyxf 7#

I have reproduced the reported behavior and attached a gist here.
@sachinprasadhs could you take a look at this issue? Thanks!

vq8itlhq 8#

@reed Wanderman-Milne could you take a look?

(In reply to Sachin Prasad's note of Thu, Feb 16, 2023, assigning #59530 to @cheshire.)
