from paddle import _C_ops, version
from paddle.base.data_feeder import check_dtype
from paddle.base.framework import convert_np_dtype_to_dtype_
from paddle.device.cuda import get_device_capability
from paddle.framework import (
    LayerHelper,
    in_dynamic_mode,
    in_dynamic_or_pir_mode,
)


def _get_arch_info():
    # Query the SM (compute capability) version of the current CUDA device.
    cuda_version = version.cuda()
    if cuda_version is not None and cuda_version != 'False':
        major, minor = get_device_capability()
        arch = int(major * 10 + minor)
        return arch
    else:
        raise ValueError(
            "Paddle is not compiled with CUDA, we cannot get SMVersion from "
            "device, please try to compile Paddle with CUDA"
        )
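
# Illustrative example of the arch mapping above (hypothetical values): an
# A100 reports capability (8, 0) and maps to 80; a V100 reports (7, 0) and
# maps to 70.
#
#   >>> # doctest: +SKIP('requires CUDA')
#   >>> from paddle.device.cuda import get_device_capability
#   >>> major, minor = get_device_capability()  # e.g. (8, 0) on an A100
#   >>> int(major * 10 + minor)                 # e.g. 80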

def weight_quantize(x, algo="weight_only_int8", arch=None):
    """
    Quantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be quantized; the data type is float16 or bfloat16.
        algo (str): The quantization algorithm to apply to x, must be one of
            'weight_only_int8', 'weight_only_int4' and 'llm.int8', default: 'weight_only_int8'.
        arch (int): The compute architecture (SM version) of the target device. For
            example, A100 is 80 and V100 is 70. If you do not assign arch, it is
            queried from the current device, default: None.

    Returns:
        out (Tensor): The quantized result; the data type is int8 and the shape is the
            transpose of x's shape.
        scale (Tensor): The per-channel scale Tensor; the data type is float32.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> print(out.shape)
            [32, 64]
            >>> print(scale.shape)
            [32]
    """
    if arch is None:
        arch = _get_arch_info()

    assert arch in (
        70,
        75,
        80,
        86,
    ), f"Currently weight_quantize only supports SM70/75/80/86, but got {arch}"

    if in_dynamic_mode():
        return _C_ops.weight_quantize(x, algo, arch)
    else:
        type = "weight_quantize"
        helper = LayerHelper(type, **locals())
        out = helper.create_variable_for_type_inference('int8')
        scale = helper.create_variable_for_type_inference('float')

        helper.append_op(
            type=type,
            inputs={"x": x},
            outputs={'out': out, "scale": scale},
            attrs={"algo": algo, "arch": arch},
        )
        return (out, scale)
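
# A minimal reference sketch of the scheme the docstring above describes for
# 'weight_only_int8' (symmetric, per-channel max-abs scaling). This scheme is
# an assumption for illustration only: the real kernel also transposes and
# re-lays-out the int8 data for the GPU kernels, so its output's element
# order differs.
#
#   >>> # doctest: +SKIP('illustrative only')
#   >>> import paddle
#   >>> w = paddle.rand([64, 32], dtype='float32')
#   >>> scale = w.abs().max(axis=0) / 127.0              # one scale per channel
#   >>> w_int8 = paddle.round(w / scale).astype('int8')  # quantize
#   >>> w_back = w_int8.astype('float32') * scale        # dequantize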

def weight_dequantize(x, scale, algo="weight_only_int8", out_dtype='float16'):
    """
    Dequantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be dequantized; the data type is int8.
        scale (Tensor): The scale Tensor produced by weight_quantize; the data type is float32.
        algo (str): The quantization algorithm that was applied to x, must be one of
            'weight_only_int8', 'weight_only_int4' and 'llm.int8', default: 'weight_only_int8'.
        out_dtype (str|np.dtype): The output Tensor's data type, must be one of 'float16'
            and 'bfloat16', default: 'float16'.

    Returns:
        out (Tensor): The dequantized result; the data type is float16 or bfloat16 and
            the shape is the transpose of x's shape.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize, weight_dequantize

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> x_dequant = weight_dequantize(out, scale)
    """
    check_dtype(
        out_dtype, 'out_dtype', ['float16', 'bfloat16'], 'weight_dequantize'
    )
    out_dtype = convert_np_dtype_to_dtype_(out_dtype)

    if in_dynamic_mode():
        return _C_ops.weight_dequantize(x, scale, algo, out_dtype)
    else:
        type = "weight_dequantize"
        helper = LayerHelper(type, **locals())
        out = helper.create_variable_for_type_inference(out_dtype)

        helper.append_op(
            type=type,
            inputs={"x": x, "scale": scale},
            outputs={'out': out},
            attrs={"algo": algo, "out_dtype": out_dtype},
        )
        return out
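
# Usage sketch: weight_dequantize inverts weight_quantize up to rounding
# error, so a round trip should stay close to the original weights.
#
#   >>> # doctest: +SKIP('requires a supported CUDA device')
#   >>> import paddle
#   >>> from paddle.nn.quant import weight_quantize, weight_dequantize
#   >>> w = paddle.rand([64, 32], dtype='float16')
#   >>> q, s = weight_quantize(w, algo='weight_only_int8')
#   >>> w2 = weight_dequantize(q, s)  # back to shape [64, 32]
#   >>> err = (w.astype('float32') - w2.astype('float32')).abs().max()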

def weight_only_linear(
    x,
    weight,
    bias=None,
    weight_scale=None,
    weight_dtype="int8",
    arch=None,
):
    """
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): The first input Tensor to be multiplied; the data type is float16 or bfloat16.
        weight (Tensor): The second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): The input bias Tensor. If it is None, no bias addition is
            performed. Otherwise, the bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): The input scale Tensor provided to weight for
            dequantization. Its rank must be 1.
        weight_dtype (str): The dtype of the weight Tensor, must be one of 'int8' and
            'int4', default: 'int8'.
        arch (int): The compute architecture (SM version) of the target device. For
            example, A100 is 80 and V100 is 70. If you do not assign arch, it is
            queried from the current device, default: None.

    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_only_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...    out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8')
            ...    print(out.shape)
            [1, 2, 32]
    """
    if arch is None:
        arch = _get_arch_info()

    assert arch in (
        70,
        75,
        80,
        86,
    ), f"Currently weight_only_linear only supports SM70/75/80/86, but got {arch}"

    if in_dynamic_or_pir_mode():
        out = _C_ops.weight_only_linear(
            x, weight, bias, weight_scale, weight_dtype, arch
        )
        return out
    else:
        check_dtype(
            weight_dtype, 'weight_dtype', ['int8', 'int4'], 'weight_only_linear'
        )
        type = "weight_only_linear"
        helper = LayerHelper(type, **locals())
        dtype = x.dtype

        inputs = {
            'x': [x],
            'weight': [weight],
            'weight_scale': [weight_scale],
        }
        if bias is not None:
            inputs["bias"] = [bias]
        attrs = {'weight_dtype': weight_dtype, 'arch': arch}

        out = helper.create_variable_for_type_inference(dtype)

        helper.append_op(
            type=type,
            inputs=inputs,
            outputs={'out': out},
            attrs=attrs,
        )
        return out
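
# Float reference for what weight_only_linear computes, as a sketch: the int8
# weight rows are dequantized with their per-channel scales, then a plain
# matmul (plus bias, if any) is applied. Names below are illustrative.
#
#   >>> # doctest: +SKIP('illustrative only')
#   >>> import paddle
#   >>> x = paddle.randn([1, 2, 64])                            # float32
#   >>> w_int8 = paddle.randint(-127, 128, [32, 64]).astype('int8')
#   >>> scale = paddle.rand([32])
#   >>> w_fp = w_int8.astype('float32') * scale.unsqueeze(-1)   # per-row dequant
#   >>> ref = paddle.matmul(x, w_fp, transpose_y=True)          # [1, 2, 32]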

def llm_int8_linear(
    x,
    weight,
    bias=None,
    weight_scale=None,
    threshold=6.0,
):
    """
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): the first input Tensor to be multiplied; the data type is float16 or bfloat16.
        weight (Tensor): the second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): the input bias Tensor. If it is None, no bias addition is
            performed. Otherwise, the bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): the input scale Tensor provided to weight for
            dequantization. Its rank must be 1.
        threshold (float): The minimum magnitude at which an activation value is
            treated as an outlier; outlier channels are multiplied in x's dtype
            rather than int8, default: 6.0.

    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import llm_int8_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...    out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
            ...    print(out.shape)
            [1, 2, 32]
    """
    if in_dynamic_mode():
        out = _C_ops.llm_int8_linear(x, weight, bias, weight_scale, threshold)
        return out
    else:
        type = "llm_int8_linear"
        helper = LayerHelper(type, **locals())
        dtype = x.dtype

        inputs = {
            'x': [x],
            'weight': [weight],
            'weight_scale': [weight_scale],
        }
        if bias:
            inputs["bias"] = [bias]
        attrs = {'threshold': threshold}

        out = helper.create_variable_for_type_inference(dtype)

        helper.append_op(
            type=type,
            inputs=inputs,
            outputs={'out': out},
            attrs=attrs,
        )
        return out
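
# A sketch of the llm.int8 outlier decomposition that `threshold` controls
# (after Dettmers et al., "LLM.int8()"); the names and steps here are
# illustrative, not the kernel's actual implementation. Activation columns
# whose max magnitude reaches the threshold go through a float16 matmul; the
# rest take the int8 path, and the two partial results are summed.
#
#   >>> # doctest: +SKIP('illustrative only')
#   >>> import paddle
#   >>> x = paddle.randn([4, 64])
#   >>> outliers = x.abs().max(axis=0) >= 6.0          # per-column mask
#   >>> x_fp = x * outliers.astype('float32')          # kept in float
#   >>> x_q = x * paddle.logical_not(outliers).astype('float32')  # int8 path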