o
    0j0                     @   s  d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	m
  m  mZ d dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ ddlmZ dd	lmZ d
dlmZ d
dlmZ d
dlm Z m!Z! d
dl"m#Z#m$Z$ zd dl%m&Z& W n e'y   dZ&Y nw zd dl(m)Z)m*Z*m+Z+ W n   Y zd dl,m-Z- W n   dZ-Y ej.Z.ej/Z/ej0Z0ej1Z1ej2Z2G dd de Z3dAddZ4	dBdedefddZ5				dCddZ6dd Z7d d! Z8d"d# Z9G d$d% d%ej:Z;G d&d' d'ej:Z<d(d) Z=d*d+ Z>G d,d- d-ej:Z?d.ejd/e@d0ejfd1d2ZAG d3d4 d4ej:ZBG d5d6 d6ej:ZCG d7d8 d8e!ZDG d9d: d:eDZEG d;d< d<ej:ZFG d=d> d>ej:ZGG d?d@ d@eDZHdS )D    N)partial)ListOptionalTupleUnion)Tensor)fleet)sequence_parallel_utils   )logging)get_device_type   )
fusion_ops)ACT2FN)PretrainedConfigPretrainedModel)BaseModelOutputWithPastCausalLMOutputWithPast)fused_rotary_position_embedding)GatherOp	ScatterOp#mark_as_sequence_parallel_parameter)flash_attentionc                       s\   e Zd ZdZdZdgZ												
														d fdd	Z  ZS )Qwen2Configa  
    This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
    Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Qwen2Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22016):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    qwen2past_key_valuesQ     V      silu   {Gz?ư>TF     @r   [P                  ?Nc                    s   || _ || _|	| _|| _|| _|| _|| _|| _|| _|| _	|d u r$|}|| _
|| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _t jd||||d| d S )N)pad_token_idbos_token_ideos_token_idtie_word_embeddings )
vocab_sizemax_position_embeddings
seq_lengthhidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsuse_sliding_windowsliding_windowmax_window_layersnum_key_value_heads
hidden_actinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_dropoutrope_scaling_factorrope_scaling_typer)   r*   r+   
dpo_configsuper__init__)selfr.   r1   r2   r3   r4   r8   r9   r/   r0   r:   r;   r<   r,   r=   r)   r*   r+   r5   r6   r7   r>   r?   r@   rA   kwargs	__class__r-   p/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddlex/inference/models/doc_vlm/modeling/qwen2.pyrC   z   sD   
zQwen2Config.__init__)r   r   r   r   r   r   r    r!   r!   r"   r#   TFr$   r   r%   r%   Fr   r&   r'   r(   NN)__name__
__module____qualname____doc__Z
model_typeZkeys_to_ignore_at_inferencerC   __classcell__r-   r-   rF   rH   r   @   s:    6r   c                 C   sN   |d ur|S | j }d|d< tj|t| jj| jd}tj|dd}d|_|S )N   dtype)ZdiagonalT)shapepaddlefullfinforP   mintriustop_gradient)xmaskrQ   r-   r-   rH   get_triangle_upper_mask   s   rZ   TrX   yc                 C   s   d}d}zt  }| }| }W n   d}Y t r!|j}n|dk}|rK|dkrK|rKtjjj	| |d}	tj
|	||d}
|rB|
S tjjj|
|dS tj
| ||d}
|
S )NTrN   F)group)transpose_y)r   Zget_hybrid_communicate_groupZget_model_parallel_groupZget_model_parallel_world_sizerR   in_dynamic_modeis_distributeddistributedZ
collectiveZ_c_identitymatmulZ	_c_concat)rX   r[   r]   tensor_parallel_outputZis_fleet_inittensor_parallel_degreeZhcgZmodel_parallel_groupZy_is_distributedZinput_parallellogitsr-   r-   rH   parallel_matmul   s.   re   Fc
                 C   s  | j \}
}}}|j \}}}}t| g d} t|g d}t|g d}t r3| jtjkr3d}nd}t| t||  |g d}|j |
|||gkr_t	d|
|||f d|j  |d u rgt
|}||
d||g}|j |
d||gkrt	d|
d||f d|j  || }t stj|| dd	d
| j}n'tjd tj|d	| dd	d
| j}W d    n1 sw   Y  tj||j|d}t||}|g d}|r||
| || g}n
||
||| g}|r||fS |S )N)r      rN   r   r   rN   )r   rN   r   rf   z%Attention weights should be of shape z	, but is z"Attention mask should be of shape float32)axisrP   F)ptraining)rQ   rR   Z	transposer^   rP   float16ra   mathsqrt
ValueErrorrZ   reshapeFZsoftmaxastypeamp	auto_castZdropoutr>   )query_statesconfig
key_statesvalue_statesattention_maskoutput_attentionsattn_mask_startend_row_indicesrk   sequence_parallelZskip_recomputeZbszZq_len	num_headshead_dim_
kv_seq_lenZpre_divided_factorattn_weightsattn_outputr-   r-   rH   scaled_dot_product_attention   sd   
r   c                 C   s   t | | k  S )zO
    Upper triangular of attention_mask equals to attention_mask is casual
    )rR   rV   allitem)ry   r-   r-   rH   is_casual_mask;  s   r   c                 C   sr   | \}}t t j||fdd}|dkr$t jt j||gdd|gdd}|ddddddf |d||| gS )z2
    Make causal mask used for self-attention
    boolrO   r   rg   ri   NrN   )rR   Ztrilonesconcatexpand)Zinput_ids_shapepast_key_values_length
batch_sizeZtarget_lengthrY   r-   r-   rH   _make_causal_maskB  s   r   c                 C   s`   | j d | j d }}|dur|n|}| ddddddf d} d| _| |d||g}|S )zn
    Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`.
    r   rg   Nr   TrN   )rQ   rr   rW   r   )rY   rP   
tgt_lengthr   Z
src_lengthZexpanded_maskr-   r-   rH   _expand_2d_maskW  s   r   c                       s*   e Zd Zdef fddZdd Z  ZS )Qwen2RMSNormrv   c                    sZ   t    |j| _tj| jgt tjdd| _	|j
| _|| _|jr+t| j	 dS dS )z;
        Qwen2RMSNorm is equivalent to T5LayerNorm
        r(   )rQ   rP   Zdefault_initializerN)rB   rC   r1   rR   create_parameterget_default_dtypenninitializerConstantweightr;   variance_epsilonrv   r|   r   rD   rv   rF   r-   rH   rC   f  s   

zQwen2RMSNorm.__init__c                 C   s   | j jrt|| j| jdS t r@tj	d |
ddjddd}t|| j | }W d    n1 s:w   Y  n|
ddjddd}t|| j | }| jjtjtjfv rit|| jj}|| j S )NFrh   rf   rg   T)Zkeepdim)rv   Zuse_fused_rms_normr   Zfusion_rms_normr   r   rR   r^   rs   rt   rr   powmeanZrsqrtrP   rl   Zbfloat16cast)rD   hidden_statesZvariancer-   r-   rH   forwardw  s"   
zQwen2RMSNorm.forwardrI   rJ   rK   r   rC   r   rM   r-   r-   rF   rH   r   e  s    r   c                       s0   e Zd Zd
 fdd	Zdd Zddd	Z  ZS )Qwen2RotaryEmbedding   '  c                    sX   t    || _|| _|| _d| jtjtd| jddd| j   | _| j	|d d S )Nr(   r   rf   rh   rO   seq_len)
rB   rC   dimr/   baserR   r   arangeinv_freq_set_cos_sin_cache)rD   r   r/   r   rF   r-   rH   rC     s   
zQwen2RotaryEmbedding.__init__c                 C   sv   || _ tj|dd}td|| j}tj||gdd}| d d d d d d f | _| d d d d d d f | _	d S )Nrh   rO   zi,j->ijrg   r   )
max_seq_len_cachedrR   r   Zeinsumr   r   cos
cos_cachedsin
sin_cached)rD   r   tZfreqsZembr-   r-   rH   r     s   "z'Qwen2RotaryEmbedding._set_cos_sin_cacheNc                 C   s   || j kr
| | | jd d d |d d d d f }| jd d d |d d d d f }|j|jkr8||jn||j|jkrF||jfS |fS N)r   r   r   r   rP   r   )rD   rX   r   r   r   r-   r-   rH   r     s   

""zQwen2RotaryEmbedding.forward)r   r   r   )rI   rJ   rK   rC   r   r   rM   r-   r-   rF   rH   r     s    r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |gddS )z*Rotates half the hidden dims of the input..Nrg   rf   r   )rQ   rR   r   )rX   x1Zx2r-   r-   rH   rotate_half  s   r   c                 C   s   |d u r+|d d d | j d d d d d f }|d d d | j d d d d d f }n|jddgd}|jddgd}|| d}|| d}| | t| |  }|| t||  }||fS )NrN   r   rf   r   )rQ   Zsqueeze	unsqueezer   )qkr   r   position_idsZq_embedZk_embedr-   r-   rH   apply_rotary_pos_emb  s   &(r   c                       s,   e Zd Zddef fddZdd Z  ZS )	Qwen2MLPFNrv   c                    sH  t    |d u ri }|| _|j| _|j| _|j| _|j| _|jr%t}t	}|jdkr]| jr;|| j| jd ddd| _
n|| j| jddd| _|| j| jddd| _|| j| jddd| _n.| jrmt| j| jd dd| _
nt| j| jdd| _t| j| jdd| _t| j| jdd| _|jdkrtj| _d| _d S t|j | _d| _d S )	NrN   rf   F)gather_outputhas_biasT)input_is_parallelr   Z	bias_attrr    )rB   rC   skip_recompute_opsr1   r2   fuse_attention_ffnrc   r|   ColumnSequenceParallelLinearRowSequenceParallelLineargate_up_fused_proj	gate_projup_proj	down_projLinearr9   r   Zswigluact_fnfuse_swiglur   )rD   rv   Z	is_sharedr   ColumnParallelLinearRowParallelLinearrF   r-   rH   rC     sn   










zQwen2MLP.__init__c                 C   sr   | j r| |}| jrd }n|jddd\}}n| || |}}| jr-| ||}n| || }| |S )Nrf   rg   r   )r   r   r   chunkr   r   r   r   )rD   rX   r[   r-   r-   rH   r     s   

zQwen2MLP.forwardFNr   r-   r-   rF   rH   r     s    Ar   r   n_repreturnc                 C   sJ   | j \}}}}|dkr| S | dddd|dg} | |||| |gS )z
    This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    rN   )rQ   r   Ztilerp   )r   r   batchslenr8   r~   r-   r-   rH   	repeat_kv  s
   r   c                       s   e Zd ZdZ		ddedef fddZ						dd	eee	j
  d
eee	j
  dee	j
 dededee	j
 dee	j
ee	j
 eee	j
  f fddZ  ZS )Qwen2Attentionz
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".
    TNrv   layerwise_recomputec                    s  t    |d u ri }|| _|| _|j| _|j| _| j|j | _|j| _|j|j s,J |j|j | _	|j|jk| _
|j| _|j| _d| _|j| _|j| _|j| _|j| _d| _|| _|j| _|jdkr| j|j dksxJ d| j d|j | j|j | _| j|j dksJ d| j d|j | j|j | _|j| _| jrt dvstd u rtd	 d| _|jrt}t}|jdkr| jr|| j| jd
| jj | j  ddd| _n)|| j| jddd| _|| j| jj| j ddd| _|| j| jj| j ddd| _ || j| jddd| _!nF| jr"t"| j| jd
| jj | j  | _n&t"| j| jdd| _t"| j| jj| j dd| _t"| j| jj| j dd| _ t"| j| jdd| _!t#| j| j| jd| _$t%| _&d S )NTFrN   r   znum_heads: z, tensor_parallel_degree: znum_key_value_heads: )ZgpuxpuzEnable fuse rope in the config, but fuse rope is not available. Will disable fuse rope. Try using latest gpu version of Paddle.rf   )r   r   )r   r   r   )r/   r   )'rB   rC   rv   r   r1   r4   r}   r~   r8   num_key_value_groups
gqa_or_mqar/   r=   Z	is_causalr>   r0   r|   fuse_attention_qkvenable_recomputer   recompute_granularityrc   use_fused_roper   r   r   warningr   r   qkv_projq_projk_projv_projo_projr   r   
rotary_embr   	attn_func)rD   rv   r   r   r   r   rF   r-   rH   rC   1  s   







zQwen2Attention.__init__Fr   past_key_valuery   rz   r<   r{   r   c                 K   s  | j rQ| |}	| jrd| j| j| jd | j g}
ndd| j| jd | j g}
t|	|
}	tj	|	| j| j | j| jgdd\}}}| j
rPt|dd| j| jg}nG| |}| |}| |}| jrvd| j| j| jg}d| j| j| jg}ndd| j| jg}dd| j| jg}|j|d}|j|d}|j|d}|jd }|dur||d jd 7 }| jr|du sJ d| j||d	\}}t||d|||d
d\}}}n| j||d	\}}t|||||\}}|durtj|d |gdd}tj|d |gdd}|r||fnd}ttjdd }| jjr|dkr'|dkr't|| j}t|| j}| j|| j|||||| j| jd	}|r@|\}}n|}| |}|sLd}|f}|rW||f7 }|r_||f7 }t|tu rqt |dkrq|d }|S )z#Input shape: Batch x Time x Channelrg   rf   r   )Znum_or_sectionsri   )rQ   Nz(fuse rotary not support cache kv for nowr   F)vr   r   r   Zuse_neox_rotary_stylerN   r   r   r'   g@)r{   rk   r|   )!r   r   r|   r0   r8   r   r~   rR   reshape_splitr   r}   r   r   r   rp   rQ   r   r   r   r   r   float__version__rv   use_flash_attentionr   r   rk   r   typetuplelen)rD   r   r   r   ry   rz   r<   r{   rE   Z	mix_layerZtarget_shaperu   rw   rx   Ztarget_query_shapeZtarget_key_value_shaper   r   r   r   Zpaddle_versionoutputsr   r   r-   r-   rH   r     s   

	










zQwen2Attention.forward)TN)NNNFFN)rI   rJ   rK   rL   r   r   rC   r   r   rR   r   r   rM   r-   r-   rF   rH   r   +  s<    z
r   c                       s   e Zd Z		ddedef fddZ						ddejdeej d	eej d
ee dee	ej  dee deej de	ejee	ejejf  f fddZ
  ZS )Qwen2DecoderLayerFNrv   r   c                    st   t    |d u ri }|| _|| _|j| _t|||d| _t||d| _t	|| _
t	|| _d| _|| _|j| _d S )N)r   F)rB   rC   rv   r   r1   r   	self_attnr   mlpr   input_layernormpost_attention_layernormr   r   r   )rD   rv   r   r   rF   r-   rH   rC   7  s   


zQwen2DecoderLayer.__init__r   r   ry   rz   r   r<   r{   r   c              	   K   s   |}	|  |}| j|||||||d}
t|
tu r|
d }n|
}|r&|
d }|r0|
|r-dnd }|	| }|}	| |}| |}|	| }|f}
|rN|
|f7 }
|rU|
|f7 }
t|
tu ret|
dkre|
d }
|
S )a4  
        Args:
            hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`paddle.Tensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states
        r{   r   rN   rf   )r   r   r   r   r   r   r   )rD   r   r   ry   rz   r   r<   r{   rE   Zresidualr   Zself_attn_weightsZpresent_key_valuer-   r-   rH   r   Q  s>   






zQwen2DecoderLayer.forwardr   )NNFNFN)rI   rJ   rK   r   r   rC   rR   r   r   r   r   rM   r-   r-   rF   rH   r   6  s>    
r   c                   @   s.   e Zd ZeZdZdgZeddefddZdS )	Qwen2PretrainedModelr   zself_attn.rotary_emb.inv_freqFrv   c              	      sZ  ddl m} ||d}ddg}d}|j}t|d|}t|dd	}	t|d
d	}
i }|ri|	rNt|jD ] |D ]}t fdd|D }t|d||d||< q5q1|
rgt|jD ] t fdd|D }|||< qU|S |	st|jD ] |D ]}t fdd|D }t|dd||d||< qtqp|
st|jD ] t fdd|D }t|dd||< q|S )Nr   )split_or_fuse_func)is_fuse)z layers.0.self_attn.q_proj.weightz layers.0.self_attn.k_proj.weightz layers.0.self_attn.v_proj.weightz"layers.0.self_attn.qkv_proj.weight)zlayers.0.self_attn.q_proj.biaszlayers.0.self_attn.k_proj.biaszlayers.0.self_attn.v_proj.biasz layers.0.self_attn.qkv_proj.bias)zlayers.0.mlp.gate_proj.weightzlayers.0.mlp.up_proj.weightz&layers.0.mlp.gate_up_fused_proj.weightr8   r   Fr   c                        g | ]}| d d  dqS z	layers.0.zlayers..replace.0keyir-   rH   
<listcomp>      zJQwen2PretrainedModel._get_fuse_or_split_param_mappings.<locals>.<listcomp>T)is_qkvr}   r8   c                    r   r   r   r   r   r-   rH   r    r  c                    r   r   r   r   r   r-   rH   r    r  )
split_numsr  r}   r8   c                    r   r   r   r   r   r-   rH   r    r  rf   )r  )Z$common.transformers.conversion_utilsr   r4   getattrranger3   r   r   )clsrv   r   r   fnZfuse_qkv_keysZfuse_gate_up_keysr}   r8   r   r   Zfinal_actionsZ	fuse_keyskeysr-   r   rH   !_get_fuse_or_split_param_mappings  sz   





z6Qwen2PretrainedModel._get_fuse_or_split_param_mappingsNF)	rI   rJ   rK   r   Zconfig_classZbase_model_prefixZ"_keys_to_ignore_on_load_unexpectedclassmethodr  r-   r-   r-   rH   r     s    r   c                       s   e Zd ZdZdef fddZdd Zdd Zed	d
 Z											dde
jdee
j dee
j dee
j dee deee
j  dee dee dee deeef fddZ  ZS )
Qwen2Modelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]

    Args:
        config: Qwen2Config
    rv   c                    s   t     j_ j_ j_ j_ j_ jd ur" jng _d_	 j
dkrH j j
 dkrHtjjjtjtj dd_n	tjj_t fddt jD _t _d S )NFrN   r   )r   )Zweight_attrc                    s   g | ]}t  |jvd qS ))rv   r   )r   no_recompute_layers)r   Z	layer_idxrv   rD   r-   rH   r    s    z'Qwen2Model.__init__.<locals>.<listcomp>)rB   rC   r)   Zpadding_idxr.   r1   r|   r   r  r   rc   mpuZVocabParallelEmbeddingrR   Z	ParamAttrr   r   ZXavierNormalembed_tokensZ	EmbeddingZ	LayerListr  r3   layersr   normr   rF   r  rH   rC     s4   

	zQwen2Model.__init__c                 C      | j S r   r  rD   r-   r-   rH   get_input_embeddings)     zQwen2Model.get_input_embeddingsc                 C   
   || _ d S r   r  rD   valuer-   r-   rH   set_input_embeddings,     
zQwen2Model.set_input_embeddingsc                 C   s   | d ur8t | jdkr%t| ||d d}|d dkr$t||d}||@ }nt | jdkr5| dd}n	| }nt||d}t dkrZtjd	d
d}tjdd
d}t	|||}|S t	|
dd	t|j|}|S )Nrf   rg   )r   rN   )r   r   r   r   r'   rh   rO   g   )r   rQ   r   r   r   rr   r   rR   Z	to_tensorwherer   rT   rU   )ry   Zinput_shaper   rP   Zexpanded_attn_maskZcombined_attention_maskrX   r[   r-   r-   rH   _prepare_decoder_attention_mask/  s<   

z*Qwen2Model._prepare_decoder_attention_maskN	input_idsr   ry   inputs_embedsr<   r   rz   output_hidden_statesreturn_dictr   c                 C   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|	d ur$|	n| j j}	|d ur4|d ur4td|d ur>|j\}}n|d urI|j\}}}ntd|d u r[td gt| j	 }t
|}|}d}|d d urv|d d jd }||7 }|d u r| |}| jr|j\}}}t||| |g}t|}|d u rtj||ftjdn|}| |||f||j}| j jrt|rd n|}|d u rtj|dd||f}|}|rdnd }|rdnd }|rdnd }t| j	D ]r\}}|r||f7 }|d ur|| nd }|j }| jr!|| jvr!|r!| jdkr!| j||||||||
d	}n||||||||
d	}d  }||< t|tu r>|d }n|}|rJ||d f7 }|rY|||rTd
nd f7 }q|  |}|rg||f7 }|rl|nd }|	s~tdd ||||fD S t!||||dS )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timezEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   rN   rO   int64r-   rS   r   rf   c                 s   s    | ]	}|d ur|V  qd S r   r-   )r   r   r-   r-   rH   	<genexpr>  s    z%Qwen2Model.forward.<locals>.<genexpr>)Zlast_hidden_stater   r   
attentions)"rv   rz   r#  r<   use_return_dictro   rQ   r   r   r  listr  r|   rR   r   r   applyr   r   r   rP   r   r   r   r   	enumeraterW   r   r  r   Zrecompute_training_fullr   r  r   )rD   r!  r   ry   r"  r<   r   rz   r#  r$  r{   r   r0   r   Zseq_length_with_pastZcache_lengthbsr   r1   r   Zall_hidden_statesZall_self_attnsZnext_decoder_cacheidxZdecoder_layerr   Zhas_gradientZlayer_outputsZ
next_cacher-   r-   rH   r   V  s   






zQwen2Model.forward)
NNNNNNNNNN)rI   rJ   rK   rL   r   rC   r  r  staticmethodr   rR   r   r   r   r   r   r   r   r   rM   r-   r-   rF   rH   r    sL    (
(	

r  c                       s.   e Zd ZdZdef fddZdd Z  ZS )Qwen2PretrainingCriterionzB
    Criterion for Mixtral.
    It calculates the final loss.
    rv   c                    sf   t t|   t|dd| _|| _|jdko|j| _| jr't	j
| jd| _d S tjjd| jd| _d S )Nignore_indexirN   )r0  noneZ	reductionr0  )rB   r/  rC   r  r0  rv   rc   rb   enable_parallel_cross_entropyr  ZParallelCrossEntropy	loss_funcrR   r   CrossEntropyLossr   rF   r-   rH   rC     s   z"Qwen2PretrainingCriterion.__init__c                 C   s   | j r&|jd | jjkr&td|jd  d| jj  tjjd| j	d| _
tjdF | 
|d|d}t|d	kt|t|}t|}|d	krYt|| }nt|| | }W d    |S W d    |S 1 suw   Y  |S )
Nrg   zBenable_parallel_cross_entropy, the vocab_size should be splitted: z, r1  r2  Frh   rf   r   )r3  rQ   rv   r.   r   r   rR   r   r5  r0  r4  rs   rt   rr   r   r  Z	ones_likeZ
zeros_likesum)rD   Zprediction_scoresZmasked_lm_labelsZmasked_lm_lossZbinary_sequencecountlossr-   r-   rH   r     s8   


z!Qwen2PretrainingCriterion.forward)rI   rJ   rK   rL   r   rC   r   rM   r-   r-   rF   rH   r/    s    r/  c                       s.   e Zd Zddef fddZd	ddZ  ZS )
Qwen2LMHeadNFrv   c                    s   t t|   || _|jdkr|j|j dkr|j|j }n|j}|| _|r<|d ur.|| _n.| j||j	gt
 d| _n ||jkrO| j|j	|gt
 d| _n| j|j	|gt
 d| _||jkrcdnd| j_| jjrv| jrpdnd| j_d S d S )NrN   r   rQ   rP   TF)rB   r9  rC   rv   rc   r.   r]   r   r   r1   rR   r   r_   Z
split_axis)rD   rv   embedding_weightsr]   r.   rF   r-   rH   rC   (  s8   



zQwen2LMHead.__init__c                 C   sX   | j jrt|}| j j}t|d|| j jg}|d u r | j j}t	|| j
| j|d}|S )Nrg   )r]   rb   )rv   r|   r   r*  r0   rR   r   r1   rb   re   r   r]   )rD   r   rb   r0   rd   r-   r-   rH   r   P  s   
zQwen2LMHead.forward)NFr   r   r-   r-   rF   rH   r9  '  s    (r9  c                       s  e Zd ZdZdgZdef fddZdd Zdd	 Zd
d Z	dd Z
dd Zdd Z				d(ddZdefddZe	d)ddZ											d*dejdeej deej deej deej d ee d!eeej  d"ee d#ee d$ee d%eeef fd&d'Z  ZS )+Qwen2ForCausalLMTzlm_head.weightrv   c                    s\   t  | t|| _|jrt|| jjjdd| _| 	  nt|| _t
|| _|j| _d S )NT)r;  r]   )rB   rC   r  r   r,   r9  r  r   lm_headZtie_weightsr/  	criterionr.   r   rF   r-   rH   rC   h  s   



zQwen2ForCausalLM.__init__c                 C   s   | j jS r   r   r  r  r-   r-   rH   r  w  s   z%Qwen2ForCausalLM.get_input_embeddingsc                 C   s   || j _d S r   r?  r  r-   r-   rH   r  z  s   z%Qwen2ForCausalLM.set_input_embeddingsc                 C   r  r   r=  r  r-   r-   rH   get_output_embeddings}  r  z&Qwen2ForCausalLM.get_output_embeddingsc                 C   r  r   r@  )rD   Znew_embeddingsr-   r-   rH   set_output_embeddings  r  z&Qwen2ForCausalLM.set_output_embeddingsc                 C   r  r   r   )rD   decoderr-   r-   rH   set_decoder  r  zQwen2ForCausalLM.set_decoderc                 C   r  r   rC  r  r-   r-   rH   get_decoder  r  zQwen2ForCausalLM.get_decoderFNc                 K   s   |j \}}|dt|||f}	|r,|d d df jdd}|	d d df d}	|d ur9|d u r9d|i}
nd|i}
|
|	|||d |
S )Nr   rg   r   r"  r!  )r   r   r<   ry   )rQ   getrR   r   r   r   update)rD   r!  r<   r   ry   r"  rE   r   r0   r   Zmodel_inputsr-   r-   rH   prepare_inputs_for_generation  s$   
	
z.Qwen2ForCausalLM.prepare_inputs_for_generationrP   c                 C   s<   t jjd d gddt jjd d gddt jjd d gdddS )Nr%  r:  )r!  ry   r   )rR   ZstaticZ	InputSpec)rD   rP   r-   r-   rH   _get_model_inputs_spec  s   z'Qwen2ForCausalLM._get_model_inputs_specc                 C   sD  t | trt| dkrt | d tjs| d |d< t | tr'd| v r'| j|d< d|v rH|d d urH|d }tj||ddd f d gdd|d< |sd|v r|d }t|jdkrqtj|tj	|jd	 dg|j
d
gdd|d< |S t|jdkrtj|tj	g |jd d d|j
d
gddd d d d dd d d f |d< |S )NrN   r   r   .rg   r   ry   rf   r   rO      r   )
isinstancer   r   rR   r   r   r   r   rQ   r   rP   )r   Zmodel_kwargsZis_encoder_decoderr   ry   r-   r-   rH   "update_model_kwargs_for_generation  sJ   



z3Qwen2ForCausalLM.update_model_kwargs_for_generationr!  r   ry   r"  labelsr<   r   rz   r#  r$  r   c                 C   s   |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
|dur-|dur-td d}| j||||||||	|
|d
}|d }| j joI| j jdk}| j	||d}d}|
si|f|dd  }|durg|f| S |S t
|||j|j|jdS )a  
        Args:
            labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen2ForCausalLM

        >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```NzzYou have provided both attn_mask_startend_row_indices and attention_mask. The attn_mask_startend_row_indices will be used.)
r!  r   ry   r"  r<   r   rz   r#  r$  r{   r   rN   )rb   )r8  rd   r   r   r'  )rv   rz   r#  r(  r   r   r   rb   rc   r=  r   r   r   r'  )rD   r!  r   ry   r"  rN  r<   r   rz   r#  r$  r{   r   r   rb   rd   r8  outputr-   r-   rH   r     sZ   *
zQwen2ForCausalLM.forward)FNNNr  )NNNNNNNNNNN)rI   rJ   rK   Zenable_to_static_methodZ_tied_weights_keysr   rC   r  r  rA  rB  rE  rF  rI  strrJ  r.  rM  rR   r   r   r   r   r   r   r   r   rM   r-   r-   rF   rH   r<  d  sj    
!	1	

r<  r   )TT)NTFF)Irm   	functoolsr   typingr   r   r   r   rR   Z&paddle.distributed.fleet.meta_parallelr`   r   Zmeta_parallelr  Z	paddle.nnr   Zpaddle.nn.functionalZ
functionalrq   r   Zpaddle.distributedZpaddle.distributed.fleet.utilsr	   utilsr   Z	utils.envr   Zcommon.transformersr   Zcommon.transformers.activationsr   Z common.transformers.transformersr   r   Z.common.transformers.transformers.model_outputsr   r   Zpaddle.incubate.nn.functionalr   ImportErrorZ6paddle.distributed.fleet.utils.sequence_parallel_utilsr   r   r   Z$paddle.nn.functional.flash_attentionr   r   r   r   r   r   r   rZ   re   r   r   r   r   ZLayerr   r   r   r   r   intr   r   r   r   r  r/  r9  r<  r-   r-   r-   rH   <module>   s   

+
K)&T  f] x7=