o
    *j                     @   s:  d Z ddlZddlZddlZddlm  mZ ddlmZ	 ddl
mZ G dd dejjZG dd dejjZG d	d
 d
ejjZejjdd Zdd ZG dd dejjZG dd dejjZG dd dejjZdd Zdd ZG dd dejjZG dd dejjZG dd dejjZG dd  d ejjZdS )!zTransformer.    N)FusedLayerNorm)mpuc                       s&   e Zd Z fddZdddZ  ZS )PositionalEmbeddingc                    s>   t t|   || _ddtd|d|   }| d| d S )N   i'                 @inv_freq)superr   __init__hidden_sizetorcharangeZregister_buffer)selfr   r   	__class__ m/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/mglm/model/transformer.pyr
      s   zPositionalEmbedding.__init__Nc                 C   sf   t || j}t j| | gdd}|d ur(|d d d d d f |ddS |d d d d d f S )Ndim)r   Zgerr   catsincosexpand)r   Zpos_seqZbszZsinusoid_inpZpos_embr   r   r   forward$   s
    zPositionalEmbedding.forwardN)__name__
__module____qualname__r
   r   __classcell__r   r   r   r   r      s    	r   c                       s4   e Zd ZdZ	d	 fdd	Zdd Zdd Z  ZS )
ParallelCrossAttentionz.Parallel cross-attention layer for TransformerNc                    s   t t|   |d u r|}t }t||| _t||| _t||| _tj	||d|d| _
tj	|d| dd|d| _tj|| _tj||d|d| _tj|| _tj rftjjt_tjjt_d S d S )NFgather_outputinit_method   Zstrider"   r#   Tinput_is_parallelr#   )r	   r    r
   r   get_model_parallel_world_sizedividehidden_size_per_partitionhidden_size_per_attention_head!num_attention_heads_per_partitionColumnParallelLinearquery	key_valuer   nnDropoutattention_dropoutRowParallelLineardenseoutput_dropout	deepspeedcheckpointingis_configuredget_cuda_rng_tracker
checkpoint)r   r   num_attention_headsattention_dropout_proboutput_dropout_probr#   output_layer_init_method
world_sizer   r   r   r
   1   sH   	

zParallelCrossAttention.__init__c                 C   6   |  dd | j| jf }|j| }|ddddS z_Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        Nr   r   r$   r      sizer,   r+   viewpermuter   tensorZnew_tensor_shaper   r   r   _transpose_for_scores`      
z,ParallelCrossAttention._transpose_for_scoresc                 C   s(  |  |}| |}t|d\}}| |}| |}	| |}
t||	dd}|t	| j
 }|d urCt||dd|   }tjjdd|}t   | |}W d    n1 sbw   Y  t||
}|dddd	 }| d d | jf }|j| }| |}| |}|S )
Nr$   r   g     @      ?r   r   r   rB   )r.   r/   r   split_tensor_along_last_dimrI   r   matmul	transposemathsqrtr+   mulr0   Softmaxr9   forkr2   rF   
contiguousrD   r*   rE   r4   r5   )r   hidden_statesencoder_states
cross_maskmixed_query_layermixed_x_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputr   r   r   r   j   s>   










zParallelCrossAttention.forwardr   )r   r   r   __doc__r
   rI   r   r   r   r   r   r   r    .   s    /
r    c                       sR   e Zd ZdZ				d fdd	Zdd Zedd	d
Z				dddZ  Z	S )ParallelSelfAttentiona  Parallel self-attention layer for GPT2.

    Self-attention layer takes input with size [b, s, h] where b is
    the batch size, s is the sequence length, and h is the hidden size
    and creates output of the same size.
    Arguments:
        hidden_size: total hidden size of the layer (h).
        num_attention_heads: number of attention heads (n). Note that we
                             require n to be divisible by number of GPUs
                             used to parallelize the model. Also, we
                             require hidden size to be divisible by n.
        attention_dropout_prob: dropout probability for the attention scores.
        init_method: weight initialization.
        output_layer_init_method: output layer initialization. If None, use
                                  `init_method`.
    We use the following notation:
        h: hidden_size
        n: num_attention_heads
        p: number of partitions
        np: n/p
        hp: h/p
        hn: h/n
        b: batch size
        s: sequence length
    NFrL   c
                    s   t t|   || _|d u r|}t }
t||
| _t||| _t||
| _	|| _
|	| _tj|d| dd|d| _|rHtj||d|d| _tj|| _tj||d|d| _tj|| _tj rqtjjt_tjjt_d S d S )NrB   Fr%   r!   Tr&   )r	   rf   r
   	performerr   r(   r)   r*   r+   r,   relative_encodingattention_scaler-   query_key_valuerelativer   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   )r   r   r;   r<   r=   r#   r>   rh   rg   ri   r?   r   r   r   r
      sP   


zParallelSelfAttention.__init__c                 C   r@   rA   rC   rG   r   r   r   rI      rJ   z+ParallelSelfAttention._transpose_for_scoresc                 C   s   t jg |  d d | ddR | j| jd}t j|| gdd}|jg |  d d | dd | dR  }|d d d d dd f | } |rvt | d| df}| t 	|| d| d d d d d d d f  } | S )NrK   r   devicedtyper   r   r   )
r   ZzerosrD   rm   rn   r   rE   Zview_asonestril)xZ	zero_triuZzero_padZx_paddedro   r   r   r   
_rel_shift   s   &4 4z ParallelSelfAttention._rel_shiftc                 C   s\  | d}|d u r| |}t|d\}	}
}n!t||fd}| |}t|d\}	}
}|	d d | d f }	| |	}| |
}| |}| jr| |}| |}||	d }t
||dd}||	d }t
||dd}| |}|| }|t| j }n/| jdkrt
|t| j |ddt| j| j  }nt
||ddt| j }t||}| jdkr|jdddd }||8 }|| j9 }|d	d|   }tjjdd
|}t   | |}W d    n1 sw   Y  t
||}|dddd }|  d d | jf }|j| }| |}| |}|S )Nr   rB   r   rK   rL   T)r   Zkeepdimr   g     r   r$   )rD   rj   r   rM   r   r   rI   rh   rk   	unsqueezerN   rO   rr   rP   rQ   r+   ri   rR   maxr0   rS   r9   rT   r2   rF   rU   r*   rE   r4   r5   )r   rV   	ltor_maskposition_embeddingsr_w_biasr_r_biasmemquery_lengthrZ   rY   r[   r\   r   r]   r^   r_   Zrelative_layerZ	rw_head_qZac_scoreZ	rr_head_qZbd_scorer`   Zmax_attention_scoresra   rb   rc   rd   r   r   r   r     s   

















zParallelSelfAttention.forwardNFFrL   FNNNN)
r   r   r   re   r
   rI   staticmethodrr   r   r   r   r   r   r   rf      s     6
rf   c                 C   s*   d|  dt d|  dd|  |      S )zOpenAI's gelu implementation.g      ?rL   gQ63E?gHm?)r   tanhrq   r   r   r   	gelu_implf  s
   r   c                 C   s   t | S r   )r   r   r   r   r   gelun  s   r   c                       s,   e Zd ZdZ	d fdd	Zdd Z  ZS )ParallelMLPa  MLP for GPT2.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform gelu transformation, and project the
    state back into h hidden dimension. At the end, dropout is also
    applied.

    Arguments:
        hidden_size: The hidden size of the self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weight are initialized to one.
        output_layer_init_method: output layer initialization. If None,
                                  use `init_method`.
    Nc                    s\   t t|   |d u r|}tj|d| d|d| _tjd| |d|d| _tj	
|| _d S )N   Fr!   Tr&   )r	   r   r
   r   r-   dense_h_to_4hr3   dense_4h_to_hr   r0   r1   dropout)r   r   r=   r#   r>   r   r   r   r
     s    zParallelMLP.__init__c                 C   s*   |  |}t|}| |}| |}|S r   )r   r   r   r   )r   rV   Zintermediate_parallelrd   r   r   r   r     s
   


zParallelMLP.forwardr   r   r   r   re   r
   r   r   r   r   r   r   r   r  s
    r   c                       s0   e Zd ZdZ	d fdd	Z	dddZ  ZS )ParallelDecoderLayer  A single layer transformer for GPT2.

    We use the following notation:
        h: hidden size
        n: number of attention heads
        b: batch size
        s: sequence length
    Transformore layer takes input with size [b, s, h] and returns an
    output of the same size.

    Arguments:
        hidden_size: The hidden size of the self attention.
        num_attention_heads: number of attention head in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weight are initialized to one.
        output_layer_init_method: output layers (attention output and
                                  mlp output) initialization. If None,
                                  use `init_method`.
    Nc                    s   t t|   |d u r|}t||d| _t||||||d| _t||d| _t||||||d| _	t||d| _
t||||d| _d S )Nepsr>   )r	   r   r
   	LayerNorminput_layernormrf   self_attentionpost_self_layernormr    cross_attentionpost_attention_layernormr   mlp)r   r   r;   r<   r=   layernorm_epsilonr#   r>   r   r   r   r
     s@   		zParallelDecoderLayer.__init__c                 C   s^   |  |}| ||}|| }| |}| |||}	||	 }
| |
}| |}|
| }|S r   )r   r   r   r   r   r   )r   rV   rW   ru   rX   layernorm_outputZself_attention_outputZself_layernorm_inputZself_layernorm_outputattention_outputlayernorm_input
mlp_outputrd   r   r   r   r     s   
	


zParallelDecoderLayer.forwardr   r   r   r   r   r   r     s    #4r   c                       s<   e Zd ZdZ				d	 fdd	Z				d
ddZ  ZS )ParallelTransformerLayerr   NFrL   c                    sh   t t|   |d u r|}t||d| _t||||||||	|
d	| _t||d| _t||||d| _	d S )Nr   r>   rh   rg   ri   r   )
r	   r   r
   r   r   rf   	attentionr   r   r   )r   r   r;   r<   r=   r   r#   r>   rh   rg   ri   r   r   r   r
   0  s0   z!ParallelTransformerLayer.__init__c                 C   s\   |  |}|d ur|  |nd }| ||||||}|| }	| |	}| |}
|	|
 }|S r   )r   r   r   r   )r   rV   ru   rv   rw   rx   ry   r   r   r   r   rd   r   r   r   r   Z  s   


z ParallelTransformerLayer.forwardr{   r}   r   r   r   r   r   r     s    #-r   c                    s    fdd}|S )z!Init method based on N(0, sigma).c                       t jjj| d dS Nr   meanstdr   r0   initnormal_rH   sigmar   r   init_z     z#unscaled_init_method.<locals>.init_r   )r   r   r   r   r   unscaled_init_methodw  s   r   c                    s"   | t d|    fdd}|S )z3Init method based on N(0, sigma/sqrt(2*num_layers).r   c                    r   r   r   r   r   r   r   r     r   z!scaled_init_method.<locals>.init_)rP   rQ   )r   
num_layersr   r   r   r   scaled_init_method  s   r   c                       sP   e Zd ZdZ									d fdd		Z	
	
		dddZdddZ  ZS )GPT2ParallelTransformera  GPT-2 transformer.

    This module takes input from embedding layer and it's output can
    be used directly by a logit layer. It consists of L (num-layers)
    blocks of:
        layer norm
        self attention
        residual connection
        layer norm
        mlp
        residual connection
    followed by a final layer norm.

    Arguments:
        num_layers: Number of transformer layers.
        hidden_size: The hidden size of the self attention.
        num_attention_heads: number of attention head in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        checkpoint_activations: if True, checkpoint activations.
        checkpoint_num_layers: number of layers to checkpoint. This
                               is basically the chunk size in checkpoitning.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method_std: standard deviation of the init method which has
                         the form N(0, std).
        use_scaled_init_for_output_weights: If True use 1/sqrt(2*num_layers)
                                            scaling for the output weights (
                                            output of self attention and mlp).
    r   h㈵>{Gz?TFrL   c                    s   t t|   | _|	| _|
| _|| _	| _| _	r
rJ d |r(t	|t
j|| _
| _|| _
rt| _t }t| _t|| _t
jt
| j| j| _d| j_t
jt
| j| j| _d| j_t
  | j  | j  W d    n1 sw   Y  n7|rt
j|d | _t
j|d | _t
jj j!| jj"dd nt
j|| _t
jj j!| jj"dd  	
fddt
j#fddt$|D | _%t&d	| _'t(j)* rt(j)j+t_+t(j)j,t_,d S d S )
NTr   r   r   c                      s>   
rt  tdS t t	d
S )Nr   r   )r   r   r   r   )r<   ri   r   init_method_stdr   r;   r=   r>   rg   rh   use_decoder_layerr   r   	get_layer  s,   	z3GPT2ParallelTransformer.__init__.<locals>.get_layerc                    s   g | ]}  qS r   r   ).0_)r   r   r   
<listcomp>  s    z4GPT2ParallelTransformer.__init__.<locals>.<listcomp>r   )-r	   r   r
   r   checkpoint_activationscheckpoint_num_layersmax_memory_lengthrg   r   r   r   r0   r1   embedding_dropoutrh   block_position_encodingr   rv   r   r(   r)   r+   r,   	ParameterZTensorrw   Zmodel_parallelrx   Zno_gradZzero_Z	Embeddingblock_position_embeddingsr   r   weightZ
ModuleListrangelayersr   final_layernormr6   r7   r8   r9   r:   )r   r   r   r;   Zmax_sequence_lengthr   Zembedding_dropout_probr<   r=   r   r   r   r   Z"use_scaled_init_for_output_weightsrh   r   rg   r   ri   r?   r   )r<   ri   r   r   r   r   r;   r=   r>   rg   rh   r   r   r
     s   



 

z GPT2ParallelTransformer.__init__Nc                    s    d d \ }|r|d  dnd}	||	 }
t|dkp(t| k}jr:s2J d|	dks:J d|rXrB| n|}d fdd	}jsW||||	d}n|d d d d d d | |	 d f }jrtj|
d d	d
jjd}	|}
|}n)jr|d d df |d d df }}	|}| jr|}| 
fddjdksrȈgng fdd}jr"d}tj}j}||k r!js|gn||g}jr||jjg7 }|r|||||  7 }tj|||| g|R  ||7 }||k snEtjD ]?\}}js3|gn||g}jrE||jjg7 }|rL|| nd }||d|ijdks^re q'}jdksur}j|d|fS )Nr$   r   r   zFattention_mask should be a scalar to indicate the separation position.zDo not support transformer-xl.c                    s    d| | f}t|}rd|dd d d |f< n)| dd}tj| |j|jddd}||ddk }||	d
|d}|dkr^| dd}tj  | |f|fdd}|	d}|S )Nr   r   r   rl   r$   r   )Znew_onesr   rp   r   r   rm   rn   rE   Zmasked_fillrs   Z	expand_asr   )Z
seq_lengthsepmemory_lengthmZidsmask)
batch_sizerV   	is_scalarr   r   build_mask_matrix2  s2   

z:GPT2ParallelTransformer.forward.<locals>.build_mask_matrix)r   r   g      rl   c                    s    r|   S | S r   )detach)Z_hidden_states)detach_memoryr   r   check_detache  s   z5GPT2ParallelTransformer.forward.<locals>.check_detachc                    s    fdd}|S )Nc                     s   j  }| d | dd  }} jr#| d d | dd  } }n| d d | dd  } }t|D ]%\}}|r>|| nd }||g| R d|i}jdksRrY | q4|S )Nr   r   r   ry   )r   rh   	enumerater   append)ZinputsZlayers_Zx_Zmems_ilayerZmem_i_)r   end
mem_layersreturn_memoryr   startr   r   custom_forwardq  s   zGGPT2ParallelTransformer.forward.<locals>.custom.<locals>.custom_forwardr   )r   r   r   )r   r   r   r   )r   r   r   customo  s   z/GPT2ParallelTransformer.forward.<locals>.customry   )r   )r   )rD   r   Znumelrg   itemrh   r   rm   rn   rv   r   r   r   r   r   lenr   r   r   rw   rx   r   r:   r   r   r   update_mems)r   rV   Zposition_idsattention_maskZmemory_statesrW   r   r   rz   r   Z
key_lengthZis_sepr   r   Zposition_sequencerv   Zblock_position_idsr   r   lr   chunk_lengthargsr   r   Zmem_ird   r   )r   r   r   rV   r   r   r   r   r   r     s   






zGPT2ParallelTransformer.forwardc           	      C   s   |r	|d  dnd}|d  d}|| }|st| j|}g }tt|D ]2}||kr=||| d d | d f  q&|tj|| d d | | d f || fdd q&|S )Nr   r   r   )rD   minr   r   r   r   r   r   )	r   ZhiddensZmemsr   r   rz   Znew_memory_lengthZnew_memsr   r   r   r   r     s"   " z#GPT2ParallelTransformer.update_mems)	r   r   r   TFFFFrL   )NNFTr|   )r   r   r   re   r
   r   r   r   r   r   r   r   r     s&    -t
 r   c                       s8   e Zd ZdZdejf fdd	Zdd Zdd Z  Z	S )	BertParallelSelfAttentiona  Parallel self-attention layer for BERT.

    Self-attention layer takes input with size [b, s, h] where b is
    the batch size, s is the sequence length, and h is the hidden size
    and creates output of the same size.
    Arguments:
        hidden_size: total hidden size of the layer (h).
        num_attention_heads: number of attention heads (n). Note that we
                             require n to be divisible by number of GPUs
                             used to parallelize the model. Also, we
                             require hidden size be divisible by n.
        dropout_prob: dropout probability for the attention scores.
        output_parallel: If true, no all-gather is done on the output and
                         the output values will be per partition.
    We use the following notation:
        h: hidden_size
        n: num_attention_heads
        p: number of partitions
        np: n/p
        hp: h/p
        hn: h/n
        b: batch size
        s: sequence length
    Fc                    s   t t|   || _|| _|| _|| _t }t	||| _
t	||| _t	||| _tj|d| dd|d| _tj|| _tj rQtjjt_tjjt_d S d S )NrB   Fr%   )r	   r   r
   r   r;   dropout_proboutput_parallelr   r(   r)   r*   r+   r,   r-   rj   r   r0   r1   r   r6   r7   r8   r9   r:   )r   r   r;   r   r   r#   r?   r   r   r   r
     s2   	

z"BertParallelSelfAttention.__init__c                 C   r@   rA   rC   rG   r   r   r   rI     rJ   z/BertParallelSelfAttention._transpose_for_scoresc                 C   s  |  |}t|d\}}}| |}| |}| |}	tt| j}
t||
 |	dd|
 }||7 }tj
jdd|}t   | |}W d    n1 sWw   Y  t||	}|dddd }| d d | jf }|j| }| jr|}|S t|}|S )NrB   r   rK   r   r   r$   r   )rj   r   rM   rI   rP   rQ   r+   r   rN   rO   r0   rS   r9   rT   r   rF   rU   rD   r*   rE   r   Z!gather_from_model_parallel_region)r   rV   r   rZ   rY   r[   r\   r]   r^   r_   Znorm_factorr`   ra   rb   rc   rd   r   r   r   r     s<   





z!BertParallelSelfAttention.forward)
r   r   r   re   r   xavier_normal_r
   rI   r   r   r   r   r   r   r     s    #
r   c                       s2   e Zd ZdZddejf fdd	Zdd Z  ZS )BertParallelTransformerOutputz[The output layer used after self attention and intermediate
    parts of transformer layer.g-q=Fc                    sB   t t|   tj||||d| _tj|| _	t
||d| _d S )Nr&   r   )r	   r   r
   r   r3   r4   r   r0   r1   r   r   	layernorm)r   Z
input_sizeZoutput_sizer   r   r'   r#   r   r   r   r
   7  s   z&BertParallelTransformerOutput.__init__c                 C   s*   |  |}| |}|| }| |}|S r   )r4   r   r   )r   rV   Zinput_tensorr   r   r   r   r   H  s
   


z%BertParallelTransformerOutput.forward	r   r   r   re   r   r   r
   r   r   r   r   r   r   r   3  s    r   c                       s.   e Zd ZdZejf fdd	Zdd Z  ZS )BertParallelTransformerLayera_  A single layer transformer for Bert.

    We use the following notation:
        h: hidden size
        n: number of attention heads
        b: batch size
        s: sequence length
    Transformore layer takes input with size [b, s, h] and returns an
    output of the same size.

    Arguments:
        hidden_size: The hidden size of the self attention.
        intermediate_size: size of the intermediate state after
                           self attention. In both BERT and GPT
                           this is set to be 4 times the hidden
                           size.
        num_attention_heads: number of attention head in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        intermediate_activation_fn: activation function for output
                                    of intermediate.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weight are initialized to one.
    c	           	         sl   t t|   t|||d|d| _t||||d|d| _tj||d|d| _	|| _
t||||d|d| _d S )NT)r   r#   )r   r'   r#   Fr!   )r	   r   r
   r   r   r   self_outputr   r-   intermediateintermediate_activation_fnrd   )	r   r   Zintermediate_sizer;   r<   r=   r   r   r#   r   r   r   r
   p  s>   	z%BertParallelTransformerLayer.__init__c                 C   s<   |  ||}| ||}| |}| |}| ||}|S r   )r   r   r   r   rd   )r   rV   r   Zattention_output_parallelZattention_self_outputZintermediate_output_parallelZlayer_outputr   r   r   r     s   
z$BertParallelTransformerLayer.forwardr   r   r   r   r   r   P  s
    '*r   )re   rP   r6   r   Ztorch.nn.initr0   r   Z#apex.normalization.fused_layer_normr   r   Zmegatron_utilr   Moduler   r    rf   Zjitscriptr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s2   l M
5ld	
  5v