import math
import os
from typing import Optional, Union

import addict
import torch
from torch import nn
from torch.nn import functional as F
from transformers.modeling_utils import PreTrainedModel

from modelscope.utils.constant import ModelFile

from .configuration import GPTMoEConfig


class GPTMoESelfAttention(nn.Module):
    """Parallel self-attention layer abstract class.

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.hidden_size_per_attention_head = \
            self.hidden_size // self.num_attention_heads

        # Fused projection producing query, key and value in one matmul.
        self.query_key_value = nn.Linear(self.hidden_size,
                                         3 * self.hidden_size)
        self.softmax = nn.Softmax(dim=-1)
        self.attention_dropout = nn.Dropout(
            config.attention_probs_dropout_prob)

        # Output projection.
        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout_prob)

    def _transpose_for_scores(self, tensor):
        """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        """
        new_tensor_shape = tensor.size()[:-1] + (
            self.num_attention_heads, self.hidden_size_per_attention_head)
        tensor = tensor.view(*new_tensor_shape)
        return tensor.permute(0, 2, 1, 3)

    def _split_tensor_along_last_dim(self,
                                     tensor,
                                     num_partitions,
                                     contiguous_split_chunks=False):
        # Split the last dimension into `num_partitions` equal chunks.
        last_dim = tensor.dim() - 1
        last_dim_size = tensor.size()[last_dim] // num_partitions
        tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
        # Note: torch.split does not create contiguous tensors by default.
        if contiguous_split_chunks:
            return tuple(chunk.contiguous() for chunk in tensor_list)
        return tensor_list

    def forward(self, hidden_states, ltor_mask, is_infer=False):
        # hidden_states: [b, s, h]; ltor_mask: left-to-right (causal) mask.
        tgt_len = hidden_states.size(1)
        ltor_mask = torch.reshape(ltor_mask, [1, 1, tgt_len, tgt_len])

        # Attention heads: project to [b, s, 3*h], then split into q, k, v.
        mixed_x_layer = self.query_key_value(hidden_states)
        (mixed_query_layer, mixed_key_layer,
         mixed_value_layer) = self._split_tensor_along_last_dim(
             mixed_x_layer, 3)

        # Reshape and transpose to [b, np, s, hn].
        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)

        previous_type = value_layer.type()

        # Raw attention scores: [b, np, s, s].
        attention_scores = torch.matmul(query_layer,
                                        key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(
            self.hidden_size_per_attention_head)

        # At inference time rebuild the causal mask for the full key length.
        if is_infer:
            src_len = key_layer.size(2)
            ltor_mask = torch.tril(
                torch.ones((1, src_len, src_len),
                           device=hidden_states.device)).view(
                               1, 1, src_len, src_len).type(previous_type)

        # Apply the left-to-right mask: masked positions receive -10000.
        converted_mask = 10000.0 * (1.0 - ltor_mask)
        attention_scores = torch.mul(attention_scores,
                                     ltor_mask) - converted_mask

        # Attention probabilities: [b, np, s, s].
        attention_probs = self.softmax(attention_scores)
        attention_probs = self.attention_dropout(attention_probs)

        # Context layer: [b, np, s, hn] -> [b, s, h].
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.hidden_size, )
        context_layer = context_layer.view(*new_context_layer_shape)

        # Output: [b, s, h].
        output = self.dense(context_layer)
        output = self.output_dropout(output)
        return output
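

# A minimal shape walk-through of the attention layer, kept as a comment so
# importing this module stays side-effect free. The config fields used are
# the ones GPTMoESelfAttention reads above; the concrete values are
# illustrative assumptions, not checkpoint defaults:
#
#   >>> cfg = GPTMoEConfig(hidden_size=64, num_attention_heads=4,
#   ...                    attention_probs_dropout_prob=0.1,
#   ...                    hidden_dropout_prob=0.1)
#   >>> attn = GPTMoESelfAttention(cfg).eval()
#   >>> x = torch.randn(2, 8, 64)                  # [b, s, h]
#   >>> mask = torch.tril(torch.ones(1, 1, 8, 8))  # causal mask
#   >>> attn(x, mask).shape
#   torch.Size([2, 8, 64])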


class GPTMoEMLP(nn.Module):
    """MLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        # Project to 4h.
        self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size)
        self.activation_func = F.gelu
        # Project back to h.
        self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states):
        # [b, s, 4h]
        intermediate_parallel = self.dense_h_to_4h(hidden_states)
        intermediate_parallel = self.activation_func(intermediate_parallel)
        # [b, s, h]
        output = self.dense_4h_to_h(intermediate_parallel)
        output = self.dropout(output)
        return output


class GPTMoETransformerLayer(nn.Module):
    """A single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    """

    def __init__(self, config):
        super().__init__()
        # Layernorm on the input data.
        self.input_layernorm = nn.LayerNorm(
            config.hidden_size, eps=config.layernorm_epsilon)
        # Self attention.
        self.attention = GPTMoESelfAttention(config)
        # Layernorm on the attention output.
        self.post_attention_layernorm = nn.LayerNorm(
            config.hidden_size, eps=config.layernorm_epsilon)
        # MLP.
        self.mlp = GPTMoEMLP(config)

    def forward(self, hidden_states, ltor_mask):
        # Pre-norm architecture: layernorm -> sublayer -> residual add.
        layernorm_output = self.input_layernorm(hidden_states)
        attention_output = self.attention(layernorm_output, ltor_mask)
        # Residual connection.
        layernorm_input = hidden_states + attention_output
        # Layer norm post the self attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)
        mlp_output = self.mlp(layernorm_output)
        # Second residual connection.
        output = layernorm_input + mlp_output
        return output
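

# Illustrative only: a single layer maps [b, s, h] -> [b, s, h], so layers
# stack directly, as GPTMoETransformer does below. The config values are
# assumptions for the sketch, not real hyperparameters:
#
#   >>> cfg = GPTMoEConfig(hidden_size=64, num_attention_heads=4,
#   ...                    layernorm_epsilon=1e-5,
#   ...                    attention_probs_dropout_prob=0.1,
#   ...                    hidden_dropout_prob=0.1)
#   >>> layer = GPTMoETransformerLayer(cfg).eval()
#   >>> x = torch.randn(2, 8, 64)
#   >>> mask = torch.tril(torch.ones(1, 1, 8, 8))
#   >>> layer(x, mask).shape
#   torch.Size([2, 8, 64])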


class GPTMoETransformer(nn.Module):
    """Transformer class."""

    def __init__(self, config):
        super().__init__()
        self.input_tensor = None

        # Number of layers.
        self.num_layers = config.num_hidden_layers
        self.layers = torch.nn.ModuleList(
            [GPTMoETransformerLayer(config) for _ in range(self.num_layers)])

        # Final layer norm before output.
        self.final_layernorm = nn.LayerNorm(
            config.hidden_size, eps=config.layernorm_epsilon)

    def _get_layer(self, layer_number):
        return self.layers[layer_number]

    def forward(self, hidden_states, attention_mask):
        for index in range(self.num_layers):
            layer = self._get_layer(index)
            hidden_states = layer(hidden_states, attention_mask)

        # Final layer norm.
        hidden_states = self.final_layernorm(hidden_states)
        return hidden_states


class GPTMoETransformerLanguageModel(nn.Module):
    """Transformer language model.

    Arguments:
        transformer_hparams: transformer hyperparameters
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
        embedding_dropout_prob: dropout probability for embeddings
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    """

    def __init__(self, config):
        super().__init__()
        # Embeddings.
        self.word_embeddings = nn.Embedding(config.vocab_size,
                                            config.hidden_size)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size)
        self.embedding_dropout = nn.Dropout(config.hidden_dropout_prob)

        # Transformer.
        self.transformer = GPTMoETransformer(config)

    def forward(self, input_ids, attention_mask, position_ids):
        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = words_embeddings + position_embeddings
        transformer_input = self.embedding_dropout(embeddings)
        transformer_output = self.transformer(transformer_input,
                                              attention_mask)
        # The output projection is tied to the input word embeddings.
        logits = F.linear(transformer_output, self.word_embeddings.weight)
        return logits


class GPTMoEModel(PreTrainedModel):

    config_class = GPTMoEConfig

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def __init__(self, config):
        super().__init__(config)
        self.language_model = GPTMoETransformerLanguageModel(config)

    def forward(self,
                input_ids,
                attention_mask=None,
                position_ids=None,
                labels=None,
                **kwargs):
        seq_length = input_ids.size(1)
        # Build a lower-triangular (causal) attention mask.
        attention_mask = torch.tril(
            torch.ones((1, 1, seq_length, seq_length),
                       dtype=torch.long,
                       device=input_ids.device))
        if position_ids is None:
            position_ids = torch.arange(
                seq_length, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        logits = self.language_model(input_ids, attention_mask, position_ids)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.config.vocab_size), labels.view(-1))
        return addict.Dict(loss=loss, logits=logits)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[
            str, os.PathLike]]):
        config = cls.config_class.from_pretrained(
            pretrained_model_name_or_path)
        model = cls(config)
        state_dict_file = os.path.join(pretrained_model_name_or_path,
                                       ModelFile.TORCH_MODEL_BIN_FILE)
        state_dict = torch.load(state_dict_file)
        if 'state_dict' in state_dict:
            state_dict = state_dict['state_dict']
        # Strip the training-time prefix so keys match this module tree.
        state_dict = {
            k.replace('model.language_model', 'language_model'): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)
        return model

    def prepare_inputs_for_generation(self, input_ids, *args, **kwargs):
        return {'input_ids': input_ids}