o
    *j'"                     @   s   d Z ddlZddlmZ ddlm  mZ ddlmZm	Z	 ddl
mZ ddlmZ ddd	ZG d
d dejjZG dd dejjZdd ZdS )zGPT-2 model.    N)mpuprint_rank_0)PromptSpell   )GPT2ParallelTransformer{Gz?c                    s    fdd}|S )zInit method based on normal distribution.

    This is only used for embeddings. The transformer has its
    own initializer.
    c                    s   t jjj| d dS )N        )meanstd)torchnninitZnormal_)Ztensorr
    n/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/mglm/model/modeling_glm.pyinit_!   s   z!init_method_normal.<locals>.init_r   )r
   r   r   r   r   init_method_normal   s   r   c                       sN   e Zd ZdZ								d fdd		Zdd
dZddddddZ  ZS )GLMModelzGLM Language model.

    The output of the forward method are the logits (parallel or
    serial depending on the `parallel_output` flag.
    r   TFNlstm      ?c                    s   t t|   || _|| _|| _tdd}tj|||d| _	t
|||||	||||
||||d| _|d ur>t|| j|| _d S d S )Nr   r   init_method)attention_scalerelative_encodingblock_position_encoding)superr   __init__parallel_outputoutput_predicthidden_sizer   r   VocabParallelEmbeddingword_embeddingsr   transformerr   prompt_spell)self
num_layers
vocab_sizer   num_attention_headsembedding_dropout_probattention_dropout_proboutput_dropout_probmax_sequence_lengthmax_memory_lengthcheckpoint_activationscheckpoint_num_layersr   r   r   r   Zspell_lengthZ
spell_funcr   r   	__class__r   r   r   .   s8   

zGLMModel.__init__c                 C   s`   d}| j d | jd |d ur*|d| d7 }t|D ]}| jj| d qt| d S )NzFreeze transformerFz tune z prefix layersT)r!   Zrequires_grad_r"   rangeZlayersr   )r$   Ztune_prefix_layersZlog_strir   r   r   freeze_transformerc   s   zGLMModel.freeze_transformer)return_memorydetach_memory
prompt_posc                G   s   | d}| |}	|	}
|d ur)|
 }
|  }tj||jdd}||
||f< | j|
|||||d}|\}}|}| j	r\t
|}t|| jj}| jrS|g|R S t
|g|R S |g|R S )Nr   )devicer   )r4   r5   )sizer!   cloner#   r   Zaranger7   Z	unsqueezer"   r   r   copy_to_model_parallel_regionFlinearweightr   !gather_from_model_parallel_region)r$   Z	input_idsZposition_idsZattention_maskr4   r5   r6   ZmemsZ
batch_sizeZwords_embeddingsZ
embeddingsZprompt_embedsZbatch_indexZtransformer_outputZlogitsZhidden_layersZoutputslogits_parallelr   r   r   forwardm   sB   
	


zGLMModel.forward)r   TFFTNr   r   N)__name__
__module____qualname____doc__r   r3   r@   __classcell__r   r   r/   r   r   '   s     
5r   c                       s0   e Zd ZdZ			d fdd	Zdd Z  ZS )	EncoderDecoderzSeq2Seq Transformer Model
    The output of the forward method are the logits (parallel or serial depending on the `parallel_output` flag).
    r   Tc                    sv   t t|   || _|| _tdd}tj|||d| _t	|||||	||||
|
| _
t	|||||	||||
|dd| _d S )Nr   r   r   T)Zuse_decoder_layer)r   rG   r   r   r   r   r   r    r!   r   encoderdecoder)r$   r%   r&   r   r'   r(   r)   r*   r+   r,   r-   r.   r   r   r   r/   r   r   r      s2   
zEncoderDecoder.__init__c                 C   sv   |  |}|  |}| |||\}	}
| |||\}}
| jr8t|}t|| j j}| j	r2|fS t
|fS |fS rA   )r!   rH   rI   r   r   r:   r;   r<   r=   r   r>   )r$   Z
source_idsZ
target_idsZsource_position_idsZtarget_position_idsZsource_maskZtarget_maskZsource_embeddingsZtarget_embeddingsZencoder_output_Zdecoder_outputZoutput_parallelr?   r   r   r   r@      s"   


zEncoderDecoder.forward)r   TT)rB   rC   rD   rE   r   r@   rF   r   r   r/   r   rG      s    +rG   c                 C   s   dg i}g dd}|   D ]@}t|tjtjjfr+|d dd t|j	 D  q|d dd t|j
 D  |d dd t|j
 D  q||fS )Nparamsr   )rK   Zweight_decayc                 S   s   g | ]}|d ur|j r|qS rA   Zrequires_grad).0pr   r   r   
<listcomp>   s
    z@glm_get_params_for_weight_decay_optimization.<locals>.<listcomp>c                 S   s*   g | ]\}}|d ur|j r|dkr|qS NZbiasrL   rM   nrN   r   r   r   rO      
    c                 S   s*   g | ]\}}|d ur|j r|dkr|qS rP   rL   rQ   r   r   r   rO      rS   )modules
isinstancer   Z	LayerNormr   r   extendlist_parametersvaluesitems)moduleZweight_decay_paramsZno_weight_decay_paramsZmodule_r   r   r   ,glm_get_params_for_weight_decay_optimization   s   


r\   )r   )rE   r   Ztorch.nnr   Ztorch.nn.functionalZ
functionalr;   Zmegatron_utilr   r   Z'modelscope.models.nlp.mglm.model.promptr   r"   r   r   Moduler   rG   r\   r   r   r   r   <module>   s   
rI