o
    *j                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	 d dl
Z
d dlmZmZ d dlmZ d dlmZmZmZmZ d dlmZ d dl
mZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) G dd dej*Z+G dd dej*Z,G dd dej*Z-dd Z.G dd dej*Z/G dd dej*Z0G dd dZ1dd  Z2d!d" Z3e
j4j5d#e
j6d$e
j6d%e
j6d&e7d'e
j6f
d(d)Z8e
j4j5d#e
j6d$e
j6d%e
j6d&e7d'e
j6f
d*d+Z9G d,d- d-ej*Z:G d.d/ d/ej*Z;G d0d1 d1ej*Z<d2d3 Z=d4d5 Z>G d6d7 d7eZ?d8d9 Z@d:d; ZAdMd>d?ZBG d@dA dAZCdBdC ZDdDeeEe
j6f dEe?dFeFd'eeEe
j6f fdGdHZGG dIdJ dJee)ZHG dKdL dLZIdS )N    N)OrderedDict)CallableDictListOptionalUnion)get_argsmpu)get_global_memory_buffer)AttnMaskTypeFloat16Module	LayerNormbias_gelu_impl)FusedScaleMaskSoftmax)nn)
functional)PreTrainedModel)
TorchModel)
GPT3Config)TextGenerationModelOutputTokenGeneratorOutput)init_megatron_util)pre_load)StreamingOutputMixinc                       s(   e Zd ZdZ fddZdd Z  ZS )GPT3ParallelMLPzMLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    c                    sR   t    tj|j|jd|dd| _|j| _tj	| _
tj|j|jd|dd| _d S )NFT)gather_outputinit_methodskip_bias_addZinput_is_parallelr   r   )super__init__r	   ColumnParallelLinearhidden_sizeZffn_hidden_sizedense_h_to_4hbias_gelu_fusionFZgeluactivation_funcRowParallelLineardense_4h_to_hselfconfigr   output_layer_init_method	__class__ l/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/gpt3/distributed_gpt3.pyr    /   s"   
zGPT3ParallelMLP.__init__c                 C   sD   |  |\}}| jrt||}n| || }| |\}}||fS N)r#   r$   r   r&   r(   )r*   hidden_statesZintermediate_parallelZbias_paralleloutputZoutput_biasr/   r/   r0   forwardE   s   zGPT3ParallelMLP.forward__name__
__module____qualname____doc__r    r4   __classcell__r/   r/   r-   r0   r   '   s    r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )GPT3Embeddinga  Language model embeddings.

    Arguments:
        hidden_size: hidden size
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
        embedding_dropout_prob: dropout probability for embeddings
        init_method: weight initialization method
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    c                    sr   t    |j| _|| _tj|j| j| jd| _t	|j
| j| _| | jj |j| _|j| _t|j| _d S )N)r   )r   r    r"   r   r	   ZVocabParallelEmbedding
vocab_sizeword_embeddingsr   Z	Embeddingmax_position_embeddingsposition_embeddingsweightfp32_residual_connectionsequence_parallelDropouthidden_dropoutembedding_dropout)r*   r+   r   r-   r/   r0   r    e   s   
zGPT3Embedding.__init__c                 C   s8   | j jjd d| j j_| jjjd d| jj_dS )z%Zero out all parameters in embedding.r   TN)r=   r@   dataZfill_Zsharedr?   r*   r/   r/   r0   zero_parametersz   s   
zGPT3Embedding.zero_parametersc                 C   s   |  |}| |}|| }|dd }| jr| }| jrCt|}t	 
  | |}W d    |S 1 s<w   Y  |S | |}|S )Nr      )r=   r?   	transpose
contiguousrA   floatrB   r	   Z#scatter_to_sequence_parallel_regionget_cuda_rng_trackerforkrE   )r*   	input_idsposition_idsZwords_embeddingsr?   Z
embeddingsr/   r/   r0   r4      s    




zGPT3Embedding.forward)r6   r7   r8   r9   r    rH   r4   r:   r/   r/   r-   r0   r;   W   s
    r;   c                       s,   e Zd Z fddZ			dddZ  ZS )NoopTransformerLayerc                    s   t    || _d S r1   )r   r    layer_numberr*   rR   r-   r/   r0   r       s   

zNoopTransformerLayer.__init__Nc                 C   s   |  S r1   )clone)r*   r2   attention_maskencoder_outputZenc_dec_attn_maskinference_paramsr/   r/   r0   r4      s   zNoopTransformerLayer.forwardNNN)r6   r7   r8   r    r4   r:   r/   r/   r-   r0   rQ      s    rQ   c                 C   s   |  |d | S )Ng     )masked_fill_)attention_scoresrU   r/   r/   r0   attention_mask_func   s   r[   c                       s*   e Zd Zejf fdd	Zdd Z  ZS )GPT3CoreAttentionc                    s   t    |j| _|j| _|j| _|j| _| jrd| _td|| _|| _|j	| _	|j
|j }t }t||| _t||j| _t|j|| _d }t| j| _| jr_| j}|  j|9  _t| j| j| j|jt| j|| _t|j| _d S )NTrI   )r   r    fp16bf16Zapply_query_key_layer_scalingZattention_softmax_in_fp32maxrR   attn_mask_typerB   kv_channelsnum_attention_headsr	   $get_tensor_model_parallel_world_sizedividehidden_size_per_partitionhidden_size_per_attention_head!num_attention_heads_per_partitionmathsqrtnorm_factorr   Zmasked_softmax_fusionr[   scale_mask_softmaxr   rC   attention_dropout)r*   r+   rR   r`   projection_size
world_sizeZcoeffr-   r/   r0   r       s@   
zGPT3CoreAttention.__init__c                 C   s  | d| d| d| df}||d |d |d  d}||d |d |d  d}t |d |d  |d |d f|jd}tj||dd|dddddd| j d	}|j| }| 	||}	| j
st   | |	}	W d    n1 sw   Y  n| |	}	| d| d| d| df}|| d|d |d  d}|	|d |d  |d d}	t|	|dd}
|
j| }
|
dddd }
|
  d d
 | jf }|
j| }
|
S )NrI      r      r	                 ?)betaalpha)sizeviewr
   Z
get_tensordtypetorchZbaddbmmrJ   rj   rk   rB   r	   rM   rN   rl   ZbmmZpermuterK   re   )r*   query_layer	key_layervalue_layerrU   Zoutput_sizeZmatmul_input_bufferZmatmul_resultrZ   Zattention_probscontext_layerZnew_context_layer_shaper/   r/   r0   r4      s^   







zGPT3CoreAttention.forward)r6   r7   r8   r   paddingr    r4   r:   r/   r/   r-   r0   r\      s    +r\   c                       s2   e Zd ZdZ fddZdd Zd	ddZ  ZS )
GPT3ParallelAttentionzParallel self-attention layer abstract class.

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    c                    s   t    td|| _|j| _|j|j }t }t	||j| _
t	|j|| _tj|jd| d|d| _t|| j| _tj||jd|dd| _d S )NrI   rq   F)r   r   Tr   )r   r    r_   rR   params_dtypera   rb   r	   rc   rd   rf   rg   r!   r"   query_key_valuer\   core_attentionr'   dense)r*   r+   r   r,   rR   rm   rn   r-   r/   r0   r    6  s2   
zGPT3ParallelAttention.__init__c                 C   s"   t j||| j| j| jt j dS )Nry   device)rz   emptyrg   rf   r   cudacurrent_device)r*   Zinference_max_sequence_len
batch_sizer/   r/   r0   _allocate_memoryV  s   z&GPT3ParallelAttention._allocate_memoryNc                 C   sn  |r+| j |jvr#|j}|j}| ||}| ||}||f|j| j < n|j| j  \}}| |\}}	| d d | jd| j f }
|j	|
 }t
|d\}}}|r|j}||d }||dkseJ |j}||d }||dksxJ ||||||df< ||||||df< |d |||df }|d |||df }| ||||}| |\}}||fS )Nrp   rq   rI   r   .)rR   key_value_memory_dictmax_sequence_lenmax_batch_sizer   r   rw   rg   rf   rx   r	   Zsplit_tensor_along_last_dimbatch_size_offsetsequence_len_offsetr   r   )r*   r2   rU   rW   Zinf_max_seq_lenZinf_max_batch_sizeinference_key_memoryinference_value_memoryZmixed_x_layer_Znew_tensor_shaper{   r|   r}   Zbatch_startZ	batch_endZsequence_startZsequence_endr~   r3   biasr/   r/   r0   r4   _  sd   


zGPT3ParallelAttention.forwardr1   )r6   r7   r8   r9   r    r   r4   r:   r/   r/   r-   r0   r   /  s
     	r   c                   @   s&   e Zd ZdddZdd Zdd ZdS )	nullcontextNc                 C   s
   || _ d S r1   enter_result)r*   r   r/   r/   r0   r         
znullcontext.__init__c                 C   s   | j S r1   r   rG   r/   r/   r0   	__enter__  s   znullcontext.__enter__c                 G   s   d S r1   r/   )r*   excinfor/   r/   r0   __exit__  s   znullcontext.__exit__r1   )r6   r7   r8   r    r   r   r/   r/   r/   r0   r     s    
r   c                 C   s    t j| | ||d}|| }|S )N)ptraining)r%   Zdropout)xr   residualprobr   outr/   r/   r0   bias_dropout_add  s   r   c                        fdd}|S )Nc                    s   t | ||| S r1   r   r   r   r   r   r   r/   r0   _bias_dropout_add     z/get_bias_dropout_add.<locals>._bias_dropout_addr/   )r   r   r/   r   r0   get_bias_dropout_add  s   r   r   r   r   r   returnc                 C      t | |||dS )NTr   r   r/   r/   r0   bias_dropout_add_fused_train     r   c                 C   r   )NFr   r   r/   r/   r0    bias_dropout_add_fused_inference  r   r   c                       s*   e Zd ZdZ fddZdddZ  ZS )GPT3ParallelTransformerLayerzA single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    c                    s   t    || _|j| _|j| _|j| _t|j|j|j	|j
d| _t||||| _|j| _|j| _t|j|j|j	|j
d| _t|||| _ttjdd }ttjdd }|dkpd|dkod|dk}|rlt| _d S tj| _d S )Nepsno_persist_layer_normrB   .r   rI   
   )r   r    rR   (apply_residual_connection_post_layernormr^   rA   r   r"   layernorm_epsilonr   rB   input_layernormr   self_attentionrD   bias_dropout_fusionpost_attention_layernormr   mlpintrz   __version__splitr   Zenable_gradbias_dropout_add_exec_handler)r*   r+   r   r,   rR   ZTORCH_MAJORZTORCH_MINORZuse_nvfuserr-   r/   r0   r      sH   
z%GPT3ParallelTransformerLayer.__init__Nc                 C   s
  |  |}| j|||d\}}| jr|}n|}| jr#| jr t}nt}nt| j}|   |||	||| j
}	W d    n1 sBw   Y  | |	}| |\}
}| jrY|}n|	}|   ||
|	||| j
}W d    n1 suw   Y  tj||jdd}|S )NrW   T)inprequires_grad
keep_graph)r   r   r   r   r   r   r   r   r   	expand_asrD   r   r   r	   make_viewless_tensorr   )r*   r2   rU   rW   Zlayernorm_outputZattention_outputZattention_biasr   Zbias_dropout_add_funcZlayernorm_inputZ
mlp_outputZmlp_biasr3   r/   r/   r0   r4     sH   




z$GPT3ParallelTransformerLayer.forwardr1   r5   r/   r/   r-   r0   r     s    -r   c                       s:   e Zd ZdZ			d
 fdd	Zdd Zddd	Z  ZS )GPT3ParallelTransformerzTransformer class.Tc                    s   t    j| _j| _|| _|| _|| _d | _j| _j	| _
fdd | j
dkr<d| _
tjtdg| _ntj fddt| j
D | _| jrb| jrdtjjjjd| _d S d S d S )Nc                    s   t  | S r1   )r   )rR   )r+   r   r,   r/   r0   build_layerX  s   z5GPT3ParallelTransformer.__init__.<locals>.build_layerr   rI   c                    s   g | ]} |d  qS rI   r/   .0i)r   r/   r0   
<listcomp>b      z4GPT3ParallelTransformer.__init__.<locals>.<listcomp>r   )r   r    r^   rA   post_layer_normpre_processpost_processinput_tensorrB   num_hidden_layers
num_layersrz   r   Z
ModuleListrQ   layersranger   r"   r   r   final_layernorm)r*   r+   r   r,   r   r   r   r-   )r   r+   r   r,   r0   r    B  s0   

z GPT3ParallelTransformer.__init__c                 C   s
   | j | S r1   )r   rS   r/   r/   r0   
_get_layerl  r   z"GPT3ParallelTransformer._get_layerNc                 C   s   | j s| j}tj|ddd}| jrt  }nt }| t| j	D ]}| 
|}||||d}q#W d    n1 s<w   Y  | jrL| jrL| |}|S )NT)r   r   r   )r   r   r	   r   rB   rM   rN   r   r   r   r   r   r   r   )r*   r2   rU   rW   Zrng_contextindexlayerr/   r/   r0   r4   o  s.   


zGPT3ParallelTransformer.forward)TTTr1   )r6   r7   r8   r9   r    r   r4   r:   r/   r/   r-   r0   r   ?  s    *r   c                       s.   e Zd ZdZ fddZ		dddZ  ZS )GPT3TransformerLanguageModela  Transformer language model.

    Arguments:
        transformer_hparams: transformer hyperparameters
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
        embedding_dropout_prob: dropout probability for embeddings
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    c                    s@   t    |j| _|| _d | _t|| j| _t|| j|| _d S r1   )	r   r    r"   r   encoder_hidden_stater;   	embeddingr   encoderr)   r-   r/   r0   r      s   

z%GPT3TransformerLanguageModel.__init__Nc                 C   sL   |  ||}|d u r| jd ur| j|||d}|S | j}|S ||j}|S )Nr   )r   r   r   tory   )r*   Zenc_input_idsZenc_position_idsZenc_attn_maskrW   Zenc_hidden_statesZencoder_inputrV   r/   r/   r0   r4     s   
	z$GPT3TransformerLanguageModel.forward)NNr5   r/   r/   r-   r0   r     s    r   c                    r   )z!Init method based on N(0, sigma).c                       t jj| d dS Nrr   )meanstdr   initZnormal_tensorsigmar/   r0   init_     z!init_method_normal.<locals>.init_r/   )r   r   r/   r   r0   init_method_normal  s   r   c                    s"   | t d|    fdd}|S )z3Init method based on N(0, sigma/sqrt(2*num_layers).g       @c                    r   r   r   r   r   r/   r0   r     r   z(scaled_init_method_normal.<locals>.init_)rh   ri   )r   r   r   r/   r   r0   scaled_init_method_normal  s   r   c                       sF   e Zd ZeZ fddZdd Zedd Z				d
dd	Z	  Z
S )	GPT3Modelc                    s.   t  | t|t|jt|j|j| _d S r1   )r   r    r   r   Zinit_method_stdr   r   language_model)r*   r+   r-   r/   r0   r      s   

zGPT3Model.__init__c                 C   s   | j jjjS r1   )r   r   r=   r@   rG   r/   r/   r0   word_embeddings_weight  s   z GPT3Model.word_embeddings_weightc                 C   s\   |  d}ttjdd||f| jd}|dk }tj|tj| jd}|d| }||fS )NrI   r   g      ?r   r   )	rw   rz   Ztrilonesr   Zarangelong	unsqueezer   )tokensZ
seq_lengthrU   rP   r/   r/   r0   %build_attention_mask_and_position_ids  s   

z/GPT3Model.build_attention_mask_and_position_idsNc                 K   s   |d u r|d u r|  |\}}| j||||d}tj||  d dd| jj}d }	|d urG|dd	 }t
|  |}	|	dd	 }	t|}
|
dd	 }
|
|	fS )Nr   FTr   rI   )r   r   r	   Z/LinearWithGradAccumulationAndAsyncCommunicationapplyr   r+   rB   rJ   rK   Zvocab_parallel_cross_entropyrT   rL   Z(gather_from_tensor_model_parallel_region)r*   rO   rU   rP   rW   labelskwargsZ	lm_outputZlogits_parallellosseslogitsr/   r/   r0   r4     s.   
zGPT3Model.forward)NNNN)r6   r7   r8   r   Zconfig_classr    r   staticmethodr   r4   r:   r/   r/   r-   r0   r     s    
r   c                 C   s,   | t | |d d k }| |td dS )z-Set the logits for none top-k values to -inf.r   ).rp   N-InfN)rz   ZtopkrY   rL   )r   top_kfilter_r/   r/   r0   !modify_logits_for_top_k_filtering,  s   r   c                 C   s   t j| dd\}}|jddjdd}||k}|ddddf  |ddddf< d|d< |d||}| |td	 dS )
z-Set the logits for none top-p values to -inf.TZ
descendingrp   dimNrI   r   ).r   r   )rz   sortsoftmaxZcumsumrT   ZscatterrY   rL   )r   top_pZsorted_logitsZsorted_indicesZcumulative_probsr   r/   r/   r0   !modify_logits_for_top_p_filtering3  s   (r  rr   rs   c                 C   s   | j dks	J d|dkr|dksJ dtj| dd}nU|  } |dkr*| | |dkrQ|dks6J d	|| dksAJ d
|rK||k sKJ dt| | n|dkrb|dks]J dt| | | jdd}tj	|dd
d}|r~tj|d|d d}|S )a9   Sample and generate a token.
    Note: logits has the dimension [b, v] where b is the batch size
          and v is the vocabulary size.
    If vocab_size is provided, we will make sure the sample that is
    generated is in [0, vocab-size). This will avoid out of vocabulary
    generations due to padding.
    ro   z*expected the logits to be of [b, v] shape.rI   rr   z+cannot set both greedy and top-p samplings.rp   r  rs   z*cannot set both top-k and top-p samplings.z top-k is larger than logit size.z top-k is larger than vocab size.ztop-p should be in (0, 1].)Znum_samplesr   )minr_   )ndimrz   ZargmaxrT   Zdiv_rw   r   r  r  Zmultinomialrx   clamp)r   r   r  temperaturer<   ZsamplesZprobsr/   r/   r0   sampleI  s*   


r  c                   @   s    e Zd ZdZdd Zdd ZdS )InferenceParamszInference parameters that are passed to the main model in order
    to efficienly calculate and store the context during inference.c                 C   s"   || _ || _d| _d| _i | _dS )zNote that offsets are set to zero and we always set the
        flag to allocate memory. After the first call, make sure to
        set this flag to False.r   N)r   r   r   r   r   )r*   r   r   r/   r/   r0   r    }  s
   
zInferenceParams.__init__c                 C   s|   t | jdkrtd| j D ]+}| j| \}}t ||jd ks$J |dd|f }|dd|f }||f| j|< qdS )zswap between batchesr   z"should not swap when dict in emptyrI   N)lenr   
ValueErrorkeysshape)r*   Z	batch_idxrR   r   r   Znew_inference_key_memoryZnew_inference_value_memoryr/   r/   r0   swap_key_value_dict  s   

z#InferenceParams.swap_key_value_dictN)r6   r7   r8   r9   r    r  r/   r/   r/   r0   r  y  s    
r  c           
      C   sj   t j| ||}t j||}tj| ||d}g }t|D ]}tj||d | |d}	||	 q|S )Nr  )	r	   utilsrd   rw   rz   r   r   catappend)
r   Znum_partitionspartition_dimstrideZper_partition_sizeZper_partition_per_stride_sizeZpartitions_list
partitionsr   	partitionr/   r/   r0   split_into_partitions  s    
r  
state_dictmodelr  c                 C   sh   |dkr| S t  }| D ]#\}}|j| | jkrqt|jd}|j}t| | |||| | |< q| S )NrI   r   )r	   get_tensor_model_parallel_rankZnamed_parametersr  r_   r  Zpartition_strider  )r  r  r  rankname
parametersr  r  r/   r/   r0   split_state_dict  s   

r!  c                
       s   e Zd Z	d%dd fddZd&def fdd	Z					d'd
dZ				d(ddZd)ddZe	
 d&ddZe	
 dd Zd*ddZ	d&dddefddZ			d+deeejf d eeee f d!ed"ee f fd#d$Z  ZS ),DistributedGPT3r  N)megatron_cfgc                   s   t  j|g|R i | t|||d t|| _t| j}| D ]}t	| q#|
tj
  | jjs;| jjrAt|| j}|| _t }	t dd }
|
d u rU|	n|
}
t |
 |	 }t|||d}t|||	|
 }| jj||ddd d | _d S )N)r  %checkpoint_tensor_model_parallel_size)tagstrictT)r&  )r   r    r   r   Zfrom_pretrainedr+   r   r   r	   Z8set_defaults_if_not_set_tensor_model_parallel_attributesr   rz   r   r]   r^   r   
dist_modelrc   r   getr  r   r!  load_state_dictrW   )r*   Z	model_dirr  Zpath_load_tagr#  argsr   r  paramZ	tensor_wsZckpt_wsZ	ckpt_rankZ
load_modelr-   r/   r0   r      s(   

zDistributedGPT3.__init__Tmodec                    s   |rd | _ t |S r1   )rW   r   train)r*   r,  r-   r/   r0   r-    s   zDistributedGPT3.trainc                 C   s.  | j |||| j|d\}}d }	|d u r| j j|d7  _nrtj| tj|jd}
|d u rAt|D ]\}}d|
||d f< q3n&t|D ]\}}d|
||d d f< qEt|D ]\}}d|
|d |d f< qX| }|
	d }
|

 }|dkrt
|	d }	nt
|	d|
 | }	t||	dS )N)rW   r   rI   r   r   rp   )r   loss)r'  rW   r   rw   rz   r   rL   r   	enumeraterx   sumZzero_r   )r*   r   rU   rP   r   prompts_lenZ
inputs_lenr   r   r.  Z	loss_maskr   lZmask_sumr/   r/   r0   r4     s8   
zDistributedGPT3.forwardFc           #      k   s   | d| jj}| d| jj}| d| jj}	| d|d| jj }
|d}|}|d u r=tj|dg|j	d}|
  }t
|
| jj}||krRtd||d }|dkrqtj|||j	d }tj||fd	d
}t||| _| jj}tj|tjtj d}t|\}}d}t||D ]}|d d ||f }|d d ||f }|d||d |f }| |||j}|d d d	d d f }t||||	| jjd}||k}|| |||f< t|d d d |d f dV  |}|r|dk | @ }|dk |d d |d f dk @ | @ }||B } n&|r7|dk | @ }|dk | @ }!||!B } n
||k | @ } || B }t|}"|rS|"rS d S qd S )Nr   r  r  
max_lengthrI   r   r   -context length + tokens_to_generate too largerp   r  r   .)r   r  r  r<   )	sequencesit     ) popr+   r   r  r  rw   tokens_to_generaterz   r   r   r  itemr>   r  zerosr   r  r  rW   eod_idZuint8r   r   r   r   r   r   r  r<   r   byteall)#r*   r   r1  Z#use_eod_token_for_early_terminationZstop_on_double_eolZstop_on_eolr   r   r  r  r3  r   lengthsZmin_prompt_lengthZmax_sequence_lengthZ
pad_lengthpadsZtermination_idZis_generation_donerU   rP   prev_context_lengthcontext_length
tokens2usepositions2useattention_mask2user   Zlast_token_logitsZ
new_samplestartedZhit_double_eolZhit_two_eolsZ
done_tokenZhit_eoldoner/   r/   r0   r  
  s   
	




zDistributedGPT3.sample   rI   c           $         s  | d}|dksJ |dtj| dg|jd }| jj}tjd| jj	|jd
 | }tj||fdd}| d}	t|	| jj}	||	krMtdt||	| _t|}
d}tj|tjtj d	d}||d}t|\}}d}t||	D ]}|d d ||f }|d d ||f }|d
||d |f }| |||j}| d}tj|dd}|d d dd d f | }||krtj|dd d f dd\}}ntj|ddd\}}t |d d|  |! 
 }|d d|  | }|d d|  }g }t"t#|||D ]9\}\}} }!| |kr1||k}"|"r!q
|
$||! % | |d |  n|&|| |!f t'||krB nq
|
(|)  |d | rWd} n8|*dd |D }#||#d d f }|*dd |D |d d |f< |*dd |D d}| j+|# |}q||st|D ]}!|
$||! % ||! |d |  qt,|
j-dd dd t|t' } fddt|D } fddt|D }tj.|dd}tj.|dd}t/||dS )Nr   rI   prompt_lengthr   rp   r  r4  Fr   .ro   Tr  c                 S      g | ]}|d  qS )ro   r/   r   r9  r/   r/   r0   r         z/DistributedGPT3.beam_search.<locals>.<listcomp>c                 S   rI  r   r/   rJ  r/   r/   r0   r     rK  c                 S   rI  r   r/   rJ  r/   r/   r0   r     rK  c                 S   s   | d S )Nr   r/   )r   r/   r/   r0   <lambda>  s    z-DistributedGPT3.beam_search.<locals>.<lambda>)keyreversec                       g | ]} | d  qS rL  r/   r   Zsorted_hypsr/   r0   r     r   c                    rP  r   r/   r   rQ  r/   r0   r     r   )r5  scores)0rw   r7  rz   r   r   r9  r+   r;  r   r8  r   r  r  r>   r  r  rW   BeamHypothesesr:  Zfloat32r   r   r   repeatr   r   r   r   r%   Zlog_softmaxr  rx   divtruncr/  zipaddrT   r  r  is_doner_   newr  sortedbeamsstackr   )$r*   r   Z	beam_sizeZnum_return_genr   r   rH  Z
stop_tokenr?  Zfinal_sequence_lengthZbeam_hyprF  rR  rU   rP   r@  rA  rB  rC  rD  r   r<   Z	log_probsZ
new_scoresZsorted_scoresindicesZbest_beam_idsZ
best_wordsZbest_scoresZ
next_beamsZbeam_token_rankZtoken_idZ
beam_scoreZbeam_idZ&is_beam_token_worse_than_top_num_beamsZbest_batchesr/   rQ  r0   beam_searcht  s   








zDistributedGPT3.beam_searchc                 O   sF   |rd }| j |g|R i |D ]}|}q|S | j|g|R i |S r1   )r  r_  )r*   r   Z	do_sampler*  r   Zlast_outputr3   r/   r/   r0   generate  s   zDistributedGPT3.generatec                 O   s   | j |g|R i |S r1   )r  )r*   r   r*  r   r/   r/   r0   stream_generate  s   zDistributedGPT3.stream_generate c                 C   s   | j |||S r1   )r'  r  )r*   ZdestinationprefixZ	keep_varsr/   r/   r0   r    r   zDistributedGPT3.state_dictr  zOrderedDict[str, torch.Tensor]r&  c                 C   s   | j ||S r1   )r'  r)  )r*   r  r&  r/   r/   r0   r)    s   zDistributedGPT3.load_state_dicttarget_foldersave_checkpoint_namessave_functionr+   c                    s   d|d d< |d  dd  |d  dd  |d  dd  |d  dd  t j}t j}|| |d d	< t j||||fi |S )
Nzgpt3-generationZpipelinetyper  r  r#  Zmegatronr$  rn   )r7  r   Ztensor_model_parallel_sizeZpipeline_model_parallel_sizer   save_pretrained)r*   rd  re  rf  r+   r   Ztp_sizeZpp_sizer-   r/   r0   rh    s   
zDistributedGPT3.save_pretrained)r  )T)NNNNN)NTFF)rG  rI   )Nrb  FrX   )r6   r7   r8   r    boolr-  r4   r  r_  rz   Zno_gradr`  ra  r  r)  r   strosPathLiker   r   r   dictrh  r:   r/   r/   r-   r0   r"    sR    &
*

jq	


r"  c                   @   sh   e Zd Z		ddededefddZdd	 Z	
ddej	dede
ej	 fddZdededefddZd
S )rS  rs   F	num_beamslength_penaltyearly_stoppingc                 C   s"   || _ || _|| _g | _d| _dS )z7
        Initialize n-best list of hypotheses.
        g    eAN)ro  rp  rn  r\  worst_score)r*   rn  ro  rp  r/   r/   r0   r      s
   
zBeamHypotheses.__init__c                 C   s
   t | jS )z3
        Number of hypotheses in the list.
        )r  r\  rG   r/   r/   r0   __len__  s   
zBeamHypotheses.__len__Nhypsum_logprobsbeam_indicesc                 C   s   ||j d | j  }t| | jk s|| jkrL| j|||f t| | jkrCtdd t| jD }| j|d d = |d d | _dS t	|| j| _dS dS )z3
        Add a new hypothesis to the list.
        rp   c                 S   s   g | ]\}\}}}||fqS r/   r/   )r   idxsr   r/   r/   r0   r   0  s    z&BeamHypotheses.add.<locals>.<listcomp>r   rI   N)
r  ro  r  rn  rq  r\  r  r[  r/  r  )r*   rs  rt  ru  ZscoreZsorted_next_scoresr/   r/   r0   rX  %  s   zBeamHypotheses.addbest_sum_logprobscur_lenr   c                 C   s8   t | | jk r	dS | jrdS ||| j  }| j|k}|S )z
        If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst
        one in the heap, then we are done with this sentence.
        FT)r  rn  rp  ro  rq  )r*   rx  ry  Z	cur_scoreretr/   r/   r0   rY  8  s   
zBeamHypotheses.is_done)rs   Fr1   )r6   r7   r8   r   rL   ri  r    rr  rz   Z
LongTensorr   rX  rY  r/   r/   r/   r0   rS    s(    
	
rS  )r   rr   rs   N)Jrh   rk  collectionsr   typingr   r   r   r   r   rz   Zmegatron_utilr   r	   Zmegatron_util.global_varsr
   Zmegatron_util.modelr   r   r   r   Z!megatron_util.model.fused_softmaxr   r   Ztorch.nnr   r%   Ztransformers.modeling_utilsr   Zmodelscope.modelsr   Zmodelscope.models.nlp.gpt3r   Zmodelscope.outputsr   r   Zmodelscope.utils.megatron_utilsr   Z$modelscope.utils.nlp.load_checkpointr   Z!modelscope.utils.streaming_outputr   Moduler   r;   rQ   r[   r\   r   r   r   r   ZjitscriptZTensorrL   r   r   r   r   r   r   r   r   r   r  r  r  r  rj  r   r!  r"  rS  r/   r/   r/   r0   <module>   s   0A yna7	
B
0
  ]