o
    *jT                     @   s   d dl Z d dlZd dlmZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ ddlmZmZ ddlmZ e ZdZej ej!ej"dG dd deZ#dS )    N)OptionalTupleUnion)nn)CrossEntropyLoss)assert_device_mapget_device_map)Models)MODELS)AttentionBackboneModelOutputSeq2SeqLMOutputTokenGeneratorOutput)Tasks)
get_logger   )T5PreTrainedModelT5Stack)T5Configa_  
The input argument `head_mask` was split into two arguments `head_mask` and
`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`,
but this feature is deprecated and will be removed in future versions. If you do
not want to use any `decoder_head_mask` now, please set `decoder_head_mask =
torch.ones(num_layers, num_heads)`.
)Z	group_keymodule_namec                %       s  e Zd Zg dZdgZd2def fddZd2ddZd	d
 Zdd Z	dd Z
dd Zdd Zdd Zdd Z																d3deej deej deej deej deej deej deej deeeej   deeeej   d eej d!eej d"eej d#ee d$ee d%ee d&ee d'eeej ef f"d(d)Z							d4d*d+Zd"ejfd,d-Z fd.d/Zd0d1 Z  ZS )5T5ForConditionalGeneration)zencoder\.embed_tokens\.weightzdecoder\.embed_tokens\.weightzlm_head\.weightzMdecoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weightNconfigc                    s   t  | |j| _t|j|j| _t	|}d|_
d|_d|_t|| j| _t	|}d|_
d|_|j|_t|| j| _tj|j|jdd| _|   d| _|dkrZ|   d S d S )NFT)Zbiasauto)super__init__Zd_model	model_dimr   Z	EmbeddingZ
vocab_sizesharedcopydeepcopyZ
is_decoder	use_cacheZis_encoder_decoderr   encodernum_decoder_layers
num_layersdecoderZLinearlm_headZ	post_initmodel_parallelparallelize)selfr   
device_mapkwargsZencoder_configZdecoder_config	__class__ n/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/T5/text2text_generation.pyr   <   s&   

z#T5ForConditionalGeneration.__init__c                 C   sv   |d u rt t| jjttj n|| _t	| jt| jj | j
| j | j
| j | j| jj| _d| _d S )NT)r   lenr   blockrangetorchcudaZdevice_countr'   r   r%   r"   r#   tofirst_devicer$   )r&   r'   r+   r+   r,   r%   X   s   
z&T5ForConditionalGeneration.parallelizec                 C   sX   | j   | j  | j d| _ | jd| _| jd| _d| _d | _tj	  d S )NcpuF)
r   deparallelizer"   r2   r#   r$   r'   r0   r1   Zempty_cacher&   r+   r+   r,   r5   c   s   

z(T5ForConditionalGeneration.deparallelizec                 C      | j S N)r   r6   r+   r+   r,   get_input_embeddingsm      z/T5ForConditionalGeneration.get_input_embeddingsc                 C   s"   || _ | j| | j| d S r8   )r   r   set_input_embeddingsr"   r&   Znew_embeddingsr+   r+   r,   r;   p   s   z/T5ForConditionalGeneration.set_input_embeddingsc                 C   s
   || _ d S r8   r#   r<   r+   r+   r,   set_output_embeddingsu      
z0T5ForConditionalGeneration.set_output_embeddingsc                 C   r7   r8   r=   r6   r+   r+   r,   get_output_embeddingsx   r:   z0T5ForConditionalGeneration.get_output_embeddingsc                 C   r7   r8   )r   r6   r+   r+   r,   get_encoder{   r:   z&T5ForConditionalGeneration.get_encoderc                 C   r7   r8   )r"   r6   r+   r+   r,   get_decoder~   r:   z&T5ForConditionalGeneration.get_decoder	input_idsattention_maskdecoder_input_idsdecoder_attention_mask	head_maskdecoder_head_maskcross_attn_head_maskencoder_outputspast_key_valuesinputs_embedsdecoder_inputs_embedslabelsr   output_attentionsoutput_hidden_statesreturn_dictreturnc                 K   s|  |dur|n| j j}|dur|n| j j}|dur,|du r,| j j| j jkr,ttt |}|du r=| j	|||
||||d}n$|rat
|tsat|d t|dkrR|d ndt|dkr]|d ndd}|d }| jrptj| jj |dur|du r|du r| |}| jrtj| jj || jj}|dur|| jj}|dur|| jj}|dur|| jj}| j||||	||||||||d}|d }| jrtj| j	j | j| j	j| _|| jjj}| j jr|| jd  }| |}d}|durtd	d
}||d|d|d}|s*|f|dd  | }|dur(|f| S |S t|||j|j|j |j!|j"|j|j d	S )aH  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. T5 is a model
                with relative position embeddings so you should be able to pad the
                inputs on both the right and the left.

                Indices can be obtained using [`T5Tokenizer`]. See
                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
                for detail.

                [What are input IDs?](../glossary#input-ids)

                To know more on how to prepare `input_ids` for pretraining take a
                look a [T5 Training](./t5#training).
            attention_mask (`torch.FloatTensor` of shape `(batch_size,sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask
                values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
                Indices of decoder input sequence tokens in the vocabulary.

                Indices can be obtained using [`T5Tokenizer`]. See
                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
                for details.

                [What are decoder input IDs?](../glossary#decoder-input-ids)

                T5 uses the `pad_token_id` as the starting token for
                `decoder_input_ids` generation. If `past_key_values` is used,
                optionally only the last `decoder_input_ids` have to be input (see
                `past_key_values`).

                To know more on how to prepare `decoder_input_ids` for pretraining
                take a look at [T5 Training](./t5#training).
            decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
                Default behavior: generate a tensor that ignores pad tokens in
                `decoder_input_ids`. Causal mask will also be used by default.
            head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the self-attention modules in the
                encoder. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or
                `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the self-attention modules in the
                decoder. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
                    Mask to nullify selected heads of the cross-attention modules in
                    the decoder. Mask values selected in `[0, 1]`:

                    - 1 indicates the head is **not masked**,
                    - 0 indicates the head is **masked**.

            encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
                Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*,
                `optional`: *attentions*) `last_hidden_state` of shape `(batch_size,
                sequence_length, hidden_size)` is a sequence of hidden states at the
                output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            past_key_values (`tuple(tuple(torch.FloatTensor))` of length
                `config.n_layers` with each tuple having 4 tensors of shape
                `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):

                Contains precomputed key and value hidden states of the attention
                blocks. Can be used to speed up decoding.

                If `past_key_values` are used, the user can optionally input only
                the last `decoder_input_ids` (those that don't have their past key
                value states given to this model) of shape `(batch_size, 1)` instead
                of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to
                directly pass an embedded representation. This is useful if you want
                more control over how to convert `input_ids` indices into associated
                vectors than the model's internal embedding lookup matrix.
            decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`,
                *optional*):
                Optionally, instead of passing `decoder_input_ids` you can choose to
                directly pass an embedded representation. If `past_key_values` is
                used, optionally only the last `decoder_inputs_embeds` have to be
                input (see `past_key_values`). This is useful if you want more
                control over how to convert `decoder_input_ids` indices into
                associated vectors than the model's internal embedding lookup
                matrix.

                If `decoder_input_ids` and `decoder_inputs_embeds` are both unset,
                `decoder_inputs_embeds` takes the value of `inputs_embeds`.

            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned
                and can be used to speed up decoding (see `past_key_values`).

            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention
                layers. See `attentions` under returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See
                `hidden_states` under returned tensors for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain
                tuple.
            labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Labels for computing the sequence classification/regression loss.
                Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All
                labels set to `-100` are ignored (masked), the loss is only computed
                for labels in `[0, ..., config.vocab_size]`

        Returns:

        Examples:

        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration

        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer(
        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
        >>> ).input_ids  # Batch size 1
        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        >>> # studies have shown that owning a dog is good for you.
        N)rC   rD   rL   rG   rO   rP   rQ   r   r      )last_hidden_statehidden_states
attentions)rC   rD   rL   rK   encoder_hidden_statesZencoder_attention_maskrG   rI   r   rO   rP   rQ   g      i)Zignore_index)	lossZlogitsrK   Zdecoder_hidden_statesZdecoder_attentionscross_attentionsZencoder_last_hidden_staterW   Zencoder_attentions)#r   r   Zuse_return_dictr!   r    warningswarnZ2_T5ForConditionalGeneration__HEAD_MASK_WARNING_MSGFutureWarningr   
isinstancer   r-   r$   r0   r1   Z
set_devicer"   r3   _shift_rightr2   r#   weightdeviceZtie_word_embeddingsr   r   viewsizer   rK   rU   rV   rZ   rT   )r&   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   r   rO   rP   rQ   r(   rU   Zdecoder_outputsZsequence_outputZ	lm_logitsrY   Zloss_fctoutputr+   r+   r,   forward   s    !	



z"T5ForConditionalGeneration.forwardc	           
   	   K   s2   |d ur|d d dd f }||||||||dS )NrX   )rE   rK   rJ   rD   rG   rH   rI   r   r+   )
r&   rC   pastrD   rG   rH   rI   r   rJ   r(   r+   r+   r,   prepare_inputs_for_generation  s   z8T5ForConditionalGeneration.prepare_inputs_for_generationc                 C   s
   |  |S r8   )r_   )r&   rN   r+   r+   r,   %prepare_decoder_input_ids_from_labels  r?   z@T5ForConditionalGeneration.prepare_decoder_input_ids_from_labelsc                    s4   t  j|i |}tt|tjr|dS |d dS )Nr   )	sequences)r   generater   r^   r0   Tensor)r&   argsr(   rd   r)   r+   r,   rj     s   z#T5ForConditionalGeneration.generatec              	   C   s   |d u rt d |S d}|D ]1}d}|D ]}||d||jf }q|d j|d jks1J t|t|ks;J ||f }q|S )NzHYou might want to consider setting `use_cache=True` to speed up decodingr+   r   )loggerwarningZindex_selectr2   ra   shaper-   )r&   rf   Zbeam_idxZreordered_decoder_pastZlayer_past_statesZreordered_layer_past_statesZlayer_past_stater+   r+   r,   _reorder_cache  s0   
z)T5ForConditionalGeneration._reorder_cacher8   )NNNNNNNNNNNNNNNN)NNNNNNN)__name__
__module____qualname__Z_keys_to_ignore_on_load_missingZ"_keys_to_ignore_on_load_unexpectedr   r   r%   r5   r9   r;   r>   r@   rA   rB   r   r0   Z
LongTensorZFloatTensorZ
BoolTensorrk   r   boolr   r   re   rg   rh   rj   rp   __classcell__r+   r+   r)   r,   r   .   s    

	

  

r   )$r   r[   typingr   r   r   r0   r   Ztorch.nnr   Z'transformers.utils.model_parallel_utilsr   r   Zmodelscope.metainfor	   Zmodelscope.models.builderr
   Zmodelscope.outputsr   r   r   Zmodelscope.utils.constantr   Zmodelscope.utils.loggerr   Zbackboner   r   configurationr   rm   Z__HEAD_MASK_WARNING_MSGZregister_moduleZtext2text_generationZT5r   r+   r+   r+   r,   <module>   s*   	