o
    *j~                     @   sT  d Z ddlmZ ddlmZmZmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z! e" ZG dd de	j#Z$G dd dej%j&Z'G dd de(Z)dd Z*G dd dej%j&Z+G dd de	j#Z,G dd de	j#Z-G d d! d!e	j#Z.G d"d# d#e	j#Z/G d$d% d%e	j#Z0G d&d' d'e	j#Z1G d(d) d)e	j#Z2G d*d+ d+e	j#Z3d,d- Z4	.	.d@d/d0Z5ej6j7d1d2 Z8ej6j7d3d4 Z9ej6j7d5d6 Z:G d7d8 d8e	j#Z;G d9d: d:e	j#Z<G d;d< d<eeZ=ej>ej?ej@d=G d>d? d?e=ZAdS )Az PyTorch DeBERTa-v2 model.    )Sequence)OptionalTupleUnionN)nn)	LayerNorm)ACT2FN)PreTrainedModel)softmax_backward_data)Models)Model
TorchModel)MODELS)AttentionBackboneModelOutput)logger)Tasks   )DebertaV2Configc                       s0   e Zd Z fddZdd Zedd Z  ZS )ContextPoolerc                    s2   t    t|j|j| _t|j| _|| _	d S N)
super__init__r   LinearZpooler_hidden_sizedenseStableDropoutZpooler_dropoutdropoutconfigselfr   	__class__ j/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/deberta_v2/backbone.pyr   *   s   

zContextPooler.__init__c                 C   s8   |d d df }|  |}| |}t| jj |}|S Nr   )r   r   r   r   Zpooler_hidden_act)r   hidden_statesZcontext_tokenZpooled_outputr!   r!   r"   forward1   s
   

zContextPooler.forwardc                 C      | j jS r   )r   hidden_sizer   r!   r!   r"   
output_dim;   s   zContextPooler.output_dim)__name__
__module____qualname__r   r%   propertyr)   __classcell__r!   r!   r   r"   r   (   s
    
r   c                   @   s4   e Zd ZdZedd Zedd Zedd ZdS )	XSoftmaxa  
    Masked Softmax which is optimized for saving memory

    Args:
        input (`torch.tensor`): The input tensor that will apply softmax.
        mask (`torch.IntTensor`):
            The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
        dim (int): The dimension that will apply softmax

    Example:

    >>> import torch
    >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax

    >>> # Make a tensor
    >>> x = torch.randn([4, 20, 100])

    >>> # Create a mask
    >>> mask = (x > 0).int()

    >>> # Specify the dimension to apply softmax
    >>> dim = -1

    >>> y = XSoftmax.apply(x, mask, dim)
    c                 C   sX   || _ |tj }||tt|jj}t	|| j }|
|d | | |S r#   )dimtotorchboolmasked_filltensorfinfodtypeminsoftmaxmasked_fill_save_for_backward)r   inputmaskr0   rmaskoutputr!   r!   r"   r%   \   s   
zXSoftmax.forwardc                 C   s$   | j \}t| ||| j|}|d d fS r   )saved_tensorsr
   r0   )r   grad_outputr?   Z	inputGradr!   r!   r"   backwardh   s
   
zXSoftmax.backwardc           
      C   s   dd l m  m} ddlm}m} | jd||jd d}| jd| d| jdtj	dtj
d	d
||jd d}|| ||| jdt	t|  jd
}	|| |	|}	|| |	|| jdtj	dtjd	d
S )Nr   )r4   r9   ZCastLong)Zto_iSubConstantr   r7   )Zvalue_tZByte)Ztorch.onnx.symbolic_helperZonnxZsymbolic_helperZtorch.onnx.symbolic_opset9r4   r9   opZcast_pytorch_to_onnxr2   r5   Zint64r6   typer7   r8   Zuint8)
gr   r=   r0   Zsym_helpr4   r9   Zmask_cast_valueZr_maskr?   r!   r!   r"   symbolico   s2   zXSoftmax.symbolicN)r*   r+   r,   __doc__staticmethodr%   rB   rJ   r!   r!   r!   r"   r/   A   s    

r/   c                   @   s   e Zd Zdd ZdS )DropoutContextc                 C   s   d| _ d | _d| _d| _d S )Nr   r   T)r   r=   scale
reuse_maskr(   r!   r!   r"   r      s   
zDropoutContext.__init__N)r*   r+   r,   r   r!   r!   r!   r"   rM      s    rM   c                 C   s   t |ts
|}d }n|j}||j9 }|jr|jnd }|dkr2|d u r2dt| d|  	tj
}t |tr?|jd u r?||_||fS )Nr   r   )
isinstancerM   r   rN   rO   r=   r2   Z
empty_likeZ
bernoulli_r1   r3   )r<   Zlocal_contextr   r=   r!   r!   r"   get_mask   s   



rQ   c                	   @   sZ   e Zd ZdZedd Zedd Zedejj	dejj
deeef d	ejj
fd
dZdS )XDropoutzlOptimized dropout function to save computation and memory by using mask operation instead of multiplication.c                 C   sD   t ||\}}dd|  | _|dkr | | ||d| j S |S )Ng      ?r   r   )rQ   rN   r;   r4   )ctxr<   	local_ctxr=   r   r!   r!   r"   r%      s   
zXDropout.forwardc                 C   s0   | j dkr| j\}||d| j  d fS |d fS )Nr   r   )rN   r@   r4   )rS   rA   r=   r!   r!   r"   rB      s   
zXDropout.backwardrI   r<   rT   returnc                 C   s4   ddl m} |}t|tr|j}d}|| |||S )Nr   )symbolic_opset12T)Z
torch.onnxrV   rP   rM   r   )rI   r<   rT   rV   Z	dropout_ptrainr!   r!   r"   rJ      s   
zXDropout.symbolicN)r*   r+   r,   rK   rL   r%   rB   r2   Z_CZGraphValuer   floatrM   rJ   r!   r!   r!   r"   rR      s    
	

rR   c                       sB   e Zd ZdZ fddZdd Zdd Zdd
dZdd Z  Z	S )r   z
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    c                    s    t    || _d| _d | _d S r#   )r   r   	drop_probcountcontext_stack)r   rZ   r   r!   r"   r      s   

zStableDropout.__init__c                 C   s$   | j r| jdkrt||  S |S )zr
        Call the module

        Args:
            x (`torch.tensor`): The input tensor to apply dropout
        r   )trainingrZ   rR   applyget_context)r   xr!   r!   r"   r%      s   zStableDropout.forwardc                 C   s   d| _ d | _d S r#   )r[   r\   r(   r!   r!   r"   clear_context   s   
zStableDropout.clear_contextTr   c                 C   s2   | j d u rg | _ d| _| j D ]}||_||_qd S r#   )r\   r[   rO   rN   )r   rO   rN   cr!   r!   r"   init_context   s   

zStableDropout.init_contextc                 C   sT   | j d ur'| jt| j kr| j t  | j | j }| j|_|  jd7  _|S | jS )Nr   )r\   r[   lenappendrM   rZ   r   )r   rS   r!   r!   r"   r_      s   
zStableDropout.get_context)Tr   )
r*   r+   r,   rK   r   r%   ra   rc   r_   r.   r!   r!   r   r"   r      s    
r   c                       $   e Zd Z fddZdd Z  ZS )DebertaV2SelfOutputc                    s<   t    t|j|j| _t|j|j| _t|j	| _
d S r   )r   r   r   r   r'   r   r   layer_norm_epsr   hidden_dropout_probr   r   r   r!   r"   r     s   
zDebertaV2SelfOutput.__init__c                 C   &   |  |}| |}| || }|S r   r   r   r   r   r$   Zinput_tensorr!   r!   r"   r%        

zDebertaV2SelfOutput.forwardr*   r+   r,   r   r%   r.   r!   r!   r   r"   rg     s    rg   c                       s.   e Zd Z fddZ				dddZ  ZS )DebertaV2Attentionc                    s(   t    t|| _t|| _|| _d S r   )r   r   DisentangledSelfAttentionr   rg   r?   r   r   r   r!   r"   r     s   



zDebertaV2Attention.__init__FNc           
      C   sJ   | j ||||||d}|r|\}}|d u r|}| ||}	|r#|	|fS |	S )N)query_statesrelative_posrel_embeddings)r   r?   )
r   r$   attention_maskoutput_attentionsrq   rr   rs   Zself_output
att_matrixattention_outputr!   r!   r"   r%     s    	zDebertaV2Attention.forwardFNNNrn   r!   r!   r   r"   ro         
ro   c                       s2   e Zd Z fddZdejdejfddZ  ZS )DebertaV2Intermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r   r   r   r   r'   intermediate_sizer   rP   Z
hidden_actstrr   intermediate_act_fnr   r   r!   r"   r   ;  s
   
zDebertaV2Intermediate.__init__r$   rU   c                 C   s   |  |}| |}|S r   )r   r}   )r   r$   r!   r!   r"   r%   C  s   

zDebertaV2Intermediate.forward)r*   r+   r,   r   r2   Tensorr%   r.   r!   r!   r   r"   rz   9  s    rz   c                       rf   )DebertaV2Outputc                    sB   t    t|j|j| _t|j|j| _t	|j
| _|| _d S r   )r   r   r   r   r{   r'   r   r   rh   r   ri   r   r   r   r   r!   r"   r   L  s
   

zDebertaV2Output.__init__c                 C   rj   r   rk   rl   r!   r!   r"   r%   S  rm   zDebertaV2Output.forwardrn   r!   r!   r   r"   r   J  s    r   c                       s.   e Zd Z fddZ				dddZ  ZS )DebertaV2Layerc                    s,   t    t|| _t|| _t|| _d S r   )r   r   ro   	attentionrz   intermediater   r?   r   r   r!   r"   r   ]  s   


zDebertaV2Layer.__init__NFc                 C   sH   | j ||||||d}|r|\}}| |}	| |	|}
|r"|
|fS |
S )Nru   rq   rr   rs   )r   r   r?   )r   r$   rt   rq   rr   rs   ru   rw   rv   Zintermediate_outputZlayer_outputr!   r!   r"   r%   c  s   	
zDebertaV2Layer.forward)NNNFrn   r!   r!   r   r"   r   [  ry   r   c                       rf   )	ConvLayerc                    sx   t    t|dd}t|dd}t|dd| _tj|j|j||d d |d| _t|j|j	| _t
|j| _|| _d S )	Nconv_kernel_size   Zconv_groupsr   conv_acttanh   )paddinggroups)r   r   getattrr   r   ZConv1dr'   convr   rh   r   ri   r   r   )r   r   Zkernel_sizer   r   r!   r"   r     s   


zConvLayer.__init__c           	      C   s   |  |ddd ddd }d|  }||d| d t| j	 | 
|}|| }| ||}|d u rE|}|S | | kr`| dkr[|dd}|d}||j}|| }|S )Nr   r   r      )r   permute
contiguousr3   r:   	unsqueezeexpandsizer   r   r   r   r1   r0   squeezer7   )	r   r$   Zresidual_states
input_maskoutr>   Zlayer_norm_inputr?   output_statesr!   r!   r"   r%     s(   

zConvLayer.forwardrn   r!   r!   r   r"   r   ~  s    r   c                       sN   e Zd ZdZ fddZdd Zdd Zdd	d
Z					dddZ  Z	S )DebertaV2Encoderz8Modified BertEncoder with relative position bias supportc                    s  t    t fddt jD | _t dd| _| jrMt dd| _	| j	dk r/ j
| _	t dd| _| j	d	 }| jd
krE| jd	 }t| j| _dd t dd dD | _d| jv rmt j jdd| _t dd
d
kryt nd | _d| _d S )Nc                    s   g | ]}t  qS r!   )r   .0_r   r!   r"   
<listcomp>  s    z-DebertaV2Encoder.__init__.<locals>.<listcomp>relative_attentionFmax_relative_positionsr   r   position_bucketsr   r   c                 S   s   g | ]}|  qS r!   )strip)r   r`   r!   r!   r"   r     s    norm_rel_ebdnone|
layer_normT)Zelementwise_affiner   )r   r   r   Z
ModuleListrangeZnum_hidden_layerslayerr   r   r   max_position_embeddingsr   	Embeddingr'   rs   lowersplitr   r   rh   r   r   gradient_checkpointing)r   r   pos_ebd_sizer   r   r"   r     sF   






zDebertaV2Encoder.__init__c                 C   s2   | j r| jjnd }|d urd| jv r| |}|S )Nr   )r   rs   weightr   r   )r   rs   r!   r!   r"   get_rel_embedding  s   
z"DebertaV2Encoder.get_rel_embeddingc                 C   sV   |  dkr|dd}||dd }| }|S |  dkr)|d}|S )Nr   r   r   r   )r0   r   r   byte)r   rt   Zextended_attention_maskr!   r!   r"   get_attention_mask  s   
z#DebertaV2Encoder.get_attention_maskNc                 C   sH   | j r"|d u r"|d ur|dn|d}t||d| j| jd}|S )Nr   bucket_sizemax_position)r   r   build_relative_positionr   r   )r   r$   rq   rr   qr!   r!   r"   get_rel_pos  s   zDebertaV2Encoder.get_rel_posTFc              	      s  |  dkr	|}n	|ddk }| |}| |||}|r"dnd }	 r(dnd }
t|tr4|d }n|}|  }|}t| j	D ]l\}}|rL|	|f }	| j
rg| jrg fdd}tjj|||||||}n
|||||| d} rw|\}}|dkr| jd ur| |||}|d ur|}t|tr|d t| j	k r||d  nd }n|} r|
|f }
qA|r|	|f }	|std	d
 ||	|
fD S t||	|
dS )Nr   r   r   r!   c                    s    fdd}|S )Nc                     s    g | R  S r   r!   )Zinputs)moduleru   r!   r"   custom_forward  s   zODebertaV2Encoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr!   )r   r   ru   )r   r"   create_custom_forward  s   z7DebertaV2Encoder.forward.<locals>.create_custom_forward)rq   rr   rs   ru   r   c                 s   s    | ]	}|d ur|V  qd S r   r!   )r   vr!   r!   r"   	<genexpr><  s    z+DebertaV2Encoder.forward.<locals>.<genexpr>Zlast_hidden_stater$   
attentions)r0   sumr   r   r   rP   r   r   	enumerater   r   r]   r2   utils
checkpointr   rd   tupler   )r   r$   rt   output_hidden_statesru   rq   rr   return_dictr   Zall_hidden_statesZall_attentionsZnext_kvrs   r   iZlayer_moduler   Zatt_mr!   r   r"   r%     s   




		


zDebertaV2Encoder.forward)NN)TFNNT)
r*   r+   r,   rK   r   r   r   r   r%   r.   r!   r!   r   r"   r     s    %
r   c              	   C   s   t | }|d }t | |k | | k@ t |d | t | }t t || t t |d |  |d  | }t ||k| ||| }|S )Nr   r   )r2   signwherer5   Ztype_asabsceillog)rr   r   r   r   midZabs_posZlog_posZ
bucket_posr!   r!   r"   make_log_bucket_positionE  s.   

r   r   c                 C   s   t d| }t d|}|dddf |dddf  }|dkr*|dkr*t|||}|t j}|d| ddf }|d}|S )af  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key
        bucket_size (int): the size of position bucket
        max_position (int): the maximum allowed absolute position

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    r   N)r2   aranger   r1   longr   )Z
query_sizeZkey_sizer   r   Zq_idsZk_idsZrel_pos_idsr!   r!   r"   r   W  s    
r   c                 C   s*   |  |d|d|d|dgS )Nr   r   r   r   r   r   )c2p_posquery_layerrr   r!   r!   r"   c2p_dynamic_expandx     r   c                 C   s*   |  |d|d|d|dgS )Nr   r   r   r   )r   r   	key_layerr!   r!   r"   p2c_dynamic_expand  r   r   c                 C   s*   |  | d d | d|df S )Nr   r   r   )Z	pos_indexp2c_attr   r!   r!   r"   pos_dynamic_expand  s   r   c                       sB   e Zd ZdZ fddZdd Z				ddd	Zd
d Z  ZS )rp   a  
    Disentangled self-attention module

    Parameters:
        config (`DebertaV2Config`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaV2Config`]

    c                    s  t    |j|j dkrtd|j d|j d|j| _|j|j }t|d|| _| j| j | _tj	|j| jdd| _
tj	|j| jdd| _tj	|j| jdd| _t|dd	| _|jd urb|jng | _t|d
d	| _| jrt|dd| _t|dd| _| jdk r|j| _| j| _| jdkr| j| _t|j| _| jsd| jv rtj	|j| jdd| _d| jv rt	|j| j| _t|j| _d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()attention_head_sizeTbiasshare_att_keyFr   r   r   r   r   c2pp2c)r   r   r'   num_attention_heads
ValueErrorr   r   Zall_head_sizer   r   
query_projkey_proj
value_projr   pos_att_typer   r   r   r   r   r   ri   pos_dropoutpos_key_projpos_query_projZattention_probs_dropout_probr   )r   r   Z_attention_head_sizer   r!   r"   r     s\   









z"DisentangledSelfAttention.__init__c                 C   sL   |  d d |df }||}|dddd d| d| dS )Nr   r   r   r   r   )r   viewr   r   )r   r`   Zattention_headsZnew_x_shaper!   r!   r"   transpose_for_scores  s
   
z.DisentangledSelfAttention.transpose_for_scoresFNc              	   C   s  |du r|}|  | || j}|  | || j}|  | || j}	d}
d}d| jv r1|d7 }d| jv r:|d7 }ttj|	dtj
d| }t||ddtj||jd }| jrm| |}| |||||}
|
duru||
 }|}|d| j|	d|	d}t||d}| |}t|d|	d|	d|	}|d| j|	d|	ddd	dd
 }|	 dd d }||}|r||fS |S )a  
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.ByteTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, optional):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, optional):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr   r   r   r   rF   r   r   r   r   r   )r   r   r   r   r   r   r2   sqrtr5   r   rY   bmm	transposer7   r   r   disentangled_attention_biasr   r/   r^   r   r   r   )r   r$   rt   ru   rq   rr   rs   r   r   Zvalue_layerZrel_attscale_factorrN   Zattention_scoresZattention_probsZcontext_layerZnew_context_layer_shaper!   r!   r"   r%     s   &







z!DisentangledSelfAttention.forwardc              
   C   s$  |d u r| d}t|| d| j| jd}| dkr%|dd}n| dkr1|d}n| dkr@td|  | j}| 	|j
}|d|d d d f d}| jr| | || j| d| j dd}| | || j| d| j dd}	n4d	| jv r| | || j| d| j dd}	d
| jv r| | || j| d| j dd}d}
d	| jv rttj|	 dtjd| }t||	dd}t|| d|d d }tj|d|d| d| d| dgd}|
|tj||jd 7 }
d
| jv rttj| dtjd| }| d| dkrLt| d| d| j| jd	|j
}|d}n|}t| | d|d d }t||dd}tj|d|d| d| d| dgddd}|
|tj||jd 7 }
|
S )Nr   r   r   r   r   r   r   z2Relative position ids must be of dim 2 or 3 or 4. r   r   r   rF   )r0   index)r   r   r   r   r0   r   r   r   r   r1   devicer   r   r   r   repeatr   r   r   r   r2   r   r5   rY   r   r   clampZgatherr   r   r7   )r   r   r   rr   rs   r   r   Zatt_spanZpos_query_layerZpos_key_layerZscorerN   Zc2p_attr   Zr_posZp2c_posr   r!   r!   r"   r   .  s   



	
	z5DisentangledSelfAttention.disentangled_attention_biasrx   )	r*   r+   r,   rK   r   r   r%   r   r.   r!   r!   r   r"   rp     s    
,

\rp   c                       s4   e Zd ZdZ fddZ					dddZ  ZS )DebertaV2EmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    t|dd}t|d|j| _tj|j| j|d| _t|dd| _	| j	s,d | _
n	t|j| j| _
|jdkrCt|j| j| _| j|jkrTtj| j|jdd| _t|j|j| _t|j| _|| _| d	t|jd
 d S )Npad_token_idr   embedding_size)padding_idxposition_biased_inputTFr   position_ids)r   r   )r   r   r   r'   r   r   r   Z
vocab_sizeword_embeddingsr   position_embeddingsr   type_vocab_sizetoken_type_embeddingsr   
embed_projr   rh   r   ri   r   r   Zregister_bufferr2   r   r   )r   r   r   r   r!   r"   r     s>   



zDebertaV2Embeddings.__init__Nc                 C   sN  |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u r<| |}| jd urI| | }nt|}|}	| j	rW|	|7 }	| j
jdkrf| |}
|	|
7 }	| j| j
jkrr| |	}	| |	}	|d ur| |	 kr| dkr|dd}|d}||	j}|	| }	| |	}	|	S )Nr   r   r7   r   r   r   r   )r   r   r2   zerosr   r   r   r   Z
zeros_liker   r   r   r  r   r'   r  r   r0   r   r   r1   r7   r   )r   	input_idstoken_type_idsr   r=   inputs_embedsinput_shapeZ
seq_lengthr   
embeddingsr  r!   r!   r"   r%     sB   








zDebertaV2Embeddings.forward)NNNNN)r*   r+   r,   rK   r   r%   r.   r!   r!   r   r"   r     s    !r   c                       sZ   e Zd ZdZeZdZdgZdgZdZ	 fddZ
dd	 ZdddZe fddZ  ZS )DebertaV2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    Zdebertar   r   Tc                    s*   t  j|jfi | t t| | d S r   )r   r   Zname_or_pathr   r   r   kwargsr   r!   r"   r     s   z!DebertaV2PreTrainedModel.__init__c                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rA|jjjd| jjd |jdurC|jj|j 	  dS dS dS )zInitialize the weights.g        )meanZstdN)rP   r   r   r   dataZnormal_r   Zinitializer_ranger   Zzero_r   r   )r   r   r!   r!   r"   _init_weights  s   

z&DebertaV2PreTrainedModel._init_weightsFc                 C   s   t |tr
||_d S d S r   )rP   r   r   )r   r   valuer!   r!   r"   _set_gradient_checkpointing  s   

z4DebertaV2PreTrainedModel._set_gradient_checkpointingc                    sD   | dd }|d u rtdi |}| |}|S tt| j|d}|S )N	model_dir)Zpretrained_model_name_or_pathr!   )popr   r   r   Zfrom_pretrained)clsr  r  Zponet_configmodelr   r!   r"   _instantiate  s   z%DebertaV2PreTrainedModel._instantiate)F)r*   r+   r,   rK   r   Zconfig_classZbase_model_prefixZ_keys_to_ignore_on_load_missingZ"_keys_to_ignore_on_load_unexpectedZsupports_gradient_checkpointingr   r  r  classmethodr  r.   r!   r!   r   r"   r
    s    
r
  )module_namec                       s   e Zd ZdZ fddZdd Zdd Zdd	 Z	
	
	
	
	
	
	
	
ddee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee dee dee deeef fddZ  ZS )DebertaV2Modela  The bare DeBERTa_v2 Model transformer outputting raw hidden-states without any specific head on top.

    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config (`DebertaV2Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration.
    c                    s8   t  | t|| _t|| _d| _|| _|   d S r#   )	r   r   r   r	  r   encoderz_stepsr   Z	post_initr  r   r!   r"   r   -  s   

zDebertaV2Model.__init__c                 C   r&   r   r	  r   r(   r!   r!   r"   get_input_embeddings7  s   z#DebertaV2Model.get_input_embeddingsc                 C   s   || j _d S r   r  )r   Znew_embeddingsr!   r!   r"   set_input_embeddings:  s   z#DebertaV2Model.set_input_embeddingsc                 C   s   t d)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        z7The prune function is not implemented in DeBERTa model.)NotImplementedError)r   Zheads_to_pruner!   r!   r"   _prune_heads=  s   zDebertaV2Model._prune_headsNr  rt   r  r   r  ru   r   r   rU   c	              	      s  |dur|n j j}|dur|n j j}|dur|n j j}|dur*|dur*td|dur3| }	n|dur@| dd }	ntd|durK|jn|j}
|du rYtj|	|
d}|du rftj	|	tj
|
d} j|||||d} j||d||d	}|d
 } jd
kr|d } fddt jD }|d } j } j|} j|}|d
d D ]}|||d|||d}|| q|d }|s|f||rd
ndd  S t||r|jnd|jdS )u5
  
        Args:
            input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`):
                Indices of input sequence tokens in the vocabulary.

            attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

            token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
                Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
                1]`:

                - 0 corresponds to a *sentence A* token,
                - 1 corresponds to a *sentence B* token.

            position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0,config.max_position_embeddings - 1]`.

            inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert *input_ids* indices into associated
                vectors than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
                tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a dataclass instead of a plain tuple.

        Returns:
            Returns `modelscope.outputs.AttentionBackboneModelOutput`

        Examples:
            >>> from modelscope.models import Model
            >>> from modelscope.preprocessors import Preprocessor
            >>> model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite', task='backbone')
            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite')
            >>> print(model(**preprocessor('这是个测试')))
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r   r  )r  r  r   r=   r  T)r   ru   r   r   r   c                    s   g | ]} j jd  qS r   )r  r   r   r(   r!   r"   r     s    z*DebertaV2Model.forward.<locals>.<listcomp>Fr   r   r   )r   ru   r   Zuse_return_dictr   r   r   r2   Zonesr  r   r	  r  r  r   r   r   r   re   r   r$   r   )r   r  rt   r  r   r  ru   r   r   r  r   Zembedding_outputZencoder_outputsZencoded_layersr$   Zlayersrq   rs   Zrel_posr   Zsequence_outputr!   r(   r"   r%   E  s   9


zDebertaV2Model.forward)NNNNNNNN)r*   r+   r,   rK   r   r  r  r   r   r2   r~   r3   r   r   r   r%   r.   r!   r!   r   r"   r    sB    

	

r  )r   r   )BrK   collections.abcr   typingr   r   r   r2   Ztorch.utils.checkpointr   Ztorch.nnr   Ztransformers.activationsr   Ztransformers.modeling_utilsr	   Ztransformers.pytorch_utilsr
   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Zmodelscope.models.builderr   Zmodelscope.outputsr   Zmodelscope.utilsr   loggingZmodelscope.utils.constantr   configurationr   Z
get_loggerModuler   ZautogradFunctionr/   objectrM   rQ   rR   r   rg   ro   rz   r   r   r   r   r   r   Zjitscriptr   r   r   rp   r   r
  Zregister_moduleZbackboneZ
deberta_v2r  r!   r!   r!   r"   <module>   s`   H
)2&#) 
!
	
	
 |Y0