o
    *j                     @   s  d Z ddlmZmZmZ ddlZddlZddlZddlZ	ddl
Z
ddlZddlmZ ddlm  mZ ddlZddlmZ dd ZG dd deZG d	d
 d
ejZG dd dejZG dd dejZd1ddZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZ G dd  d ejZ!G d!d" d"ejZ"G d#d$ d$ejZ#G d%d& d&ejZ$G d'd( d(ejZ%G d)d* d*ejZ&G d+d, d,ejZ'G d-d. d.ejZ(G d/d0 d0ejZ)dS )2zPyTorch BERT model.    )absolute_importdivisionprint_functionN)CrossEntropyLossc                 C   s    | d dt | td   S )zImplementation of the gelu activation function.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    g      ?      ?g       @)torcherfmathsqrt)x r   s/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/diffusion/structbert.pygelu   s    r   c                   @   sv   e Zd ZdZ											
																dddZedd Zedd Zdd Zdd Z	dS )
BertConfigzEConfiguration class to store the configuration of a `BertModel`.
          originallinearr         r   皙?      {Gz?selfFc                 C   s   || _ || _|| _|| _|| _|| _|| _|| _|	| _|| _	|
| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _dS )a_  Constructs BertConfig.

        Args:
            vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `BertModel`.
            initializer_range: The stdev of the truncated_normal_initializer for
                initializing all weight matrices.
        N)
vocab_sizehidden_sizeemb_sizenum_hidden_layerstransformer_typetransition_functionweighted_transformernum_rolled_layersnum_attention_heads
hidden_actintermediate_sizehidden_dropout_probattention_probs_dropout_probmax_position_embeddingstype_vocab_sizeinitializer_rangeattention_typerezeropre_lnsqueeze_excitationtransfer_matrixdim_dropoutset_mask_zeroroberta_style
init_scale
safer_fp16grad_checkpoint)r   r   r   r   r   r    r!   r"   r#   r$   r&   r%   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r3   r2   r4   r5   r6   r   r   r   __init__*   s6   4
zBertConfig.__init__c                 C   s,   t dd}t|D ]	\}}||j|< q
|S )zAConstructs a `BertConfig` from a Python dictionary of parameters.N)r   )r   six	iteritems__dict__)clsZjson_objectconfigkeyvaluer   r   r   	from_dictz   s   
zBertConfig.from_dictc                 C   sF   t |ddd}| }W d   n1 sw   Y  | t|S )z9Constructs a `BertConfig` from a json file of parameters.rzutf-8)encodingN)openreadr?   jsonloads)r;   Z	json_filereadertextr   r   r   from_json_file   s   
zBertConfig.from_json_filec                 C   s   t | j}|S )z0Serializes this instance to a Python dictionary.)copydeepcopyr:   r   outputr   r   r   to_dict   s   zBertConfig.to_dictc                 C   s   t j|  dddd S )z*Serializes this instance to a JSON string.   T)indent	sort_keys
)rD   dumpsrM   )r   r   r   r   to_json_string   s   zBertConfig.to_json_stringN)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   FFFFFFFFFF)
__name__
__module____qualname____doc__r7   classmethodr?   rH   rM   rS   r   r   r   r   r   &   sF    
P

r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )BERTLayerNorm-q=Nc                    sd   t t|   || _|dur|n|j}tt|| _	tt
|| _|js-|| _dS d| _dS )zWConstruct a layernorm module in the TF style (epsilon inside the square root).
        Ngh㈵>)superrY   r7   r<   r   nn	Parameterr   onesgammazerosbetar3   variance_epsilon)r   r<   rb   special_sizer   	__class__r   r   r7      s   zBERTLayerNorm.__init__c                 C   s   |  }| jjr| }|jddd}|| djddd}|| t|| j  }| jjr:| j	| | j
  |S | j	| | j
 S )Nr   TkeepdimrN   )typer<   r5   floatmeanpowr   r
   rb   r_   ra   )r   r   Zprevious_typeusr   r   r   forward   s   zBERTLayerNorm.forward)rZ   NrT   rU   rV   r7   rn   __classcell__r   r   rd   r   rY      s    
rY   c                       &   e Zd Z fddZdddZ  ZS )BERTEmbeddingsc                    s   t t|   	 |jdk r|jn|j}tj|j||jrdnd d| _	tj|j
||jr,dnd d| _t|j|| _|| _|jdk rCd nt|j|j| _t||d| _t|j| _d S )Nr      )Zpadding_idx)rc   )r[   rr   r7   r   r   r\   	Embeddingr   r3   word_embeddingsr)   position_embeddingsr*   token_type_embeddingsr<   LinearprojrY   	LayerNormDropoutr'   dropout)r   r<   r   rd   r   r   r7      s,   zBERTEmbeddings.__init__Nc                 C   s  | d}| jjstj|tj|jd}|d|}n|	d
 }tj|dd||  d }|d u r<t|}|d u rE| |n|}| jjrQd||dk< | |}| |}	| jjsf|| |	 }
n|| }
| |
}
| |
}
| jd ur| |
}
| |
}
d S |
|fS )Nrs   )dtypedevicer   dim        g   )sizer<   r3   r   Zarangelongr~   	unsqueezeZ	expand_asneintZcumsumZtype_as
zeros_likeru   r2   rv   rw   rz   r|   ry   )r   	input_idstoken_type_idsadv_embeddingZ
seq_lengthZposition_idsmaskZwords_embeddingsrv   rw   
embeddingsr   r   r   rn      sB   








zBERTEmbeddings.forward)NNro   r   r   rd   r   rr      s    rr   c                       ,   e Zd Z fddZdd Zdd Z  ZS )BERTFactorizedAttentionc                    s   t t|   |j|j dkrtd|j|jf |j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _d S Nr   zLThe hidden size (%d) is not a multiple of the number of attention heads (%d))r[   r   r7   r   r$   
ValueErrorr   attention_head_sizeall_head_sizer\   rx   queryr=   r>   r{   r(   r|   r   r<   rd   r   r   r7      s    
z BERTFactorizedAttention.__init__c                 G   s0   |  d d | j| jf }|j| }||S )Nr   r   r$   r   viewpermute)r   r   r   new_x_shaper   r   r   transpose_for_scores   s
   

z,BERTFactorizedAttention.transpose_for_scoresc                 C   s   |  |}| |}| |}| |dddd}| |dddd}| |dddd}|| }	tjdd|	}
| |
}
tjdd|}t|
|}t||}|	dddd
 }| d d | jf }|j| }|S )Nr   rN   r   rs   r   r   )r   r=   r>   r   r\   Softmaxr|   r   matmulr   
contiguousr   r   r   )r   hidden_statesattention_maskmixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerZs_attention_scoresZs_attention_probsZc_attention_probsZs_context_layercontext_layernew_context_layer_shaper   r   r   rn      s$   




zBERTFactorizedAttention.forwardrT   rU   rV   r7   r   rn   rp   r   r   rd   r   r      s    r   r   Fc                 C   s`   |r|dkr| S d| }| j |   d }t|| }|||tj||dd  |  S )Nr   rs   T)r   rg   )datanewr   zero_r   Z	bernoullisum)r   pr   trainingabZdropout_maskr   r   r   r1     s   r1   c                       s.   e Zd Z fddZdd ZdddZ  ZS )	BERTSelfAttentionc                    s   t t|   |j|j dkrtd|j|jf |j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _|| _|jr\t|| _d S d S r   )r[   r   r7   r   r$   r   r   r   r   r\   rx   r   r=   r>   r{   r(   r|   r<   r.   rY   rz   r   rd   r   r   r7   $  s(   
zBERTSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr   r   rN   rs   r   r   )r   r   r   r   r   r   r   9  s
   
z&BERTSelfAttention.transpose_for_scoresNc                 C   sB  | j jr	| |}| |}| |}| |}| |}| |}| |}	t||	dd}
|
t
| j }
|d ur]| js]t|D ]\}}|| dkr\d|
d d |d d d d f< qD|
| }
tjdd|
}| j jss| |}nt|| j jd| jd}t||	}|dddd	 }| d d | jf }|j| }|S )
Nr   r   rs   r   r   )r   r   r   r   rN   r   )r<   r.   rz   r   r=   r>   r   r   r   	transposer	   r
   r   r   	enumerater\   r   r1   r|   r(   r   r   r   r   r   )r   r   r   	head_maskr   r   r   r   r   r   Zattention_scoresir   Zattention_probsr   r   r   r   r   rn   ?  sH   








zBERTSelfAttention.forwardNr   r   r   rd   r   r   "  s    r   c                       $   e Zd Z fddZdd Z  ZS )BERTSelfOutputc                    s   t t|   || _t|j|j| _|js|j	st
|| _t|j| _|j	rQttddjt|  jd| _ttdjt|  jd| _d S d S )Nrs   Gz?r}   )r[   r   r7   r<   r\   rx   r   denser.   r-   rY   rz   r{   r'   r|   r]   r   Tensorfill_tonext
parametersr}   
res_factorr^   factorr   rd   r   r   r7   n  s    

zBERTSelfOutput.__init__c                 C   sV   |  |}| |}| jjs| jjs| || }|S | jjr(|| j|  }|S 	 |S r   )r   r|   r<   r-   r.   rz   r   r   r   input_tensorr   r   r   rn   |  s   

zBERTSelfOutput.forwardro   r   r   rd   r   r   l  s    r   c                       rq   )BERTAttentionc                    s`   t t|   |j dkrt|| _n|j dkr!t|| _ntd	|jt
|| _d S )Nr   Z
factorizedz5Attention type must in [self, factorized], but got {})r[   r   r7   r,   lowerr   r   r   r   formatr   rL   r   rd   r   r   r7     s   zBERTAttention.__init__Nc                 C   s   |  |||}| ||}|S r   rK   )r   r   r   r   self_outputattention_outputr   r   r   rn     s   zBERTAttention.forwardr   ro   r   r   rd   r   r     s    r   c                       s0   e Zd Z					d fdd	Zdd Z  ZS )	DepthwiseSeparableConv1drs   r   Fc              
      sV   t t|   |d d }tj||||||||d| _tj||ddddd|d| _d S )Nrs   rN   )groupsbiasr   )r   )r[   r   r7   r\   ZConv1d	depthwise	pointwise)r   Zin_channelsZout_channelskernel_sizeZstridepaddingZdilationr   rd   r   r   r7     s   	z!DepthwiseSeparableConv1d.__init__c                 C   s   |  |}| |}|S r   )r   r   )r   r   r   r   r   rn     s   

z DepthwiseSeparableConv1d.forward)rs   rs   r   rs   Fro   r   r   rd   r   r     s    r   c                       r   )BERTIntermediatec                    s   t t|   || _| jjrt|| _t| _|j	
 dkr(t|j|j| _d S |j	
 dkr=t|jd|j dd| _d S |jj
 dkrItdtd)	Nr   cnn      r   rnn.rnn transition function is not implemented yetOnly support linear/cnn/rnn)r[   r   r7   r<   r.   rY   rz   r   intermediate_act_fnr!   r   r\   rx   r   r&   r   r   r   NotImplementedErrorr   r   rd   r   r   r7     s$   

zBERTIntermediate.__init__c                 C   sj   | j jr	| |}| j j dkr| |}n| j j dkr-| |dddd}n	 | |}|S Nr   r   r   r   )	r<   r.   rz   r!   r   r   r   r   r   )r   r   r   r   r   rn     s   


zBERTIntermediate.forwardro   r   r   rd   r   r     s    r   c                       r   )SqueezeExcitationBlockc                    s>   t t|   t|j|jd | _t|jd |j| _d S )Nr   )r[   r   r7   r\   rx   r   down_samplingup_samplingr   rd   r   r   r7     s   
zSqueezeExcitationBlock.__init__c                 C   s2   t j|ddd}t | t| |}|| S )Nrs   Trf   )r   rj   sigmoidr   r   r   )r   r   squeezeZ
excitationr   r   r   rn     s
   zSqueezeExcitationBlock.forwardro   r   r   rd   r   r     s    r   c                       r   )
BERTOutputc                    s  t t|   || _|j dkrt|j|j	| _
n$|j dkr/td|j	 |j	dd| _n|jj	 dkr;tdtd|jsJ|jsJt|| _t|j| _|jrYt|| _|jrttd	d
jt|  j d| _!tt"d	jt|  j d| _#d S d S )Nr   r   r   r   r   r   r   r   rs   r   r   )$r[   r   r7   r<   r!   r   r\   rx   r&   r   r   r   r   r   r   r.   r-   rY   rz   r{   r'   r|   r/   r   SEblockr]   r   r   r   r   r   r   r}   r   r^   r   r   rd   r   r   r7     s<   



zBERTOutput.__init__c                 C   s   | j j dkr| |}n| j j dkr$| |dddd}n	 | |}| j jr3| |}| j j	sD| j j
sD| || }|S | j j	rQ|| j|  }|S 	 |S r   )r<   r!   r   r   r   r   r|   r/   r   r-   r.   rz   r   r   r   r   r   rn   	  s,   


zBERTOutput.forwardro   r   r   rd   r   r     s    r   c                       rq   )	BERTLayerc                    s0   t t|   t|| _t|| _t|| _d S r   )	r[   r   r7   r   	attentionr   intermediater   rL   r   rd   r   r   r7      s   

zBERTLayer.__init__Nc                 C   s,   |  |||}| |}| ||}||fS r   )r   r   rL   )r   r   r   r   r   Zintermediate_outputZlayer_outputr   r   r   rn   &  s   
zBERTLayer.forwardr   ro   r   r   rd   r   r     s    r   c                       r   )BERTWeightedLayerc                    s   t t   _t _jj_t fddt	 j
D _t j
_tjj  _t j
_tjj  _t _t j j_t _t j_d S )Nc                    s   g | ]
}t j jqS r   )r\   rx   r   r   .0_r<   r   r   r   
<listcomp>6  s    z.BERTWeightedLayer.__init__.<locals>.<listcomp>)r[   r   r7   r<   r   r   r   r\   
ModuleListranger$   w_or   Zrandw_kpr]   r   w_ar   r   rx   r&   r   rL   rY   rz   r{   r'   r|   r   rd   r   r   r7   0  s   




zBERTWeightedLayer.__init__c                    s     ||}|j j jdd fddttD  fddttD dd t jD  fddttD  fddttD  fd	dttD d
d t jD t} 	|| S )Nr   r   c                    s   g | ]} j | | qS r   )r   r   r   r   Zself_outputsr   r   r   G  s    z-BERTWeightedLayer.forward.<locals>.<listcomp>c                       g | ]	}  | qS r   r|   r   r   r   r   r   J      c                 S      g | ]\}}|| qS r   r   )r   kapparL   r   r   r   r   M      c                    r   r   )r   r   r   r   r   r   P  s    c                    r   r   )rL   r   r   r   r   r   T  r   c                    r   r   r   r   r   r   r   r   W  r   c                 S   r   r   r   )r   alpharL   r   r   r   r   Z  r   )
r   splitr   r   lenzipr   r   r   rz   )r   r   r   r   rL   r   r   r   rn   D  s2   






zBERTWeightedLayer.forwardro   r   r   rd   r   r   .  s    r   c                       *   e Zd Z fddZ		dddZ  ZS )BERTEncoderc                    s   t t|   t | _t|jD ]}|jr| j	t
| q| j	t| q|jrot| jD ]>\}}ttddjt|  jd|j_ttddjt|  jd|j_|jj|jj_|jj|jj_q0|| _d S )Nrs   r   r   )r[   r   r7   r\   r   layerr   r   r"   appendr   r   r-   r   r]   r   r   r   r   r   r   r}   rL   r   r   r   r<   )r   r<   r   indexr   rd   r   r   r7   c  s*   

zBERTEncoder.__init__r   Nc           
      C   s   |g}|dkrt t| jd d|  d }nd}t| jD ]>\}}|d u r@| jjs3|||d \}	}ntjj|||d \}	}n
||||| \}	}||krR|	  |
|	 |
| q|S )Nr   r   rN   rs   )r   r   r   r   r<   r6   r   utils
checkpointZdetach_r  )
r   r   r   epoch_id
head_masksall_encoder_layersZdetach_indexr  Zlayer_moduleZself_outr   r   r   rn   w  s,    


zBERTEncoder.forwardr   Nro   r   r   rd   r   r   a  s
    r   c                       r   )BERTEncoderRolledc                    s@   t t|   t| || _t fddt|jD | _	d S )Nc                       g | ]}t  qS r   rI   rJ   r   r   r   r   r         z.BERTEncoderRolled.__init__.<locals>.<listcomp>)
r[   r	  r7   r   r<   r\   r   r   r#   r   r   rd   r  r   r7     s   
zBERTEncoderRolled.__init__r   Nc                 C   s   |g}t | jjD ]6}| jj dkr | j|| jj  ||}n| jj dkr:|| jj| jj  }| j| ||}|| q	|S )N	universalalbert)r   r<   r   r    r   r   r#   r  )r   r   r   r  r  r  r   r   r   r   r   rn     s   zBERTEncoderRolled.forwardr  ro   r   r   rd   r   r	    s
    
r	  c                       r   )BERTEncoderACTc                    sx   t t|   t|| _t|jd t fddt	|j
D | _| jD ]	}|jjd q&|| _|j
| _d| _d S )Nrs   c                    r
  r   r  r   r   r   r   r     r  z+BERTEncoderACT.__init__.<locals>.<listcomp>r   r   )r[   r  r7   r   r   r\   rx   r   r   r   r   r   r   r   r   r<   act_max_steps	threshold)r   r<   modulerd   r  r   r7     s   


zBERTEncoderACT.__init__c                 C   s   | | j| | j S r   )ltr  __and__r  any)r   halting_probability	n_updatesr   r   r   should_continue  s
   
zBERTEncoderACT.should_continuec                 C   sJ  |g}|  \}}}t|| }t|| }t|| }	t| jD ]t}
t| j|
 |d}|	d
 }|||  | j
 | }|||  | j
 | }|||  }||d|   }|||  }|	| | }	|| ||  d}| ||}|| |d|   }|| | ||	s nq'|t|	| fS )NrN   r   rs   )r   r   r`   cudar   r  r   r   r   r  ri   gtr  ler   r   r  r  rj   )r   r   r   r  Z
batch_sizeZseq_lenZhdimr  Z
remaindersr  r   r   Zstill_runningZ
new_haltedZupdate_weightsZtransformed_statesr   r   r   rn     sH   
zBERTEncoderACT.forward)rT   rU   rV   r7   r  rn   rp   r   r   rd   r   r    s    r  c                       r   )
BERTPoolerc                    s.   t t|   t|j|j| _t | _d S r   )	r[   r  r7   r\   rx   r   r   ZTanh
activationr   rd   r   r   r7     s   zBERTPooler.__init__c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r  )r   r   Zfirst_token_tensorpooled_outputr   r   r   rn     s   

zBERTPooler.forwardro   r   r   rd   r   r    s    r  c                       s:   e Zd ZdZdef fddZ					d	ddZ  ZS )
	BertModela  BERT model ("Bidirectional Embedding Representations from a Transformer").

    Example:
        >>> # Already been converted into WordPiece token ids
        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])

        >>> config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
        >>>     num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

        >>> model = modeling.BertModel(config=config)
        >>> all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
    r<   c                    s   t t|   || _t|| _|j dkrt|| _	nK|j dkr)t
|| _	n>|j dkr6t
|| _	n1|j dkrCt|| _	n$|j dkr]ddlm}m}m} t||||| _	n
td|j t|| _d	S )
z]Constructor for BertModel.

        Args:
            config: `BertConfig` instance.
        r   r  r  actZtextnasr   )
input_dictop_dict	skip_dictz Not support transformer type: {}N)r[   r!  r7   r<   rr   r   r    r   r   encoderr	  r  Ztextnas_finalr#  r$  r%  ZTextNASEncoderr   r   r  pooler)r   r<   r#  r$  r%  rd   r   r   r7     s(   
zBertModel.__init__Nr   c                 C   s  |d u r	t |}|d u rt |}|dd}|jt|  jd}d| d }| |||\}}	| j	j
 dkrE| ||\}
}n| j	j
 dkrW| |}||g}
n| ||||}
|
d|	 |
d	 }| j	jsv| |}|
|fS |d d df }|
|fS )
Nrs   rN   r   r   g     r"  Zreformerr   r   )r   Z	ones_liker   r   r   r   r   r}   r   r<   r    r   r&  insertr5   r'  )r   r   r   r   r  r  r   Zextended_attention_maskZembedding_outputru   r  Zact_lossZsequence_outputr   r   r   r   rn     s<   





zBertModel.forward)NNr   NN)rT   rU   rV   rW   r   r7   rn   rp   r   r   rd   r   r!    s    r!  c                       s8   e Zd ZdZ fddZ							d	ddZ  ZS )
&BertForSequenceClassificationMultiTaska  BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.

    Example:
        >>> # Already been converted into WordPiece token ids
        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])

        >>> config = BertConfig(vocab_size=32000, hidden_size=512,
        >>>     num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

        >>> num_labels = 2

        >>> model = BertForSequenceClassification(config, num_labels)
        >>> logits = model(input_ids, token_type_ids, input_mask)
    c                    s   t t|   | dkrt | _n| dkrt | _ntd|t	
 j| _t	 | _|D ]}| jt	 jt| q4|| _ fdd}| | d S )NbertZlstmz%Only support lstm or bert, but got {}c                    sz   t | tjtjfr| jjjd jd nt | tr-| j	jjd jd | j
jjd jd t | tjr;| jj  d S d S )Nr   )rj   Zstd)
isinstancer\   rx   rt   weightr   Znormal_r+   rY   ra   r_   r   r   )r  r<   r   r   init_weightsm  s   
zEBertForSequenceClassificationMultiTask.__init__.<locals>.init_weights)r[   r)  r7   r   r!  r*  Z	LSTMModelr   r   r\   r{   r'   r|   r   
classifierr  rx   r   r   
label_listapply)r   r<   r0  Zcore_encoderlabelr.  rd   r-  r   r7   ^  s   
z/BertForSequenceClassificationMultiTask.__init__Nr   Fc                    s  |  ||||||\} |    fdd| jD }|d urtdd}tjdd}t|d}g }tt	||D ]B\}\}}t
| j| dkrP||| }n||d|}||kjt|  jd}|
d uro||
|  }t|| }|| q9|	st||fS t|||d fS |S )	Nc                    s   g | ]}| qS r   r   )r   r/  r   r   r   r     s    zBBertForSequenceClassificationMultiTask.forward.<locals>.<listcomp>none)Z	reductionrs   r   r   r   )r*  r|   r/  r   r\   ZMSELossr   Zunbindr   r   r   r0  r   r   r   r   r   r}   rj   r  r   )r   r   r   r   labelsZlabels_indexr  r  r   Zreturn_embeddingZloss_weightr  ZlogitsZloss_fctZregression_loss_fctZ
labels_lstZloss_lstr  r2  ZlogitZlossZlabels_maskr   r3  r   rn   }  s8   

z.BertForSequenceClassificationMultiTask.forward)NNr   NNFN)rT   rU   rV   rW   r7   rn   rp   r   r   rd   r   r)  J  s    #r)  )r   r   F)*rW   
__future__r   r   r   rI   r	   rD   numpynpr8   r   Ztorch.nnr\   Ztorch.nn.functionalZ
functionalFZtorch.utils.checkpointr   r   objectr   ModulerY   rr   r   r1   r   r   r   r   r   r   r   r   r   r   r	  r  r  r!  r)  r   r   r   r   <module>   sB   m:
2
J#133/[