o
    *j-                     @   s0  d dl mZ d dlmZmZ d dlZd dlZd dlm	  m
Z d dlm  mZ d dlm	Z	 d dlmZmZ d dlmZ G dd de	jZG d	d
 d
e	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZdS )    )OrderedDict)TupleUnionN)nn)
BertConfigBertForMaskedLM)compatible_position_idsc                       s(   e Zd ZdZdejf fddZ  ZS )	LayerNormz*Subclass torch's LayerNorm to handle fp16.xc                    s$   |j }t |tj}||S N)dtypesuperforwardtypetorchZfloat32)selfr
   	orig_typeret	__class__ i/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/team/utils.pyr      s   
zLayerNorm.forward)__name__
__module____qualname____doc__r   Tensorr   __classcell__r   r   r   r   r	      s    r	   c                   @   s   e Zd ZdejfddZdS )	QuickGELUr
   c                 C   s   |t d|  S )NgZd;?)r   Zsigmoidr   r
   r   r   r   r   !   s   zQuickGELU.forwardN)r   r   r   r   r   r   r   r   r   r   r      s    r   c                       sP   e Zd Z	ddededejf fddZdejfdd	Zdejfd
dZ  Z	S )ResidualAttentionBlockNd_modeln_head	attn_maskc              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )NZc_fc   ZgeluZc_proj)r   __init__r   MultiheadAttentionattnr	   ln_1
Sequentialr   Linearr   mlpln_2r#   )r   r!   r"   r#   r   r   r   r%   '   s   



zResidualAttentionBlock.__init__r
   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )Nr   deviceF)Zneed_weightsr#   r   )r#   tor   r.   r'   r   r   r   r   	attention6   s   
z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S r   )r0   r(   r+   r,   r   r   r   r   r   =   s   zResidualAttentionBlock.forwardr   )
r   r   r   intr   r   r%   r0   r   r   r   r   r   r   r    %   s    r    c                	       sF   e Zd Z		ddedededejf fddZd	ejfd
dZ  ZS )TransformerNFwidthlayersheadsr#   c                    sB   t    || _| _|| _tj fddt|D  | _d S )Nc                    s   g | ]}t  qS r   )r    ).0_r#   r5   r3   r   r   
<listcomp>O   s    
z(Transformer.__init__.<locals>.<listcomp>)	r   r%   use_gcr3   r4   r   r)   range	resblocks)r   r3   r4   r5   r#   r:   r   r8   r   r%   E   s   
zTransformer.__init__r
   c                 C   s,   | j r| jD ]}t||}q|S | |S r   )r:   r<   
checkpoint)r   r
   Z
each_blockr   r   r   r   T   s
   

zTransformer.forward)NF)	r   r   r   r1   r   r   r%   r   r   r   r   r   r   r2   C   s    r2   c                       sJ   e Zd Z	ddedededededef fdd	Zd
ejfddZ  ZS )VisionTransformerFinput_resolution
patch_sizer3   r4   r5   
output_dimc           	         s   t    || _|| _tjd|||dd| _|d }t|t	| | _
t|t	|| d d | | _t|| _t||||d| _t|| _t|t	|| | _d S )N   F)Zin_channelsZout_channelsZkernel_sizeZstridebiasg            )r:   )r   r%   r?   rA   r   ZConv2dconv1	Parameterr   Zrandnclass_embeddingpositional_embeddingr	   ln_prer2   transformerln_postproj)	r   r?   r@   r3   r4   r5   rA   r:   scaler   r   r   r%   _   s&   




zVisionTransformer.__init__r
   c                 C   s   |  |}||jd |jd d}|ddd}| j|jtj|jd d|jd |j|j	d }tj
||gdd}|| j|j }| |}|ddd}| |}|ddd}| |d d dd d f }| jd urt|| j }|S Nr   rE   rD   r-   dim)rF   reshapeshapepermuterH   r/   r   r   zerosr.   catrI   rJ   rK   rL   rM   )r   r
   rH   r   r   r   r   |   s(   
"




zVisionTransformer.forward)F)	r   r   r   r1   r%   r   r   r   r   r   r   r   r   r>   ]   s     	r>   c                       $   e Zd Z fddZdd Z  ZS )CLIPVisionWrapperc                    s$   t    tddddddd| _d S )N                  )r?   r@   r3   r4   r5   rA   )r   r%   r>   vision_transformer)r   r   r   r   r%      s   
zCLIPVisionWrapper.__init__c                 C   s  | j |}||jd |jd d}|ddd}| j j|jtj	|jd d|jd |j|j
d }tj||gdd}|| j j|j }| j |}|ddd}| j |}|ddd}| }| j |d d dd d f }| j jd ur|| j j }||fS rO   )r`   rF   rS   rT   rU   rH   r/   r   r   rV   r.   rW   rI   rJ   rK   clonerL   rM   )r   r
   rH   Zx_tensorr   r   r   r      s*   "
zCLIPVisionWrapper.forwardr   r   r   r%   r   r   r   r   r   r   rY      s    
rY   c                       rX   )BertWrapperc                    sH   t t|   t|}t|j| _tjd|dd| _	td|| _
d S )Nr_   F)rC   )r   rc   r%   r   Zfrom_json_filer   bertr   r*   	projectorprojector_token_embeds)r   Zconfig_jsonfeat_dimZ	token_dimZbert_configr   r   r   r%      s
   
zBertWrapper.__init__c                 C   sT   ||d}| j di |ddi}|d }|d d dd d f }| || |fS )N)	input_idsattention_maskZreturn_dictFr   r   )rd   re   rf   )r   rh   ri   Ztrans_featuresZoutput_statesZoutput_tokensZ
cls_tokensr   r   r   r      s   zBertWrapper.forwardrb   r   r   r   r   rc      s    rc   c                       s0   e Zd Zddejdf fdd	Zdd Z  ZS )MlpNg        c                    sN   t    |p|}|p|}t||| _| | _t||| _t|| _d S r   )	r   r%   r   r*   fc1actfc2Dropoutdrop)r   in_featureshidden_featuresZout_featuresZ	act_layerro   r   r   r   r%      s   
zMlp.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S r   )rk   rl   ro   rm   r   r   r   r   r      s   




zMlp.forward)r   r   r   r   ZGELUr%   r   r   r   r   r   r   rj      s    rj   c                       rX   )
CrossLayerc                    s   t t|   t|| _t|| _t|| _tj|dd| _	tj|dd| _
t||| dd| _td| _td| _td| _d S )Nr^   )Z	embed_dimZ	num_headsg?)rp   rq   ro   )r   rr   r%   r   r	   norm1norm2norm3r&   	self_attn
cross_attnrj   ffnrn   dropout1dropout2dropout3)r   rg   	mlp_ratior   r   r   r%      s$   zCrossLayer.__init__c              	   C   s   |  |}| j|| ddd|| ddd|ddd|dkdd ddd}|| | }| |}| || ddd|ddd|dddd ddd}|| | }| |}|| | 	| }|S )NrE   r   rD   )Zkey_padding_mask)
rs   rv   rU   ry   rt   rw   rz   ru   r{   rx   )r   text_tensorsZ
text_masksimage_tensorsretrieved_tensorsZretrieved_tensors_resr   r   r   r      s@   




zCrossLayer.forwardrb   r   r   r   r   rr      s    rr   c                       s.   e Zd Z fddZdddZdd Z  ZS )	TEAMc                    st   t t|   || _|| _ttdddg| _t	dd| _
t	dd| _t|d}t|d | j|dd d S )	Nr\   rD   )rg   r|   r_   cpuz'text_model.bert.embeddings.position_idsT)strict)r   r   r%   
text_modelimage_modelr   Z
ModuleListrr   cross_modelr*   image_tensor_fctext_tensor_fcr   loadr   Zload_state_dict)r   r   r   Z
pretrainedparamsr   r   r   r%     s   zTEAM.__init__Nc                 C   sn   |d ur|  ||\}}tj|ddd}nd\}}|d ur-| |\}}tj|ddd}nd\}}||||fS )N       @rE   prR   )NN)r   F	normalizer   )r   Z	text_data	text_maskZ
img_tensorZtext_featurer}   Zimage_featurer~   r   r   r   get_feature,  s   zTEAM.get_featurec                 C   s   t |}g }| |}||j}| jD ]=}|||||}| |}	t jtj	|	dddtj	|ddd dd}
t j|
| ddt j
t j|dddd }|| q|S )Nr   rD   r   rQ   rE   g      ?)min)r   Z
zeros_liker   r   r   r   r   sumr   r   clampappend)r   r}   r   r~   r   Zpair_score_listZtext_tensors_projZtext_mask_floatZeach_cross_modelZretrieved_tensors_projZ
pair_scoreZpair_score_reducedr   r   r   get_cross_score;  s2   



zTEAM.get_cross_score)NNN)r   r   r   r%   r   r   r   r   r   r   r   r     s    
r   )collectionsr   typingr   r   numpynpr   Ztorch.nn.functionalr   Z
functionalr   Ztorch.utils.checkpointutilsr=   Ztransformersr   r   Z-modelscope.utils.compatible_with_transformersr   r	   Moduler   r    r2   r>   rY   rc   rj   rr   r   r   r   r   r   <module>   s&   	7%0