o
    *j@                     @   s  d dl mZ d dlmZmZ d dlZd dlm  mZ	 d dl
m  mZ d dlmZ G dd dejZG dd dejZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdd ZdS )    )OrderedDict)TupleUnionN)nnc                   @   s   e Zd ZdejfddZdS )	QuickGELUxc                 C   s   |t d|  S )NgZd;?)torchZsigmoidselfr    r   n/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/mplug/clip/clip.pyforward      zQuickGELU.forwardN)__name__
__module____qualname__r   Tensorr   r   r   r   r   r      s    r   c                       sP   e Zd Z	ddededejf fddZdejfdd	Zdejfd
dZ  Z	S )ResidualAttentionBlockNd_modeln_head	attn_maskc              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )Nc_fc   Zgeluc_proj)super__init__r   ZMultiheadAttentionattn	LayerNormln_1
Sequentialr   Linearr   mlpln_2r   )r
   r   r   r   	__class__r   r   r      s   



zResidualAttentionBlock.__init__r   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )NdtypedeviceF)need_weightsr   r   )r   tor&   r'   r   r	   r   r   r   	attention"   s   
z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S N)r*   r   r!   r"   r	   r   r   r   r   )   s   zResidualAttentionBlock.forwardr+   )
r   r   r   intr   r   r   r*   r   __classcell__r   r   r#   r   r      s    r   c                       sJ   e Zd Z		ddedededejdef
 fdd	Zd
ejfddZ  Z	S )TransformerNTwidthlayersheadsr   use_grad_ckpc                    sB   t    | _|| _tj fddt|D  | _|| _d S )Nc                    s   g | ]}t  qS r   )r   ).0_r   r1   r/   r   r   
<listcomp>:   s    
z(Transformer.__init__.<locals>.<listcomp>)	r   r   r/   r0   r   r   range	resblocksr2   )r
   r/   r0   r1   r   r2   r#   r5   r   r   1   s   


zTransformer.__init__r   c                 C   s,   | j r| jD ]}t||}q|S | |S r+   )r2   r8   
checkpoint)r
   r   Z
each_blockr   r   r   r   @   s
   

zTransformer.forward)NT)
r   r   r   r,   r   r   boolr   r   r-   r   r   r#   r   r.   /   s    r.   c                       s2   e Zd ZdZd fdd	ZdejfddZ  ZS )	
Bottleneckr      c                    s  t    tj||ddd| _t|| _tj||dddd| _t|| _|dkr/t	|nt
 | _tj||| j ddd| _t|| j | _tjdd| _d | _|| _|dksb||tj krttdt	|fd	tj||| j dddd
fdt|| j fg| _d S d S )Nr<   F)bias   )paddingr=   TZinplacez-10)strider=   1)r   r   r   Conv2dconv1BatchNorm2dbn1conv2bn2	AvgPool2dZIdentityavgpool	expansionconv3bn3ReLUrelu
downsamplerB   r;   r   r   )r
   ZinplanesplanesrB   r#   r   r   r   L   s6   

zBottleneck.__init__r   c                 C   st   |}|  | | |}|  | | |}| |}| | |}| jd ur/| |}||7 }|  |}|S r+   )	rP   rG   rE   rI   rH   rK   rN   rM   rQ   )r
   r   identityoutr   r   r   r   l   s   



zBottleneck.forwardr<   )	r   r   r   rL   r   r   r   r   r-   r   r   r#   r   r;   I   s     r;   c                	       s:   e Zd Z	d
dedededef fddZdd	 Z  ZS )AttentionPool2dNspacial_dim	embed_dim	num_heads
output_dimc                    st   t    tt|d d ||d  | _t||| _t||| _	t||| _
t||p2|| _|| _d S )N   r<   g      ?)r   r   r   	Parameterr   randnpositional_embeddingr    k_projq_projv_projr   rY   )r
   rW   rX   rY   rZ   r#   r   r   r   ~   s   

zAttentionPool2d.__init__c              	   C   sD  | |jd |jd |jd |jd  ddd}tj|jddd|gdd}|| jd d d d d f |j }| j	r@d}nd	}t
jdi d
|d|d|d|jd d| jd| jjd| jjd| jjdd dt| jj| jj| jjgdd dd ddd|d| jjd| jjddd| j	dd\}}|d S ) Nr   r<   r[   r>   TdimZkeepdimrc   g?g        querykeyvalueZembed_dim_to_checkrY   Zq_proj_weightZk_proj_weightZv_proj_weightin_proj_weightZin_proj_biasZbias_kZbias_vZadd_zero_attnFZ	dropout_pZout_proj_weightZout_proj_biasZuse_separate_proj_weighttrainingr(   r   )reshapeshapepermuter   catmeanr^   r)   r&   rj   FZmulti_head_attention_forwardrY   r`   weightr_   ra   r=   r   )r
   r   Zdropoutr4   r   r   r   r      sl   $

	
zAttentionPool2d.forwardr+   )r   r   r   r,   r   r   r-   r   r   r#   r   rV   |   s    rV   c                       s:   e Zd ZdZ		d fdd	ZdddZdd
dZ  ZS )ModifiedResNeta  
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
       @   c                    s6  t    || _|| _tjd|d ddddd| _t|d | _tj|d |d dddd| _	t|d | _
tj|d |dddd| _t|| _td| _tjdd| _|| _| ||d	 | _| j|d |d dd
| _| j|d |d dd
| _| j|d |d dd
| _|d }t|d |||| _d S )Nr>   r[   r<   F)kernel_sizerB   r?   r=   )ru   r?   r=   Tr@   r   )rB   r          )r   r   rZ   input_resolutionr   rD   rE   rF   rG   rH   rI   rM   rN   rJ   rK   rO   rP   	_inplanes_make_layerlayer1layer2layer3layer4rV   attnpool)r
   r0   rZ   r1   rx   r/   rX   r#   r   r   r      s4   


zModifiedResNet.__init__r<   c                 C   sH   t | j||g}|t j | _td|D ]}|t | j| qtj| S )Nr<   )r;   ry   rL   r7   appendr   r   )r
   rR   blocksrB   r0   r4   r   r   r   rz      s
   
zModifiedResNet._make_layerFc                    s^    fdd}|  jjj}||} |} |} |} |}|s- |}|S )Nc                    sL    j  jf j jf j jffD ]\}} ||| } q | } | S r+   )rE   rG   rH   rI   rM   rN   rP   rK   )r   convZbnr
   r   r   stem   s   

z$ModifiedResNet.forward.<locals>.stem)	typerE   rq   r&   r{   r|   r}   r~   r   )r
   r   skip_last_layerr   r   r   r   r      s   




zModifiedResNet.forward)rs   rt   rU   )F)r   r   r   __doc__r   rz   r   r-   r   r   r#   r   rr      s    
"	rr   c                       s(   e Zd ZdZdejf fddZ  ZS )r   z*Subclass torch's LayerNorm to handle fp16.r   c                    s   |j }t |}||S r+   )r&   r   r   r   )r
   r   	orig_typeretr#   r   r   r      s   
zLayerNorm.forward)r   r   r   r   r   r   r   r-   r   r   r#   r   r      s    r   c                       sN   e Zd Zdedededededef fddZ			
	
ddejfddZ  ZS )VisualTransformerrx   
patch_sizer/   r0   r1   rZ   c                    s   t    || _|| _|| _tjd|||dd| _|d }t|t	
| | _t|t	
|| d d | | _t|| _t|||| _t|| _t|t	
|| | _d S )Nr>   F)Zin_channelsZout_channelsru   rB   r=         r[   r<   )r   r   rx   rZ   r1   r   rD   rE   r\   r   r]   class_embeddingr^   r   ln_prer.   transformerln_postproj)r
   rx   r   r/   r0   r1   rZ   scaler#   r   r   r     s(   




zVisualTransformer.__init__FNr   c                 C   s   |  |}||jd |jd d}|ddd}| j|j}tj|jd d|jd |j|j	d}tj
|| |gdd}|| j|jd |dd d f  }| |}|ddd}| |}|ddd}|rr| |}|S || j }|S )Nr   r<   rh   r[   r%   rd   )rE   rk   rl   rm   r   r)   r&   r   Zzerosr'   rn   r^   sizer   r   r   r   )r
   r   r   Ztext_embeddingZ	text_maskZcls_embZx_zerosr   r   r   r     s,   
(



zVisualTransformer.forward)FNN)	r   r   r   r,   r   r   r   r   r-   r   r   r#   r   r      s    r   c                       s   e Zd Zdededeeeeeef ef dededededed	ed
ef fddZdd Zdd Ze	dd Z
dd Zdd Zdd Z  ZS )CLIPrX   image_resolutionvision_layersvision_widthvision_patch_sizecontext_length
vocab_sizetransformer_widthtransformer_headstransformer_layersc                    s   t    || _t|ttfr |d d }t|||||d| _n|d }t||||||d| _t	||
|	| 
 d| _|| _t||| _tt| j|| _t|| _tt||| _ttg | _|   d S )Nrw   rt   )r0   rZ   r1   rx   r/   )rx   r   r/   r0   r1   rZ   )r/   r0   r1   r   )r   r   r   
isinstancetuplelistrr   visualr   r.   build_attention_maskr   r   r   Z	Embeddingtoken_embeddingr\   r   emptyr^   r   ln_finaltext_projectionZoneslogit_scaleinitialize_parameters)r
   rX   r   r   r   r   r   r   r   r   r   Zvision_headsr#   r   r   r   =  sJ   



zCLIP.__init__c           	      C   s  t jj| jjdd t jj| jdd t| jtr|| jj	d urW| jj	j
jd }t jj| jj	jj|d t jj| jj	jj|d t jj| jj	jj|d t jj| jj	j
j|d | jj| jj| jj| jjfD ]}| D ]\}}|drzt j| qkqe| jjd d| jj d  }| jjd }d| jj d }| jjD ]-}t jj|jj|d t jj|jjj|d t jj|jjj|d t jj|jj
j|d q| jd urt jj| j| jjd d d S d S )Ng{Gz?)stdg{Gz?r   z
bn3.weightr[   ) r   initZnormal_r   rq   r^   r   r   rr   r   r   Zin_featuresr`   r_   ra   r{   r|   r}   r~   Znamed_parametersendswithZzeros_r   r/   r0   r8   r   ri   Zout_projr!   r   r   )	r
   r   Zresnet_blocknameparamZproj_stdZattn_stdZfc_stdblockr   r   r   r   s  s@   



zCLIP.initialize_parametersc                 C   s,   t | j| j}|td |d |S )Nz-infr<   )r   r   r   Zfill_floatZtriu_)r
   maskr   r   r   r     s   
zCLIP.build_attention_maskc                 C   s   | j jjjS r+   )r   rE   rq   r&   r   r   r   r   r&     s   z
CLIP.dtypec                 C   s   |  || jS r+   )r   r   r&   )r
   imager   r   r   encode_image  r   zCLIP.encode_imagec                 C   s   |  || j}|| j| j }|ddd}| |}|ddd}| || j}|t|j	d |j
ddf | j }|S )Nr<   r   r[   rh   rd   )r   r   r&   r^   rm   r   r   r   Zarangerl   Zargmaxr   )r
   textr   r   r   r   encode_text  s   


zCLIP.encode_textc                 C   sj   |  |}| |}||jddd }||jddd }| j }|| |  }|| |  }||fS )Nrh   Trb   )r   r   Znormr   expt)r
   r   r   Zimage_featuresZtext_featuresr   Zlogits_per_imageZlogits_per_textr   r   r   r     s   


zCLIP.forward)r   r   r   r,   r   r   r   r   r   propertyr&   r   r   r   r-   r   r   r#   r   r   ;  s:    	
6"
r   c                 C   s.   t | j| j| j| j| j| j| j| j| j	| j

S r+   )r   Zclip_embed_dimZclip_image_resolutionZclip_vision_layersZclip_vision_widthZclip_vision_patch_sizeZclip_context_lengthZclip_vocab_sizeZclip_transformer_widthZclip_transformer_headsZclip_transformer_layers)configr   r   r   load_from_config  s   
r   )collectionsr   typingr   r   r   Ztorch.nn.functionalr   Z
functionalrp   Ztorch.utils.checkpointutilsr9   Moduler   r   r.   r;   rV   rr   r   r   r   r   r   r   r   r   <module>   s"   33H	; 