o
    *jx.                     @   s   d dl Z d dlmZ d dlmZmZ d dlZd dlZd dlm	Z	 G dd de	j
ZG dd de	jZG d	d
 d
e	j
ZG dd de	j
ZG dd de	j
ZG dd de	j
ZdefddZej rddnddfdedeeejf fddZdS )    N)OrderedDict)TupleUnion)nnc                       s   e Zd Zdededeeeeeef ef dededededed	ed
ef fddZdd Zdd Ze	dd Z
dd Zdd Zdd Z  ZS )CLIP	embed_dimimage_resolutionvision_layersvision_widthvision_patch_sizecontext_length
vocab_sizetransformer_widthtransformer_headstransformer_layersc                    s   t    || _|d }t||||||d| _t||
|	|  d| _|| _t	
||| _t	t| j|| _t|| _t	t||| _t	tg td | _|   d S )N@   )input_resolution
patch_sizewidthlayersheads
output_dim)r   r   r   	attn_maskg$I$I,@)super__init__r   VisionTransformervisualTransformerbuild_attention_masktransformerr   r   Z	Embeddingtoken_embedding	Parametertorchemptypositional_embedding	LayerNormln_finaltext_projectiononesnploglogit_scaleinitialize_parameters)selfr   r   r	   r
   r   r   r   r   r   r   Zvision_heads	__class__ j/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/soonet/clip.pyr      s8   


zCLIP.__init__c                 C   s   t jj| jjdd t jj| jdd | jjd d| jj d  }| jjd }d| jj d }| jj	D ]-}t jj|j
j|d t jj|j
jj|d t jj|jjj|d t jj|jjj|d q3| jd urut jj| j| jjd d d S d S )Ng{Gz?)Zstdg{Gz?         )r   initZnormal_r    weightr$   r   r   r   	resblocksattnZin_proj_weightZout_projmlpc_fcc_projr'   )r-   Zproj_stdZattn_stdZfc_stdblockr0   r0   r1   r,   <   s"   


zCLIP.initialize_parametersc                 C   s,   t | j| j}|td |d |S )Nz-inf   )r"   r#   r   Zfill_floatZtriu_)r-   maskr0   r0   r1   r   N   s   
zCLIP.build_attention_maskc                 C   s   | j jjjS N)r   conv1r5   dtype)r-   r0   r0   r1   rA   V   s   z
CLIP.dtypec                 C   s   |  || jS r?   )r   typerA   )r-   imager0   r0   r1   encode_imageZ      zCLIP.encode_imagec                 C   s   |  || j}|| j| j }|ddd}| |}|ddd}| || j}|t|j	d |j
ddf }|S )Nr<   r   r3   dim)r    rB   rA   r$   permuter   r&   r"   ZarangeshapeZargmax)r-   textxr0   r0   r1   encode_text]   s   

 zCLIP.encode_textc                 C   sb   |  |}| |}||jddd }||jddd }| j }|| |  }| }||fS )Nr<   T)rH   Zkeepdim)rD   rM   Znormr+   expt)r-   rC   rK   Zimage_featuresZtext_featuresr+   Zlogits_per_imageZlogits_per_textr0   r0   r1   forwardm   s   


zCLIP.forward)__name__
__module____qualname__intr   r   r   r,   r   propertyrA   rD   rM   rP   __classcell__r0   r0   r.   r1   r      s:    	
-
r   c                       s(   e Zd ZdZdejf fddZ  ZS )r%   z*Subclass torch's LayerNorm to handle fp16.rL   c                    s$   |j }t |tj}||S r?   )rA   r   rP   rB   r"   Zfloat32)r-   rL   	orig_typeretr.   r0   r1   rP      s   
zLayerNorm.forward)rQ   rR   rS   __doc__r"   TensorrP   rV   r0   r0   r.   r1   r%      s    r%   c                   @   s   e Zd ZdejfddZdS )	QuickGELUrL   c                 C   s   |t d|  S )NgZd;?)r"   Zsigmoidr-   rL   r0   r0   r1   rP      rE   zQuickGELU.forwardN)rQ   rR   rS   r"   rZ   rP   r0   r0   r0   r1   r[      s    r[   c                       sP   e Zd Z	ddededejf fddZdejfdd	Zdejfd
dZ  Z	S )ResidualAttentionBlockNd_modeln_headr   c              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )Nr9      Zgelur:   )r   r   r   ZMultiheadAttentionr7   r%   ln_1
Sequentialr   ZLinearr[   r8   ln_2r   )r-   r^   r_   r   r.   r0   r1   r      s   



zResidualAttentionBlock.__init__rL   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )NrA   deviceF)Zneed_weightsr   r   )r   torA   re   r7   r\   r0   r0   r1   	attention   s   
z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S r?   )rg   ra   r8   rc   r\   r0   r0   r1   rP      s   zResidualAttentionBlock.forwardr?   )
rQ   rR   rS   rT   r"   rZ   r   rg   rP   rV   r0   r0   r.   r1   r]      s    r]   c                	       sD   e Zd Z	ddedededejf fddZdejfd	d
Z  ZS )r   Nr   r   r   r   c                    s<   t    | _|| _tj fddt|D  | _d S )Nc                    s   g | ]}t  qS r0   )r]   ).0_r   r   r   r0   r1   
<listcomp>   s    
z(Transformer.__init__.<locals>.<listcomp>)r   r   r   r   r   rb   ranger6   )r-   r   r   r   r   r.   rj   r1   r      s   
zTransformer.__init__rL   c                 C   s
   |  |S r?   )r6   r\   r0   r0   r1   rP      s   
zTransformer.forwardr?   )	rQ   rR   rS   rT   r"   rZ   r   rP   rV   r0   r0   r.   r1   r      s    r   c                       sF   e Zd Zdedededededef fddZd	ejfd
dZ  ZS )r   r   r   r   r   r   r   c                    s   t    || _|| _tjd|||dd| _|d }t|t	| | _
t|t	|| d d | | _t|| _t|||| _t|| _t|t	|| | _d S )N   F)Zin_channelsZout_channelsZkernel_sizeZstrideZbiasr2   r3   r<   )r   r   r   r   r   ZConv2dr@   r!   r"   Zrandnclass_embeddingr$   r%   ln_prer   r   ln_postproj)r-   r   r   r   r   r   r   scaler.   r0   r1   r      s&   




zVisionTransformer.__init__rL   c                 C   s   |  |}||jd |jd d}|ddd}| j|jtj|jd d|jd |j|j	d }tj
||gdd}|| j|j }| |}|ddd}| |}|ddd}| |d d dd d f }| jd urt|| j }|S )Nr   r<   rF   r3   rd   rG   )r@   ZreshaperJ   rI   rn   rf   rA   r"   Zzerosre   catr$   ro   r   rp   rq   )r-   rL   Zclass_tokenr0   r0   r1   rP      s$   




zVisionTransformer.forward)	rQ   rR   rS   rT   r   r"   rZ   rP   rV   r0   r0   r.   r1   r      s    r   
state_dictc                 C   s   | d j d }tdd |  D }| d j d }t| d j d d d }|| }| d	 j d }| d
 j d }| d j d }| d j d }	|	d }
ttdd | D }t||||||||	|
|
}dD ]	}|| v rm| |= qd||  | S )Nzvisual.conv1.weightr   c                 S   s$   g | ]}| d r|dr|qS )zvisual.z.attn.in_proj_weight)
startswithendswithrh   kr0   r0   r1   rk      s    zbuild_model.<locals>.<listcomp>rF   zvisual.positional_embeddingr<   g      ?r'   r$   ztoken_embedding.weightzln_final.weightr   c                 s   s(    | ]}| d r|dd V  qdS )ztransformer.resblocks.r3   N)ru   splitrw   r0   r0   r1   	<genexpr>   s    
zbuild_model.<locals>.<genexpr>)r   r   r   )rJ   lenkeysroundsetr   Zload_state_dicteval)rt   r
   r	   r   Z	grid_sizer   r   r   r   r   r   r   modelkeyr0   r0   r1   build_model   s:   

r   cudacpuTnamere   c                    sl  d}| }zt jj||r ndd }d }W n ty3   |r*td| d d}t j|dd}Y nw |sMt|p<| 	 }t
 dkrK|  |S t jj fddg d}d	d
 |jdD d fdd}|| ||j ||j t
 dkrt jjdd g d}t|jd d }	|	 fdd}
||
 |
|j |
|j |  |S )NFr   )Zmap_locationzFile z6 is not a JIT archive. Loading as a state dict insteadc                      s   t g t  S r?   )r"   r(   rf   re   r0   )re   r0   r1   <lambda>)  s    zload_clip.<locals>.<lambda>)Zexample_inputsc                 S   s   g | ]
}d t |v r|qS )ZDevice)repr)rh   nr0   r0   r1   rk   *  s
    zload_clip.<locals>.<listcomp>prim::ConstantrF   c                    st   t | dr	| jgng }t | dr|| jj |D ]}|dD ]}d| v r6t|d dr6|  q qd S )Ngraphforward1r   valuer   )	hasattrr   appendr   findAllNodesZattributeNamesstrru   copyAttributes)modulegraphsr   node)device_noder0   r1   patch_device/  s   

zload_clip.<locals>.patch_devicec                   S   s   t g  S r?   )r"   r(   r=   r0   r0   r0   r1   r   @  s    aten::tor<   c                    s   t | dr	| jgng }t | dr|| jj |D ](}|dD ] }t| }dD ]}||  d dkr?||    q*q qd S )Nr   r   r   )r<   r3   r      )	r   r   r   r   r   listinputsr   r   )r   r   r   r   r   i)
float_noder0   r1   patch_floatD  s   
zload_clip.<locals>.patch_float)r"   jitloadr   RuntimeErrorwarningswarnr   rt   rf   r   r=   tracer   r   applyrD   rM   r   ZfindNoder   r   )r   re   r   Z
model_pathr   rt   Zdevice_holderr   Zfloat_holderZfloat_inputr   r0   )re   r   r   r1   	load_clip  s\   







r   )r   collectionsr   typingr   r   numpyr)   r"   r   Moduler   r%   r[   r]   r   r   dictr   r   Zis_availabler   re   r   r0   r0   r0   r1   <module>   s(   r	0#