o
    *jnW                     @   s  d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	Z
d dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZm Z  d dl!m"Z" e" Z#dgZ$G dd dej%Z&G dd dej%Z'G dd dej%Z(G dd dej)Z)G dd dej%Z*G dd dej%Z+G dd dej%Z,G dd dej%Z-G dd dej%Z.dd  Z/d!ej%fd"d#Z0ej1e j2ej3d$G d%d deZ4dS )&    N)OrderedDict)AnyDictTupleUnion)Models)
TorchModel)MODELS)FullTokenizer)
BertConfig)	BertModel)ModeKeys	ModelFileTasks)
get_loggerCLIPForMultiModalEmbeddingc                       s2   e Zd ZdZd fdd	ZdejfddZ  ZS )	
Bottleneck      c                    s  t    tj||ddd| _t|| _tj||dddd| _t|| _|dkr/t	|nt
 | _tj||| j ddd| _t|| j | _tjdd| _d | _|| _|dksb||tj krttdt	|fd	tj||| j dddd
fdt|| j fg| _d S d S )Nr   F)bias   )paddingr   TZinplacez-10)strider   1)super__init__nnConv2dconv1BatchNorm2dbn1conv2bn2	AvgPool2dZIdentityavgpool	expansionconv3bn3ReLUrelu
downsampler   r   
Sequentialr   )selfZinplanesplanesr   	__class__ i/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/clip/model.pyr   +   s6   

zBottleneck.__init__xc                 C   st   |}|  | | |}|  | | |}| |}| | |}| jd ur/| |}||7 }|  |}|S N)	r+   r"   r    r$   r#   r&   r)   r(   r,   )r.   r4   identityoutr2   r2   r3   forwardK   s   



zBottleneck.forwardr   )	__name__
__module____qualname__r'   r   torchTensorr8   __classcell__r2   r2   r0   r3   r   (   s     r   c                	       s:   e Zd Z	d
dedededef fddZdd	 Z  ZS )AttentionPool2dNspacial_dim	embed_dim	num_heads
output_dimc                    st   t    tt|d d ||d  | _t||| _t||| _	t||| _
t||p2|| _|| _d S )N   r   g      ?)r   r   r   	Parameterr=   randnpositional_embeddingLineark_projq_projv_projc_projrC   )r.   rA   rB   rC   rD   r0   r2   r3   r   ]   s   

zAttentionPool2d.__init__c              	   C   s4  | |jd |jd |jd |jd  ddd}tj|jddd|gdd}|| jd d d d d f |j }t	j
di d|d	|d
|d|jd d| jd| jjd| jjd| jjdd dt| jj| jj| jjgdd dd ddddd| jjd| jjddd| jdd\}}|d S )Nr   r   rE   r   TdimZkeepdimrO   querykeyvalueZembed_dim_to_checkrC   Zq_proj_weightZk_proj_weightZv_proj_weightZin_proj_weightin_proj_biasbias_kbias_vZadd_zero_attnFZ	dropout_pZout_proj_weightZout_proj_biasZuse_separate_proj_weighttrainingneed_weightsr2   )reshapeshapepermuter=   catmeanrH   todtypeFZmulti_head_attention_forwardrC   rK   weightrJ   rL   r   rM   rX   )r.   r4   _r2   r2   r3   r8   k   sf   $

	
zAttentionPool2d.forwardr5   )r:   r;   r<   intr   r8   r?   r2   r2   r0   r3   r@   [   s    r@   c                       s8   e Zd ZdZ		d fdd	ZdddZd	d
 Z  ZS )ModifiedResNeta  
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
       @   c                    s6  t    || _|| _tjd|d ddddd| _t|d | _tj|d |d dddd| _	t|d | _
tj|d |dddd| _t|| _td| _tjdd| _|| _| ||d	 | _| j|d |d dd
| _| j|d |d dd
| _| j|d |d dd
| _|d }t|d |||| _d S )Nr   rE   r   F)kernel_sizer   r   r   )rh   r   r   Tr   r   )r   r          )r   r   rD   input_resolutionr   r   r    r!   r"   r#   r$   r(   r)   r%   r&   r*   r+   	_inplanes_make_layerlayer1layer2layer3layer4r@   attnpool)r.   layersrD   headsrk   widthrB   r0   r2   r3   r      s4   


zModifiedResNet.__init__r   c                 C   sH   t | j||g}|t j | _td|D ]}|t | j| qtj| S )Nr   )r   rl   r'   rangeappendr   r-   )r.   r/   blocksr   rs   rc   r2   r2   r3   rm      s
   
zModifiedResNet._make_layerc                    sZ    fdd}|  jjj}||} |} |} |} |} |}|S )Nc                    sL    j  jf j jf j jffD ]\}} ||| } q | } | S r5   )r    r"   r#   r$   r(   r)   r+   r&   )r4   convZbnr.   r2   r3   stem   s   

z$ModifiedResNet.forward.<locals>.stem)	typer    rb   r`   rn   ro   rp   rq   rr   )r.   r4   r{   r2   rz   r3   r8      s   




zModifiedResNet.forward)rf   rg   r9   )r:   r;   r<   __doc__r   rm   r8   r?   r2   r2   r0   r3   re      s    
"	re   c                       s(   e Zd ZdZdejf fddZ  ZS )	LayerNormz*Subclass torch's LayerNorm to handle fp16.r4   c                    s$   |j }t |tj}||S r5   )r`   r   r8   r|   r=   Zfloat32)r.   r4   	orig_typeretr0   r2   r3   r8      s   
zLayerNorm.forward)r:   r;   r<   r}   r=   r>   r8   r?   r2   r2   r0   r3   r~      s    r~   c                   @   s   e Zd ZdejfddZdS )	QuickGELUr4   c                 C   s   |t d|  S )NgZd;?)r=   Zsigmoidr.   r4   r2   r2   r3   r8         zQuickGELU.forwardN)r:   r;   r<   r=   r>   r8   r2   r2   r2   r3   r      s    r   c                       sP   e Zd Z	ddededejf fddZdejfdd	Zdejfd
dZ  Z	S )ResidualAttentionBlockNd_modeln_head	attn_maskc              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )NZc_fcr   ZgelurM   )r   r   r   MultiheadAttentionattnr~   ln_1r-   r   rI   r   mlpln_2r   )r.   r   r   r   r0   r2   r3   r      s   



zResidualAttentionBlock.__init__r4   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )Nr`   deviceF)rY   r   r   )r   r_   r`   r   r   r   r2   r2   r3   	attention   s   
z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S r5   )r   r   r   r   r   r2   r2   r3   r8      s   zResidualAttentionBlock.forwardr5   )
r:   r;   r<   rd   r=   r>   r   r   r8   r?   r2   r2   r0   r3   r      s    r   c                	       sD   e Zd Z	ddedededejf fddZdejfd	d
Z  ZS )TransformerNru   rs   rt   r   c                    s<   t    | _|| _tj fddt|D  | _d S )Nc                    s   g | ]}t  qS r2   )r   ).0rc   r   rt   ru   r2   r3   
<listcomp>  s    
z(Transformer.__init__.<locals>.<listcomp>)r   r   ru   rs   r   r-   rv   	resblocks)r.   ru   rs   rt   r   r0   r   r3   r      s   
zTransformer.__init__r4   c                 C   s
   |  |S r5   )r   r   r2   r2   r3   r8     s   
zTransformer.forwardr5   )	r:   r;   r<   rd   r=   r>   r   r8   r?   r2   r2   r0   r3   r      s    r   c                       sF   e Zd Zdedededededef fddZd	ejfd
dZ  ZS )VisualTransformerrk   
patch_sizeru   rs   rt   rD   c                    s   t    || _|| _tjd|||dd| _|d }t|t	| | _
t|t	|| d d | | _t|| _t|||| _t|| _t|t	|| | _d S )Nr   F)Zin_channelsZout_channelsrh   r   r         rE   r   )r   r   rk   rD   r   r   r    rF   r=   rG   class_embeddingrH   r~   ln_prer   transformerln_postproj)r.   rk   r   ru   rs   rt   rD   scaler0   r2   r3   r     s&   




zVisualTransformer.__init__r4   c              	   C   s   |  |}||jd |jd d}|ddd}tj| j|jtj	|jd d|jd |j|j
d |gdd}|| j|j }| |}|ddd}| |}|ddd}| |d d dd d f }| jd urr|| j }|S )Nr   r   rT   rE   r   rP   )r    rZ   r[   r\   r=   r]   r   r_   r`   Zzerosr   rH   r   r   r   r   r   r2   r2   r3   r8   *  s4   
	



zVisualTransformer.forward)	r:   r;   r<   rd   r   r=   r>   r8   r?   r2   r2   r0   r3   r     s    r   c                %       s   e Zd Z	d"dededeeeeeef ef dedededed	ed
edededededededededef$ fddZ	dd Z
edd Zdd Zdd Zdd Zd d! Z  ZS )#CLIPrg   rB   image_resolutionvision_layersvision_widthvision_patch_size
vocab_size!text_attention_probs_dropout_probtext_hidden_acttext_hidden_dropout_probtext_hidden_sizetext_initializer_rangetext_intermediate_sizetext_max_position_embeddingstext_num_attention_headstext_num_hidden_layerstext_type_vocab_size	tokenizervision_head_widthc                    s   t    t|ttfr|d | }t|||||d| _n|| }t||||||d| _t||
|||||	||||dd| _	t
| j	| _tt|
|| _ttg td | _|| _|   d S )Nrj   )rs   rD   rt   rk   ru   )rk   r   ru   rs   rt   rD   g-q=)Zvocab_size_or_config_json_filehidden_sizeZnum_hidden_layersZnum_attention_headsZintermediate_sizeZ
hidden_actZhidden_dropout_probZattention_probs_dropout_probZmax_position_embeddingsZtype_vocab_sizeZinitializer_rangeZlayer_norm_eps$I$I,@)r   r   
isinstancetuplelistre   visualr   r   bert_configr   bertr   rF   r=   emptytext_projectiononesnploglogit_scaler   initialize_parameters)r.   rB   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Zvision_headsr0   r2   r3   r   K  sP   


zCLIP.__init__c                 C   s  t tg td | _t| jt	rw| jj
d urR| jj
jjd }t jj| jj
jj|d t jj| jj
jj|d t jj| jj
jj|d t jj| jj
jj|d | jj| jj| jj| jjfD ]}| D ]\}}|drut j| qfq`| jd urt jj| j| jjd d d S d S )Nr   r   )stdz
bn3.weight)r   rF   r=   r   r   r   r   r   r   re   rr   rM   Zin_featuresinitZnormal_rK   rb   rJ   rL   rn   ro   rp   rq   Znamed_parametersendswithZzeros_r   r   r   )r.   r   Zresnet_blocknameparamr2   r2   r3   r     s*   


zCLIP.initialize_parametersc                 C   s   | j jjjS r5   )r   r    rb   r`   rz   r2   r2   r3   r`     s   z
CLIP.dtypec                 C   s   |  || jS r5   )r   r|   r`   )r.   imager2   r2   r3   encode_image  r   zCLIP.encode_imagec                 C   sT   | j jd }||| j}| j||dd | j}|d d dd d f | j S )Nz[PAD])Zattention_maskr   )r   Zvocabner|   r`   r   r   )r.   textZ	pad_indexr   r4   r2   r2   r3   encode_text  s   zCLIP.encode_textc                 C   s   |d us|d usJ d|d u r|  |S |d u r| |S | |}|  |}||jddd }||jddd }||| j fS )Nz#text and image cannot both be None!rT   TrN   )r   r   normr   exp)r.   r   r   image_featurestext_featuresr2   r2   r3   r8     s   



zCLIP.forwardc                 C   sb   |  |}| |}||jddd }||jddd }| j }|| |  }| }||fS )Nr   TrN   )r   r   r   r   r   t)r.   r   r   r   r   r   Zlogits_per_imageZlogits_per_textr2   r2   r3   get_similarity  s   


zCLIP.get_similarity)rg   )r:   r;   r<   rd   r   r   floatstrr
   r   r   propertyr`   r   r   r8   r   r?   r2   r2   r0   r3   r   I  s\    	
D
r   c                 C   s4   |   D ]}|j |_|jr|jj |j_qd S r5   )
parametersdatar   Zgrad)modelpr2   r2   r3   convert_models_to_fp32  s   r   r   c                 C   s   dd }|  | dS )z+Convert applicable model parameters to fp16c                 S   s   t | tjtjtjfr | jj | j_| jd ur | jj | j_t | tj	rGg dd dD dddD ]}t
| |}|d urF|j |_q5t | trR| tj dD ]}t| |rjt
| |}|d urj|j |_qTd S )Nc                 S   s   g | ]}| d qS )Z_proj_weightr2   )r   sr2   r2   r3   r     s    zEconvert_weights.<locals>._convert_weights_to_fp16.<locals>.<listcomp>)inqkvrU   rV   rW   )r   r   )r   r   ZConv1dr   rI   rb   r   Zhalfr   r   getattrr   r_   r=   hasattr)moduleattrZtensorr   r2   r2   r3   _convert_weights_to_fp16  s6   




z1convert_weights.<locals>._convert_weights_to_fp16N)apply)r   r   r2   r2   r3   convert_weights  s   r   )module_namec                       sl   e Zd Z fddZdeeef deeef fddZdeeef deeef fdd	Ze	d
d Z
  ZS )r   c              	      sL  t  j|d|i| d|}td|  tj|s J d|}td|  tj|s5J t|ddd6}t|ddd}t	
|| _t	
| D ]	\}}	|	| j|< qRW d    n1 sfw   Y  W d    n1 suw   Y  | d	tj }
t|
d
| _tdi | jd| ji| _t| j t
| d	tj d}d|v r|d n|}tt| d drdd | D }tt| d drdd | D }| j| | j  tj rdttjddnd| _ tj r| j!| j  tdttjdd d S | j"  td d S )N	model_dirz{}/vision_model_config.jsonz!Loading vision model config from z{}/text_model_config.jsonzLoading text model config from rzutf-8)encoding/)
vocab_filer   cpuZ
state_dictr   r   c                 S   "   i | ]\}}|t d d |qS )zmodule.Nlenr   r   r   r2   r2   r3   
<dictcomp>#     " z7CLIPForMultiModalEmbedding.__init__.<locals>.<dictcomp>
clip_modelc                 S   r   )zclip_model.Nr   r   r2   r2   r3   r   &  r   zcuda:{}Z
LOCAL_RANKz%Use GPU {} for finetuning & inferencez"Use CPU for finetuning & inferencer2   )#r   r   formatloggerinfoospathexistsopenjsonloadZ
model_infoitemsr   Z
VOCAB_FILEr
   r   r   r   r   r=   ZTORCH_MODEL_BIN_FILEnextiter
startswithZload_state_dictevalcudaZis_availablerd   environgetr   r_   r   )r.   r   argskwargsZvision_model_config_fileZtext_model_config_fileZfvftr   r   r   
checkpointsdr0   r2   r3   r      st   
 



z#CLIPForMultiModalEmbedding.__init__inputreturnc           	      C   s  ddl m} |jd |jd i}|dtj}d|v rht|d tj	rh|d 
| j}| dkr;|jd dkr;|d}tj|tjk | j|}||jddd	 }W d    n1 s^w   Y  |||j< d
|v rt|d
 tj	r|d
 
| j}| dkr|jd dkr|d}tj|tjk | j|}||jddd	 }W d    n1 sw   Y  |||j< |tjkr| jjd   |d< |S )Nr   )
OutputKeysmodeZimg   r   rT   TrN   r   r         ?r   )Zmodelscope.outputsr	  ZIMG_EMBEDDINGZTEXT_EMBEDDINGr  r   Z	INFERENCEr   r=   r>   r_   r   rO   r[   ZsqueezeZautogradZset_grad_enabledZTRAINr   r   r   r   r   r   r^   )	r.   r  r	  outputr
  Zimage_tensorr   Ztext_tensorr   r2   r2   r3   r8   5  sD   






z"CLIPForMultiModalEmbedding.forwardinputsc                 C   s   |S r5   r2   )r.   r  r2   r2   r3   postprocess[  s   z&CLIPForMultiModalEmbedding.postprocessc                 C   s   d| j j  S )Nr  )r   r   r   rz   r2   r2   r3   temperature^  s   z&CLIPForMultiModalEmbedding.temperature)r:   r;   r<   r   r   r   r   r8   r  r   r  r?   r2   r2   r0   r3   r     s    "5"&)5r   collectionsr   typingr   r   r   r   r   numpyr   r=   Ztorch.nnr   Ztorch.nn.functionalZ
functionalra   Zmodelscope.metainfor   Zmodelscope.modelsr   Zmodelscope.models.builderr	   Z1modelscope.models.multi_modal.clip.bert_tokenizerr
   Z5modelscope.models.multi_modal.clip.configuration_bertr   Z0modelscope.models.multi_modal.clip.modeling_bertr   Zmodelscope.utils.constantr   r   r   Zmodelscope.utils.loggerr   r   __all__Moduler   r@   re   r~   r   r   r   r   r   r   r   Zregister_moduleZmulti_modal_embeddingZclipr   r2   r2   r2   r3   <module>   s@   3/G	8 