o
    *j4                     @   s  d dl Z d dlmZ d dlZd dlmZ d dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZ ddlmZ dd	lmZmZ ejejejd
G dd deZG dd deZG dd deZG dd deZ G dd deZ!G dd deZ"dS )    N)Models)
TorchModel)MODELS)Config)	ModelFileTasks   )	load_clip)get_state_dictset_seed)module_namec                       sB   e Zd ZdZdef fddZdddZdd	 Zdd
dZ  Z	S )VoPa  
        The implementation of 'VoP: Text-Video Co-operative Prompt Tuning for Cross-Modal Retrieval'.
        This model is dynamically initialized with the following parts:
            - clip: the upstream pre-trained backbone model (CLIP in this code)
            - pool_frames: the frames pooling method
            - visual_prompt_learner: visual prompt
            - ImageEncoder: get image encoder
            - TextPromptLearner: text prompt
            - TextEncoder: get text encoder
    	model_dirc                    s   t t|   t|d}t|d}t|tj}t|j	| _
t|d| _tt| jjjj| j
_tt| jjj| j
_t| j
j| j
| _t| j| j
| _t| j| j
| _t| j| j
| _t| j| j
| _| t | | !  dt"j#d< t$| j
j% dS )zl
            Initialize a VoP Model

            Args:
                model_dir: model id or path,
        zVoP_msrvtt9k.pthzViT-B-32.pt)namefalseZTOKENIZERS_PARALLELISMN)&superr   __init__ospjoinr   ZCONFIGURATIONr   	from_fileZ
hyperparamconfigr	   Zcliplistrangevisualtransformerlayers
vpt_layers
tpt_layersBaselinePoolingpooling_typepool_framesVisualPromptLearnervisual_prompt_learnerImageEncoderimage_encoderTextPromptLearnertext_prompt_learnerTextEncodertext_encoderZload_state_dictr
   evalosenvironr   seed)selfr   argskwargsZ
model_pathZ	clip_archZconfig_path	__class__ i/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/cv/vop_retrieval/model.pyr   !   s.   
zVoP.__init__Fc                 C   sv   |j d }|dd| jj| jj}|  }| ||}||jddd }||| jjd}| d|}|r9||fS |S )z
            Get video Features

            Args:
                videos: the dim is [1, 12, 3, 224, 224]
                return_all_frames: default False
        r      TdimZkeepdimN)	shapereshaper   	input_resr"   r$   norm
num_framesr    )r-   Zvideosreturn_all_frames
batch_size
video_datavisual_promptsvideo_featuresvideo_features_pooledr2   r2   r3   get_video_featuresF   s    
zVoP.get_video_featuresc                 C   s*   |   }| ||}||jddd }|S )zh
            Get Text Features

            Args:
                text_data: the dim is [1, 69]
        r4   Tr6   )r&   r(   r;   )r-   	text_datatext_promptstext_featuresr2   r2   r3   get_text_featuresa   s   zVoP.get_text_featuresc                 C   s   |d j d }|d }|d }|dd| jj| jj}|  }| ||}|  }| ||}	|	|	jddd }	||jddd }||| jj	d}| 
|	|}
|rW|	||
fS |	|
fS )z
            Dynamic Forward Function of VoP

            Args:
                data: the input data
                return_all_frames: default False
        Zvideor   textr4   r5   Tr6   )r8   r9   r   r:   r"   r$   r&   r(   r;   r<   r    )r-   datar=   r>   rD   r?   r@   rA   rE   rF   rB   r2   r2   r3   forwardo   s.   
zVoP.forward)F)
__name__
__module____qualname____doc__strr   rC   rG   rJ   __classcell__r2   r2   r0   r3   r      s    
%r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )r   z(
        Redefined Pooling Function
    c                    s&   t t|   |dkr| j| _d S t)NZavg)r   r   r   _avg_poolingpooling_funcNotImplementedError)r-   r   r   r0   r2   r3   r      s   zBaselinePooling.__init__c                 C   s   |j dd}|S )a$  
            Pooling mean of frames

            Args:
                text_embeds: the input text embedding which is None here.
                video_embeds: the input video embedding with [1, 12, 512].

            Returns:
                video_embeds_pooled: num_vids x embed_dim
        r   r7   )mean)r-   text_embedsvideo_embedsZvideo_embeds_pooledr2   r2   r3   rQ      s   zBaselinePooling._avg_poolingc                 C   s   |  ||S N)rR   )r-   rV   rW   r2   r2   r3   rJ      s   zBaselinePooling.forward)rK   rL   rM   rN   r   rQ   rJ   rP   r2   r2   r0   r3   r      s
    r   c                       (   e Zd ZdZ fddZdd Z  ZS )r!   a  
        The implementation of visual prompt.
        This module is used to define the learnable prompt parameters:
            the number of tokens is 8,
            the prompt dimension is 768,
            and the initialization weight std used is 0.02.
    c                    sd   t t|   |j}|jjjjd }|j}t	j
t|jd|||d}tjj|dd t|| _d S )Nr   r   dtype{Gz?Zstd)r   r!   r   vp_token_numr   ln_postweightr8   r[   torchemptylenr   nninitnormal_	Parameterr@   )r-   
clip_modelr   r^   Zvp_dimr[   r@   r0   r2   r3   r      s   zVisualPromptLearner.__init__c                 C   s
   | j }|S rX   )r@   )r-   Zvpr2   r2   r3   rJ      s   zVisualPromptLearner.forwardrK   rL   rM   rN   r   rJ   rP   r2   r2   r0   r3   r!      s    r!   c                       rY   )r%   a  
        The implementation of visual prompt.
        This module is used to define the learnable prompt parameters:
            the number of tokens is 4,
            the prompt dimension is 512,
            and the initialization weight std used is 0.02.
    c                    s   t t|   |j}|j}|dkr|dksJ |jjjd }|j}t	j
t|j|| ||d}tjj|dd t|| _|| _|| _d S )Nr   rZ   r\   r]   )r   r%   r   tp_prefix_token_numtp_suffix_token_numln_finalr`   r8   r[   ra   rb   rc   r   rd   re   rf   rg   rE   )r-   rh   r   rj   rk   Ztp_dimr[   rE   r0   r2   r3   r      s    
zTextPromptLearner.__init__c                 C   s<   | j d d d | jd d f | j d d | jd d d f fS rX   )rE   rj   )r-   r2   r2   r3   rJ      s   zTextPromptLearner.forwardri   r2   r2   r0   r3   r%      s    r%   c                       rY   )r#   z
        The implementation of image encoder.
        This module is used to obtain the features of each frame of the video.
    c                    sv   t t|   || _|j| _|j| _|j| _|jj| _|jj	| _	|jj
| _
|jj| _|jj| _|jj| _|jj| _d S rX   )r   r#   r   r   r   r^   r<   r   conv1class_embeddingpositional_embeddingln_prer   r_   projr-   rh   r   r0   r2   r3   r      s   





zImageEncoder.__init__c           	   	   C   s  |j d }| |}|||j d d}|ddd}| j|j}tj|d|j d |j|j	d}|| }tj
||gdd}|| j|j }t| jjD ]}|| jv r| j|}||ddddddf |dd}tj
|ddddddf ||ddddddf gdd}|dkr| |}|ddd}| jj| |}|ddd}|d | jv rtj
|ddddddf |ddd| j dddf gdd}qM| |dddddf }| jdur|| j }|S )a   
            The forward function of image encoder.

            Args:
                visual_prompts: the visual prompt, dim is [12, 1, 8, 768]
                x: the input data, dim is [12, 3, 224, 224]

            Returns:
                x: the output data, dim is [12, 512]
        r   r   r4      )r[   devicerT   N)r8   rm   r9   permutern   tor[   ra   Zzerosrt   catro   r   r   r   r   indexrepeatrp   	resblocksr^   r_   rq   )	r-   r@   xr>   Zx_1Zx_2i_layeri_promptZcur_layer_vpr2   r2   r3   rJ     s@   


@
<

zImageEncoder.forwardri   r2   r2   r0   r3   r#      s    r#   c                       rY   )r'   z
        The implementation of text encoder.
        This module is used to obtain the features of each word of the sentence.
    c                    sv   t t|   |j| _|j| _|j| _|j| _|j| _|j| _|j	| _	d| j	v s*J |j
| _
|j| _|j
|j | _d S )Nr   )r   r'   r   r   token_embeddingro   rl   text_projectionr[   r   rj   rk   tp_token_numrr   r0   r2   r3   r   9  s   zTextEncoder.__init__c              	   C   s8  |  || j}|jd }|\}}t| jjD ]}|| jv r| j|}| j	dkr^|||d ddddf 
|dd}	tj|ddddddf |	|ddddddf gdd}| jdkr|||d ddddf 
|dd}
tj|ddddddf |
|ddddddf gdd}|dkr|| j| j }|ddd}| jj| |}|ddd}|d | jv r|ddddddf }|ddd| j	 d| j ddf }|ddddddf }tj|||gdd}|}q| || j}|t|jd |jdd| j f | j }|S )a  
            The forward function of text encoder.

            Args:
                text_prompts: the text prompt, dim is 2 x [12, 4, 512]
                text: the input data, dim is [1, 69]

            Returns:
                x: the output data, dim is [1, 512]
        r   r   Nr4   rT   rs   )r~   typer[   r8   r   r   r   r   rx   rj   expandra   rw   rk   ro   ru   rz   rl   ZarangeZargmaxr   r   )r-   rE   rH   r{   r>   Zprompt_prefixZprompt_suffixr|   r}   Zcur_layer_tp_prefixZcur_layer_tp_suffixZtemp_1Ztemp_2Ztemp_3tempr2   r2   r3   rJ   H  sp   


6
4zTextEncoder.forwardri   r2   r2   r0   r3   r'   3  s    r'   )#r*   Zos.pathpathr   ra   Ztorch.nnrd   Ztorch.nn.functionalZ
functionalFZmodelscope.metainfor   Z'modelscope.models.base.base_torch_modelr   Zmodelscope.models.builderr   Zmodelscope.utils.configr   Zmodelscope.utils.constantr   r   Zbackboner	   Zbasic_utilsr
   r   Zregister_moduleZvop_retrievalZvop_retrieval_modelr   r   r!   r%   r#   r'   r2   r2   r2   r3   <module>   s*   }"G