o
    *j                     @   s   d dl Z d dlmZ d dlZd dlmZ d dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZ ddlmZ dd	lmZmZ ejejejd
G dd deZG dd deZdS )    N)Models)
TorchModel)MODELS)Config)	ModelFileTasks   )	load_clip)get_state_dictset_seed)module_namec                       sB   e Zd ZdZdef fddZdddZdd	 Zdd
dZ  Z	S )VideoTextRetrievalModelSeriesa  
        The implementation of 'VoP: Text-Video Co-operative Prompt Tuning for Cross-Modal Retrieval'.
        This model is dynamically initialized with the following parts:
            - clip: the upstream pre-trained backbone model (CLIP in this code).
                - The pretrain param (ViT-B/32) downloads from OpenAI:
                - "https://openaipublic.azureedge.net/clip/models/
                - 40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"
            - pool_frames: the frames pooling method
            - visual_prompt_learner: visual prompt
            - ImageEncoder: get image encoder
            - TextPromptLearner: text prompt
            - TextEncoder: get text encoder
    	model_dirc                    sv   t t|   t|d}t|d}t|tj}t|j	| _
t|d| _t| j
j| _| t| |   dS )zl
            Initialize a VoP Model

            Args:
                model_dir: model id or path,
        zVoPSE_msrvtt9k.pthzViT-B-32.pt)nameN)superr   __init__ospjoinr   ZCONFIGURATIONr   	from_fileZ
hyperparamconfigr	   clipBaselinePoolingpooling_typepool_framesZload_state_dictr
   eval)selfr   argskwargsZ
model_pathZ	clip_archZconfig_path	__class__ l/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/cv/vop_retrieval/model_se.pyr   $   s   z&VideoTextRetrievalModelSeries.__init__Fc                 C   sl   |j d }|dd| jj| jj}| j|}||jddd }||| jjd}| |}|r4||fS |S )z
            Get video Features

            Args:
                videos: the dim is [1, 12, 3, 224, 224]
                return_all_frames: default False
        r      TdimZkeepdim)	shapereshaper   	input_resr   encode_imagenorm
num_framesr   )r   Zvideosreturn_all_frames
batch_size
video_datavideo_featuresvideo_features_pooledr    r    r!   get_video_features;   s   

z0VideoTextRetrievalModelSeries.get_video_featuresc                 C   s"   | j |}||jddd }|S )zh
            Get Text Features

            Args:
                text_data: the dim is [1, 69]
        r"   Tr$   )r   encode_textr*   )r   	text_datatext_featuresr    r    r!   get_text_featuresU   s
   z/VideoTextRetrievalModelSeries.get_text_featuresc           	      C   s   |d j d }|d }|d }|dd| jj| jj}| j|}| j|}||jddd }||jddd }||| jjd}| 	|}|rN|||fS ||fS )z
            Dynamic Forward Function of VoP

            Args:
                data: the input data
                return_all_frames: default False
        Zvideor   textr"   r#   Tr$   )
r&   r'   r   r(   r   r2   r)   r*   r+   r   )	r   datar,   r-   r3   r.   r4   r/   r0   r    r    r!   forwardb   s*   

z%VideoTextRetrievalModelSeries.forward)F)
__name__
__module____qualname____doc__strr   r1   r5   r8   __classcell__r    r    r   r!   r      s    
r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )r   z(
        Redefined Pooling Function
    c                    s&   t t|   |dkr| j| _d S t)NZavg)r   r   r   _avg_poolingpooling_funcNotImplementedError)r   r   r   r    r!   r      s   zBaselinePooling.__init__c                 C   s   |j dd}|S )z
            Pooling mean of frames

            Args:
                video_embeds: the input video embedding with [1, 12, 512].

            Returns:
                video_embeds_pooled: num_vids x embed_dim
        r   )r%   )mean)r   video_embedsZvideo_embeds_pooledr    r    r!   r?      s   
zBaselinePooling._avg_poolingc                 C   s
   |  |S )N)r@   )r   rC   r    r    r!   r8      s   
zBaselinePooling.forward)r9   r:   r;   r<   r   r?   r8   r>   r    r    r   r!   r      s
    r   )osZos.pathpathr   ZtorchZtorch.nnnnZtorch.nn.functionalZ
functionalFZmodelscope.metainfor   Z'modelscope.models.base.base_torch_modelr   Zmodelscope.models.builderr   Zmodelscope.utils.configr   Zmodelscope.utils.constantr   r   Zbackboner	   Zbasic_utilsr
   r   Zregister_moduleZvop_retrievalZvop_retrieval_model_ser   r   r    r    r    r!   <module>   s"   m