o
    *jj                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlZd dlZ	d dl
Z
d dlZd dlmZ d dlmZ dddZdd	 Zd
d Zdd Zdd ZdddZdddZdS )    N)OrderedDict)Image)
transforms   c              
   C   sd   t t j| tjdt | t ddgt t j| ddt  t j	ddddt ddgd}|S )	z
        The implementation of transforms functions.
        The default image resolution is 224.
        The normalize parameter follows the mainstream setting.
    )interpolation)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)g      ?g      ?)scaler   )Z
brightnessZ
saturationhue)Z	clip_testZ
clip_train)
r   ZComposeZResizer   ZBICUBICZ
CenterCropZ	NormalizeZRandomResizedCropZRandomHorizontalFlipZColorJitter)Z	input_resZ	tsfm_dict r	   o/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/cv/vop_retrieval/basic_utils.pyinit_transform_dict   s"   r   c                 C   sJ   t j| dd}|d j|d}|d j|d}|d }|d }||||gS )a7  
        Loading dataset from 'feature_path' as a retrieval docs.
        The default dataset is MSRVTT-9K.

        Args:
            feature_path: 'VoP_msrvtt9k_features.pkl'
            mydevice: device(type='cuda', index=0)

        Returns:
            [text_embeds, vid_embeds_pooled, vid_ids, texts]
    TZweights_onlytext_embeds)ZdeviceZ
vid_embedsvid_idstexts)torchloadto)Zfeature_pathZmydeviceZfeature_contentr   Zvid_embeds_pooledr   r   r	   r	   r
   	load_data,   s   r   c                 C   s8   t | d}t|W  d   S 1 sw   Y  dS )z
        Load json files.
    rN)openjsonr   )filenamefr	   r	   r
   	load_json@   s   $r   c                 C   sP   | dkr&t |  tj|  t j|  t|  dt jj_	dt jj_
dS dS )z
        Set random seed.
    r   TFN)r   Zmanual_seednprandomseedcudaZmanual_seed_allbackendsZcudnnZdeterministicZ	benchmark)r   r	   r	   r
   set_seedH   s   


r   c                 C   sF   t j| dd}|d }t }| D ]\}}|||dd< q|}|S )z0
        Load pre-train parameters for VoP.
    Tr   
state_dictzmodule. )r   r   r   itemsreplace)Zcheckpoint_path
checkpointr    Znew_state_dictkvr	   r	   r
   get_state_dictU   s   r'   randc                 C   s@  t ||}tjd||d dt}g }t|dd D ]\}}||||d  d f q|dkr:dd |D }	nd	d |D }	g }
|	D ]A}| tj	| | 
 \}}|sid
}t|D ]}| 
 \}}|rh nq\|sn dS t|tj}t|}|ddd}|
| qEt|
|k r|
|
d   t|
|k s|
|	fS )a  
        Get indexes of sampled frames.

        Args:
            cap: cv2.VideoCapture
            num_frames: int - number of frames to sample
            vlen: video length, int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 325
            sample: 'rand' | 'uniform' how to sample

        Returns:
            frames: torch.tensor of stacked sampled video frames
                    of dim (num_frames, C, H, W)
            frame_idxs: list(int) indices of where the frames where sampled
    r      )startstopnumNr(   c                 S   s$   g | ]}t t|d  |d qS )r   r)   )r   choicerange.0xr	   r	   r
   
<listcomp>z   s   $ z$get_valid_frames.<locals>.<listcomp>c                 S   s    g | ]}|d  |d  d qS )r   r)      r	   r0   r	   r	   r
   r3   |   s        )NNr4   )minr   ZlinspaceZastypeint	enumerateappendsetcv2ZCAP_PROP_POS_FRAMESreadr/   ZcvtColorZCOLOR_BGR2RGBr   Z
from_numpyZpermutelenclone)cap
num_framesvlensampleZacc_samplesZ	intervalsrangesidxZinterv
frame_idxsframesindexretframeZn_tries_r	   r	   r
   get_valid_framesd   sB   


rK   c                 C   s^   t | }| sJ | t|t j}t||||\}}t|	 d }|
  ||fS )a  
        Get indexes of sampled frames.

        Args:
            video_path: the local video path
            num_frames: Frame number, 12 frames for each video
            sample: 'rand' | 'uniform' how to sample

        Returns:
            frames: torch.tensor of stacked sampled video frames
                    of dim (num_frames, C, H, W)
            frame_idxs: list(int) indices of where the frames where sampled
       )r;   ZVideoCaptureZisOpenedr7   getZCAP_PROP_FRAME_COUNTrK   r   stackfloatrelease)Z
video_pathr@   rB   r?   rA   rF   rE   r	   r	   r
   load_frames_from_video   s   
rQ   )r   )r(   )ospickler   shutilzipfilecollectionsr   r;   numpyr   r   Zujsonr   ZPILr   Ztorchvisionr   r   r   r   r   r'   rK   rQ   r	   r	   r	   r
   <module>   s&   

2