o
    *jc                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ eje
jejdG dd deZdS )    N)Models)
TorchModel)MODELS)	ModelFileTasks   )SwinTransformer)DeformableTransformer)FPNFusionModule)Detector)module_namec                       s.   e Zd ZdZdef fddZdd Z  ZS )	VidtModelaA  
        The implementation of 'ViDT for joint-learning of object detection and instance segmentation'.
        This model is dynamically initialized with the following parts:
            - 'backbone': pre-trained backbone model with parameters.
            - 'head': detection and segentation head with fine-tuning.
    	model_dirc           	         s   t t|   tj|tj}tj	|ddd}t
ddgdg dg ddd	d
}|jddddgd || _| jj|d dd t|jdd}tddddddddddd
}t||dddd|ddddddd}|| _| jj|d dd dS )z Initialize a Vidt Model.
        Args:
          model_dir: model id or path, where model_dir/pytorch_model.pt contains:
                    - 'backbone_weights': parameters of backbone.
                    - 'head_weights': parameters of head.
        cpuT)Zmap_locationZweights_only   `   )   r      r   )   r            g?)Zpretrain_img_sizeZ	embed_dimZdepthsZ	num_headsZwindow_sizeZdrop_path_ratevidti,     r   )methodZdet_token_numZpos_dimZcross_indicesZbackbone_weights)strict)Zfuse_dim   r   i   g?Zrelu   F)
Zd_modelZnheadZnum_decoder_layersZdim_feedforwardZdropoutZ
activationZreturn_intermediate_decZnum_feature_levelsZdec_n_pointstoken_labelr   N)Znum_classesZnum_queriesZaux_lossZwith_box_refineepffZwith_vectorZprocessor_dctZ	iou_awarer   Zvector_hidden_dimZdistilZhead_weights)superr   __init__ospathjoinr   ZTORCH_MODEL_FILEtorchloadr   Zfinetune_detbackboneZload_state_dictr
   Znum_channelsr	   r   head)	selfr   kwargsZ
model_pathZ
model_dictr'   r   Zdeform_transformersr(   	__class__ `/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/cv/vidt/model.pyr!      sd   
zVidtModel.__init__c              	   C   s:   |  ||\}}}}}}| |||||||\}	}
|	|
fS )z Dynamic forward function of VidtModel.
        Args:
            x: input images (B, 3, H, W)
            mask: input padding masks (B, H, W)
        )r'   r(   )r)   xmaskZ
features_0Z
features_1Z
features_2Z
features_3Zdet_tgtZdet_posZout_pred_logitsZout_pred_boxesr-   r-   r.   forwardX   s   zVidtModel.forward)__name__
__module____qualname____doc__strr!   r1   __classcell__r-   r-   r+   r.   r      s    >r   )r"   r%   Zmodelscope.metainfor   Z'modelscope.models.base.base_torch_modelr   Zmodelscope.models.builderr   Zmodelscope.utils.constantr   r   r'   r   Zdeformable_transformerr	   Z
fpn_fusionr
   r(   r   Zregister_moduleZimage_object_detectionr   r   r-   r-   r-   r.   <module>   s   