o
    *j                     @   s2  d dl mZ d dlmZ d dlZd dlZd dlmZ d dl	m  m
Z d dlm  mZ d dlmZ zd dlmZ W n eyE   dZY nw dddgd	dgd
dggddgd	dgd
dggg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg dg d g d!g d"gg d#g d$d%Z	&	'dJd(d)Zd*d+ ZG d,d- d-ejZG d.d/ d/ejZdKd1ed2efd3d4ZG d5d6 d6ejZdLd8d9ZG d:d; d;ejZdMd<d=Z d>d? Z!d@dA Z"dBdC Z#G dDdE dEejZ$G dFdG dGejZ%G dHdI dIejZ&dS )N    )OrderedDict)partialN)trunc_normal_)checkpoint_wrapper      g       @      )r      r
   r
   )r
   r
   r
   r
   )r   r
   r   r   )   r
   r
   r
   )   r
   r
   r
   )r   r
   r   r   )   r
   r
   r
   )   r
   r
   r
   )   r
   r
   r
   )	   r
   r
   r
   )
   r
   r
   r
   )   r
   r
   r
   )   r
   r
   r
   )   r
   r
   r
   )   r
   r
   r
   )   r
   r
   r
   )   r
   r
   r
   )   r
   r
   r
   )   r
   r
   r
   )   r
   r
   r
   )   r
   r
   r
   )r	   r
   r   r   )   r
   r
   r
   )   r
   r
   r
   )r   r   r   )r
   r   r   )depthdim_mulhead_mulpool_q_stridepool_kvq_kernelpool_kv_stride_adaptiveTFc           	         s   ddg}|r|dg7 }|   }|  D ]Y\ }t fdd|D rl|  }|jd |jd krdtj|d|jd ddd	d|jd d
d}|d|jd dd}|rctd	 |j |j n|}|
 | < q|S )N	rel_pos_h	rel_pos_w	rel_pos_tc                    s   g | ]}| v qS  r'   .0xkr'   i/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/mplug/mvit.py
<listcomp>-       z-interpolate_rel_pos_embed.<locals>.<listcomp>r   r
   r   linearsizemodezInflate {}: {} -> {}: {})copyitemsanyshapeFinterpolatereshapepermuteprintformatclone)	Zstate_dict_originZstate_dict_modelZtemporalverboseZrel_pos_embed_typesZstate_dict_inflatedZv2dZv3dZrel_pos_resizedr'   r+   r-   interpolate_rel_pos_embed#   s.   
rA   c                    s  | d }t |d t |d }}tt| d D ]| d  d || d  d < qtt| d D ]| d  d || d  d < q6dd t|D }dd t|D }d	d t|D d
d t|D }tt| d D ] | d  dd  | d  d < | d || d  d < qu| d d ur| d  g | d< t| d D ]#t dkrÇ fddtt D  | d g   qtt| d D ] | d  dd  || d  d < | d || d  d < q|||||fS )Nr   r
   r   r   r    c                 S      g | ]}g qS r'   r'   r)   ir'   r'   r-   r.   G       z)_prepare_mvit_configs.<locals>.<listcomp>c                 S   rB   r'   r'   rC   r'   r'   r-   r.   H   rE   c                 S   rB   r'   r'   rC   r'   r'   r-   r.   I   rE   c                 S   rB   r'   r'   rC   r'   r'   r-   r.   J   rE   r!   r"   r#   Zpool_kv_stridec                    s&   g | ]}t  |  |  d qS r
   )max)r)   dZ
_stride_kvrD   stride_qr'   r-   r.   U   s    )torchZonesrangelenappend)cfgr   r   r    pool_qpool_kv	stride_kvr'   rI   r-   _prepare_mvit_configs?   s4   ""$
$rS   c                       s0   e Zd Zddejdf fdd	Zdd Z  ZS )MlpN        c                    sb   t    || _|p|}|p|}t||| _| | _t||| _| jdkr/t|| _	d S d S NrU   )
super__init__	drop_ratennLinearfc1actfc2Dropoutdrop)selfin_featureshidden_featuresout_features	act_layerrY   	__class__r'   r-   rX   d   s   

zMlp.__init__c                 C   sJ   |  |}| |}| jdkr| |}| |}| jdkr#| |}|S rV   )r\   r]   rY   r`   r^   ra   r*   r'   r'   r-   forwardv   s   






zMlp.forward)__name__
__module____qualname__rZ   GELUrX   ri   __classcell__r'   r'   rf   r-   rT   b   s    rT   c                       s$   e Zd Z fddZdd Z  ZS )Permutec                    s   t    || _d S N)rW   rX   dims)ra   rq   rf   r'   r-   rX      s   

zPermute.__init__c                 C   s   |j | j S rp   )r<   rq   rh   r'   r'   r-   ri      s   zPermute.forward)rj   rk   rl   rX   ri   rn   r'   r'   rf   r-   ro      s    ro   rU   	drop_probtrainingc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )z&
    Stochastic Depth per sample.
    rU   r
   r   rF   )dtypedevice)r8   ndimrK   Zrandrt   ru   Zfloor_div)r*   rr   rs   Z	keep_probr8   maskoutputr'   r'   r-   	drop_path   s   rz   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )DropPathzYDrop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).Nc                    s   t t|   || _d S rp   )rW   r{   rX   rr   )ra   rr   rf   r'   r-   rX      s   
zDropPath.__init__c                 C   s   t || j| jS rp   )rz   rr   rs   rh   r'   r'   r-   ri      s   zDropPath.forwardrp   rj   rk   rl   __doc__rX   ri   rn   r'   r'   rf   r-   r{      s    r{   r
   c                 C   s   |s| S | |9 } |p|}|r0t d|  t d|  d|  t dt| |d  | |   t|t| |d  | | }|d|  k rI||7 }t|S )Nz
min width zwidth z	 divisor zother r   g?)r=   intrG   )widthZ
multiplier	min_widthdivisorr@   Z	width_outr'   r'   r-   round_width   s   "r   c                       s6   e Zd ZdZ						d fdd		Zd
d Z  ZS )
PatchEmbedz
    PatchEmbed.
    r      r   r   r   r   r   r   Fc                    s4   t    |rtj}ntj}||||||d| _d S )N)Zkernel_sizestridepadding)rW   rX   rZ   ZConv2dConv3dproj)ra   dim_indim_outkernelr   r   Zconv2dZconv_functionrf   r'   r-   rX      s   
	zPatchEmbed.__init__c                 C   s"   |  |}|ddd|jfS )Nr   r
   )r   flatten	transposer8   rh   r'   r'   r-   ri      s   
zPatchEmbed.forward)r   r   r   r   r   Fr|   r'   r'   rf   r-   r      s    r   c                 C   sn  |d u r| |fS | j }|dkrn|dkr| d} ntd| j |rE| d d d d d dd d f | d d d d dd d d f }} | j\}}}	}
|\}}}| || ||||
ddddd } || } | jd | jd | jd g}| jd | jd  | jd  }| |||
|dd} |rtj	|| fdd} |d ur|| } |dkr	 | |fS | 
d} | |fS )Nr   r   r
   zUnsupported input dimension r   r   dim)rv   Z	unsqueezeNotImplementedErrorr8   r;   r<   
contiguousr   rK   catZsqueeze)Ztensorpool	thw_shapehas_cls_embednormZ
tensor_dimZcls_tokBNLCTHWZL_pooledr'   r'   r-   attention_pool   s:   
B
&
r   c                 C   s\   t |tr,| jd }||kr| S tj| d|dddd|dd}|d|ddS d S )Nr   r
   r0   r   r1   r2   )
isinstancer~   r8   r9   r:   r;   r<   )Zrel_posrH   Zori_dnew_pos_embedr'   r'   r-   get_rel_pos   s   

r   c                  C   s\  |rdnd}|\}	}
}|\}}}t dt|
| d }t dt|| d }t||
 d}t|
| d}t|
dddf | t|dddf |  }||d | 7 }t|| d}t|| d}t|dddf | t|dddf |  }||d | 7 }t||}t||}||  }||  }|j\}}}}|dddd|df |||	|
||}td||}td||}| dddd|d|df 	|d|	|
|||||ddddddddddddddf  |ddddddddddddddf  	|d|	|
 | || | | dddd|d|df< | S )	z<
    Decomposed Spatial Relative Positional Embeddings.
    r
   r   r         ?Nzbythwc,hkc->bythwkzbythwc,wkc->bythwkr0   )
r~   rG   rK   aranger   longr8   r;   Zeinsumview) attnqr,   r   q_shapek_shaper$   r%   sp_idxq_tq_hq_wk_tk_hk_wZdhZdwZ	q_h_ratioZ	k_h_ratioZdist_hZ	q_w_ratioZ	k_w_ratioZdist_wZRhZRwr   n_headq_Nr   r_qZrel_h_qZrel_w_qr'   r'   r-   cal_rel_pos_spatial  sR   



*2..
r   c              
   C   s  |rdnd}|\}}}	|\}
}}t dt||
 d }t||}t|
| d}t||
 d}t|dddf | t|
dddf |  }||
d | 7 }||  }|j\}}}}|dddd|df |||||	|}|dddddd||| | |	 |}t	||
dd
dd}|||||	||
dddddd}| dddd|d|df |d	|||	|
|||ddddddddddddddf  |d	|| |	 |
| | | dddd|d|df< | S )
z2
    Temporal Relative Positional Embeddings.
    r
   r   r   r   Nr   r   r   r0   )r~   rG   r   rK   r   r   r8   r;   r<   matmulr   r   )r   r   r   r   r   r&   r   r   r   r   r   r   r   dtZ	q_t_ratioZ	k_t_ratioZdist_tZRtr   r   r   r   r   relr'   r'   r-   cal_rel_pos_temporal=  s8   


*$$2.
r   c                       sH   e Zd Zdddddddejddddddddf fdd	Zd	d
 Z  ZS )MultiScaleAttentionr   FrU   r
   r
   r
   Tconvc              	      s  t    || _|| _|| _|| _|| _|| }|d | _|| _dd |D }dd |D }|s2|rNt	j
|||d| _t	j
|||d| _t	j
|||d| _nt	j
||d |d| _t	
||| _|dkrjt	|| _t|dkrzt|	dkrzd	}t|dkrt|
dkrd	}|| _|d
v r|dkrt	jnt	j}t|dkr|||	|ddnd | _t|dkr|||
|ddnd | _t|dkr|||
|ddnd | _n|dks|dkr\|r|dkr|| n|}n
|dkr|| n|}t|dkrt	j||||	||ddnd | _t|dkr||nd | _t|dkr&t	j||||
||ddnd | _t|dkr4||nd | _t|dkrJt	j||||
||ddnd | _t|dkrX||nd | _ntd| || _ || _!| j r|d |d ksxJ |d }t|	dkr||	d  n|}t|
dkr||
d  n|}dt"|| d }t	#t$%||| _&t	#t$%||| _'|st(| j&dd t(| j'dd | j!rt	#t$%d|d  d || _)|| _*d S )Ng      c                 S      g | ]}t |d  qS r   r~   )r)   r   r'   r'   r-   r.         z0MultiScaleAttention.__init__.<locals>.<listcomp>c                 S   r   r   r   )r)   kvr'   r'   r-   r.     r   )biasr   rU   r
   r'   )ZavgrG   rG   r   FZ	ceil_moder   conv_unshared)r   r   groupsr   zUnsupported model r   {Gz?Zstd)+rW   rX   
pool_firstseparate_qkvrY   	num_headsr   scaler   rZ   r[   r   r,   vqkvr   r_   	proj_dropnpprodr4   	MaxPool3dZ	AvgPool3drM   rP   pool_kpool_vr   norm_qnorm_knorm_vr   rel_pos_spatialrel_pos_temporalrG   	ParameterrK   zerosr$   r%   r   r&   residual_pooling)ra   r   r   
input_sizer   qkv_biasrY   kernel_q	kernel_kvrJ   rR   
norm_layerr   r4   r   r   r   rel_pos_zero_initr   r   Zhead_dimZ	padding_qZ
padding_kvZpool_opZdim_convr3   Zq_sizeZkv_sizeZ
rel_sp_dimrf   r'   r-   rX   g  s   








zMultiScaleAttention.__init__c              	   C   s  |j \}}}| jr)| jdkrd}n| j}||||ddddd}| } }}	ni| jdks0J | jsV| |||d| jdddddd}
|
d |
d |
d }}}	n<| } }}	| |||| jddddd}| 	|||| jddddd}| 
|	||| jddddd}	t|| j|| jt| dr| jnd d	\}}t|| j|| jt| d
r| jnd d	\}}t|	| j|| jt| dr| jnd d	\}	}| jr_| jrt|d nt|}| jrt|d nt|}| jrt|d nt|}|dddd||d}| |||| jddddd}|	dddd||d}	| 
|	||| jddddd}	|dddd||d}| 	|||| jddddd}|j d }|| j |dd }| jrt|||| j||| j| j}| jrt||| j||| j}|jdd}||	 }| j r| jr|d d d d dd d d f  |d d d d dd d d f 7  < n|| }|dd|d| j!}| "|}| j#dkr| $|}||fS )Nr   r
   r0   r   r   r   r   r   )r   r   r   r   r   rU   )%r8   r   r4   r   r;   r<   r   r   r   r,   r   r   rP   r   hasattrr   r   r   r   r   r   r   r   r   r   r   r$   r%   r   r   r&   Zsoftmaxr   r   r   rY   r   )ra   r*   r   r   r   _Zfold_dimr   r,   r   r   r   r   Zv_shaper   Zk_NZv_Nr   r'   r'   r-   ri     s   







F

zMultiScaleAttention.forward)rj   rk   rl   rZ   	LayerNormrX   ri   rn   r'   r'   rf   r-   r   e  s(     r   c                       sV   e Zd Zdddddejejdddddddddddddddf fdd		Zd
d Z  ZS )MultiScaleBlock      @FNrU   r   r   Tc           !         sz  t    || _|| _||| _|| _dd |D }|}dd |D }|r'|n|}|| _t||fi d|d|d|d|d|d	|d
|d|d|d|d|d|d|d|d|d|d|| _|	dkrot	|	nt
 | _||| _t|| }|| _|d ur|dkr|| } n|} t||| |
|d| _||krt
||| _t|dkrt
j|||dd| _d S d | _d S )Nc                 S   s    g | ]}|d kr|d  n|qS rF   r'   )r)   sr'   r'   r-   r.     s     z,MultiScaleBlock.__init__.<locals>.<listcomp>c                 S   r   r   r   )r)   skipr'   r'   r-   r.     r   r   r   r   rY   r   r   rJ   rR   r   r   r4   r   r   r   r   r   r   rU   r
   )rb   rc   rd   re   rY   r   Fr   )rW   rX   r   r   norm1dim_mul_in_attuse_grad_checkpointr   r   r{   rZ   Identityrz   norm2r~   r   rT   mlpr[   r   rM   r   	pool_skip)!ra   r   r   r   r   	mlp_ratior   Zqk_scalerY   rz   re   r   Zup_rater   r   rJ   rR   r4   r   r   r   r   r   r   r   r   r   Zkernel_skipZstride_skipZpadding_skipZatt_dimZmlp_hidden_dimZmlp_dim_outrf   r'   r-   rX   `  s   

	


zMultiScaleBlock.__init__c           	      C   s   |  |}| jrt| j||\}}n| ||\}}| jr)| j| jkr)| |}t|| j	|| j
d\}}|| | }| |}| jrLt| j|}n| |}| js_| j| jkr_| |}|| | }||fS )N)r   )r   r   
checkpointr   r   r   r   r   r   r   r   rz   r   r   )	ra   r*   r   Zx_normZx_blockZthw_shape_newZx_resr   Zx_mlpr'   r'   r-   ri     s(   






zMultiScaleBlock.forward)	rj   rk   rl   rZ   rm   r   rX   ri   rn   r'   r'   rf   r-   r   ^  s2    Wr   c                       s   e Zd ZdZddddddg dg d	g d
dddddddddddddddddddf fdd	Zdd Zejjdd Z	dd Z
dd Zdd Z  ZS )MViTv2a  
    Improved Multiscale Vision Transformers for Classification and Detection
    Yanghao Li*, Chao-Yuan Wu*, Haoqi Fan, Karttikeya Mangalam, Bo Xiong, Jitendra Malik,
        Christoph Feichtenhofer*
    https://arxiv.org/abs/2112.01526
    Multiscale Vision Transformers
    Haoqi Fan*, Bo Xiong*, Karttikeya Mangalam*, Yanghao Li*, Zhicheng Yan, Jitendra Malik,
        Christoph Feichtenhofer*
    https://arxiv.org/abs/2104.11227
       `   i  r   r
   r   )r   r   r   )r   r   r   )r
   r   r   NrU   r   Tr   Fc           -   	      s  t    d}|| _|| _|| _|| _|| _|| _|| _|| _	|| _
|| _|| _ttjdd}|r>tt|||||	d| _n
t|||||	d| _||d  ||d  ||d  g}t|} dd	 td||D }!| jr}ttdd|| _| d }"n| }"| jrttd|"|| _| jr| jrttd| jd | jd  || _ttd| jd || _| jrttdd|| _nttd|"|| _|
d usJ t|
\}#}$}%}&}'}(|})|r||nd | _ t! | _"t#|D ]}*t$||$|* }|rt$||#|* t$||$|* d
}+nt$||#|*d  t$||$|*d  d
}+t%d'i d|d|+d|d|)d|d|d| jd|!|* d|dt&|%|*krL|%|* ng dt&|&|*krZ|&|* ng dt&|'|*krh|'|* ng dt&|(|*krv|(|* ng d|d| jd|d|d|d|d|d|d |d!d"},|rt|,d"d#},| j"'|, t&|'|* dkrd$d	 t(|)|'|* D })|+}q||| _)t* | _+| jr| jrt,| jd%d& t,| jd%d& | jrt,| jd%d& nt,| jd%d& | jrt,| jd%d& | -| j. d S )(Nr   gư>)eps)r   r   r   r   r   r   r
   r   c                 S   s   g | ]}|  qS r'   )itemr(   r'   r'   r-   r.   '  r/   z#MViTv2.__init__.<locals>.<listcomp>)r   r   r   r   r   r   r   rY   rz   r   r   r   rJ   rR   r4   r   r   r   r   r   r   r   r   r   F)Zoffload_to_cpuc                 S   s   g | ]\}}|| qS r'   r'   )r)   r3   r   r'   r'   r-   r.   w  s    r   r   r'   )/rW   rX   img_sizenum_classes	embed_dimr   r   cls_embed_onuse_abs_poszero_decay_pos_clsr   sep_pos_embedrY   r   rZ   r   r   r   patch_embedr   r   rK   Zlinspacer   r   	cls_token	pos_embed
patch_dimspos_embed_spatialpos_embed_temporalpos_embed_classrS   	norm_stemZ
ModuleListblocksrL   r   r   rM   rN   zipr   r   headr   apply_init_weights)-ra   r   r   r   Z
num_framesr   r   Zpatch_kernelZpatch_strideZpatch_paddingconfigZdropout_rateZdrop_path_rater   r   r4   r   r   r   r   r   r   r   r   r   r   r  r   r   Zin_chansr   r  Znum_patchesZdprZpos_embed_dimr   r    rP   rQ   rJ   rR   r   rD   r   Zattention_blockrf   r'   r-   rX     s:  
	
	





	


zMViTv2.__init__c                 C   s   t |tjr(tjj|jdd t |tjr$|jd ur&tj|jd d S d S d S t |tjr@tj|jd tj|jd d S d S )Nr   r   r   r   )	r   rZ   r[   initr   weightr   Z	constant_r   )ra   mr'   r'   r-   r    s   zMViTv2._init_weightsc                 C   sl   g }| j r4| jr| jr|g d n|dg | jr#|g d | jr,|dg | jr4|d |S )N)r  r  r  r  )r$   r%   Z
rel_pos_hwr&   r  )r   r   r   extendrN   r   r   r   )ra   namesr'   r'   r-   no_weight_decay  s   
zMViTv2.no_weight_decayc                 C   s  |d |d |d }}}| j r(|d d ddd d f }|d d dd f }|jd }| j\}}	}
||	 |
 |ks=J ||	|
f|||fkr{tj|d d d d d d f d||	|
dddddd|||fd	d
}|dd|| | ddd}| j rtj||fdd}|S )Nr   r0   r   r
   r   r   r   Z	trilinearr2   r   )	r   r8   r  r9   r:   r;   r<   rK   r   )ra   r  bcthwthwZcls_pos_embedZtxy_numZp_tZp_hZp_wr   r'   r'   r-   _get_pos_embed  s0   
"

zMViTv2._get_pos_embedc                 C   sJ  | ddddd}| |\}}|d |d |d }}}|j\}}}| jr8| j|dd}	tj|	|fdd	}| jr|| j	rq| j
d| jd dtj| j| jd | jd  dd	 }
| jrft| j|
gd}
| |
|}
||
 }n| | j|}
||
 }| jr| |}| jr| |}|||g}| jD ]	}|||\}}q| |}|S )
Nr   r   r
   r   r   r  r   r0   r   )r<   r  r8   r   r  expandrK   r   r   r   r  repeatr  Zrepeat_interleaver  r  r  r  rY   Zpos_dropr  r	  r   )ra   r*   r  r   r   r   r   r   r   Z
cls_tokensr  ZthwZblkr'   r'   r-   forward_features  sF   





zMViTv2.forward_featuresc                 C   s   |  |}| |}|S rp   )r  r  rh   r'   r'   r-   ri     s   

zMViTv2.forward)rj   rk   rl   r}   rX   r  rK   Zjitignorer  r  r  ri   rn   r'   r'   rf   r-   r     sL     2	
(r   )TF)rU   F)r
   r
   F)TN)'collectionsr   	functoolsr   numpyr   rK   Ztorch.nnrZ   Ztorch.nn.functionalZ
functionalr9   Ztorch.utils.checkpointutilsr   Ztimm.models.layersr   Zfairscale.nn.checkpointr   ImportErrorZMViTv2_Base_configrA   rS   ModulerT   ro   floatboolrz   r{   r   r   r   r   r   r   r   r   r   r'   r'   r'   r-   <module>   sV   
#


#%0( zr