o
    *Îjn&  ã                   @   sÞ   d dl Z d dlZd dlmZ d dlm  mZ G dd„ dejƒZG dd„ dejƒZ	G dd„ dejƒZ
G dd	„ d	ejƒZG d
d„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZddd„ZdS )é    Nc                       ó(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )ÚQ2VRankerStage1zÂ
        Used to calculate the qv_ctx_score with query embedding and multi anchor context embeddings as input.
        The qv_ctx_score is used to pre-rank and retain top-k related anchors.
    c                    ó"   t ƒ  ¡  t ||¡| _|| _d S ©N©ÚsuperÚ__init__ÚnnÚLinearÚfcÚnscales©Úselfr   Ú
hidden_dim©Ú	__class__© úl/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/soonet/blocks.pyr      ó   

zQ2VRankerStage1.__init__c              
   C   sX   |   |¡}tƒ }t| jƒD ]}t dtj|| dddtj|ddd¡}| |¡ q|S ©Nz
bld,bd->blé   ©ÚpÚdimé   )	r   ÚlistÚranger   ÚtorchÚeinsumÚFÚ	normalizeÚappend)r   Ú	ctx_featsÚqfeatÚqv_ctx_scoresÚiÚscorer   r   r   Úforward   s   
þzQ2VRankerStage1.forward©Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r'   Ú__classcell__r   r   r   r   r   	   s    r   c                       r   )ÚV2QRankerStage1zt
        Used to calculate the vq_ctx_score with anchor context embeddings and multi query embeddings as input.
    c                    r   r   r   r   r   r   r   r   %   r   zV2QRankerStage1.__init__c              
   C   sT   t ƒ }t| jƒD ]}t dtj|  || ¡dddtj|ddd¡}| |¡ q|S r   )	r   r   r   r   r   r   r    r   r!   )r   r"   r#   Zvq_ctx_scoresr%   r&   r   r   r   r'   *   s   þzV2QRankerStage1.forwardr(   r   r   r   r   r.       ó    r.   c                       ó*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )ÚQ2VRankerStage2z¡
        Used to calculate the qv_ctn_score with query embedding and video sequence embedding as input.
        The qv_ctn_score is used to re-rank anchors.
    é
   c                    s0   t ƒ  ¡  || _|| _t ||¡| _tƒ | _d S r   )	r   r   r   Úsnippet_lengthr	   r
   ÚqfcÚV2VAttentionÚencoder)r   r   r   r3   r   r   r   r   ;   s
   
zQ2VRankerStage2.__init__c              
   C   s.  |   |¡}tƒ }tƒ }| ¡ \}}}	tƒ }
t| jƒD ]w}| jd|  }|| ||  d¡ks0J ‚t || d|| ¡}| || ||	¡ 	¡ }t |d|| ¡}|  
|tj| ¡ d d… |jd¡}|
 |¡ t dtj| d¡dddtj|ddd¡}tj|dd\}}| |¡ | || ¡ q|||
fS )	Nr   r   r   )Údeviceúbkld,bd->bklé   r   ©r   )r4   r   Úsizer   r   r3   r   Zindex_selectÚviewÚdetachr6   Zonesr7   r!   r   r   r    Ú	unsqueezeÚmax)r   Zvfeatsr#   Zhit_indicesr$   Zqv_ctn_scoresZqv_merge_scoresÚ_ÚLÚDÚ	ctn_featsr%   Zanchor_lengthZqv_ctx_scoreZctn_featZqv_ctn_scorer   r   r   r'   B   s<   
ÿÿÿþ
þ

zQ2VRankerStage2.forward)r2   r(   r   r   r   r   r1   5   s    r1   c                       r   )ÚV2QRankerStage2zt
        Used to calculate the vq_ctn_score with anchor content embeddings and multi query embeddings as input.
    c                    r   r   r   r   r   r   r   r   g   r   zV2QRankerStage2.__init__c              
   C   sh   t ƒ }t| jƒD ])}t dtj|  || ¡ d¡dddtj|ddd¡}tj	|dd}| 
|¡ q|S )Nr8   r   r   r9   r   r   r:   )r   r   r   r   r   r   r    r   r>   Úmeanr!   )r   rC   r#   Zvq_ctn_scoresr%   r&   r   r   r   r'   l   s   ýzV2QRankerStage2.forwardr(   r   r   r   r   rD   b   r/   rD   c                       r   )r5   z`
        Self-attention encoder for anchor frame sequence to encode intra-anchor knowledge.
    c                    s:   t ƒ  ¡  tdddd| _tdddd| _t d¡| _d S )Ni  i   ç        )Úmax_lenr   Údropouté   gš™™™™™¹?)r   Ún_headsrH   )	r   r   ÚPositionEncodingÚposembÚMultiHeadAttentionr6   r	   ÚDropoutrH   )r   r   r   r   r   ~   s   
zV2VAttention.__init__c                 C   sX   t  d||¡ d¡}|}||  |¡ }| j||||d}|  || ¡| d¡ ¡  }|S )Nz
bm,bn->bmnr   )ÚqueryÚkeyÚvalueÚmaskr   )r   r   r>   rL   r6   rH   Úfloat)r   Zvideo_featsZvideo_masksrR   ZresidualÚoutr   r   r   r'   „   s    ÿÿÿÿÿzV2VAttention.forwardr(   r   r   r   r   r5   y   ó    r5   c                       r0   )ÚBboxRegressorzK
        Predict the offset of bounding box for each candidate anchor.
    Fc                    sš   t ƒ  ¡  t ||¡| _t ||¡| _|r6t ||¡| _t|ƒ| _t 	t d| |¡t 
¡ t |d¡¡| _nt 	t ||¡t 
¡ t |d¡¡| _|| _d S )Nr   )r   r   r	   r
   Úfc_ctxÚfc_qÚfc_ctnÚSelfAttentionÚattnZ
SequentialÚReLUÚ	predictorÚenable_stage2)r   r   r^   r   r   r   r   •   s   


þ
þ
zBboxRegressor.__init__c           	      C   sÒ   |   |¡}tj|dd}t |  |¡¡t | d¡¡ }| jr`|r`tƒ }t	t
|ƒƒD ]$}t |  || ¡ d¡¡t | d¡ d¡¡ }|  |¡}| |¡ q*tj|dd}tj||gdd}n|}|  |¡}|S )Nr   r:   r   éÿÿÿÿ)rX   r   Úcatr   ÚrelurW   r>   r^   r   r   ÚlenrY   r[   r!   r]   )	r   r"   rC   r#   Zctx_fuse_featsZctn_fuse_featsr%   rT   Z
fuse_featsr   r   r   r'   ¦   s$   
ÿ
ÿ

zBboxRegressor.forward)Fr(   r   r   r   r   rV      s    rV   c                       r   )rZ   z?
        Obtain pooled features by self-attentive pooling.
    c                    s<   t ƒ  ¡  t ||d ¡| _t ¡ | _t |d d¡| _d S )Nr   r   )r   r   r	   r
   Úfc1r\   ra   Úfc2)r   r   r   r   r   r   Â   s   

zSelfAttention.__init__c                 C   sF   |   |  |  |¡¡¡ d¡}tj|dd d¡}tj|| dd}|S )Nr9   r   r:   )	rd   ra   rc   Zsqueezer   Úsoftmaxr>   r   Úsum)r   ÚxÚattrT   r   r   r   r'   È   s   zSelfAttention.forwardr(   r   r   r   r   rZ   ½   rU   rZ   c                       r0   )rK   a!  
        An implementation of trainable positional embedding which is added to
        sequence features to inject time/position information.

        Args:
            max_len: The max number of trainable positional embeddings.
            dim: the dimension of positional embedding.
    rF   c                    s6   t t| ƒ ¡  t ||¡| _t ¡ | _t |¡| _	d S r   )
r   rK   r   r	   Z	EmbeddingÚembedr\   ra   rN   rH   )r   rG   r   rH   r   r   r   r   Ù   s   
zPositionEncoding.__init__c                 C   sR   |j d d… \}}tj|tj|jd}| d¡ |d¡}|  |  |  	|¡¡¡}|S )Nr   )Zdtyper7   r   r   )
Úshaper   ZarangeÚlongr7   r>   ÚrepeatrH   ra   ri   )r   rg   Z
batch_sizeZseq_lenZpos_idsZpos_embr   r   r   r'   à   s
   zPositionEncoding.forward©rF   r(   r   r   r   r   rK   Ï   s    	rK   c                       s2   e Zd ZdZd	‡ fdd„	Zdd„ Zdd„ Z‡  ZS )
rM   a  
        An implementation of multi-head attention module, as described in
        'Attention Is All You Need <https://arxiv.org/abs/1706.03762>'

        Args:
            dim: the dimension of features of hidden layers.
            n_heads: the number of head.
    rF   c                    sl   t t| ƒ ¡  || _|| _|| | _t ||¡| _t ||¡| _	t ||¡| _
t |¡| _tjdd| _d S )Nr_   r:   )r   rM   r   r   rJ   Úhead_dimr	   r
   Úto_qÚto_kÚto_vrN   rH   ZSoftmaxre   )r   r   rJ   rH   r   r   r   r   ó   s   
zMultiHeadAttention.__init__c                 C   s6   |  ¡ d d… | j| jf }|j|Ž }| dddd¡S )Nr_   r   r   r   r9   )r;   rJ   rn   r<   Úpermute)r   rg   Znew_x_shaper   r   r   Útranspose_for_scores  s   
z'MultiHeadAttention.transpose_for_scoresc                 C   sÄ   |   |¡}|  |¡}|  |¡}|  |¡}|  |¡}	|  |¡}
t ||	 dd¡¡}|t | j	¡ }t
||ƒ}|  |¡}|  |¡}t ||
¡}| dddd¡ ¡ }| ¡ d d… | jf }|j|Ž }|S )Nr_   éþÿÿÿr   r   r   r9   )ro   rp   rq   rs   r   ÚmatmulZ	transposeÚmathÚsqrtrn   Úmask_logitsre   rH   rr   Ú
contiguousr;   r   r<   )r   rO   rP   rQ   rR   ÚqÚkÚvZq_transZk_transZv_transrh   Zctx_vrj   r   r   r   r'     s$   





ÿ



zMultiHeadAttention.forwardrm   )r)   r*   r+   r,   r   rs   r'   r-   r   r   r   r   rM   é   s
    	rM   çêŒ 9Y>)Æc                 C   s   |  tj¡}| d| |  S )Ng      ð?)Útyper   Zfloat32)ZinputsrR   Z
mask_valuer   r   r   rx     s   rx   )r}   )rv   r   Ztorch.nnr	   Ztorch.nn.functionalZ
functionalr   ÚModuler   r.   r1   rD   r5   rV   rZ   rK   rM   rx   r   r   r   r   Ú<module>   s   --4