o
    *j*3                     @   sh   d dl Z d dlZd dlmZ d dlm  mZ G dd dejZG dd dejZ	G dd dejZ
dS )    Nc                       s@   e Zd Z					d fdd	Zdd Zd	d
 Zdd Z  ZS )SelfAttention   '     Nc                    s  t t|   ddg| _|dur$| }|| jv s$J dg | jR  || _|| _|| _|| _|| _	t
 t
 t
 | _| _| _t| jD ],}| jt
j||| dd | jt
j||| dd | jt
j||| dd qIt
j||dd| _t
jdd| _t
jd	d
| _dS )a   The basic (multi-head) Attention 'cell' containing the learnable parameters of Q, K and V

        :param int input_size: Feature input size of Q, K, V.
        :param int output_size: Feature -hidden- size of Q, K, V.
        :param int freq: The frequency of the sinusoidal positional encoding.
        :param int heads: Number of heads for the attention module.
        :param str | None pos_enc: The type of the positional encoding [supported: Absolute, Relative].
        absoluterelativeNzSupported encodings: F)in_featuresout_featuresZbiasdim      ?p)superr   __init__Zpermitted_encodingslower
input_sizeoutput_sizeheadspos_encfreqnn
ModuleListWkWqWvrangeappendLinearoutZSoftmaxsoftmaxDropoutdrop)selfr   r   r   r   r   _	__class__ q/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/cv/video_summarization/pgl_sum.pyr      sP   
 zSelfAttention.__init__c                 C   s   | j }| j}tjdd t|D | jjjd}tjdd t|d D | jjjd}||j	d d}|j
|j	d dd}||j	d d}tj||| jjjd}t||d| |   ||d| f< t||d| |   ||d| d f< |S )	a.  Calculate the sinusoidal positional encoding based on the absolute position of each considered frame.
        Based on 'Attention is all you need' paper (https://arxiv.org/abs/1706.03762)

        :param int T: Number of frames contained in Q, K and V
        :return: Tensor with shape [T, T]
        c                 S      g | ]}|qS r(   r(   .0kr(   r(   r)   
<listcomp>I       z5SelfAttention.getAbsolutePosition.<locals>.<listcomp>devicec                 S   r*   r(   r(   r+   r(   r(   r)   r.   K   r/      r   r   r   )r   r   torchtensorr   r    weightr1   reshapeshaperepeat_interleaverepeatzerossincos)r$   Tr   dposiAPr(   r(   r)   getAbsolutePosition?   s   &*z!SelfAttention.getAbsolutePositionc           
   	   C   s  | j }d| }|d  }tjdd t|D | jjjd}tjdd t|D | jjjd}||jd d}|j	|jd dd}|
|jd d}|| | }tj||| jjjd}tjd	d t|d D | jjjd}	t|d
d
d|	 f ||d
d
d|	 f |d
d
d|	 f  |   |d
d
d|	 f< t|d
d
d|	 d f ||d
d
d|	 d f |d
d
d|	 d f  |   |d
d
d|	 d f< |S )a+  Calculate the sinusoidal positional encoding based on the relative position of each considered frame.
        r_pos calculations as here: https://theaisummer.com/positional-embeddings/

        :param int T: Number of frames contained in Q, K and V
        :return: Tensor with shape [T, T]
        r2   r   c                 S   r*   r(   r(   r+   r(   r(   r)   r.   c   r/   z5SelfAttention.getRelativePosition.<locals>.<listcomp>r0   c                 S   r*   r(   r(   r+   r(   r(   r)   r.   d   r/   r   r   c                 S   r*   r(   r(   r+   r(   r(   r)   r.   o   r/   N)r   r3   r4   r   r    r5   r1   r6   r7   r8   r9   r:   r;   r<   )
r$   r=   r   r>   Zmin_rposr@   jZr_posRPidxr(   r(   r)   getRelativePositionX   s,   
""B6z!SelfAttention.getRelativePositionc                 C   s   g }t | jD ]`}| j| |}| j| |}| j| |}t||dd}| jdurR| jdkr@| j	|j
d d}|| }n| jdkrR| j|j
d d}	||	 }| |}
| |
}t||}|| q| tj|dd}||
 fS )a   Compute the weighted frame features, based on either the global or local (multi-head) attention mechanism.

        :param torch.tensor x: Frame features with shape [T, input_size]
        :return: A tuple of:
                    y: Weighted features based on the attention weights, with shape [T, input_size]
                    att_weights : The attention weights (before dropout), with shape [T, T]
        r   r   Nr   )r=   r   r   )r   r   r   r   r   r3   matmulZ	transposer   rB   r7   rF   r!   r#   r   r    catclone)r$   xZoutputsheadKQVZenergiesrA   rD   Zatt_weightsZ_att_weightsyr(   r(   r)   forwardx   s&   





zSelfAttention.forward)r   r   r   r   N)__name__
__module____qualname__r   rB   rF   rP   __classcell__r(   r(   r&   r)   r      s    2 r   c                       4   e Zd Z							d	 fdd	Zdd Z  ZS )
MultiAttentionr   r   Nr   c           	   
      s   t t|   t|||||d| _|| _| jdur>| jdks"J dt | _t	| jD ]}| j
t||| ||dd q,g d| _|| _| jdurb| j | _| j| jv sdJ dg | jR  dS dS )a}   Class wrapping the MultiAttention part of PGL-SUM; its key modules and parameters.

        :param int input_size: The expected input feature size.
        :param int output_size: The hidden feature size of the attention mechanisms.
        :param int freq: The frequency of the sinusoidal positional encoding.
        :param None | str pos_enc: The selected positional encoding [absolute, relative].
        :param None | int num_segments: The selected number of segments to split the videos.
        :param int heads: The selected number of global heads.
        :param None | str fusion: The selected type of feature fusion.
        )r   r   r   r   r   Nr2   znum_segments must be None or 2+   )addmultavgmaxzFusion method must be: )r   rV   r   r   	attentionnum_segmentsr   r   local_attentionr   r   Zpermitted_fusionsfusionr   )	r$   r   r   r   r   r]   r   r_   r%   r&   r(   r)   r      s:   



"zMultiAttention.__init__c                 C   sT  |  |\}}| jdur| jdurt|jd | j }t| jD ]}|| }|d | }||| }| j| |\}	}
tj	||| 
 ddd|||< tj	|	ddd}	| jdkrf|||  |	7  < q!| jdkrv|||  |	9  < q!| jdkr|||  |	7  < |||  d  < q!| jd	krt||| 
 |	|||< q!||fS )
a   Compute the weighted frame features, based on the global and locals (multi-head) attention mechanisms.

        :param torch.Tensor x: Tensor with shape [T, input_size] containing the frame features.
        :return: A tuple of:
            weighted_value: Tensor with shape [T, input_size] containing the weighted frame features.
            attn_weights: Tensor with shape [T, T] containing the attention weights.
        Nr   r   r2   )r   r   rX   rY   rZ   r[   )r\   r]   r_   mathceilr7   r   r^   F	normalizerI   r3   r[   )r$   rJ   weighted_valueattn_weightsZsegment_sizesegmentZleft_posZ	right_posZlocal_xZweighted_local_valueZattn_local_weightsr(   r(   r)   rP      s@   



zMultiAttention.forwardr   r   r   NNr   NrQ   rR   rS   r   rP   rT   r(   r(   r&   r)   rV      s    /rV   c                       rU   )
PGL_SUMr   r   Nr   c              	      s   t t|   t|||||||d| _tj||d| _tj| jjdd| _	tj
dd| _tj|dd| _tj| jjdd| _t | _t | _dS )	al   Class wrapping the PGL-SUM model; its key modules and parameters.

        :param int input_size: The expected input feature size.
        :param int output_size: The hidden feature size of the attention mechanisms.
        :param int freq: The frequency of the sinusoidal positional encoding.
        :param None | str pos_enc: The selected positional encoding [absolute, relative].
        :param None | int num_segments: The selected number of segments to split the videos.
        :param int heads: The selected number of global heads.
        :param None | str fusion: The selected type of feature fusion.
        )r   r   r   r   r]   r   r_   )r   r	   r   r   r   gư>)Znormalized_shapeepsN)r   ri   r   rV   r\   r   r   linear_1r	   linear_2r"   r#   Z	LayerNormnorm_ynorm_linearZReLUreluZSigmoidsigmoid)r$   r   r   r   r   r]   r   r_   r&   r(   r)   r      s.   
zPGL_SUM.__init__c                 C   s   | d|jd }|}| |\}}|| }| |}| |}| |}| |}| |}| |}| |}| 	|}|
dd}||fS )a   Produce frames importance scores from the frame features, using the PGL-SUM model.

        :param torch.Tensor frame_features: Tensor of shape [T, input_size] containing the frame features produced by
        using the pool5 layer of GoogleNet.
        :return: A tuple of:
            y: Tensor with shape [1, T] containing the frames importance scores in [0, 1].
            attn_weights: Tensor with shape [T, T] containing the attention weights.
        r
   r   )r6   r7   r\   r#   rm   rk   ro   rn   rl   rp   view)r$   Zframe_featuresZresidualrd   re   rO   r(   r(   r)   rP     s   	







zPGL_SUM.forwardrg   rh   r(   r(   r&   r)   ri      s    (ri   )r`   r3   Ztorch.nnr   Ztorch.nn.functionalZ
functionalrb   Moduler   rV   ri   r(   r(   r(   r)   <module>   s    X