o
    )jJH                     @   s  d Z ddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
m	  mZ ddlm  mZ ddlmZ ddlmZmZ ddlmZ d.ddZd	ed
ededefddZG dd de	jZG dd de	jZG dd de	jZG dd dej	jZG dd de	jZ G dd de	jZ!G dd de	jZ"G dd de	jZ#dd  Z$d/d%d&Z%G d'd( d(e	jZ&G d)d* d*ej	jZ'ej(ej)ej*d+G d,d- d-eZ+dS )0aC   This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
    Self-Distillation Prototypes Network(SDPN) is a self-supervised learning framework in SV.
    It comprises a teacher and a student network with identical architecture
    but different parameters. Teacher/student network consists of three main modules:
    the encoder for extracting speaker embeddings, multi-layer perceptron for
    feature transformation, and prototypes for computing soft-distributions between
    global and local views. EMA denotes Exponential Moving Average.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasksc                 C   s   t | jdks	J |d u r|    }tj|| j| jd	t | || 
dk }|d u r1| j}|d u r8| j}tj|||d}|S )N   )devicedtype)r   r
   )lenshapemaxlongitemtorchZaranger
   r   expand	unsqueezeZ	as_tensor)lengthmax_lenr   r
   mask r   `/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/audio/sv/sdpn.pylength_to_mask   s"   
r   L_instridekernel_sizedilationc                 C   s   |dkr%t | ||  | d }||d  ||  }|d |d g}|S | ||d   d | d }| | d | | d g}|S Nr	      )mathceil)r   r   r   r   Zn_stepsZL_outpaddingr   r   r   get_padding_elem+   s   r#   c                       sH   e Zd Z						d fdd	Zdd Zd	ed
edefddZ  ZS )Conv1dr	   sameTreflectc
           
   
      sN   t    || _|| _|| _|| _|	| _tj||| j| j| jd||d| _	d S )Nr   )r   r   r"   groupsbias)
super__init__r   r   r   r"   padding_modennr$   conv)
selfout_channelsr   in_channelsr   r   r"   r'   r(   r+   	__class__r   r   r*   :   s    
zConv1d.__init__c                 C   sv   | j dkr| || j| j| j}n#| j dkr'| jd | j }t||df}n| j dkr-ntd| j  | |}|S )Nr%   Zcausalr	   r   validz1Padding must be 'same', 'valid' or 'causal'. Got )	r"   _manage_paddingr   r   r   Fpad
ValueErrorr-   )r.   xZnum_padZwxr   r   r   forwardX   s    



zConv1d.forwardr   r   r   c                 C   s.   |j d }t||||}tj||| jd}|S )N)mode)r   r#   r5   r6   r+   )r.   r8   r   r   r   r   r"   r   r   r   r4   m   s   
zConv1d._manage_padding)r	   r	   r%   r	   Tr&   )__name__
__module____qualname__r*   r9   intr4   __classcell__r   r   r1   r   r$   8   s     r$   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )BatchNorm1dh㈵>皙?c                    s    t    tj|||d| _d S )N)epsmomentum)r)   r*   r,   rA   norm)r.   
input_sizerD   rE   r1   r   r   r*   }   s   
zBatchNorm1d.__init__c                 C   s
   |  |S N)rF   r.   r8   r   r   r   r9      s   
zBatchNorm1d.forward)rB   rC   r<   r=   r>   r*   r9   r@   r   r   r1   r   rA   {   s
    rA   c                       s,   e Zd Zejdf fdd	Zdd Z  ZS )	TDNNBlockr	   c                    s:   t t|   t|||||d| _| | _t|d| _d S )N)r0   r/   r   r   r'   rG   )r)   rK   r*   r$   r-   
activationrA   rF   )r.   r0   r/   r   r   rM   r'   r1   r   r   r*      s   	zTDNNBlock.__init__c                 C   s   |  | | |S rH   )rF   rM   r-   rI   r   r   r   r9      s   zTDNNBlock.forward)r<   r=   r>   r,   ReLUr*   r9   r@   r   r   r1   r   rK      s
    rK   c                       s,   e Zd Z			d fdd	Zdd Z  ZS )	Res2NetBlock      r	   c                    sp   t t|   || dksJ || dksJ || || t fddt|d D | _|| _d S )Nr   c                    s   g | ]
}t  d qS ))r   r   )rK   ).0ir   Zhidden_channelZ
in_channelr   r   r   
<listcomp>   s    z)Res2NetBlock.__init__.<locals>.<listcomp>r	   )r)   rO   r*   r,   
ModuleListrangeblocksscale)r.   r0   r/   rY   r   r   r1   rT   r   r*      s   


zRes2NetBlock.__init__c                 C   s   g }t tj|| jddD ])\}}|dkr|}n|dkr&| j|d  |}n| j|d  || }|| qtj|dd}|S )Nr	   dimr   )	enumerater   chunkrY   rX   appendcat)r.   r8   yrS   Zx_iZy_ir   r   r   r9      s   zRes2NetBlock.forward)rP   rQ   r	   rJ   r   r   r1   r   rO      s    rO   c                       s&   e Zd Z fddZdddZ  ZS )SEBlockc                    sN   t t|   t||dd| _tjjdd| _t||dd| _	tj
 | _d S )Nr	   r0   r/   r   T)Zinplace)r)   ra   r*   r$   conv1r   r,   rN   reluconv2ZSigmoidsigmoid)r.   r0   se_channelsr/   r1   r   r   r*      s   zSEBlock.__init__Nc                 C   s   |j d }|d ur+t|| ||jd}|d}|jddd}|| jddd| }n|jddd}| | |}| | 	|}|| S )Nr:   r   r
   r	   r   Tr[   Zkeepdim)
r   r   r
   r   summeanrd   rc   rf   re   )r.   r8   lengthsLr   totalsr   r   r   r9      s   

zSEBlock.forwardrH   rJ   r   r   r1   r   ra      s    
ra   c                       s(   e Zd Zd fdd	Zd	ddZ  ZS )
AttentiveStatisticsPooling   Tc                    s^   t    d| _|| _|rt|d |dd| _nt||dd| _t | _t	||dd| _
d S )Ng-q=rQ   r	   rb   )r)   r*   rD   global_contextrK   tdnnr,   ZTanhtanhr$   r-   )r.   channelsattention_channelsrr   r1   r   r   r*      s   

z#AttentiveStatisticsPooling.__init__Nc                 C   s(  |j d }d| jfdd}|d u rtj|j d |jd}t|| ||jd}|d}| jr_|jdd	d
	 }|||| \}}|d
dd|}|d
dd|}tj|||gdd}	n|}	| | | |	}	|	|dkt	d}	tj|	dd}	|||	\}}tj||fdd}
|
d}
|
S )Nr:   r   c                 S   s@   ||   |}t|| || d  ||}||fS )Nr   )rj   r   sqrtr   powclamp)r8   mr[   rD   rk   stdr   r   r   _compute_statistics   s
   "z?AttentiveStatisticsPooling.forward.<locals>._compute_statisticsr   )r
   rh   r	   Tri   rZ   z-inf)r   rD   r   Zonesr
   r   r   rr   rj   floatrepeatr_   r-   rt   rs   Zmasked_fillr5   Zsoftmax)r.   r8   rl   rm   r|   r   rn   rk   r{   ZattnZpooled_statsr   r   r   r9      s(   


z"AttentiveStatisticsPooling.forward)rq   TrH   rJ   r   r   r1   r   rp      s    rp   c                       s8   e Zd Zddddejjdf fdd	Zd	ddZ  ZS )
SERes2NetBlockrP   rq   r	   c	           	         s   t    || _t||dd||d| _t|||||| _t||dd||d| _t|||| _	d | _
||kr?t||dd| _
d S d S )Nr	   )r   r   rM   r'   rb   )r)   r*   r/   rK   tdnn1rO   res2net_blocktdnn2ra   se_blockshortcutr$   )	r.   r0   r/   res2net_scalerg   r   r   rM   r'   r1   r   r   r*   *  s<   
zSERes2NetBlock.__init__Nc                 C   sF   |}| j r
|  |}| |}| |}| |}| ||}|| S rH   )r   r   r   r   r   )r.   r8   rl   Zresidualr   r   r   r9   S  s   



zSERes2NetBlock.forwardrH   )	r<   r=   r>   r   r,   rN   r*   r9   r@   r   r   r1   r   r   (  s    )r   c                       sV   e Zd ZdZddejjg dg dg ddddd	g d
f fdd	ZdddZ  Z	S )
ECAPA_TDNNzAn implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
    cpu   )r   r   r   r   i   )   rQ   rQ   rQ   r	   )r	   r   rQ      r	   rq   rP   T)r	   r	   r	   r	   r	   c                    s*  t    t|t|ksJ t|t|ksJ || _t | _| jt||d |d |d ||d  t	dt|d D ]}| jt
||d  || |	|
|| || ||| d q?t|d |d |d |d ||d d| _t|d ||d| _t|d d d| _t|d d |dd	| _d S )
Nr   r	   )r   rg   r   r   rM   r'   r:   )r'   )rv   rr   r   rL   rb   )r)   r*   r   ru   r,   rV   rX   r^   rK   rW   r   mfarp   asprA   asp_bnr$   fc)r.   rG   r
   Zlin_neuronsrM   ru   Zkernel_sizesZ	dilationsrv   r   rg   rr   r'   rS   r1   r   r   r*   f  s^   




zECAPA_TDNN.__init__Nc              	   C   s   | dd}g }| jD ]}z|||d}W n ty"   ||}Y nw || qtj|dd dd}| |}| j||d}| |}| 	|}| dd
d}|S )zReturns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        r	   r   )rl   NrZ   )Z	transposerX   	TypeErrorr^   r   r_   r   r   r   r   Zsqueeze)r.   r8   rl   Zxllayerr   r   r   r9     s    



zECAPA_TDNN.forwardrH   )
r<   r=   r>   __doc__r   r,   rN   r*   r9   r@   r   r   r1   r   r   `  s    Hr   c                 C   s   dd }||d|  k s||d|  krt jddd t B ||| | }||| | }| d| d d| d  |   | |td  | 	| | j
||d | W  d    S 1 sdw   Y  d S )	Nc                 S   s   dt | t d  d S )N      ?       @)r    erfrw   )r8   r   r   r   norm_cdf  s   z(_no_grad_trunc_normal_.<locals>.norm_cdfr   zimean is more than 2 std from [a, b] in nn.init.trunc_normal_.The distribution of values may be incorrect.)
stacklevelr	   r   )minr   )warningswarnr   Zno_gradZuniform_Zerfinv_Zmul_r    rw   Zadd_Zclamp_)tensorrk   r{   abr   Zl_ur   r   r   _no_grad_trunc_normal_  s     

$r           r          r   c                 C   s   t | ||||S rH   )r   )r   rk   r{   r   r   r   r   r   trunc_normal_  s   r   c                       s6   e Zd Z				d fdd	Zdd Zd	d
 Z  ZS )SDPNHeadFrQ         c                    s   t    t|d}|dkrt||| _nJt||g}|r'|t| |t  t	|d D ]}|t|| |rI|t| |t  q4|t|| tj
| | _| | j d S r   )r)   r*   r   r,   Linearmlpr^   rA   ZGELUrW   Z
Sequentialapply_init_weights)r.   Zin_dimZuse_bnZnlayersZ
hidden_dimZbottleneck_dimZlayers_r1   r   r   r*     s    

zSDPNHead.__init__c                 C   sP   t |tjr"t|jdd t |tjr$|jd ur&tj|jd d S d S d S d S )Ng{Gz?)r{   r   )
isinstancer,   r   r   weightr(   initZ	constant_)r.   rz   r   r   r   r     s   zSDPNHead._init_weightsc                 C   s    |  |}tjj|ddd}|S )Nr:   r   )r[   p)r   r,   
functional	normalizerI   r   r   r   r9     s   
zSDPNHead.forward)FrQ   r   r   )r<   r=   r>   r*   r   r9   r@   r   r   r1   r   r     s    r   c                       s(   e Zd ZdZ fddZdd Z  ZS )Combinerz1
    Combine backbone (ECAPA) and head (MLP)
    c                    s   t t|   || _|| _d S rH   )r)   r   r*   backbonehead)r.   r   r   r1   r   r   r*   #  s   
zCombiner.__init__c                 C   s   |  |}| |}||fS rH   )r   r   )r.   r8   outputr   r   r   r9   (  s   

zCombiner.forward)r<   r=   r>   r   r*   r9   r@   r   r   r1   r   r     s    r   )module_namec                       sH   e Zd ZdZdeeef f fddZdd Zdd Z	dd
dZ
  ZS )SpeakerVerificationSDPNz
    Self-Distillation Prototypes Network (SDPN) effectively facilitates
    self-supervised speaker representation learning. The specific structure can be
    referred to in https://arxiv.org/pdf/2308.02774.
    model_configc                    s   t  j||g|R i | || _|| _| jd dkrtdd| _g d}t| j|d| _t| jt	dd| _|d	 }| 
| | j  d S )
NZchannel   zFmodelscope error: Currently only 1024-channel ecapa tdnn is supported.P   )r   r   r   r   i   )ru   r   TZpretrained_model)r)   r*   r   Zother_configr7   feature_dimr   embedding_modelr   r   *_SpeakerVerificationSDPN__load_check_pointeval)r.   	model_dirr   argskwargsZchannels_configpretrained_model_namer1   r   r   r*   6  s$   
z SpeakerVerificationSDPN.__init__c                 C   s>   t |jdkr|jd dksJ d| |}| j|}|S )Nr   r   r	   zFmodelscope error: the shape of input audio to model needs to be [1, T])r   r   )_SpeakerVerificationSDPN__extract_featurer   r   )r.   audiofeatureZ	embeddingr   r   r   r9   M  s   
zSpeakerVerificationSDPN.forwardc                 C   s0   t j|| jd}||jddd }|d}|S )N)Znum_mel_binsr   Tri   )KaldiZfbankr   rk   r   )r.   r   r   r   r   r   Z__extract_featureV  s   
z)SpeakerVerificationSDPN.__extract_featureNc                 C   sR   |st d}t jtj| j||d}dd |d  D }| jj	|dd d S )Nr   )Zmap_locationc                 S   s   i | ]\}}| d d|qS )zmodule. )replace)rR   kvr   r   r   
<dictcomp>b  s    z>SpeakerVerificationSDPN.__load_check_point.<locals>.<dictcomp>ZteacherT)strict)
r   r
   loadospathjoinr   itemsr   Zload_state_dict)r.   r   r
   Z
state_dictZstate_dict_tear   r   r   Z__load_check_point\  s   

z*SpeakerVerificationSDPN.__load_check_pointrH   )r<   r=   r>   r   r   strr   r*   r9   r   r   r@   r   r   r1   r   r   .  s    	r   )NNN)r   r   r   r   ),r   r    r   typingr   r   r   r   Ztorch.nnr,   Ztorch.nn.functionalr   r5   Ztorchaudio.compliance.kaldiZ
complianceZkaldir   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Zmodelscope.utils.constantr   r   r?   r#   Moduler$   rA   rK   rO   ra   rp   r   r   r   r   r   r   Zregister_moduleZspeaker_verificationZsdpn_svr   r   r   r   r   <module>   s6   
C'=8o
$&