o
    )j:                     @   sx  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlmZ ddlmZmZ ddlmZ ddlmZ d#dd	Zd
edededefddZG dd dejZ G dd dejZ!G dd dejZ"G dd de	jjZ#G dd dejZ$G dd dejZ%G dd dejZ&G dd dejZ'ej(ej)ej*d G d!d" d"eZ+dS )$z\ This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasks)create_devicec                 C   s   t | jdks	J |d u r|    }tj|| j| jd	t | || 
dk }|d u r1| j}|d u r8| j}tj|||d}|S )N   )devicedtype)r   r   )lenshapemaxlongitemtorchZaranger   r   expand	unsqueezeZ	as_tensor)lengthmax_lenr   r   mask r   f/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/audio/sv/ecapa_tdnn.pylength_to_mask   s"   
r   L_instridekernel_sizedilationc                 C   s   |dkr%t | ||  | d }||d  ||  }|d |d g}|S | ||d   d | d }| | d | | d g}|S )Nr
      )mathceil)r   r   r   r   Zn_stepsZL_outpaddingr   r   r   get_padding_elem'   s   r#   c                       sH   e Zd Z						d fdd	Zdd Zd	ed
edefddZ  ZS )Conv1dr
   sameTreflectc
           
   
      sN   t    || _|| _|| _|| _|	| _tj||| j| j| jd||d| _	d S )Nr   )r   r   r"   groupsbias)
super__init__r   r   r   r"   padding_modennr$   conv)
selfout_channelsr   in_channelsr   r   r"   r'   r(   r+   	__class__r   r   r*   6   s    
zConv1d.__init__c                 C   sv   | j dkr| || j| j| j}n#| j dkr'| jd | j }t||df}n| j dkr-ntd| j  | |}|S )Nr%   Zcausalr
   r   validz1Padding must be 'same', 'valid' or 'causal'. Got )	r"   _manage_paddingr   r   r   Fpad
ValueErrorr-   )r.   xZnum_padZwxr   r   r   forwardT   s    



zConv1d.forwardr   r   r   c                 C   s.   |j d }t||||}tj||| jd}|S )N)mode)r   r#   r5   r6   r+   )r.   r8   r   r   r   r   r"   r   r   r   r4   i   s   
zConv1d._manage_padding)r
   r
   r%   r
   Tr&   )__name__
__module____qualname__r*   r9   intr4   __classcell__r   r   r1   r   r$   4   s     r$   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )BatchNorm1dh㈵>皙?c                    s    t    tj|||d| _d S )N)epsmomentum)r)   r*   r,   rA   norm)r.   
input_sizerD   rE   r1   r   r   r*   y   s   
zBatchNorm1d.__init__c                 C   s
   |  |S N)rF   r.   r8   r   r   r   r9      s   
zBatchNorm1d.forward)rB   rC   r<   r=   r>   r*   r9   r@   r   r   r1   r   rA   w   s
    rA   c                       s,   e Zd Zejdf fdd	Zdd Z  ZS )	TDNNBlockr
   c                    s:   t t|   t|||||d| _| | _t|d| _d S )N)r0   r/   r   r   r'   rG   )r)   rK   r*   r$   r-   
activationrA   rF   )r.   r0   r/   r   r   rM   r'   r1   r   r   r*      s   	zTDNNBlock.__init__c                 C   s   |  | | |S rH   )rF   rM   r-   rI   r   r   r   r9      s   zTDNNBlock.forward)r<   r=   r>   r,   ReLUr*   r9   r@   r   r   r1   r   rK      s
    rK   c                       s,   e Zd Z			d fdd	Zdd Z  ZS )	Res2NetBlock      r
   c                    sp   t t|   || dksJ || dksJ || || t fddt|d D | _|| _d S )Nr   c                    s   g | ]
}t  d qS ))r   r   )rK   ).0ir   Zhidden_channelZ
in_channelr   r   r   
<listcomp>   s    z)Res2NetBlock.__init__.<locals>.<listcomp>r
   )r)   rO   r*   r,   
ModuleListrangeblocksscale)r.   r0   r/   rY   r   r   r1   rT   r   r*      s   


zRes2NetBlock.__init__c                 C   s   g }t tj|| jddD ])\}}|dkr|}n|dkr&| j|d  |}n| j|d  || }|| qtj|dd}|S )Nr
   dimr   )	enumerater   chunkrY   rX   appendcat)r.   r8   yrS   Zx_iZy_ir   r   r   r9      s   zRes2NetBlock.forward)rP   rQ   r
   rJ   r   r   r1   r   rO      s    rO   c                       s&   e Zd Z fddZdddZ  ZS )SEBlockc                    sN   t t|   t||dd| _tjjdd| _t||dd| _	tj
 | _d S )Nr
   r0   r/   r   T)Zinplace)r)   ra   r*   r$   conv1r   r,   rN   reluconv2ZSigmoidsigmoid)r.   r0   se_channelsr/   r1   r   r   r*      s   zSEBlock.__init__Nc                 C   s   |j d }|d ur+t|| ||jd}|d}|jddd}|| jddd| }n|jddd}| | |}| | 	|}|| S )Nr:   r   r   r
   r   Tr[   Zkeepdim)
r   r   r   r   summeanrd   rc   rf   re   )r.   r8   lengthsLr   totalsr   r   r   r9      s   

zSEBlock.forwardrH   rJ   r   r   r1   r   ra      s    
ra   c                       s(   e Zd Zd fdd	Zd	ddZ  ZS )
AttentiveStatisticsPooling   Tc                    s^   t    d| _|| _|rt|d |dd| _nt||dd| _t | _t	||dd| _
d S )Ng-q=rQ   r
   rb   )r)   r*   rD   global_contextrK   tdnnr,   ZTanhtanhr$   r-   )r.   channelsattention_channelsrr   r1   r   r   r*      s   

z#AttentiveStatisticsPooling.__init__Nc                 C   s(  |j d }d| jfdd}|d u rtj|j d |jd}t|| ||jd}|d}| jr_|jdd	d
	 }|||| \}}|d
dd|}|d
dd|}tj|||gdd}	n|}	| | | |	}	|	|dkt	d}	tj|	dd}	|||	\}}tj||fdd}
|
d}
|
S )Nr:   r   c                 S   s@   ||   |}t|| || d  ||}||fS )Nr   )rj   r   sqrtr   powclamp)r8   mr[   rD   rk   stdr   r   r   _compute_statistics   s
   "z?AttentiveStatisticsPooling.forward.<locals>._compute_statisticsr   )r   rh   r
   Tri   rZ   z-inf)r   rD   r   Zonesr   r   r   rr   rj   floatrepeatr_   r-   rt   rs   Zmasked_fillr5   Zsoftmax)r.   r8   rl   rm   r|   r   rn   rk   r{   ZattnZpooled_statsr   r   r   r9      s(   


z"AttentiveStatisticsPooling.forward)rq   TrH   rJ   r   r   r1   r   rp      s    rp   c                       s8   e Zd Zddddejjdf fdd	Zd	ddZ  ZS )
SERes2NetBlockrP   rq   r
   c	           	         s   t    || _t||dd||d| _t|||||| _t||dd||d| _t|||| _	d | _
||kr?t||dd| _
d S d S )Nr
   )r   r   rM   r'   rb   )r)   r*   r/   rK   tdnn1rO   res2net_blocktdnn2ra   se_blockshortcutr$   )	r.   r0   r/   res2net_scalerg   r   r   rM   r'   r1   r   r   r*   &  s<   
zSERes2NetBlock.__init__Nc                 C   sF   |}| j r
|  |}| |}| |}| |}| ||}|| S rH   )r   r   r   r   r   )r.   r8   rl   Zresidualr   r   r   r9   O  s   



zSERes2NetBlock.forwardrH   )	r<   r=   r>   r   r,   rN   r*   r9   r@   r   r   r1   r   r   $  s    )r   c                       sV   e Zd ZdZddejjg dg dg ddddd	g d
f fdd	ZdddZ  Z	S )
ECAPA_TDNNzAn implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
    cpu   )   r   r   r   i   )   rQ   rQ   rQ   r
   )r
   r   rQ      r
   rq   rP   T)r
   r
   r
   r
   r
   c                    s*  t    t|t|ksJ t|t|ksJ || _t | _| jt||d |d |d ||d  t	dt|d D ]}| jt
||d  || |	|
|| || ||| d q?t|d |d |d |d ||d d| _t|d ||d| _t|d d d| _t|d d |dd	| _d S )
Nr   r
   )r   rg   r   r   rM   r'   r:   )r'   )rv   rr   r   rL   rb   )r)   r*   r   ru   r,   rV   rX   r^   rK   rW   r   mfarp   asprA   asp_bnr$   fc)r.   rG   r   Zlin_neuronsrM   ru   Zkernel_sizesZ	dilationsrv   r   rg   rr   r'   rS   r1   r   r   r*   b  s^   




zECAPA_TDNN.__init__Nc              	   C   s   | dd}g }| jD ]}z|||d}W n ty"   ||}Y nw || qtj|dd dd}| |}| j||d}| |}| 	|}| dd
d}|S )zReturns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        r
   r   )rl   NrZ   )Z	transposerX   	TypeErrorr^   r   r_   r   r   r   r   Zsqueeze)r.   r8   rl   Zxllayerr   r   r   r9     s    



zECAPA_TDNN.forwardrH   )
r<   r=   r>   __doc__r   r,   rN   r*   r9   r@   r   r   r1   r   r   \  s    Hr   )module_namec                       sB   e Zd Zdeeef f fddZdd Zdd Zdd	 Z	  Z
S )
SpeakerVerificationECAPATDNNmodel_configc                    s   t  j||g|R i | || _|| _| jd dkrtdd| _g d}t| jd | _t| j t	| j|d| _
|d }| | | j
| j | j
  d S )	NZchannel   zFmodelscope error: Currently only 1024-channel ecapa tdnn is supported.P   )r   r   r   r   i   r   )ru   Zpretrained_model)r)   r*   r   Zother_configr7   feature_dimr	   r   printr   embedding_model/_SpeakerVerificationECAPATDNN__load_check_pointtoeval)r.   	model_dirr   argskwargsZchannels_configpretrained_model_namer1   r   r   r*     s$   

z%SpeakerVerificationECAPATDNN.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr
   r   r   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpZndarrayr   Z
from_numpyr   r   r   ._SpeakerVerificationECAPATDNN__extract_featurer   r   r   detachr   )r.   audiofeatureZ	embeddingr   r   r   r9     s   


z$SpeakerVerificationECAPATDNN.forwardc                 C   sT   g }|D ]}t j|d| jd}||jddd }||d qt|}|S )Nr   )Znum_mel_binsTri   )KaldiZfbankr   r   rk   r^   r   r_   )r.   r   featuresaur   r   r   r   Z__extract_feature  s   
z.SpeakerVerificationECAPATDNN.__extract_featurec                 C   s0   | j jtjtj| j|tdddd d S )Nr   )Zmap_locationT)strict)	r   Zload_state_dictr   loadospathjoinr   r   )r.   r   r   r   r   Z__load_check_point  s   
z/SpeakerVerificationECAPATDNN.__load_check_point)r<   r=   r>   r   strr   r*   r9   r   r   r@   r   r   r1   r   r     s
    
r   )NNN),r   r    r   typingr   r   r   numpyr   r   Ztorch.nnr,   Ztorch.nn.functionalZ
functionalr5   Ztorchaudio.compliance.kaldiZ
complianceZkaldir   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Zmodelscope.utils.constantr   Zmodelscope.utils.devicer	   r   r?   r#   Moduler$   rA   rK   rO   ra   rp   r   r   Zregister_moduleZspeaker_verificationZecapa_tdnn_svr   r   r   r   r   <module>   s6   
C'=8o