o
    )jA                     @   s  d Z ddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
m	  mZ ddlm  mZ ddlmZ ddlmZmZ ddlmZ d&ddZd	ed
ededefddZG dd de	jZG dd de	jZG dd de	jZG dd dej	jZG dd de	jZ G dd de	jZ!G dd de	jZ"G dd de	jZ#G dd  d e	jZ$G d!d" d"e	jZ%ej&ej'ej(d#G d$d% d%eZ)dS )'z This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
    RDINOHead implementation is adapted from DINO framework.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasksc                 C   s   t | jdks	J |d u r|    }tj|| j| jd	t | || 
dk }|d u r1| j}|d u r8| j}tj|||d}|S )N   )devicedtype)r   r
   )lenshapemaxlongitemtorchZaranger
   r   expand	unsqueezeZ	as_tensor)lengthmax_lenr   r
   mask r   a/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/audio/sv/rdino.pylength_to_mask   s"   
r   L_instridekernel_sizedilationc                 C   s   |dkr%t | ||  | d }||d  ||  }|d |d g}|S | ||d   d | d }| | d | | d g}|S )Nr	      )mathceil)r   r   r   r   Zn_stepsZL_outpaddingr   r   r   get_padding_elem&   s   r"   c                       sH   e Zd Z						d fdd	Zdd Zd	ed
edefddZ  ZS )Conv1dr	   sameTreflectc
           
   
      sN   t    || _|| _|| _|| _|	| _tj||| j| j| jd||d| _	d S )Nr   )r   r   r!   groupsbias)
super__init__r   r   r   r!   padding_modennr#   conv)
selfout_channelsr   in_channelsr   r   r!   r&   r'   r*   	__class__r   r   r)   5   s    
zConv1d.__init__c                 C   sv   | j dkr| || j| j| j}n#| j dkr'| jd | j }t||df}n| j dkr-ntd| j  | |}|S )Nr$   Zcausalr	   r   validz1Padding must be 'same', 'valid' or 'causal'. Got )	r!   _manage_paddingr   r   r   Fpad
ValueErrorr,   )r-   xZnum_padZwxr   r   r   forwardS   s    



zConv1d.forwardr   r   r   c                 C   s.   |j d }t||||}tj||| jd}|S )N)mode)r   r"   r4   r5   r*   )r-   r7   r   r   r   r   r!   r   r   r   r3   h   s   
zConv1d._manage_padding)r	   r	   r$   r	   Tr%   )__name__
__module____qualname__r)   r8   intr3   __classcell__r   r   r0   r   r#   3   s     r#   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )BatchNorm1dh㈵>皙?c                    s    t    tj|||d| _d S )N)epsmomentum)r(   r)   r+   r@   norm)r-   
input_sizerC   rD   r0   r   r   r)   x   s   
zBatchNorm1d.__init__c                 C   s
   |  |S N)rE   r-   r7   r   r   r   r8      s   
zBatchNorm1d.forward)rA   rB   r;   r<   r=   r)   r8   r?   r   r   r0   r   r@   v   s
    r@   c                       s,   e Zd Zejdf fdd	Zdd Z  ZS )	TDNNBlockr	   c                    s:   t t|   t|||||d| _| | _t|d| _d S )N)r/   r.   r   r   r&   rF   )r(   rJ   r)   r#   r,   
activationr@   rE   )r-   r/   r.   r   r   rL   r&   r0   r   r   r)      s   	zTDNNBlock.__init__c                 C   s   |  | | |S rG   )rE   rL   r,   rH   r   r   r   r8      s   zTDNNBlock.forward)r;   r<   r=   r+   ReLUr)   r8   r?   r   r   r0   r   rJ      s
    rJ   c                       s,   e Zd Z			d fdd	Zdd Z  ZS )	Res2NetBlock      r	   c                    sp   t t|   || dksJ || dksJ || || t fddt|d D | _|| _d S )Nr   c                    s   g | ]
}t  d qS ))r   r   )rJ   ).0ir   Zhidden_channelZ
in_channelr   r   r   
<listcomp>   s    z)Res2NetBlock.__init__.<locals>.<listcomp>r	   )r(   rN   r)   r+   
ModuleListrangeblocksscale)r-   r/   r.   rX   r   r   r0   rS   r   r)      s   


zRes2NetBlock.__init__c                 C   s   g }t tj|| jddD ])\}}|dkr|}n|dkr&| j|d  |}n| j|d  || }|| qtj|dd}|S )Nr	   dimr   )	enumerater   chunkrX   rW   appendcat)r-   r7   yrR   Zx_iZy_ir   r   r   r8      s   zRes2NetBlock.forward)rO   rP   r	   rI   r   r   r0   r   rN      s    rN   c                       s&   e Zd Z fddZdddZ  ZS )SEBlockc                    sN   t t|   t||dd| _tjjdd| _t||dd| _	tj
 | _d S )Nr	   r/   r.   r   T)Zinplace)r(   r`   r)   r#   conv1r   r+   rM   reluconv2ZSigmoidsigmoid)r-   r/   se_channelsr.   r0   r   r   r)      s   zSEBlock.__init__Nc                 C   s   |j d }|d ur+t|| ||jd}|d}|jddd}|| jddd| }n|jddd}| | |}| | 	|}|| S )Nr9   r   r
   r	   r   TrZ   Zkeepdim)
r   r   r
   r   summeanrc   rb   re   rd   )r-   r7   lengthsLr   totalsr   r   r   r8      s   

zSEBlock.forwardrG   rI   r   r   r0   r   r`      s    
r`   c                       s(   e Zd Zd fdd	Zd	ddZ  ZS )
AttentiveStatisticsPooling   Tc                    s^   t    d| _|| _|rt|d |dd| _nt||dd| _t | _t	||dd| _
d S )Ng-q=rP   r	   ra   )r(   r)   rC   global_contextrJ   tdnnr+   ZTanhtanhr#   r,   )r-   channelsattention_channelsrq   r0   r   r   r)      s   

z#AttentiveStatisticsPooling.__init__Nc                 C   s(  |j d }d| jfdd}|d u rtj|j d |jd}t|| ||jd}|d}| jr_|jdd	d
	 }|||| \}}|d
dd|}|d
dd|}tj|||gdd}	n|}	| | | |	}	|	|dkt	d}	tj|	dd}	|||	\}}tj||fdd}
|
d}
|
S )Nr9   r   c                 S   s@   ||   |}t|| || d  ||}||fS )Nr   )ri   r   sqrtr   powclamp)r7   mrZ   rC   rj   stdr   r   r   _compute_statistics   s
   "z?AttentiveStatisticsPooling.forward.<locals>._compute_statisticsr   )r
   rg   r	   Trh   rY   z-inf)r   rC   r   Zonesr
   r   r   rq   ri   floatrepeatr^   r,   rs   rr   Zmasked_fillr4   Zsoftmax)r-   r7   rk   rl   r{   r   rm   rj   rz   ZattnZpooled_statsr   r   r   r8      s(   


z"AttentiveStatisticsPooling.forward)rp   TrG   rI   r   r   r0   r   ro      s    ro   c                       s8   e Zd Zddddejjdf fdd	Zd	ddZ  ZS )
SERes2NetBlockrO   rp   r	   c	           	         s   t    || _t||dd||d| _t|||||| _t||dd||d| _t|||| _	d | _
||kr?t||dd| _
d S d S )Nr	   )r   r   rL   r&   ra   )r(   r)   r.   rJ   tdnn1rN   res2net_blocktdnn2r`   se_blockshortcutr#   )	r-   r/   r.   res2net_scalerf   r   r   rL   r&   r0   r   r   r)   %  s<   
zSERes2NetBlock.__init__Nc                 C   sF   |}| j r
|  |}| |}| |}| |}| ||}|| S rG   )r   r   r   r   r   )r-   r7   rk   Zresidualr   r   r   r8   N  s   



zSERes2NetBlock.forwardrG   )	r;   r<   r=   r   r+   rM   r)   r8   r?   r   r   r0   r   r~   #  s    )r~   c                       sV   e Zd ZdZddejjg dg dg ddddd	g d
f fdd	ZdddZ  Z	S )
ECAPA_TDNNzAn implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
    cpu   )r   r   r   r   i   )   rP   rP   rP   r	   )r	   r   rP      r	   rp   rO   T)r	   r	   r	   r	   r	   c                    s*  t    t|t|ksJ t|t|ksJ || _t | _| jt||d |d |d ||d  t	dt|d D ]}| jt
||d  || |	|
|| || ||| d q?t|d |d |d |d ||d d| _t|d ||d| _t|d d d| _t|d d |dd	| _d S )
Nr   r	   )r   rf   r   r   rL   r&   r9   )r&   )ru   rq   r   rK   ra   )r(   r)   r   rt   r+   rU   rW   r]   rJ   rV   r~   mfaro   aspr@   asp_bnr#   fc)r-   rF   r
   Zlin_neuronsrL   rt   Zkernel_sizesZ	dilationsru   r   rf   rq   r&   rR   r0   r   r   r)   a  s^   




zECAPA_TDNN.__init__Nc              	   C   s   | dd}g }| jD ]}z|||d}W n ty"   ||}Y nw || qtj|dd dd}| |}| j||d}| |}| 	|}| dd
d}|S )zReturns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        r	   r   )rk   NrY   )Z	transposerW   	TypeErrorr]   r   r^   r   r   r   r   Zsqueeze)r-   r7   rk   Zxllayerr   r   r   r8     s    



zECAPA_TDNN.forwardrG   )
r;   r<   r=   __doc__r   r+   rM   r)   r8   r?   r   r   r0   r   r   [  s    Hr   c                       s:   e Zd Z						d fdd	Zd	d
 Zdd Z  ZS )	RDINOHeadFTrP             c	                    s  t    t|d}|dkrt||| _nJt||g}	|r'|	t| |	t  t	|d D ]}
|	t|| |rI|	t| |	t  q4|	t|| tj
|	 | _t||| _| | j tjtj||dd| _| jjjd |rd| jj_d S d S )Nr	   r   F)r'   )r(   r)   r   r+   Linearmlpr]   r@   ZGELUrV   Z
Sequential	add_layerapply_init_weightsutilsZweight_norm
last_layerZweight_gdataZfill_Zrequires_grad)r-   Zin_dimZout_dimZuse_bnZnorm_last_layerZnlayersZ
hidden_dimZbottleneck_dimZadd_dimZlayers_r0   r   r   r)     s0   
	
zRDINOHead.__init__c                 C   sV   t |tjr%tjjj|jdd t |tjr'|jd ur)tj|jd d S d S d S d S )Ng{Gz?)rz   r   )	
isinstancer+   r   r   initZtrunc_normal_weightr'   Z	constant_)r-   ry   r   r   r   r     s   zRDINOHead._init_weightsc                 C   s8   |  |}| |}tjj|ddd}| |}||fS )Nr9   r   )rZ   p)r   r   r+   
functional	normalizer   )r-   r7   Zvicr_outr   r   r   r8     s
   


zRDINOHead.forward)FTrP   r   r   r   )r;   r<   r=   r)   r   r8   r?   r   r   r0   r   r     s    "r   c                       s$   e Zd Z fddZdd Z  ZS )Combinec                    s   t t|   || _|| _d S rG   )r(   r   r)   backbonehead)r-   r   r   r0   r   r   r)     s   
zCombine.__init__c                 C   s   |  |}| |}|S rG   )r   r   )r-   r7   outputr   r   r   r8     s   

zCombine.forwardrI   r   r   r0   r   r     s    r   )module_namec                       sD   e Zd Zdeeef f fddZdd Zdd Zdd	d
Z	  Z
S )SpeakerVerification_RDINOmodel_configc                    s   t  j||g|R i | || _|| _| jd dkrtdd| _g d}t| j|d| _t| jt	ddd	| _|d
 }| 
| | j  d S )NZchannel   zFmodelscope error: Currently only 1024-channel ecapa tdnn is supported.P   )r   r   r   r   i   )rt   r   i   TZpretrained_model)r(   r)   r   Zother_configr6   feature_dimr   embedding_modelr   r   ,_SpeakerVerification_RDINO__load_check_pointeval)r-   	model_dirr   argskwargsZchannels_configpretrained_model_namer0   r   r   r)     s$   

z"SpeakerVerification_RDINO.__init__c                 C   s>   t |jdkr|jd dksJ d| |}| j|}|S )Nr   r   r	   zFmodelscope error: the shape of input audio to model needs to be [1, T])r   r   +_SpeakerVerification_RDINO__extract_featurer   r   )r-   audiofeatureZ	embeddingr   r   r   r8   $  s   
z!SpeakerVerification_RDINO.forwardc                 C   s0   t j|| jd}||jddd }|d}|S )N)Znum_mel_binsr   Trh   )KaldiZfbankr   rj   r   )r-   r   r   r   r   r   Z__extract_feature-  s   
z+SpeakerVerification_RDINO.__extract_featureNc                 C   sR   |st d}t jtj| j||d}dd |d  D }| jj	|dd d S )Nr   )Zmap_locationc                 S   s   i | ]\}}| d d|qS )zmodule. )replace)rQ   kvr   r   r   
<dictcomp>9  s    z@SpeakerVerification_RDINO.__load_check_point.<locals>.<dictcomp>ZteacherT)strict)
r   r
   loadospathjoinr   itemsr   Zload_state_dict)r-   r   r
   Z
state_dictZstate_dict_tear   r   r   Z__load_check_point3  s   

z,SpeakerVerification_RDINO.__load_check_pointrG   )r;   r<   r=   r   strr   r)   r8   r   r   r?   r   r   r0   r   r   	  s
    	r   )NNN)*r   r   r   typingr   r   r   r   Ztorch.nnr+   Ztorch.nn.functionalr   r4   Ztorchaudio.compliance.kaldiZ
complianceZkaldir   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Zmodelscope.utils.constantr   r   r>   r"   Moduler#   r@   rJ   rN   r`   ro   r~   r   r   r   Zregister_moduleZspeaker_verificationZrdino_tdnn_svr   r   r   r   r   <module>   s6   
C'=8o2