o
    )j)                     @   s  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  G dd	 d	ej!Z"G d
d dej!Z#ej$ej%ej&dG dd deZ'dS )aV  
    This TDNN implementation is adapted from https://github.com/wenet-e2e/wespeaker.
    TDNN replaces i-vectors for text-independent speaker verification with embeddings
    extracted from a feedforward deep neural network. The specific structure can be
    referred to in https://www.danielpovey.com/files/2017_interspeech_embeddings.pdf.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasks)create_devicec                       s&   e Zd Zd fdd	Zdd Z  ZS )	TdnnLayer   r   c                    s`   t t|   || _|| _|| _|| _|| _tj	| j| j| j| j| jd| _
tj|dd| _dS )aT  Define the TDNN layer, essentially 1-D convolution

        Args:
            in_dim (int): input dimension
            out_dim (int): output channels
            context_size (int): context size, essentially the filter size
            dilation (int, optional):  Defaults to 1.
            padding (int, optional):  Defaults to 0.
        )dilationpaddingF)ZaffineN)superr
   __init__in_dimout_dimcontext_sizer   r   nnZConv1dconv_1dZBatchNorm1dbn)selfr   r   r   r   r   	__class__ c/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/audio/sv/xvector.pyr      s   
zTdnnLayer.__init__c                 C   s"   |  |}t|}| |}|S )N)r   FZrelur   )r   xoutr   r   r   forward5   s   


zTdnnLayer.forward)r   r   __name__
__module____qualname__r   r   __classcell__r   r   r   r   r
      s    r
   c                       s0   e Zd Z					d	 fdd	Zdd Z  ZS )
XVEC(        TSTPc                    s   t t|   || _|| _|| _t||ddd| _t||ddd| _t||ddd| _	t||ddd| _
t||ddd| _|dksE|dkrGdnd| _tt|| jd| _t| j| j || _d	S )
z
        Implementation of Kaldi style xvec, as described in
        X-VECTORS: ROBUST DNN EMBEDDINGS FOR SPEAKER RECOGNITION
           r   )r   r         TAPZTSDP)r   N)r   r$   r   feat_dim	stats_dim	embed_dimr
   frame_1frame_2frame_3frame_4frame_5Zn_statsgetattrpooling_layerspoolr   ZLinearseg_1)r   r-   Zhid_dimr.   r/   Zpooling_funcr   r   r   r   >   s    
zXVEC.__init__c                 C   sX   | ddd}| |}| |}| |}| |}| |}| |}| |}|S )Nr   r+   r   )Zpermuter0   r1   r2   r3   r4   r7   r8   )r   r   r   statsZembed_ar   r   r   r   X   s   






zXVEC.forward)r%   r&   r'   r&   r(   r   r   r   r   r   r$   <   s    r$   )module_namec                       sB   e Zd Zdeeef f fddZdd Zdd Zdd	 Z	  Z
S )
SpeakerVerificationTDNNmodel_configc                    s   t  j||g|R i | || _|| _d| _d| _t| jd | _t| j t	| j| jd| _
|d }| | | j
| j | j
  d S )NP   r&   device)r-   r/   Zpretrained_model)r   r   r<   Zother_configfeature_dimr/   r	   r>   printr$   embedding_model*_SpeakerVerificationTDNN__load_check_pointtoeval)r   	model_dirr<   argskwargspretrained_model_namer   r   r   r   i   s   

z SpeakerVerificationTDNN.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr   r   r+   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpZndarraytorchZ
from_numpylenshape	unsqueeze)_SpeakerVerificationTDNN__extract_featurerA   rC   r>   detachcpu)r   audiofeatureZ	embeddingr   r   r   r   |   s   


zSpeakerVerificationTDNN.forwardc                 C   sT   g }|D ]}t j|d| jd}||jddd }||d qt|}|S )Nr   )Znum_mel_binsT)dimZkeepdim)KaldiZfbankrN   r?   meanappendrK   cat)r   rR   featuresaurS   r   r   r   Z__extract_feature   s   
z)SpeakerVerificationTDNN.__extract_featurec                 C   s0   | j jtjtj| j|tdddd d S )NrQ   )Zmap_locationT)strict)	rA   Zload_state_dictrK   loadospathjoinrE   r>   )r   rH   r   r   r   Z__load_check_point   s   
z*SpeakerVerificationTDNN.__load_check_point)r    r!   r"   r   strr   r   r   rO   rB   r#   r   r   r   r   r;   f   s
    
r;   )(__doc__mathr]   typingr   r   r   numpyrJ   rK   Ztorch.nnr   Ztorch.nn.functionalZ
functionalr   Ztorchaudio.compliance.kaldiZ
complianceZkaldirU   Z)modelscope.models.audio.sv.pooling_layersmodelsrR   svr6   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Zmodelscope.utils.constantr   Zmodelscope.utils.devicer	   Moduler
   r$   Zregister_moduleZspeaker_verificationZtdnn_svr;   r   r   r   r   <module>   s$   #*