o
    )j'                     @   s  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  G dd	 d	ej!Z"G d
d dej!Z#ej$ej%ej&dG dd deZ'dS )a   ResNet implementation is adapted from https://github.com/wenet-e2e/wespeaker.
    ResNet, or Residual Neural Network, is notable for its optimization ease
    and depth-induced accuracy gains. It utilizes skip connections within its residual
    blocks to counteract the vanishing gradient problem in deep networks.
    Reference: Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasks)create_devicec                       s*   e Zd ZdZd fdd	Zdd Z  ZS )
BasicBlock   c              	      s   t t|   tj||d|ddd| _t|| _tj||ddddd| _t|| _	t
 | _|dks;|| j| krUt
tj|| j| d|ddt| j| | _d S d S )N   r   Fkernel_sizestridepaddingbias)r   r   r   )superr
   __init__nnConv2dconv1BatchNorm2dbn1conv2bn2
Sequentialshortcut	expansion)self	in_planesplanesr   	__class__ b/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/audio/sv/ResNet.pyr      s6   

zBasicBlock.__init__c                 C   sB   t | | |}| | |}|| |7 }t |}|S N)Frelur   r   r   r   r   )r   xoutr#   r#   r$   forward5   s
   
zBasicBlock.forward)r   )__name__
__module____qualname__r   r   r*   __classcell__r#   r#   r!   r$   r
      s    r
   c                       s@   e Zd Zeg ddddddf fdd	Zd	d
 Zdd Z  ZS )ResNet)r         r       P      ZTSTPTc                    s\  t t|   || _|| _|| _t|d | d | _|| _t	j
d|ddddd| _t	|| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _|d
ksn|dkrpdnd| _tt|| j|j d| _t	| j|j | j || _| jrt	j|dd| _t	||| _d S t	 | _t	 | _d S )N   r   r   Fr   r   )r      r0   TAPZTSDP)Zin_dim)Zaffine)r   r/   r   r   feat_dimembedding_sizeintZ	stats_dimtwo_emb_layerr   r   r   r   r   _make_layerlayer1layer2layer3layer4Zn_statsgetattrpooling_layersr   poolZLinearseg_1ZBatchNorm1dseg_bn_1seg_2ZIdentity)r   block
num_blocks
m_channelsr8   r9   Zpooling_funcr;   r!   r#   r$   r   ?   sD   

zResNet.__init__c                 C   sL   |gdg|d   }g }|D ]}| || j|| ||j | _qtj| S )Nr   )appendr   r   r   r   )r   rG   r    rH   r   stridesZlayersr#   r#   r$   r<   g   s   
zResNet._make_layerc           	      C   s   | ddd}|d}t| | |}| |}| |}| |}| 	|}| 
|}| |}| jrIt|}| |}| |}|S |S )Nr   r6   r   )ZpermuteZ
unsqueeze_r&   r'   r   r   r=   r>   r?   r@   rC   rD   r;   rE   rF   )	r   r(   r)   Zout1Zout2Zout3statsZembed_aZembed_br#   r#   r$   r*   o   s   









zResNet.forward)r+   r,   r-   r
   r   r<   r*   r.   r#   r#   r!   r$   r/   =   s    (r/   )module_namec                       sH   e Zd ZdZdeeef f fddZdd Zdd Z	dd
dZ
  ZS )SpeakerVerificationResNetzW
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    s   t  j||g|R i | || _| jd | _| jd | _|| _d| _t| jd | _t	| j| jd| _
|d }| | | j
| j | j
  d S )N	embed_dimZchannelsr3   device)r9   rI   Zpretrained_model)r   r   rO   rP   rI   Zother_configfeature_dimr	   rQ   r/   embedding_model,_SpeakerVerificationResNet__load_check_pointtoeval)r   	model_dirrO   argskwargspretrained_model_namer!   r#   r$   r      s   
z"SpeakerVerificationResNet.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr   r   r6   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpZndarraytorchZ
from_numpylenshape	unsqueeze+_SpeakerVerificationResNet__extract_featurerS   rU   rQ   detachcpu)r   audiofeatureZ	embeddingr#   r#   r$   r*      s   


z!SpeakerVerificationResNet.forwardc                 C   s0   t j|| jd}||jddd }|d}|S )N)Znum_mel_binsr   T)dimZkeepdim)KaldiZfbankrR   meanr`   )r   rd   re   r#   r#   r$   Z__extract_feature   s   
z+SpeakerVerificationResNet.__extract_featureNc                 C   s8   |st d}| jjt jtj| j||ddd d S )Nrc   )Zmap_locationT)strict)	r]   rQ   rS   Zload_state_dictloadospathjoinrW   )r   rZ   rQ   r#   r#   r$   Z__load_check_point   s   

z,SpeakerVerificationResNet.__load_check_pointr%   )r+   r,   r-   __doc__r   strr   r   r*   ra   rT   r.   r#   r#   r!   r$   rN      s    rN   )(rn   mathrk   typingr   r   r   numpyr\   r]   Ztorch.nnr   Ztorch.nn.functionalZ
functionalr&   Ztorchaudio.compliance.kaldiZ
complianceZkaldirg   Z)modelscope.models.audio.sv.pooling_layersmodelsrd   svrB   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Zmodelscope.utils.constantr   Zmodelscope.utils.devicer	   Moduler
   r/   Zregister_moduleZspeaker_verificationZ	resnet_svrN   r#   r#   r#   r$   <module>   s(   #F