o
    )Îj*   ã                   @   s  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  G dd	„ d	ej!ƒZ"G d
d„ dej#ƒZ$G dd„ dej#ƒZ%ej&ej'ej(dG dd„ deƒƒZ)dS )aˆ   Res2Net implementation is adapted from https://github.com/Res2Net/Res2Net-PretrainedModels.
    Res2Net is an advanced neural network architecture that enhances the capabilities of standard ResNets
    by incorporating hierarchical residual-like connections. This innovative structure improves
    performance across various computer vision tasks, such as image classification and object
    detection, without significant computational overhead.
    Reference: https://arxiv.org/pdf/1904.01169.pdf
    Some modifications from the original architecture:
    1. Smaller kernel size for the input layer
    2. Smaller expansion in BasicBlockRes2Net
é    N)ÚAnyÚDictÚUnion)ÚModels)ÚMODELSÚ
TorchModel)ÚTasks)Úcreate_devicec                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚReLUFc                    s   t t| ƒ dd|¡ d S )Nr   é   )Úsuperr
   Ú__init__)ÚselfÚinplace©Ú	__class__© úc/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/audio/sv/Res2Net.pyr      s   zReLU.__init__c                 C   s"   | j rdnd}| jjd | d S )Nr   Ú z (ú))r   r   Ú__name__)r   Zinplace_strr   r   r   Ú__repr__"   s   
ÿÿzReLU.__repr__)F)r   Ú
__module__Ú__qualname__r   r   Ú__classcell__r   r   r   r   r
      s    r
   c                       s*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )	ÚBasicBlockRes2Neté   é   é    c           
   
      sT  t t| ƒ ¡  tt ||d  ¡ƒ}tj||| d|dd| _t 	|| ¡| _
|d | _g }g }t| jƒD ]}	| tj||dddd¡ | t 	|¡¡ q5t |¡| _t |¡| _tdd| _tj|| || j ddd	| _t 	|| j ¡| _t ¡ | _|dks‡|| j| krŸt tj|| j| d|ddt 	| j| ¡¡| _|| _|| _|| _d S )
Ng      P@r   F)Úkernel_sizeÚstrideÚbiasé   )r   Úpaddingr!   T)r   )r   r!   )r   r   r   ÚintÚmathÚfloorÚnnÚConv2dÚconv1ÚBatchNorm2dÚbn1ÚnumsÚrangeÚappendZ
ModuleListÚconvsÚbnsr
   ÚreluÚ	expansionÚconv3Úbn3Ú
SequentialÚshortcutr    ÚwidthÚscale)
r   Ú	in_planesÚplanesr    Z	baseWidthr8   r7   r/   r0   Úir   r   r   r   +   sH   ÿ
ÿÿ
ûú
zBasicBlockRes2Net.__init__c                 C   sê   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]1}|dkr)|| }n|||  }| j| |ƒ}|  | j	| |ƒ¡}|dkrG|}qt 
||fd¡}qt 
||| j fd¡}|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S )Nr   r   )r)   r+   r1   ÚtorchÚsplitr7   r-   r,   r/   r0   Úcatr3   r4   r6   )r   ÚxZresidualÚoutZspxr;   Úspr   r   r   ÚforwardL   s*   







zBasicBlockRes2Net.forward)r   r   r   )r   r   r   r2   r   rB   r   r   r   r   r   r   (   s    !r   c                       s@   e Zd Zeg d¢dddddf‡ fdd„	Zd	d
„ Zdd„ Z‡  ZS )ÚRes2Net)r"   é   é   r"   r   éP   éÀ   ZTSTPFc                    s\  t t| ƒ ¡  || _|| _|| _t|d ƒ| d | _|| _t	j
d|ddddd| _t	 |¡| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _|d
ksn|dkrpdnd| _tt|ƒ| j|j d| _t	 | j|j | j |¡| _| jr¢t	j|dd| _t	 ||¡| _d S t	 ¡ | _t	 ¡ | _d S )Né   r   r"   F)r   r    r#   r!   r   )r    r   rD   ÚTAPZTSDP)Zin_dim)Zaffine)r   rC   r   r9   Úfeat_dimÚembedding_sizer$   Z	stats_dimÚtwo_emb_layerr'   r(   r)   r*   r+   Ú_make_layerÚlayer1Úlayer2Úlayer3Úlayer4Zn_statsÚgetattrÚpooling_layersr2   ÚpoolZLinearÚseg_1ZBatchNorm1dÚseg_bn_1Úseg_2ZIdentity)r   ÚblockÚ
num_blocksÚ
m_channelsrJ   rK   Zpooling_funcrL   r   r   r   r   m   sD   ÿÿÿÿÿ
ÿÿ
zRes2Net.__init__c                 C   sL   |gdg|d   }g }|D ]}|  || j||ƒ¡ ||j | _qtj|Ž S )Nr   )r.   r9   r2   r'   r5   )r   rX   r:   rY   r    ÚstridesZlayersr   r   r   rM   •   s   
zRes2Net._make_layerc                 C   s–   |  ddd¡}| d¡}t |  |  |¡¡¡}|  |¡}|  |¡}|  |¡}|  	|¡}|  
|¡}|  |¡}| jrIt |¡}|  |¡}|  |¡}|S |S )Nr   r   r   )ZpermuteZ
unsqueeze_ÚFr1   r+   r)   rN   rO   rP   rQ   rT   rU   rL   rV   rW   )r   r?   r@   ÚstatsZembed_aZembed_br   r   r   rB      s   









zRes2Net.forward)r   r   r   r   r   rM   rB   r   r   r   r   r   rC   k   s    ù(rC   )Úmodule_namec                       sH   e Zd ZdZdeeef f‡ fdd„Zdd„ Zdd„ Z	dd
d„Z
‡  ZS )ÚSpeakerVerificationResNetzW
    Args:
        model_dir: A model dir.
        model_config: The model config.
    Úmodel_configc                    s–   t ƒ j||g|¢R i |¤Ž || _| jd | _| jd | _|| _d| _t| jd ƒ| _t	| j| jd| _
|d }|  |¡ | j
 | j¡ | j
 ¡  d S )NÚ	embed_dimZchannelsrF   Údevice)rK   rZ   Zpretrained_model)r   r   r`   ra   rZ   Zother_configÚfeature_dimr	   rb   rC   Úembedding_modelÚ,_SpeakerVerificationResNet__load_check_pointÚtoÚeval)r   Ú	model_dirr`   ÚargsÚkwargsÚpretrained_model_namer   r   r   r   ¼   s   ÿ
z"SpeakerVerificationResNet.__init__c                 C   sl   t |tjƒrt |¡}t|jƒdkr| d¡}t|jƒdks"J dƒ‚|  |¡}|  	| 
| j¡¡}| ¡  ¡ S )Nr   r   r   zFmodelscope error: the shape of input audio to model needs to be [N, T])Ú
isinstanceÚnpZndarrayr<   Z
from_numpyÚlenÚshapeÚ	unsqueezeÚ+_SpeakerVerificationResNet__extract_featurerd   rf   rb   ÚdetachÚcpu)r   ÚaudioÚfeatureZ	embeddingr   r   r   rB   Ï   s   

ÿþþ
z!SpeakerVerificationResNet.forwardc                 C   s0   t j|| jd}||jddd }| d¡}|S )N)Znum_mel_binsr   T)ÚdimZkeepdim)ÚKaldiZfbankrc   Úmeanrp   )r   rt   ru   r   r   r   Z__extract_featureÝ   s   
z+SpeakerVerificationResNet.__extract_featureNc                 C   s8   |st  d¡}| jjt jtj | j|¡|ddd d S )Nrs   )Zmap_locationT)Ústrict)	r<   rb   rd   Zload_state_dictÚloadÚosÚpathÚjoinrh   )r   rk   rb   r   r   r   Z__load_check_pointã   s   
þ
üz,SpeakerVerificationResNet.__load_check_point)N)r   r   r   Ú__doc__r   Ústrr   r   rB   rq   re   r   r   r   r   r   r_   ³   s    r_   )*r~   r%   r{   Útypingr   r   r   Únumpyrm   r<   Ztorch.nnr'   Ztorch.nn.functionalZ
functionalr\   Ztorchaudio.compliance.kaldiZ
complianceZkaldirw   Z)modelscope.models.audio.sv.pooling_layersÚmodelsrt   ÚsvrS   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Zmodelscope.utils.constantr   Zmodelscope.utils.devicer	   ZHardtanhr
   ÚModuler   rC   Zregister_moduleZspeaker_verificationZ
res2net_svr_   r   r   r   r   Ú<module>   s*   
CHÿ