o
    )Îj¿-  ã                   @   sH  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z" G d	d
„ d
ej#ƒZ$ddd„Z%ddd„Z&G dd„ dej'ƒZ(G dd„ dej'ƒZ)G dd„ dej'ƒZ*ej+e j,ej-dG dd„ deƒƒZ.dS )a7   Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
    ERes2Net_aug incorporates both local and global feature fusion techniques
    to improve the performance. The training code is located on the following
    GitHub repository: https://github.com/alibaba-damo-academy/3D-Speaker.
é    N)ÚAnyÚDictÚUnion)ÚModels)ÚMODELSÚ
TorchModel)ÚAFF)ÚTasks)Úcreate_devicec                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚReLUFc                    s   t t| ƒ dd|¡ d S )Nr   é   )Úsuperr   Ú__init__)ÚselfÚinplace©Ú	__class__© úh/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/audio/sv/ERes2Net_aug.pyr      s   zReLU.__init__c                 C   s"   | j rdnd}| jjd | d S )Nr   Ú z (ú))r   r   Ú__name__)r   Zinplace_strr   r   r   Ú__repr__   s   
ÿÿzReLU.__repr__)F)r   Ú
__module__Ú__qualname__r   r   Ú__classcell__r   r   r   r   r      s    r   é   c                 C   ó   t j| |d|dddS )z1x1 convolution without paddingr   r   F©Úkernel_sizeÚstrideÚpaddingÚbias©ÚnnÚConv2d©Ú	in_planesZ
out_planesr    r   r   r   Úconv1x1$   ó   úr(   c                 C   r   )z3x3 convolution with paddingé   r   Fr   r#   r&   r   r   r   Úconv3x3/   r)   r+   c                       ó*   e Zd ZdZd	‡ fdd„	Zdd„ Z‡  ZS )
ÚBasicBlockERes2Neté   r   é   r*   c           
   	      s6  t t| ƒ ¡  tt ||d  ¡ƒ}t||| |ƒ| _t 	|| ¡| _
|| _g }g }t| jƒD ]}	| t||ƒ¡ | t 	|¡¡ q/t |¡| _t |¡| _tdd| _t|| || j ƒ| _t 	|| j ¡| _t ¡ | _|dksx|| j| krt tj|| j| d|ddt 	| j| ¡¡| _|| _|| _|| _d S )Nç      P@T©r   r   F©r   r    r"   )r   r-   r   ÚintÚmathÚfloorr(   Úconv1r$   ÚBatchNorm2dÚbn1ÚnumsÚrangeÚappendr+   Ú
ModuleListÚconvsÚbnsr   ÚreluÚ	expansionÚconv3Úbn3Ú
SequentialÚshortcutr%   r    ÚwidthÚscale)
r   r'   Úplanesr    Ú	baseWidthrF   rE   r=   r>   Úir   r   r   r   =   s<   
ûú
zBasicBlockERes2Net.__init__c                 C   sÔ   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]1}|dkr)|| }n|||  }| j| |ƒ}|  | j	| |ƒ¡}|dkrG|}qt 
||fd¡}q|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S ©Nr   r   )r6   r8   r?   ÚtorchÚsplitrE   r:   r9   r=   r>   ÚcatrA   rB   rD   ©r   ÚxZresidualÚoutZspxrI   Úspr   r   r   Úforward\   s(   







zBasicBlockERes2Net.forward©r   r/   r*   ©r   r   r   r@   r   rR   r   r   r   r   r   r-   :   s    r-   c                       r,   )
ÚBasicBlockERes2Net_diff_AFFr.   r   r/   r*   c              	      sj  t t| ƒ ¡  tt ||d  ¡ƒ}t||| |ƒ| _t 	|| ¡| _
|| _g }g }g }	t| jƒD ]}
| t||ƒ¡ |	 t 	|¡¡ q1t| jd ƒD ]
}| t|d¡ qKt |¡| _t |	¡| _t |¡| _tdd| _t|| || j ƒ| _t 	|| j ¡| _t ¡ | _|dks’|| j| krªt tj|| j| d|ddt 	| j| ¡¡| _|| _|| _|| _d S )Nr0   r   ©ZchannelsTr1   Fr2   )r   rU   r   r3   r4   r5   r(   r6   r$   r7   r8   r9   r:   r;   r+   r   r<   r=   r>   Úfuse_modelsr   r?   r@   rA   rB   rC   rD   r%   r    rE   rF   )r   r'   rG   r    rH   rF   rE   r=   rW   r>   rI   Újr   r   r   r   |   sD   
ûú
z$BasicBlockERes2Net_diff_AFF.__init__c                 C   sà   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]7}|dkr)|| }n| j|d  ||| ƒ}| j	| |ƒ}|  | j
| |ƒ¡}|dkrM|}qt ||fd¡}q|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S rJ   )r6   r8   r?   rK   rL   rE   r:   r9   rW   r=   r>   rM   rA   rB   rD   rN   r   r   r   rR   ¢   s(   







z#BasicBlockERes2Net_diff_AFF.forwardrS   rT   r   r   r   r   rU   y   s    &rU   c                       sB   e Zd Zeeg d¢dddddf‡ fdd„	Zd	d
„ Zdd„ Z‡  ZS )ÚERes2Net_aug)r*   r.   é   r*   é@   éP   éÀ   ZTSTPFc	           	         sì  t t| ƒ ¡  || _|| _|| _t|d ƒ| d | _|| _t	j
d|ddddd| _t	 |¡| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _t	j
|d	 |d ddddd
| _t	j
|d |d ddddd
| _t	j
|d |d ddddd
| _t|d d| _t|d d| _t|d d| _|dks¶|dkr¸dnd| _tt|ƒ| j|j d| _t	 | j|j | j |¡| _ | jrêt	j!|dd| _"t	 ||¡| _#d S t	 $¡ | _"t	 $¡ | _#d S )Né   r   r*   Fr   r   )r    é   r.   )r   r!   r    r"   é   é    rV   ÚTAPZTSDP)Zin_dim)Zaffine)%r   rY   r   r'   Úfeat_dimÚembedding_sizer3   Z	stats_dimÚtwo_emb_layerr$   r%   r6   r7   r8   Ú_make_layerÚlayer1Úlayer2Úlayer3Úlayer4Úlayer1_downsampleÚlayer2_downsampleÚlayer3_downsampler   Úfuse_mode12Úfuse_mode123Úfuse_mode1234Zn_statsÚgetattrÚpooling_layersr@   ÚpoolZLinearÚseg_1ZBatchNorm1dÚseg_bn_1Úseg_2ZIdentity)	r   ÚblockZ
block_fuseÚ
num_blocksZ
m_channelsrc   rd   Zpooling_funcre   r   r   r   r   Â   sz   	ÿÿÿÿÿúúú
ÿÿ
zERes2Net_aug.__init__c                 C   sL   |gdg|d   }g }|D ]}|  || j||ƒ¡ ||j | _qtj|Ž S )Nr   )r;   r'   r@   r$   rC   )r   rw   rG   rx   r    ÚstridesZlayersr   r   r   rf     s   
zERes2Net_aug._make_layerc                 C   sØ   |  ddd¡}| d¡}t |  |  |¡¡¡}|  |¡}|  |¡}|  |¡}|  	||¡}|  
|¡}|  |¡}|  ||¡}	|  |¡}
|  |	¡}|  |
|¡}|  |¡}|  |¡}| jrjt |¡}|  |¡}|  |¡}|S |S )Nr   r_   r   )ZpermuteZ
unsqueeze_ÚFr?   r8   r6   rg   rh   rk   rn   ri   rl   ro   rj   rm   rp   rs   rt   re   ru   rv   )r   rO   rP   Zout1Zout2Zout1_downsampleZ
fuse_out12Zout3Zfuse_out12_downsampleZfuse_out123Zout4Zfuse_out123_downsampleZfuse_out1234ÚstatsZembed_aZembed_br   r   r   rR     s*   












zERes2Net_aug.forward)	r   r   r   r-   rU   r   rf   rR   r   r   r   r   r   rY   À   s    øBrY   )Úmodule_namec                       sH   e Zd ZdZdeeef f‡ fdd„Zdd„ Zdd„ Z	dd
d„Z
‡  ZS )ÚSpeakerVerificationERes2Neta)  Enhanced Res2Net_aug architecture with local and global feature fusion.
    ERes2Net_aug is an upgraded version of ERes2Net that uses a larger number of
    parameters to achieve better recognition performance.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    Úmodel_configc                    st   t ƒ j||g|¢R i |¤Ž || _|| _d| _t| jd ƒ| _tƒ | _|d }|  	|¡ | j 
| j¡ | j ¡  d S )Nr\   ÚdeviceZpretrained_model)r   r   r~   Zother_configÚfeature_dimr
   r   rY   Úembedding_modelÚ._SpeakerVerificationERes2Net__load_check_pointÚtoÚeval)r   Ú	model_dirr~   ÚargsÚkwargsÚpretrained_model_namer   r   r   r   2  s   
z$SpeakerVerificationERes2Net.__init__c                 C   sl   t |tjƒrt |¡}t|jƒdkr| d¡}t|jƒdks"J dƒ‚|  |¡}|  	| 
| j¡¡}| ¡  ¡ S )Nr   r   r_   zFmodelscope error: the shape of input audio to model needs to be [N, T])Ú
isinstanceÚnpZndarrayrK   Z
from_numpyÚlenÚshapeÚ	unsqueezeÚ-_SpeakerVerificationERes2Net__extract_featurer   rƒ   r   ÚdetachÚcpu)r   ÚaudioÚfeatureZ	embeddingr   r   r   rR   A  s   

ÿþþ
z#SpeakerVerificationERes2Net.forwardc                 C   s0   t j|| jd}||jddd }| d¡}|S )N)Znum_mel_binsr   T)ÚdimZkeepdim)ÚKaldiZfbankr€   Úmeanr   )r   r‘   r’   r   r   r   Z__extract_featureO  s   
z-SpeakerVerificationERes2Net.__extract_featureNc                 C   s8   |st  d¡}| jjt jtj | j|¡|ddd d S )Nr   )Zmap_locationT)Ústrict)	rK   r   r   Zload_state_dictÚloadÚosÚpathÚjoinr…   )r   rˆ   r   r   r   r   Z__load_check_pointU  s   
þ
üz.SpeakerVerificationERes2Net.__load_check_point)N)r   r   r   Ú__doc__r   Ústrr   r   rR   rŽ   r‚   r   r   r   r   r   r}   '  s    r}   )r   )/r›   r4   r˜   Útypingr   r   r   ÚnumpyrŠ   rK   Ztorch.nnr$   Ztorch.nn.functionalZ
functionalrz   Ztorchaudio.compliance.kaldiZ
complianceZkaldir”   Z)modelscope.models.audio.sv.pooling_layersÚmodelsr‘   Úsvrr   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Z!modelscope.models.audio.sv.fusionr   Zmodelscope.utils.constantr	   Zmodelscope.utils.devicer
   ZHardtanhr   r(   r+   ÚModuler-   rU   rY   Zregister_moduleZspeaker_verificationZeres2net_aug_svr}   r   r   r   r   Ú<module>   s2   

?Ggÿ