o
    )Îj¥.  ã                   @   s4  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z" G d	d
„ d
ej#ƒZ$G dd„ dej%ƒZ&G dd„ dej%ƒZ'G dd„ dej%ƒZ(ej)e j*ej+dG dd„ deƒƒZ,dS )a¨  
    To further improve the short-duration feature extraction capability of ERes2Net,
    we expand the channel dimension within each stage. However, this modification also
    increases the number of model parameters and computational complexity.
    To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures,
    ultimately reducing both the model parameters and its computational cost.
é    N)ÚAnyÚDictÚUnion)ÚModels)ÚMODELSÚ
TorchModel)ÚAFF)ÚTasks)Úcreate_devicec                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚReLUFc                    s   t t| ƒ dd|¡ d S )Nr   é   )Úsuperr   Ú__init__)ÚselfÚinplace©Ú	__class__© úf/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/audio/sv/ERes2NetV2.pyr      s   zReLU.__init__c                 C   s"   | j rdnd}| jjd | d S )Nr   Ú z (ú))r   r   Ú__name__)r   Zinplace_strr   r   r   Ú__repr__!   s   
ÿÿzReLU.__repr__)F)r   Ú
__module__Ú__qualname__r   r   Ú__classcell__r   r   r   r   r      s    r   c                       ó.   e Zd Z				d‡ fdd„	Zdd„ Z‡  ZS )	ÚBasicBlockERes2NetV2é   é   é   c              
      sN  t t| ƒ ¡  tt ||d  ¡ƒ}|| _tj||| d|dd| _	t 
|| ¡| _|| _|| _g }g }	t| jƒD ]}
| tj||dddd¡ |	 t 
|¡¡ q9t |¡| _t |	¡| _tdd| _tj|| || j ddd	| _t 
|| j ¡| _t ¡ | _|dks‹|| j| kr¥t tj|| j| d|ddt 
| j| ¡¡| _d S d S )
Nç      P@r   F©Úkernel_sizeÚstrideÚbiasé   ©r#   Úpaddingr%   T©r   ©r#   r%   )r   r   r   ÚintÚmathÚfloorÚwidthÚnnÚConv2dÚconv1ÚBatchNorm2dÚbn1ÚnumsÚ	expansionÚrangeÚappendÚ
ModuleListÚconvsÚbnsr   ÚreluÚconv3Úbn3Ú
SequentialÚshortcut)r   Ú	in_planesÚplanesr$   Ú	baseWidthÚscaler5   r.   r9   r:   Úir   r   r   r   )   sH   ÿÿÿ
û
úÿzBasicBlockERes2NetV2.__init__c                 C   sÔ   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]1}|dkr)|| }n|||  }| j| |ƒ}|  | j	| |ƒ¡}|dkrG|}qt 
||fd¡}q|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S ©Nr   r   )r1   r3   r;   ÚtorchÚsplitr.   r6   r4   r9   r:   Úcatr<   r=   r?   ©r   ÚxZresidualÚoutZspxrD   Úspr   r   r   ÚforwardP   s(   







zBasicBlockERes2NetV2.forward©r   r   r    r    ©r   r   r   r   rM   r   r   r   r   r   r   '   s    ú'r   c                       r   )	ÚBasicBlockERes2NetV2AFFr   r   r    c              
      s„  t t| ƒ ¡  tt ||d  ¡ƒ}|| _tj||| d|dd| _	t 
|| ¡| _|| _|| _g }g }	g }
t| jƒD ]}| tj||dddd¡ |
 t 
|¡¡ q;t| jd ƒD ]}|	 t|dd¡ qZt |¡| _t |
¡| _t |	¡| _td	d
| _tj|| || j ddd| _t 
|| j ¡| _t ¡ | _|dks¦|| j| krÀt tj|| j| d|ddt 
| j| ¡¡| _d S d S )Nr!   r   Fr"   r&   r'   é   ©ZchannelsÚrTr)   r*   )r   rP   r   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r   r8   r9   r:   Úfuse_modelsr   r;   r<   r=   r>   r?   )r   r@   rA   r$   rB   rC   r5   r.   r9   rT   r:   rD   Újr   r   r   r   o   sP   ÿÿÿ
û
úÿz BasicBlockERes2NetV2AFF.__init__c                 C   sà   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]7}|dkr)|| }n| j|d  ||| ƒ}| j	| |ƒ}|  | j
| |ƒ¡}|dkrM|}qt ||fd¡}q|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S rE   )r1   r3   r;   rF   rG   r.   r6   r4   rT   r9   r:   rH   r<   r=   r?   rI   r   r   r   rM   ›   s(   







zBasicBlockERes2NetV2AFF.forwardrN   rO   r   r   r   r   rP   m   s    ú,rP   c                       sH   e Zd Zeeg d¢ddddddddf‡ fd	d
„	Zdd„ Zdd„ Z‡  ZS )Ú
ERes2NetV2)r&   rQ   é   r&   é@   éP   éÀ   r   r    ZTSTPFc                    s²  t t| ƒ ¡  || _|| _|| _t|d ƒ| d | _|| _|| _	|| _
|	| _tjd|ddddd| _t |¡| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _tj|d	 | j |d | j ddddd
| _t|d | j d	d| _|
dks™|
dkr›dnd| _tt|
ƒ| j| j d| _t | j| j | j |¡| _| jrÍtj|dd| _ t ||¡| _!d S t "¡ | _ t "¡ | _!d S )Né   r   r&   F)r#   r$   r(   r%   r   )r$   r    rQ   )r#   r(   r$   r%   rR   ÚTAPZTSDP)Zin_dim)Zaffine)#r   rV   r   r@   Úfeat_dimÚ	embed_dimr+   Z	stats_dimÚtwo_emb_layerrB   rC   r5   r/   r0   r1   r2   r3   Ú_make_layerÚlayer1Úlayer2Úlayer3Úlayer4Ú	layer3_dsr   Úfuse34Zn_statsÚgetattrÚpooling_layersÚpoolZLinearÚseg_1ZBatchNorm1dÚseg_bn_1Úseg_2ZIdentity)r   ÚblockZ
block_fuseÚ
num_blocksZ
m_channelsr]   r^   rB   rC   r5   Zpooling_funcr_   r   r   r   r   »   s\   ÿÿÿÿÿú	
ÿÿ
zERes2NetV2.__init__c                 C   sZ   |gdg|d   }g }|D ]}|  || j||| j| j| jd¡ || j | _qtj|Ž S )Nr   )rB   rC   r5   )r7   r@   rB   rC   r5   r/   r>   )r   rm   rA   rn   r$   ÚstridesZlayersr   r   r   r`   õ   s   úÿ
zERes2NetV2._make_layerc                 C   s¬   |  ddd¡}| d¡}t |  |  |¡¡¡}|  |¡}|  |¡}|  |¡}|  	|¡}|  
|¡}|  ||¡}|  |¡}	|  |	¡}
| jrTt |
¡}|  |¡}|  |¡}|S |
S )Nr   r    r   )ZpermuteZ
unsqueeze_ÚFr;   r3   r1   ra   rb   rc   rd   re   rf   ri   rj   r_   rk   rl   )r   rJ   rK   Zout1Zout2Zout3Zout4Zout3_dsZ
fuse_out34ÚstatsZembed_aZembed_br   r   r   rM     s"   










zERes2NetV2.forward)	r   r   r   r   rP   r   r`   rM   r   r   r   r   r   rV   ¹   s    õ:rV   )Úmodule_namec                       sH   e Zd ZdZdeeef f‡ fdd„Zdd„ Zdd„ Z	dd
d„Z
‡  ZS )ÚSpeakerVerificationERes2NetV2aà  ERes2NetV2 architecture with local and global feature fusion. ERes2NetV2 is mainly composed
    of Bottom-up Dual-stage Feature Fusion (BDFF) and Bottleneck-like Local Feature Fusion (BLFF).
    BDFF fuses multi-scale feature maps in bottom-up pathway to obtain global information.
    The BLFF extracts localization-preserved speaker features and strengthen the local information interaction.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    Úmodel_configc                    s¶   t ƒ j||g|¢R i |¤Ž || _| jd | _| jd | _| jd | _| jd | _|| _d| _t	| jd ƒ| _
t| j| j| j| jd| _|d }|  |¡ | j | j
¡ | j ¡  d S )	Nr^   rB   rC   r5   rY   Údevice)r^   rB   rC   r5   Zpretrained_model)r   r   rt   r^   rB   rC   r5   Zother_configÚfeature_dimr
   ru   rV   Úembedding_modelÚ0_SpeakerVerificationERes2NetV2__load_check_pointÚtoÚeval)r   Ú	model_dirrt   ÚargsÚkwargsÚpretrained_model_namer   r   r   r   &  s&   ü
z&SpeakerVerificationERes2NetV2.__init__c                 C   sl   t |tjƒrt |¡}t|jƒdkr| d¡}t|jƒdks"J dƒ‚|  |¡}|  	| 
| j¡¡}| ¡  ¡ S )Nr   r   r    zFmodelscope error: the shape of input audio to model needs to be [N, T])Ú
isinstanceÚnpZndarrayrF   Z
from_numpyÚlenÚshapeÚ	unsqueezeÚ/_SpeakerVerificationERes2NetV2__extract_featurerw   ry   ru   ÚdetachÚcpu)r   ÚaudioÚfeatureZ	embeddingr   r   r   rM   >  s   

ÿþþ
z%SpeakerVerificationERes2NetV2.forwardc                 C   s0   t j|| jd}||jddd }| d¡}|S )N)Znum_mel_binsr   T)ÚdimZkeepdim)ÚKaldiZfbankrv   Úmeanrƒ   )r   r‡   rˆ   r   r   r   Z__extract_featureL  s   
z/SpeakerVerificationERes2NetV2.__extract_featureNc                 C   s8   |st  d¡}| jjt jtj | j|¡|ddd d S )Nr†   )Zmap_locationT)Ústrict)	rF   ru   rw   Zload_state_dictÚloadÚosÚpathÚjoinr{   )r   r~   ru   r   r   r   Z__load_check_pointR  s   
þ
üz0SpeakerVerificationERes2NetV2.__load_check_point)N)r   r   r   Ú__doc__r   Ústrr   r   rM   r„   rx   r   r   r   r   r   rs     s    	rs   )-r‘   r,   rŽ   Útypingr   r   r   Únumpyr€   rF   Ztorch.nnr/   Ztorch.nn.functionalZ
functionalrp   Ztorchaudio.compliance.kaldiZ
complianceZkaldirŠ   Z)modelscope.models.audio.sv.pooling_layersÚmodelsr‡   Úsvrh   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Z!modelscope.models.audio.sv.fusionr   Zmodelscope.utils.constantr	   Zmodelscope.utils.devicer
   ZHardtanhr   ÚModuler   rP   rV   Zregister_moduleZspeaker_verificationZeres2netv2_svrs   r   r   r   r   Ú<module>   s.   FLaÿ