o
    )j                     @   s   d dl Z d dlmZmZ d dlZd dlZd dlmZ d dl	m
  mZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ G d
d dejZejejej dG dd deZ!dS )    N)AnyDict)Models)MODELS
TorchModel)CAMPPlus)
DenseLayer)ERes2Net)Tasks)create_devicec                       s,   e Zd Z			d fdd	Zdd Z  ZS )	LinearClassifierr        c                    s`   t    t | _tjdd| _t|D ]}| jt	||dd |}qtj
||dd| _d S )NT)Zinplace)Zbias)super__init__nnZ
ModuleListblocksZReLU	nonlinearrangeappendr   ZLinearlinear)self	input_dimZ
num_blocksZ	inter_dimout_neurons_	__class__ x/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/audio/sv/lanuage_recognition_eres2net.pyr      s   

zLinearClassifier.__init__c                 C   s,   |  |}| jD ]}||}q| |}|S )N)r   r   r   )r   xlayerr   r   r   forward(   s
   



zLinearClassifier.forward)r   r   r   )__name__
__module____qualname__r   r!   __classcell__r   r   r   r   r      s    r   )module_namec                       sF   e Zd ZdZdeeef f fddZdd Zdd Z	d	d
 Z
  ZS )LanguageRecognitionERes2NetzA speech language recognition model using the ERes2Net architecture as the backbone.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    s   t  j||g|R i | || _| jd | _| jd | _| jd | _| jd | _t|d | _t	| j| jd| _
t| jt| jd d| _|d	 }|d
 }| || | j
| j | j| j | j
  | j  d S )N	embed_dimZchannelsZ	fbank_dimsample_ratedevice)r)   
m_channels	languages)r   r   pretrained_encoderpretrained_backend)r   r   r(   r)   r,   feature_dimr*   r   r+   r	   encoderr   lenbackend_load_check_pointtoeval)r   	model_dirr(   argskwargsr.   r/   r   r   r   r   :   s*   
z$LanguageRecognitionERes2Net.__init__c                 C   s   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}| | }| d}||fS )N   r      zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpZndarraytorchZ
from_numpyr2   shape	unsqueeze_extract_featurer1   r5   r+   r3   detachcpuZargmax)r   audiofeatureZembsZscoresoutputr   r   r   r!   U   s   


z#LanguageRecognitionERes2Net.forwardc                 C   sX   g }|D ] }t j|d| j| jd}||jddd }||d qt|}|S )Nr   )Znum_mel_binsZsample_frequencyT)dimZkeepdim)	KaldiZfbankrA   r0   r*   meanr   r?   cat)r   rE   featuresaurF   r   r   r   rB   c   s   
z,LanguageRecognitionERes2Net._extract_featurec                 C   sT   | j tjtj| j|tdd | j	tjtj| j|tdd d S )NrD   )Zmap_location)
r1   Zload_state_dictr?   loadospathjoinr7   r+   r3   )r   r.   r/   r   r   r   r4   o   s   z-LanguageRecognitionERes2Net._load_check_point)r"   r#   r$   __doc__r   strr   r   r!   rB   r4   r%   r   r   r   r   r'   1   s    r'   )"rO   typingr   r   numpyr>   r?   Ztorch.nnr   Ztorchaudio.compliance.kaldiZ
complianceZkaldirI   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Z modelscope.models.audio.sv.DTDNNr   Z'modelscope.models.audio.sv.DTDNN_layersr   Z#modelscope.models.audio.sv.ERes2Netr	   Zmodelscope.utils.constantr
   Zmodelscope.utils.devicer   Moduler   Zregister_moduleZspeech_language_recognitionZeres2net_lrer'   r   r   r   r   <module>   s$   