o
    )j&                     @   s   d dl Z d dlmZmZ d dlZd dlZd dlmZ d dl	m
  mZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ G d	d
 d
ejZejejejdG dd deZdS )    N)AnyDict)Models)MODELS
TorchModel)CAMPPlus)
DenseLayer)Tasks)create_devicec                       s,   e Zd Z			d fdd	Zdd Z  ZS )	LinearClassifierr        c                    s`   t    t | _tjdd| _t|D ]}| jt	||dd |}qtj
||dd| _d S )NT)Zinplace)Zbias)super__init__nnZ
ModuleListblocksZReLU	nonlinearrangeappendr   ZLinearlinear)self	input_dimZ
num_blocksZ	inter_dimout_neurons_	__class__ u/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/audio/sv/lanuage_recognition_model.pyr      s   

zLinearClassifier.__init__c                 C   s,   |  |}| jD ]}||}q| |}|S )N)r   r   r   )r   xlayerr   r   r   forward'   s
   



zLinearClassifier.forward)r   r   r   )__name__
__module____qualname__r   r    __classcell__r   r   r   r   r      s    r   )module_namec                       sF   e Zd ZdZdeeef f fddZdd Zdd Z	d	d
 Z
  ZS )LanguageRecognitionCAMPPluszA speech language recognition model using the CAM++ architecture as the backbone.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    s   t  j||g|R i | || _| jd | _| jd | _| jd | _t|d | _t| j| j| _	t
| jt| jd d| _|d }|d }| || | j	| j | j| j | j	  | j  d S )	Nemb_sizeZ	fbank_dimsample_ratedevice	languages)r   r   pretrained_encoderpretrained_backend)r   r   r'   r(   feature_dimr)   r
   r*   r   encoderr   lenbackend_load_check_pointtoeval)r   	model_dirr'   argskwargsr,   r-   r   r   r   r   9   s$   
z$LanguageRecognitionCAMPPlus.__init__c                 C   s   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}| | }| d}||fS )N   r      zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpZndarraytorchZ
from_numpyr0   shape	unsqueeze_extract_featurer/   r3   r*   r1   detachcpuZargmax)r   audiofeatureZembsZscoresoutputr   r   r   r    R   s   


z#LanguageRecognitionCAMPPlus.forwardc                 C   sX   g }|D ] }t j|d| j| jd}||jddd }||d qt|}|S )Nr   )Znum_mel_binsZsample_frequencyT)dimZkeepdim)	KaldiZfbankr?   r.   r)   meanr   r=   cat)r   rC   featuresaurD   r   r   r   r@   `   s   
z,LanguageRecognitionCAMPPlus._extract_featurec                 C   sT   | j tjtj| j|tdd | j	tjtj| j|tdd d S )NrB   )Zmap_location)
r/   Zload_state_dictr=   loadospathjoinr5   r*   r1   )r   r,   r-   r   r   r   r2   l   s   z-LanguageRecognitionCAMPPlus._load_check_point)r!   r"   r#   __doc__r   strr   r   r    r@   r2   r$   r   r   r   r   r&   0   s    r&   ) rM   typingr   r   numpyr<   r=   Ztorch.nnr   Ztorchaudio.compliance.kaldiZ
complianceZkaldirG   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Z modelscope.models.audio.sv.DTDNNr   Z'modelscope.models.audio.sv.DTDNN_layersr   Zmodelscope.utils.constantr	   Zmodelscope.utils.devicer
   Moduler   Zregister_moduleZspeech_language_recognitionZcampplus_lrer&   r   r   r   r   <module>   s"   