o
    *j2                     @   s   d dl Z d dlZd dlmZmZmZmZ d dlZd dl	Z
d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ e ZdgZej ej!ej"dG dd deZ#dS )    N)AnyDictListUnion)File)	Pipelines)
OutputKeys)pipeline)
InputModelPipeline)	PIPELINES)Tasks)
get_loggerSegmentationClusteringPipeline)module_namec                
       s  e Zd ZdZdef fddZdeeej	e
f deeef fddZd	e
dej	fd
dZdej	dej	fddZde
de
dej	dej	de
f
ddZdeeej	e
f de
fddZde
fddZde
de
fddZdededeej	e
f dej	fddZdd Zd d! Zd%d#d$Z  ZS )&r   aX  Segmentation and Clustering Pipeline
    use `model` to create a Segmentation and Clustering Pipeline.

    Args:
        model (SegmentationClusteringPipeline): A model instance, or a model local dir, or a model id in the model hub.
        kwargs (dict, `optional`):
            Extra kwargs passed into the pipeline's constructor.
    Example:
    >>> from modelscope.pipelines import pipeline
    >>> from modelscope.utils.constant import Tasks
    >>> p = pipeline(
    >>>    task=Tasks.speaker_diarization, model='damo/speech_campplus_speaker-diarization_common')
    >>> print(p(audio))

    modelc                    sZ   t  jd
d|i| | jj| _ddd}| j| | jd | _td| jd d| _d	S )zuse `model` to create a speaker diarization pipeline for prediction
        Args:
            model (str): a valid official model id
        r         ?g      ?)seg_dur	seg_shiftZsample_ratezspeaker-verificationZspeaker_modeltaskr   N )	super__init__r   Zother_configconfigupdatefsr	   sv_pipeline)selfr   kwargsr   	__class__r   |/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/pipelines/audio/segmentation_clustering_pipeline.pyr   -   s   

z'SegmentationClusteringPipeline.__init__audioreturnc                 K   s   | j | td | |}| | td | |}td | |}td | |}td | 	||||}t
j|iS )a   extract the speaker embeddings of input audio and do cluster
        Args:
            audio (str, np.ndarray, list): If it is represented as a str or a np.ndarray, it
            should be a complete speech signal and requires VAD preprocessing. If the audio
            is represented as a list, it should contain only the effective speech segments
            obtained through VAD preprocessing. The list should be formatted as [[0(s),3.2,
            np.ndarray], [5.3,9.1, np.ndarray], ...]. Each element is a sublist that contains
            the start time, end time, and the numpy array of the speech segment respectively.
        zDoing VAD...zDoing segmentation...zExtracting embeddings...zClustering...zPost processing...)r   r   loggerinfo
preprocesscheck_audio_listchunkforward
clusteringpostprocessr   ZTEXT)r   r#   paramsvad_segmentssegments
embeddingslabelsoutputr   r   r"   __call__=   s   










z'SegmentationClusteringPipeline.__call__inputc                 C   sL   g }|D ]}| j |d gdd}|d jdkr||d  qt|}|S )N   T)Z
output_embZembs)      )r   shapeappendnpconcatenate)r   r4   r0   s	save_dictr   r   r"   r*   \   s   
z&SegmentationClusteringPipeline.forwardr0   c                 C   s   | j |fi | j}|S )N)r   r   )r   r0   r1   r   r   r"   r+   e   s   z)SegmentationClusteringPipeline.clusteringr/   r.   r1   c                 C   s  t |t |ks
J | |}g }tt |D ]}||| d || d || g q| |}g }t| d D ]}|||k d}|| q;t|}dd }	tdt |D ]}|	||d  d || d r|| d ||d  d  d }
d| j	v rt
| dsttj| j	d d| _t|
d	 ||d  d }t|
d	 || d }|| dkr| |||}||d  d }|| d }| j||| || gd
d\}}|d ur|| }
|
|| d< |
||d  d< q\| |}|S )Nr   r6   c                 S   s   | |d krdS dS )Ng-C6?TFr   )t1t2r   r   r"   is_overlappedz   s   zASegmentationClusteringPipeline.postprocess.<locals>.is_overlappedr5   Zchange_locatorchange_locator_pipeliner   r   T)Z
output_res)lencorrect_labelsranger9   merge_sequemaxmeanr:   stackr   hasattrr	   r   speaker_diarizationrA   min	cut_audiosmooth)r   r/   r.   r1   r0   distribute_resiZspk_embsZspk_embr@   pZshort_utt_stZshort_utt_edZ
audio_dataZspk1Zspk2_ctr   r   r"   r,   i   sP   
&

 



z*SegmentationClusteringPipeline.postprocessc           
   
   C   s  t |tr|jdd d |S t |trat|}tjt|dd\}}t	|j
dkr4|d d df }|| jkratd| j d	 tjjt|d|d
t| jggd\}}|d }t	|j
dkslJ d|jdv ry|d d}n|d}t| dsttj| jd dd| _| j|| jddd d }g }t |trt|}nt |tr|}nt dt!| |D ](}t"|d d }t"|d d }	|#||	|t"|| j t"|	| j  g q|S )Nc                 S   s   | d S )Nr   r   )xr   r   r"   <lambda>   s    z;SegmentationClusteringPipeline.preprocess.<locals>.<lambda>)keyZfloat32)dtyper5   r   z+[WARNING]: The sample rate of audio is not z, resample it.Zrate)Zeffectsr6   %modelscope error: Wrong audio format.)Zint16Zint32Zint64i   vad_pipelineZ	vad_modelzv2.0.2)r   r   Zmodel_revisionT)r   Zis_finalvaluezIncorrect vad result. Get %si  )$
isinstancelistsortstrr   readsfioBytesIOrB   r8   r   r%   r&   
torchaudioZsox_effectsZapply_effects_tensortorchZ
from_numpyZ	unsqueezeZsqueezenumpyrV   ZastyperI   r	   r   Zvoice_activity_detectionr   rX   astliteral_eval
ValueErrortypeintr9   )
r   r#   Z
file_bytesr   Zvad_timer.   Zvad_time_listtstZedr   r   r"   r'      s`   









$z)SegmentationClusteringPipeline.preprocessc                 C   s   d}t t|D ]Y}|| }|d |d ksJ dt|d tjs&J dt|d | j t|d | j  |d jd ksCJ d|dkrW|d ||d  d ksWJ d||d |d  7 }q|dksjJ dd S )	Nr   r6   z$modelscope error: Wrong time stamps.r5   z"modelscope error: Wrong data type.zFmodelscope error: audio data in list is inconsistent with time length.   z<modelscope error: The effective audio duration is too short.)rD   rB   rZ   r:   ndarrayri   r   r8   )r   r#   Z	audio_durrO   segr   r   r"   r(      s8   z/SegmentationClusteringPipeline.check_audio_listc                    s4    fdd}g }t |D ]\}}||| q|S )Nc           
         s   | d }| d }t  jd  j }t  jd  j }d}g }td|jd |D ]I}t|| |jd }||kr< |S |}td|| }||| }	|	jd |k r`t|	d||	jd  fd}	|	| j | | j | |	g q)|S )Nr   r5   r   r   Zconstant)
ri   r   r   rD   r8   rK   rF   r:   padr9   )
Zseg_dataZseg_stdataZ	chunk_lenZchunk_shiftZlast_chunk_edZseg_resZchunk_stZchunk_edZ
chunk_datar   r   r"   	seg_chunk   s0   z7SegmentationClusteringPipeline.chunk.<locals>.seg_chunk)	enumerateextend)r   r.   rr   ZsegsrO   r<   r   rq   r"   r)      s
   z$SegmentationClusteringPipeline.chunkcut_stcut_edc              	   C   s`  t |tjr|t|| j t|| j  S t |trtt|D ]L}|dkr2||| d k r1|}n|||d  d krF||| d k rF|}|t|d krY||| d krX|}q!||| d krm|||d  d krm|}q!|||d  }g }tt|D ]&}|| \}	}
}||tt	||	|	 | j tt
||
|	 | j   q~t|}|S td)Nr   r6   rW   )rZ   r:   rm   ri   r   r[   rD   rB   r9   rF   rK   r;   rg   )r   ru   rv   r#   rO   Zst_iZed_iZ
audio_segsZcut_dataZs_stZs_edrp   r   r   r"   rL      sB    
$$
z(SegmentationClusteringPipeline.cut_audioc                 C   sF   d}i }g }|D ]}||vr|||< |d7 }| ||  qt|S )Nr   r6   )r9   r:   array)r   r1   Z	labels_idZid2idZ
new_labelsrO   r   r   r"   rC     s   
z-SegmentationClusteringPipeline.correct_labelsc                 C   sv   |d g}t dt|D ],}|| d |d d ks&|| d |d d kr.|||  q|| d |d d< q|S )Nr   r6   r5   )rD   rB   r9   )r   rN   resrO   r   r   r"   rE   &  s   

z*SegmentationClusteringPipeline.merge_sequer6   c                 C   s&  t t|D ]}t|| d d|| d< t|| d d|| d< || d || d  |k r|dkrA||d  d || d< q|t|d krV||d  d || d< q|| d ||d  d  ||d  d || d  kr||d  d || d< q||d  d || d< q| |}|S )Nr   r5   r6   )rD   rB   roundrE   )r   ry   ZmindurrO   r   r   r"   rM   0  s   8
z%SegmentationClusteringPipeline.smooth)r6   )__name__
__module____qualname____doc__r
   r   r   r]   r:   rm   r[   r   r   r3   r*   r+   r,   r'   r(   r)   floatrL   rC   rE   rM   __classcell__r   r   r    r"   r      s4    

	

3-
 
!
)$re   r`   typingr   r   r   r   rd   r:   Z	soundfiler_   rc   rb   Zmodelscope.fileior   Zmodelscope.metainfor   Zmodelscope.outputsr   Zmodelscope.pipelinesr	   Zmodelscope.pipelines.baser
   r   Zmodelscope.pipelines.builderr   Zmodelscope.utils.constantr   Zmodelscope.utils.loggerr   r%   __all__Zregister_modulerJ   Zsegmentation_clusteringr   r   r   r   r"   <module>   s*   