o
    *jn                     @   sj   d dl Z d dlmZ e ZdZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd ZdS )    N)
get_loggeruT   [’!"#$%&'()*+,-./:;<>=?@，。?★、…【】《》？“”‘’！[\]^_`{|}~]+c                 C   sr   g }|   }t|dkr7td|}|d ur|d}n|dd }|| ||ddd}t|dks|S )Nr   z[A-Za-z!?,<>()\']+     )lowerlenrematchgroupappendreplacestrip)	input_strtokenssr	   word r   o/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/trainers/audio/kws_utils/file_utils.pysplit_mixed_label   s   
r   c                 C   s$   t | }ddd |D }| S )Nr   c                 s   s    | ]}| d V  qdS )r   Nr   ).0subr   r   r   	<genexpr>(   s    z$space_mixed_label.<locals>.<genexpr>)r   joinr   )r   ZsplitsZ	space_strr   r   r   space_mixed_label&   s   r   c                 C   s\   g }t | ddd}|D ]}| dkr||  qW d    |S 1 s'w   Y  |S )Nrutf8encodingr   )openr   r   )Z	list_filelistsfinliner   r   r   
read_lists,   s   
r"   c              	   C   s   i }|D ]+}|  dd }t|dk r!td|   q||d d  ||d < qg }| D ]6}|  dd }t|dkr`|d |v r`|t|d ||d  |d dd	 q4td
|d  q4|S )N	r      zinvalid line in trans file: {}r   r   r   i>  )keytxtZwavZsample_ratez*can't find corresponding trans for key: {})	r   r   splitr   loggerdebugformatr   dict)Z	wav_listsZtrans_listsZtrans_tabler!   arrr   r   r   r   	make_pair5   s0   
r-   c                 C   s|   i }t | ddd'}|D ]}|  }t|dksJ t|d d ||d < qW d    n1 s3w   Y  |  |S )Nr   r   r   r$   r   r   )r   r   r'   r   intclose)Z
token_fileZtokens_tabler    r!   r,   r   r   r   
read_tokenQ   s   r0   c                 C   s   i }t | ddd)}|D ]}| dd }t|dks J |dd  ||d < qW d    n1 s5w   Y  |  |S )	Nr   r   r   r#   r   r$   r   r   )r   r   r   r'   r   r/   )Zlexicon_filelexicon_tabler    r!   r,   r   r   r   read_lexicon\   s   r2   c                 C   s  t  }t  }t| }|D ]a}|dks|dks|dkr|d }q|dks'|dkr,|d }q|dks<|d	ks<|d
ks<|dkrA|d }q||v rK||f }q||v r\|| D ]}||f }qSqttd|}|D ]}||f }qeq|D ]c}||v r~||| f }qp|dkrd|v r||d f }qp||d f }qp|dkrd|v r||d f }qp||d f }qpd|v r||d f }td| d qp||d f }td| d qp||fS )N!sil(sil)<sil>)r3   <blk><blank>)r6   (noise)noise)(noise<noise>)<GBG>r   silr<   ')' is not in token set, replace with <GBG>)' is not in token set, replace with <blk>)tupler   r   r   
symbol_strr(   infor&   symbol_tabler1   Z
tokens_strZ
tokens_idxpartspartchr   r   r   query_token_setg   sR   

 


rI   c                 C   s  g }g }t | }|D ]d}|dks|dks|dkr|d q
|dks&|dkr,|d q
|dks<|dks<|dks<|d	krB|d
 q
||v rL|| q
||v r]|| D ]}|| qTq
ttd|}|D ]}|| qfq
|D ]c}||v r|||  qq|dkrd|v r||d  qq||d  qq|d
krd
|v r||d
  qq||d  qqd
|v r||d
  td| d qq||d  td| d qq||fS )Nr3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r   r=   r>   r?   r@   )r   r   r   r   rB   r(   rC   rD   r   r   r   query_token_list   sR    

rJ   c                 C   sF   | D ]}d|v s
J |d   }t|||\}}||d< ||d< q| S )Nr&   r   )r   rJ   )Z	data_listrE   r1   sampler&   strsZindexsr   r   r   tokenize   s   
rM   )r   Zmodelscope.utils.loggerr   r(   rB   r   r   r"   r-   r0   r2   rI   rJ   rM   r   r   r   r   <module>   s   	11