o
    *j                     @   s   d dl mZ G dd dZdS )    )	Tokenizerc                   @   sZ   e Zd ZdZdd Zedd Zedd Zedd	 ZdddZ	dd Z
edd ZdS )JiebaBPETokenizerz2SentencePiece BPE tokenizer with Jieba integrationc                 C   sf   d| _ t|| _| jd| _zdd l}W n ty!   tdw || _| jd | _	| jd | _
d S )NzJieba BPE Tokenizerz<|endoftext|>r   zhYou need to install rjieba to use JiebaTokenizer. See https://pypi.org/project/rjieba/ for installation.
z<sep>)namer   	from_file	tokenizerZtoken_to_ideod_idjiebaImportErrorvocabnew_lineZ	sep_token)selfZtokenizer_json_filer	    r   h/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/gpt_moe/tokenizer.py__init__   s   zJiebaBPETokenizer.__init__c                 C      | j jddS NT)Zwith_added_tokens)r   Zget_vocab_sizer   r   r   r   
vocab_size$      zJiebaBPETokenizer.vocab_sizec                 C   r   r   )r   Z	get_vocabr   r   r   r   r   (   r   zJiebaBPETokenizer.vocabc                 C   s*   | j }t }| D ]\}}|||< q
|S N)r   dictitems)r   r   	inv_vocabkeyvalr   r   r   r   ,   s
   
zJiebaBPETokenizer.inv_vocabFc                 C   sB   |sdd | j |D }| jj|dddjS | jj|dddjS )Nc                 S   s   g | ]}|qS r   r   ).0xr   r   r   
<listcomp>6   s    z.JiebaBPETokenizer.tokenize.<locals>.<listcomp>T)Zis_pretokenizedZadd_special_tokensF)r	   cutr   encodeZids)r   textZis_codeZseg_listr   r   r   tokenize4   s   zJiebaBPETokenizer.tokenizec                 C   s   | j j|dd}|S )NF)Zskip_special_tokens)r   decode)r   Z	token_idsr!   r   r   r   
detokenize=   s   zJiebaBPETokenizer.detokenizec                 C   s   | j S r   )r   r   r   r   r   eodA   s   zJiebaBPETokenizer.eodN)F)__name__
__module____qualname____doc__r   propertyr   r   r   r"   r$   r%   r   r   r   r   r      s    



	r   N)Z
tokenizersr   r   r   r   r   r   <module>   s   