o
    *j
                     @   s*   d dl mZ d dlmZ G dd dZdS )    )List)	Tokenizerc                   @   s   e Zd ZdZdd Zedd Zedd Zedd	 Zdde	de
dee fddZddee de
de	fddZedd ZdS )JiebaBPETokenizerz2SentencePiece BPE tokenizer with Jieba integrationc                 C   sz   d| _ t|| _| jd| _zdd l}dd l}||j	 W n t
y+   t
dw || _| jd | _| jd | _d S )NzJieba BPE Tokenizerz<|endoftext|>r   zfYou need to install jieba to use JiebaTokenizer. See https://pypi.org/project/jieba/ for installation.
z<sep>)namer   	from_file	tokenizerZtoken_to_ideod_idjiebaloggingZsetLogLevelINFOImportErrorvocabnew_line	sep_token)selfZtokenizer_json_filer
   r    r   e/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/gpt3/tokenizer.py__init__   s   zJiebaBPETokenizer.__init__c                 C      | j jddS NT)Zwith_added_tokens)r   Zget_vocab_sizer   r   r   r   
vocab_size(      zJiebaBPETokenizer.vocab_sizec                 C   r   r   )r   Z	get_vocabr   r   r   r   r   ,   r   zJiebaBPETokenizer.vocabc                 C   s*   | j }t }| D ]\}}|||< q
|S N)r   dictitems)r   r   	inv_vocabkeyvalr   r   r   r   0   s
   
zJiebaBPETokenizer.inv_vocabFtextis_codereturnc                 C   sB   |sdd | j |D }| jj|dddjS | jj|dddjS )z	
        c                 S   s   g | ]}|qS r   r   ).0xr   r   r   
<listcomp><   s    z.JiebaBPETokenizer.tokenize.<locals>.<listcomp>T)Zis_pretokenizedZadd_special_tokensF)r
   cutr   encodeZids)r   r    r!   Zseg_listr   r   r   tokenize8   s   zJiebaBPETokenizer.tokenizeT	token_ids
early_stopc                 C   s6   |r| j |v r|d || j  }| jj|dd}|S )NT)Zskip_special_tokens)r   indexr   decode)r   r)   r*   r    r   r   r   
detokenizeC   s   zJiebaBPETokenizer.detokenizec                 C   s   | j S r   )r	   r   r   r   r   eodI   s   zJiebaBPETokenizer.eodN)F)T)__name__
__module____qualname____doc__r   propertyr   r   r   strboolr   intr(   r-   r.   r   r   r   r   r      s    


r   N)typingr   Z
tokenizersr   r   r   r   r   r   <module>   s   