import base64
import importlib.util
import os
import unicodedata
from typing import Collection, Dict, List, Set, Tuple, Union

from .tokenizer_utils import PretrainedTokenizer
from .tokenizer_utils_base import AddedToken

__all__ = ["QWenTokenizer"]

VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}

PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
ENDOFTEXT = "<|endoftext|>"
IMSTART = "<|im_start|>"
IMEND = "<|im_end|>"
EXTRAS = tuple(f"<|extra_{i}|>" for i in range(205))
SPECIAL_TOKENS = (ENDOFTEXT, IMSTART, IMEND) + EXTRAS


def is_tiktoken_available():
    """Return True if the optional tiktoken dependency can be imported."""
    return importlib.util.find_spec("tiktoken") is not None


def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
    """Load a tiktoken BPE vocabulary file into a {token_bytes: rank} mapping."""
    with open(tiktoken_bpe_file, "rb") as f:
        contents = f.read()
    return {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in contents.splitlines() if line)
    }
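
# Illustrative sketch of the on-disk format parsed by _load_tiktoken_bpe (the
# sample lines are hypothetical, not copied from a real qwen.tiktoken file):
# each non-empty line carries the base64-encoded token bytes, whitespace, and
# the integer BPE rank, e.g.
#
#     IQ== 0        # base64 of b"!"  -> {b"!": 0}
#     IA== 220      # base64 of b" "  -> {b" ": 220}
#
# so the loader returns raw token bytes mapped to their merge rank.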


class QWenTokenizer(PretrainedTokenizer):
    """QWen tokenizer."""

    model_input_names = ["input_ids", "attention_mask", "position_ids"]
    resource_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        errors="replace",
        padding_side="left",
        extra_special_tokens=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if not is_tiktoken_available():
            raise ValueError("tiktoken is not installed, please install it use: pip install tiktoken")
        import tiktoken as tk

        tiktoken = tk

        self.errors = errors  # how to handle errors in decoding

        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
        self.special_tokens = {
            token: index for index, token in enumerate(SPECIAL_TOKENS, start=len(self.mergeable_ranks))
        }
        if extra_special_tokens:
            self.special_tokens.update(extra_special_tokens)

        enc = tiktoken.Encoding(
            "Qwen",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        assert (
            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"

        self.decoder = {v: k for k, v in self.mergeable_ranks.items()}  # type: Dict[int, bytes]
        self.decoder.update({v: k for k, v in self.special_tokens.items()})

        self.tokenizer = enc  # type: tiktoken.Encoding

        self.eod_id = self.tokenizer.eot_token
        self.im_start_id = self.special_tokens[IMSTART]
        self.im_end_id = self.special_tokens[IMEND]

        if "pad_token_id" in kwargs:
            self.pad_token_id = kwargs["pad_token_id"]
        if "eos_token_id" in kwargs:
            self.eos_token_id = kwargs["eos_token_id"]

    def __len__(self) -> int:
        return self.tokenizer.n_vocab

    def get_vocab(self) -> Dict[bytes, int]:
        return self.mergeable_ranks

    def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> List[int]:
        ids = []
        if isinstance(tokens, (str, bytes)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.mergeable_ranks.get(tokens)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.mergeable_ranks.get(token))
        return ids

    def _update_tiktoken(self, tokens: List[str], special_tokens: bool = False) -> int:
        if special_tokens:
            added_tokens = []
            for token in tokens:
                if token in self.special_tokens:
                    continue
                token_id = len(self.mergeable_ranks) + len(self.special_tokens)
                self.special_tokens[token] = token_id
                self.decoder[token_id] = token
                added_tokens.append(token)

            import tiktoken

            self.tokenizer = tiktoken.Encoding(
                "Qwen",
                pat_str=PAT_STR,
                mergeable_ranks=self.mergeable_ranks,
                special_tokens=self.special_tokens,
            )
            return len(added_tokens)
        else:
            raise ValueError("Adding regular tokens is not supported")

    def _add_tokens(
        self,
        new_tokens: Union[List[str], List[AddedToken]],
        special_tokens: bool = False,
    ) -> int:
        if not special_tokens and new_tokens:
            raise ValueError("Adding regular tokens is not supported")
        new_tokens_str = []
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            new_tokens_str.append(surface_form)
        return self._update_tiktoken(new_tokens_str, special_tokens)
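
    # Illustrative sketch (the token string is hypothetical): registering an
    # extra special token goes through _add_tokens, which rebuilds the
    # underlying tiktoken.Encoding:
    #
    #     tokenizer._add_tokens(["<|my_tool_call|>"], special_tokens=True)
    #
    # Plain (non-special) tokens cannot be added, because the BPE merge table
    # is fixed once the vocabulary file has been loaded.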

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (the BPE ranks file).

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        file_path = os.path.join(save_directory, "qwen.tiktoken")
        with open(file_path, "w", encoding="utf8") as w:
            for k, v in self.mergeable_ranks.items():
                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
                w.write(line)
        return (file_path,)
zQWenTokenizer.save_vocabularyallr   textallowed_specialdisallowed_specialc                 K   s<   g }t d|}| jj|||dD ]
}|| j|  q|S )a  
        Converts a string in a sequence of tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            allowed_special (`Literal["all"]` or `set`):
                The surface forms of the tokens to be encoded as special tokens in regular texts.
                Default to "all".
            disallowed_special (`Literal["all"]` or `Collection`):
                The surface forms of the tokens that should not be in regular texts and trigger errors.
                Default to an empty tuple.

            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific encode method.

        Returns:
            `List[bytes|str]`: The list of tokens.
        NFC)rt   ru   )unicodedata	normalizerI   encoder]   rH   )rM   rs   rt   ru   rN   rX   tr   r   r   tokenize   s   
zQWenTokenizer.tokenizec                 C   sx   d}d}|D ]'}t |tr |r||jd| jd7 }d}||7 }qt |tr*||7 }qtd|r:||jd| jd7 }|S )zC
        Converts a sequence of tokens in a single string.
             zutf-8r?   z)token should only be of type types or str)rY   rZ   rn   r?   r[   	TypeError)rM   rX   rs   temprz   r   r   r   convert_tokens_to_string   s   




    @property
    def vocab_size(self):
        return self.tokenizer.n_vocab

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included"""
        if index in self.decoder:
            return self.decoder[index]
        raise ValueError("unknown ids")

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        """Converts a token to an id using the vocab, special tokens included"""
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.mergeable_ranks:
            return self.mergeable_ranks[token]
        raise ValueError("unknown token")

    def _tokenize(self, text: str, **kwargs):
        """
        Converts a string into a sequence of tokens (strings), using the tokenizer. Splits into words for
        word-based vocabularies or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Does NOT take care of added tokens.
        """
        raise NotImplementedError

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: str = None,
        **kwargs,
    ) -> str:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i < self.eod_id]
        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
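

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library API: this module uses
    # relative imports, so it must run in package context, and the vocab path
    # below is hypothetical; point it at a real qwen.tiktoken file.
    tokenizer = QWenTokenizer(vocab_file="qwen.tiktoken")
    tokens = tokenizer.tokenize("Hello, world!<|endoftext|>")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(tokens)  # mixed list: BPE tokens as bytes, specials as str
    print(ids)  # the corresponding integer ids
    print(tokenizer._decode(ids, skip_special_tokens=True))  # "Hello, world!"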