o
    0j%                     @   sH   d dl Z d dlmZ d dlmZmZmZ d dlmZ G dd deZ	dS )    N)copyfile)ListOptionalTuple)PretrainedTokenizerc                
       s`  e Zd Zg dZddiZddddddddiZi i i i i i dZdZ					
			d0 fdd	Ze	dd Z
dd Ze	dee fddZe	dee fddZd1ddZdd Zdd Zdd Zd d! Zd"d# Z	d2d$ee dee fd%d&Zd2d'd(Z		d3d)ee d*eee  d+edee f fd,d-Z	d2d)ee d*eee  dee fd.d/Z  ZS )4LlamaTokenizer)Z	input_idsZattention_maskZposition_ids
vocab_filezsentencepiece.bpe.modelzQhttps://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model)z'__internal_testing__/micro-random-llamaz&__internal_testing__/tiny-random-llamazfacebook/llama-7bzfacebook/llama-13bzfacebook/llama-30bzfacebook/llama-65bleft<unk><s></s>TFNc	           
         s\   |d u ri n|| _ t jd|||d|	 || _|| _|| _|| _| |	dd| _	d S )N)	bos_token	eos_token	unk_token	from_slowT )
sp_model_kwargssuper__init__r   add_bos_tokenadd_eos_tokendecode_with_prefix_spaceget_spm_processorpopsp_model)
selfr   r   r   r   r   r   r   r   kwargs	__class__r   z/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddlex/inference/models/common/tokenizer/llama_tokenizer.pyr   2   s   zLlamaTokenizer.__init__c                 C   
   | j  S )zReturns vocab size)r   get_piece_sizer   r   r   r   
vocab_sizeI   s   
zLlamaTokenizer.vocab_sizec                 C   s0   d}| j D ]}|| j kr|d7 }q| j| S )zc
        Returns the vocabulary size. added_tokens_encoder has to be added in the sp_model
        r      )Zadded_tokens_decoderr   r!   r#   )r   Z
added_sizeidr   r   r   __len__N   s   

zLlamaTokenizer.__len__returnc                 C   r    N)r   Zbos_idr"   r   r   r   bos_token_idZ      
zLlamaTokenizer.bos_token_idc                 C   r    r(   )r   Zeos_idr"   r   r   r   eos_token_id^   r*   zLlamaTokenizer.eos_token_idc           	      C   s   dd l }ddl m} |jdi | j}|r|| j |S t| jd)}| }|j	|}|
 }d|_|j| | }|| W d    |S 1 sOw   Y  |S )Nr   )sentencepiece_model_pb2rbFr   )Zsentencepiecer,   ZSentencePieceProcessorr   Loadr   openreadZ
ModelProtoZ
FromStringZNormalizerSpecZadd_dummy_prefixnormalizer_specZ	MergeFromZSerializeToStringZLoadFromSerializedProto)	r   r   ZspmZ	model_pb2	tokenizerfr   modelr1   r   r   r   r   b   s$   
z LlamaTokenizer.get_spm_processorc                    s(    fddt  jD }| j |S )zReturns vocab as a dictc                    s   i | ]}  ||qS r   )Zconvert_ids_to_tokens).0ir"   r   r   
<dictcomp>w   s    z,LlamaTokenizer.get_vocab.<locals>.<dictcomp>)ranger#   updateZadded_tokens_encoder)r   Zvocabr   r"   r   	get_vocabu   s   zLlamaTokenizer.get_vocabc                 C   s   | j j|tdS )zReturns a tokenized string.)Zout_type)r   encodestr)r   textr   r   r   	_tokenize{   s   zLlamaTokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r   Zpiece_to_id)r   tokenr   r   r   _convert_token_to_id   s   z#LlamaTokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r   Zid_to_piece)r   indexr?   r   r   r   _convert_id_to_token   s   z#LlamaTokenizer._convert_id_to_tokenc                 C   s|   g }d}d}t |D ])\}}|| jv r,|s|dkr|d7 }|| j|| 7 }d}g }q
|| d}q
|| j|7 }|S )z:Converts a sequence of tokens (string) in a single string. Fr    T)	enumerateZall_special_tokensr   decodeappend)r   tokensZcurrent_sub_tokensZ
out_stringZprev_is_specialr6   r?   r   r   r   convert_tokens_to_string   s   

z'LlamaTokenizer.convert_tokens_to_stringfilename_prefixc                 C   s   t j|std| dt j||r|d nd| jd  }t j| jt j|kr=t j| jr=t	| j| |fS t j| jset
|d}| j }|| W d   |fS 1 s`w   Y  |fS )a  
        Save the vocabulary and special tokens file to a directory.
        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
        Returns:
            `Tuple(str)`: Paths to the files saved.
        zVocabulary path (z) should be a directory-rC   r   wbN)ospathisdir
ValueErrorjoinresource_files_namesabspathr   isfiler   r/   r   Zserialized_model_protowrite)r   Zsave_directoryrJ   Zout_vocab_filefiZcontent_spiece_modelr   r   r   save_vocabulary   s2   


zLlamaTokenizer.save_vocabularyc                 C   sB   | j r| jg}ng }|| }|d ur|| }| jr|| jg }|S r(   )r   r)   r   r+   )r   token_ids_0token_ids_1Zbos_token_idsoutputr   r   r    build_inputs_with_special_tokens   s   
z/LlamaTokenizer.build_inputs_with_special_tokensrX   rY   already_has_special_tokensc                    sh   |rt  j||ddS |du rdgdgt|  dg S dgdgt|  ddg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.
        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.
        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rX   rY   r\   Nr$   r   )r   get_special_tokens_masklen)r   rX   rY   r\   r   r   r   r]      s   0z&LlamaTokenizer.get_special_tokens_maskc                 C   s<   | j g}|du rt|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of zeros.
        Nr   )r+   r^   )r   rX   rY   Zeosr   r   r   $create_token_type_ids_from_sequences   s   z3LlamaTokenizer.create_token_type_ids_from_sequences)r
   r   r   TFNF)Tr(   )NF)__name__
__module____qualname__Zmodel_input_namesrR   Zpretrained_resource_files_mapZpretrained_init_configurationZpadding_sider   propertyr#   r&   r   intr)   r+   r   r:   r>   r@   rB   rI   r<   r   rW   r[   r   boolr]   r_   __classcell__r   r   r   r   r      s    



!

r   )
rM   shutilr   typingr   r   r   Z9paddlex.inference.models.common.tokenizer.tokenizer_utilsr   r   r   r   r   r   <module>   s
   