o
    0j-=                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	Z
ddlmZ ddlmZ ddlmZmZmZmZ d	gZe d
d Zdd ZedG dd	 d	eZdS )    N)	lru_cache)DictOptionalUnion   )class_requires_deps   )PretrainedTokenizer)
AddedTokenBatchEncodingEncodedInputPaddingStrategyGPTTokenizerc                     s   t  tttdtdd tttdtdd  tttdtdd  } | dd }d	}td
D ]}|| vrK| | |d
|  |d7 }q5 fdd|D }tt| |S )a:  
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    !~r      ¡   ¬   ®   ÿNr      c                    s   g | ]} |qS  r   ).0nZ_chrr   x/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddlex/inference/models/common/tokenizer/gpt_tokenizer.py
<listcomp>=   s    z$bytes_to_unicode.<locals>.<listcomp>)chrlistrangeordappenddictzip)bscsr   br   r   r   bytes_to_unicode%   s"   
r&   c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )zReturn set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairsZ	prev_charcharr   r   r   	get_pairsA   s   r,   regexc                       sJ  e Zd ZdZdddZdZdZeeeeeeeeeed
eeeeeeeeeed
dZi i i i i i i i i i d
Z				
	
	
			d/ddZ	e
dd Ze
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd0d#d$Zd1d%d&Zd	ejd	d	fd'eeeef ef d(ee d)ed*ee d+ee  d,e!f fd-d.Z"  Z#S )2r   a]  
    Constructs a GPT tokenizer based on byte-level Byte-Pair-Encoding.

    This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer`
    which contains most of the main methods. For more information regarding those methods,
    please refer to this superclass.

    Args:
        vocab_file (str):
            Path to the vocab file.
            The vocab file contains a mapping from vocabulary strings to indices.
        merges_file (str):
            Path to the merge file.
            The merge file is used to split the input sentence into "subword" units.
            The vocab file is then used to encode those units as intices.
        errors (str):
            Paradigm to follow when decoding bytes to UTF-8.
            Defaults to `'replace'`.
        max_len (int, optional):
            The maximum value of the input sequence length.
            Defaults to `None`.

    Examples:
        .. code-block::

            from paddlenlp.transformers import GPTTokenizer

            tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en')
            print(tokenizer('Welcome to use PaddlePaddle and PaddleNLP'))

            '''
            {'input_ids': [14618, 284, 779, 350, 37382, 47, 37382, 290, 350, 37382, 45, 19930],
            'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
            '''

    z
vocab.jsonz
merges.txt)
vocab_filemerges_filezHhttp://bj.bcebos.com/paddlenlp/models/transformers/gpt/gpt-en-vocab.jsonzHhttp://bj.bcebos.com/paddlenlp/models/transformers/gpt/gpt-en-merges.txt)
zgpt3-175B-enzgpt3-89B-enzgpt3-13B-enzgpt3-6.7B-enzgpt3-1.3B-enz
gpt2-xl-enzgpt2-large-enzgpt2-medium-enzgpt2-enzgpt2-small-enreplaceN<|endoftext|>   ĊFc                 K   s  dd l }t|trt|dddn|}t|trt|dddn|}t|tr,t|dddn|}|| _| jt| dd d u r=|n| j||d || _|| _	|d urP|nt
d| _d| _d| _t|dd	d
}t|| _W d    n1 ssw   Y  dd | j D | _t| j| _| jd | _|| _t | _dd | j D | _t|d	d
}| ddd }W d    n1 sw   Y  dd |D }tt|tt|| _ i | _!|	| _"|
| _#|$d| _%d S )Nr   F)lstriprstrip	bos_token)r5   	eos_token	unk_tokeng   mB   rutf-8)encodingc                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>       z)GPTTokenizer.__init__.<locals>.<dictcomp>r   c                 S   r<   r   r   r=   r   r   r   r@      rA   
c                 S   s   g | ]}t | qS r   )tuplesplit)r   merger   r   r   r      s    z)GPTTokenizer.__init__.<locals>.<listcomp>zJ's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)&r-   
isinstancestrr
   	eol_tokenZ"_build_special_tokens_map_extendedgetattrr5   Z_vocab_fileZ_merges_fileintmax_lenZnum_command_tokensZnum_type_tokensopenjsonloadencoderitemsdecoderlenZ
num_tokensZnum_text_tokenserrorsr&   byte_encoderbyte_decoderreadrE   r!   r"   r   	bpe_rankscacheadd_prefix_spaceadd_bos_tokencompilepat)selfr.   r/   rT   rL   Z	pad_tokenr6   r7   rI   rZ   r[   kwargsrefZbpe_dataZ
bpe_mergesr   r   r   __init__   s\   

zGPTTokenizer.__init__c                 C   s
   t | jS )z
        Returns the size of vocabulary.

        Returns:
            int: The sum of size of vocabulary and the size of special tokens.

        )rS   rP   r^   r   r   r   
vocab_size   s   

zGPTTokenizer.vocab_sizec                 C   s   | j d u rd S | | j S N)rI   Zconvert_tokens_to_idsrc   r   r   r   eol_token_id   s   
zGPTTokenizer.eol_token_idc           
         sP  | j v r
 j | S t|}t|}|s|S 	 t| fddd}| jvr'nu|\}}g }d}|t|k rz|||}	||||	  |	}W n   |||d   Y n3|| |kry|t|d k ry||d  |kry|||  |d7 }n|||  |d7 }|t|k s5t|}|}t|dkrnt|}qd	|}| j |< |S )	NTc                    s    j | tdS )Ninf)rX   getfloat)pairrc   r   r   <lambda>
  s    z"GPTTokenizer.bpe.<locals>.<lambda>)keyr   r   r8    )
rY   rD   r,   minrX   rS   indexextendr    join)
r^   tokenr)   r*   ZbigramfirstsecondZnew_wordijr   rc   r   bpe   sH   


,


zGPTTokenizer.bpec                    sb   ddl }g }| j|D ]!}d fdd|dD }|dd  |dD  q|S )	zTokenize a string.r   N c                 3   s    | ]} j | V  qd S re   )rU   )r   r%   rc   r   r   	<genexpr>/  s    z)GPTTokenizer._tokenize.<locals>.<genexpr>r:   c                 s   s    | ]}|V  qd S re   r   )r   Z	bpe_tokenr   r   r   ry   0  s    rm   )r-   findallr]   rq   encoderp   rw   rE   )r^   textr`   Z
bpe_tokensrr   r   rc   r   	_tokenize)  s   "zGPTTokenizer._tokenizec                 C   s   | j || j | jS re   )rP   rh   r7   )r^   rr   r   r   r   _convert_token_to_id3  s   z!GPTTokenizer._convert_token_to_idc                 C   s
   | j | S re   rR   )r^   ro   r   r   r   _convert_id_to_token6  s   
z!GPTTokenizer._convert_id_to_tokenc                    s>   d  fdd|D }t fdd|D jd jd}|S )a  
        Converts a single index or a sequence of indices to texts.

        Args:
            ids (int|List[int]):
                The token id (or token ids) to be converted to text.

        Returns:
            str: The decoded text.

        Example:
            .. code-block::

                from paddlenlp.transformers import GPTTokenizer
                tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en')
                print(tokenizer.convert_ids_to_string(tokenizer.convert_ids_to_string([14618, 284, 779, 350, 37382, 47, 37382, 290, 350, 37382, 45, 19930]))
                # 'Welcome to use PaddlePaddle and PaddleNLP'

        rx   c                       g | ]} j | qS r   r   )r   idrc   r   r   r   N  rA   z6GPTTokenizer.convert_ids_to_string.<locals>.<listcomp>c                    r   r   rV   r   crc   r   r   r   O  rA   r:   rT   rq   	bytearraydecoderT   )r^   Zidsr|   r   rc   r   convert_ids_to_string9  s
   z"GPTTokenizer.convert_ids_to_stringc                 C   sX   | j  D ]$\}}t| d| }tj||}tj|tj|kr)t|| qdS )z
        Saves `SentencePiece <https://github.com/google/sentencepiece>`__ file
        (ends with '.spm') under `save_directory`.

        Args:
            save_directory (str): Directory to save files into.
        z_%sN)	resource_files_namesrQ   rJ   ospathrq   abspathshutilcopyfile)r^   Zsave_directoryname	file_namesource_pathZ	save_pathr   r   r   save_resourcesT  s   zGPTTokenizer.save_resourcesc                    s0   d |}t fdd|D jd jd}|S )zL
        Converts a sequence of tokens (string) in a single string.
        rx   c                    r   r   r   r   rc   r   r   r   h  rA   z9GPTTokenizer.convert_tokens_to_string.<locals>.<listcomp>r:   r   r   )r^   tokensr|   r   rc   r   convert_tokens_to_stringc  s
   
z%GPTTokenizer.convert_tokens_to_stringc                 C   s   t | jfi | jS re   )r!   rP   Zadded_tokens_encoderrc   r   r   r   	get_vocabm  s   zGPTTokenizer.get_vocabc                 K   s&   | d| j}|s|rd| }||fS )NrZ   rm   )poprZ   )r^   r|   Zis_split_into_wordsr_   rZ   r   r   r   prepare_for_tokenizationp  s   z%GPTTokenizer.prepare_for_tokenizationc                 C   s4   | j r| jg}ng }|| }|d u r|S || | S re   )r[   Zbos_token_id)r^   Ztoken_ids_0Ztoken_ids_1Zbos_token_idsoutputr   r   r    build_inputs_with_special_tokensv  s   
z-GPTTokenizer.build_inputs_with_special_tokensencoded_inputs
max_lengthpadding_strategypad_to_multiple_ofreturn_attention_maskreturnc           
         s   d|v rt t|d dkr|d }|d nd}|| jd  }t |||||}|durht t|dkrh||d< |tjkoGt ||k}|rh|t | }	d|v rhtj	|d d|	df|	dfgddd|d< |S )a=  
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        attention_maskr8   Nr   )r   r   Zconstant)Z	pad_widthmodeZconstant_values)
rS   npshaper   Zmodel_input_namessuper_padr   
DO_NOT_PADpad)
r^   r   r   r   r   r   r   Zrequired_inputZneeds_to_be_padded
difference	__class__r   r   r     s8   #


zGPTTokenizer._pad)r0   Nr1   r1   r1   r2   FF)Fre   )$__name__
__module____qualname____doc__r   Zgpt_vocab_linkZgpt_merges_linkZpretrained_resource_files_mapZpretrained_init_configurationrb   propertyrd   rf   rw   r}   r~   r   r   r   r   r   r   r   r   r   r   r   rH   r   r   r   rK   boolr!   r   __classcell__r   r   r   r   r   N   s    &
H

)



)rN   r   r   	functoolsr   typingr   r   r   numpyr   Z
utils.depsr   Ztokenizer_utilsr	   Ztokenizer_utils_baser
   r   r   r   __all__r&   r,   r   r   r   r   r   <module>   s    
