o
    0jW                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZ ddl	m
Z
 ddlmZmZmZmZ ddlmZ d	gZe d
d Zdd Zdd Zdd ZG dd deZG dd	 d	eZdS )    N)	lru_cache)ListOptional   )logging   )PretrainedTokenizer_is_control_is_punctuation_is_whitespace)
AddedTokenCLIPTokenizerc                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS  )chr).0nr   r   y/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddlex/inference/models/common/tokenizer/clip_tokenizer.py
<listcomp>:   s    z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br   r   r   bytes_to_unicode#   s    
r$   c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairsZ	prev_charcharr   r   r   	get_pairs>   s   r*   c                 C   s   | dd| } |  } | S )Nz\s+ )substrip)textrer   r   r   whitespace_cleanL   s   r0   c                 C   s   |   } | sg S |  }|S )z@Runs basic whitespace cleaning and splitting on a piece of text.)r-   split)r.   tokensr   r   r   whitespace_tokenizeR   s
   r3   c                   @   sX   e Zd ZdZ					dddZdddZdd	 Zdd
dZdd Zdd Z	dd Z
dS )BasicTokenizera  
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    TNc                 C   s2   |d u rg }|| _ t|| _|| _|| _|| _d S N)do_lower_caser%   never_splittokenize_chinese_charsstrip_accentsdo_split_on_punc)selfr6   r7   r8   r9   r:   r   r   r   __init__s   s   

zBasicTokenizer.__init__c                 C   s   |r
| j t|n| j }| |}| jr| |}td|}t|}g }|D ])}||vrH| j	r@|
 }| jdur?| |}n| jrH| |}|| || q(td|}|S )aj  
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        NFCFr+   )r7   unionr%   _clean_textr8   _tokenize_chinese_charsunicodedata	normalizer3   r6   lowerr9   _run_strip_accentsextend_run_split_on_puncjoin)r;   r.   r7   Zunicode_normalized_textZorig_tokensZsplit_tokenstokenZoutput_tokensr   r   r   tokenize   s,   




zBasicTokenizer.tokenizec                 C   sB   t d|}g }|D ]}t |}|dkrq
|| q
d|S )z$Strips accents from a piece of text.ZNFDZMn )rA   rB   categoryr   rG   )r;   r.   outputr)   catr   r   r   rD      s   

z!BasicTokenizer._run_strip_accentsc                 C   s   | j r|dur||v r|gS t|}d}d}g }|t|k rI|| }t|r/||g d}n|r6|g  d}|d | |d7 }|t|k sdd |D S )	z&Splits punctuation on a piece of text.Nr   TFr   c                 S   s   g | ]}d  |qS )rJ   )rG   )r   xr   r   r   r          z5BasicTokenizer._run_split_on_punc.<locals>.<listcomp>)r:   r   lenr
   r   )r;   r.   r7   charsiZstart_new_wordrL   r)   r   r   r   rF      s&   
z!BasicTokenizer._run_split_on_puncc                 C   sT   g }|D ] }t |}| |r|d || |d q|| qd|S )z)Adds whitespace around any CJK character.r+   rJ   )r   _is_chinese_charr   rG   r;   r.   rL   r)   cpr   r   r   r@      s   



z&BasicTokenizer._tokenize_chinese_charsc                 C   s   |dkr|dks@|dkr|dks@|dkr|dks@|dkr |dks@|d	kr(|d
ks@|dkr0|dks@|dkr8|dks@|dkrB|dkrBdS dS )z6Checks whether CP is the codepoint of a CJK character.i N  i  i 4  iM  i   iߦ i  i? i@ i i  i i   i  i  i TFr   )r;   rV   r   r   r   rT      s   zBasicTokenizer._is_chinese_charc                 C   sX   g }|D ]"}t |}|dks|dkst|rqt|r!|d q|| qd|S )zBPerforms invalid character removal and whitespace cleanup on text.r   i  r+   rJ   )r   r	   r   r   rG   rU   r   r   r   r?      s   
zBasicTokenizer._clean_text)TNTNTr5   )__name__
__module____qualname____doc__r<   rI   rD   rF   r@   rT   r?   r   r   r   r   r4   \   s    

*
r4   c                       s   e Zd ZdZdddZi i dZi ZddgZ					
	
	
d*ddZe	dd Z
dd Z	d+dee deee  dee fddZ	d+ddZ	d, fdd	Z	d+dee deee  dee fddZdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Z  ZS )-r   a  
    Construct a CLIP tokenizer based on byte-level Byte-Pair-Encoding.

    This tokenizer inherits from :class:`~paddlenlp.transformers.gpt.tokenizer.GPTTokenizer`.
    For more information regarding those methods, please refer to this superclass.

    Args:
        vocab_file (str):
            Path to the vocabulary file.
            The vocab file contains a mapping from vocabulary strings to indices.
        merges_file (str):
            Path to the merge file.
            The merge file is used to split the input sentence into "subword" units.
            The vocab file is then used to encode those units as intices.
        errors (str):
            Paradigm to follow when decoding bytes to UTF-8.
            Defaults to `'replace'`.
        max_len (int, optional):
            The maximum value of the input sequence length.
            Defaults to `77`.
        bos_token (str, optional):
            The beginning of sequence token that was used during pretraining. Can be
            used a sequence classifier token.
            Defaults to `"<|startoftext|>"`.
        eos_token (str, optional):
            A special token representing the end of a sequence that was used during pretraining.
            Defaults to `"<|endoftext|>"`.
        unk_token (str, optional):
            A special token representing the *unknown (out-of-vocabulary)* token.
            An unknown token is set to be `unk_token` inorder to be converted to an ID.
            Defaults to `"<|endoftext|>"`.
        pad_token (str, optional):
            A special token used to make arrays of tokens the same size for batching purposes.
            Defaults to `"<|endoftext|>"`.

    Examples:
        .. code-block::

            from paddlenlp.transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained('openai/clip-vit-base-patch32')
            print(tokenizer('He was a puppeteer'))

            '''
            {'input_ids': [49406, 797, 739, 320, 7116, 38820, 528, 49407]}
            '''

    z
vocab.jsonz
merges.txt)
vocab_filemerges_fileZ	input_idsZattention_maskreplaceM   <|startoftext|><|endoftext|>c	                 K   s  ddl m}
 t|trt|dddn|}t|tr t|dddn|}t|tr.t|dddn|}t|tr<t|dddn|}| j||||d z
dd l}|j| _W n tyj   t	
d tdddd| _d | _Y nw |
d	| _|| _|| _|d ur||ntd
| _t|dd}t|| _W d    n1 sw   Y  dd | j D | _|| _t | _dd | j D | _t|dd}|  ddd }W d    n1 sw   Y  dd |D }tt |t!t"|| _#ddd| _$| j%d| jj&| _'d S )Nr   )
try_importF)lstriprstrip)	bos_token	eos_token	unk_token	pad_tokenzKftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.T)r9   r:   r6   regexg   mButf-8)encodingc                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>|  rP   z*CLIPTokenizer.__init__.<locals>.<dictcomp>c                 S   rk   r   r   rl   r   r   r   ro     rP   
r   i  c                 S   s   g | ]}t | qS r   )tupler1   )r   merger   r   r   r     s    z*CLIPTokenizer.__init__.<locals>.<listcomp>r_   r`   )r_   r`   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)(Zpaddle.utilsra   
isinstancestrr   Z"_build_special_tokens_map_extendedftfyfix_textImportErrorr   infor4   nlpr/   Z_vocab_fileZ_merges_fileintmax_lenopenjsonloadencoderitemsdecodererrorsr$   byte_encoderbyte_decoderreadr-   r1   r   r    r   rQ   	bpe_rankscachecompile
IGNORECASEpat)r;   r[   r\   r   r{   rd   re   rf   rg   kwargsra   ru   Zvocab_handleZmerges_handleZ
bpe_mergesr   r   r   r<   >  sv   


zCLIPTokenizer.__init__c                 C   s
   t | jS )z
        Returns the size of vocabulary.

        Returns:
            int: The sum of size of vocabulary and the size of special tokens.

        )rQ   r   r;   r   r   r   
vocab_size  s   
	zCLIPTokenizer.vocab_sizec                 C   s   t | jfi | jS r5   )r   r   Zadded_tokens_encoderr   r   r   r   	get_vocab  s   zCLIPTokenizer.get_vocabNtoken_ids_0token_ids_1returnc                 C   s<   | j g}| jg}|du r|| | S || | | | | S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CLIP sequence has the following format:

        - single sequence: `<|startoftext|> X <|endoftext|>`

        Pairs of sequences are not the expected use case, but they will be handled without a separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        N)bos_token_ideos_token_idr;   r   r   rd   re   r   r   r    build_inputs_with_special_tokens  s
   z.CLIPTokenizer.build_inputs_with_special_tokensc                 C   s4   |du rdg| dg S dg| ddg | dg S )aQ  
        Build offset map from a pair of offset map by concatenating and adding offsets of special tokens.

        Should be overridden in a subclass if the model has a special way of building those.

        Args:
            offset_mapping_0 (List[tuple]):
                List of char offsets to which the special tokens will be added.
            offset_mapping_1 (List[tuple], optional):
                Optional second list of char offsets for offset mapping pairs.

        Returns:
            List[tuple]: List of char offsets with the appropriate offsets of special tokens.
        N)r   r   r   )r;   Zoffset_mapping_0Zoffset_mapping_1r   r   r   (build_offset_mapping_with_special_tokens  s   z6CLIPTokenizer.build_offset_mapping_with_special_tokensFc                    sl   |rt  j||ddS |du rdgdgt|  dg S dgdgt|  dg dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r   r   already_has_special_tokensNr   r   )superget_special_tokens_maskrQ   )r;   r   r   r   	__class__r   r   r     s   2z%CLIPTokenizer.get_special_tokens_maskc                 C   sP   | j g}| jg}|du rt|| | dg S t|| | | | | dg S )a  
        Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
        zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        Nr   )r   r   rQ   r   r   r   r   $create_token_type_ids_from_sequences  s   z2CLIPTokenizer.create_token_type_ids_from_sequencesc           
         sr  | j v r
 j | S t|d d |d d f }t|}|s#|d S 	 t| fddd}| jvr4ny|\}}g }d}|t|k rz|||}	W n ty\   |||d   Y n?w ||||	  |	}|| |kr|t|d k r||d  |kr|	||  |d	7 }n|	||  |d7 }|t|k sBt|}|}t|dkrnt|}q$d

|}| j |< |S )NrN   </w>Tc                    s    j | tdS )Ninf)r   getfloat)pairr   r   r   <lambda>  s    z#CLIPTokenizer.bpe.<locals>.<lambda>)keyr   r      r+   )r   rq   r*   minr   rQ   index
ValueErrorrE   r   rG   )
r;   rH   r'   r(   ZbigramfirstsecondZnew_wordrS   jr   r   r   bpe
  sJ   


,


zCLIPTokenizer.bpec                    s   g } j du rd j|}nt  | j } j j|D ]!}d fdd|	dD }|
dd  |dD  q$|S )zTokenize a string.Nr+   rJ   c                 3   s    | ]} j | V  qd S r5   )r   )r   r#   r   r   r   	<genexpr>=  s    

z*CLIPTokenizer._tokenize.<locals>.<genexpr>ri   c                 s   s    | ]}|V  qd S r5   r   )r   Z	bpe_tokenr   r   r   r   @  s    )rv   rG   ry   rI   r0   r/   rC   findallr   encoderE   r   r1   )r;   r.   Z
bpe_tokensrH   r   r   r   	_tokenize4  s   
"zCLIPTokenizer._tokenizec                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r   r   rf   )r;   rH   r   r   r   _convert_token_to_idC  s   z"CLIPTokenizer._convert_token_to_idc                 C   s   | j |S )z=Converts an index (integer) in a token (str) using the vocab.)r   r   )r;   r   r   r   r   _convert_id_to_tokenG  s   z"CLIPTokenizer._convert_id_to_tokenc                    s@   d |}t fdd|D }|jd jddd }|S )z:Converts a sequence of tokens (string) in a single string.rJ   c                    s   g | ]} j | qS r   )r   )r   cr   r   r   r   N  rP   z:CLIPTokenizer.convert_tokens_to_string.<locals>.<listcomp>ri   )r   r   r+   )rG   	bytearraydecoder   r]   r-   )r;   r2   r.   Z
byte_arrayr   r   r   convert_tokens_to_stringK  s
   
z&CLIPTokenizer.convert_tokens_to_stringc                 C   sX   | j  D ]$\}}t| d| }tj||}tj|tj|kr)t|| qdS )z
        Saves `SentencePiece <https://github.com/google/sentencepiece>`__ file
        (ends with '.spm') under `save_directory`.

        Args:
            save_directory (str): Directory to save files into.
        z_%sN)	resource_files_namesr   getattrospathrG   abspathshutilcopyfile)r;   Zsave_directoryname	file_namesource_pathZ	save_pathr   r   r   save_resourcesT  s   zCLIPTokenizer.save_resources)r]   r^   r_   r`   r`   r`   r5   )NF)rW   rX   rY   rZ   r   Zpretrained_resource_files_mapZpretrained_init_configurationZmodel_input_namesr<   propertyr   r   r   rz   r   r   r   r   r   r   r   r   r   r   r   __classcell__r   r   r   r   r     sX    
2
	
R




!

*	)r}   r   r   rA   	functoolsr   typingr   r   utilsr   Ztokenizer_utilsr   r	   r
   r   Ztokenizer_utils_baser   __all__r$   r*   r0   r3   objectr4   r   r   r   r   r   <module>   s$   

 (