o
    0jb                     @   sp   d dl Z d dlZddlmZmZmZmZmZmZm	Z	 g dZ
G dd deZG dd deZG d	d
 d
eZdS )    N   )PretrainedTokenizer_is_control_is_punctuation
_is_symbol_is_whitespaceconvert_to_unicodewhitespace_tokenize)BasicTokenizerBertTokenizerWordpieceTokenizerc                   @   sV   e Zd ZdZ				dddZdddZdd	 Zdd
dZdd Zdd Z	dd Z
dS )r
   a  
    Runs basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (bool):
            Whether to lowercase the input when tokenizing.
            Defaults to `True`.
        never_split (Iterable):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (bool):
            Whether to tokenize Chinese characters.
        strip_accents: (bool):
            Whether to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
    TNc                 C   s,   |du rg }|| _ t|| _|| _|| _dS )zConstructs a BasicTokenizer.N)do_lower_casesetnever_splittokenize_chinese_charsstrip_accents)selfr   r   r   r    r   y/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddlex/inference/models/common/tokenizer/bert_tokenizer.py__init__5   s   

zBasicTokenizer.__init__c                 C   s   t |}|r| jt|n| j}| |}| jr| |}t|}g }|D ])}||vrF| jr>|	 }| j
dur=| |}n| j
rF| |}|| || q&td|}|S )a9  
        Tokenizes a piece of text using basic tokenizer.

        Args:
            text (str): A piece of text.
            never_split (List[str]): List of token not to split.

        Returns:
            list(str): A list of tokens.

        Examples:
            .. code-block::

                from paddlenlp.transformers import BasicTokenizer
                basictokenizer = BasicTokenizer()
                tokens = basictokenizer.tokenize('He was a puppeteer')
                '''
                ['he', 'was', 'a', 'puppeteer']
                '''
        F )r   r   unionr   _clean_textr   _tokenize_chinese_charsr	   r   lowerr   _run_strip_accentsextend_run_split_on_puncjoin)r   textr   Zorig_tokenssplit_tokenstokenoutput_tokensr   r   r   tokenizeD   s,   




zBasicTokenizer.tokenizec                 C   sB   t d|}g }|D ]}t |}|dkrq
|| q
d|S )z6
        Strips accents from a piece of text.
        ZNFDZMn )unicodedata	normalizecategoryappendr   )r   r   outputcharcatr   r   r   r   r   s   

z!BasicTokenizer._run_strip_accentsc                 C   s   |dur||v r|gS t |}d}d}g }|t|k rJ|| }t|s't|r0||g d}n|r7|g  d}|d | |d7 }|t|k sdd |D S )	z8
        Splits punctuation on a piece of text.
        Nr   TFr   c                 S   s   g | ]}d  |qS )r$   )r   ).0xr   r   r   
<listcomp>   s    z5BasicTokenizer._run_split_on_punc.<locals>.<listcomp>)listlenr   r   r(   )r   r   r   charsiZstart_new_wordr)   r*   r   r   r   r      s$   
z!BasicTokenizer._run_split_on_puncc                 C   sT   g }|D ] }t |}| |r|d || |d q|| qd|S )z;
        Adds whitespace around any CJK character.
        r   r$   )ord_is_chinese_charr(   r   r   r   r)   r*   cpr   r   r   r      s   



z&BasicTokenizer._tokenize_chinese_charsc                 C   s   |dkr|dks@|dkr|dks@|dkr|dks@|dkr |dks@|d	kr(|d
ks@|dkr0|dks@|dkr8|dks@|dkrB|dkrBdS dS )zH
        Checks whether CP is the codepoint of a CJK character.
        i N  i  i 4  iM  i   iߦ i  i? i@ i i  i i   i  i  i TFr   )r   r7   r   r   r   r5      s   zBasicTokenizer._is_chinese_charc                 C   sX   g }|D ]"}t |}|dks|dkst|rqt|r!|d q|| qd|S )zT
        Performs invalid character removal and whitespace cleanup on text.
        r   i  r   r$   )r4   r   r   r(   r   r6   r   r   r   r      s   
zBasicTokenizer._clean_text)TNTNN)__name__
__module____qualname____doc__r   r#   r   r   r   r5   r   r   r   r   r   r
   #   s    

.
r
   c                   @   s"   e Zd ZdZdddZdd ZdS )	r   a  
    Runs WordPiece tokenization.

    Args:
        vocab (Vocab|dict):
            Vocab of the word piece tokenizer.
        unk_token (str):
            A specific token to replace all unknown tokens.
        max_input_chars_per_word (int):
            If a word's length is more than
            max_input_chars_per_word, it will be dealt as unknown word.
            Defaults to 100.
    d   c                 C   s   || _ || _|| _d S r8   )vocab	unk_tokenmax_input_chars_per_word)r   r>   r?   r@   r   r   r   r      s   
zWordpieceTokenizer.__init__c                 C   s   g }t |D ]m}t|}t|| jkr|| j qd}d}g }|t|k ret|}d}	||k rQd||| }
|dkrAd|
 }
|
| jv rI|
}	n|d8 }||k s0|	du rXd}n||	 |}|t|k s&|rn|| j q|| q|S )a  
        Tokenizes a piece of text into its word pieces.
        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.

        Returns:
            list (str): A list of wordpiece tokens.

        Examples:
            .. code-block::

                from paddlenlp.transformers import BertTokenizer, WordpieceTokenizer

                berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                vocab  = berttokenizer.vocab
                unk_token = berttokenizer.unk_token

                wordpiecetokenizer = WordpieceTokenizer(vocab,unk_token)
                inputs = wordpiecetokenizer.tokenize("unaffable")
                print(inputs)
                '''
                ["un", "##aff", "##able"]
                '''
        Fr   Nr$   z##r   T)	r	   r0   r1   r@   r(   r?   r   r>   r   )r   r   r"   r!   r2   Zis_badstartZ
sub_tokensendZ
cur_substrsubstrr   r   r   r#      s>   

zWordpieceTokenizer.tokenizeN)r=   )r9   r:   r;   r<   r   r#   r   r   r   r   r      s    
r   c                   @   s>  e Zd ZdZddiZdi dddddd	d
dddddddddddddddddddddddddddd diiZi dd!d"idd!d"idd!d#id
d!d#idd!d"idd!d#idd!d#idd!d#idd!d#idd!d#idd!d#idd!d"idd!d"idd!d"idd!d"idd!d"idd!d"id d!d"iiZi dd$dd$dd$d
d$dd$dd$dd$dd$dd$dd$dd$dd$dd$dd$dd$dd$dd$d d$iZd%Z	"	"	&	'	(	)	*	+	"	&dBd,d-Z	e
d.d/ Zd0d1 Zd2d3 Zd4d5 ZdCd6d7ZdDd8d9Z	&dDd:d;ZdDd<d=Z	#dEd>d?Zd@dA Zd&S )Fr   a2
  
    Constructs a BERT tokenizer. It uses a basic tokenizer to do punctuation
    splitting, lower casing and so on, and follows a WordPiece tokenizer to
    tokenize as subwords.

    Args:
        vocab_file (str):
            The vocabulary file path (ends with '.txt') required to instantiate
            a `WordpieceTokenizer`.
        do_lower_case (bool, optional):
            Whether to lowercase the input when tokenizing.
            Defaults to `True`.
        do_basic_tokenize (bool, optional):
            Whether to use a basic tokenizer before a WordPiece tokenizer.
            Defaults to `True`.
        never_split (Iterable, optional):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`. Defaults to `None`.
        unk_token (str, optional):
            A special token representing the *unknown (out-of-vocabulary)* token.
            An unknown token is set to be `unk_token` inorder to be converted to an ID.
            Defaults to "[UNK]".
        sep_token (str, optional):
            A special token separating two different sentences in the same input.
            Defaults to "[SEP]".
        pad_token (str, optional):
            A special token used to make arrays of tokens the same size for batching purposes.
            Defaults to "[PAD]".
        cls_token (str, optional):
            A special token used for sequence classification. It is the last token
            of the sequence when built with special tokens. Defaults to "[CLS]".
        mask_token (str, optional):
            A special token representing a masked token. This is the token used
            in the masked language modeling task which the model tries to predict the original unmasked ones.
            Defaults to "[MASK]".
        tokenize_chinese_chars (bool, optional):
            Whether to tokenize Chinese characters.
            Defaults to `True`.
        strip_accents: (bool, optional):
            Whether to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
            Defaults to `None`.

    Examples:
        .. code-block::

            from paddlenlp.transformers import BertTokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

            inputs = tokenizer('He was a puppeteer')
            print(inputs)

            '''
            {'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]}
            '''
    
vocab_filez	vocab.txtzbert-base-uncasedzIhttps://bj.bcebos.com/paddle-hapi/models/bert/bert-base-uncased-vocab.txtzbert-large-uncasedzJhttps://bj.bcebos.com/paddle-hapi/models/bert/bert-large-uncased-vocab.txtzbert-base-casedzGhttps://bj.bcebos.com/paddle-hapi/models/bert/bert-base-cased-vocab.txtzbert-large-casedzHhttps://bj.bcebos.com/paddle-hapi/models/bert/bert-large-cased-vocab.txtzbert-base-multilingual-uncasedzVhttps://bj.bcebos.com/paddle-hapi/models/bert/bert-base-multilingual-uncased-vocab.txtzbert-base-multilingual-casedzThttps://bj.bcebos.com/paddle-hapi/models/bert/bert-base-multilingual-cased-vocab.txtzbert-base-chinesezIhttps://bj.bcebos.com/paddle-hapi/models/bert/bert-base-chinese-vocab.txtzbert-wwm-chinesezRhttp://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-wwm-chinese-vocab.txtzbert-wwm-ext-chinesezVhttp://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-wwm-ext-chinese-vocab.txtzmacbert-large-chinesezmacbert-base-chinesezsimbert-base-chinesezEhttps://bj.bcebos.com/paddlenlp/models/transformers/simbert/vocab.txtzuer/chinese-roberta-basezQhttps://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txtzuer/chinese-roberta-mediumzuer/chinese-roberta-6l-768hzuer/chinese-roberta-smallzuer/chinese-roberta-minizuer/chinese-roberta-tinyr   TFi   rightN[UNK][SEP][PAD][CLS][MASK]c                 K   s`   t j|std||| _| j||d| _|| _|r&t	|||
|d| _
t| j|d| _d S )NzCan't find a vocabulary file at path '{}'. To load the vocabulary from a pretrained model please use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`)r?   )r   r   r   r   )r>   r?   )ospathisfile
ValueErrorformatr   Zload_vocabularyr>   do_basic_tokenizer
   basic_tokenizerr   wordpiece_tokenizer)r   rD   r   rP   r   r?   Z	sep_tokenZ	pad_tokenZ	cls_tokenZ
mask_tokenr   r   kwargsr   r   r   r     s(   zBertTokenizer.__init__c                 C   s
   t | jS )zk
        Return the size of vocabulary.

        Returns:
            int: The size of vocabulary.
        )r1   r>   r   r   r   r   
vocab_size  s   
	zBertTokenizer.vocab_sizec                 C   s   t | jjfi | jS r8   )dictr>   Ztoken_to_idxZadded_tokens_encoderrT   r   r   r   	get_vocab  s   zBertTokenizer.get_vocabc                 C   s^   g }| j r'| jj|| jdD ]}|| jjv r|| q|| j|7 }q|S | j|}|S )z
        End-to-end tokenization for BERT models.

        Args:
            text (str): The text to be tokenized.

        Returns:
            list: A list of string representing converted tokens.
        )r   )rP   rQ   r#   Zall_special_tokensr   r(   rR   )r   r   r    r!   r   r   r   	_tokenize  s   

zBertTokenizer._tokenizec                 C   s   d |dd }|S )ac  
        Converts a sequence of tokens (list of string) to a single string. Since
        the usage of WordPiece introducing `##` to concat subwords, also removes
        `##` when converting.

        Args:
            tokens (list): A list of string representing tokens to be converted.

        Returns:
            str: Converted string from tokens.

        Examples:
            .. code-block::

                from paddlenlp.transformers import BertTokenizer

                berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                tokens = berttokenizer.tokenize('He was a puppeteer')
                '''
                ['he', 'was', 'a', 'puppet', '##eer']
                '''
                strings = tokenizer.convert_tokens_to_string(tokens)
                '''
                he was a puppeteer
                '''
        r   z ##r$   )r   replacestrip)r   tokensZ
out_stringr   r   r   convert_tokens_to_string  s   z&BertTokenizer.convert_tokens_to_stringc                 C   s$   g }g }t | ||r|S dS )ad  
        Returns the number of added tokens when encoding a sequence with special tokens.

        Args:
            pair(bool):
                Whether the input is a sequence pair or a single sequence.
                Defaults to `False` and the input is a single sequence.

        Returns:
            int: Number of tokens added to sequences.
        N)r1    build_inputs_with_special_tokens)r   pairtoken_ids_0token_ids_1r   r   r   num_special_tokens_to_add  s   z'BertTokenizer.num_special_tokens_to_addc                 C   s@   |du r| j g| | jg S | j g}| jg}|| | | | S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens.

        A BERT sequence has the following format:

        - single sequence:      ``[CLS] X [SEP]``
        - pair of sequences:        ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (List[int]):
                List of IDs to which the special tokens will be added.
            token_ids_1 (List[int], optional):
                Optional second list of IDs for sequence pairs. Defaults to None.

        Returns:
            List[int]: List of input_id with the appropriate special tokens.
        N)cls_token_idsep_token_id)r   r_   r`   _cls_sepr   r   r   r]     s
   z.BertTokenizer.build_inputs_with_special_tokensc                 C   s2   |du rdg| dg S dg| dg | dg S )a  
        Build offset map from a pair of offset map by concatenating and adding offsets of special tokens.

        A BERT offset_mapping has the following format:

        - single sequence:      ``(0,0) X (0,0)``
        - pair of sequences:        ``(0,0) A (0,0) B (0,0)``

        Args:
            offset_mapping_ids_0 (List[tuple]):
                List of wordpiece offsets to which the special tokens will be added.
            offset_mapping_ids_1 (List[tuple], optional):
                Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None.

        Returns:
            List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens.
        N)r   r   r   )r   Zoffset_mapping_0Zoffset_mapping_1r   r   r   (build_offset_mapping_with_special_tokens8  s   z6BertTokenizer.build_offset_mapping_with_special_tokensc                 C   sV   | j g}| jg}|du rt|| | dg S t|| | dg t|| dg  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task.

        A BERT sequence pair mask has the following format:
        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (List[int]):
                A list of `inputs_ids` for the first sequence.
            token_ids_1 (List[int], optional):
                Optional second list of IDs for sequence pairs. Defaults to None.

        Returns:
            List[int]: List of token_type_id according to the given sequence(s).
        Nr   r   )rc   rb   r1   )r   r_   r`   re   rd   r   r   r   $create_token_type_ids_from_sequencesQ  s
   (z2BertTokenizer.create_token_type_ids_from_sequencesc                    sz   |r|dur
t dtt fdd|S |dur0dgdgt|  dg dgt|  dg S dgdgt|  dg S )a  
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``encode`` methods.

        Args:
            token_ids_0 (List[int]):
                A list of `inputs_ids` for the first sequence.
            token_ids_1 (List[int], optional):
                Optional second list of IDs for sequence pairs. Defaults to None.
            already_has_special_tokens (bool, optional): Whether or not the token list is already
                formatted with special tokens for the model. Defaults to None.

        Returns:
            List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token.
        NzYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.c                    s   |  j v rdS dS )Nr   r   )Zall_special_ids)r.   rT   r   r   <lambda>  s    z7BertTokenizer.get_special_tokens_mask.<locals>.<lambda>r   r   )rN   r0   mapr1   )r   r_   r`   Zalready_has_special_tokensr   rT   r   get_special_tokens_maskl  s   .z%BertTokenizer.get_special_tokens_maskc                 C   s   | j j|| jS )z=Converts an index (integer) in a token (str) using the vocab.)r>   Z_idx_to_tokengetr?   )r   indexr   r   r   _convert_id_to_token  s   z"BertTokenizer._convert_id_to_token)
TTNrF   rG   rH   rI   rJ   TN)Fr8   )NF)r9   r:   r;   r<   Zresource_files_namesZpretrained_resource_files_mapZpretrained_init_configurationZmax_model_input_sizesZpadding_sider   propertyrU   rW   rX   r\   ra   r]   rf   rg   rj   rm   r   r   r   r   r   %  s   9	
	
	

&






!r   )rK   r%   Ztokenizer_utilsr   r   r   r   r   r   r	   __all__objectr
   r   r   r   r   r   r   <module>   s   $
 /T