o
    *j8                     @   s   d Z ddlmZmZmZ ddlZddlZddlZddlZddl	Z	dd Z
dd Zdd	 Zd
d Zdd Zdd Zdd Zdd ZG dd deZG dd deZG dd deZdd Zdd Zdd ZdS ) zTokenization classes.    )absolute_importdivisionprint_functionNc           
      C   s   |sdS t d|}|du rdS |d}g d}g d}d}||v r-| s-d}d}d	}d
}	||v r;| r;d}d
}d}d}	|rHtd|||||	f dS )zHChecks whether the casing config is consistent with the checkpoint name.Nz$^.*?([A-Za-z0-9_-]+)/bert_model.ckpt   )zuncased_L-24_H-1024_A-16zuncased_L-12_H-768_A-12zmultilingual_L-12_H-768_A-12zchinese_L-12_H-768_A-12)zcased_L-12_H-768_A-12zcased_L-24_H-1024_A-16zmulti_cased_L-12_H-768_A-12FTFalseZ
lowercasedTrueZcaseda  You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. However, `%s` seems to be a %s model, so you should pass in `--do_lower_case=%s` so that the fine-tuning matches how the model was pre-training. If this error is wrong, please just comment out this check.)rematchgroup
ValueError)
do_lower_caseZinit_checkpointmZ
model_nameZlower_modelsZcased_modelsZis_bad_configZactual_flagZ	case_nameZopposite_flag r   r/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/clip/bert_tokenizer.py validate_case_matches_checkpoint   s8   
r   c                 C   s|   t jrt| tr
| S t| tr| ddS tdt|  t jr:t| tr+| ddS t| t	r2| S tdt|  td)zGConverts `text` to Unicode (if it's not already), assuming utf-8 input.utf-8ignoreUnsupported string type: %s#Not running on Python2 or Python 3?)
sixPY3
isinstancestrbytesdecoder   typePY2unicodetextr   r   r   convert_to_unicodeL   s   



r    c                 C   sz   t jrt| tr
| S t| tr| ddS tdt|  t jr9t| tr'| S t| t	r1| 
dS tdt|  td)zAReturns text encoded in a way suitable for print or `tf.logging`.r   r   r   r   )r   r   r   r   r   r   r   r   r   r   encoder   r   r   r   printable_text`   s   




r"   c                 C   sn   t  }d}t| ddd }	 t| }|sn| }|||< |d7 }qW d   |S 1 s0w   Y  |S )z*Loads a vocabulary file into a dictionary.r   rr   )encodingTr   N)collectionsOrderedDictopenr    readlinestrip)
vocab_filevocabindexreadertokenr   r   r   
load_vocabw   s    
r/   c                 C   s    g }|D ]	}| | |  q|S )z4Converts a sequence of [tokens|ids] using the vocab.)append)r+   itemsoutputitemr   r   r   convert_by_vocab   s   r4   c                 C   
   t | |S Nr4   )r+   tokensr   r   r   convert_tokens_to_ids      
r9   c                 C   r5   r6   r7   )	inv_vocabidsr   r   r   convert_ids_to_tokens   r:   r=   c                 C   s   |   } | sg S |  }|S )z@Runs basic whitespace cleaning and splitting on a piece of text.)r)   split)r   r8   r   r   r   whitespace_tokenize   s
   r?   c                   @   sH   e Zd ZdZdddZdd Zdd Zd	d
 ZedddZ	dd Z
dS )FullTokenizerzRuns end-to-end tokenization.Tc                 C   s>   t || _dd | j D | _t|d| _t| jd| _d S )Nc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>       z*FullTokenizer.__init__.<locals>.<dictcomp>r   )r+   )r/   r+   r1   r;   BasicTokenizerbasic_tokenizerWordpieceTokenizerwordpiece_tokenizer)selfr*   r   r   r   r   __init__   s   
zFullTokenizer.__init__c                 C   s6   g }| j |D ]}| j|D ]}|| qq|S r6   )rH   tokenizerJ   r0   )rK   r   split_tokensr.   Z	sub_tokenr   r   r   rM      s   zFullTokenizer.tokenizec                 C      t | j|S r6   )r4   r+   )rK   r8   r   r   r   r9         z#FullTokenizer.convert_tokens_to_idsc                 C   rO   r6   )r4   r;   )rK   r<   r   r   r   r=      rP   z#FullTokenizer.convert_ids_to_tokensc                 C   s2   dd }d | dd }|r||}|S |S )z< Converts a sequence of tokens (string) in a single string. c                 S   sX   |  dd dd dd dd d	d
 dd dd dd dd dd} | S )z Clean up a list of simple English tokenization artifacts
            like spaces before punctuations and abbreviated forms.
            z ..z ??z !!z ,,z ' 'z n'tzn'tz 'mz'mz 'sz'sz 'vez'vez 'rez're)replace)Z
out_stringr   r   r   clean_up_tokenization   s   
zEFullTokenizer.convert_tokens_to_string.<locals>.clean_up_tokenization z ## )joinrV   r)   )r8   Zclean_up_tokenization_spacesrW   r   Z
clean_textr   r   r   convert_tokens_to_string   s   z&FullTokenizer.convert_tokens_to_stringc                 C   s
   t | jS r6   )lenr+   )rK   r   r   r   
vocab_size   r:   zFullTokenizer.vocab_sizeNT)__name__
__module____qualname____doc__rL   rM   r9   r=   staticmethodr[   r]   r   r   r   r   r@      s    
r@   c                   @   sJ   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dS )rG   zDRuns basic tokenization (punctuation splitting, lower casing, etc.).Tc                 C   s
   || _ dS )znConstructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input.
        NrF   )rK   r   r   r   r   rL      s   
zBasicTokenizer.__init__c                 C   sl   t |}| |}| |}t|}g }|D ]}| jr$| }| |}|| | qtd	|}|S )zTokenizes a piece of text.rX   )
r    _clean_text_tokenize_chinese_charsr?   r   lower_run_strip_accentsextend_run_split_on_puncrZ   )rK   r   Zorig_tokensrN   r.   output_tokensr   r   r   rM      s   


zBasicTokenizer.tokenizec                 C   sB   t d|}g }|D ]}t |}|dkrq
|| q
d|S )z$Strips accents from a piece of text.ZNFDZMnrY   )unicodedata	normalizecategoryr0   rZ   )rK   r   r2   charcatr   r   r   rg      s   

z!BasicTokenizer._run_strip_accentsc                 C   s   t |}d}d}g }|t|k r;|| }t|r!||g d}n|r(|g  d}|d | |d7 }|t|k sdd |D S )z&Splits punctuation on a piece of text.r   TFr   c                 S   s   g | ]}d  |qS )rY   )rZ   )rA   xr   r   r   
<listcomp>  rE   z5BasicTokenizer._run_split_on_punc.<locals>.<listcomp>)listr\   _is_punctuationr0   )rK   r   charsiZstart_new_wordr2   rn   r   r   r   ri      s    
z!BasicTokenizer._run_split_on_puncc                 C   sT   g }|D ] }t |}| |r|d || |d q|| qd|S )z)Adds whitespace around any CJK character.rX   rY   )ord_is_chinese_charr0   rZ   rK   r   r2   rn   cpr   r   r   re     s   



z&BasicTokenizer._tokenize_chinese_charsc                 C   s   |dkr|dks@|dkr|dks@|dkr|dks@|dkr |dks@|d	kr(|d
ks@|dkr0|dks@|dkr8|dks@|dkrB|dkrBdS dS )z6Checks whether CP is the codepoint of a CJK character.i N  i  i 4  iM  i   iߦ i  i? i@ i i  i i   i  i  i TFr   )rK   rz   r   r   r   rx      s    
zBasicTokenizer._is_chinese_charc                 C   sX   g }|D ]"}t |}|dks|dkst|rqt|r!|d q|| qd|S )zBPerforms invalid character removal and whitespace cleanup on text.r   i  rX   rY   )rw   _is_control_is_whitespacer0   rZ   ry   r   r   r   rd   5  s   
zBasicTokenizer._clean_textNr^   )r_   r`   ra   rb   rL   rM   rg   ri   re   rx   rd   r   r   r   r   rG      s    
rG   c                   @   s"   e Zd ZdZd	ddZdd ZdS )
rI   zRuns WordPiece tokenization.[UNK]   c                 C   s   || _ || _|| _d S r6   )r+   	unk_tokenmax_input_chars_per_word)rK   r+   r   r   r   r   r   rL   F  s   
zWordpieceTokenizer.__init__c                 C   s   t |}g }t|D ]m}t|}t|| jkr|| j q
d}d}g }|t|k rit|}d}	||k rUd||| }
|dkrEd|
 }
|
| jv rM|
}	n|d8 }||k s4|	du r\d}n||	 |}|t|k s*|rr|| j q
|	| q
|S )a  Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer.

        Returns:
          A list of wordpiece tokens.
        Fr   NrY   z##r   T)
r    r?   rs   r\   r   r0   r   rZ   r+   rh   )rK   r   rj   r.   ru   Zis_badstartZ
sub_tokensendZ
cur_substrsubstrr   r   r   rM   K  s@   

zWordpieceTokenizer.tokenizeN)r}   r~   )r_   r`   ra   rb   rL   rM   r   r   r   r   rI   C  s    
rI   c                 C   s>   | dks| dks| dks| dkrdS t | }|dkrdS dS )z1Checks whether `chars` is a whitespace character.rX   	
TZsFrk   rm   rn   ro   r   r   r   r|     s    
r|   c                 C   s6   | dks| dks| dkrdS t | }|dv rdS dS )z.Checks whether `chars` is a control character.r   r   r   F)CcZCfTr   r   r   r   r   r{     s   
r{   c                 C   sh   t | }|dkr|dks$|dkr|dks$|dkr|dks$|dkr&|dkr&d	S t| }|d
r2d	S dS )z2Checks whether `chars` is a punctuation character.!   /   :   @   [   `   {   ~   TPF)rw   rk   rm   
startswith)rn   rz   ro   r   r   r   rt     s     

rt   )rb   
__future__r   r   r   r%   osr   rk   r   r   r    r"   r/   r4   r9   r=   r?   objectr@   rG   rI   r|   r{   rt   r   r   r   r   <module>   s*   3	2r>