# Tokenizer utilities recovered from a compiled module
# (modelscope/preprocessors/nlp/space/tokenizer.py). The BERT and GPT-2
# tokenizers below follow the pytorch-pretrained-bert implementations.

from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import json
import logging
import os
import sys
import unicodedata

import regex as re


def clean_string(string):
    """Undo common detokenization artifacts (spaced hyphens, contractions)."""
    replace_mp = {
        ' - ': '-',
        " ' ": "'",
        " n't": "n't",
        " 'm": "'m",
        ' do not': " don't",
        " 's": "'s",
        " 've": "'ve",
        " 're": "'re"
    }
    for k, v in replace_mp.items():
        string = string.replace(k, v)
    return string


class Tokenizer(object):
    """Thin wrapper that dispatches to a BERT or GPT-2 tokenizer."""

    def __init__(self, vocab_path, special_tokens=[], tokenizer_type='Bert'):
        self.tokenizer_type = tokenizer_type
        if tokenizer_type == 'Bert':
            # Map custom special tokens onto BERT's reserved [unusedN] slots.
            self.spec_convert_dict = {
                '[BOS]': '[unused0]',
                '[EOS]': '[unused1]'
            }
            for token in special_tokens:
                if token not in self.spec_convert_dict and token not in [
                        '[PAD]', '[UNK]'
                ]:
                    self.spec_convert_dict[token] = \
                        f'[unused{len(self.spec_convert_dict)}]'
            self.spec_revert_dict = {
                v: k
                for k, v in self.spec_convert_dict.items()
            }
            special_tokens = [
                self.spec_convert_dict.get(tok, tok) for tok in special_tokens
            ]
            self.special_tokens = ('[UNK]', '[SEP]', '[PAD]', '[CLS]',
                                   '[MASK]')
            self.special_tokens += tuple(
                x for x in special_tokens if x not in self.special_tokens)
            self._tokenizer = BertTokenizer(
                vocab_path, never_split=self.special_tokens)
            for tok in self.special_tokens:
                assert tok in self._tokenizer.vocab, \
                    f"special token '{tok}' is not in the vocabulary"
            self.vocab_size = len(self._tokenizer.vocab)
        elif tokenizer_type == 'GPT2':
            self.spec_convert_dict = {'[UNK]': '<unk>'}
            self.spec_revert_dict = {
                v: k
                for k, v in self.spec_convert_dict.items()
            }
            special_tokens = [
                tok for tok in special_tokens
                if tok not in self.spec_convert_dict
            ]
            vocab_file = os.path.join(vocab_path, 'vocab.json')
            merges_file = os.path.join(vocab_path, 'merges.txt')
            self._tokenizer = GPT2Tokenizer(
                vocab_file, merges_file, special_tokens=special_tokens)
            self.num_specials = len(special_tokens)
            self.vocab_size = len(self._tokenizer)
        else:
            raise ValueError

    def tokenize(self, text):
        return self._tokenizer.tokenize(text)

    def convert_tokens_to_ids(self, tokens):
        if self.tokenizer_type == 'Bert':
            tokens = [self.spec_convert_dict.get(tok, tok) for tok in tokens]
            ids = self._tokenizer.convert_tokens_to_ids(tokens)
            return ids
        else:
            tokens = [self.spec_convert_dict.get(tok, tok) for tok in tokens]
            ids = self._tokenizer.convert_tokens_to_ids(tokens)
            # GPT-2 appends special tokens at the end of the vocabulary; this
            # modular shift moves them to ids 0..num_specials-1 and offsets
            # regular ids accordingly (reversed in convert_ids_to_tokens).
            ids = [(i + self.num_specials) % self.vocab_size for i in ids]
            return ids

    def convert_ids_to_tokens(self, ids):
        if self.tokenizer_type == 'Bert':
            tokens = self._tokenizer.convert_ids_to_tokens(ids)
            tokens = [self.spec_revert_dict.get(tok, tok) for tok in tokens]
            return tokens
        else:
            ids = [(i - self.num_specials) % self.vocab_size for i in ids]
            tokens = self._tokenizer.convert_ids_to_tokens(ids)
            tokens = [self.spec_revert_dict.get(tok, tok) for tok in tokens]
            return tokens

    def decode(self, ids, ignore_tokens=[]):
        tokens = self.convert_ids_to_tokens(ids)
        if len(ignore_tokens) > 0:
            ignore_tokens = set(ignore_tokens)
            tokens = [tok for tok in tokens if tok not in ignore_tokens]
        if self.tokenizer_type == 'Bert':
            string = ' '.join(tokens).replace(' ##', '')
        else:
            string = ''.join(tokens)
            string = bytearray([
                self._tokenizer.byte_decoder[c] for c in string
            ]).decode('utf-8')
        string = clean_string(string)
        return string


logger = logging.getLogger(__name__)


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with open(vocab_file, 'r', encoding='utf-8') as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BertTokenizer(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""

    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 max_len=None,
                 do_basic_tokenize=True,
                 never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
        """Constructs a BertTokenizer.

        Args:
          vocab_file: Path to a one-wordpiece-per-line vocabulary file
          do_lower_case: Whether to lower case the input
                         Only has an effect when do_wordpiece_only=False
          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
          max_len: An artificial maximum length to truncate tokenized sequences to;
                         Effective maximum length is always the minimum of this
                         value (if specified) and the underlying BERT model's
                         sequence length.
          never_split: List of tokens which will never be split during tokenization.
                         Only has an effect when do_wordpiece_only=False
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the "
                'vocabulary from a Google pretrained model use '
                '`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`'
                .format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case, never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(text):
                for sub_token in self.wordpiece_tokenizer.tokenize(token):
                    split_tokens.append(sub_token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        if len(ids) > self.max_len:
            logger.warning(
                'Token indices sequence length is longer than the specified '
                'maximum sequence length for this BERT model ({} > {}). '
                'Running this sequence through BERT will result in indexing '
                'errors'.format(len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids in wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self,
                 do_lower_case=True,
                 never_split=('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')):
        """Constructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = self._clean_text(text)
        text = self._tokenize_chinese_chars(text)
        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case and token not in self.never_split:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(' '.join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize('NFD', text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == 'Mn':
                continue
            output.append(char)
        return ''.join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        if text in self.never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return [''.join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(' ')
                output.append(char)
                output.append(' ')
            else:
                output.append(char)
        return ''.join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
                or (cp >= 0x20000 and cp <= 0x2A6DF)
                or (cp >= 0x2A700 and cp <= 0x2B73F)
                or (cp >= 0x2B740 and cp <= 0x2B81F)
                or (cp >= 0x2B820 and cp <= 0x2CEAF)
                or (cp >= 0xF900 and cp <= 0xFAFF)
                or (cp >= 0x2F800 and cp <= 0x2FA1F)):
            return True
        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(' ')
            else:
                output.append(char)
        return ''.join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          >>> input = "unaffable"
          >>> output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens.
        """
        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = ''.join(chars[start:end])
                    if start > 0:
                        substr = '##' + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically control characters but are treated as
    # whitespace here.
    if char == ' ' or char == '\t' or char == '\n' or char == '\r':
        return True
    cat = unicodedata.category(char)
    if cat == 'Zs':
        return True
    return False


def _is_control(char):
    """Checks whether `chars` is a control character."""
    if char == '\t' or char == '\n' or char == '\r':
        return False
    cat = unicodedata.category(char)
    if cat.startswith('C'):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # All non-letter/number ASCII is treated as punctuation, so characters
    # like "^", "$", and "`" split words even though they are not in the
    # Unicode punctuation category.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith('P'):
        return True
    return False


try:
    from functools import lru_cache
except ImportError:
    # Python 2 fallback: a do-nothing decorator.
    def lru_cache():
        return lambda func: func


@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    _chr = unichr if sys.version_info[0] == 2 else chr  # noqa: F821
    bs = list(range(ord('!'), ord('~') + 1)) + list(
        range(ord('¡'), ord('¬') + 1)) + list(range(ord('®'), ord('ÿ') + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [_chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class GPT2Tokenizer(object):
    """
    GPT-2 BPE tokenizer. Peculiarities:
        - Byte-level BPE
    """

    def __init__(self,
                 vocab_file,
                 merges_file,
                 errors='replace',
                 special_tokens=None,
                 max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file, encoding='utf-8'))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(
            merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Contractions, letter runs, number runs, other symbol runs, and
        # trailing/inner whitespace, as in the original GPT-2 pattern.
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        )

        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)

    def set_special_tokens(self, special_tokens):
        """ Add a list of additional tokens to the encoder.
            The additional tokens are indexed starting from the last index of the
            current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict(
            (tok, len(self.encoder) + i)
            for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {
            v: k
            for k, v in self.special_tokens.items()
        }
        logger.info('Special tokens {}'.format(self.special_tokens))

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Merge the lowest-ranked (earliest-learned) bigram first.
            bigram = min(
                pairs,
                key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except Exception:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[
                        i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def tokenize(self, text):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            # Note: characters above U+00FF are dropped here rather than
            # utf-8 encoded, which is why an all-filtered token can be empty.
            token = ''.join(self.byte_encoder[ord(b)] for b in token
                            if ord(b) in self.byte_encoder)
            if token == '':
                continue
            bpe_tokens.extend(
                bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def convert_tokens_to_ids(self, tokens):
        """ Converts a sequence of tokens into ids using the vocab. """
        ids = []
        python_version_3 = isinstance(tokens, str)
        # Short-circuits on Python 3, so `unicode` is never evaluated there.
        python_version_2 = sys.version_info[0] == 2 and isinstance(
            tokens, unicode)  # noqa: F821
        if python_version_3 or python_version_2:
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.encoder.get(tokens, 0)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.encoder.get(token, 0))
        if len(ids) > self.max_len:
            logger.warning(
                'Token indices sequence length is longer than the specified '
                'maximum sequence length for this OpenAI GPT model ({} > {}). '
                'Running this sequence through the model will result in '
                'indexing errors'.format(len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Converts a sequence of ids in BPE tokens using the vocab."""
        tokens = []
        for i in ids:
            if i in self.special_tokens_decoder:
                if not skip_special_tokens:
                    tokens.append(self.special_tokens_decoder[i])
            else:
                tokens.append(self.decoder[i])
        return tokens

    def encode(self, text):
        return self.convert_tokens_to_ids(self.tokenize(text))

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            'utf-8', errors=self.errors)
        return text
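
# ---------------------------------------------------------------------------
# Usage sketch (editor's addition, not part of the recovered module): a quick,
# self-contained check of the WordPiece greedy longest-match-first algorithm
# and of the BPE helper tables. The tiny vocabulary below is hypothetical;
# real use goes through Tokenizer(vocab_path, tokenizer_type='Bert'), which
# needs an on-disk vocab (and vocab.json/merges.txt for the GPT-2 path).
if __name__ == '__main__':
    demo_vocab = {'[UNK]': 0, 'un': 1, '##aff': 2, '##able': 3}
    wordpiece = WordpieceTokenizer(vocab=demo_vocab)
    # Longest match "un" first, then continuation pieces prefixed with "##".
    print(wordpiece.tokenize('unaffable'))  # ['un', '##aff', '##able']
    # A word with no matching pieces falls back to the unk token.
    print(wordpiece.tokenize('xyzzy'))  # ['[UNK]']

    # bytes_to_unicode maps every byte 0-255 to a distinct printable unicode
    # character, so byte-level BPE never needs <unk> for raw input bytes.
    byte_table = bytes_to_unicode()
    assert len(byte_table) == 256 and len(set(byte_table.values())) == 256

    # get_pairs feeds the BPE merge loop with the adjacent symbol pairs.
    print(get_pairs(('h', 'e', 'l', 'l', 'o')))
    # -> the set {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}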