o
    *jE                     @   s   d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlZd dl	m
Z
mZ dgZe dd Ze dd Zd	d
 Zdd Zdd ZG dd deZG dd deZdS )    N)	lru_cache)BertWordPieceTokenizerCharBPETokenizerCLIPTokenizerc                  C   s2   t jt} d| dd d } t j| dS )N/zbpe_simple_vocab_16e6.txt.gz)ospathrealpath__file__joinsplit)root r   |/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/videocomposer/data/tokenizers.pydefault_bpe   s   r   c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a:  
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    !~      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r   )chr).0nr   r   r   
<listcomp>.       z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br   r   r   bytes_to_unicode   s,   





r(   c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )zReturn set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairsZ	prev_charcharr   r   r   	get_pairs2   s   r.   c                 C   s"   t | } tt| } |  S N)ftfyZfix_texthtmlunescapestriptextr   r   r   basic_clean>   s   
r6   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr3   r4   r   r   r   whitespace_cleanD   s   r:   c                   @   s8   e Zd Ze fdefddZdd Zdd Zdd	 Zd
S )SimpleTokenizerbpe_pathc                 C   s   t  | _dd | j D | _t| dd}|dd }dd |D }t	t  
 }|d	d |D  }|D ]
}|d
| q;|ddg tt|tt|| _dd | j D | _tt|tt|| _ddd| _tdtj| _d S )Nc                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>N       z,SimpleTokenizer.__init__.<locals>.<dictcomp>utf-8
r   i  c                 S   s   g | ]}t | qS r   )tupler   )r   merger   r   r   r   Q   s    z,SimpleTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )</w>r   )r   r@   r   r   r   r   S   r    <|startoftext|><|endoftext|>c                 S   r=   r   r   r>   r   r   r   rA   X   rB   )rI   rJ   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r(   byte_encoderitemsbyte_decodergzipopenreaddecoder   r   valuesr"   r   extendr#   r$   r    lenencoderdecoder	bpe_rankscacher8   compile
IGNORECASEpat)selfr<   ZmergesZvocabrF   r   r   r   __init__L   s(   
zSimpleTokenizer.__init__c              
      s  | j v r
 j | S t|d d |d d f }t|}|s#|d S 	 t| fddd}| jvr4n|\}}g }d}|t|k rz|||}	||||	  |	}W n tyt }
 z|||d   t	|
 W Y d }
~
n8d }
~
ww || |kr|t|d k r||d  |kr|
||  |d	7 }n|
||  |d7 }|t|k sBt|}|}t|dkrnt|}q$d
|}| j |< |S )Nr   rG   Tc                    s    j | tdS )Ninf)rW   getfloat)pairr\   r   r   <lambda>m   s    z%SimpleTokenizer.bpe.<locals>.<lambda>)keyr   r      r7   )rX   rE   r.   minrW   rT   indexrS   	Exceptionprintr"   r   )r\   tokenr+   r,   ZbigramfirstsecondZnew_wordijer   rb   r   bpeb   sZ   





zSimpleTokenizer.bpec                    sn   g }t t| }t j|D ]#}d fdd|dD }| fdd 	|
dD  q|S )NrH   c                 3       | ]} j | V  qd S r/   )rK   )r   r'   rb   r   r   	<genexpr>       z)SimpleTokenizer.encode.<locals>.<genexpr>rC   c                 3   rq   r/   )rU   )r   Z	bpe_tokenrb   r   r   rr      rs   r7   )r:   r6   lowerr8   findallr[   r   encoderS   rp   r   )r\   r5   Z
bpe_tokensrj   r   rb   r   rv      s   
zSimpleTokenizer.encodec                    sD   d  fdd|D }t fdd|D jddddd	}|S )
NrH   c                       g | ]} j | qS r   )rV   )r   rj   rb   r   r   r      rB   z*SimpleTokenizer.decode.<locals>.<listcomp>c                    rw   r   )rM   )r   crb   r   r   r      rB   rC   replace)errorsrG   r7   )r   	bytearrayrQ   ry   )r\   tokensr5   r   rb   r   rQ      s   zSimpleTokenizer.decodeN)	__name__
__module____qualname__r   strr]   rp   rv   rQ   r   r   r   r   r;   J   s
    ,
r;   c                   @   s&   e Zd Zd	ddZdd Zdd ZdS )
r   M   c                 C   sB   || _ tt d| _| jjd | _| jjd | _t| jj| _d S )N)r<   rI   rJ   )	lengthr;   r   	tokenizerrU   	sos_token	eos_tokenrT   Z
vocab_size)r\   r   r   r   r   r]      s
   zCLIPTokenizer.__init__c                    sN   t |trt |S t |trt fdd|D S tdt| )Nc                    s   g | ]}  |qS r   )
_tokenizer)r   urb   r   r   r      rB   z*CLIPTokenizer.__call__.<locals>.<listcomp>z:Expected the "sequence" to be a string or a list, but got )
isinstancer   torchZ
LongTensorr   r   	TypeErrortype)r\   sequencer   rb   r   __call__   s   

zCLIPTokenizer.__call__c                 C   sJ   | j |d | jd  }| jg| | jg }|dg| jt|   }|S )Nre   r   )r   rv   r   r   r   rT   )r\   r5   r|   r   r   r   r      s   zCLIPTokenizer._tokenizerN)r   )r}   r~   r   r]   r   r   r   r   r   r   r      s    
	
)rN   r1   r   	functoolsr   r0   regexr8   r   Z
tokenizersr   r   __all__r   r(   r.   r6   r:   objectr;   r   r   r   r   r   <module>   s$   

U