o
    *jG                     @   s   d Z ddlZddlZddlZddlmZ ddlZddlZddl	Z	e dd Z
e dd Zdd	 Zd
d Zdd ZG dd deZdddZdS )z CLIP Tokenizer.    N)	lru_cachec                   C   s   t jt jt jtdS )Nzbpe_simple_vocab_16e6.txt.gz)ospathjoindirnameabspath__file__ r	   r	   m/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/gemm/tokenizer.pydefault_bpe   s   r   c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a:  
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    !~      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r	   )chr).0nr	   r	   r
   
<listcomp>1       z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br	   r	   r
   bytes_to_unicode   s,   





r"   c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )zReturn set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairsZ	prev_charcharr	   r	   r
   	get_pairs5   s   r(   c                 C   s"   t | } tt| } |  S N)ftfyZfix_texthtmlunescapestriptextr	   r	   r
   basic_cleanA   s   
r0   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr-   r.   r	   r	   r
   whitespace_cleanG   s   r4   c                   @   s8   e Zd Ze fdefddZdd Zdd Zdd	 Zd
S )SimpleTokenizerbpe_pathc                 C   s   t  | _dd | j D | _t| dd}|dd }dd |D }t	t  
 }|d	d |D  }|D ]
}|d
| q;|ddg tt|tt|| _dd | j D | _tt|tt|| _ddd| _tdtj| _d S )Nc                 S      i | ]\}}||qS r	   r	   r   kvr	   r	   r
   
<dictcomp>Q       z,SimpleTokenizer.__init__.<locals>.<dictcomp>utf-8
r   i  c                 S   s   g | ]}t | qS r	   )tuplesplit)r   merger	   r	   r
   r   T   s    z,SimpleTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )</w>r	   )r   r:   r	   r	   r
   r   V   r    <|startoftext|><|endoftext|>c                 S   r7   r	   r	   r8   r	   r	   r
   r;   [   r<   )rD   rE   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r"   byte_encoderitemsbyte_decodergzipopenreaddecoder@   r   valuesr   r   extendr   r   r   lenencoderdecoder	bpe_rankscacher2   compile
IGNORECASEpat)selfr6   ZmergesZvocabrA   r	   r	   r
   __init__O   s(   
zSimpleTokenizer.__init__c              
      s  | j v r
 j | S t|d d |d d f }t|}|s#|d S g }	 t| fddd}| jvr6n|\}}g }d}	|	t|k rz|||	}
|||	|
  |
}	W n  tyw } z|	| |||	d   W Y d }~n8d }~ww ||	 |kr|	t|d k r||	d  |kr|	||  |	d	7 }	n|	||	  |	d7 }	|	t|k sDt|}|}t|dkrnt|}q&t|d
krt
|d  d|}| j |< |S )NrB   Tc                    s    j | tdS )Ninf)rR   getfloat)pairrW   r	   r
   <lambda>q   s    z%SimpleTokenizer.bpe.<locals>.<lambda>)keyr   r      d   r1   )rS   r?   r(   minrR   rO   indexrN   	Exceptionr   printr   )rW   tokenr%   r&   Z
error_listZbigramfirstsecondZnew_wordijerrr	   r^   r
   bpee   s`   






zSimpleTokenizer.bpec                    sn   g }t t| }t j|D ]#}d fdd|dD }| fdd 	|
dD  q|S )NrC   c                 3       | ]} j | V  qd S r)   )rF   )r   r!   r^   r	   r
   	<genexpr>       z)SimpleTokenizer.encode.<locals>.<genexpr>r=   c                 3   rn   r)   )rP   )r   Z	bpe_tokenr^   r	   r
   ro      rp   r1   )r4   r0   lowerr2   findallrV   r   encoderN   rm   r@   )rW   r/   Z
bpe_tokensrg   r	   r^   r
   rs      s   
zSimpleTokenizer.encodec                    sD   d  fdd|D }t fdd|D jddddd	}|S )
NrC   c                       g | ]} j | qS r	   )rQ   )r   rg   r^   r	   r
   r      r<   z*SimpleTokenizer.decode.<locals>.<listcomp>c                    rt   r	   )rH   )r   cr^   r	   r
   r      r<   r=   replace)errorsrB   r1   )r   	bytearrayrL   rv   )rW   tokensr/   r	   r^   r
   rL      s   zSimpleTokenizer.decodeN)	__name__
__module____qualname__r   strrX   rm   rs   rL   r	   r	   r	   r
   r5   M   s
    /
r5   M   Tc                    s   t |tr|g}jd jd   fdd|D }tjt||tjd}t|D ]0\}}t||krO|rC|d| } |d< ntd||  d	| t	|||dt|f< q,|S )
a  
    Returns the tokenized representation of given input string(s)
    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        The context length to use; all CLIP models use 77 as the context length
    truncate: bool
        Whether to truncate the text in case its encoding is longer than the context length
    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length].
    We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long.
    rD   rE   c                    s"   g | ]}g |  g qS r	   )rs   )r   r/   Z	eot_tokenZ	sot_token	tokenizerr	   r
   r      s    z!clip_tokenize.<locals>.<listcomp>)ZdtypeNrY   zInput z  is too long for context length )

isinstancer}   rP   torchZzerosrO   int	enumerateRuntimeErrorZtensor)r   ZtextsZcontext_lengthtruncateZ
all_tokensresultrj   ry   r	   r   r
   clip_tokenize   s$   



r   )r~   T)__doc__rI   r+   r   	functoolsr   r*   regexr2   r   r   r"   r(   r0   r4   objectr5   r   r	   r	   r	   r
   <module>   s"   

X