o
    *j9                     @   sv   d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlZe dd Z	dd Z
dd Zd	d
 ZG dd deZdS )    N)	lru_cachec                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | d d  }d}td	D ]}|| vrI| | |d	|  |d7 }q3d
d |D }tt| |S )N!~      ¡   ¬   ®   ÿr      c                 S   s   g | ]}t |qS  )chr).0nr   r   u/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/cv/vop_retrieval/tokenization_clip.py
<listcomp>       z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br   r   r   bytes_to_unicode   s,   





r   c                 C   s6   t  }| d }| dd  D ]}|||f |}q|S )Nr   r   )setadd)wordpairsZ	prev_charcharr   r   r   	get_pairs"   s   r!   c                 C   s"   t | } tt| } |  S N)ftfyZfix_texthtmlunescapestriptextr   r   r   basic_clean+   s   
r)   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr&   r'   r   r   r   whitespace_clean1   s   r-   c                   @   s4   e Zd Zdd Zdd Zdd Z			dd	d
ZdS )LengthAdaptiveTokenizerc                 C   s   t  | _dd | j D | _|}|dd }dd |D }tt   }|dd |D  }|D ]
}|d| q0|d	d
g t	t
|tt|| _dd | j D | _t	t
|tt|| _d	d
d| _tdtj| _| j| _|j|j | _d S )Nc                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>;       z4LengthAdaptiveTokenizer.__init__.<locals>.<dictcomp>r   i  c                 S   s   g | ]}t | qS r   )tuplesplit)r   merger   r   r   r   >   s    z4LengthAdaptiveTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )</w>r   )r   r2   r   r   r   r   @   r    <|startoftext|><|endoftext|>c                 S   r/   r   r   r0   r   r   r   r3   E   r4   )r:   r;   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r   byte_encoderitemsZbyte_decoderr   valuesr   joinextendr   r   r   lenencoderdecoder	bpe_rankscacher+   compile
IGNORECASEpatvocabZtp_prefix_token_numZtp_suffix_token_numtp_token_num)selfconfigZbpe_pathZmergesrI   r7   r   r   r   __init__9   s,   z LengthAdaptiveTokenizer.__init__c           
         sr  | j v r
 j | S t|d d |d d f }t|}|s#|d S 	 t| fddd}| jvr4ny|\}}g }d}|t|k rz|||}	||||	  |	}W n tyg   |||d   Y n4w || |kr|t|d k r||d  |kr|	||  |d	7 }n|	||  |d7 }|t|k sBt|}|}t|dkrnt|}q$d

|}| j |< |S )Nr8   Tc                    s    j | tdS )Ninf)rD   getfloat)pairrK   r   r   <lambda>^   s    z-LengthAdaptiveTokenizer.bpe.<locals>.<lambda>)keyr   r      r*   )rE   r5   r!   minrD   rA   indexr@   
ValueErrorr   r?   )
rK   tokenr   r   ZbigramfirstsecondZnew_wordijr   rS   r   bpeS   sV   





zLengthAdaptiveTokenizer.bpec                    sn   g }t t| }t j|D ]#}d fdd|dD }| fdd 	|
dD  q|S )Nr9   c                 3       | ]} j | V  qd S r"   )r<   )r   r   rS   r   r   	<genexpr>       z1LengthAdaptiveTokenizer.encode.<locals>.<genexpr>zutf-8c                 3   r`   r"   )rB   )r   Z	bpe_tokenrS   r   r   ra      rb   r*   )r-   r)   lowerr+   findallrH   r?   encoder@   r_   r6   )rK   r(   Z
bpe_tokensrZ   r   rS   r   re   ~   s   
zLengthAdaptiveTokenizer.encodeptTc                    s   dj  }t|tr|g}jd jd   fdd|D }tjt||tjd}t|D ]3\}}	t|	|krWg|	d|d    g }
t	|
||d t|	f< q1t	|	||d t|	f< q1|S )NM   r:   r;   c                    s"   g | ]}g |  g qS r   )re   )r   r(   Z	eot_tokenrK   Z	sot_tokenr   r   r      s    z4LengthAdaptiveTokenizer.__call__.<locals>.<listcomp>)Zdtyper   )
rJ   
isinstancestrrB   torchZzerosrA   long	enumerateZtensor)rK   ZtextsZreturn_tensorspaddingZ
truncationZcontext_lengthZ
all_tokensresultr]   tokensZ
new_tokensr   rh   r   __call__   s&   



z LengthAdaptiveTokenizer.__call__N)rf   TT)__name__
__module____qualname__rM   r_   re   rq   r   r   r   r   r.   7   s    +r.   )gzipr$   os	functoolsr   r#   regexr+   rk   r   r!   r)   r-   objectr.   r   r   r   r   <module>   s   
	