o
    *j                     @   sn   d dl Z d dlZd dlmZ d dlZd dlZd dlZe dd Zdd Z	dd Z
d	d
 ZG dd deZdS )    N)	lru_cachec                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | d d  }d}td	D ]}|| vrI| | |d	|  |d7 }q3d
d |D }tt| |S )N!~      ¡   ¬   ®   ÿr      c                 S   s   g | ]}t |qS  )chr).0nr   r   o/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/soonet/tokenizer.py
<listcomp>       z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br   r   r   bytes_to_unicode   s,   





r   c                 C   s6   t  }| d }| dd  D ]}|||f |}q|S )Nr   r   )setadd)wordpairsZ	prev_charcharr   r   r   	get_pairs   s   r!   c                 C   s"   t | } tt| } |  S N)ftfyZfix_texthtmlunescapestriptextr   r   r   basic_clean(   s   
r)   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr&   r'   r   r   r   whitespace_clean.   s   r-   c                   @   s6   e Zd Zdd Zdd Zdd Zdd Zdd
dZdS )SimpleTokenizerc                 C   s   t  | _dd | j D | _t| dd}|dd }dd |D }t	t  
 }|d	d |D  }|D ]
}|d
| q;|ddg tt|tt|| _dd | j D | _tt|tt|| _ddd| _tdtj| _d S )Nc                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>8       z,SimpleTokenizer.__init__.<locals>.<dictcomp>utf-8
r   i  c                 S   s   g | ]}t | qS r   )tuplesplit)r   merger   r   r   r   ;   s    z,SimpleTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )</w>r   )r   r2   r   r   r   r   =   r    <|startoftext|><|endoftext|>c                 S   r/   r   r   r0   r   r   r   r3   B   r4   )r<   r=   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r   byte_encoderitemsbyte_decodergzipopenreaddecoder8   r   valuesr   joinextendr   r   r   lenencoderdecoder	bpe_rankscacher+   compile
IGNORECASEpat)selfZbpe_pathZmergesZvocabr9   r   r   r   __init__6   s(   
zSimpleTokenizer.__init__c           
         sr  | j v r
 j | S t|d d |d d f }t|}|s#|d S 	 t| fddd}| jvr4ny|\}}g }d}|t|k rz|||}	||||	  |	}W n tyg   |||d   Y n4w || |kr|t|d k r||d  |kr|	||  |d	7 }n|	||  |d7 }|t|k sBt|}|}t|dkrnt|}q$d

|}| j |< |S )Nr:   Tc                    s    j | tdS )Ninf)rK   getfloat)pairrP   r   r   <lambda>W   s    z%SimpleTokenizer.bpe.<locals>.<lambda>)keyr   r      r*   )rL   r7   r!   minrK   rH   indexrG   
ValueErrorr   rF   )
rP   tokenr   r   ZbigramfirstsecondZnew_wordijr   rW   r   bpeL   sV   





zSimpleTokenizer.bpec                    sn   g }t t| }t j|D ]#}d fdd|dD }| fdd 	|
dD  q|S )Nr;   c                 3       | ]} j | V  qd S r"   )r>   )r   r   rW   r   r   	<genexpr>{       z)SimpleTokenizer.encode.<locals>.<genexpr>r5   c                 3   rd   r"   )rI   )r   Z	bpe_tokenrW   r   r   re   }   rf   r*   )r-   r)   lowerr+   findallrO   rF   encoderG   rc   r8   )rP   r(   Z
bpe_tokensr^   r   rW   r   ri   w   s   
zSimpleTokenizer.encodec                    sD   d  fdd|D }t fdd|D jddddd	}|S )
Nr;   c                       g | ]} j | qS r   )rJ   )r   r^   rW   r   r   r      r4   z*SimpleTokenizer.decode.<locals>.<listcomp>c                    rj   r   )r@   )r   crW   r   r   r      r4   r5   replace)errorsr:   r*   )rF   	bytearrayrD   rl   )rP   tokensr(   r   rW   r   rD      s   zSimpleTokenizer.decodeM   c                    s   t |tr|g}jd jd   fdd|D }tjt||tjd}t|D ]!\}}t||kr@|d | } |d< t|||d t|f< q,|S )Nr<   r=   c                    s"   g | ]}g |  g qS r   )ri   )r   r(   Z	eot_tokenrP   Z	sot_tokenr   r   r      s    z,SimpleTokenizer.tokenize.<locals>.<listcomp>)ZdtyperR   )	
isinstancestrrI   torchZzerosrH   int	enumerateZtensor)rP   ZtextsZcontext_lengthZ
all_tokensresultra   ro   r   rq   r   tokenize   s   


zSimpleTokenizer.tokenizeN)rp   )__name__
__module____qualname__rQ   rc   ri   rD   rx   r   r   r   r   r.   4   s    +
r.   )rA   r$   	functoolsr   r#   regexr+   rt   r   r!   r)   r-   objectr.   r   r   r   r   <module>   s   
	