o
    *j                     @   s|   d dl mZmZ d dlZd dlmZ d dlmZ dedefddZ	d	e
dedefd
dZG dd deZG dd deZdS )    )ListUnionN)AutoTokenizer)GPT2TokenizerFaststart_extra_idmax_lenc                    sv   dt dtf fdd}d}d}| D ]!}|dkr(|d7 }| kr'|||}d}q|||}d}|| }q|||}|S )	z Encode whitespaces to extra tokens in GPT-J.

    >>> encode_whitespaces('a\n  b\n   c', 10, 10)
    'a\n<|extratoken_10|>b\n<|extratoken_11|>c'
    acc_lentextc                    sX   | dkr|S | dkr|d S |  ksJ d  d|  d |  }d| d}|| S )	Nr       zMax whitespace run length z, but found    <|extratoken_|> )r   r	   Zextra_idextra_tokenr   r   r   i/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/codegeex/tokenizer.pypush_acc_space   s   z*encode_whitespaces.<locals>.push_acc_spacer    r   r
   )intstr)r	   r   r   r   r   reschr   r   r   encode_whitespaces	   s   




r   r	   c                 C   s@   t d|d D ]}|d | }d| d}| |d| } q| S )z Decode the whitespace-encoded strings produced by encode_whitespace.

    >>> text = 'a\n  b\n   c'
    >>> s, l = 10, 10
    >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l)
    True
    r   r
   r   r   r   )rangereplace)r	   r   r   ltoken_idtokenr   r   r   decode_whitespaces,   s
   r   c                   @   s   e Zd Z		ddedee defddZdefd	d
ZdefddZdedefddZ	dd Z
defddZdefddZdeeef fddZdd Zdd ZdS ) Code13BDictionaryN	dict_fileextra_token_idspad_to_vocab_sizec                 C   s   t  | _t  | _d| _g | _| dd | dd | dd | dd | | |d u r9dd tdd	D }|D ]}| |d q;|dkrO| | d S d S )
Nr   z<s>z<pad>z</s>z<unk>c                 S   s   g | ]}t |qS r   )r   .0xr   r   r   
<listcomp>O   s    z.Code13BDictionary.__init__.<locals>.<listcomp>iQ  i  )	dict_idx_count_num_symbols_symbols_add_symbol
_load_dictr   _pad_to_vocab_size)selfr"   r#   r$   r   r   r   r   __init__=   s    
zCode13BDictionary.__init__
vocab_sizec                 C   sB   |t |  }|dkrd S td|d D ]}| d|d qd S )Nr   r
   zvocab_pad_token{})lenr   r.   format)r1   r3   Znum_padir   r   r   r0   X   s   z$Code13BDictionary._pad_to_vocab_sizec                 C   sr   t |d*}|D ]}| }|dks|drq| \}}| |t| qW d    d S 1 s2w   Y  d S )Nrr   #)openstrip
startswithsplitr.   r   )r1   r"   flinesymcountr   r   r   r/   _   s   "zCode13BDictionary._load_dictr?   r@   c                 C   s4   | j | j|< || j|< | j| |  j d7  _ d S )Nr
   )r,   r*   r+   r-   append)r1   r?   r@   r   r   r   r.   h   s   
zCode13BDictionary._add_symbolc                 C   s   | j S N)r,   r1   r   r   r   __len__n   s   zCode13BDictionary.__len__c                 C   
   | j | S rB   )r*   )r1   r?   r   r   r   indexq      
zCode13BDictionary.indexidxc                 C   rE   rB   )r-   )r1   rH   r   r   r   stringt   rG   zCode13BDictionary.stringr   c                 C   s   t |tr	t|}| |S rB   )
isinstancer   r   rF   )r1   r   r   r   r   	map_tokenw   s   

zCode13BDictionary.map_tokenc                    s    fdd|D S )Nc                    s   g | ]}  |qS r   )rK   r&   r   rC   r   r   r(   }   s    z0Code13BDictionary.map_tokens.<locals>.<listcomp>r   )r1   tokensr   rC   r   
map_tokens|   s   zCode13BDictionary.map_tokensc                    s     fdd|D }dd |D S )Nc                    s"   g | ]}|d kr
dn  |qS )iP  Z50256)rI   rL   rC   r   r   r(      s    z3Code13BDictionary.decode_tokens.<locals>.<listcomp>c                 S   s   g | ]}| d st|qS )Zvocab_pad_token)r;   r   r%   r   r   r   r(      s    r   )r1   rM   decodedr   rC   r   decode_tokens   s   
zCode13BDictionary.decode_tokens)Nr!   )__name__
__module____qualname__r   r   r   r2   r0   r/   r.   rD   rF   rI   r   rK   rN   rP   r   r   r   r   r    ;   s&    
	r    c                   @   sN   e Zd Z						ddedededed	ef
d
dZdefddZdd ZdS )CodeGeeXTokenizerNEleutherAI/gpt-j-6B
   codegeex-13b	tokenizertokenizer_pathr   r   r"   c                 C   s|   |d ur|nt || _|dvrtd| d|| _|| _|| _|d ur4| jdkr0t|ddnd | _nd | _| jj	| _	d S )N)rW   codegeex-python-13bzInvalid mode z5, choose from ['codegeex-13b', 'codegeex-python-13b']rZ   i   )r$   )
r   Zfrom_pretrainedrX   
ValueErrorr   r   moder    	code_dictZeos_token_id)r1   rX   rY   r   r   r\   r"   r   r   r   r2      s&   	

zCodeGeeXTokenizer.__init__codec                 C   sr   | j dkrt|| j| j}| j|ddj}|S | j dkr7t|| j| j}| j| j|}t	
|dd}|S )NrW   F)Zis_split_into_wordsrZ   r
   r!   )r\   r   r   r   rX   	input_idsr]   rN   encodetorchZ
LongTensorZreshape)r1   r^   r_   r   r   r   encode_code   s   

zCodeGeeXTokenizer.encode_codec                 C   sr   | j dkr| jj|dd}t|| j| j}|S | j dkr7| j| d g}| jj|dd}t|| j| j}|S )NrW   F)Zskip_special_tokensrZ   r   )	r\   rX   decoder   r   r   r]   rP   tolist)r1   r_   r	   Zoutput_coder   r   r   decode_code   s   

zCodeGeeXTokenizer.decode_code)NrU   rV   rV   rW   N)	rQ   rR   rS   r   r   r   r2   rb   re   r   r   r   r   rT      s(    
rT   )typingr   r   ra   Ztransformersr   Ztransformers.models.gpt2r   r   r   r   r   objectr    rT   r   r   r   r   <module>   s   #L