o
    *jD                     @   s   d Z ddlZddlmZmZmZmZ ddlZddl	Z
ddlmZ ddlmZmZ ddlmZ ddlmZ e Zdd	iZG d
d dZG dd dZG dd deZdS )z!Tokenization classes for ChatGLM.    N)DictListOptionalUnion)PreTrainedTokenizer)BatchEncodingEncodedInput)PaddingStrategy)loggerzTHUDM/chatglm-6bi   c                   @   sV   e Zd Zdd Zdd Zdee fddZdd	 Zd
d Z	dd Z
dd Zdd ZdS )TextTokenizerc                 C   s&   t  | _| j| | j | _d S N)spmZSentencePieceProcessorspLoad
vocab_size
num_tokens)selfZ
model_path r   k/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/chatglm/tokenization.py__init__   s   
zTextTokenizer.__init__c                 C      | j |S r   )r   ZEncodeAsIdsr   textr   r   r   encode      zTextTokenizer.encodeidsc                 C   r   r   )r   Z	DecodeIds)r   r   r   r   r   decode   r   zTextTokenizer.decodec                 C   r   r   )r   ZEncodeAsPiecesr   r   r   r   tokenize!   r   zTextTokenizer.tokenizec                    s    fdd|D S )Nc                    s   g | ]} j |qS r   r   Z	PieceToId).0tokenr   r   r   
<listcomp>%       z7TextTokenizer.convert_tokens_to_ids.<locals>.<listcomp>r   )r   tokensr   r!   r   convert_tokens_to_ids$   s   z#TextTokenizer.convert_tokens_to_idsc                 C   r   r   r   r   r    r   r   r   convert_token_to_id'   r   z!TextTokenizer.convert_token_to_idc                 C   r   r   )r   Z	IdToPiece)r   idxr   r   r   convert_id_to_token*   r   z!TextTokenizer.convert_id_to_tokenc                 C      | j S r   )r   r!   r   r   r   __len__-      zTextTokenizer.__len__N)__name__
__module____qualname__r   r   r   intr   r   r%   r'   r)   r+   r   r   r   r   r      s    r   c                   @   s   e Zd Z			d#ddZdd Zedefd	d
Zedd Ze	dd Z
e	dd Zed$dedefddZd%defddZ			d&dedee fddZdee defddZ			d&dedee fddZdeeef fd d!Zd"S )'SPTokenizer N  P   Tc                 C   s<   |d usJ || _ || _g d| _|| _|| _t|| _d S )N)[MASK][gMASK]z[sMASK]z
<unused_0><sop><eop>z<ENC>z<dBLOCK>)
vocab_filenum_image_tokensZspecial_tokensmax_blank_lengthbyte_fallbackr   text_tokenizer)r   r8   r9   r:   r;   r   r   r   r   3   s   
zSPTokenizer.__init__c                 C   r*   r   )r<   r!   r   r   r   _get_text_tokenizerE   r,   zSPTokenizer._get_text_tokenizerlengthc                 C   s   | dksJ d|  dS )N   z<|blank_z|>r   )r>   r   r   r   get_blank_tokenH   s   zSPTokenizer.get_blank_tokenc                   C   s   dS )Nz<|tab|>r   r   r   r   r   get_tab_tokenM   s   zSPTokenizer.get_tab_tokenc                 C      | j jS r   )r<   r   r!   r   r   r   num_text_tokensQ   s   zSPTokenizer.num_text_tokensc                 C   s   | j | j S r   )r9   rC   r!   r   r   r   r   U   s   zSPTokenizer.num_tokensr   max_lenc                 C   s<   |  dt } t|ddD ]}|  d| t|} q| S )N	    )replacer1   rA   ranger@   )r   rD   ir   r   r   _encode_whitespacesY   s   zSPTokenizer._encode_whitespacesc                 C   s(   |r| dd}|r| j|| jd}|S )N
<n>)rD   )rI   rL   r:   )r   r   	linebreakwhitespacesr   r   r   _preprocess`   s   zSPTokenizer._preprocessreturnc                    sN     |||}|sd| }  |} fdd|D }|r!|S |dd S )  
        @param text: Text to encode.
        @param linebreak: Whether to encode newline (
) in text.
        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
        rN   c                    s   g | ]}| j  qS r   r9   )r   xr!   r   r   r"   x   s    z&SPTokenizer.encode.<locals>.<listcomp>r?   N)rQ   r=   r   )r   r   rO   rP   add_dummy_prefixtmpr$   r   r!   r   r   h   s   zSPTokenizer.encodetext_idsc                    sz    fdd|D }dd |D }   |}|dd}|t d}td jd D ]}| |d	| }q-|S )
Nc                    s   g | ]	}t | j qS r   )r0   r9   r   Z_idr!   r   r   r"   |   s    z&SPTokenizer.decode.<locals>.<listcomp>c                 S   s   g | ]}|d kr|qS )r   r   rY   r   r   r   r"   }   r#   rN   rM   rE   r?   rF   rH   )r=   r   rI   r1   rA   rJ   r:   r@   )r   rX   r   r   rK   r   r!   r   r   {   s   zSPTokenizer.decodec                 C   s<   |  |||}|sd| }|  |}|r|S |dd S )rS   rN   r?   N)rQ   r=   r   )r   r   rO   rP   rV   r$   r   r   r   r      s
   zSPTokenizer.tokenizerU   c                 C   s   t |tr|| jk rd|S | j|| j S t |tr@|dr7|dr7|dd 	 r7t|dd S | j
|| j S td)Nz
<image_{}>z<image_>   rG   zThe key should be str or int.)
isinstancer0   r9   formatr<   r)   str
startswithendswithisdigitr'   
ValueError)r   rU   r   r   r   __getitem__   s&   



zSPTokenizer.__getitem__N)r2   r3   T)r3   )TT)TTT)r-   r.   r/   r   r=   staticmethodr0   r@   rA   propertyrC   r   r^   rL   rQ   r   r   r   r   r   rc   r   r   r   r   r1   1   sF    






r1   c                       sV  e Zd ZdZddiZeZg dZ								
		d4	d5 fddZe	de
e fddZe	de
e fddZe	dd Zdd Zdd Zdd Z		d6deeee f ded edefd!d"Zd#d$ Zd%d& Zd7d'd(Z	d7d)ee d*e
ee  dee fd+d,Zdejddfd-eeeef ef d.e
e d/ed0e
e d1e
e de fd2d3Z!  Z"S )8ChatGLMTokenizera  
    Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file: Path to the vocabulary file.
        do_lower_case: Use lower case letters.
        remove_space: Remove spaces.
        bos_token: The bos token
        eos_token: The Eos Token
        end_token: The end token
        mask_token: The mask token
        gmask_token: The gmask token
        padding_side: The padding side
        num_image_tokens: The `num_image_tokens` in `SPTokenizer`
    r8   zice_text.model)Z	input_idsattention_maskposition_idsFr6   r7   </s>r4   r5   leftr2   rR   Nc                    sh   t ||
d| _t jd|||	||||||
d	| || _|| _|| _|| _|| _|| _	|| _
|| _d S )NrT   )	do_lower_caseremove_spacepadding_side	bos_token	eos_token	end_token
mask_tokengmask_tokenr9   r   )r1   sp_tokenizersuperr   rk   rl   r8   rn   ro   rp   rq   rr   )r   r8   rk   rl   rn   ro   rp   rq   rr   rm   r9   kwargs	__class__r   r   r      s2   
zChatGLMTokenizer.__init__c                 C   s   | j d u rd S | | j S r   )rr   r%   r!   r   r   r   gmask_token_id   s   
zChatGLMTokenizer.gmask_token_idc                 C   s   | j du rdS | | j S )z
        `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
        set.
        N)rp   r%   r!   r   r   r   end_token_id   s   
zChatGLMTokenizer.end_token_idc                 C   rB   )z Returns vocab size )rs   r   r!   r   r   r   r      s   zChatGLMTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )z Returns vocab as a dict c                    s   i | ]}  ||qS r   )_convert_id_to_token)r   rK   r!   r   r   
<dictcomp>   s    
z.ChatGLMTokenizer.get_vocab.<locals>.<dictcomp>)rJ   r   updateZadded_tokens_encoder)r   Zvocabr   r!   r   	get_vocab   s
   
zChatGLMTokenizer.get_vocabc                 C   s0   | j rd|  }n|}| jr| }|S )NrH   )rl   joinstripsplitrk   lower)r   ZinputsZoutputsr   r   r   preprocess_text  s   z ChatGLMTokenizer.preprocess_textc                 K   s   |  |}| j|}|S )z Returns a tokenized string. )r   rs   r   )r   r   ru   seqr   r   r   	_tokenize  s   
zChatGLMTokenizer._tokenizeT	token_idsskip_special_tokensclean_up_tokenization_spacesc                 K   sH   t |tr|g}t|dkrdS | j|v rtt| jj|}| j|S )Nr    )	r\   r0   lenpad_token_idlistfilter__ne__rs   r   )r   r   r   r   ru   r   r   r   _decode  s   

zChatGLMTokenizer._decodec                 C   
   | j | S )z2 Converts a token (str) in an id using the vocab. rs   r&   r   r   r   _convert_token_to_id#     
z%ChatGLMTokenizer._convert_token_to_idc                 C   r   )z=Converts an index (integer) in a token (str) using the vocab.r   )r   indexr   r   r   rz   '  r   z%ChatGLMTokenizer._convert_id_to_tokenc                 C   s   t j|rt j|| jd }n|}t| jd}| }W d   n1 s(w   Y  t|d}|| W d   |fS 1 sDw   Y  |fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        r8   rbNwb)	ospathisdirr~   vocab_files_namesopenr8   readwrite)r   Zsave_directoryZfilename_prefixr8   ZfinZ	proto_strwriterr   r   r   save_vocabulary+  s   

z ChatGLMTokenizer.save_vocabularytoken_ids_0token_ids_1c                 C   s   | j | j }| j | j }| j | j }||vr||vr||g7 }|d |kr4|d |kr4|| j | j g7 }|| j | j g7 }|durR|rI|d |krN||g7 }||7 }|S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        rG   N)rs   rq   rr   ro   rp   rn   )r   r   r   Zmask_idsZ	gmask_idsZeos_idr   r   r    build_inputs_with_special_tokensF  s   

z1ChatGLMTokenizer.build_inputs_with_special_tokensencoded_inputs
max_lengthpadding_strategypad_to_multiple_ofreturn_attention_maskc                 C   sb  | j | j }| j | j }| j | j }| jdksJ || jd  }	t|	}
|tjkr-t|	}|durC|durC|| dkrC|| d | }|tj	koMt|	|k}|durd|vr||	v r`|	
|}n|
}td|
|
f}t|}d|ddddd|f< t|dk }||d< d|vrtj|
tjd}||	v r|n|}||	v r|	
|}|||d< ttj|tjdtjd|
| d tjdg}tj||gdd	|d< |r/|t|	 }d|v rtj|d d
|df|dfgddd|d< d|v r| jg| |d  |d< d|v rdg| |d  |d< d|v r"tj|d d
|dfgd|d< | jg| |	 || jd < |S )a?  
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        rj   r   NrF   rg   g      ?rh   )Zdtype)Zaxis)r   r   ZconstantT)	pad_widthmodeZconstant_valuesZtoken_type_idsZspecial_tokens_mask)r   )rs   rn   rq   rr   rm   model_input_namesr   r	   ZLONGEST
DO_NOT_PADr   npZonesZtrilZbool_ZarangeZint64ZconcatenateZzerosstackpadZpad_token_type_idr   )r   r   r   r   r   r   Zbos_token_idZmask_token_idrx   Zrequired_inputZ
seq_lengthZneeds_to_be_paddedZcontext_lengthrg   rh   rq   Zmask_positionZblock_position_ids
differencer   r   r   _padl  s    








zChatGLMTokenizer._pad)	FFr6   r7   ri   r4   r5   rj   r2   )rR   N)FTr   )#r-   r.   r/   __doc__r   &PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESZmax_model_input_sizesr   r   re   r   r0   rx   ry   r   r}   r   r   r   r   boolr^   r   r   rz   r   r   r	   r   r   r   r   dictr   __classcell__r   r   rv   r   rf      s    '	
	




)rf   )r   r   typingr   r   r   r   numpyr   Zsentencepiecer   Ztransformers.tokenization_utilsr   Z$transformers.tokenization_utils_baser   r   Ztransformers.utilsr	   Zmodelscope.utilsr
   loggingZ
get_loggerr   r   r1   rf   r   r   r   r   <module>   s    x