# Tokenizer utilities for the tokenizer classes vendored under
# paddlex/inference/models/common/tokenizer (upstream: PaddleNLP).
#
# NOTE: this readable source was recovered from the compiled CPython 3.10
# bytecode (tokenizer_utils.cpython-310.pyc). Docstrings, error messages, and
# constants are verbatim from the bytecode's string table; control flow marked
# "best effort" follows the upstream PaddleNLP implementation and may differ
# from the compiled original in minor details (e.g. relative import depth).

import bisect
import functools
import inspect
import io
import itertools
import json
import os
import re
import unicodedata
from collections import OrderedDict
from dataclasses import asdict, dataclass
from functools import lru_cache
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import numpy as np

from .....utils import logging
from .....utils.deps import class_requires_deps, is_dep_available
from .tokenizer_utils_base import (
    CHAT_TEMPLATE_CONFIG_NAME,
    AddedToken,
    BatchEncoding,
    EncodedInput,
    EncodedInputPair,
    PaddingStrategy,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PretrainedTokenizerBase,
    TensorType,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import convert_to_dict_message, fn_args_to_dict
from .vocab import Vocab

if is_dep_available("Jinja2"):
    from jinja2 import Template
    from jinja2.exceptions import TemplateError, TemplateSyntaxError
    from jinja2.sandbox import ImmutableSandboxedEnvironment

__all__ = [
    "ChatTemplate",
    "Trie",
    "ChatTemplateMixin",
    "PretrainedTokenizer",
    "InitTrackerMeta",
]


@class_requires_deps("Jinja2")
@dataclass
class ChatTemplate:
    conversation: Optional[List[str]] = None
    system: Optional[str] = None
    query: Optional[str] = None

    @staticmethod
    @lru_cache()
    def _compile_jinja_template(chat_template: str) -> "Template":
        def raise_exception(message):
            raise TemplateError(message)

        jinja_env = ImmutableSandboxedEnvironment(
            trim_blocks=True, lstrip_blocks=True, keep_trailing_newline=True
        )
        jinja_env.globals["raise_exception"] = raise_exception
        return jinja_env.from_string(chat_template)

    def render_conversation(
        self,
        conversation_data: Union[List[str], Dict[str, str]],
        index: int = 0,
        context_data: Dict[str, Any] = {},
    ) -> List[str]:
        """
        Args:
            conversation_data (list[str]): the conversation data which must be two parts
            index (int): the index of the current conversation

        Returns:
            list[str]: the rendered conversation data
        """
        if self.conversation is None:
            raise ValueError(
                "The template for multi-turns is invalid, please check `conversation` field in your chat-template."
            )
        if isinstance(conversation_data, (list, tuple)):
            assert (
                len(conversation_data) == 2
            ), "Each round/turn of conversation must be two participants, eg: [user-query, bot-query]"
            conversation_data = {
                "user": conversation_data[0],
                "bot": conversation_data[1],
                "index": index,
            }
        conversation_data.update(context_data)

        one_turn_conversation = []
        for template in self.conversation:
            template = self._compile_jinja_template(template)
            result = template.render(conversation_data)
            one_turn_conversation.append(result)
        return one_turn_conversation

    def render_query(
        self, query: str, index: int = 0, context_data: Dict[str, Union[int, str]] = {}
    ):
        if self.query is None:
            return query
        template = self._compile_jinja_template(self.query)
        return template.render(query=query, index=index, **context_data)

    def _init_context_data(
        self, context_data: Dict[str, Union[int, str]] = {}
    ) -> Dict[str, Union[int, str]]:
        """init the context data for chat-template"""
        context_data["is_training"] = context_data.get("is_training", False)
        return context_data

    def render_system(self, context_data: Dict[str, Union[int, str]] = {}) -> str:
        if self.system is None:
            return ""
        template = self._compile_jinja_template(self.system)
        return template.render(**context_data)

    def __call__(
        self,
        conversations: Union[List[List[str]], str],
        context_data: Dict[str, Union[int, str]] = {},
    ) -> str:
        """render the conversations by chat-template

        Args:
            conversations (list[list[str]]): the conversations of user and bot

        Returns:
            str: the result of conversation
        """
        if isinstance(conversations, str):
            conversations = [[conversations]]

        context_data = self._init_context_data(context_data)
        context_data["length"] = len(conversations)

        final_query = self.render_system(context_data=context_data)
        for index, conversation in enumerate(conversations[:-1]):
            context_data["is_first"] = index == 0
            context_data["is_last"] = False
            final_query += "".join(
                self.render_conversation(conversation, index=index, context_data=context_data)
            )

        if not isinstance(conversations[-1], list) and not len(conversations[-1]) == 1:
            raise ValueError(
                "The length of last conversation must be one, eg: [[user-query, bot-answer], "
                "[user-query, bot-answer], ..., [user-query]]"
            )
        if len(conversations[-1]) > 1:
            logging.warning(
                f"The last conversation is not a single-round, chat-template will skip the conversation: {conversations[-1][1:]}"
            )

        final_query += self.render_query(
            conversations[-1][0], index=len(conversations) - 1, context_data=context_data
        )
        return final_query

    @classmethod
    def from_dict(cls, config: dict) -> "ChatTemplate":
        return cls(**config)

    @classmethod
    def from_file(cls, file: str) -> "ChatTemplate":
        with open(file, "r", encoding="utf-8") as f:
            config = json.load(f)
        return cls.from_dict(config)


def adapt_stale_fwd_patch(self, name, value):
    """
    Since there are some monkey patches for forward of PretrainedModel, such as
    model compression, we make these patches compatible with the latest forward
    method.
    """
    if name == "forward":
        # In dygraph-to-static, `forward` may already be replaced by a
        # `StaticFunction`; leave it untouched.
        if "StaticFunction" in type(value).__name__:
            return value
        (
            patch_spec_args,
            patch_spec_varargs,
            patch_spec_varkw,
            patch_spec_defaults,
            _,
            _,
            _,
        ) = inspect.getfullargspec(value)
        (spec_args, spec_varargs, spec_varkw, spec_defaults, _, _, _) = inspect.getfullargspec(
            self.forward
        )
        new_args = [
            arg
            for arg in ("output_hidden_states", "output_attentions", "return_dict")
            if arg not in patch_spec_args and arg in spec_args
        ]
        if new_args:
            import paddle

            if value.__module__.startswith("paddlenlp"):
                logging.warning(
                    f"The `forward` method of {self.__class__ if isinstance(self, paddle.nn.Layer) else value} "
                    f"is patched and the patch might be based on an old version which missing some "
                    f"arguments compared with the latest, such as {new_args}. We automatically add "
                    f"compatibility on the patch for these arguments, and maybe the patch should be updated."
                )
            else:
                logging.warning(
                    f"The `forward` method of {self.__class__ if isinstance(self, paddle.nn.Layer) else value} "
                    f"is patched and the patch might be conflict with patches made by paddlenlp "
                    f"which seems have more arguments such as {new_args}."
                )
            if isinstance(value, paddle.nn.Layer) and inspect.isfunction(value.forward):

                @functools.wraps(value.forward)
                def wrap_fwd(*args, **kwargs):
                    for arg in new_args:
                        kwargs.pop(arg, None)
                    return value.forward(*args, **kwargs)

                return wrap_fwd
            else:

                @functools.wraps(value)
                def wrap_fwd(*args, **kwargs):
                    for arg in new_args:
                        kwargs.pop(arg, None)
                    return value(*args, **kwargs)

                return wrap_fwd
    return value


class InitTrackerMeta(type):
    """
    This metaclass wraps the `__init__` method of a class to add `init_config`
    attribute for instances of that class, and `init_config` use a dict to track
    the initial configuration. If the class has `_pre_init` or `_post_init`
    method, it would be hooked before or after `__init__` and called as
    `_pre_init(self, init_fn, init_args)` or `_post_init(self, init_fn, init_args)`.
    Since InitTrackerMeta would be used as metaclass for pretrained model classes,
    which always are Layer and `type(Layer)` is not `type`, thus use `type(Layer)`
    rather than `type` as base class for it to avoid inheritance metaclass
    conflicts.
    """

    def __init__(cls, name, bases, attrs):
        init_func = cls.__init__
        # If attrs has `__init__`, wrap it using the accessible `_pre_init`/`_post_init`.
        # Otherwise there is no need to wrap again since the super class has been wrapped.
        pre_init_func = getattr(cls, "_pre_init", None) if "__init__" in attrs else None
        post_init_func = getattr(cls, "_post_init", None) if "__init__" in attrs else None
        cls.__init__ = InitTrackerMeta.init_and_track_conf(init_func, pre_init_func, post_init_func)
        super(InitTrackerMeta, cls).__init__(name, bases, attrs)

    @staticmethod
    def init_and_track_conf(init_func, pre_init_func=None, post_init_func=None):
        """
        wraps `init_func` which is `__init__` method of a class to add `init_config`
        attribute for instances of that class.
        Args:
            init_func (callable): It should be the `__init__` method of a class.
                warning: `self` always is the class type of down-stream model, eg: BertForTokenClassification
            pre_init_func (callable, optional): If provided, it would be hooked before
                `init_func` and called as `pre_init_func(self, init_func, *init_args, **init_args)`.
                Default None.
            post_init_func (callable, optional): If provided, it would be hooked after
                `init_func` and called as `post_init_func(self, init_func, *init_args, **init_args)`.
                Default None.

        Returns:
            function: the wrapped function
        """

        @functools.wraps(init_func)
        def __impl__(self, *args, **kwargs):
            # registered helper by `pre_init_func`
            if pre_init_func:
                pre_init_func(self, init_func, *args, **kwargs)
            # keep full configuration
            init_func(self, *args, **kwargs)
            # registered helper by `post_init_func`
            if post_init_func:
                post_init_func(self, init_func, *args, **kwargs)
            self.init_config = kwargs
            if args:
                kwargs["init_args"] = args
            kwargs["init_class"] = self.__class__.__name__

        return __impl__

    def __setattr__(self, name, value):
        value = adapt_stale_fwd_patch(self, name, value)
        return super(InitTrackerMeta, self).__setattr__(name, value)
class Trie:
    """
    Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass
    Loose reference https://en.wikipedia.org/wiki/Trie
    """

    def __init__(self):
        self.data = {}

    def add(self, word: str):
        """
        Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation.
        The special key `""` is used to represent termination.

        This function is idempotent, adding twice the same word will leave the trie unchanged

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.add("Hello 友達")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}

        >>> trie.add("Hello")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
        ```
        """
        if not word:
            # Prevent empty string
            return
        ref = self.data
        for char in word:
            ref[char] = char in ref and ref[char] or {}
            ref = ref[char]
        ref[""] = 1

    def split(self, text: str) -> List[str]:
        """
        Will look for the words added to the trie within `text`. Output is the original string splitted along the
        boundaries of the words found.

        This trie will match the longest possible word first !

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS] This is a extra_id_100"]

        >>> trie.add("[CLS]")
        >>> trie.add("extra_id_1")
        >>> trie.add("extra_id_100")
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS]", " This is a ", "extra_id_100"]
        ```
        """
        # States are partial matches currently being explored: {start_index: trie node}.
        states = OrderedDict()
        # Offsets are the boundaries of the produced substrings.
        offsets = [0]
        # Characters before this index have already been consumed by a committed match.
        skip = 0

        for current, current_char in enumerate(text):
            if skip and current < skip:
                continue

            to_remove = set()
            reset = False

            for start, trie_pointer in states.items():
                if "" in trie_pointer:
                    # A match ends here; before committing, look for longer
                    # overlapping matches that started earlier or later.
                    for lookstart, looktrie_pointer in states.items():
                        if lookstart > start:
                            break
                        elif lookstart < start:
                            lookahead_index = current + 1
                            end = current + 1
                        else:
                            lookahead_index = current
                            end = current
                        next_char = text[lookahead_index] if lookahead_index < len(text) else None
                        if "" in looktrie_pointer:
                            start = lookstart
                            end = lookahead_index
                            skip = lookahead_index
                        while next_char in looktrie_pointer:
                            looktrie_pointer = looktrie_pointer[next_char]
                            lookahead_index += 1
                            if "" in looktrie_pointer:
                                start = lookstart
                                end = lookahead_index
                                skip = lookahead_index
                            if lookahead_index == len(text):
                                break
                            next_char = text[lookahead_index]
                    # Commit the longest match found.
                    offsets.append(start)
                    offsets.append(end)
                    reset = True
                    break
                elif current_char in trie_pointer:
                    # The partial match is still alive; advance it.
                    trie_pointer = trie_pointer[current_char]
                    states[start] = trie_pointer
                else:
                    # The partial match died.
                    to_remove.add(start)

            if reset:
                states = {}
            else:
                for start in to_remove:
                    del states[start]

            # Begin a new partial match at the current character.
            if current >= skip and current_char in self.data:
                states[current] = self.data[current_char]

        # Handle matches that end exactly at the end of the string.
        for start, trie_pointer in states.items():
            if "" in trie_pointer:
                end = len(text)
                offsets.append(start)
                offsets.append(end)
                break

        return self.cut_text(text, offsets)

    def cut_text(self, text, offsets):
        # We have all the offsets now; do the actual splitting, adding the first
        # part of the string and the eventual last part.
        offsets.append(len(text))
        tokens = []
        start = 0
        for end in offsets:
            if start > end:
                logging.error(
                    "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it anyway."
                )
                continue
            elif start == end:
                # This might happen if there's a match at index 0; we're also
                # preventing zero-width cuts in case of two consecutive matches.
                continue
            tokens.append(text[start:end])
            start = end
        return tokens
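# Longest-match behavior of the trie at a glance (illustrative only; mirrors the
# doctest in `Trie.split` above):
#
#     trie = Trie()
#     trie.add("extra_id_1")
#     trie.add("extra_id_100")
#     trie.split("This is a extra_id_100")
#     # -> ["This is a ", "extra_id_100"]   (the longer "extra_id_100" wins)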
def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str):
    """
    Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.
    """
    insertion_idx = bisect.bisect_left(token_list, new_token)
    # Check that new_token is not already in the sorted token_list.
    if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:
        # new_token is in token_list, don't add
        return
    token_list.insert(insertion_idx, new_token)


def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_nonnormalized_char(char):
    """Check whether `chars` is a non-normalized character."""
    cp = ord(char)
    if (
        (0xFF00 <= cp <= 0xFFEF)  # Halfwidth and Fullwidth Forms
        or (0xFE50 <= cp <= 0xFE6B)  # Small Form Variants
        or (0x3358 <= cp <= 0x33FF)  # CJK Compatibility
        or (0x249C <= cp <= 0x24E9)  # Enclosed Alphanumerics (letters)
        or (0x3220 <= cp <= 0x32FF)  # Enclosed CJK Letters and Months
    ):
        return True
    return False


def _is_nonnormalized_numeric(char):
    """Check whether `chars` is a non-normalized numeric character."""
    cp = ord(char)
    if (
        (0x2460 <= cp <= 0x249B)  # Enclosed Alphanumerics (circled/parenthesized numbers)
        or (0x24EA <= cp <= 0x24FF)  # Enclosed Alphanumerics (more circled numbers)
        or (0x2776 <= cp <= 0x2793)  # Dingbats (negative circled digits)
        or (0x2160 <= cp <= 0x217F)  # Number Forms (Roman numerals)
    ):
        return True
    return False
def normalize_chars(text):
    """
    Normalize the text for multilingual and Chinese models. Unicode range:
    https://www.ling.upenn.edu/courses/Spring_2003/ling538/UnicodeRanges.html
    """
    output = []
    for char in text:
        if _is_nonnormalized_char(char):
            for c in unicodedata.normalize("NFKC", char):
                output.append(c)
        elif _is_nonnormalized_numeric(char):
            output.append(" ")
            for c in str(int(unicodedata.numeric(char))):
                output.append(c)
            output.append(" ")
        elif ord(char) == 0xF979:
            # A compatibility ideograph that NFKC does not fold as expected.
            output.append("凉")
        else:
            output.append(char)
    return "".join(output)


class ChatTemplateMixin:
    chat_template: Optional[Union[ChatTemplate, "Template"]] = None

    def apply_chat_template(
        self,
        conversation: Union[List[List[str]], List[Dict[str, str]], str],
        tokenize: bool = True,
        context_data: Dict[str, Any] = {},
        **tokenizer_kwargs,
    ) -> Union[str, dict]:
        """apply chat_template rules to conversation which should not be batched data

        Args:
            conversation (List[List[str]] | str): the conversation messages between user and bot
            context_data (Dict[str, Any]): the context data for chat_template.json
            tokenize (bool, optional): whether do tokenization. Defaults to True.

        Returns:
            str | dict[str, Union[numpy.ndarray, paddle.Tensor]]: return the result of applied data
        """
        if not self.chat_template:
            raise ValueError("chat_template is not set, please set chat_template first.")
        if isinstance(self.chat_template, Template):
            add_generation_prompt = tokenizer_kwargs.pop("add_generation_prompt", True)
            query = self._apply_chat_template(conversation, add_generation_prompt=add_generation_prompt)
        elif isinstance(self.chat_template, ChatTemplate):
            query = self._apply_chat_template_paddle(conversation, context_data)

        if not tokenize:
            return query

        # The rendered template already contains the special tokens.
        tokenizer_kwargs["add_special_tokens"] = False
        return self(query, **tokenizer_kwargs)

    def _apply_chat_template_paddle(
        self,
        conversation: Union[List[List[str]], str],
        context_data: Dict[str, Any] = {},
    ) -> str:
        context_data = self.chat_template._init_context_data(context_data)
        if isinstance(conversation, str):
            conversation = [[conversation]]
        elif isinstance(conversation, list) and isinstance(conversation[0], str):
            raise ValueError(
                "apply_chat_template do not support applying batch conversations, "
                "so you should apply the conversation one by one."
            )
        query = self.chat_template(conversation, context_data=context_data)
        return query

    def _apply_chat_template(
        self,
        conversation: Union[List[List[str]], List[Dict[str, str]], str],
        add_generation_prompt=True,
    ) -> str:
        if isinstance(conversation, str):
            conversations = [{"role": "user", "content": conversation}]
        elif isinstance(conversation, list):
            assert len(conversation) > 0, "empty conversation is not allowed"
            if isinstance(conversation[0], list):
                conversations = convert_to_dict_message(conversation)
            elif isinstance(conversation[0], dict):
                conversations = conversation
            else:
                raise ValueError(
                    "apply_chat_template do not support applying batch conversations, "
                    "so you should apply the conversation one by one."
                )
        try:
            query = self.chat_template.render(
                messages=conversations,
                **self.special_tokens_map,
                add_generation_prompt=add_generation_prompt,
            )
        except TypeError:
            # The content of a message is a list of parts (e.g. multimodal input);
            # keep only the text part before re-rendering.
            for i in range(len(conversations)):
                content = conversations[i]["content"]
                if isinstance(content, list):
                    new_content = ""
                    for part in content:
                        if part.get("type") == "text":
                            new_content = part["text"]
                            break
                    conversations[i]["content"] = new_content
            query = self.chat_template.render(
                messages=conversations,
                **self.special_tokens_map,
                add_generation_prompt=add_generation_prompt,
            )
        return query

    def encode_chat_inputs(
        self,
        conversations: List[List[str]],
        context_data: Dict[str, Any] = {},
        **kwargs,
    ):
        """Encodes conversation to pairs of token ids.
        Turn 0: bos + system + sep + user     bot + eos
        Turn t: sep + bot + query             bot + eos

        Args:
            conversations (List[List[str]]): the conversation of data
            context_data (Dict[str, Any]): the context data of conversation

        Returns:
            List[list[int], list[int]]: the pair of input_ids and target_ids
        """
        if not self.chat_template:
            raise ValueError("chat_template is not set, please set chat_template first.")
        if isinstance(self.chat_template, Template):
            add_generation_prompt = kwargs.pop("add_generation_prompt", True)
            query = self._encode_chat_inputs(
                conversations, context_data, add_generation_prompt=add_generation_prompt
            )
        elif isinstance(self.chat_template, ChatTemplate):
            query = self._encode_chat_inputs_paddle(conversations, context_data)
        return query

    def _encode_chat_inputs_paddle(
        self, conversations: List[List[str]], context_data: Dict[str, Any] = {}
    ):
        context_data = self.chat_template._init_context_data(context_data)
        # encode system
        result = {}
        if self.chat_template.system:
            system = self.chat_template.render_system(context_data)
            result["system"] = self.encode(system, add_special_tokens=False)["input_ids"]

        # encode conversation
        conversation_ids = []
        for index, conversation in enumerate(conversations):
            # give more control to the chat-template
            context_data["is_first"] = index == 0
            context_data["is_last"] = index == len(conversations) - 1

            user_input, bot_output = self.chat_template.render_conversation(
                conversation, index=index, context_data=context_data
            )
            user_ids = self.encode(user_input, add_special_tokens=False)["input_ids"]
            bot_ids = self.encode(bot_output, add_special_tokens=False)["input_ids"]
            conversation_ids.append([user_ids, bot_ids])

        result["conversations"] = conversation_ids
        return result

    def _encode_chat_inputs(
        self,
        conversations: List[List[str]],
        context_data: Dict[str, Any] = {},
        system: str = None,
        add_generation_prompt=True,
    ):
        result = {}

        # Some templates do not support a system message, so check it first.
        if system:
            try:
                self.chat_template.render(messages={"role": "system", "content": system})
            except Exception:
                raise ValueError("System is not supported in this tokenizer.")

        # convert list msg to role dict msg
        conversation_dict = []
        origin_msg = []
        for round in conversations:
            round_role = [
                {"role": "user", "content": round[0]},
                {"role": "assistant", "content": round[1]},
            ]
            origin_msg.extend(round_role)
            conversation_dict.append(round_role)
        ans = []

        # Get the answer in a single round, then render the chat entirely and
        # split by the single-round answers. Attention: the answer should include
        # the end token!
        for conv in conversation_dict:
            roundi = [system] + conv if system else conv
            roundi_str = self.chat_template.render(
                messages=roundi, add_generation_prompt=False, **self.special_tokens_map
            )
            roundi_no_ans = [system] + [conv[0]] if system else [conv[0]]
            roundi_no_ans_str = self.chat_template.render(
                messages=roundi_no_ans,
                add_generation_prompt=add_generation_prompt,
                **self.special_tokens_map,
            )
            ans_roundi = roundi_str[len(roundi_no_ans_str):]
            ans.append(ans_roundi)

        non_learnable_parts = self._extract_non_learnable_parts(origin_msg, ans)
        assert len(non_learnable_parts) == len(
            ans
        ), f"Get non_learnable_parts len: {len(non_learnable_parts)}, but ans len: {len(ans)}."

        conversation_ids = []
        for i in range(len(non_learnable_parts)):
            conversation_ids.append(
                self.batch_encode(
                    [non_learnable_parts[i], ans[i]],
                    add_special_tokens=False,
                    padding=False,
                )["input_ids"]
            )

        result["conversations"] = conversation_ids
        return result

    def _extract_non_learnable_parts(self, origin_msg: List[Dict[str, str]], split_s: List[str]):
        """Split the entire chat by specified words. Extract the non-learnable parts."""
        # Escape the split words so they can be used verbatim in a regex: e.g. | -> \|
        regex_pattern = "|".join(map(re.escape, split_s))
        # Split the fully rendered chat by the per-round answers.
        non_learnable_parts = re.split(
            r"(?:%s)" % regex_pattern,
            self.chat_template.render(
                messages=origin_msg, add_generation_prompt=False, **self.special_tokens_map
            ),
        )
        if non_learnable_parts[-1] == "":
            non_learnable_parts.pop()
        return non_learnable_parts

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        cache_dir = kwargs.pop("cache_dir", None)
        from_hf_hub = kwargs.pop("from_hf_hub", False)
        from_aistudio = kwargs.pop("from_aistudio", False)
        subfolder = kwargs.pop("subfolder", None)
        if subfolder is None:
            subfolder = ""
        kwargs["subfolder"] = subfolder
        kwargs["cache_dir"] = cache_dir
        kwargs["from_hf_hub"] = from_hf_hub
        kwargs["from_aistudio"] = from_aistudio
        kwargs["return_tokenizer_file_dir"] = True
        tokenizer, tokenizer_config_file_dir = super().from_pretrained(
            pretrained_model_name_or_path, *args, **kwargs
        )

        # Load the chat template from chat_template.json if present.
        chat_template_file = os.path.join(tokenizer_config_file_dir, CHAT_TEMPLATE_CONFIG_NAME)
        if not os.path.exists(chat_template_file):
            return tokenizer

        if tokenizer.chat_template is not None:
            logging.warning(
                "Chat-template already exists in config file, it will be overwritten by chat_template.json file."
            )
        logging.warning(
            "`chat_template.json` will be deprecated in the future! Please set it in `tokenizer_config.json`."
        )
        tokenizer.init_chat_template(chat_template_file)
        return tokenizer

    def init_chat_template(self, chat_template: Union[str, dict]):
        """init chat_template by file_path or template dict data

        Args:
            chat_template (str, dict): file_path or template dict data
        """
        if isinstance(chat_template, str):
            if not os.path.exists(chat_template):
                try:
                    self.chat_template: Template = ChatTemplate._compile_jinja_template(chat_template)
                except TemplateSyntaxError:
                    # It is neither a jinja string nor a path string.
                    raise TemplateSyntaxError(
                        "The chat-template in json is not valid jinja string: {}".format(chat_template),
                        lineno=0,  # fake lineno, useless required msg
                    )
            else:
                self.chat_template = ChatTemplate.from_file(chat_template)
        elif isinstance(chat_template, dict):
            self.chat_template = ChatTemplate.from_dict(chat_template)
        elif isinstance(chat_template, ChatTemplate):
            self.chat_template = chat_template
        else:
            raise ValueError("Receive error chat_template data: ", chat_template)

    def save_resources(self, save_directory):
        if isinstance(self.chat_template, ChatTemplate):
            chat_template_file = os.path.join(save_directory, CHAT_TEMPLATE_CONFIG_NAME)
            with open(chat_template_file, "w", encoding="utf-8") as f:
                json.dump(asdict(self.chat_template), f, ensure_ascii=False, indent=4)
            logging.info("Chat-template config file saved in " + chat_template_file)


class PretrainedTokenizer(ChatTemplateMixin, PretrainedTokenizerBase, metaclass=InitTrackerMeta):
    """
    Base class for all tokenizers.

    Inherits from [`~tokenizer_utils_base.PretrainedTokenizerBase`].

    Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading
    pretrained tokenizers as well as adding tokens to the vocabulary.

    This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).

    - **resource_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each
        vocabulary file required by the model, and as associated values, the filename for saving the associated file
        (string).
    - **pretrained_resource_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
        high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the
        low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the
        associated pretrained vocabulary file.
    - **max_model_input_sizes** (`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the `short-cut-names`
        of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model,
        or `None` if the model has no maximum input size.
    - **pretrained_init_configuration** (`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
        `short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments to
        pass to the `__init__` method of the tokenizer class for this pretrained model when loading the tokenizer
        with the [`~tokenizer_utils_base.PretrainedTokenizerBase.from_pretrained`] method.
    - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
    - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
        Should be `'right'` or `'left'`.
    - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
        applied. Should be `'right'` or `'left'`.

    Moreover, methods common to tokenizers for tokenization, token/id conversion
    and encoding as model inputs are also provided here.

    Besides, metaclass `InitTrackerMeta` is used to create `PretrainedTokenizer`,
    by which subclasses can track arguments for initialization automatically
    and expose special tokens initialization used as attributes.
    """

    added_tokens_encoder: Dict[str, int] = {}
    added_tokens_decoder: Dict[int, str] = {}
    unique_no_split_tokens: List[str] = []
    tokens_trie = Trie()

    _decode_use_source_tokenizer = False

    def _pre_init(self, original_init, *args, **kwargs):
        """
        It would be hooked before `__init__` to add specials tokens (arguments of
        `__init__` whose name ends with `_token`) as attributes of the tokenizer
        instance.
        """
        init_dict = fn_args_to_dict(original_init, *((self,) + args), **kwargs)
        init_dict.pop("self", None)
        super(PretrainedTokenizer, self).__init__(**init_dict)

        self.added_tokens_decoder: Dict[int, AddedToken] = {}
        self.added_tokens_decoder.update(kwargs.pop("added_tokens_decoder", {}))
        self.added_tokens_encoder: Dict[str, int] = {
            str(token): idx for idx, token in self.added_tokens_decoder.items()
        }
        self.unique_no_split_tokens: List[str] = []
        self.tokens_trie = Trie()

        self._decode_use_source_tokenizer = False

    def _build_special_tokens_map_extended(self, **kwargs):
        for key, value in kwargs.items():
            if value is None:
                continue
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                if key == "additional_special_tokens":
                    assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
                    assert all(
                        isinstance(t, (str, AddedToken)) for t in value
                    ), "One of the tokens is not a string or an AddedToken"
                    setattr(self, key, value)
                elif isinstance(value, (str, AddedToken)):
                    setattr(self, key, value)
                else:
                    raise TypeError(
                        f"special token {key} has to be either str or AddedToken but got: {type(value)}"
                    )

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        raise NotImplementedError

    @property
    def is_fast(self) -> bool:
        return False

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return self.added_tokens_encoder

    def __len__(self):
        """
        Size of the full vocabulary with the added tokens.
        """
        return self.vocab_size + len(self.added_tokens_encoder)

    def _add_tokens(
        self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False
    ) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from length of the current vocabulary.

        Args:
            new_tokens (`List[str]`or `List[AddedToken]`):
                Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
                checking if the tokenizer assign the index of the `unk_token` to them).
            special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the tokens should be added as special tokens.

        Returns:
            `int`: The number of tokens actually added to the vocabulary.

        Examples:

        ```python
        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        model = BertModel.from_pretrained("bert-base-uncased")

        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
        print("We have added", num_added_toks, "tokens")
        ```"""
        new_tokens = [str(tok) for tok in new_tokens]

        tokens_to_add = []
        for token in new_tokens:
            if not isinstance(token, str):
                raise TypeError(f"Token {token} is not a string but a {type(token)}.")
            if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
                token = token.lower()
            if (
                token != self.unk_token
                and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
                and token not in tokens_to_add
                and token not in self.added_tokens_encoder.keys()
            ):
                tokens_to_add.append(token)
                if self.verbose:
                    logging.info(f"Adding {token} to the vocabulary")

        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
        self.added_tokens_encoder.update(added_tok_encoder)
        self.added_tokens_decoder.update(added_tok_decoder)

        # Make sure we don't split on any special token (even if they were
        # already in the vocab before, e.g. for Albert).
        if special_tokens:
            if len(new_tokens) == 1:
                _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0])
            else:
                self.unique_no_split_tokens = sorted(
                    set(self.unique_no_split_tokens).union(set(new_tokens))
                )
        else:
            # Or on the newly added tokens.
            if len(tokens_to_add) == 1:
                _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0])
            else:
                self.unique_no_split_tokens = sorted(
                    set(self.unique_no_split_tokens).union(set(tokens_to_add))
                )
        self._create_trie(self.unique_no_split_tokens)

        return len(tokens_to_add)

    def _create_trie(self, unique_no_split_tokens):
        trie = Trie()
        for token in unique_no_split_tokens:
            if hasattr(self, "do_lower_case") and self.do_lower_case and token not in self.all_special_tokens:
                trie.add(token.lower())
            else:
                trie.add(token)
        self.tokens_trie = trie

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
            kwargs:
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
        """
        return (text, kwargs)

    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
        """
        Converts a string in a sequence of tokens, using the tokenizer.

        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
        (BPE/SentencePieces/WordPieces). Takes care of added tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.

        Returns:
            `List[str]`: The list of tokens.
        """
        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)

        all_special_tokens_extended = dict(
            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
        )

        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        if hasattr(self, "do_lower_case") and self.do_lower_case:
            # Convert non-special tokens to lowercase.
            escaped_special_toks = [
                re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
            ]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

        if split_special_tokens:
            no_split_token = []
            tokens = [text]
        else:
            no_split_token = set(self.unique_no_split_tokens)
            tokens = self.tokens_trie.split(text)

        # ["This is something", "<special_token_1>", "  else"]
        for i, token in enumerate(tokens):
            if token in no_split_token:
                tok_extended = all_special_tokens_extended.get(token, None)
                left = tokens[i - 1] if i > 0 else None
                right = tokens[i + 1] if i < len(tokens) - 1 else None
                if isinstance(tok_extended, AddedToken):
                    if tok_extended.rstrip and right:
                        # A bit counter-intuitive but we strip the left of the
                        # following string since tok_extended.rstrip means the
                        # special token is eating all white spaces on its right.
                        tokens[i + 1] = right.lstrip()
                    # Strip white spaces on the left.
                    if tok_extended.lstrip and left:
                        tokens[i - 1] = left.rstrip()  # Opposite here

        tokenized_text = []
        for token in tokens:
            # Need to skip eventual empty (fully stripped) tokens.
            if not token:
                continue
            if token in no_split_token:
                tokenized_text.append(token)
            else:
                tokenized_text.extend(self._tokenize(token))
        return tokenized_text

    def _tokenize(self, text, **kwargs):
        """
        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.
        """
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens):
        if tokens is None:
            return None
        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)
        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

    def _convert_token_to_id_with_added_voc(self, token):
        if token is None:
            return None
        if token in self.added_tokens_encoder:
            return self.added_tokens_encoder[token]
        return self._convert_token_to_id(token)

    def _convert_token_to_id(self, token):
        return self.vocab.to_indices(token)

    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (list of string) to a single string by
        using ``' '.join(tokens)`` .

        Args:
            tokens (list[str]): A sequence of tokens.

        Returns:
            str: Converted string.
        """
        return " ".join(tokens)

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        if isinstance(ids, int):
            if ids in self.added_tokens_decoder:
                token = self.added_tokens_decoder[ids]
                return token.content if isinstance(token, AddedToken) else token
            else:
                return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            if index in self.added_tokens_decoder:
                token = self.added_tokens_decoder[index]
                tokens.append(token.content if isinstance(token, AddedToken) else token)
            else:
                tokens.append(self._convert_id_to_token(index))
        return tokens

    def _convert_id_to_token(self, index):
        return self.vocab.to_tokens(index)

    @staticmethod
    def load_vocabulary(filepath, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs):
        """
        Instantiate an instance of `Vocab` from a file reserving all tokens
        by using `Vocab.from_dict`. The file contains a token per line, and the
        line number would be the index of corresponding token.

        Args:
            filepath (str): path of file to construct vocabulary.
            unk_token (str): special token for unknown token. If no need, it also
                could be `None`. Defaults to `None`.
            pad_token (str): special token for padding token. If no need, it also
                could be `None`. Defaults to `None`.
            bos_token (str): special token for bos token. If no need, it also
                could be `None`. Defaults to `None`.
            eos_token (str): special token for eos token. If no need, it also
                could be `None`. Defaults to `None`.
            **kwargs (dict): keyword arguments for `Vocab.from_dict`.

        Returns:
            Vocab: An instance of `Vocab`.
        """
        token_to_idx = {}
        with io.open(filepath, "r", encoding="utf-8") as f:
            for index, line in enumerate(f):
                token = line.rstrip("\n")
                token_to_idx[token] = int(index)
        vocab = Vocab.from_dict(
            token_to_idx,
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs,
        )
        return vocab

    @staticmethod
    def save_vocabulary(filepath, vocab):
        """
        Save all tokens to a vocabulary file. The file contains a token per line,
        and the line number would be the index of corresponding token.

        Args:
            filepath (str): File path to be saved to.
            vocab (Vocab|dict): The `Vocab` or `dict` instance to be saved.
        """
        if isinstance(vocab, Vocab):
            tokens = vocab.idx_to_token
        else:
            tokens = sorted(vocab.keys(), key=lambda token: vocab[token])
        with io.open(filepath, "w", encoding="utf-8") as f:
            for token in tokens:
                f.write(token + "\n")

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``encode`` methods.

        Args:
            token_ids_0 (List[int]): List of ids of the first sequence.
            token_ids_1 (List[int], optional): List of ids of the second sequence.
            already_has_special_tokens (bool, optional): Whether or not the token list is already
                formatted with special tokens for the model. Defaults to None.

        Returns:
            results (List[int]): The list of integers in the range [0, 1]:
                1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    def num_special_tokens_to_add(self, pair=False):
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        Args:
            pair (bool, optional):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence. Defaults to `False`.
        Returns:
            int: Number of special tokens added to sequences.
        """
        token_ids_0 = []
        token_ids_1 = []
        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[Literal["right", "left"]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_position_ids: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_split_into_words:
                    tokens = list(
                        itertools.chain(
                            *(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)
                        )
                    )
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                if is_split_into_words:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`."
                    )
                raise ValueError(
                    f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

        first_ids = get_input_ids(text)
        second_ids = get_input_ids(text_pair) if text_pair is not None else None

        if return_offsets_mapping:
            kwargs["text"] = text
            kwargs["text_pair"] = text_pair

        return self.prepare_for_model(
            first_ids,
            pair_ids=second_ids,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_position_ids=return_position_ids,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[Literal["right", "left"]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_position_ids: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_dict: bool = True,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_split_into_words:
                    tokens = list(
                        itertools.chain(
                            *(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)
                        )
                    )
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                raise ValueError(
                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

        input_ids = []
        for ids_or_pair_ids in batch_text_or_text_pairs:
            if not isinstance(ids_or_pair_ids, (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            else:
                ids, pair_ids = ids_or_pair_ids

            first_ids = get_input_ids(ids)
            second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
            input_ids.append((first_ids, second_ids))

        if stride > 0 and second_ids is not None:
            kwargs["batch_text_or_text_pairs"] = batch_text_or_text_pairs
        elif return_offsets_mapping:
            has_pair = False
            if len(batch_text_or_text_pairs) > 0 and isinstance(batch_text_or_text_pairs[0], (list, tuple)):
                has_pair = True
            kwargs["texts"] = None
            kwargs["text_pairs"] = None
            if has_pair:
                kwargs["texts"] = [text[0] for text in batch_text_or_text_pairs]
                kwargs["text_pairs"] = [text[1] for text in batch_text_or_text_pairs]
            else:
                kwargs["texts"] = [text for text in batch_text_or_text_pairs]

        batch_outputs = self._batch_prepare_for_model(
            input_ids,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_position_ids=return_position_ids,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_dict=return_dict,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
            **kwargs,
        )

        return batch_outputs

    def _batch_prepare_for_model(
        self,
        batch_ids_pairs: List[Tuple[List[int], Optional[List[int]]]],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[Literal["right", "left"]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_position_ids: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_dict: bool = True,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user defined stride) for overflowing tokens

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        """
        if return_token_type_ids and not add_special_tokens:
            raise ValueError(
                "Asking to return token_type_ids while setting add_special_tokens to False "
                "results in an undefined behavior. Please set add_special_tokens to True or "
                "set return_token_type_ids to None."
            )

        batch_outputs = {}
        batch_outputs_list = []
        # Best effort: the compiled original additionally walks pair inputs with a
        # moving window when `stride > 0` (re-aligning offset mappings and emitting
        # `seq_len`/`overflow_to_sample` entries); only the common path is
        # reconstructed here.
        for first_ids, second_ids in batch_ids_pairs:
            outputs = self.prepare_for_model(
                first_ids,
                pair_ids=second_ids,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterward
                padding_side=padding_side,
                return_position_ids=return_position_ids,
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                return_tensors=None,  # we convert the whole batch to tensors at the end
                prepend_batch_axis=False,
                verbose=verbose,
                **kwargs,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        batch_outputs = self.pad(
            batch_outputs,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_attention_mask=return_attention_mask,
        )
        if return_dict:
            batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
            return batch_outputs
        else:
            for key, values in batch_outputs.items():
                for i in range(len(values)):
                    if i >= len(batch_outputs_list):
                        batch_outputs_list.append({key: values[i]})
                    else:
                        batch_outputs_list[i][key] = values[i]
            return batch_outputs_list

    def _get_bert_like_offset_mapping(self, text: str):
        """
        Returns the map of tokens and the start and end index of their start and end character.
        Modified from https://github.com/bojone/bert4keras/blob/master/bert4keras/tokenizers.py#L372
        Args:
            text (str):
                Input text.
        Returns:
            list: The offset map of input text.

        """
        if text is None:
            return None
        split_tokens = self.tokenize(text)

        normalized_text, char_mapping = "", []
        for i, ch in enumerate(text):
            if hasattr(self, "do_lower_case") and self.do_lower_case:
                ch = ch.lower()
                if self.basic_tokenizer.strip_accents is not False:
                    ch = unicodedata.normalize("NFD", ch)
                    ch = "".join([c for c in ch if unicodedata.category(c) != "Mn"])
            elif self.basic_tokenizer.strip_accents:
                ch = unicodedata.normalize("NFD", ch)
                ch = "".join([c for c in ch if unicodedata.category(c) != "Mn"])

            ch = "".join(
                [c for c in ch if not (ord(c) == 0 or ord(c) == 0xFFFD or _is_control(c))]
            )
            normalized_text += ch
            char_mapping.extend([i] * len(ch))

        text, offset = normalized_text, 0
        char_mapping_indexes = []
        for index, token in enumerate(split_tokens):
            if token[:2] == "##":
                token = token[2:]
            if token in self.all_special_tokens:
                token = (
                    token.lower() if hasattr(self, "do_lower_case") and self.do_lower_case else token
                )
            # Greek sigma has two lowercase forms; normalize both sides before searching.
            if "σ" in token or "ς" in token:
                start = text[offset:].replace("ς", "σ").index(token.replace("ς", "σ")) + offset
            elif token not in text[offset:]:
                # Best effort: e.g. an <unk> token that has no surface form; leave
                # an empty span to be fixed up from its neighbors below.
                start = -1
            else:
                start = text[offset:].index(token) + offset
            end = start + len(token) if start != -1 else -1
            char_mapping_indexes.append([start, end])
            if start != -1:
                offset = end

        token_mapping = []
        for i, (start, end) in enumerate(char_mapping_indexes):
            if start == -1:
                start = 0 if i == 0 else char_mapping_indexes[i - 1][1]
                end = len(text) if i == len(char_mapping_indexes) - 1 else char_mapping_indexes[i + 1][0]
            token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1))
        return token_mapping

    def get_offset_mapping(self, text: str, split_tokens: Optional[List[str]] = None):
        """
        Returns the map of tokens and the start and end index of their start and end character.
        Modified from https://github.com/bojone/bert4keras/blob/master/bert4keras/tokenizers.py#L372
        Args:
            text (str):
                Input text.
            split_tokens (Optional[List[str]]):
                the tokens which has been split which can accelerate the operation.

        Returns:
            list: The offset map of input text.

        """
        if text is None:
            return None
        if hasattr(self, "basic_tokenizer") or hasattr(self, "wordpiece_tokenizer"):
            return self._get_bert_like_offset_mapping(text)

        if not split_tokens:
            split_tokens = self.tokenize(text)

        normalized_text, char_mapping = "", []
        for i, ch in enumerate(text):
            normalized_text += ch
            char_mapping.extend([i] * len(ch))

        text, offset = normalized_text, 0
        do_lower_case = getattr(self, "do_lower_case", False)
        if do_lower_case:
            text = text.lower()

        char_mapping_indexes = []
        for token in split_tokens:
            # Best effort: recover the surface form of the sub-token (drops
            # BPE/SentencePiece markers) so it can be located in the text.
            token = self.convert_tokens_to_string([token]).strip()
            if token in self.all_special_tokens and do_lower_case:
                token = token.lower()
            if "σ" in token or "ς" in token:
                start = text[offset:].replace("ς", "σ").index(token.replace("ς", "σ")) + offset
            elif token not in text[offset:]:
                start = -1
            else:
                start = text[offset:].index(token) + offset
            end = start + len(token) if start != -1 else -1
            char_mapping_indexes.append([start, end])
            if start != -1:
                offset = end

        token_mapping = []
        for i, (start, end) in enumerate(char_mapping_indexes):
            if start == -1:
                # The token could not be located: borrow the boundaries of its neighbors.
                start = 0 if i == 0 else char_mapping_indexes[i - 1][1]
                end = len(text) if i == len(char_mapping_indexes) - 1 else char_mapping_indexes[i + 1][0]
            token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1))
        return token_mapping

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        spaces_between_special_tokens: bool = True,
        **kwargs,
    ) -> str:
        if isinstance(token_ids, np.ndarray):
            token_ids = token_ids.tolist()
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        # To avoid mixing byte-level and unicode for byte-level BPE, we build the
        # string separately for added tokens and byte-level tokens.
        sub_texts = []
        current_sub_text = []
        for token in filtered_tokens:
            if skip_special_tokens and token in self.all_special_ids:
                continue
            if token in self.added_tokens_encoder:
                if current_sub_text:
                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                    current_sub_text = []
                sub_texts.append(token)
            else:
                current_sub_text.append(token)
        if current_sub_text:
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))

        if spaces_between_special_tokens:
            text = " ".join(sub_texts)
        else:
            text = "".join(sub_texts)

        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation. Characters such as
    # "^", "$", and "`" are not in the Unicode Punctuation class but we treat
    # them as punctuation anyways, for consistency.
    if (33 <= cp <= 47) or (58 <= cp <= 64) or (91 <= cp <= 96) or (123 <= cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def _is_symbol(char):
    """Check whether CP is the codepoint of a Symbol character."""
    cp = ord(char)
    if unicodedata.category(char).startswith("S") or (
        cp in [0x00AD, 0x00B2, 0x00BA, 0x3007, 0x00B5, 0x00D8, 0x014B, 0x01B1]
    ):
        return True
    return False


def _is_whitespace(char):
    """
    Checks whether `chars` is a whitespace character.
    """
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def convert_to_unicode(text):
    """
    Converts `text` to Unicode (if it's not already), assuming utf-8 input.
    Args:
        text (str|bytes): Text to be converted to unicode.
    Returns:
        str: converted text.
    """
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))


def whitespace_tokenize(text):
    """
    Runs basic whitespace cleaning and splitting on a piece of text.
    Args:
        text (str): Text to be tokenized.
    Returns:
        list(str): Token list.
    """
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens