o
    *js                     @   s~   d dl Z d dlZd dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ e ZdgZG d	d dZdS )
    N)Mapping)AutoTokenizer)Models)
OutputKeys)ModeKeys)
get_loggerNLPTokenizerc                   @   s\   e Zd Z				ddedefddZedd Zedd	 Zd
d Z	dddZ
dddZdS )r   N	model_diruse_fastc                 C   s2   || _ || _|| _| jdu ri | _|| _d| _dS )a  The transformers tokenizer preprocessor base class.

        Any nlp preprocessor which uses the huggingface tokenizer can inherit from this class.

        Args:
            model_dir (str, `optional`): The local path containing the files used to create a preprocessor.
            use_fast (str, `optional`): Use the fast version of tokenizer
            tokenize_kwargs (dict, `optional`): These args will be directly fed into the tokenizer.
        N)r	   
model_typetokenize_kwargs	_use_fast
_tokenizer)selfr	   r   r
   r    r   t/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/preprocessors/nlp/transformers_tokenizer.py__init__   s   

zNLPTokenizer.__init__c                 C   s   | j d u r
|  | _ | j S N)r   build_tokenizer)r   r   r   r   	tokenizer-   s   

zNLPTokenizer.tokenizerc                 C   s   | j d u rW| j d u r| jd u rd| _ n9| j d u rLtjtj| jdrLttj| jdddd}t|}|	d| _ W d    n1 sGw   Y  | j d u rSdn| j | _ | j S )NFztokenizer_config.jsonrzutf-8)encodingr
   )
r   r	   ospathisfilejoinopenjsonloadget)r   fZjson_configr   r   r   r
   3   s"   

zNLPTokenizer.use_fastc           
      C   s  | j }| j}|tjtjtjtjtjtjtj	fv r3ddl
m}m} | jr%|n|}|dur0||S | S |tjkrSddl
m}m} | jrE|n|}|durP||S | S |tjkrsddlm}m}	 | jre|	n|}|durp||S | S |dusyJ tj|| jdS )zBuild a tokenizer by the model type.

        NOTE: The fast tokenizers have a multi-thread problem, use it carefully.

        Returns:
            The initialized tokenizer.
        r   )BertTokenizerBertTokenizerFastN)XLMRobertaTokenizerXLMRobertaTokenizerFast)LlamaTokenizerLlamaTokenizerFast)r
   )r   r	   r   Z
structbertZgpt3ZpalmZplugZmegatron_bertZplug_mentalZfid_plugtransformersr!   r"   r
   Zfrom_pretrainedZvecor#   r$   llamaZmodelscope.models.nlpr%   r&   r   )
r   r   r	   r!   r"   r   r#   r$   r%   r&   r   r   r   r   C   sD   	

zNLPTokenizer.build_tokenizerc                 K   s   | d|dd |d< |d d u r|d dd | j D }|| || j t| jjj	}d|vrAd|v rA|d | j||fi |S )N
max_lengthZsequence_lengthc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>i   s    z)NLPTokenizer.__call__.<locals>.<dictcomp>Ztruncation_strategy)
r   popr   itemsupdateinspect	signaturer   __call__
parameters)r   textZ	text_pairkwargsr   r4   r   r   r   r3   d   s   



zNLPTokenizer.__call__c                 C   s$   || j v r
| j | S | jj||S r   )r   r   Zinit_kwargsr   )r   keydefault_valuer   r   r   get_tokenizer_kwargq   s   

z NLPTokenizer.get_tokenizer_kwarg)NNNNr   )__name__
__module____qualname__strboolr   propertyr   r
   r   r3   r9   r   r   r   r   r      s"    



!)r1   r   collections.abcr   r   r'   r   Zmodelscope.metainfor   Zmodelscope.outputsr   Zmodelscope.utils.constantr   Zmodelscope.utils.loggerr   logger__all__r   r   r   r   r   <module>   s   