o
    *j'                     @   s   d dl Z d dlmZ d dlZd dlmZmZ d dlZ	d dl
Zd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZ d d
lmZ ejdkr_ejj Ze!  e Z"dgZ#ej$ej%ej&dG dd deZ'dS )    N)AnyDict)	Pipelines)Model)
OutputKeys)Pipeline)	PIPELINES)ConfigConfigFields)	ModelFileTasks)
get_loggerz2.0LanguageIdentificationPipeline)module_namec                       s   e Zd ZdZdef fddZdedefddZdedeee	f fd	d
Z
deee	f deee	f fddZdeee	f deee	f fddZ  ZS )r   u[   Language Identification Pipeline.

    Examples:

    >>> from modelscope.pipelines import pipeline
    >>> from modelscope.utils.constant import Tasks

    >>> pipeline_ins = pipeline(Tasks.text_classification, 'damo/nlp_language_identification-classification-base')
    >>> pipeline_ins('Elon Musk, co-founder and chief executive officer of Tesla Motors.\n' \
    >>>              'Gleichzeitig nahm die Legion an der Befriedung Algeriens teil, die von.\n' \
    >>>              '使用pipeline推理及在线体验功能的时候，尽量输入单句文本，如果是多句长文本建议人工分句。'

    >>> {
    >>>    "labels":[
    >>>        "en",
    >>>        "de",
    >>>        "zh"
    >>>    ],
    >>>    "scores":[
    >>>        [('en', 0.99)],
    >>>        [('de', 1.0)],
    >>>        [('zh', 1.0)]
    >>>    ]
    >>> }
    modelc              	      s  t  jdd|i| |}d| _ttj|tj	| _
tj|| j
tj d }g }g }tt|dD ]/\}}| }z|d}|||f |||f W q4 tyc   | jratd|| Y q4w t|| _t|| _| jdd| _| jd	d
| _tj|| j
tj d }	tdd tt|	dddD | _d| _t  tjdd}
d|
j_ tj!|
d| _"tj#j$%| j"tj#j&j'g| t( }| jr|) D ]
}t|j*|+  q|,d| _-|,d}|,d}||d| _.t/ }t0 }| j"1||g tj#j$%| j"tj#j&j'g| dS )zBuild a language identification pipeline with a model dir or a model id in the model hub.

        Args:
            model: A Model instance.
        r   Fvocabrbzutf-8zerror vocab:<UNK>   z</S>r   labelc                 S   s   g | ]
\}}||  fqS  )strip).0iwr   r   y/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/pipelines/nlp/language_identification_pipline.py
<listcomp>]   s    z;LanguageIdentificationPipeline.__init__.<locals>.<listcomp>rutf8)encodingZunkT)Zallow_soft_placement)configz	src_cid:0zoutput_label:0zpredict_score:0)Z
output_idsoutput_scoreNr   )2super__init__debugr	   	from_fileospathjoinr   ZCONFIGURATIONcfgr
   Zpreprocessor	enumerateopenr   decodeappendUnicodeDecodeErrorprintdictr   vocab_reversegetunk_idpad_idr   Z	unk_labeltfZreset_default_graphZConfigProtoZgpu_optionsZallow_growthSession_sessionZsaved_modelloaderloadZtag_constantsZSERVINGZget_default_graphZget_operationsnamevaluesZget_tensor_by_name	input_idsoutputZglobal_variables_initializerZlocal_variables_initializerrun)selfr   kwargsZ
export_dirZjoint_vocab_fileZ
vocabfilesZvocabfiles_reverser   r   Zjoint_label_fileZ	tf_configZdefault_graphopZoutput_labelr!   initZ
local_init	__class__r   r   r#   :   st   







z'LanguageIdentificationPipeline.__init__inputreturnc           
         s:  |  }d}t|d|}d}t|d|}d}t|d|}dd d d fdd	|D }td
tj}t|d|}ddd	 | D }g }| D ]"}| j	|| j
}	t|dkro|	| j
kro|d | j
kroqR||	 qRt|dkr|d | j
kr|dd  }t|dkr|d | j
kr|d d }|S )Nz/<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6}); z\S+[./]\S+\s?z
\S*@\S*\s?c                 S   sd   t | }d|k s|dkr|d8 }t|S |dkrd}t|S |dv r(d}t|S |dv r.d	}t|S )
Ni   i_  i  i 0      )i0  i0  i   i   i   i   "   )i   i   i   i   '   )ordchr)ZucharZinside_coder   r   r   stringpartQ2B   s   	zELanguageIdentificationPipeline._lid_preprocess.<locals>.stringpartQ2BuV   ,-+"'\&.!=:;°·$«»|±[]{}_?<>~^*/%#@()，。！《》？、`Â …‼️c                    s    g | ]}| vr|nd qS ) r   )r   cZm_noisyCharsrM   r   r   r      s    zBLanguageIdentificationPipeline._lid_preprocess.<locals>.<listcomp>u`   [😀-🙏🌀-🗿🚀-🛿🇠-🇿🤦-🤷𐀀-􏿿✂-➰♀-♂☀-⭕‍⏏⏩⌚️〰]+rN   c                 S   s0   g | ]}t td |rt td|s|qS )z\dz^[a-z0-9+-_]+$)boolresearchmatch)r   itemr   r   r   r      s    r   r   )lowerrR   subr(   compileUNICODEsplitr   r   r2   r3   lenr-   )
r?   rE   sentenceZCLEANRZURLREZEMAILREZEMOJIREZoutidsr   tmpr   rP   r   _lid_preprocess}   sD   
z.LanguageIdentificationPipeline._lid_preprocessc           	         s   | d} fdd|D } jr/t||D ]\}}td| tdd fdd|D  qtdd |D }|D ]}| jg|t|   q:t	
|}d	|i}|S )
N
c                    s"   g | ]}|  d kr |qS )rG   )r   r_   )r   r]   r?   r   r   r      s
    z=LanguageIdentificationPipeline.preprocess.<locals>.<listcomp>zraw:zres:rG   c                    s$   g | ]} j | jd dqS )r   rN   )r1   r2   r3   replace)r   Zwidra   r   r   r      s    c                 S   s   g | ]}t |qS r   )r\   )r   idsr   r   r   r      s    r<   )r[   r$   zipr/   r(   maxextendr4   r\   nparray)	r?   rE   Z
sentenceltZinput_ids_ltr]   r<   maxlenrc   resultr   ra   r   
preprocess   s$   



z)LanguageIdentificationPipeline.preprocessc                 C   sR   | j   | j|d i}| j j| j|d}|W  d    S 1 s"w   Y  d S )Nr<   )	feed_dict)r7   Z
as_defaultr<   r>   r=   )r?   rE   rl   Zsess_outputsr   r   r   forward   s
   $z&LanguageIdentificationPipeline.forwardinputsc                 C   s   |d }t g d}g }g }|D ]A}g }t|| j D ]\}}	|	|vr%q||	|f qt|dd ddd d }t|dkrCd	g}|| ||d d  qd
d |D }
tj|tj	|
i}|S )Nr!   )hafamarazbebgZbnbscaZcecocscydadeeleneoeseteufafifrfygagdglguZhaZhawhehiZhmnhrZhthuhyidZigisitjaZjvkakkkmknkoZkukyZlaloltlvZmgmimkmlZmnmrmsmtZmynenlnonypaplZpsptrorusdsiskslsmZsnsosqsrstZsusvswtatetgthtltrZugukuruzvixhyiZyozhzzh-twzuc                 S   s   | d S )Nr   r   )r   r   r   r   <lambda>   s    z<LanguageIdentificationPipeline.postprocess.<locals>.<lambda>T)keyreverse   r   )r   g      ?c                 S   s   g | ]	}d d |D qS )c                 S   s&   g | ]\}}|d kr|t |dfqS )g{Gz?   )round)r   r   Zscorer   r   r   r      s    zILanguageIdentificationPipeline.postprocess.<locals>.<listcomp>.<listcomp>r   )r   Zlabels_scoresr   r   r   r      s
    
z>LanguageIdentificationPipeline.postprocess.<locals>.<listcomp>)
setrd   r   r;   r-   sortedr\   r   ZLABELSZSCORES)r?   rn   Zoutput_scores_rawZsupported_104_langZlabels_scores_ltZoutput_labelsr!   ZtmpltslZoutput_scoresrj   r   r   r   postprocess   s,   
z*LanguageIdentificationPipeline.postprocess)__name__
__module____qualname____doc__strr#   listr_   r   r   rk   rm   r   __classcell__r   r   rC   r   r      s    CD"*)(r&   Zos.pathr'   ZosprR   typingr   r   numpyrg   Z
tensorflowr5   Zmodelscope.metainfor   Zmodelscope.models.baser   Zmodelscope.outputsr   Zmodelscope.pipelines.baser   Zmodelscope.pipelines.builderr   Zmodelscope.utils.configr	   r
   Zmodelscope.utils.constantr   r   Zmodelscope.utils.loggerr   __version__compatZv1Zdisable_eager_executionlogger__all__Zregister_moduleZtext_classificationZlanguage_identificationr   r   r   r   r   <module>   s.   
