o
    *j                     @   s`  d Z ddlZddlZddlZddlZddlZddlZddlmZm	Z	 ddl
mZmZ ddlmZmZmZ ddlZddlZddlZddlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lm Z  ddl!m"Z" ddl#m$Z$ dZ%dZ&dZ'dZ(dZ)e%e&e'e(e)gZ*dd Z+dd Z,G dd deZ-G dd deZ.G dd de.Z/G dd de/Z0G dd de0Z1G d d! d!e0Z2G d"d# d#e0Z3G d$d% d%e/Z4G d&d' d'e/Z5G d(d) d)e/Z6G d*d+ d+e/Z7G d,d- d-e/Z8G d.d/ d/e.Z9G d0d1 d1e/Z:G d2d3 d3e.Z;G d4d5 d5e;Z<G d6d7 d7e.Z=G d8d9 d9e.Z>G d:d; d;e.Z?G d<d= d=e?Z@G d>d? d?e.ZAG d@dA dAe.ZBG dBdC dCeBZCG dDdE dEeBZDG dFdG dGeBZEG dHdI dIeBZFG dJdK dKe.ZGh dLZHdMdNhZIi dOe;dPe<dQe=dRe>dSe?dTe@dUdVdW dXdYdW dZeAd[e4d\e0d]e3d^e5d_e5d`e6dMe7dae8e:e1e2eBeCeDeEeFeGe9eGdbZJdS )czP
This file contains the logic for loading training and test data for all tasks.
    N)ABCabstractmethod)Counterdefaultdict)CallableDictList)build_input_from_idsbuild_samplenum_special_tokens_to_add)punctuation_standardization)Dataset)tqdm)print_rank_0)InputExample)PVPStraindevtestZtrue_dev	unlabeledc                 C   s   t |  |jS N)
PROCESSORSoutput_prediction)	task_nameargs r   s/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/mglm/tasks/superglue/dataset.pyget_output_func1   s   r   c                 K   s   t j| fdtjtdd|S )N	F)sepquotingZdtypeZ	na_filter)pdZread_csvcsv
QUOTE_NONEstr)pathkwargsr   r   r   read_tsv5   s   r'   c                   @   s,   e Zd Z			d	ddZdd Zdd ZdS )
SuperGlueDatasetFc
                 C   sN  t | || _| jj|_td| d| d| d | d| | _|j| _|| _|| _|| _|	| _	|	r;| js;J d|| _
|tkrK| jj||d}
nR|tkrV| j|}
nG|tkra| j|}
n<|tkrz|dkrs| jj||jd	}
n*| j|}
n#|tkr| j|}
|
D ]
}| j d
 |_qntdt d| d|tkrd| _nd| _tdd |
D }tdt|
 d| dt|   g | _|
jdd d |
| _ | jr| jrt!| " }g | _#|D ]}| j#$t!| ||| j |||j%|j&|j'|j(|d
 qnt!| ||| j ||j)|j%|j&|j'|j(|d
| _*dd |
D | _+d S )Nz	Creating z dataset from file at z (split=)-z/Labeled examples only exist in cloze evaluation	for_trainwsc
cloze_evalr   z'split' must be one of z, got 'z	' insteadFTc                 s       | ]}|j V  qd S r   label.0exampler   r   r   	<genexpr>r       z,SuperGlueDataset.__init__.<locals>.<genexpr>
Returning  z examples with label dist.: c                 S   s   | j S r   )num_choices)xr   r   r   <lambda>w   s    z+SuperGlueDataset.__init__.<locals>.<lambda>)key)
pattern_idnum_prompt_tokensZis_multi_tokenZmax_segment_lengthfast_decodesplitc                 S   s   i | ]}|j |qS r   )guidr3   r   r   r   
<dictcomp>   s    z-SuperGlueDataset.__init__.<locals>.<dictcomp>),r   	processorvariable_num_choicesr   Zdataset_namer/   
seq_length	tokenizerpattern_ensemblepattern_textr   DEV_SETget_dev_examplesTEST_SETget_test_examplesTRUE_DEV_SETZget_true_dev_examples	TRAIN_SETget_train_examplesUNLABELED_SETget_unlabeled_examples
get_labelsr2   
ValueErrorSPLIT_TYPESlabeledr   lenlistitemsZsamplessortexample_listr   Zavailable_patternspvpsappendr?   Zmulti_tokensegment_lengthr@   r>   pvpexamples)selfr   r   data_dirrF   rA   rG   r,   rH   rI   r[   r5   label_distributionZpattern_idsr>   r   r   r   __init__A   s   

 
zSuperGlueDataset.__init__c                 C   s*   | j r| jrt| jt| j S t| jS r   )r/   rH   rW   r[   r\   ra   r   r   r   __len__   s   
zSuperGlueDataset.__len__c           
      C   s   |t | j }| j| }| jrbi }| jrddd}| jr0|t | j }| j| j|fi |}n
| jj|fi |}| jr`| j	dj
}| j	dj
}|g| |g }	|	tdgt |	 d}|S | j|| j| j| j}|S )NT)rV   ZprimingZeosZENC   )textZ	loss_mask)rW   r[   r/   rI   rH   r\   encoder_   rG   Zget_commandZIdnparrayrD   rF   r   )
ra   idxZ
sample_idxr5   r&   Zpvp_idxsampleZeos_idZcls_idZ	input_idsr   r   r   __getitem__   s,   

zSuperGlueDataset.__getitem__N)FFF)__name__
__module____qualname__rd   rf   rn   r   r   r   r   r(   ?   s    	
Vr(   c                   @   s   e Zd ZdZdd Zdd Zedd Zede	e
 fd	d
Ze	dde	e
 fddZde	e
 fddZde	e
 fddZede	e fddZde
fddZde
fddZdS )DataProcessorz
    Abstract class that provides methods for loading training, testing, development and unlabeled examples for a given
    task
    c                 C   s   || _ d| _d S )Nr   )r   num_truncatedra   r   r   r   r   rd      s   
zDataProcessor.__init__c                 C   sp   t |d)}t||D ]\}}|  | }|j|d}|t|d  qW d    d S 1 s1w   Y  d S )Nwrl   r2   
)openziprS   rl   writejsondumpsra   predictionsr`   output_fileoutput
predictionr5   datar   r   r   r      s   "zDataProcessor.output_predictionc                 C      dS )NFr   re   r   r   r   rE         z"DataProcessor.variable_num_choicesreturnc                 C   r   )z6Get a collection of `InputExample`s for the train set.Nr   ra   rb   r   r   r   rP         z DataProcessor.get_train_examplesFc                 C   r   )z4Get a collection of `InputExample`s for the dev set.Nr   ra   rb   r,   r   r   r   rK      s   zDataProcessor.get_dev_examplesc                 C      g S )z5Get a collection of `InputExample`s for the test set.r   r   r   r   r   rM      r   zDataProcessor.get_test_examplesc                 C   r   )z:Get a collection of `InputExample`s for the unlabeled set.r   r   r   r   r   rR      r   z$DataProcessor.get_unlabeled_examplesc                 C   r   )z)Get the list of labels for this data set.Nr   re   r   r   r   rS      r   zDataProcessor.get_labelsr5   c                 C   s   |j |jfS r   )text_atext_b)ra   r5   rG   r   r   r   get_classifier_input   s   z"DataProcessor.get_classifier_inputc                 C   s   |  ||\}}||j}||j}t||d dddd}	t|t| |	 |kr1|  jd7  _t||d |||dddd	}
|
\}}}}}}}d}|jd urX|j}|  	|}|j
rgt|||||jd}|S t|||||jd}|S )	NTFadd_clsadd_sep	add_piecerg   )r   r   r   r   r   r2   typespaddings	unique_idZ	positionsZmasksr2   r   )r   EncodeAsIdstokenizationr   rW   rs   r	   r2   rS   indexpretrained_bertr
   rB   )ra   r5   rG   rF   r   r   r   tokens_atokens_bnum_special_tokensr   idsr   r   position_idsr   
target_ids
loss_masksr2   rm   r   r   r   ri      s\   

zDataProcessor.encodeNF)ro   rp   rq   __doc__rd   r   propertyrE   r   r   r   rP   rK   rM   rR   r$   rS   r   ri   r   r   r   r   rr      s&    
rr   c                       sF   e Zd Z fddZdd ZdddZdd	 Zd
d Zdd Z  Z	S )SuperGLUEProcessorc                    s   t t| | |j| _d S r   )superr   rd   few_supergluert   	__class__r   r   rd     s   zSuperGLUEProcessor.__init__c                 C      |  tj|ddS )Ntrain.jsonlr   _create_examplesosr%   joinr   r   r   r   rP        z%SuperGLUEProcessor.get_train_examplesFc                 C   2   | j r| tj|ddS | tj|ddS )Nzdev32.jsonlr   	val.jsonlr   r   r   r%   r   r   r   r   r   rK        z#SuperGLUEProcessor.get_dev_examplesc                 C   r   )Nr   r   
test.jsonlr   r   r   r   r   rM   '  r   z$SuperGLUEProcessor.get_test_examplesc                 C   r   )Nzunlabeled.jsonlr   r   r   r   r   r   rR   /  r   z)SuperGLUEProcessor.get_unlabeled_examplesc                 O   s   d S r   r   )ra   r   r&   r   r   r   r   3  s   z#SuperGLUEProcessor._create_examplesr   )
ro   rp   rq   rd   rP   rK   rM   rR   r   __classcell__r   r   r   r   r     s    
r   c                   @   s@   e Zd ZdZdd Z		ddededed	ed
ee f
ddZdS )RteProcessorzProcessor for the RTE data set.c                 C      ddgS N
entailmentZnot_entailmentr   re   r   r   r   rS   :     zRteProcessor.get_labels
hypothesispremiser%   set_typehypothesis_namepremise_namer   c              
   C   s   g }t |ddW}t|D ]I\}}t|}	|	d }
t|
tr1zt|
}
W n ty0   |}
Y nw |	d}d||
f }t	|	| }t	|	| }t
|||||
d}|| qW d    |S 1 sbw   Y  |S )Nutf8encodingrl   r2   %s-%srB   r   r   r2   rl   )rx   	enumerater{   loads
isinstancer$   intrT   getr   r   r]   )ra   r%   r   r   r   r`   fZline_idxlineexample_jsonrl   r2   rB   r   r   r5   r   r   r   r   =  s@   



zRteProcessor._create_examplesN)r   r   )	ro   rp   rq   r   rS   r$   r   r   r   r   r   r   r   r   7  s     r   c                   @   s    e Zd ZdZdd Zdd ZdS )AxGProcessorz+Processor for the AX-G diagnostic data set.c                 C   r   )N
AX-g.jsonlr   r   r   r   r   r   rP   b  r   zAxGProcessor.get_train_examplesc                 C   r   )Nr   r   r   r   r   r   r   rM   f  r   zAxGProcessor.get_test_examplesN)ro   rp   rq   r   rP   rM   r   r   r   r   r   _  s    r   c                       s6   e Zd ZdZdd Zdd Z		d
 fdd		Z  ZS )AxBProcessorz+Processor for the AX-B diagnostic data set.c                 C   r   )N
AX-b.jsonlr   r   r   r   r   r   rP   n  r   zAxBProcessor.get_train_examplesc                 C   r   )Nr   r   r   r   r   r   r   rM   r  r   zAxBProcessor.get_test_examples	sentence2	sentence1c                    s   t  ||||S r   )r   r   )ra   r%   r   r   r   r   r   r   r   v  s   zAxBProcessor._create_examples)r   r   )ro   rp   rq   r   rP   rM   r   r   r   r   r   r   r   k  s    r   c                   @   s   e Zd ZdZdd ZdS )CbProcessorzProcessor for the CB data set.c                 C      g dS )N)r   contradictionneutralr   re   r   r   r   rS     r   zCbProcessor.get_labelsN)ro   rp   rq   r   rS   r   r   r   r   r     s    r   c                   @   sD   e Zd ZdZdd Zedededee fddZ	d	efd
dZ
dS )WicProcessorzProcessor for the WiC data set.c                 C   r   Nfalsetruer   re   r   r   r   rS     r   zWicProcessor.get_labelsr%   r   r   c              
   C   s   g }t | ddP}|D ]D}t|}|d }t|trt|}|dr&dnd}d||f }t|d }	t|d	 }
d
|d
 i}t||	|
|||d}|	| qW d    |S 1 s[w   Y  |S )Nr   r   rl   r2   r   r   r   r   r   word)rB   r   r   r2   rl   meta)
rx   r{   r   r   r$   r   r   r   r   r]   )r%   r   r`   r   r   r   rl   r2   rB   r   r   r   r5   r   r   r   r     s4   


zWicProcessor._create_examplesr5   c                 C   s   |j d d |j }||jfS )Nr   z: )r   r   r   )ra   r5   rG   r   r   r   r   r     s   
z!WicProcessor.get_classifier_inputN)ro   rp   rq   r   rS   staticmethodr$   r   r   r   r   r   r   r   r   r     s    r   c                   @   sZ   e Zd ZdZedd ZdddZdd Zd	efd
dZ		dde
de
dee fddZdS )WscProcessorzProcessor for the WSC data set.c                 C   s   | j jS r   )r   Zwsc_negativere   r   r   r   rE     s   z!WscProcessor.variable_num_choicesTc                 C      | j tj|dd|dS )Nr   r   r.   r   )ra   rb   r/   r   r   r   rP     s
   zWscProcessor.get_train_examplesc                 C   r   )NFalseTruer   re   r   r   r   rS     r   zWscProcessor.get_labelsr5   c                 C   sH   |j d }|j d }|j }d||  d ||< d|}|}||fS )N
span1_textspan2_index*r9   )r   r   rA   r   )ra   r5   rG   targetZpronoun_idxwords_ar   r   r   r   r   r     s   



z!WscProcessor.get_classifier_inputr%   r   r   c                 C   s  g }t |dd}|D ]}t|}|d }d|v r"t|d nd }	d||f }
t|d }|d d |d d	 |d d
 |d d d}d|v rndd |d D }g }t|D ]\}}||d | vrk|| qZ|}|d
 |d }}|d |d	 }}| }|  }|  }t	|}||||  |krdD ]}||| || |  |kr||7 }q|| |krdD ]}|||  |kr||7 }q|| |kr|| 
|r|d | || d t	| || t	|d  g ||d d   }|| |ksJ d||  d| d| d| d	d|}|||d
< |d< | jjdkrYt|
|||	||d}|| |dkrX|	dkrX|D ]}t|
||d||d}|| qEq|rg|dkrg|	dkrgq|dkrd|v rt	|dkrtdt	|dD ];}t|}|||d  |d< t	|d dk r|d  |d dt	|d   7  < t|
||	||d}|| qqd|v r||d< t|
||	||d}|| qW d    |S 1 sw   Y  |S ) Nr   r   rl   r2   r   rh   r   r   
span2_textspan1_indexr   )r   r   r   r   
candidatesc                 S   s   g | ]}|d  qS )rh   r   )r4   candr   r   r   
<listcomp>  s    z1WscProcessor._create_examples.<locals>.<listcomp>)rg   rg   zGot 'z' but expected 'z' at index z for ''r9   wsc1rB   r   r   r2   r   rl   r   r   r   	   r   rB   r   r2   r   rl   )rx   r{   r   r$   r   r   r]   rA   lowerrW   
startswithr   r   taskr   rangecopydeepcopy)ra   r%   r   r/   r`   r   r   r   rl   r2   rB   r   r   r   filteredir   r   r   r   r   r   Zwords_a_lowerZwords_span1_textZ	span1_lenoffsetr5   _metar   r   r   r     s  








& 




qqzWscProcessor._create_examplesN)T)ro   rp   rq   r   r   rE   rP   rS   r   r   r$   r   r   r   r   r   r   r     s    

r   c                   @   s6   e Zd ZdZdd Zedededee fddZ	d	S )
BoolQProcessorz!Processor for the BoolQ data set.c                 C   r   r   r   re   r   r   r   rS   A  r   zBoolQProcessor.get_labelsr%   r   r   c              	   C   s   g }t | ddE}|D ]9}t|}|d }d|v r"t|d  nd }d||f }t|d }	t|d }
t||	|
||d}|| qW d    |S 1 sPw   Y  |S )	Nr   r   rl   r2   r   passagequestionr   )rx   r{   r   r$   r   r   r   r]   )r%   r   r`   r   r   r   rl   r2   rB   r   r   r5   r   r   r   r   D  s2   

zBoolQProcessor._create_examplesN)
ro   rp   rq   r   rS   r   r$   r   r   r   r   r   r   r   r   >  s
     r   c                   @   sD   e Zd ZdZdd ZdefddZededed	e	e fd
dZ
dS )CopaProcessorz Processor for the COPA data set.c                 C   r   Nr   rg   r   re   r   r   r   rS   _  r   zCopaProcessor.get_labelsr5   c                 C   s  |j rg g g }}}ng g g }}}	|jd }
|
dkrdnd}t|jd | }||j}|jd |jd fD ]a}t|}||j}t||d ddd	d
}t|t| | |krc|  jd7  _t	||d |||ddd	d
	}|\}}}}}}}|j r|
| |
| |
| q:|
| |
| |	
| q:d}|jd ur|j}|  |}|j rt|||||jd}|S t|||	||jd}|S )Nr   causeZbecausesor9   choice1choice2TFr   rg   r   r   r   )r   r   r   r   r   r   r   rW   rs   r	   r]   r2   rS   r   r
   rB   )ra   r5   rG   rF   r   ids_list
types_listpaddings_listpositions_listsep_listr   joinerr   r   choicer   r   r   r   r   r   r   r   r   r   r2   rm   r   r   r   ri   b  sx   






zCopaProcessor.encoder%   r   r   c              	   C   sB  g }t | ddD}|D ]9}t|}d|v r|d nd }|d }d||f }|d }	|d |d |d	 d
}
t||	||
|d}|| qW d    n1 sOw   Y  |dks\|dkrg }|D ]+}|jdkridnd}|jd |jd |jd	 d
}
t|jd |j||
d}|| q`||7 }t	dt
| dt
| d |S )Nr   r   r2   rl   r   r   r  r  r   )r  r  r   r   r   r   r   rg   m)rB   r   r2   r   zAdded z  mirror examples, total size is z...)rx   r{   r   r   r]   r2   r   rB   r   r   rW   )r%   r   r`   r   r   r   r2   rl   rB   r   r   r5   Zmirror_examplesexZmirror_exampler   r   r   r     sV   

zCopaProcessor._create_examplesN)ro   rp   rq   r   rS   r   ri   r   r$   r   r   r   r   r   r   r   \  s    < r   c                   @   sL   e Zd ZdZdd Zedededee fddZ	d	d
 Z
defddZdS )MultiRcProcessorz#Processor for the MultiRC data set.c                 C   r   r   r   re   r   r   r   rS     r   zMultiRcProcessor.get_labelsr%   r   r   c                 C   sb  g }t | ddv}|D ]k}t|}|d }t|d d }|d d }|D ]O}	t|	d }
|	d }|	d }|D ]<}d	|v rB|d	 nd }|d }| d
| d| d| }|||t|d d}|||g}t|||
|||d}|| q8q&qW d    n1 sw   Y  ttdd |D }tdd |D }t	dt
| dt
| dt|   |S )Nr   r   rl   r   rh   	questionsr   answersr2   -p-q-a)passage_idxquestion_idx
answer_idxanswerr   c                 s       | ]}|j d  V  qdS r  Nr   r3   r   r   r   r6         z4MultiRcProcessor._create_examples.<locals>.<genexpr>c                 s   r0   r   r1   r3   r   r   r   r6     r7   r8    examples corresponding to # questions with label distribution )rx   r{   r   r   r   r]   rX   setr   r   rW   rY   )r%   r   r`   r   r   r   r  rh   r  question_jsonr   r  r  answer_jsonr2   r  rB   r   rl   r5   question_indicesrc   r   r   r   r     sj   




&
z!MultiRcProcessor._create_examplesc              	   C   s(  t |d}tt}t||D ]\}}||jd  ||f q| D ]]\}}	tt}
|dg id}|	D ]\}}|
|jd  ||f q5|
 D ],\}}	|g d}|	D ]\}}|  | }|d |jd |d	 qU|d
 d | qJ|t	
|d  q$W d    d S 1 sw   Y  d S )Nru   r  r  )rl   r   r  )rl   r  r  r  rv   r   rw   )rx   r   rX   ry   r   r]   rY   rS   rz   r{   r|   )ra   r~   r`   r   r   Zpassage_dictr   r5   r  r   Zquestion_dictZpassage_datar  Zquestion_datar   r   r   r     s8   

"z"MultiRcProcessor.output_predictionr5   c                 C   s&   |j }d|jd|jd g}||fS )Nr9   zanswer:r  )r   r   r   r   )ra   r5   rG   r   r   r   r   r   r     s   z%MultiRcProcessor.get_classifier_inputN)ro   rp   rq   r   rS   r   r$   r   r   r   r   r   r   r   r   r   r    s    1r  c                   @   sT   e Zd Zedd Zdd Zdd Zddd	Zd
d Ze		dde
e fddZdS )RaceProcessorc                 C   r   NTr   re   r   r   r   rE   '  r   z"RaceProcessor.variable_num_choicesc                 C   r   )N)ABCDr   re   r   r   r   rS   +  r   zRaceProcessor.get_labelsc                 C      |  tj|ddS )Nr   r   r   r   r   r   rP   .     z RaceProcessor.get_train_examplesFc                 C   s   | j tj|dd|dS )Nr   r+   r   r   r   r   r   rK   1     zRaceProcessor.get_dev_examplesc                 C   r(  )Nr   r   r   r   r   r   rM   5  r)  zRaceProcessor.get_test_examplesr   c                 C   sF  g }dd }t  tj| ddt  tj| dd }|D ]}t|dddq}|D ]f}t|}	|	d	 }
|	d
 }|	d }|	d }|	d }t|t|ksOJ t|t|ksYJ ||}t|D ].\}}|| }|| }| d|
 d| }||
|g}d|i}t	||||||d}|
| qaq*W d    n1 sw   Y  q|S )Nc                 S   s8   |  dd} tdd| } tdD ]}|  dd} q| S )zDRemove new lines and multiple spaces and adjust end of sentence dot.rw   r9   z\s+   z . z. )replaceresubr   )rh   _r   r   r   
clean_text>  s
   z2RaceProcessor._create_examples.<locals>.clean_textmiddlez*.txthighrutf-8r   idZarticler  optionsr  r  r  choicesr   )globr   r%   r   rx   r{   r   rW   r   r   r]   )r%   r   r,   r`   r0  	filenamesfilenamer   r   r   rl   contextr  r7  r  r  r   r  r  rB   ex_idxr   r5   r   r   r   r   8  sR   



zRaceProcessor._create_examplesNr   )ro   rp   rq   r   rE   rS   rP   rK   rM   r   r   r   r   r   r   r   r   r"  %  s    

r"  c                   @   sf   e Zd ZdZdddZedd Zdd Zd	d
 Zde	fddZ
e			ddedee	 fddZdS )RecordProcessorz"Processor for the ReCoRD data set.Fc                 C   r   )Nr   r   r+   r   r   r   r   r   rK   m  r*  z RecordProcessor.get_dev_examplesc                 C   r   r#  r   re   r   r   r   rE   q  r   z$RecordProcessor.variable_num_choicesc                 C   r   N01r   re   r   r   r   rS   u  r   zRecordProcessor.get_labelsc                 C   sr   t |d*}t||D ]\}}|jd | }|j|d}|t|d  qW d    d S 1 s2w   Y  d S )Nru   r   rv   rw   )rx   ry   r   rl   rz   r{   r|   r}   r   r   r   r   x  s   "z!RecordProcessor.output_predictionr5   c                 C   sr  |j rg g g }}}ng g g }}}	||jj}
|jr%||jjnd }|jd D ]g}||j}t|
t| t| }|t|
|| d dddd7 }||krY|  jd7  _t	|
|| d |||dddd	}|\}}}}}}}|j r|
| |
| |
| q,|
| |
| |	
| q,|j}|  |}|j rt|||||jd}|S t|||	||jd}|S )Nr   TFr   rg   r   r   )r   r   r   r   r   r   rW   r   rs   r	   r]   r2   rS   r   r
   rB   )ra   r5   rG   rF   r   r  r  r  r  r	  r   r   r  Z
answer_idstotal_lengthr   r   r   r   r   r   r   r   r2   rm   r   r   r   ri     sz   




zRecordProcessor.encode*   
   !max_train_candidates_per_questionr   c                    s  g }t |}t| dd}t|D ]\}}	t|	}
|
d }t|
d d }t }|
d d D ]}|d }|d }t|||d	  }|| q1t	|}|
  |d
d}|
d }|D ]}t|d }|d }t  |dg D ]}t|d } | qtt	  |dks|rt D ]S\}} fdd|D }t||d	 kr|| |d |d	  }| d| d| d| }|||g| |gd}|||g}t|||d||t|d	 d}|| qq_| d| d| }||| d}t|||d||t|d}|| q_qW d    n	1 sw   Y  t	tdd |D }tdd |D }tdt| dt| dt	|   |S )Nr   r   rl   r   rh   entitiesstartendrg   z@highlight
z- qasqueryr  r   c                    s   g | ]}| vr|qS r   r   )r4   Zentr  r   r   r     s    z4RecordProcessor._create_examples.<locals>.<listcomp>r  r  r  )r  r  r   r  r?  )rB   r   r   r2   r   rl   r:   r@  c                 s   r  r  r  r3   r   r   r   r6     r  z3RecordProcessor._create_examples.<locals>.<genexpr>c                 s   r0   r   r1   r3   r   r   r   r6     r7   r8   r  r  )randomRandomrx   r   r{   r   r   r  addrX   rZ   r,  r   rW   shuffler   r]   r   r   rY   )r%   r   seedrD  r,   r`   Zentity_shufflerr   rl   r   r   rh   rE  Zentity_jsonrF  rG  entityr  r  r   r  r   r  r  r   rB   r   r<  r5   r!  rc   r   rJ  r   r     s   






V
z RecordProcessor._create_examplesNr   )rB  rC  F)ro   rp   rq   r   rK   r   rE   rS   r   r   ri   r   r   r   r   r   r   r   r   r=  j  s"    

9r=  c                   @   l   e Zd ZdZdd ZdddZdee fdd	Zdee fd
dZ	dd Z
edededee fddZdS )MnliProcessorz3Processor for the MultiNLI data set (GLUE version).c                 C   r   Nz	train.tsvr   r   r   r   r   r   rP   $  r   z MnliProcessor.get_train_examplesFc                 C   r   )Nzdev_matched.tsvZdev_matchedr   r   r   r   r   rK   (  r   zMnliProcessor.get_dev_examplesr   c                 C   r   )Nztest_matched.tsvZtest_matchedr   r   r   r   r   rM   ,  r   zMnliProcessor.get_test_examplesc                 C   
   |  |S r   rP   r   r   r   r   rR   0     
z$MnliProcessor.get_unlabeled_examplesc                 C   r   )N)r   r   r   r   re   r   r   r   rS   3  r   zMnliProcessor.get_labelsr%   r   c                 C   n   g }t | }| D ]*\}}| d| }t|d }t|d }|dd }	t||||	d}
||
 q
|S )Nr*   r   r   Z
gold_labelrB   r   r   r2   r'   iterrowsr   r   r   r]   r%   r   r`   dfrl   rowrB   r   r   r2   r5   r   r   r   r   6     zMnliProcessor._create_examplesNr   ro   rp   rq   r   rP   rK   r   r   rM   rR   rS   r   r$   r   r   r   r   r   rR  !  s    
 rR  c                   @   s,   e Zd ZdZd	ddZdee fddZdS )
MnliMismatchedProcessorz>Processor for the MultiNLI mismatched data set (GLUE version).Fc                 C   r   )Nzdev_mismatched.tsvZdev_mismatchedr   r   r   r   r   rK   J  r   z(MnliMismatchedProcessor.get_dev_examplesr   c                 C   r   )Nztest_mismatched.tsvZtest_mismatchedr   r   r   r   r   rM   N  r   z)MnliMismatchedProcessor.get_test_examplesNr   )ro   rp   rq   r   rK   r   r   rM   r   r   r   r   r`  G  s    
r`  c                   @   rQ  )AgnewsProcessorz#Processor for the AG news data set.c                 C   r   Nz	train.csvr   r   r   r   r   r   rP   V  r   z"AgnewsProcessor.get_train_examplesFc                 C   r   Nztest.csvr   r   r   r   r   r   rK   Z  r)  z AgnewsProcessor.get_dev_examplesr   c                 C      t  r   NotImplementedErrorr   r   r   r   rM   ]     z!AgnewsProcessor.get_test_examplesc                 C   rT  r   rU  r   r   r   r   rR   `  rV  z&AgnewsProcessor.get_unlabeled_examplesc                 C   r   )N)r@  234r   re   r   r   r   rS   c  r   zAgnewsProcessor.get_labelsr%   r   c                 C   s   g }t | ddA}tj|dd}t|D ],\}}|\}}}	d||f }
t|dd}t|	dd}t|
|||d}|| qW d    |S 1 sLw   Y  |S )	Nr4  r   ,	delimiterr   \r9   rX  )rx   r"   readerr   r   r,  r   r]   )r%   r   r`   r   ro  rl   r]  r2   ZheadlinebodyrB   r   r   r5   r   r   r   r   f  s&   


z AgnewsProcessor._create_examplesNr   r_  r   r   r   r   ra  S      
 ra  c                   @   rQ  )YahooAnswersProcessorz)Processor for the Yahoo Answers data set.c                 C   r   rb  r   r   r   r   r   rP   }  r   z(YahooAnswersProcessor.get_train_examplesFc                 C   r   rc  r   r   r   r   r   rK     r)  z&YahooAnswersProcessor.get_dev_examplesr   c                 C   rd  r   re  r   r   r   r   rM     rg  z'YahooAnswersProcessor.get_test_examplesc                 C   rT  r   rU  r   r   r   r   rR     rV  z,YahooAnswersProcessor.get_unlabeled_examplesc                 C   r   )N)
r@  rh  ri  rj  5678910r   re   r   r   r   rS     r   z YahooAnswersProcessor.get_labelsr%   r   c              	   C   s   g }t | dd[}tj|dd}t|D ]F\}}|\}}}	}
d||f }d|dddd|	ddddg}t|}|
dddd}t|}t||||d	}|| qW d    |S 1 sfw   Y  |S )
Nr   r   rk  rl  r   r9   \nrn  rX  )	rx   r"   ro  r   r   r,  r   r   r]   )r%   r   r`   r   ro  rl   r]  r2   Zquestion_titleZquestion_bodyr  rB   r   r   r5   r   r   r   r     s,   
z&YahooAnswersProcessor._create_examplesNr   r_  r   r   r   r   rr  z  rq  rr  c                   @   rQ  )YelpPolarityProcessorz1Processor for the YELP binary classification set.c                 C   r   rb  r   r   r   r   r   rP     r   z(YelpPolarityProcessor.get_train_examplesFc                 C   r   rc  r   r   r   r   r   rK     r)  z&YelpPolarityProcessor.get_dev_examplesr   c                 C   rd  r   re  r   r   r   r   rM     rg  z'YelpPolarityProcessor.get_test_examplesc                 C   rT  r   rU  r   r   r   r   rR     rV  z,YelpPolarityProcessor.get_unlabeled_examplesc                 C   r   )Nr@  rh  r   re   r   r   r   rS     r   z YelpPolarityProcessor.get_labelsr%   r   c                 C   s   g }t | dd=}tj|dd}t|D ](\}}|\}}d||f }	|dddd}
t|
}
t|	|
|d	}|| qW d    |S 1 sHw   Y  |S )
Nr4  r   rk  rl  r   ry  r9   rn  rB   r   r2   )rx   r"   ro  r   r,  r   r   r]   )r%   r   r`   r   ro  rl   r]  r2   rp  rB   r   r5   r   r   r   r     s   
z&YelpPolarityProcessor._create_examplesNr   r_  r   r   r   r   rz    rq  rz  c                   @   s*   e Zd ZdZdee fddZdd ZdS )YelpFullProcessorz/Processor for the YELP full classification set.r   c                 C   rd  r   re  r   r   r   r   rM     rg  z#YelpFullProcessor.get_test_examplesc                 C   r   )N)r@  rh  ri  rj  rs  r   re   r   r   r   rS     r   zYelpFullProcessor.get_labelsN)ro   rp   rq   r   r   r   rM   rS   r   r   r   r   r|    s    r|  c                       s|   e Zd ZdZddef fddZdd Zdd	d
Zdee	 fddZ
dee	 fddZdd Zdedee	 fddZ  ZS )XStanceProcessorz$Processor for the X-Stance data set.Nlanguagec                    s*   t  | |d ur|dv sJ || _d S )N)defr)r   rd   r~  )ra   r   r~  r   r   r   rd     s   
zXStanceProcessor.__init__c                 C      |  tj|dS )Nr   r   r   r   r   r   rP        z#XStanceProcessor.get_train_examplesFc                 C   r  )Nr   r   r   r   r   r   rK     r  z!XStanceProcessor.get_dev_examplesr   c                 C   rd  r   re  r   r   r   r   rM     rg  z"XStanceProcessor.get_test_examplesc                 C   rT  r   rU  r   r   r   r   rR     rV  z'XStanceProcessor.get_unlabeled_examplesc                 C   r   )NZFAVORZAGAINSTr   re   r   r   r   rS     r   zXStanceProcessor.get_labelsr%   c                 C   s   g }t |ddC}|D ]7}t|}|d }|d }t|d }t|d }	|d }
| jd ur5|
| jkr5qt|||	|d}|| qW d    |S 1 sNw   Y  |S )	Nr   r   r2   r5  r   commentr~  rX  )rx   r{   r   r   r~  r   r]   )ra   r%   r`   r   r   r   r2   Zid_r   r   r~  r5   r   r   r   r     s(   

z!XStanceProcessor._create_examplesr   r   )ro   rp   rq   r   r$   rd   rP   rK   r   r   rM   rR   rS   r   r   r   r   r   r   r}    s    
r}  c                   @   sV   e Zd Zdd ZdddZdee fddZd	d
 Ze	de
de
dee fddZdS )Sst2Processorc                 C   r   rS  r   r   r   r   r   rP     r   z Sst2Processor.get_train_examplesFc                 C   r   )Nzdev.tsvr   r   r   r   r   r   rK     r)  zSst2Processor.get_dev_examplesr   c                 C   r   )Nztest.tsvr   r   r   r   r   r   rM     r   zSst2Processor.get_test_examplesc                 C   r   r>  r   re   r   r   r   rS     r   zSst2Processor.get_labelsr%   r   c           
      C   s`   g }t | }| D ]#\}}| d| }t|d }|dd }t|||d}	||	 q
|S )Nr*   sentencer2   r{  rY  
r%   r   r`   r\  rl   r]  rB   r   r2   r5   r   r   r   r     s   zSst2Processor._create_examplesNr   )ro   rp   rq   rP   rK   r   r   rM   rS   r   r$   r   r   r   r   r   r    s    
 r  c                   @   2   e Zd Zdd Zedededee fddZdS )	ColaProcessorc                 C   r   r>  r   re   r   r   r   rS   #  r   zColaProcessor.get_labelsr%   r   r   c           
      C   s   g }|dkrt | d d}nt | }| D ].\}}| d| }|dkr/t|d }|d }nt|d }d }t|||d}	||	 q|S )Nr   )headerr*   r+  rg   r  r{  )r'   rZ  r   r   r]   r  r   r   r   r   &  s   
zColaProcessor._create_examplesN	ro   rp   rq   rS   r   r$   r   r   r   r   r   r   r   r  !       r  c                   @   r  )	MrpcProcessorc                 C   r   r>  r   re   r   r   r   rS   >  r   zMrpcProcessor.get_labelsr%   r   r   c                 C   rW  )Nr*   z	#1 Stringz	#2 StringZQualityrX  rY  r[  r   r   r   r   A  r^  zMrpcProcessor._create_examplesNr  r   r   r   r   r  <  r  r  c                   @   r  )	QqpProcessorc                 C   r   r>  r   re   r   r   r   rS   T  r   zQqpProcessor.get_labelsr%   r   r   c                 C   rW  )Nr*   Z	question1Z	question2Zis_duplicaterX  rY  r[  r   r   r   r   W  r^  zQqpProcessor._create_examplesNr  r   r   r   r   r  R  r  r  c                   @   r  )	QnliProcessorc                 C   r   r   r   re   r   r   r   rS   j  r   zQnliProcessor.get_labelsr%   r   r   c                 C   rW  )Nr*   r   r  r2   rX  rY  r[  r   r   r   r   m  r^  zQnliProcessor._create_examplesNr  r   r   r   r   r  h  r  r  c                   @   sD   e Zd Zdd ZdddZdd Zeded	ed
ee	 fddZ
dS )SquadProcessorc                 C   r   )Nztrain-v2.0.jsonr   r   r   r   r   r   rP     r   z!SquadProcessor.get_train_examplesFc                 C   r   )Nzdev-v2.0.jsonr   r   r   r   r   r   rK     r   zSquadProcessor.get_dev_examplesc                 C   s   dgS )Nr?  r   re   r   r   r   rS     rg  zSquadProcessor.get_labelsr%   r   r   c                 C   s   g }t | dd}t|d }W d    n1 sw   Y  t|D ]J\}}t|d D ]?\}}|d }	t|d D ]0\}
}t|d dkrHq;| d	| d	| d	|
 }t||	|d
 dd|d d id}|| q;q-q#|S )Nr4  r   r   Z
paragraphsr;  rH  r  r   r*   r   r?  r  )rB   r   r   r2   r   )rx   r{   loadr   rW   r   r]   )r%   r   r`   r   r   rl   r   pidZ	paragraphr;  ZqidrH  rB   r5   r   r   r   r     s,   zSquadProcessor._create_examplesNr   )ro   rp   rq   rP   rK   rS   r   r$   r   r   r   r   r   r   r   r  ~  s    
 r  >   multirccbrter-   wicboolqZcoparecordZmnlizmnli-mmZagnewsZyahoozyelp-polarityz	yelp-fullz
xstance-dec                   C      t dS )Nr  r}  r   r   r   r   r<         r<   z
xstance-frc                   C   r  )Nr  r  r   r   r   r   r<     r  Zxstancer  r  r  r-   r   r  r  )r  zax-gzax-bZsst2ZcolaZmrpcZqqpZqnlisquadZracer  )Kr   r   r"   r8  r   rK  r-  abcr   r   collectionsr   r   typingr   r   r   r{   numpyrj   Zpandasr!   Z
data_utilsr	   r
   r   Zdata_utils.corporar   Ztorch.utils.datar   r   utilsr   Z+modelscope.models.nlp.mglm.tasks.data_utilsr   Z.modelscope.models.nlp.mglm.tasks.superglue.pvpr   rO   rJ   rL   rN   rQ   rU   r   r'   r(   rr   r   r   r   r   r   r   r   r   r   r  r"  r=  rR  r`  ra  rr  rz  r|  r}  r  r  r  r  r  r  ZCLASSIFICATION_DATASETSZMULTI_CHOICE_DATASETSr   r   r   r   r   <module>   s   
x^"(# m\E 8&'+%
.%	
