o
    *jŲ                     @   sd  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlZddlZddlZddlZddlZddlZddlmZ ddlmZ ddlmZ d	d
lmZmZ G dd dejZG dd dejZG dd dejZd ddZ G dd dejZ!G dd dejZ"G dd dejZ#G dd dejZ$G dd dejZ%G dd dejZ&dS )!z2dataset objects for jsons, csvs, and BERT datasets    N)bisect_right)
accumulate)
itemgetter)tokenize)data)print_rank_0   )
LazyLoaderexists_lazyc                   @   s$   e Zd Zdd Zdd Zdd ZdS )ShuffleDatasetc                    sr   | _ ttt j  _t j t|do|j _ jr7 fdd jD  _	 fdd jD  _
d S d S )Nis_lazyc                       g | ]} j j| qS  )dsprompt_lens.0idxselfr   o/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/mglm/data_utils/datasets.py
<listcomp>.       z+ShuffleDataset.__init__.<locals>.<listcomp>c                    r   r   )r   	text_lensr   r   r   r   r   1   r   )r   listrangelenshuffle_idsrandomshufflehasattrr   r   r   )r   r   r   r   r   __init__(   s   

zShuffleDataset.__init__c                 C      | j | j|  S N)r   r   r   r   r   r   r   __getitem__5      zShuffleDataset.__getitem__c                 C   
   t | jS r#   )r   r   r   r   r   r   __len__8      
zShuffleDataset.__len__N)__name__
__module____qualname__r!   r%   r(   r   r   r   r   r   &   s    r   c                       sx   e Zd ZdZedd Z fddZdd Zdd	 Zd
d Z	dd Z
dd Zedd Zedd Zedd Z  ZS )ConcatDataseta'  
    Dataset to concatenate multiple datasets.
    Purpose: useful to assemble different existing datasets, possibly
    large-scale datasets as the concatenation operation is done in an
    on-the-fly manner.
    Arguments:
        datasets (sequence): List of datasets to be concatenated.
    c                 C   s6   g d}}| D ]}t |}|||  ||7 }q|S Nr   )r   append)sequencerselr   r   r   cumsumF   s   

zConcatDataset.cumsumc                    sp   t t|   t|dksJ dt|| _tdd | jD t| jk| _| | j| _	d | _
d | _d | _d S )Nr   z(datasets should not be an empty iterablec                 S   s&   g | ]}t |tpt|d o|jqS )r   )
isinstancer	   r    r   )r   r   r   r   r   r   S   s
    
z*ConcatDataset.__init__.<locals>.<listcomp>)superr-   r!   r   r   datasetssumr   r5   cumulative_sizes_X_Y_lens)r   r8   kwargs	__class__r   r   r!   O   s   

zConcatDataset.__init__c                 C   s<   t | j|}|dkr|}n	|| j|d   }| j| |S Nr   r   )r   r:   r8   get_text_lenr   r   Zdataset_idx
sample_idxr   r   r   rB   \   s
   zConcatDataset.get_text_lenc                 C   s   | j D ]}|| qd S r#   )r8   SetTokenizer)r   	tokenizerr   r   r   r   rE   d   s   
zConcatDataset.SetTokenizerc                 C   s   | j d  S r.   )r8   GetTokenizerr   r   r   r   rG   h   s   zConcatDataset.GetTokenizerc                 C   s
   | j d S )N)r:   r   r   r   r   r(   k   r)   zConcatDataset.__len__c                 C   s:   t | j|}|dkr|}n	|| j|d   }| j| | S rA   )r   r:   r8   rC   r   r   r   r%   n   s
   zConcatDataset.__getitem__c                 C   s^   | j d u r,g | _ | jr| jD ]	}| j |j q| j S | jD ]}| j dd |D  q| j S )Nc                 S   *   g | ]}t |trt|d  nt|qS textr6   dictr   r   dr   r   r   r          z&ConcatDataset.lens.<locals>.<listcomp>)r=   r   r8   extendlensr   r   r   r   r   rR   v   s   



zConcatDataset.lensc                 C   s0   | j d u rg | _ | jD ]	}| j |j q| j S r#   )r;   r8   rQ   XrS   r   r   r   rT      s
   

zConcatDataset.Xc                 C   sB   | j d u rg | _ | jD ]}| j t|j qt| j | _ | j S r#   )r<   r8   rQ   r   YnparrayrS   r   r   r   rU      s   

zConcatDataset.Y)r*   r+   r,   __doc__staticmethodr5   r!   rB   rE   rG   r(   r%   propertyrR   rT   rU   __classcell__r   r   r?   r   r-   <   s     	


r-   c                   @   s`   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	e
dd Ze
dd Zdd ZdS )SplitDatasetaq  
    Dataset wrapper to access a subset of another dataset.
    Purpose: useful to index into existing datasets, possibly
    large-scale datasets as the subindexing operation is done in an
    on-the-fly manner.
    Arguments:
        ds (Dataset or array-like): List of datasets to be subindexed
        split_inds (1D array-like): List of indices part of subset
    c                 K   s<   t || _|| _t|tpt|do|j| _d | _d | _d S )Nr   )	r   
split_indswrapped_datar6   r	   r    r   r;   r<   )r   r   r]   r>   r   r   r   r!      s   

zSplitDataset.__init__c                 C   r'   r#   )r   r]   r   r   r   r   r(      r)   zSplitDataset.__len__c                 C   s   | j | j| S r#   )r^   rB   r]   r$   r   r   r   rB      s   zSplitDataset.get_text_lenc                 C   r"   r#   )r^   r]   )r   indexr   r   r   r%      r&   zSplitDataset.__getitem__c                 C   s   | j | d S r#   )r^   rE   r   rF   r   r   r   rE      r&   zSplitDataset.SetTokenizerc                 C   s
   | j  S r#   )r^   rG   r   r   r   r   rG      r)   zSplitDataset.GetTokenizerc                 C   s$   | j d u rt| j | jj| _ | j S r#   )r;   r   r]   r^   rT   r   r   r   r   rT      s   
zSplitDataset.Xc                 C   s*   | j d u rtt| j | jj| _ | j S r#   )r<   rV   rW   r   r]   r^   rU   r   r   r   r   rU      s
   
zSplitDataset.Yc                 c   s    | j D ]}| j| V  qd S r#   )r]   r^   r$   r   r   r   __iter__   s   
zSplitDataset.__iter__N)r*   r+   r,   rX   r!   r(   rB   r%   rE   rG   rZ   rT   rU   ra   r   r   r   r   r\      s    


r\   Tc                 C   sX  |du rg d}t |}|dkrtdt|}|| }t| }t|}|r3tjd}|| |durLt	|}t||ksDJ t
d|  n|durdtj dkrdt|| td|  d}	d}
dgt| }t|D ]6\}}|dkr|||  }|
|d 7 }
tt||
 }||	|	t|d  }t| |||< |	|7 }	|
d; }
qs|S )	a  
    Split a dataset into subsets given proportions of how
    much to allocate per split. If a split is 0% returns None for that split.
    Purpose: Useful for creating train/val/test splits
    Arguments:
        ds (Dataset or array-like): Data to be split.
        split (1D array-like): proportions to split `ds`. `sum(splits) != 0`
        shuffle (boolean): Randomly split dataset. Default: True
        save_splits: save split indices to file
        load_splits: load split indices from file
    N)皙?g?        r   zSplit cannot sum to 0.i  zLoad split indices from zSave split indices to r   )r9   	ExceptionrV   rW   r   Zaranger   RandomStater   loadr   torchdistributedZget_ranksaveprint	enumerateintmaxr\   )r   splitr   Zsave_splitsZload_splitsZ	split_sumds_lenZindsrngZ	start_idxZresidual_idxZrtn_dsifZ
proportionZsplit_r]   r   r   r   split_ds   sD   



rs   c                   @   s^   e Zd ZdZ							dddZd	d
 Zdd Zedd Zdd Z	dd Z
dddZdS )csv_dataseta  
    Class for loading datasets from csv files.
    Purpose: Useful for loading data for unsupervised modeling or transfer tasks
    Arguments:
        path (str): Path to csv file with dataset.
        tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None
        preprocess_fn (callable): Callable that process a string into desired format.
        delim (str): delimiter for csv. Default: ','
        binarize_sent (bool): binarize label values to 0 or 1 if they're on a different scale. Default: False
        drop_unlabeled (bool): drop rows with unlabelled values. Always fills remaining empty
            columns with -1 (regardless if rows are dropped based on value) Default: False
        text_key (str): key to get text from csv. Default: 'sentence'
        label_key (str): key to get label from json dictionary. Default: 'label'
    Attributes:
        X (list): all strings from the csv file
        Y (np.ndarray): labels to train with
    N,Fsentencelabelc	              
   K   s4  d| _ || _| | || _|| _|| _|| _|| _d| jv r"d| _g | _g | _	z|g}
t
|tr6|
|7 }
n|
|g7 }
tj| j| j|
dd}W n   tj| j| j|gdd}Y |jdd}|| j | _z|| j| _	W n ty } ztt| jd | _	W Y d }~nd }~ww |rt| j	|d	| _	d S d S )
NFz.tsv	zlatin-1)sepZusecolsencodingr   ZaxisrH   Zhard)r   preprocess_fnrE   pathdelimtext_key	label_keydrop_unlabeledrT   rU   r6   r   pdZread_csvZdropnavaluestolistrd   rV   onesr   binarize_labels)r   r~   rF   r}   r   binarize_sentr   r   r   r>   colsr   r3   r   r   r   r!     sL   






"zcsv_dataset.__init__c                 C   6   |d u rd| _ t| ds|| _d S d S d| _ || _d S NF
_tokenizerTusing_tokenizerr    r   r`   r   r   r   rE   ?     


zcsv_dataset.SetTokenizerc                 C      | j S r#   r   r   r   r   r   rG   H     zcsv_dataset.GetTokenizerc                 C      | j r| jS d S r#   r   r   r   r   r   r   rF   K     zcsv_dataset.tokenizerc                 C   r'   r#   r   rT   r   r   r   r   r(   Q  r)   zcsv_dataset.__len__c                 C      | j | }| jdur| j|| j}n
| jdur| |}| j| }t|tr?| jdur5| j|| j}n
| jdur?| |}|t||dS )z=process+tokenize string and return string,label,and stringlenNrK   lengthrw   rT   rF   EncodeAsIdsr}   rU   r6   strr   r   r_   xyr   r   r   r%   T     








zcsv_dataset.__getitem__c           	      C   s  |du r	| j d }td|  t|di}tj|| jd}|durU|s6| jftt| | j	f }|
| t|D ]\}}| j| ft| | j| f }|
| q:n|
| j| j	g t| j| jD ]}|
| qeW d   dS W d   dS 1 sw   Y  dS )z
        given a generator of metrics for each of the data points X_i,
            write the metrics, text, and labels to a csv file
        N.resultszgenerating csv at w)	delimiter)r~   rj   opencsvwriterr   r   tuplenextr   writerowrk   rU   rT   zip)	r   
writer_genr~   skip_headerZcsvfilecheaderrq   rowr   r   r   writec  s0   

 "zcsv_dataset.write)NNru   FFrv   rw   NNF)r*   r+   r,   rX   r!   rE   rG   rZ   rF   r(   r%   r   r   r   r   r   rt      s"    
2	
rt   c                   @   sl   e Zd ZdZ						dddZdd	 Zd
d Zedd Zdd Z	dd Z
dddZdd Zdd ZdS )json_dataseta   
    Class for loading datasets from a json dump.
    Purpose: Useful for loading data for unsupervised modeling or transfer tasks
    Arguments:
        path (str): path to json file with dataset.
        tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None
        preprocess_fn (callable): callable function that process a string into desired format.
            Takes string, maxlen=None, encode=None as arguments. Default: process_str
        text_key (str): key to get text from json dictionary. Default: 'sentence'
        label_key (str): key to get label from json dictionary. Default: 'label'
    Attributes:
        all_strs (list): list of all strings from the dataset
        all_labels (list): list of all labels from the dataset (if they have it)
    NFrv   rw   c                 K   s   d| _ || _|| _| | g | _g | _|| _|| _|| _| 	| jD ]}	|	| }
| j
|
 | j
|	|  q#|rDt| j|d| _d S d S )NFr|   )r   r}   r~   rE   rT   rU   r   r   
loose_jsonload_json_streamr/   r   )r   r~   rF   r}   r   r   r   r   r>   jr2   r   r   r   r!     s    	
zjson_dataset.__init__c                 C   r   r   r   r`   r   r   r   rE     r   zjson_dataset.SetTokenizerc                 C   r   r#   r   r   r   r   r   rG     r   zjson_dataset.GetTokenizerc                 C   r   r#   r   r   r   r   r   rF     r   zjson_dataset.tokenizerc                 C   r   )z)gets the index'th string from the datasetNr   r   r   r   r   r   r%     r   zjson_dataset.__getitem__c                 C   r'   r#   r   r   r   r   r   r(     r)   zjson_dataset.__len__c                    sX   |du r	j d }durfdd nfdd  fdd}||  dS )z
        given a generator of metrics for each of the data points X_i,
            write the metrics, text, and labels to a json file
        Nr   c                  3   s    i }  j | d< stttD ]
\}}|| |d < qtD ]:\}}|dkr=r=t|D ]\}}d|f | |d < q/i }t j| ft| D ]\}}| | }|||< qK|V  q!d S )Nr   r   z	metric_%d)r   rk   r   r   rU   )keysr   krq   r   _r   v)r   r   r   r   r   
gen_helper  s    
 
z&json_dataset.write.<locals>.gen_helperc                  3   s&     j D ]} i }| | j< |V  qd S r#   )rU   r   )r   r   r   r   r   r     s   

c                  3   s0    t   D ]\} }j|  |j< |V  qd S r#   )rk   rT   r   )rq   r   )r   r   r   r   
out_stream  s
   z&json_dataset.write.<locals>.out_stream)r~   save_json_stream)r   r   r~   r   r   r   )r   r   r   r   r   r     s   
zjson_dataset.writec                 C   s   | j r8t|d&}t|D ]\}}d}|dkrd}|t|7 }|| qW d    d S 1 s1w   Y  d S dd |D }tj|t|ddd d S )	Nr    r   
c                 S   s   g | ]}|qS r   r   )r   r   r   r   r   r     s    z1json_dataset.save_json_stream.<locals>.<listcomp>)ru   :)
separators)r   r   rk   jsondumpsr   dump)r   Z	save_pathZjson_streamrr   rq   r   Zwrite_stringjsonsr   r   r   r     s   "zjson_dataset.save_json_streamc                 #   s`    | j stt ddd}t|}n	 fdd}| }|D ]}| j|vr*d|| j< |V  qd S )Nr1   utf-8rz   c                  3   sL    t  ddd} | D ]}t|V  qW d    d S 1 sw   Y  d S )Nr1   r   r   )r   r   loads)rr   r   	load_pathr   r   r     s   "z1json_dataset.load_json_stream.<locals>.gen_helperrH   )r   r   rf   r   iterr   )r   r   r   	generatorr   r   r   r   r   r      s   


zjson_dataset.load_json_stream)NNFrv   rw   Fr   )r*   r+   r,   rX   r!   rE   rG   rZ   rF   r%   r(   r   r   r   r   r   r   r   r   |  s$    
	

(r   c                   @   sF   e Zd Z			dddZdd Zdd	 Zd
d Zdd ZdddZdS )	XLDataset   NTc                 K   s\   || _ || _|| _|d u r|}|| _|| _d\| _| _t| j dr(| j jr(d| _| 	  d S )NNNr   T)
r   rF   max_seq_lenmem_lensample_across_docindicesnum_samplesr    r   init_indices)r   r   rF   r   r   r   r>   r   r   r   r!     s   zXLDataset.__init__c                    s    j rt fddtt jD }ntdd  jD }tt| _t	dt| d jd    jd  j
 d  _d S )Nc                       g | ]} j |qS r   r   rB   r   r   r   r   r   +      z*XLDataset.init_indices.<locals>.<listcomp>c                 S   s6   g | ]}t |trt|d  t|d  nt|qS )promptrK   rL   rN   r   r   r   r   -  s    
Dataset document count , token count rH   r   )r   rV   rW   r   r   r   r   r   r   r   r   r   r   rR   r   r   r   r   (  s   
zXLDataset.init_indicesc                 C   r   r#   r   r   r   r   r   r(   8  r   zXLDataset.__len__c                 C   sZ   |  |\}}}}| |}| |}| j|dd}t|t|t|t|dS )Nr   pad_id)rK   target	loss_maskattention_mask)getidxpad_seqrV   rW   )r   r   tokenstargetsr   r   r   r   r   r%   ;  s   

zXLDataset.__getitem__c                 C   s  g g g }}}t jt j| j| jftdt j| j| jftdfdd}t| j|| j }|dkr2dn| j|d  }|| j | }|dkrZt	| j|}	d|d d | j |	 | j f< d}
t
|| jk r|t
| jk r| j| }|d |d }}|| jdjg }t	t
|d || j t
| }|dg }|
dkrt
|}d||d d || j f< |||| 7 }|||d |d  7 }|||d |d  7 }|
d7 }
|d7 }d}t
|| jk r|t
| jk sj||||fS )N)Zdtyper   r{   r   r   
loss_maskseos)rV   ZconcatenateZzerosr   r   rl   r   r   r   minr   r   rF   get_commandId)r   r   r   r   r   r   rD   Zlast_endZtoken_offsethistorycountitemrK   Zmasksendcurrentr   r   r   r   G  sH   

zXLDataset.getidxc                 C   @   | j }td|t| }||d u r| jdjn|g| 7 }|S Nr   padr   rm   r   rF   r   r   r   seqr   Ztotal_tokensZnum_pad_tokensr   r   r   r   g     zXLDataset.pad_seq)r   NTr#   )	r*   r+   r,   r!   r   r(   r%   r   r   r   r   r   r   r     s    
 r   c                   @   s`   e Zd Z				dddZdd Zd	d
 Zdd Zdd Zdd Zdd Z	dddZ
dd ZdS )BlockDatasetr   Trc   Fc           	      K   s   || _ t| j | _d| j | _|| _|| _|| _|| _|| _d\| _	| _
d| _| jr8ddl}|d| _td t| j drE| j jrEd	| _|   dS )
Z
        sentence_start: the stripped article must start with a complete sentence
          r   Fr   Nz/mnt/lid.176.binzLoad language detection modelr   T)r   r   ro   r   r   rF   r   non_sentence_startfilter_english	weighting	total_lenr   fasttextZ
load_modelmodelr   r    init_weighting)	r   r   rF   r   r   r   r   r>   r   r   r   r   r!   s  s"   zBlockDataset.__init__c                    s    j rt fddtt jD }ntdd  jD }t| _tdt| d j d j	  t
t| _d S )Nc                    r   r   r   r   r   r   r   r     r   z/BlockDataset.init_weighting.<locals>.<listcomp>c                 S   rI   rJ   rL   rN   r   r   r   r     rP   r   r   z, non sentence start)r   rV   rW   r   r   r   r9   r   r   r   r   r   r   r   r   r   r   r     s   
zBlockDataset.init_weightingc                 C   s~   	 | | j}t| j|}| |\}}| jr9| j|d d }| j	|
ddd d }|dkr8	 ||fS n	 ||fS q)NTr   r   r   r   Z__label__en)randintr   r   r   r   r   rF   Z	DecodeIdsr   Zpredictreplace)r   np_rngr   data_idxr   r   rK   langr   r   r   get_weighted_samples  s   z!BlockDataset.get_weighted_samplesc                 C   r   r#   r   r   r   r   r   r(     r   zBlockDataset.__len__c                    s  t | tj j fddtdD d |  \}}t|}|| j d }|dkrd} |}   | j	kr   dk rp|| jd k ro|dkro| 
||d  so|d8 }|d7 }|| jd k ro|dkro| 
||d  rSn4|| jd k r|t|k r| 
||d  s|d7 }|d7 }|| jd k r|t|k r| 
||d  r| jd	jg||d   }dg||d   }t|dkr|d | jd
jkrg g }}| ||| j\}}n]| jd	jg| }dg| }| jr9t|| jk r9|  \}}	| jd	jg| }dg|	 }	t|| jt| k}
| ||	| jt| \}}	||7 }||	7 }|
r2nt|| jk st|t|dS )Nc                       g | ]}  d dqS r   l    r  r   r   rp   r   r   r     r   z,BlockDataset.__getitem__.<locals>.<listcomp>   seedr   r         ?   ENCr   rK   r   )r   RandomrV   re   r   r  r   r   r  r   contains_sentence_endrF   r   r   right_strip_seqr   rW   )r   r   r   r   
num_tokenstokens_to_stripZ
move_countstrip_left_tokens
new_tokensnew_loss_maskZis_lastr   r  r   r%     s   










zBlockDataset.__getitem__c                 C   s   t || }|dkrP|t |d k r2| || d  s2|d7 }|t |d k r2| || d  rt || |d k rBt || }|d |  }|d |  }||fS )Nr   r   r  )r   r  )r   r   r   Z
seq_lengthZstrip_right_tokensr   r   r   r    s4   zBlockDataset.right_strip_seqc                 C   B   | j | }|d |d }}|| jdjg }|dg }||fS Nr   r   r   r   r   rF   r   r   r   r  r   r   r   r   r   r   r     
   

zBlockDataset.getidxNc                 C   r   r   r   r   r   r   r   r     r   zBlockDataset.pad_seqc                 C   sX   | j |}d|v rdS d|v rdS d|v rdS d|v rdS d|v r$dS d|v r*dS dS )	N.T?!;r   r   FrF   Z	IdToTokenr   tokr   r   r   r    s   z"BlockDataset.contains_sentence_end)r   Trc   Fr#   )r*   r+   r,   r!   r   r  r(   r%   r  r   r   r  r   r   r   r   r   q  s    
8

r   c                   @   s\   e Zd Z						dddZdd Zd	d
 Zdd Zdd Zdd ZdddZ	dd Z
dS )GPT2Datasetr   NTFc	           
      K   s   || _ t| j | _|| _|du rd| j | _|| _|| _|| _|| _|| _|| _	d\| _
| _d| _t| j dr>| j jr>d| _|   dS )r   Nr   r   Fr   T)r   r   ro   r   r   rF   weightedr   random_across_doc_samplingsentence_startr   r   r   r    r   )
r   r   rF   r   r   r(  r   r)  r*  r>   r   r   r   r!     s    zGPT2Dataset.__init__c                    s    j r? jrt fddtt jD }ntdd  jD }t| _t	dt| d j  t
t| _d S d  _d S )Nc                    r   r   r   r   r   r   r   r   ;  r   z.GPT2Dataset.init_weighting.<locals>.<listcomp>c                 S   rI   rJ   rL   rN   r   r   r   r   =  rP   r   r   )r(  r   rV   rW   r   r   r   r9   r   r   r   r   r   r   r   r   r   r   7  s   

zGPT2Dataset.init_weightingc                 C   .   | j d ur|| j}t| j |S || jS r#   r   r  r   r   ro   r   r  r   r   r   r   r  I     
z GPT2Dataset.get_weighted_samplesc                 C   r   r#   r   r   r   r   r   r(   P  r   zGPT2Dataset.__len__c                    sv  t | tj j fddtdD d |  }| |\}}t|}|| j d }|dkr^ 	|d }||d  }||d  }t|| j d }|dkr^|d |  }|d |  }| j
rt|| jd k r| jrs|  }n|d | j }| |\}	}
||	7 }||
7 }t|| jd k sj|d | jd  }|d | jd  }| |}| j|dd}t|t|dS )	Nc                    r  r  r	  r
  r  r   r   r   W  r   z+GPT2Dataset.__getitem__.<locals>.<listcomp>r  r  r   r   r   r  )r   r  rV   re   r   r  r   r   r   r  r   r)  ro   r   rW   )r   r   r  r   r   r  r  r  Zstrip_right_rokensr  r  r   r  r   r%   S  s<   

	
zGPT2Dataset.__getitem__c                 C   r  r  r  r  r   r   r   r     r  zGPT2Dataset.getidxc                 C   sD   | j d }td|t| }||d u r| jdjn|g| 7 }|S )Nr   r   r   r   r   r   r   r   r     s   
zGPT2Dataset.pad_seqc                 C   s4   | j |}d|v rdS d|v rdS d|v rdS dS )Nr   Tr!  r"  Fr$  r%  r   r   r   r    s   z!GPT2Dataset.contains_sentence_end)r   NTTTFr#   )r*   r+   r,   r!   r   r  r(   r%   r   r   r  r   r   r   r   r'    s    
1

r'  c                   @   s   e Zd ZdZ							d#dd	Zd
d Zdd Zdd Zdd Zdd Z				d$ddZ
dd Zdd Zdd Zdd Zdd  Zd!d" ZdS )%BertSentencepairDataseta  
    Dataset containing sentencepairs for BERT training. Each index corresponds to a randomly generated sentence pair.
    Arguments:
        ds (Dataset or array-like): data corpus to use for training
        max_seq_len (int): maximum sequence length to use for a sentence pair
        mask_lm_prob (float): proportion of tokens to mask for masked LM
        max_preds_per_seq (int): Maximum number of masked tokens per sentence pair. Default: math.ceil(max_seq_len*mask_lm_prob/10)*10
        short_seq_prob (float): Proportion of sentence pairs purposefully shorter than max_seq_len
        dataset_size (int): number of random sentencepairs in the dataset. Default: len(ds)*(len(ds)-1)

       333333?N{Gz?FTc	           
      K   s   || _ t| j | _| j  | _t| jj | _| j 	d  || _
|| _|d u r3t|| d d }|| _|| _|| _| jd u rJ| j| jd  | _|| _| jsWtjddd || _|   d S )N
   r   Zpunktz./nltk)download_dir)r   r   ro   rG   rF   r   Ztext_token_vocabr   vocab_wordsrE   r   mask_lm_probmathceilmax_preds_per_seqshort_seq_probdataset_sizepresplit_sentencesnltkdownloadr(  get_weighting)
r   r   r   r6  r9  r:  r;  r<  r(  r>   r   r   r   r!     s&   

z BertSentencepairDataset.__init__c                 C   sh   | j r/t| jdr| jjrt| jj}ntdd | jD }t|| _t	t
|| _d S d | _d S )Nr   c                 S   rI   rJ   rL   rN   r   r   r   r     rP   z9BertSentencepairDataset.get_weighting.<locals>.<listcomp>)r(  r    r   r   rV   rW   rR   r9   r   r   r   r   r   r   r   r   r?    s   

z%BertSentencepairDataset.get_weightingc                 C   r+  r#   r,  r-  r   r   r   r    r.  z,BertSentencepairDataset.get_weighted_samplesc                 C   r   r#   )r;  r   r   r   r   r(     r   zBertSentencepairDataset.__len__c                    s2  t | tj j fddtdD d}| j}d}   | jk r) d|}d}d }d}d}|d u s;|d	k s;|d	k r]| | |\}}	}t	|d }t	|	d }|d u s;|d	k s;|d	k s;| 
||	| j \}}	| ||	| j| j| j \}
}}}t|
d t|
d	 t|t|t|t|d
}|S )Nc                    r  r  r	  r
  r  r   r   r     r   z7BertSentencepairDataset.__getitem__.<locals>.<listcomp>r  r  Fr  Tr   r   )rK   typesZ	is_randommaskmask_labelspad_mask)r   r  rV   re   r   r   r:  r  create_random_sentencepairr   truncate_seq_paircreate_masked_lm_predictionsr6  r9  r5  rW   rl   )r   r   r  target_seq_lengthZ	short_seqis_random_nextZlenaZlenbZtokensaZtokensbr   rA  rB  rC  sampler   r  r   r%     sD   

z#BertSentencepairDataset.__getitem__c                 C   sH   | d}| jrdd |D S g }|D ]}|dkr!|t| q|S )zsplit document into sentencesr   c                 S   s   g | ]}|r|qS r   r   )r   liner   r   r   r         z:BertSentencepairDataset.sentence_split.<locals>.<listcomp>r   )rn   r<  rQ   r   Zsent_tokenize)r   ZdocumentlinesrtnrJ  r   r   r   sentence_split
  s   
z&BertSentencepairDataset.sentence_splitr   c                 C   s:   | j |j}dt| }| j |jgt| }||fS )z%tokenize sentence and get token typesr   )rF   r   Ztokenizationr   get_typer   r   )r   sentZsentence_numZ	beginningendingr   Zstr_typetoken_typesr   r   r   sentence_tokenize  s   z)BertSentencepairDataset.sentence_tokenizec                 C   s    | j | }t|tr|d }|S )z*gets text of document corresponding to idxrK   )r   r6   rM   )r   r   rM  r   r   r   get_doc   s   

zBertSentencepairDataset.get_docc                 C   s  d}g }g }d}|dk rd}d}|du r6| j r| |}	n	|d| jd }	| | |	}|s2d}|du s|dt|d }
|
t|k r||
 }| |d|
dk|
t|k\}}|| || |t|7 }|
t|d ksv||krwn
|
d }
|
t|k sF|dk s|rbd}t|dkr|dt|}g }g }t	|D ]}|
||  |
||  qg }g }d}t|dks| dk rGd}|t| }d}|dk rFd}|du r|d| jd }|t||	k7 }| | |}|sd}|du s|dt|d }|t|k rB|| }| |d|dk|t|k\}}|t|7 }|
| |
| t||kr7n|d }|t|k s|dk snd}t	|t|D ]}|
||  |
||  qP||f||f|fS )z
        fetches a random sentencepair corresponding to rng state similar to
        https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L248-L294
        Nr   r   r  Fr  T)r(  r  r  ro   rN  rT  r   rS  r/   r   rQ   r   rl   )r   rG  rp   r  rH  Z	curr_strsZcurr_str_typesZcurr_lenZdoc_aZ	doc_a_idxZrandom_start_arv   Zsentence_typesZnum_atokens_atoken_types_ar   tokens_btoken_types_bZtarget_b_lengthZb_lenZdoc_bZ	doc_b_idxZrandom_start_bZ
sentence_bZnew_b_tokensZnew_b_typesr   r   r   rD  '  s   









z2BertSentencepairDataset.create_random_sentencepairc                 C   s   |\}}|\}}|d }		 t |}
t |}|
| }||	krn3t |t |kr+|}|}n|}|}t |dks7J | dk rH|d |d n|  |  q||f||ffS )z
        Truncate sequence pair according to original BERT implementation:
        https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L391
           Tr   r  r   )r   r   pop)r   abr   rp   rU  rV  rW  rX  Zmax_num_tokenslen_alen_btotal_lengthZtrunc_tokensZtrunc_typesr   r   r   rE  ~  s,   
z)BertSentencepairDataset.truncate_seq_pairc                 C   sL   || }|  dk r| jdj}n|  dk r|}n||}|||< |S )z
        helper function to mask `idx` token from `tokens` according to
        section 3.3.1 of https://arxiv.org/pdf/1810.04805.pdf
        rb   MASKr  )r   rF   r   r   choice)r   r   r   r@  r5  rp   rw   Z	new_labelr   r   r   
mask_token  s   
z"BertSentencepairDataset.mask_tokenc                 C   sL   t d| jt| }dgt| dg|  }|| jdjg| 7 }||fS )z$helper function to pad sequence pairr   r   r   )rm   r   r   rF   r   r   )r   r   Znum_padrC  r   r   r   r     s   zBertSentencepairDataset.pad_seqc                    sX  |\}}|\}	}
| j djg| | j djg |	 | j djg }|d g| |d g |
 |
d g }t| t|	}dd t D  fddt|D  }|| | t|\}}| t|\}}t|t	dt
tt|| }dgt| }dgt| }t|d	| D ]}d||< | |||||}|||< q||f|||fS )
z
        Mask sequence pair for BERT training according to:
        https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L338
        r  ry   r   c                 S   s   g | ]}|d  qS )r   r   r   r   r   r   r     rK  zHBertSentencepairDataset.create_masked_lm_predictions.<locals>.<listcomp>c                    s   g | ]}|d    qS )r  r   r   r]  r   r   r     r   r   rH   N)rF   r   r   r   r   r   r   r   r   rm   rl   roundsortedrb  )r   r[  r\  r6  r9  r5  rp   rU  rV  rW  rX  r   rR  r^  Zcand_indicesZoutput_tokensrC  Zoutput_typesr   Znum_to_predictrA  rB  r   rw   r   rc  r   rF    sH   


z4BertSentencepairDataset.create_masked_lm_predictions)r0  r1  Nr2  NFT)r   FF)r*   r+   r,   rX   r!   r?  r  r(   r%   rN  rS  rT  rD  rE  rb  r   rF  r   r   r   r   r/    s2    
'
Wr/  )NTNN)'rX   r   r7  osr   timebisectr   	itertoolsr   operatorr   r   r=  numpyrV   Zpandasr   rg   Ztqdmr   Ztorch.utilsr   Z modelscope.models.nlp.mglm.utilsr   Zlazy_loaderr	   r
   ZDatasetr   r-   r\   rs   rt   r   r   r   r'  r/  r   r   r   r   <module>   sB   [
4/  ^ ' 
