o
    *j                     @   s  d dl Z d dlZd dlmZmZmZmZmZmZm	Z	m
Z
mZ d dlZd dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlm Z m!Z! d d	l"m#Z# d d
l$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z; d dl<m=Z= d dl>m?Z?m@Z@ d dlAmBZB eB ZCdefddZDG dd dZEdS )    N)	AnyCallableDictIterableListMappingOptionalSequenceUnion)DatasetDatasetDictFeaturesIterableDatasetIterableDatasetDict)_PACKAGED_DATASETS_MODULES)DatasetRepository)DatasetContextConfig)LocalDataLoaderManagerLocalDataLoaderTypeRemoteDataLoaderManagerRemoteDataLoaderType)ExternalDatasetNativeIterableDataset)build_custom_dataset)DatasetDeleteManager)load_dataset_with_ctx)DatasetUploadManager)build_preprocessor)Config
ConfigDict)MS_DATASETS_CACHE)
DEFAULT_DATASET_NAMESPACEDEFAULT_DATASET_REVISIONREPO_TYPE_DATASETConfigFieldsDatasetFormationsDownloadModeHubsModeKeysTasks
UploadMode)is_relative_path)is_tf_availableis_torch_available)
get_loggerreturnc                 C   sJ   | d u rg } | S t | tr| g} | S tt| t| k r#td|  | S )Nz"List columns contains duplicates: )
isinstancestrlenset
ValueError)para r6   a/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/msdatasets/ms_dataset.pyformat_list)   s   
r8   c                +   @   sR  e Zd ZU dZdZdZeed< 	d^dee	e
eef dee fddZdd	 Zd
d Zdd Zedd Zedd Ze	d^dee	eef dedeed f fddZe	d^dee	eeee
ef dedeed f fddZeedeej dddde!j"e#ddde$ dddfdeee%f dee dee dee dee dee dee d ee d!eeee&e e'eeee&e f f f  d"ee! d#ee d$ee( d%ee) d&ee* d'ee$ d(ee d)ee) d*ee) deed ef f&d+d,Z+eeeddd-e,j-fd.ed/ededee dee d0ee* d1ee* d2ee) d3ee, ddfd4d5Z.eeddfd6ed7ed8ee d9ee d:ee ddfd;d<Z/eedddfd6ed=ed8ee d9ee d:ee d>e)ddfd?d@Z0eeefd.ededee dee def
dAdBZ1					-d_dCeee2e f dDee3e2e3 f dEedFe4dGe)f
dHdIZ5							-d`dJe*dKe)dDee3e2e3 f dCeee2e f dLe3dMe)dNe6ee7f dOeee2e f dPe)fdQdRZ8de	fdSdTZ9dUe6eef de	fdVdWZ:		-dadDee3e2e3 f dCeee2e f dGe)fdXdYZ;		-		dbdJe*dKe)dDee3e2e3 f dMe)dPe)dOeee2e f dCeee2e f fdZd[Z<		dcd'e$fd\d]Z=dS )d	MsDataseta  
    ModelScope Dataset (aka, MsDataset) is backed by a huggingface Dataset to
    provide efficient data access and local storage managements. On top of
    that, MsDataset supports the data integration and interactions with multiple
    remote hubs, particularly, ModelScope's own Dataset-hub. MsDataset also
    abstracts away data-access details with other remote storage, including both
    general external web-hosted data and cloud storage such as OSS.
    N_dataset_context_configds_instancetargetc                 C   sJ   || _ |d ur|| j jvrtdt| j j  d| || _d| _d S )Nz)"target" must be a column of the dataset(z
, but got F)_hf_dsfeatures	TypeErrorlistkeysr<   	is_custom)selfr;   r<   r6   r6   r7   __init__@   s   
zMsDataset.__init__c                 c   s0    | j D ]}| jd ur|| j V  q|V  qd S N)r=   r<   )rC   itemr6   r6   r7   __iter__L   s   

zMsDataset.__iter__c                 C   s
   | j | S rE   r=   )rC   keyr6   r6   r7   __getitem__S      
zMsDataset.__getitem__c                 C   
   t | jS rE   )r2   r=   rC   r6   r6   r7   __len__V   rK   zMsDataset.__len__c                 C   s   | j S rE   rH   rM   r6   r6   r7   r;   Y   s   zMsDataset.ds_instancec                 C   s   t | jtr
| jjS d S rE   )r0   r=   r   config_kwargsrM   r6   r6   r7   rO   ]   s   zMsDataset.config_kwargshf_dsr/   c                    s   t dt t|tr |S t|tr4t| dkr( tt	|
 S  fdd| D S t|tr= |S tdt| )z
        @deprecated
        This method is deprecated and may be removed in future releases, please use `to_ms_dataset()` instead.
        z@from_hf_dataset is deprecated, please use to_ms_dataset instead.   c                       i | ]
\}}| |qS r6   r6   .0kvclsr<   r6   r7   
<dictcomp>t       z-MsDataset.from_hf_dataset.<locals>.<dictcomp>z2"hf_ds" must be a Dataset or DatasetDict, but got )warningswarnDeprecationWarningr0   r   r   r2   rA   nextitervaluesitemsr   r?   type)rX   rP   r<   r6   rW   r7   from_hf_datasetd   s   



zMsDataset.from_hf_datasetc                    s   t |tr
 |S t |tr.t| dkr" tt| S  fdd| D S t |t	r7 |S t |t
r@ |S t |trI |S t |trmt| dkra tt| S  fdd| D S tdt| )z&Convert input to `MsDataset` instance.rQ   c                    rR   r6   r6   rS   rW   r6   r7   rY      rZ   z+MsDataset.to_ms_dataset.<locals>.<dictcomp>c                    rR   r6   r6   rS   rW   r6   r7   rY      rZ   z8"ds_instance" must be a Dataset or DatasetDict, but got )r0   r   r   r2   rA   r^   r_   r`   ra   r   r   r   r   r?   rb   )rX   r;   r<   r6   rW   r7   to_ms_dataset|   s&   






zMsDataset.to_ms_datasetFrQ   dataset_name	namespaceversionhubsubset_namesplitdata_dir
data_filesdownload_mode	cache_dirr>   use_streamingstream_batch_size
custom_cfgtokendataset_info_onlytrust_remote_codec           '      K   s  |rddl m} | }|| t|	ptj}	t|ptj}|tjk}t| t	s6t| t
s6tdt|  t| t
rO|du rAd}t|| i}tj||dS tj| } tj| }t| r| ddkr|s|s| d}|d  }|d  } |r| sd	|rtd
|  d |ddkrddl}ddl}z||j W n t y   |d Y nw t!d| |||||||||	|
|||d|}| t"v stj#| stj$| rt%|&t'j(}tj||d}t|tr||_)|r|j*dd|i| d|_+|S |tjkr!ddl,m&} || f|||||
||	j-|||d
|S |tjkrddl m} | }|j.|d |  t/d}|j0| ||d\}} t	| t	t1j2j-krt3d|d |  |||||
|d|	j-|||||d|
}!|!W  d   S 1 syw   Y  dS t4|}"|"&t5j6}tj||d}t|tr|"j7|_)|r|j*dd|i| d|_+|S |tj8krt9:dt; ddl<m=}# ddl>m?}$ |t@kr|$jA|_B|tCkr|$jD|_E|
tFkrddlGmH}% tjI|%ddd}
|
|_J|#|}&|&K  |&jLS d) a'
  Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.

            Args:
                dataset_name (str): Path or name of the dataset.
                    The form of `namespace/dataset_name` is also supported.
                namespace(str, optional): Namespace of the dataset. It should not be None if you load a remote dataset
                    from Hubs.modelscope,
                namespace (str, optional):
                    Namespace of the dataset. It should not be None if you load a remote dataset
                    from Hubs.modelscope,
                target (str, optional): Name of the column to output.
                version (str, optional): Version of the dataset script to load:
                subset_name (str, optional): Defining the subset_name of the dataset.
                data_dir (str, optional): Defining the data_dir of the dataset configuration. I
                data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
                split (str, optional): Which split of the data to load.
                hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope
                download_mode (DownloadMode or str, optional): How to treat existing datasets. default
                                                               DownloadMode.REUSE_DATASET_IF_EXISTS
                cache_dir (str, Optional): User-define local cache directory.
                use_streaming (bool, Optional): If set to True, no need to download all data files.
                                                Instead, it streams the data progressively, and returns
                                                NativeIterableDataset or a dict of NativeIterableDataset.
                stream_batch_size (int, Optional): The batch size of the streaming data.
                custom_cfg (str, Optional): Model configuration, this can be used for custom datasets.
                                           see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3
                token (str, Optional): SDK token of ModelScope.
                dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict).
                trust_remote_code (bool, Optional): If set to True, trust the remote code. Default to `False`.
                **config_kwargs (additional keyword arguments): Keyword arguments to be passed

            Returns:
                MsDataset (MsDataset): MsDataset object for a certain dataset.
            r   )HubApiz.dataset_name must be `str` or `list`, but got Nr<   )r<   /rQ   zUThe dataset_name should be in the form of `namespace/dataset_name` or `dataset_name`.z3Use trust_remote_code=True. Will invoke codes from z9. Please make sure that you can trust the external codes.Zenginepythoni)re   rf   rg   ri   rj   r<   rh   rk   rl   rm   cache_root_dirro   rp   rt   rq   T)load_dataset)
namerk   rl   rj   rn   r>   rm   revisionrr   	streaming)Zrepo_idZ	repo_type)re   rf   endpoint)pathrz   rk   rl   rj   rn   r>   Zdownload_configrm   r{   rr   r|   rs   rt   zMThe option `Hubs.virgo` is deprecated, will be removed in the future version.)VirgoDownloader)VirgoDatasetConfig)
CACHE_HOMEvirgorh   datasetszPlease adjust input args to specify a loading mode, we support following scenes: loading from local disk, huggingface hub and modelscope hub.r6   )MZmodelscope.hub.apiru   loginr&   REUSE_DATASET_IF_EXISTSr'   
modelscopeZhuggingfacer0   r1   r@   r?   rb   r   	from_dictr9   rd   osr~   
expanduserexistsr+   countrj   striploggerwarninggetcsvsysfield_size_limitmaxsizeOverflowErrorr   r   isdirisfiler   ry   r   ZHF_DATA_LOADERr:   to_custom_datasetrB   r   valueZget_endpoint_for_readr#   Zget_dataset_id_and_typer%   Zgeneralr   r   r   ZMS_DATA_LOADERdataset_context_configr   r[   r\   r]   Z-modelscope.msdatasets.data_loader.data_loaderr   modelscope.utils.constantr   r!   Zdefault_virgo_namespacerf   r"   Zdefault_dataset_versionrg   r    modelscope.utils.config_dsr   joinrx   processdataset)'re   rf   r<   rg   rh   ri   rj   rk   rl   rm   rn   r>   ro   rp   rq   rr   rs   rt   rO   ru   apiZis_huggingface_hubZdataset_instis_local_pathZdataset_name_splitZ
csv_moduler   r   ry   Z_apir}   Zdataset_id_on_hubZdataset_typeZdataset_resZremote_dataloader_managerr   r   r   Zvirgo_downloaderr6   r6   r7   load   sj  <









&


zMsDataset.loadTobject_namelocal_file_pathnum_processes	chunksizefilter_hidden_filesupload_modec	           
      C   s   	 t dt | stdt|||d}	t|ptj}tj	|r+|	j
| ||d dS tj|r>|	j| |||||d dS t| d)z
        @deprecated
        This method is deprecated and may be removed in future releases, please use git command line instead.
        zThe function `upload` is deprecated, please use git command or modelscope.hub.api.HubApi.upload_folder or modelscope.hub.api.HubApi.upload_file.zobject_name cannot be empty!re   rf   rg   )r   r   r   )Zobject_dir_nameZlocal_dir_pathr   r   r   r   z& is not a valid file path or directoryN)r[   r\   r]   r4   r   r*   	OVERWRITEr   r~   r   uploadr   Z
upload_dir)
r   r   re   rf   rg   r   r   r   r   Z_upload_managerr6   r6   r7   r     s:   

zMsDataset.uploaddataset_work_dir
dataset_idr{   
auth_tokengit_pathc                 C   sR   t dt t| ||||d}| }|rtd| dS td| dS )a  Clone meta-file of dataset from the ModelScope Hub.

        Args:
            dataset_work_dir (str): Current git working directory.
            dataset_id (str): Dataset id, in the form of your-namespace/your-dataset-name .
            revision (str, optional):
                revision of the model you want to clone from. Can be any of a branch, tag or commit hash
            auth_token (str, optional):
                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
                as the token is already saved when you login the first time, if None, we will use saved token.
            git_path (str, optional):
                The git command line path, if None, we use 'git'
        Returns:
            None
        zWThe function `clone_meta` is deprecated, please use git command line to clone the repo.Zrepo_work_dirr   r{   r   r   zAlready cloned repo to: {}zRepo dir already exists: {}N)	r[   r\   r]   r   cloner   infoformatr   )r   r   r{   r   r   _repoZclone_work_dirr6   r6   r7   
clone_meta  s"   zMsDataset.clone_metacommit_messageforcec                 C   s2   t dt t| d|||d}|j|||d dS )aU  Upload meta-file of dataset to the ModelScope Hub. Please clone the meta-data from the ModelScope Hub first.

        Args:
            dataset_work_dir (str): Current working directory.
            commit_message (str): Commit message.
            revision(`Optional[str]`):
                revision of the model you want to clone from. Can be any of a branch, tag or commit hash
            auth_token(`Optional[str]`):
                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
                as the token is already saved when you log in the first time, if None, we will use saved token.
            git_path:(`Optional[str]`):
                The git command line path, if None, we use 'git'
            force (Optional[bool]): whether to use forced-push.

        Returns:
            None

        zuThe function `upload_meta` is deprecated, please use git command or CLI `modelscope upload owner_name/repo_name ...`. r   )r   branchr   N)r[   r\   r]   r   push)r   r   r{   r   r   r   r   r6   r6   r7   upload_meta  s   zMsDataset.upload_metac                 C   s0   t |||d}|j| d}td|  d |S )as   Delete object of dataset. Please log in first and make sure you have permission to manage the dataset.

        Args:
            object_name (str): The object name of dataset to be deleted. Could be a name of file or directory. If it's
                directory, then ends with `/`.
                For example: your-data-name.zip, train/001/img_001.png, train/, ...
            dataset_name (str): Path or name of the dataset.
            namespace(str, optional): Namespace of the dataset.
            version (str, optional): Version of the dataset.

        Returns:
            res_msg (str): Response message.

        r   )r   zObject z successfully removed!)r   deleter   r   )r   re   rf   rg   Z_delete_managerZresp_msgr6   r6   r7   r     s   zMsDataset.deletecolumnspreprocessors	task_namedata_config	to_tensorc                 K   sz   t  stdt| jtr |d|i || jj t||S |dur,| j|||dS | j	  | jj
d||d | jS )aF  Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
           torch.utils.data.DataLoader.

        Args:
            preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
                every sample of the dataset. The output type of processors is dict, and each (numeric) field of the dict
                will be used as a field of torch.utils.data.Dataset.
            columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only if
                `to_tensor` is True). If the preprocessor is None, the arg columns must have at least one column.
                If the `preprocessors` is not None, the output fields of processors will also be added.
            task_name (str, default None):  task name, refer to :obj:`Tasks` for more details
            data_config (ConfigDict, default None): config dict for model object.
                Attributes of ConfigDict:
                    `preprocessor` (Callable, List[Callable], optional): preprocessors to deal with dataset
                    `type` (str): the type of task
                    `split_config` (dict, optional): get the split config for ExternalDataset
                    `test_mode` (bool, optional): is test mode or not
            to_tensor (bool, default None): whether convert the data types of dataset column(s) to torch.tensor or not.
            format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

        Returns:
            :class:`torch.utils.data.Dataset`

        z>The function to_torch_dataset requires pytorch to be installedpreprocessorN)r   r   torch)rb   r   format_kwargs)r-   ImportErrorr0   r=   r   updaterO   r   !_to_torch_dataset_with_processorsreset_format
set_format)rC   r   r   r   r   r   r   r6   r6   r7   to_torch_dataset+  s"   !

zMsDataset.to_torch_dataset
batch_sizeshuffle
collate_fndrop_remaindercollate_fn_args
label_colsprefetchc
           
   
   C   sj   t  std|dur| j|||||	||dS |du r"td dS | j  | jj||||||||	dS )a  Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like
           model.fit() or model.predict().

        Args:
            batch_size (int): Number of samples in a single batch.
            shuffle(bool): Shuffle the dataset order.
            preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
                every sample of the dataset. The output type of processors is dict, and each field of the dict will be
                used as a field of the tf.data. Dataset. If the `preprocessors` is None, the `collate_fn`
                shouldn't be None.
            columns (str or List[str], default None): Dataset column(s) to be loaded. If the preprocessor is None,
                the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of
                processors will also be added.
            collate_fn(Callable, default None): A callable object used to collect lists of samples into a batch. If
                the `preprocessors` is None, the `collate_fn` shouldn't be None.
            drop_remainder(bool, default None): Drop the last incomplete batch when loading.
            collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the`collate_fn`.
            label_cols (str or List[str], default None): Dataset column(s) to load as labels.
            prefetch (bool, default True): Prefetch data.

        Returns:
            :class:`tf.data.Dataset`

        z?The function to_tf_dataset requires Tensorflow to be installed.N)r   r   r   r   z?The `preprocessors` and the `collate_fn` should`t be both None.)r   r   r   r   )r,   r   _to_tf_dataset_with_processorsr   errorr=   r   to_tf_dataset)
rC   r   r   r   r   r   r   r   r   r   r6   r6   r7   r   ]  s<   $	
zMsDataset.to_tf_datasetc                 C   s   | j   | j S rE   )r=   r   rM   r6   r6   r7   to_hf_dataset  s   
zMsDataset.to_hf_datasetcolumn_mappingc                 C   s   | j   | j |S )a  
        Rename columns and return the underlying hf dataset directly
        TODO: support native MsDataset column rename.
        Args:
            column_mapping: the mapping of the original and new column names
        Returns:
            underlying hf dataset
        )r=   r   Zrename_columns)rC   r   r6   r6   r7   remap_columns  s   
	zMsDataset.remap_columnsc                    s  t |tr|n|g}t   fdd| jj D  g }g }|ritt| jfdd D }|D ]}|dd |	 D  q3dd }	| D ]}
|	||
 sct
d|
 d	 ||
 qL||
 qLd
d lG fdddjjj}|| j||| |S )Nc                       g | ]}| v r|qS r6   r6   rT   rI   )r   r6   r7   
<listcomp>      z?MsDataset._to_torch_dataset_with_processors.<locals>.<listcomp>c                    s   i | ]
}|t  | qS r6   nparrayrT   rU   )sampler6   r7   rY     rZ   z?MsDataset._to_torch_dataset_with_processors.<locals>.<dictcomp>c                 S      i | ]
\}}|t |qS r6   r   rS   r6   r6   r7   rY     s    c                 S   s    t | jt jpt | jt jS rE   )r   Z
issubdtypedtypeintegerZfloating)r   r6   r6   r7   is_numpy_number  s   zDMsDataset._to_torch_dataset_with_processors.<locals>.is_numpy_numberzData of column z  is non-numeric, will be removedr   c                       s>   e Zd Zdef fddZdd ZfddZdd	 Z  ZS )
zAMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDatasetr   c                    s4   t t  || _|| _|| _|| _|| _|| _d S rE   )	superr9   rD   r   preprocessor_listr   retained_numeric_columnsretained_unumeric_columnsr   )rC   r   r   r   r   r   r   	__class__r6   r7   rD     s   
zJMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__init__c                 S   rL   rE   )r2   r   rM   r6   r6   r7   rN     rK   zIMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__len__c                    s   | j r |S |S rE   )r   Z	as_tensor)rC   xr   r6   r7   type_converter  s   
zPMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.type_converterc                    sx   j |   fddjD }jD ]&}|  D ]\}}jr'|jv r/|||< q|jv r8|||< qq|S )Nc                    s,   i | ]}j r|jv r| | qS r6   )r   r   r   r   Z	item_dictrC   r6   r7   rY     s    
zaMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__getitem__.<locals>.<dictcomp>)r   r   r   ra   r   r   r   r   )rC   indexresr   rU   rV   r6   r   r7   rJ     s   



zMMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__getitem__)	__name__
__module____qualname__r   rD   rN   r   rJ   __classcell__r6   r   r   r7   MsMapDataset  s
    r   )r0   r@   r8   r=   r>   rA   r^   r_   r   ra   r   r   appendr   utilsdatar   )rC   r   r   r   r   r   r   
sample_res	processorr   rU   r   r6   )r   r   r   r7   r     sF   




&z+MsDataset._to_torch_dataset_with_processorsc                    sH  t |tr|n|gtt|}tt|   fddjj D dd ljj	
tjtjtjd}|rF|jtjd}dfdd	dd	jd jgd
fdd}	ddlm}
 |j|	|
d}rfdd}||}nt|dkr|dd }|dkr|j||d}|r||
}|S )Nc                    r   r6   r6   r   )cols_to_retainr6   r7   r     r   z<MsDataset._to_tf_dataset_with_processors.<locals>.<listcomp>r   )r   )buffer_sizeFc                    s`   t    fddD }D ]}|dd |j   D  q|r(|S tt| S )Nc                    s"   i | ]}|t j  | qS r6   )r   r   r=   r   )irC   r6   r7   rY     s   " zJMsDataset._to_tf_dataset_with_processors.<locals>.func.<locals>.<dictcomp>c                 S   r   r6   r   rS   r6   r6   r7   rY     s    
)intr   r=   ra   tupler@   r`   )r   Zreturn_dictr   r   )r   retained_columnsrC   r   r7   func  s   

z6MsDataset._to_tf_dataset_with_processors.<locals>.funcT)Zinput_signaturec                    s:   j | gfdd D d  fddtD S )Nc                    s   g | ]	} j |jqS r6   )ZdtypesZas_dtyper   )rT   val)tfr6   r7   r   -  s    zTMsDataset._to_tf_dataset_with_processors.<locals>.fetch_function.<locals>.<listcomp>)inpZToutc                    s   i | ]	\}}| | qS r6   r6   )rT   r   rI   outputr6   r7   rY   2  s    zTMsDataset._to_tf_dataset_with_processors.<locals>.fetch_function.<locals>.<dictcomp>)Znumpy_functionr`   	enumerater   )r  r   r  r  r7   fetch_function(  s   
z@MsDataset._to_tf_dataset_with_processors.<locals>.fetch_function)AUTOTUNE)Znum_parallel_callsc                    sV    fdd|   D }t| dkrtt|  } t|dkr'tt| }| |fS )Nc                    s   i | ]\}}| v r||qS r6   r6   )rT   rI   Ztensorr   r6   r7   rY   :  s    z_MsDataset._to_tf_dataset_with_processors.<locals>.split_features_and_labels.<locals>.<dictcomp>rQ   )ra   r2   r^   r_   r`   )Zinput_batchlabelsr
  r6   r7   split_features_and_labels9  s   
zKMsDataset._to_tf_dataset_with_processors.<locals>.split_features_and_labelsrQ   c                 S   s   t t|  S rE   )r^   r_   r`   )r   r6   r6   r7   <lambda>G  s    z:MsDataset._to_tf_dataset_with_processors.<locals>.<lambda>)r   )F)r0   r@   r8   r3   r=   r>   rA   Z
tensorflowr   r   Zfrom_tensor_slicesr   Zaranger2   Zint64r   functionZ
TensorSpecZtensorflow.data.experimentalr	  mapbatchr   )rC   r   r   r   r   r   r   r   Z
tf_datasetr  r	  r  r6   )r   r  r   r   r   r   rC   r  r7   r      sJ   




z(MsDataset._to_tf_dataset_with_processorsc                 K   sp  t  std|sdS d| _|du rd|v r|d}|tjkr"dnd}|d| }|du rBt|tj	r=t
|j	jdnt
dd}|t|d	 |j}d
|v rV|d
}t|}d|v rd|d}|du rwt|drw|j}	|	rwt|	|}t| jtr|t|d || jj t||jd| _dS |dur|dd}
| j||
d| _dS | j  | jjdd dS )a  Convert the input datasets to specific custom datasets by given model configuration and preprocessor.

        Args:
            custom_cfg (Config): The model configuration for custom datasets.
            preprocessor (Preprocessor, Optional): Preprocessor for data samples.
            mode (str, Optional): See modelscope.utils.constant.ModeKeys

        Returns:
            `MsDataset`
        z?The function to_custom_dataset requires pytorch to be installedNTmodetrainr  zdataset.)rb   )r  taskfieldr   )r   )cfgr   r   )r   r   r   )r-   r   rB   r   r(   ZTRAINZsafe_gethasattrr$   modelr   rb   r   dictr  popr)   Zfind_field_by_taskr   r   r0   r=   r   rO   r   r   r   r   )rC   rq   r   r  kwargsZ
ds_cfg_keyZdata_cfgr   
field_nameZpreprocessor_cfgr   r6   r6   r7   r   P  sZ   





zMsDataset.to_custom_datasetrE   )NNNNT)NNNNNNT)NT)NTNN)NN)>r   r   r   __doc__r=   r:   r   __annotations__r
   r   r   r   r   r   r1   rD   rG   rJ   rN   propertyr;   rO   classmethodr   r  rc   r   rd   staticmethodr!   r"   r'   r   r&   r   r    r   r@   r	   r   r   boolr   r   r*   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r6   r6   r6   r7   r9   3   s  
 	





	 g		D&&
6
	

B
V
Rr9   )Fr   r[   typingr   r   r   r   r   r   r   r	   r
   numpyr   r   r   r   r   r   r   Zdatasets.packaged_modulesr   Zmodelscope.hub.repositoryr   Z4modelscope.msdatasets.context.dataset_context_configr   Z5modelscope.msdatasets.data_loader.data_loader_managerr   r   r   r   Z!modelscope.msdatasets.dataset_clsr   r   Z9modelscope.msdatasets.dataset_cls.custom_datasets.builderr   Z(modelscope.msdatasets.utils.delete_utilsr   Z,modelscope.msdatasets.utils.hf_datasets_utilr   Z(modelscope.msdatasets.utils.upload_utilsr   Zmodelscope.preprocessorsr   Zmodelscope.utils.configr   r   r   r    r   r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   Zmodelscope.utils.file_utilsr+   Zmodelscope.utils.import_utilsr,   r-   Zmodelscope.utils.loggerr.   r   r8   r9   r6   r6   r6   r7   <module>   s0   ,0
