o
    *j                     @   sn  U d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZ ddlmZ ddlZddlmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%mZ& zddlmZ' W n e(yt   dZ'Y nw dd	l)m*Z* dd
l+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZA ddlBmCZC ddlDmEZE ddlFmGZG ddlHmIZI ddlJmKZL ddlJmMZMmNZNmOZO ddlPmQZQ ddlRmSZS ddlTmUZU ddlVmWZW ddlXmYZY ddlZm[Z[m\Z\ ddl]m^Z^ ddl_m`Z` ddlambZb dd lcmdZdmeZemfZf dd!lgmhZhmiZimjZjmkZkmlZlmmZmmnZn z
dd"l3moZpmqZr W n e(yO   dd#l3mpZpmrZr Y nw eb Zsed$ Zte'dur^e'ne&Zud%e	fd&d'Zvd(ewd)ed*ewfd+d,Zxddd-ddd.d/ewd0eew d1eey d2ezd3eeezewf  d4eeet  d*eLfd5d6Z{i Z|e
e}eeeNeOf  f e~d7< d/ewd0ewd8ewd9ezd*eeeeNeOf   f
d:d;Z	dLd<d-dddd=d/ewd8eew d9ezd4ezd0eew d>eew d3eeezewf  d*eeeNeOf  fd?d@Zd-ddddAd/ewdBeeew ewf d4ezd0eew d>eew d3eeezewf  d*eeeNeOf  fdCdDZdadMdFdGZG dHdI dIZejdJdK ZdS )Nu  ModelScope dataset loading orchestration.

This module provides :class:`DatasetsWrapperHF` and the
:func:`load_dataset_with_ctx` context manager that monkey-patch the
HuggingFace ``datasets`` library to work with the ModelScope Hub.

Sub-modules:
    _compat           – backward-compat shims for datasets>=4.0 script loading
    _module_factories – dataset module factory functions & data-file resolution
    N)fields)Path)
AnyDictIterableListLiteralMappingOptionalSequenceTupleUnion)	urlencode)DatasetDatasetBuilderDatasetDictDownloadConfigDownloadManagerDownloadModeFeaturesIterableDatasetIterableDatasetDictSplitVerificationModeVersionconfig
data_files	LargeListr   )r   )features)_FEATURE_TYPES)DataFilesDictEmptyDatasetError)DataFilesNotFoundErrorDatasetNotFoundError)CachedDatasetModuleFactoryDatasetModule(HubDatasetModuleFactoryWithParquetExportPackagedDatasetModuleFactoryget_dataset_builder_class)_EXTENSION_TO_MODULE_PACKAGED_DATASETS_MODULES)
file_utils)!_raise_if_offline_mode_is_enabledcached_pathrelative_to_absolute_path)is_small_dataset)tracked_str)
hf_hub_url)OfflineModeIsEnabled)DatasetInfo)HfApiRepoFile
RepoFolder)HfFileSystem)HubApi)get_endpoint)get_from_cache_ms)MS_DATASETS_CACHE)DEFAULT_DATASET_REVISIONREPO_TYPE_DATASET)has_attr_in_class)is_relative_path)
get_logger)_HAS_SCRIPT_LOADING!HubDatasetModuleFactoryWithScript#LocalDatasetModuleFactoryWithScript)_resolve_pattern_download_repo_fileget_module_without_scriptget_module_with_script_compat_local_script_module_compat_hub_script_module_get_hub_api)HubDatasetModuleFactoryLocalDatasetModuleFactory)$HubDatasetModuleFactoryWithoutScript&LocalDatasetModuleFactoryWithoutScript)authorcardDataZcitation	createdAtdisableddescription	downloadsZdownloadsAllTimegatedlastModifiedlikesZpaperswithcode_idprivatesiblingsshatagsobjc                    sN  t | trdd | D S d| vst | d tr dd |  D S t| } | d}|dkr@| d}| dd	}tt||d
S t|dpLt	 |d}|du r_t
d| dtt  |tkrr| d}tt|fi | S tdur|tu st|tr| d}tt|fi | S dd t|D  |di  fdd|  D S )a&  Regenerate the nested feature object from a deserialized dict.

    This is a ModelScope-patched version of ``features.generate_from_dict``
    that handles backward compatibility for legacy ``Sequence`` types in
    datasets 4.0+ where ``Sequence`` is no longer a registered feature type.
    c                 S      g | ]}t |qS  generate_from_dict_ms).0valuer^   r^   m/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/msdatasets/utils/hf_datasets_util.py
<listcomp>       z)generate_from_dict_ms.<locals>.<listcomp>_typec                 S   s   i | ]	\}}|t |qS r^   r_   )ra   keyrb   r^   r^   rc   
<dictcomp>       z)generate_from_dict_ms.<locals>.<dictcomp>r   featurelength)rj   rk   NzFeature type 'z&' not found. Available feature types: c                 S   s   h | ]}|j qS r^   )name)ra   fr^   r^   rc   	<setcomp>       z(generate_from_dict_ms.<locals>.<setcomp>c                    s   i | ]\}}| v r||qS r^   r^   )ra   kvfield_namesr^   rc   rh      s    r^   )
isinstancelistdictitemspopget
SequenceHfr`   r   globals
ValueErrorkeysr   _NativeList
issubclassr   )r\   rf   rj   rk   Z
class_typer^   rs   rc   r`      s*   




 r`   url_or_filenamedownload_configreturnc                 C   s2  t |}|dru|tdd }dD ]}||r$|t|d } nqd|v rS|d}||d d }|d}|dkrD|}d}	n#|d| }||d d }	n|dd	}
t}t|
d	kre|
d	 nd}	td
||	d}| j| }nt	|rt}td
||d}| j| }t
||d}t|}|| |S )zModelScope replacement for ``DownloadManager._download``.

    Rewrites relative paths and ``hf://`` URLs to ModelScope API endpoints.
    zhf://N)z	datasets/zmodels/@   /rl       ZSDK)ZSourceZRevisionZFilePath)r   )str
startswithlenindexfindsplitr<   r   Z
_base_pathr?   r-   r0   Z
set_origin)selfr   r   Zhf_path_prefixZat_idxZafter_atZ	slash_idxrevision	file_pathparts
params_stroutr^   r^   rc   _download_ms   s<   





r   F)r   timeoutfiles_metadatatokenexpandrepo_idr   r   r   r   r   c                C   s   | j |d|d||td}|d|r|dd nd|dddddg g dd}g }	|D ]}
t|
tr<|	t|
j|
j|
j	d q(|	|d< t
d	i |S )
z2ModelScope replacement for ``HfApi.dataset_info``.r   F)r   path_in_repor   	recursiver   r   	repo_typer   N)idrX   rO   rZ   rV   rU   rR   rT   rW   r[   rP   rQ   )	rfilenameZblobIdsizerY   r^   )list_repo_treer=   r   ru   r5   appendrw   r   Zblob_idr   HfDatasetInfo)r   r   r   r   r   r   r   repo_info_iter	data_infoZdata_siblingsZ	info_itemr^   r^   rc   _dataset_info   sF   
r   _repo_tree_cacher   r   c                 C   s   | |ddf}t |}|du rdS |r|dkr|dnd}g }|D ]5}|j}	|rI|	|d s5|	|kr5q"|	|d rF|	t|d d nd}
n|	}
|sRd|
v rRq"|| q"|S )z;Try to derive results from a cached recursive root listing.r   TNr   r   )r   rz   strippathr   r   r   )r   r   r   r   Zroot_keyZroot_cachedprefixresultsitemZ	item_pathrel_pathr^   r^   rc   _derive_from_recursive_cache
  s"   
(r   T)r   r   r   r   r   r   c             
   c   s   |pt }|pd}||||f}	t|	}
|
dur|
E dH  dS t||||}|dur5|t|	< |E dH  dS t }|j|td}|d\}}|j|||d\}}g }d}d}d}||krz|j	||||||||d}W n t
y } ztd	| d
|  W Y d}~nMd}~ww |snE|D ]3}|d dkrdnd|d |d |d d}|d dkrtdi |ntdi |}|| |V  qt||k rn|d7 }||ks\|t|	< dS )z4ModelScope replacement for ``HfApi.list_repo_tree``.r   Nr   r   dataset_name	namespaceendpointr   i  i'  )r   r   	root_pathr   page_number	page_sizer   dataset_hub_idzGet dataset: z file list failed, message: Typetree	directoryfiler   SizeZSha256)typer   r   oidr   r^   )r<   r   rz   r   rJ   get_endpoint_for_readr=   r   Zget_dataset_id_and_typeZget_dataset_files	Exceptionloggererrorr5   r6   r   r   )r   r   r   r   r   r   r   r   Znormalized_path	cache_keycachedZderivedapir   _owner_dataset_namer   _r   r   r   Z	max_pagesZdataset_fileseZfile_info_dZ	path_infor   r^   r^   rc   _list_repo_tree&  st   





(
"r   )r   r   r   r   pathsc                   s   |pt }t|tr|g}t| t D ],\}}|d |ks$|d |kr%q fdd|D }	|	r4|	  S |||ddfkr@g   S q| j|d||||d}
 fd	d|
D S )
z4ModelScope replacement for ``HfApi.get_paths_info``.r   r   c                       g | ]	}|j  v r|qS r^   r   ra   r   Z	paths_setr^   rc   rd     ri   z#_get_paths_info.<locals>.<listcomp>r   TF)r   r   r   r   r   r   c                    r   r^   r   r   r   r^   rc   rd     ri   )r<   ru   r   setr   rx   r   )r   r   r   r   r   r   r   r   Zcached_itemsZmatchedr   r^   r   rc   _get_paths_infor  s,   
	r   rbc                 K   s8  |dkrd|vrz|  |}| }| |}d}|| jv r;| j| D ]}|d |kr:|ddkr:|dd} nq#|dkrt|j|j|j|j	| j
d	}	| j }
tj|	|
d
dd}|jdkr|jd}|rt|}||d< | j|g D ]}|d |kr||d<  nqwW n	 ty   Y nw t| |fd|i|S )as  Wrapper for HfFileSystem._open that fixes size=0 from ModelScope API.

    The ModelScope tree API may report Size=0 for files. When HfFileSystem
    caches this, AbstractBufferedFile treats the file as empty (0 bytes).
    This wrapper detects size=0 for files opened in read mode and resolves
    the actual size via a HEAD request before creating the file object.
    r   r   Nrm   r   r   rl   r   )r   r   filenamer   r   T   )headersallow_redirectsr      zContent-Lengthmode)Zresolve_pathZ	unresolveZ_parentZdircacherz   r1   r   r   r   r   r   Z_apiZ_build_hf_headersrequestsheadstatus_coder   intr   _hf_fs_open_original)r   r   r   kwargsresolvedZresolved_nameparentZcached_sizeentryurlr   respclZactual_sizer^   r^   rc   _hf_fs_open  sJ   




r   c                /   @   st  e Zd Ze																				d dedee dee deeeee eeeeee f f f  deeee	f  d	ee d
ee
 dee deeeef  deeeef  dee dedeeeef  deeeef  dedee dee dedee deeeeeef f(ddZe													d!dedee dee deeeee eeeeee f f f  d	ee d
ee
 dee deeeef  deeeef  deeeef  dee dee defddZe										d"dedeeeef  dee deeeef  dee dee deeeeeef  d	ee dee defddZdS )#DatasetsWrapperHFNF
deprecatedr   rm   data_dirr   r   	cache_dirr   r   download_modeverification_modekeep_in_memory
save_infosr   r   	streamingnum_procstorage_optionstrust_remote_codedataset_info_onlyr   c           #      K   s  |dkrt dt |}|dkrt dt nd }|d ur'|s'td| dt| tj r3td|r=|d ur=tdt	|pBt	j
}t|sL|	pKtjntj}	|r[td|  d	 tjd| ||||||||||||d u d
|}|ri }t| tr| drtj| rddlm} || }dd |D }|S |d u st|dstd|  d |S |j}| D ]%\}}t|}t|dr|jd urdd t|j D ||< qg ||< q|S |r|j |dS |j!|||	||d |
d ur|
nt"|j#j$}
|j%||	|
d}|d ur(t &  t 'dt |(|}W d    n	1 s#w   Y  |r/|)  z/t* }t+| rY| ,ddkr\| -d\}} |j.| t/d}!|j0| ||!d W |S W |S W |S  t1yz }" ztd|"  W Y d }"~"|S d }"~"ww ) Nr   'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.
You can remove this warning by passing 'token=<use_auth_token>' instead.zF'task' was deprecated in version 2.13.0 and will be removed in 3.0.0.
zEmpty 'data_files': 'z3'. It should be either non-empty or None (default).zjYou are trying to load a dataset that was saved using `save_to_disk`. Please use `load_from_disk` instead.zLoading a streaming dataset in parallel with `num_proc` is not implemented. To parallelize streaming, you can wrap the dataset with a PyTorch DataLoader using `num_workers` > 1 instead.z3Use trust_remote_code=True. Will invoke codes from z9. Please make sure that you can trust the external codes.)r   rm   r   r   r   r   r   r   r   r   r   r   _require_default_config_name.pyr   )get_dataset_config_namesc                 S   s   i | ]}|g qS r^   r^   )ra   Z_subsetr^   r^   rc   rh   +  rp   z2DatasetsWrapperHF.load_dataset.<locals>.<dictcomp>builder_configszNo builder_configs found for z	 dataset.r   c                 S   r]   r^   )r   r   r^   r^   rc   rd   7  re   z2DatasetsWrapperHF.load_dataset.<locals>.<listcomp>)r   )r   r   r   r   r   )r   r   Z	in_memoryignorer   r   r   r   z&Could not record download statistics: r^   )2warningswarnFutureWarningr}   r   r   ZDATASET_STATE_JSON_FILENAMEexistsNotImplementedErrorr   REUSE_DATASET_IF_EXISTSr   ZBASIC_CHECKSZ
ALL_CHECKSr   warningr   load_dataset_builderru   r   endswithosr   datasetsr   hasattrr   r   rx   r   rv   r~   Zas_streaming_datasetZdownload_and_preparer/   infoZdataset_sizeZ
as_datasetcatch_warningssimplefilterZprepare_for_taskZ_save_infosrJ   r?   countr   r   r=   Zdataset_download_statisticsr   )#r   rm   r   r   r   r   r   r   r   r   r   r   r   r   use_auth_tokentaskr   r   r   r   r   config_kwargsbuilder_instanceZret_dictr   Zsubset_listZ_tmp_builder_configsZtmp_config_nameZtmp_builder_configZdsr   Z
_namespacer   r   r   r^   r^   rc   load_dataset  s   
 
	

zDatasetsWrapperHF.load_datasetTc                    s  |
dkrt dt |
}	t|ptj}|	d ur#|r| nt }|	|_|d ur6|r-| nt }|j	| t
j ||||||||t||d}|j}|d|}|d|}|d|p_|jj}|dd }|jrp|j|nd } tv r|d u r|jjd jd u rd	  d
} fddtD }|r|d|d  d7 }t|t||d}|d||||||j|||	|d
||}|| |S )Nr   r   )
r   r   r   r   r   r   r   r   _require_custom_configsrm   r   r   config_namer   r   z@Please specify the data files or data directory to load for the z dataset builder.c                    s   g | ]
}t |  kr|qS r^   )r)   )ra   	extensionr   r^   rc   rd     s
    z:DatasetsWrapperHF.load_dataset_builder.<locals>.<listcomp>z9
For example `data_files={"train": "path/to/data/train/*.z"}`)r   )
r   r   r  r   r   hashr  r   r   r   r^   )r   r   r   r   r   copyr   r   r   updater   dataset_module_factoryboolbuilder_kwargsry   Zbuilder_configs_parametersZdefault_config_nameZdataset_infosrz   r*   r   r   r)   r}   r(   r  Z!_use_legacy_cache_dir_if_possible)r   rm   r   r   r   r   r   r   r   r   r
  r   r   r   r  Zdataset_moduler  r  r   r  	error_msgZexample_extensionsZbuilder_clsr  r^   r   rc   r  a  s   



z&DatasetsWrapperHF.load_dataset_builderdynamic_modules_pathc                 K   s  | dd }|p	t}|d u rtd.i |}|jd|i |jd|i |r/|jd u r/t|_t|p4tj}d|_	d|_
|tjk|_ttdd | tjddd }|ds^|d }tj| |}| tv rtt| ||||d	 S | |rtj| rtrt| |||d
 S t| |||d
S tdt|  tj|rtrt||||d
 S t||||d
S tj| rt | |||d S t!| ro| "ddkroz!t#  zt$ j%| ||j&dd}W ng t'yL } zZt(|t)t*j+j,t*j+j-fr	t-d|  dt.|j/ ddt0|v r&d|  d}t1|r#|d| d |dt0|v rFd|  d}|r>|d| d n|}t1|d |d }~ww t2| d||d}tj3tj4|}|dd |j5D v rd}t6j7r|rzt8t9drt9| ||d W W S t9| ||d  W W S  t'y } zt:;| W Y d }~nd }~ww trt8t<drt<| |||||d! W S t<| |||||d" W S t=| |||||d!W S t8t>drt>| |||||d# W S t>| |||||d$ W S  t'yn } zlt:;d%|  d&|  zd'|i}tr||d(< t?| fi | W W  Y d }~S  t'yi   t(|t)r>t-d)|  d*| d t(|t@t1tAfrJ|d t(|trftdt| d+|  d,t.|j/ d&| d |d w d }~ww tdt| d-)/Nrm   r   Tc                 S   s   | S Nr^   )xr^   r^   rc   <lambda>  s    z:DatasetsWrapperHF.dataset_module_factory.<locals>.<lambda>r   rl   r   )r   r   r   r   )r   r  r   z"Couldn't find a dataset script at )r   r   r   r   g      Y@)r   r   r   r   zCouldn't reach 'z' on the Hub ()Z404z	Dataset 'z' doesn't exist on the Hubz at revision ''Z401zT. If the repo is private or gated, make sure to log in with `huggingface-cli login`.z	README.md)r   r   r   r   c                 S   s   g | ]}|j qS r^   )r   )ra   Zsiblingr^   r^   rc   rd   H  s    z<DatasetsWrapperHF.dataset_module_factory.<locals>.<listcomp>F)r   r   )commit_hashr   )r   r   r   r  r   )r  r   r   r  r   )r   r   r   r   r   )r  r   r   r   r   z>> Error loading z: r   r  z1Couldn't reach the Hugging Face Hub for dataset 'z': z8 or any data file in the same directory. Couldn't find 'z"' on the Hugging Face Hub either: z( or any data file in the same directory.r^   )Bry   r<   r   r   r  r   r;   r   r   Zextract_compressed_fileZforce_extractZFORCE_REDOWNLOADZforce_downloadrv   filterreplacer  sepr   r  r   joinr*   r'   
get_moduleisfilerA   rC   rH   FileNotFoundErrorr.   isdirrN   r?   r	  r,   r4   dataset_infor   r   ru   r2   r   
exceptionsConnectTimeoutConnectionErrorr   __name__r   r#   rE   basenamedirnamerY   r   ZUSE_PARQUET_EXPORTr>   r&   r   r   rB   rI   rM   r$   r"   r!   )r   r   r   r   r  r   r   r   r   r   r  Zdownload_kwargsZsubset_namer   Zcombined_pathr(  r   msgZdataset_readme_pathr  Z#can_load_config_from_parquet_exporte1Z_cached_factory_kwargsr^   r^   rc   r    s  




				z(DatasetsWrapperHF.dataset_module_factory)NNNNNNNNNNFNNr   r   FNNFF)NNNNNNNNNr   NNT)
NNNNNNNNTF)r,  
__module____qualname__staticmethodr   r
   r   r   r	   r   r   r   r   r   r  r   r   r   r   r   r   r   rw   r  r   r  r   r    r%   r  r^   r^   r^   rc   r     s&   	

 	
^	r   c               	   o   s   t j}tj}ttdrtjntj}tj	}tj
}tj}tj}tj}	tr&tjnd}
tj}tj}t t _tt_ttdr>tt_ntt_tt_	tt_
tt_tt_tt_trUtt_tt_|a t!t_|"dd}zKt#j$| i |}|V  W t%&  t'j(&  |t_da |s|t _|t_|t_ttdr|t_n|t_|t_	|t_
|t_|t_|	t_tr|
t_dS dS dS t%&  t'j(&  |t_da |s|t _|t_|t_ttdr|t_n|t_|t_	|t_
|t_|t_|	t_tr|
t_w )a  Context manager that monkey-patches ``datasets`` to use ModelScope.

    All monkey-patches are applied on entry and restored on exit (for
    non-streaming mode) or kept alive (for streaming mode, where lazy
    iteration needs the patches to remain active).
    	_downloadNr   F))r   ZHF_ENDPOINTr+   Zget_from_cacher  r   r4  Z_download_singler4   r(  r   Zget_paths_infor   Zresolve_patternrM   r$  rA   rB   r   Zgenerate_from_dictr7   _openr9   r:   r   r   r   r   rD   rF   rG   r`   r   r   rz   r   r  r   clearr8   Z_dataset_id_type_cache)argsr   Zhf_endpoint_originZget_from_cache_originZ_download_originZdataset_info_originZlist_repo_tree_originZget_paths_info_originZresolve_pattern_originZ get_module_without_script_originZget_module_with_script_originZgenerate_from_dict_originZhf_fs_open_originr   Zdataset_resr^   r^   rc   load_dataset_with_ctx  s   





r8  r  )r   )__doc__
contextlibr  r   dataclassesr   pathlibr   typingr   r   r   r   r   r	   r
   r   r   r   urllib.parser   r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r{   ZDatasetListImportErrorZdatasets.featuresr   Zdatasets.features.featuresr   Zdatasets.data_filesr    r!   Zdatasets.exceptionsr"   r#   Zdatasets.loadr$   r%   r&   r'   r(   Zdatasets.packaged_modulesr)   r*   Zdatasets.utilsr+   Zdatasets.utils.file_utilsr,   r-   r.   Zdatasets.utils.info_utilsr/   Zdatasets.utils.trackr0   Zhuggingface_hubr1   Zhuggingface_hub.errorsr2   Zhuggingface_hub.hf_apir3   r   r4   r5   r6   Zhuggingface_hub.hf_file_systemr7   Z
modelscoper8   Zmodelscope.hub.utils.utilsr9   Z)modelscope.msdatasets.utils.hf_file_utilsr:   Zmodelscope.utils.config_dsr;   Zmodelscope.utils.constantr<   r=   Zmodelscope.utils.import_utilsr>   Zmodelscope.utils.file_utilsr?   Zmodelscope.utils.loggerr@   Z#modelscope.msdatasets.utils._compatrA   rB   rC   Z-modelscope.msdatasets.utils._module_factoriesrD   rE   rF   rG   rH   rI   rJ   rK   rM   rL   rN   r   ZExpandDatasetProperty_Tr   r`   r   r   floatr  r   r   tuple__annotations__r   r   r   r   r   r   contextmanagerr8  r^   r^   r^   rc   <module>   s  
0H$'/
	
 5
	

Q	
+
/   a