o
    *jV                     @   sN  d dl Zd dlZd dlZd dlmZmZ d dlZd dlZ	d dl
Zd dlmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- e- Z.dZ/dZ0G dd de j1Z2G dd de2Z3G dd de j1Z4dS )    N)DictUnion)ArrowBasedBuilderDatasetDatasetDictGeneratorBasedBuilderIterableDatasetIterableDatasetDict)is_remote_filesystem)DatasetInfo)camelcase_to_snakecase)csv)
map_nested)FileLock)HubApi)DatasetContextConfig)ExternalDatasetNativeIterableDataset)DataStreamingDownloadManager)get_subdir_hash_from_split)DEFAULT_DATASET_NAMESPACEDatasetPathNameDownloadMode)
get_logger	delimiter,c                       sz   e Zd Zdef fddZefddZddefdefdd	Zd
d Z	dd Z
dd Zdd Zdd ZdefddZ  ZS )CsvDatasetBuilderdataset_context_configc                    sN  |j | _ |j| _|j| _|j| _|j| _|j| _|jj| _|jj| _|j	| _
ti | _tj| j| j| j | jtj| _t| _t| j
v rI| j
t | _| j
dd | _| j
dd | _| jpct|jj }t|| jd}ddlm}m   fdd| j  D }|!|}t" j#d	| j| j||d| j
 | j | j$_%t&| j | _'ti | _(d S )
Nengine	chunksizesplitversionr   )DataFilesDictDataFilesListc                    s    i | ]\}}| |gd dqS )N)Zorigin_metadata .0kvr$   r%   o/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/msdatasets/download/dataset_builder.py
<dictcomp>I   s    z.CsvDatasetBuilder.__init__.<locals>.<dictcomp>)	cache_dirconfig_namehash
data_filesr%   ))dataset_namecache_root_dir	namespacer"   subset_namer!   data_meta_configmeta_data_fileszip_data_filesconfig_kwargsinput_config_kwargsdictsplit_path_dictospathjoinr   	META_NAMEcache_build_dirDEFAULT_CSV_DELIMITERcsv_delimiterDELIMITER_NAMEpop
csv_enginecsv_chunksizelisttarget_dataset_structurekeysr   Zdatasets.data_filesr#   r$   itemsZfrom_local_or_remotesuper__init__infobuilder_namer   namelocal_meta_csv_paths)selfr   r!   sub_dir_hashr#   r0   	__class__r*   r+   rL   *   sT   







zCsvDatasetBuilder.__init__c                 C   s    t j| j| jdd|d}|S )NFT)with_version	with_hashr3   )r<   r=   r>   _cache_dir_root_relative_data_dir)rQ   r3   builder_data_dirr%   r%   r+   _build_cache_dir[   s   z"CsvDatasetBuilder._build_cache_dirTreturnc                 C   s   |du r| j jn| d| j j }| j}| j}|r!tj|| j}|r.tj|t| jj	}|r>|r>t
|tr>tj||}|S )zRelative path of this dataset in cache_dir:
        Will be:
            self.name/self.config.version/self.hash/
        or if a namespace has been specified:
            self.namespace___self.name/self.config.version/self.hash/
        NZ___)rM   rN   configr/   r<   r=   r>   Z	config_idstrr"   
isinstance)rQ   rU   rV   r3   rY   Zbuilder_configr/   r%   r%   r+   rX   c   s   "

z$CsvDatasetBuilder._relative_data_dirc              	   C   sz   | j jstd|| j j}|| j}g }| D ]\}}t|tr'|g}|t	j
|||||dd q|S )Nz7At least one data file must be specified, but got none.)filesbase_dirrO   
gen_kwargs)r\   r0   
ValueErrordownload_and_extractr7   rJ   r^   r]   appenddatasetsSplitGeneratorZ
iter_filesget)rQ   
dl_managerr0   r7   splits
split_namer_   r%   r%   r+   _split_generatorsy   s&   
z#CsvDatasetBuilder._split_generatorsc                 #   s   | j dkrzttj W n ty   td Y nw | jjd ur+t	| jjj
nd }|r;dd t|j|jD nd }t|D ]\}}td|| j| jpNdd}| j d ur[| j |d< tj|fi |}g }	|jjD ]}
|
d	rv|	|
 qjz.t|D ]'\}}|	D ]}
 r||
  fd
d||
< qtjj||d}||f|fV  q|W qA ty } ztd| dt
| d|   d }~ww d S )Npythonic                 S   s   i | ]	\}}||  qS r%   )Zto_pandas_dtype)r'   rO   dtyper%   r%   r+   r,      s    z6CsvDatasetBuilder._generate_tables.<locals>.<dictcomp>Ti'  )iteratorrn   r   r   r   :FILEc                       t j | S Nr<   r=   r>   xr`   r%   r+   <lambda>       z4CsvDatasetBuilder._generate_tables.<locals>.<lambda>)schemazFailed to read file 'z' with error z: )rE   
csv_modulefield_size_limitsysmaxsizeOverflowErrorr\   featurespary   typezipnamestypes	enumerater:   rB   rF   pdread_csvZ_engineendswithre   applyTablefrom_pandasrc   loggererror)rQ   r_   r`   ry   rn   Zfile_idxfile	pd_kwargsZcsv_file_readertransform_fields
field_nameZ	batch_idxdfpa_tableer%   rv   r+   _generate_tables   sf   





z"CsvDatasetBuilder._generate_tablesc                 K   s*  |j j}|j j}|stj}|j j}|stj}| j}|stj}g }|tj ||j j	 || || || tj
|}	tj|tj|	d }
t|
8 tj|}|ro|tjjkrotd| j d| d td| j d| d | j||d W d    d S 1 sw   Y  d S )N.lockReusing dataset  ()Generating dataset )ri   download_mode)download_configr-   r!   r   ZLOCK_FILE_NAME_ANYr"   r4   re   ZDATA_FILES_NAMEr1   ZLOCK_FILE_NAME_DELIMITERr>   r<   r=   stripr   existsr   REUSE_DATASET_IF_EXISTSvaluer   warningrO   rM   _download_and_prepare)rQ   r   ri   download_kwargstarget_cache_dirrk   Zversion_namer4   Zlock_file_namesZlock_file_name	lock_pathdata_existsr%   r%   r+   download_and_prepare   sD   




"z&CsvDatasetBuilder.download_and_preparec                    sd   dd l }|jj |tjjkr|j dd tj dd  fdd| j	
 D | _|| j| _d S )Nr   T)ignore_errors)exist_okc                    s   i | ]\}}|t | qS r%   )r   fetch_meta_files_from_urlr&   r   r%   r+   r,          z;CsvDatasetBuilder._download_and_prepare.<locals>.<dictcomp>)shutilr   r-   r   ZFORCE_REDOWNLOADr   rmtreer<   makedirsr6   rJ   rP   rd   r7   r;   )rQ   ri   r   r   r%   r   r+   r      s   

z'CsvDatasetBuilder._download_and_preparec              
      s   t d| jd}| jd ur| j|d< tj|fi |}g }|j D ]}|dr-|| q!| j	
|d |D ]M}t trgt dkrg|jd t krbtd| d|jd  d	t  d
 q7 ||< q7t tr| r|||  fdd||< q7td|  q7tj|}t|dS )NFro   r   r   rp    r   z,Number of lines in meta-csv file for split 'z' (z&) does not match number of data-files(z)!c                    rq   rr   rs   rt   Zbase_extracted_dirr%   r+   rw     rx   z;CsvDatasetBuilder._convert_csv_to_dataset.<locals>.<lambda>zNothing to do for field )Zarrow_table)r:   rB   rE   r   r   columnstolistr   re   r;   rh   r^   rG   lenshaper   r   r]   r   r   r   r   r   r   )rQ   rk   Zcsv_file_pathr   r   r   r   Zpa_datar%   r   r+   _convert_csv_to_dataset   s>   







z)CsvDatasetBuilder._convert_csv_to_datasetc                    s   t  fdd j D S )Nc                    s   i | ]\}}|  ||qS r%   )r   r&   rQ   r%   r+   r,     r   z0CsvDatasetBuilder.as_dataset.<locals>.<dictcomp>)r   rP   rJ   r   r%   r   r+   
as_dataset  s   zCsvDatasetBuilder.as_dataset)__name__
__module____qualname__r   rL   r   rZ   r]   rX   rl   r   r   r   r   r   r   __classcell__r%   r%   rS   r+   r   (   s    1	
('!r   c                   @   s2   e Zd ZdefddZdd Zdd Zdd	 Zd
S )TaskSpecificDatasetBuilderr   c                 C   s   |j | _|j| _|j| _|j| _|j| _| jpt|jj	 }t
|| jd| _|jj| _|jj| _d | _d | _td|j i| _tj|j| _|  | _|jj| _d S )Nr    rN   )r1   rO   r4   r3   r!   r"   rG   r5   rH   rI   r   r/   r6   r0   r7   r;   r\   r   	from_dictrM   r<   r=   
expanduserr2   rW   rZ   
_cache_dirZmeta_args_map_config_kwargs)rQ   r   r!   r%   r%   r+   rL     s.   


z#TaskSpecificDatasetBuilder.__init__c                 K   s   t j| j| jt jdd }t|; t j| j}|r:|t	j
kr:td| j d| j d 	 W d    d S td| j d| j d W d    n1 sRw   Y  | j|d d S )N_r   r   r   r   r   )ri   )r<   r=   r>   rW   r   replacesepr   r   r   r   r   r   rO   rM   r   )rQ   r   ri   r   r   r   r%   r%   r+   r   -  s   
z/TaskSpecificDatasetBuilder.download_and_preparec                 C   s   | | j| _d S rr   )rd   r7   r;   )rQ   ri   r%   r%   r+   r   <  s   
z0TaskSpecificDatasetBuilder._download_and_preparec                 C   s   t | j| jS rr   )r   r;   r   r   r%   r%   r+   r   @  s   z%TaskSpecificDatasetBuilder.as_datasetN)r   r   r   r   rL   r   r   r   r%   r%   r%   r+   r     s
    r   c                       s   e Zd Zdef fddZededejfddZde	de
eeef ef fdd	Zde	fd
dZdefddZdd ZdeddfddZedededefddZ  ZS )IterableDatasetBuilderr   c                    s  |j | _ |j| _|j| _|j| _|j| _|j| _|jj| _|jj| _|j	| _
|j| _tj| j| j| j | jtj| _t| _t| j
v rH| j
t | _| jpRt|jj }t|| jd}t jd| j| j | j|d d| j
 | j | j_t| j | _d | _|jj | _ d S )Nr    )r-   r1   r.   r/   r0   r%   )!r1   r2   r3   r"   r4   r!   r5   r6   r7   r8   r9   stream_batch_sizer<   r=   r>   r   r?   r@   rA   rB   rC   rG   rH   rI   r   rK   rL   rM   rN   r   rO   meta_csv_dfmeta_cache_dir)rQ   r   r!   rR   rS   r%   r+   rL   F  sJ   




zIterableDatasetBuilder.__init__r[   c                 C   s   t | d}|S )N)r   )r   )r   Zbuilder_instancer%   r%   r+   get_builder_instancen  s   z+IterableDatasetBuilder.get_builder_instanceri   c                 C   s   t | ttfstd| j dt| j }|s$tdt| jj	 d| 
| dd | |D }|jj}|d u r>|}n||v rG|| }ntd| dt| t| j|d	d
}t |trdt|}|S )NzBuilder z is not streamable.z(Loading a streaming dataset cached in a z is not supported yet.c                 S   s   i | ]}|j |qS r%   )rO   )r'   Zsgr%   r%   r+   r,     s    z?IterableDatasetBuilder.as_streaming_dataset.<locals>.<dictcomp>zBad split: z. Available splits: T)Z	map_tuple)r^   r   r   rc   rO   r
   Z_fsNotImplementedErrorr   r   Z_check_manual_downloadrl   r   r!   rG   r   _as_streaming_dataset_singler:   r	   )rQ   ri   is_localZsplits_generatorsr!   splits_generatorZstreaming_datasetsr%   r%   r+   as_streaming_datasetu  s6   


z+IterableDatasetBuilder.as_streaming_datasetc              	   C   s*  g }d}d}| j rtt| j  }| jrtt| j }|r<|s<| j  D ]\}}|tj||g |dd q'|S |rh|rh| j D ] \}}t	|t
rQ|g}| j |}|tj||||dd qE|S |s|r| j D ]\}}t	|t
r}|g}|tj|d||dd qq|S d| j d)Nr   )metar_   ri   ra   +Neither column meta nor data file found in z#.json, specify at least one column.)r6   nextitervaluesr7   rJ   re   rf   rg   r^   r]   rh   r1   )rQ   ri   rj   Zmeta_data_fileZzip_data_filerk   meta_file_urlr_   r%   r%   r+   rl     sb   '

z(IterableDatasetBuilder._split_generatorsc                 C   s    |  |}t|| j|j| jdS )N)rM   r!   r   )Z _get_examples_iterable_for_splitr   rM   rO   r   )rQ   r   Zex_iterabler%   r%   r+   r     s   
z3IterableDatasetBuilder._as_streaming_dataset_singlec                 k   s   | d}| d}| d}t }d}d}|r0ttt|}|dr0d}tj|d }|rG|sG| 	| t
j| j}	d|	fV  d S |ro|ro| 	| |ra|| j| j| j|}
|
|j_t
j| j}	d|	fV  d S |s|rt
jd	|i}	d|	fV  d S d
| j d)Nr   r_   ri   Fr   z.zipTr   z
Input:FILEr   z.json .)rh   r   r]   r   r   r   r<   r=   splitext_get_meta_csv_dfr   r   r   r   Z&get_dataset_access_config_for_unzippedr1   r3   r"   r   Z
oss_configZfrom_pydict)rQ   rb   r   r_   ri   Zhub_apiZis_zipZzip_file_namezip_filer   Zoss_config_for_unzippedr%   r%   r+   r     s<   





z'IterableDatasetBuilder._generate_tablesr   Nc                 C   s<   | j d u s	| j jrt|| j}tj|d| jd| _ d S d S )NFr   )r   emptyr   r   r   r   r   rB   )rQ   r   Zmeta_csv_file_pathr%   r%   r+   r     s   z'IterableDatasetBuilder._get_meta_csv_dfheaderstextsr   c                 C   sT   i }|  |} tdt| D ]}g }|D ]}|| ||  q||| | < q|S )Nr   )r!   ranger   re   )r   r   r   residxZcol_listliner%   r%   r+   trans_data_to_mapping  s   
z,IterableDatasetBuilder.trans_data_to_mapping)r   r   r   r   rL   staticmethodr   Csvr   r   r   r   r]   r   r   rl   r   r   r   r   rG   r   r   r%   r%   rS   r+   r   D  s*    (
(3
)	 r   )5r   rz   r<   r|   typingr   r   rf   Zpandasr   Zpyarrowr   r   r   r   r   r   r	   Zdatasets.filesystemsr
   Zdatasets.infor   Zdatasets.namingr   Zdatasets.packaged_modulesZdatasets.utils.py_utilsr   filelockr   Zmodelscope.hub.apir   Z4modelscope.msdatasets.context.dataset_context_configr   Z!modelscope.msdatasets.dataset_clsr   r   Z/modelscope.msdatasets.download.download_managerr   Z)modelscope.msdatasets.utils.dataset_utilsr   Zmodelscope.utils.constantr   r   r   Zmodelscope.utils.loggerr   r   rC   rA   r   r   r   r   r%   r%   r%   r+   <module>   s8     m0