o
    1je                     @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* dd	l+m,Z, d
eddfddZ-i deg fdeg fdeg fdeg fdeefdeefdeefdeefdeefdeefde!e fde!e fdeefdeg fdeg fdeg fdeg feg fe&e%feg feg feefd Z.d!Z/g d"Z0d#d$ Z1d%d& Z2d'd( Z3G d)d dZ4dS )*a  Weight converter: pdparams -> safetensors.

Conversion flow:
  1. paddle.load() -> state dict with OLD PaddleOCR/PaddleDetection key names
  2. Rename BatchNorm keys: _mean -> running_mean, _variance -> running_var
  3. Apply per-architecture regex key mappings (old keys -> HF keys)
  4. Transpose linear weight keys (Paddle [in, out] -> HF [out, in])
  5. Save as safetensors via safetensors.numpy.save_file()
    N)Path   )logging)AttrDict   )!CHART2TABLE_ADDED_TOKENSCHART2TABLE_GENERATION_CONFIGCHART2TABLE_SPECIAL_TOKENS_MAPCHART2TABLE_TOKENIZER_CONFIGMOBILE_DET_DROP_PREFIXESPP_CHART2TABLE_DROP_PREFIXESPP_CHART2TABLE_MAPPINGPP_DOCLAYOUTV2_DROP_PREFIXESPP_DOCLAYOUTV2_MAPPINGPP_FORMULANET_MAPPINGPPLCNET_MAPPINGPPOCRV5_MOBILE_DET_MAPPINGPPOCRV5_MOBILE_REC_MAPPINGPPOCRV5_SERVER_DET_MAPPINGPPOCRV5_SERVER_REC_MAPPINGPREPROCESSOR_CONFIGSREC_DROP_PREFIXESRTDETR_MAPPINGSERVER_DET_DROP_PREFIXESSERVER_REC_DROP_PREFIXESSLANET_DROP_PREFIXESSLANET_MAPPINGSLANEXT_DROP_PREFIXESSLANEXT_MAPPINGUNIMERNET_GENERATION_CONFIGUNIMERNET_PROCESSOR_CONFIGUNIMERNET_TOKENIZER_CONFIGUVDOC_DROP_PREFIXESUVDOC_MAPPINGapply_key_mappingbuild_inference_metaload_character_dictrename_bn_keys)MODEL_CONFIGSconfigreturnWeightConverterc                 C   s   t | S )z-Build a weight converter from PaddleX config.)r+   )r)    r,   f/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddlex/modules/base/weight_converter.pybuild_weight_converterE   s   r.   zPP-LCNet_x1_0_doc_orizPP-LCNet_x1_0_table_clszPP-LCNet_x0_25_textline_orizPP-LCNet_x1_0_textline_orizPP-OCRv5_mobile_detzPP-OCRv5_server_detPP-OCRv5_mobile_recPP-OCRv5_server_recZSLANetZSLANet_plusZSLANeXt_wiredZSLANeXt_wirelesszPP-DocLayoutV2zPP-DocLayoutV3zRT-DETR-L_wired_table_cell_detz!RT-DETR-L_wireless_table_cell_detzPP-DocLayout_plus-L)zPP-DocBlockLayoutZUVDocPP-FormulaNet-LPP-FormulaNet_plus-LzPP-Chart2Table)r1   r2   )AfcchannelwiseZout_projZq_projZk_projZv_projZo_projZ	gate_projZup_projZ	down_projZlm_headZlinear_1Zlinear_2Zlinear1Zlinear2zattn.qkvzmlp.lin1zmlp.lin2z	attn.projz	mixer.qkvz
mixer.projzself_attn.qkvzself_attn.projectionZ
mapper_crpZ
mapper_scaz.mapper.Z
txt_mapperZtxt_pooled_mapperZclip_img_mapperZ	kv_mapperZclip_mapperZmm_projector_varyZenc_to_dec_projZ
score_headZenc_score_headZdec_score_headZ	bbox_headZenc_bbox_headZdec_bbox_headZmask_query_headZ
enc_outputZquery_pos_headZdec_global_pointerZdec_order_headZattention_weightsZsampling_offsetsZ
value_projZoutput_projz	head.headZctc_headZconv_reduce_channelzstructure_attention_cell.scorezstructure_attention_cell.i2hzstructure_attention_cell.h2hzstructure_generator.0.zstructure_generator.1.Zspatial_projzattention.self.queryzattention.self.keyzattention.self.valuezintermediate.densez.output.denseZrelative_headZlabel_features_projectionZpos_projc                    s   t  fddtD S )z@Check if a 2D weight tensor should be transposed (linear layer).c                 3   s    | ]}| v V  qd S Nr,   ).0subkeyr,   r-   	<genexpr>   s    z$_should_transpose.<locals>.<genexpr>)any_TRANSPOSE_SUBSTRINGSr8   r,   r8   r-   _should_transpose   s   r=   c                 C   s  ddl }i }|  D ]\}}t|dr-ddl}|j|j|jfv r&||j}|	   }nt
||jr6|}n||}d|v rLd|v sGd|v rL|d}d|v sTd	|v rd
|v r| }|jd d }|d| ||dd< ||d|  ||dd< |d| d ||dd< n3d|v r|jd d }|d| ||d	d< ||d|  ||d	d< |d| d ||d	d< q
|jdkrd|vrt|r| }|||< q
|S )a\  Preprocess Paddle tensors for safetensors output.

    Converts to numpy, transposes linear weight tensors from Paddle [in, out]
    to HF [out, in] format, reshapes channelwise parameters, and splits
    fused in_proj weights into separate q/k/v.

    Applied on OLD key names before regex key mapping.
    Returns dict of {key: numpy_array}.
    r   Nnumpyr4   gammabeta)r   r   r   Zin_proj_weightZin_proj_biasweightr   zq_proj.weight   zk_proj.weightzv_proj.weightZbiaszq_proj.biaszk_proj.biaszv_proj.bias)r>   itemshasattrpaddledtypeZbfloat16Zfloat16ZastypeZfloat32cpu
isinstanceZndarrayarrayZreshape	transposeshapereplacendimr=   )
state_dictnpresultr9   ZtensorrF   Z	np_weightZ
split_sizer,   r,   r-   _preprocess_tensors   sT   








rR   c                 C   s   t | }| r|jdstd| t|S | rfg d}|D ]}|| }| r3t|  S q#t|	d}t
|dkrGt|d S t
|dkr_dd |D }td	| d
| dtd| td| )a  Resolve input_path to a concrete .pdparams file.

    Accepts a direct .pdparams file path or a directory containing one.
    Directory resolution checks: model_state.pdparams, inference.pdparams,
    best_model.pdparams, best_accuracy.pdparams, or the single .pdparams file.
    z	.pdparamsz.input_path file must end with .pdparams, got: )zmodel_state.pdparamszinference.pdparamszbest_model.pdparamszbest_accuracy.pdparamsz
*.pdparamsr   r   c                 S   s   g | ]}|j qS r,   )name)r6   fr,   r,   r-   
<listcomp>  s    z'_resolve_input_path.<locals>.<listcomp>z"Multiple .pdparams files found in z: z%. Please specify the exact file path.z'No .pdparams files found in directory: zinput_path does not exist: )r   is_filerS   endswith
ValueErrorstris_direxistslistgloblenFileNotFoundError)
input_pathp
candidatesrS   	candidateZpdparams_filesnamesr,   r,   r-   _resolve_input_path   s,   re   c                   @   s   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd ZdS )r+   z8Converts Paddle .pdparams weights to safetensors format.c                    s   |j j| _|j t tr jnd
 fdd	| _| d| _| d| _	| jd u r-t
d| j	d u r6t
d| jtvrOdtt }t
d| j d	| t| j | _|  | _d S )Nc                    s   t  | |S r5   )getattr)kdZconvert_configr,   r-   <lambda>'  s    z*WeightConverter.__init__.<locals>.<lambda>r`   
output_dirzdPdparams2safetensors.input_path is required. Specify a .pdparams file or a directory containing one.z,Pdparams2safetensors.output_dir is required.z, zModel 'zJ' is not supported for pdparams2safetensors conversion. Supported models: r5   )Globalmodel
model_nameZPdparams2safetensorsrI   dictget_getr`   rk   rX   _MODEL_REGISTRYjoinsortedkeysr   rZ   _input_is_dir_load_user_configs_user_configs)selfr)   	supportedr,   ri   r-   __init__   s.   




zWeightConverter.__init__c              	      s   | j std| j i S ddl t| j}i }ddd fddd fd	 fd
dffD ]@\}}|| }| r[t|dd}||||< W d   n1 sMw   Y  td|  q*t	| d| d| j d q*|S )z5Load user-provided config files from input directory.zDInput is a single pdparams file. Using official config files for %s.r   Nconfig.jsonc                 S   
   t | S r5   jsonloadrT   r,   r,   r-   rj   N     
 z4WeightConverter._load_user_configs.<locals>.<lambda>preprocessor_config.jsonc                 S   r}   r5   r~   r   r,   r,   r-   rj   O  r   inference.ymlc                    s
     | S r5   )Z	safe_loadr   yamlr,   r-   rj   P  r   utf-8encodingzLoaded user config:  not found in z. Using official default for .)
rv   r   inforn   r   r   r`   r[   openwarning)ry   Z	input_dirZuser_configsfnameloaderZfpathrT   r,   r   r-   rw   >  s2   


z"WeightConverter._load_user_configsc                 C   s   ddl m} t| j \}}| ||}tj| jdd | | | 	  | 
  |   | j|v r5|   | jtv r>|   td| j  dS )z/Execute the pdparams -> safetensors conversion.r   )PP_CHART2TABLE_MODELST)exist_okz&Conversion complete. Output saved to: N)Z"inference.models.doc_vlm.constantsr   rr   rn   _convert_weightsosmakedirsrk   _save_safetensors_save_model_config_save_preprocessor_config_save_inference_yml_save_llm_configPP_FORMULANET_MODELS_save_pp_formulanet_assetsr   r   )ry   r   key_mappingdrop_prefixesnumpy_sdr,   r,   r-   convert_  s   


zWeightConverter.convertc           	         s   ddl }t| j}td|  ||} r6 fdd|D }|D ]}||= q#|r6tdt| d t|}t|}|rFt	||}n
t
d| j d	 | | |S )
z@Load pdparams and convert to numpy state dict with HF key names.r   NzLoading weights from: c                    s&   g | ] t  fd dD r qS )c                 3   s    | ]}  |V  qd S r5   )
startswith)r6   ra   rg   r,   r-   r:     s    z>WeightConverter._convert_weights.<locals>.<listcomp>.<genexpr>)r;   )r6   r   r   r-   rU   ~  s
    z4WeightConverter._convert_weights.<locals>.<listcomp>zDropped z keys not needed for inferencezNo key mapping defined for z). Keys will be saved as-is from pdparams.)rF   re   r`   r   r   r   r^   r'   rR   r$   r   rn   _postprocess_weights)	ry   r   r   rF   resolved_pathrO   Zdroppedrg   r   r,   r   r-   r   u  s*   



z WeightConverter._convert_weightsc              	   C   s  ddl }t| ji }d}||v r^|ddkr^|di }t|d }|| jd }||k r^|j|| || jd f|| jd}|j|| |gdd	||< t	
d
| d| d| d d}	|	|v r|ddkr|dd}
||	 jd |
kr||	  ||	< t	
d|	 d |ddkrdd |D }d}|D ]}||vr|d||< |d7 }q|rt	
d| d dS dS dS )z:Apply model-specific post-processing to converted weights.r   Nz"model.denoising_class_embed.weightZ
model_typeZrt_detrid2labelr   )rG   )ZaxiszPadded z from z to z (added background class)zlm_head.weightZpp_chart2table
vocab_sizei4Q zReverted transpose on z (tied embedding, not linear)c                 S   s,   g | ]}| d r|ds|d dqS )z.running_meanzmodel.backbone.z.num_batches_tracked)rW   r   rM   )r6   rg   r,   r,   r-   rU     s    
z8WeightConverter._postprocess_weights.<locals>.<listcomp>zAdded z num_batches_tracked keys)r>   r(   rp   rn   r^   rL   ZzerosrG   Zconcatenater   r   rK   Zint64)ry   r   rP   r)   Z	embed_keyr   Zexpected_sizeZcurrent_sizepadZlm_head_keyr   Znbt_keysaddedrg   r,   r,   r-   r     sV   


z$WeightConverter._postprocess_weightsc                 C   s:   ddl m} tj| jd}||| td|  dS )z+Save numpy state dict as model.safetensors.r   )	save_filezmodel.safetensorszSaved model.safetensors to: N)Zsafetensors.numpyr   r   pathrs   rk   r   r   )ry   r   r   out_pathr,   r,   r-   r     s   
z!WeightConverter._save_safetensorsc                 C   s|   | j dt| ji }tj| jd}t|ddd}t	j
||ddd W d   n1 s/w   Y  td	|  dS )
u7   Save config.json — user-provided or official default.r|   wr   r   rC   Findentensure_asciiNzSaved config.json to: )rx   rp   r(   rn   r   r   rs   rk   r   r   dumpr   r   ry   datar   rT   r,   r,   r-   r     s   z"WeightConverter._save_model_configc                 C   s   | j tv rdS d| jv r| jd }ntt| j i }| j dv r+dgt  dg |d< tj	| j
d}t|ddd	}tj||d
dd W d   n1 sNw   Y  td|  dS )uD   Save preprocessor_config.json — user-provided or official default.Nr   r/   r0   blank Zcharacter_listr   r   r   rC   Fr   z#Saved preprocessor_config.json to: )rn   r   rx   ro   r   rp   r&   r   r   rs   rk   r   r   r   r   r   r   r,   r,   r-   r     s   


z)WeightConverter._save_preprocessor_configc                 C   s   ddl }d| jv r| jd }n8dd| jii}|t| j | jdv r-t |di d< n| jtv rGtt	}d	|d
< | 
 |d|di d< tj| jd}t|ddd}|j||ddd W d   n1 sjw   Y  td|  dS )u9   Save inference.yml — user-provided or official default.r   Nr   rl   rn   r   ZPostProcessZcharacter_dictZVariableDonutProcessorZprocessor_class)Zfast_tokenizer_fileZtokenizer_config_filer   r   r   FT)Zdefault_flow_styleZallow_unicodezSaved inference.yml to: )r   rx   rn   updater%   r&   
setdefaultr   ro   r!   _load_unimernet_fast_tokenizerr   r   rs   rk   r   r   r   r   )ry   r   r   Ztokenizer_configr   rT   r,   r,   r-   r     s4   



z#WeightConverter._save_inference_ymlc              	   C   s0  ddl }|  }tj| jd}||| td|  t	t
ttd}| D ]l\}}| jr`t| j| }| rMtt|dd}td|  n|}t| d	| j d
| j d n|}tj| j|}	t|	ddd}
tj||
ddd W d   n1 sw   Y  td| d|	  q)dS )zSave tokenizer and generation config for Chart2Table models.

        Outputs: qwen.tiktoken, added_tokens.json, generation_config.json,
        special_tokens_map.json, tokenizer_config.json.
        r   Nqwen.tiktokenzCopied qwen.tiktoken to: )zadded_tokens.jsongeneration_config.jsonzspecial_tokens_map.jsontokenizer_config.jsonr   r   zLoaded user tokenizer config: r   . Using default for r   r   rC   Fr   Saved  to: )shutil_resolve_tiktoken_sourcer   r   rs   rk   copy2r   r   r   r   r	   r
   rD   rv   r   r`   r[   r   r   r   r   rn   r   )ry   r   Ztiktoken_srcZtiktoken_dstZ_TOKENIZER_DEFAULTSr   default_datasrcr   r   rT   r,   r,   r-   r     s:   z WeightConverter._save_llm_configc                 C   s   | j rt| jd }| rt|S td| j d ddlm} t|d | j	 d d }| r8t|S t
d| d	)
z.Find qwen.tiktoken for Chart2Table conversion.r   zqwen.tiktoken not found in '. Falling back to official model cache.r   	CACHE_DIRofficial_models_safetensorszWqwen.tiktoken not found. For single-file input, ensure the official model is cached at ze (run inference once to download). For directory input, include qwen.tiktoken in the input directory.rv   r   r`   r[   rY   r   r   Zutils.cacher   rn   r_   ry   r   r   
cache_pathr,   r,   r-   r   <  s.   
z(WeightConverter._resolve_tiktoken_sourcec                 C   s   | j rt| jd }| rt|S td| j d ddlm} t|d | j	 d d }| r8t|S t
d| d	| j	 d
)u  Resolve filesystem path to tokenizer.json (PP-FormulaNet fast tokenizer).

        Resolution chain (mirrors :meth:`_resolve_tiktoken_source`):
            input dir → ~/.paddlex/official_models/{name}_safetensors/tokenizer.json
            → FileNotFoundError
        tokenizer.jsonztokenizer.json not found in r   r   r   r   r   zXtokenizer.json not found. For single-file input, ensure the official model is cached at z (run `create_model('zs', engine='paddle_dynamic')` once to download). For directory input, include tokenizer.json in the input directory.r   r   r,   r,   r-   #_resolve_unimernet_tokenizer_sourceY  s2   
z3WeightConverter._resolve_unimernet_tokenizer_sourcec                 C   sR   |   }td|  t|dd}t|W  d   S 1 s"w   Y  dS )zKLoad tokenizer.json content as a parsed dict (for inference.yml embedding).zLoaded tokenizer.json: r   r   N)r   r   r   r   r   r   )ry   r   rT   r,   r,   r-   r   |  s
   $z.WeightConverter._load_unimernet_fast_tokenizerc              	   C   sR  ddl }|  }tj| jd}||| td|  t	t
td}| D ]~\}}| jrqt| j| }| r^t|dd}t|}	W d   n1 sPw   Y  td|  n|}	t| d	| j d
| j d n|}	tj| j|}
t|
ddd}tj|	|ddd W d   n1 sw   Y  td| d|
  q(dS )u  Save HF-style assets for PP-FormulaNet (transformers-engine compatible).

        Outputs ``processor_config.json``, ``generation_config.json``,
        ``tokenizer_config.json`` (all hardcoded), and copies ``tokenizer.json``
        from the input dir or official_models cache. The tokenizer JSON is also
        already embedded in inference.yml (read by UniMERNetDecode) — saving the
        standalone file lets HF AutoTokenizer load the converted directory.
        r   Nr   zCopied tokenizer.json to: )zprocessor_config.jsonr   r   r   r   zLoaded user asset: r   r   r   r   rC   Fr   r   r   )r   r   r   r   rs   rk   r   r   r   r    r   r!   rD   rv   r   r`   r[   r   r   r   r   rn   r   )ry   r   Ztokenizer_srcZtokenizer_dstZ_ASSET_DEFAULTSr   r   r   rT   r   r   r,   r,   r-   r     s<   	z*WeightConverter._save_pp_formulanet_assetsN)__name__
__module____qualname____doc__r{   rw   r   r   r   r   r   r   r   r   r   r   r   r   r,   r,   r,   r-   r+     s     !4%)#)5r   r   r   pathlibr   utilsr   Zutils.configr   Zutils.pdparams2safetensorsr   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   Z'utils.pdparams2safetensors.model_configr(   r.   rr   r   r<   r=   rR   re   r+   r,   r,   r,   r-   <module>   sp   
#	
G>*