# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Model config.json definitions for pdparams2safetensors conversion.

Each entry maps a model_name to the full config dict that will be
saved as config.json, matching the official HuggingFace repos exactly.
"""

# PP-LCNet configs
# Fields shared by the PP-LCNet classifier entries in MODEL_CONFIGS; each
# entry splats this dict and then adds (or overrides) its own keys.
_PPLCNET_BASE = dict(
    model_type="pp_lcnet",
    scale=1.0,
    reduction=4,
    hidden_dropout_prob=0.2,
    class_expand=1280,
    hidden_act="hardswish",
)

# Block table used by the text-line orientation entries. Each row has five
# slots; some rows carry a [2, 1] pair in the fourth slot where the rest use
# a plain 1. The fourth group is one leading row followed by five identical
# rows, so those are generated instead of spelled out.
_TEXTLINE_BLOCK_CONFIGS = [
    [[3, 16, 32, 1, False]],
    [[3, 32, 64, [2, 1], False], [3, 64, 64, 1, False]],
    [[3, 64, 128, [2, 1], False], [3, 128, 128, 1, False]],
    [[3, 128, 256, [2, 1], False]]
    + [[5, 256, 256, 1, False] for _ in range(5)],
    [[5, 256, 512, [2, 1], True], [5, 512, 512, 1, True]],
]

# PP-OCRv5 configs
# Mobile text detector: a pp_lcnet_v3 backbone at scale 0.75 exposing
# stage2..stage5, plus the neck/head fields listed after it.
_PPOCRV5_MOBILE_DET_CONFIG = dict(
    model_type="pp_ocrv5_mobile_det",
    reduction=4,
    layer_list_out_channels=[12, 18, 42, 360],
    backbone_config=dict(
        model_type="pp_lcnet_v3",
        scale=0.75,
        out_features=[f"stage{index}" for index in range(2, 6)],
        out_indices=list(range(2, 6)),
        divisor=16,
    ),
    neck_out_channels=96,
    kernel_list=[3, 2, 2],
    interpolate_mode="nearest",
)

# Server text detector: an hgnet_v2 "L" backbone exposing stage1..stage4,
# followed by the neck, intraclass-block and head fields. Values are kept as
# literals so they can be compared line by line with the published config.
_PPOCRV5_SERVER_DET_CONFIG = {
    "model_type": "pp_ocrv5_server_det",
    "mode": "large",
    "upsample_mode": "nearest",
    "upsample_align_mode": 1,
    "backbone_config": {
        "model_type": "hgnet_v2",
        "arch": "L",
        "return_idx": [0, 1, 2, 3],
        "freeze_stem_only": True,
        "freeze_at": 0,
        "freeze_norm": True,
        "lr_mult_list": [0, 0.05, 0.05, 0.05, 0.05],
        "out_features": ["stage1", "stage2", "stage3", "stage4"],
    },
    "use_lab": False,
    "use_last_conv": True,
    "class_expand": 2048,
    "class_num": 1000,
    "out_indices": [0, 1, 2, 3],
    "neck_out_channels": 256,
    "reduce_factor": 2,
    "intraclass_block_number": 4,
    # Every entry below is a triple. NOTE(review): the triples look like
    # (kernel, stride, padding), with padding == kernel // 2 in each
    # dimension; confirm against the modeling code before relying on that.
    "intraclass_block_config": {
        "reduce_channel": [1, 1, 0],
        "return_channel": [1, 1, 0],
        "vertical_long_to_small_conv_longratio": [[7, 1], [1, 1], [3, 0]],
        "vertical_long_to_small_conv_midratio": [[5, 1], [1, 1], [2, 0]],
        "vertical_long_to_small_conv_shortratio": [[3, 1], [1, 1], [1, 0]],
        "horizontal_small_to_long_conv_longratio": [[1, 7], [1, 1], [0, 3]],
        "horizontal_small_to_long_conv_midratio": [[1, 5], [1, 1], [0, 2]],
        "horizontal_small_to_long_conv_shortratio": [[1, 3], [1, 1], [0, 1]],
        "symmetric_conv_long_longratio": [[7, 7], [1, 1], [3, 3]],
        "symmetric_conv_long_midratio": [[5, 5], [1, 1], [2, 2]],
        "symmetric_conv_long_shortratio": [[3, 3], [1, 1], [1, 1]],
    },
    "head_in_channels": 1024,
    "scale_factor": 2,
    "scale_factor_list": [1, 2, 4, 8],
    "hidden_act": "relu",
    "kernel_list": [3, 2, 2],
}

# Head fields common to both PP-OCRv5 recognizers; each recognizer config
# below appends these after its own model_type and backbone_config.
_PPOCRV5_REC_BASE = dict(
    hidden_act="silu",
    hidden_size=120,
    mlp_ratio=2.0,
    depth=2,
    head_out_channels=18385,
    conv_kernel_size=[1, 3],
    qkv_bias=True,
    num_attention_heads=8,
    attention_dropout=0.0,
)

# Mobile recognizer: pp_lcnet_v3 backbone at scale 0.95 with an explicit
# block table. The fourth group is one leading row plus four identical rows.
_PPOCRV5_MOBILE_REC_CONFIG = dict(
    model_type="pp_ocrv5_mobile_rec",
    backbone_config=dict(
        model_type="pp_lcnet_v3",
        scale=0.95,
        out_features=[f"stage{index}" for index in range(2, 6)],
        out_indices=list(range(2, 6)),
        divisor=16,
        block_configs=[
            [[3, 16, 32, 1, False]],
            [[3, 32, 64, 1, False], [3, 64, 64, 1, False]],
            [[3, 64, 128, [2, 1], False], [3, 128, 128, 1, False]],
            [[3, 128, 256, [1, 2], False]]
            + [[5, 256, 256, 1, False] for _ in range(4)],
            [
                [5, 256, 512, [2, 1], True],
                [5, 512, 512, 1, True],
                [5, 512, 512, [2, 1], False],
                [5, 512, 512, 1, False],
            ],
        ],
    ),
    **_PPOCRV5_REC_BASE,
)

# Server recognizer: hgnet_v2 "L" backbone with per-stage downsample strides,
# followed by the same shared head fields.
_PPOCRV5_SERVER_REC_CONFIG = dict(
    model_type="pp_ocrv5_server_rec",
    backbone_config=dict(
        model_type="hgnet_v2",
        arch="L",
        return_idx=[0, 1, 2, 3],
        freeze_stem_only=True,
        freeze_at=0,
        freeze_norm=True,
        lr_mult_list=[1.0] * 5,
        out_features=[f"stage{index}" for index in range(1, 5)],
        stage_downsample=[True] * 4,
        stem_strides=[2, 1, 1, 1, 1],
        stage_downsample_strides=[[2, 1], [1, 2], [2, 1], [2, 1]],
    ),
    **_PPOCRV5_REC_BASE,
)

# RT-DETR / DocLayout configs
# hgnet_v2 "L" backbone settings embedded in every config produced by
# `_rtdetr_config`. This dict is a template: the builder hands out copies, so
# it is never shared with (or mutated through) a built config.
_RTDETR_BACKBONE_CONFIG = {
    "arch": "L",
    "depths": [3, 4, 6, 3],
    "embedding_size": 64,
    "hidden_act": "relu",
    "hidden_sizes": [256, 512, 1024, 2048],
    "initializer_range": 0.02,
    "model_type": "hgnet_v2",
    "num_channels": 3,
    "out_features": ["stage2", "stage3", "stage4"],
    "out_indices": [2, 3, 4],
    "stage_downsample": [False, True, True, True],
    "stage_downsample_strides": [2, 2, 2, 2],
    "stage_in_channels": [48, 128, 512, 1024],
    "stage_kernel_size": [3, 3, 5, 5],
    "stage_light_block": [False, False, True, True],
    "stage_mid_channels": [48, 96, 192, 384],
    "stage_names": ["stem", "stage1", "stage2", "stage3", "stage4"],
    "stage_num_blocks": [1, 1, 3, 1],
    "stage_numb_of_layers": [6, 6, 6, 6],
    "stage_out_channels": [128, 512, 1024, 2048],
    "stem_channels": [3, 32, 48],
    "stem_strides": [2, 1, 1, 2, 1],
    "use_learnable_affine_block": False,
    "return_idx": [1, 2, 3],
    "freeze_stem_only": True,
    "freeze_at": 0,
    "freeze_norm": False,
    "lr_mult_list": [0.05, 0.05, 0.1, 0.15, 0.2],
}


def _rtdetr_config(id2label, label2id):
    """Build a standard RT-DETR config with given labels.

    Args:
        id2label: Mapping from class-id string to label name.
        label2id: Mapping from label name to integer class id.

    Returns:
        A new config dict. The backbone config and both label mappings are
        deep-copied, so configs built from the same inputs never share
        mutable state: editing one built config cannot leak into another,
        into the caller's label dicts, or into ``_RTDETR_BACKBONE_CONFIG``.
    """
    # Local import: this module has no top-level import block.
    import copy

    return {
        "activation_dropout": 0.0,
        "activation_function": "silu",
        "anchor_image_size": None,
        "attention_dropout": 0.0,
        "auxiliary_loss": True,
        "backbone_config": copy.deepcopy(_RTDETR_BACKBONE_CONFIG),
        "batch_norm_eps": 1e-05,
        "box_noise_scale": 1.0,
        "d_model": 256,
        "decoder_activation_function": "relu",
        "decoder_attention_heads": 8,
        "decoder_ffn_dim": 1024,
        "decoder_in_channels": [256, 256, 256],
        "decoder_layers": 6,
        "decoder_n_points": 4,
        "disable_custom_kernels": True,
        "dropout": 0.0,
        "encode_proj_layers": [2],
        "encoder_activation_function": "gelu",
        "encoder_attention_heads": 8,
        "encoder_ffn_dim": 1024,
        "encoder_hidden_dim": 256,
        "encoder_in_channels": [512, 1024, 2048],
        "encoder_layers": 1,
        "eos_coefficient": 0.0001,
        "eval_size": None,
        "feat_strides": [8, 16, 32],
        "focal_loss_alpha": 0.75,
        "focal_loss_gamma": 2.0,
        "freeze_backbone_batch_norms": True,
        "hidden_expansion": 1.0,
        "id2label": copy.deepcopy(id2label),
        "initializer_bias_prior_prob": None,
        "initializer_range": 0.01,
        "is_encoder_decoder": True,
        "label2id": copy.deepcopy(label2id),
        "label_noise_ratio": 0.5,
        "layer_norm_eps": 1e-05,
        "learn_initial_query": False,
        "matcher_alpha": 0.25,
        "matcher_bbox_cost": 5.0,
        "matcher_class_cost": 2.0,
        "matcher_gamma": 2.0,
        "matcher_giou_cost": 2.0,
        "model_type": "rt_detr",
        "normalize_before": False,
        "num_denoising": 100,
        "num_feature_levels": 3,
        "num_queries": 300,
        "positional_encoding_temperature": 10000,
        "transformers_version": "5.3.0.dev0",
        "use_focal_loss": True,
        "weight_loss_bbox": 5.0,
        "weight_loss_giou": 2.0,
        "weight_loss_vfl": 1.0,
        "with_box_refine": True,
    }


# PP-DocLayoutV2 config. Most detector keys are the same ones produced by
# `_rtdetr_config`; on top of those it carries a nested reading-order config
# and two 25-entry per-class tables. Values are kept as literals so they can
# be compared line by line with the published config.
_DOCLAYOUTV2_CONFIG = {
    "activation_dropout": 0.0,
    "activation_function": "silu",
    "anchor_image_size": None,
    "architectures": ["PPDocLayoutV2ForObjectDetection"],
    "attention_dropout": 0.0,
    "backbone": None,
    "backbone_config": {
        "model_type": "hgnet_v2",
        "arch": "L",
        "return_idx": [1, 2, 3],
        "freeze_stem_only": True,
        "freeze_at": 0,
        "freeze_norm": True,
        "lr_mult_list": [0, 0.05, 0.05, 0.05, 0.05],
        "out_features": ["stage2", "stage3", "stage4"],
    },
    "backbone_kwargs": None,
    "batch_norm_eps": 1e-05,
    "box_noise_scale": 1.0,
    "d_model": 256,
    "decoder_activation_function": "relu",
    "decoder_attention_heads": 8,
    "decoder_ffn_dim": 1024,
    "decoder_in_channels": [256, 256, 256],
    "decoder_layers": 6,
    "decoder_n_points": 4,
    "disable_custom_kernels": True,
    "dropout": 0.0,
    "encode_proj_layers": [2],
    "encoder_activation_function": "gelu",
    "encoder_attention_heads": 8,
    "encoder_ffn_dim": 1024,
    "encoder_hidden_dim": 256,
    "encoder_in_channels": [512, 1024, 2048],
    "encoder_layers": 1,
    "eos_coefficient": 0.0001,
    "eval_size": None,
    "feat_strides": [8, 16, 32],
    "hidden_expansion": 1.0,
    # 25 class ids ("0".."24"). Several names occur more than once
    # (footer, header, formula, text), so the mapping is not one-to-one.
    "id2label": {
        "0": "abstract",
        "1": "algorithm",
        "2": "aside_text",
        "3": "chart",
        "4": "content",
        "5": "formula",
        "6": "doc_title",
        "7": "figure_title",
        "8": "footer",
        "9": "footer",
        "10": "footnote",
        "11": "formula_number",
        "12": "header",
        "13": "header",
        "14": "image",
        "15": "formula",
        "16": "number",
        "17": "paragraph_title",
        "18": "reference",
        "19": "reference_content",
        "20": "seal",
        "21": "table",
        "22": "text",
        "23": "text",
        "24": "vision_footnote",
    },
    "initializer_range": 0.01,
    "is_encoder_decoder": True,
    # Left empty. NOTE(review): presumably because id2label cannot be
    # inverted without losing the duplicated names; confirm against the
    # published config.
    "label2id": {},
    "label_noise_ratio": 0.5,
    "layer_norm_eps": 1e-05,
    "learn_initial_query": False,
    "matcher_alpha": 0.25,
    "matcher_bbox_cost": 5.0,
    "matcher_class_cost": 2.0,
    "matcher_gamma": 2.0,
    "matcher_giou_cost": 2.0,
    "model_type": "pp_doclayout_v2",
    "normalize_before": False,
    "num_denoising": 100,
    "num_feature_levels": 3,
    "num_queries": 300,
    "positional_encoding_temperature": 10000,
    "torch_dtype": "float32",
    "use_pretrained_backbone": False,
    "use_timm_backbone": False,
    # Nested config for the reading-order part of the model.
    # NOTE(review): num_classes is 20 here while id2label has 25 ids;
    # presumably intentional, verify against the modeling code.
    "reading_order_config": {
        "hidden_size": 512,
        "num_attention_heads": 8,
        "attention_probs_dropout_prob": 0.1,
        "has_relative_attention_bias": False,
        "has_spatial_attention_bias": True,
        "layer_norm_eps": 1e-05,
        "hidden_dropout_prob": 0.1,
        "intermediate_size": 2048,
        "hidden_act": "gelu",
        "num_hidden_layers": 6,
        "rel_pos_bins": 32,
        "max_rel_pos": 128,
        "rel_2d_pos_bins": 64,
        "max_rel_2d_pos": 256,
        "max_position_embeddings": 514,
        "max_2d_position_embeddings": 1024,
        "type_vocab_size": 1,
        "vocab_size": 4,
        "start_token_id": 0,
        "pad_token_id": 1,
        "end_token_id": 2,
        "pred_token_id": 3,
        "coordinate_size": 171,
        "shape_size": 170,
        "num_classes": 20,
        "relation_bias_embed_dim": 16,
        "relation_bias_theta": 10000,
        "relation_bias_scale": 100,
        "global_pointer_head_size": 64,
        "tril_mask": True,
    },
    # 25 values, the same count as id2label. Presumably one score
    # threshold per class id, in id order; TODO confirm.
    "class_thresholds": [
        0.5,
        0.5,
        0.5,
        0.5,
        0.5,
        0.4,
        0.4,
        0.5,
        0.5,
        0.5,
        0.5,
        0.5,
        0.5,
        0.5,
        0.5,
        0.4,
        0.5,
        0.4,
        0.5,
        0.5,
        0.45,
        0.5,
        0.4,
        0.4,
        0.5,
    ],
    # 25 values, the same count as id2label; values repeat and range over
    # 0..15. Presumably indexed by class id; TODO confirm the semantics.
    "class_order": [
        4,
        2,
        14,
        1,
        5,
        7,
        8,
        6,
        11,
        11,
        9,
        13,
        10,
        10,
        1,
        2,
        3,
        0,
        2,
        2,
        12,
        1,
        2,
        15,
        6,
    ],
}

# PP-DocLayoutV3 config. Compared with _DOCLAYOUTV2_CONFIG it exposes four
# backbone stages instead of three, has no reading-order or per-class
# tables, and ends with three extra fields (global_pointer_head_size,
# mask_feature_channels, x4_feat_dim). Values are kept as literals so they
# can be compared line by line with the published config.
_DOCLAYOUTV3_CONFIG = {
    "activation_dropout": 0.0,
    "activation_function": "silu",
    "anchor_image_size": None,
    "architectures": ["PPDocLayoutV3ForObjectDetection"],
    "attention_dropout": 0.0,
    "backbone": None,
    "backbone_config": {
        "model_type": "hgnet_v2",
        "arch": "L",
        "return_idx": [0, 1, 2, 3],
        "freeze_stem_only": True,
        "freeze_at": 0,
        "freeze_norm": True,
        "lr_mult_list": [0, 0.05, 0.05, 0.05, 0.05],
        "out_features": ["stage1", "stage2", "stage3", "stage4"],
    },
    "backbone_kwargs": None,
    "batch_norm_eps": 1e-05,
    "box_noise_scale": 1.0,
    "d_model": 256,
    "decoder_activation_function": "relu",
    "decoder_attention_heads": 8,
    "decoder_ffn_dim": 1024,
    "decoder_in_channels": [256, 256, 256],
    "decoder_layers": 6,
    "decoder_n_points": 4,
    "disable_custom_kernels": True,
    "dropout": 0.0,
    "encode_proj_layers": [2],
    "encoder_activation_function": "gelu",
    "encoder_attention_heads": 8,
    "encoder_ffn_dim": 1024,
    "encoder_hidden_dim": 256,
    "encoder_in_channels": [512, 1024, 2048],
    "encoder_layers": 1,
    "eos_coefficient": 0.0001,
    "eval_size": None,
    # NOTE(review): this key is spelled "feature_strides", whereas
    # _DOCLAYOUTV2_CONFIG and _rtdetr_config use "feat_strides". Confirm the
    # spelling matches the published V3 config before changing it.
    "feature_strides": [8, 16, 32],
    "hidden_expansion": 1.0,
    # 25 class ids ("0".."24"). Several names occur more than once
    # (footer, header, formula, text), so the mapping is not one-to-one.
    "id2label": {
        "0": "abstract",
        "1": "algorithm",
        "2": "aside_text",
        "3": "chart",
        "4": "content",
        "5": "formula",
        "6": "doc_title",
        "7": "figure_title",
        "8": "footer",
        "9": "footer",
        "10": "footnote",
        "11": "formula_number",
        "12": "header",
        "13": "header",
        "14": "image",
        "15": "formula",
        "16": "number",
        "17": "paragraph_title",
        "18": "reference",
        "19": "reference_content",
        "20": "seal",
        "21": "table",
        "22": "text",
        "23": "text",
        "24": "vision_footnote",
    },
    "initializer_range": 0.01,
    "is_encoder_decoder": True,
    # Left empty. NOTE(review): presumably because id2label cannot be
    # inverted without losing the duplicated names; confirm against the
    # published config.
    "label2id": {},
    "label_noise_ratio": 0.5,
    "layer_norm_eps": 1e-05,
    "learn_initial_query": False,
    "matcher_alpha": 0.25,
    "matcher_bbox_cost": 5.0,
    "matcher_class_cost": 2.0,
    "matcher_gamma": 2.0,
    "matcher_giou_cost": 2.0,
    "model_type": "pp_doclayout_v3",
    "normalize_before": False,
    "num_denoising": 100,
    "num_feature_levels": 3,
    "num_queries": 300,
    "positional_encoding_temperature": 10000,
    "torch_dtype": "float32",
    "use_pretrained_backbone": False,
    "use_timm_backbone": False,
    "global_pointer_head_size": 64,
    "mask_feature_channels": [64, 64],
    "x4_feat_dim": 128,
}

# UVDoc config
# Each resnet_configs group is one leading row followed by a run of identical
# rows, so the runs are generated. The numbers in dilation_values repeat the
# second element of each stage_configs pair; both are kept as explicit
# literals.
_UVDOC_CONFIG = dict(
    model_type="uvdoc",
    kernel_size=5,
    backbone_config=dict(
        model_type="uvdoc_backbone",
        resnet_head=[[3, 32], [32, 32]],
        resnet_configs=[
            [[32, 32, 1, False]] + [[32, 32, 3, False] for _ in range(2)],
            [[32, 64, 1, True]] + [[64, 64, 3, False] for _ in range(3)],
            [[64, 128, 1, True]] + [[128, 128, 3, False] for _ in range(5)],
        ],
        stage_configs=[
            [[128, 1]],
            [[128, 2]],
            [[128, 5]],
            [[128, 8], [128, 3], [128, 2]],
            [[128, 12], [128, 7], [128, 4]],
            [[128, 18], [128, 12], [128, 6]],
        ],
        out_features=[f"stage{index}" for index in range(1, 7)],
        out_indices=list(range(1, 7)),
    ),
    bridge_connector=[128, 128],
    out_point_positions2D=[[128, 32], [32, 2]],
    dilation_values=[[1], [2], [5], [8, 3, 2], [12, 7, 4], [18, 12, 6]],
    padding_mode="reflect",
    hidden_act="prelu",
)

# Label dicts for RT-DETR models
# Each id2label dict maps a class-id string to a name; the matching label2id
# dict is its inverse with integer ids, derived here rather than retyped.
_TABLE_CELL_LABELS = {"0": "cell"}
_TABLE_CELL_LABEL2ID = {name: int(key) for key, name in _TABLE_CELL_LABELS.items()}

# Twenty layout classes; the id of each name is its position in the tuple.
_DOC_LAYOUT_PLUS_ID2LABEL = {
    str(position): name
    for position, name in enumerate(
        (
            "paragraph_title",
            "image",
            "text",
            "number",
            "abstract",
            "content",
            "figure_title",
            "formula",
            "table",
            "reference",
            "doc_title",
            "footnote",
            "header",
            "algorithm",
            "footer",
            "seal",
            "chart",
            "formula_number",
            "aside_text",
            "reference_content",
        )
    )
}
_DOC_LAYOUT_PLUS_LABEL2ID = {
    name: int(key) for key, name in _DOC_LAYOUT_PLUS_ID2LABEL.items()
}

_DOC_BLOCK_LABELS = {"0": "Region"}
_DOC_BLOCK_LABEL2ID = {name: int(key) for key, name in _DOC_BLOCK_LABELS.items()}


# PP-FormulaNet configs. The L and _plus-L variants share the vision encoder
# and decoder architecture and differ only in max_position_embeddings. Field
# order follows the published config.json on
# PaddlePaddle/PP-FormulaNet-L_safetensors and
# PaddlePaddle/PP-FormulaNet_plus-L_safetensors, so keep it unchanged.
_PP_FORMULANET_VISION = dict(
    image_size=768,
    output_channels=256,
    num_channels=3,
    patch_size=16,
    hidden_act="gelu",
    layer_norm_eps=1e-6,
    attention_dropout=0.0,
    qkv_bias=True,
    use_abs_pos=True,
    use_rel_pos=True,
    window_size=14,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    global_attn_indexes=[2, 5, 8, 11],
    mlp_dim=3072,
    post_conv_in_channels=256,
    post_conv_mid_channels=512,
    post_conv_out_channels=1024,
    decoder_hidden_size=512,
)


def _pp_formulanet_text(max_position_embeddings):
    """Return a fresh PP-FormulaNet ``text_config`` dict.

    Every field is a fixed constant except ``max_position_embeddings``,
    which is stored unchanged under the key of the same name. A new dict is
    created on each call, so callers may modify the result freely.
    """
    return dict(
        activation_dropout=0.0,
        activation_function="gelu",
        attention_dropout=0.0,
        bos_token_id=0,
        d_model=512,
        decoder_attention_heads=16,
        decoder_ffn_dim=2048,
        decoder_layerdrop=0.0,
        decoder_layers=8,
        dropout=0.1,
        encoder_attention_heads=16,
        encoder_layers=12,
        eos_token_id=2,
        forced_eos_token_id=2,
        init_std=0.02,
        max_position_embeddings=max_position_embeddings,
        num_hidden_layers=12,
        pad_token_id=1,
        scale_embedding=True,
        tie_word_embeddings=False,
        vocab_size=50000,
    )


# PP-FormulaNet-L: text config with 1024 positions plus a shallow copy of the
# shared vision fields.
_PP_FORMULANET_CONFIG_L = {
    "model_type": "pp_formulanet",
    "text_config": _pp_formulanet_text(max_position_embeddings=1024),
    "vision_config": _PP_FORMULANET_VISION.copy(),
}

# The published _plus-L/config.json leaves out
# vision_config.decoder_hidden_size and falls back to the HF default (512).
# Drop the key from a shallow copy so the output matches that file exactly.
_pp_formulanet_plus_vision = _PP_FORMULANET_VISION.copy()
del _pp_formulanet_plus_vision["decoder_hidden_size"]
_PP_FORMULANET_CONFIG_PLUS_L = {
    "model_type": "pp_formulanet",
    "text_config": _pp_formulanet_text(max_position_embeddings=2560),
    "vision_config": _pp_formulanet_plus_vision,
}

# Model config registry: maps each model_name to the dict that is written out
# as that model's config.json (see the module docstring). Entries that refer
# to a module-level constant hold that object itself, not a copy.
MODEL_CONFIGS = {
    # PP-LCNet classifiers: shared base fields plus per-model labels.
    "PP-LCNet_x1_0_doc_ori": {
        **_PPLCNET_BASE,
        "id2label": {"0": "0", "1": "90", "2": "180", "3": "270"},
    },
    "PP-LCNet_x1_0_table_cls": {
        **_PPLCNET_BASE,
        "id2label": {"0": "wired_table", "1": "wireless_table"},
    },
    # The two textline entries reference the same _TEXTLINE_BLOCK_CONFIGS
    # list object. The x0_25 entry overrides the base "scale" of 1.0.
    "PP-LCNet_x0_25_textline_ori": {
        **_PPLCNET_BASE,
        "scale": 0.25,
        "block_configs": _TEXTLINE_BLOCK_CONFIGS,
        "id2label": {"0": "0_degree", "1": "180_degree"},
    },
    "PP-LCNet_x1_0_textline_ori": {
        **_PPLCNET_BASE,
        "block_configs": _TEXTLINE_BLOCK_CONFIGS,
        "id2label": {"0": "0_degree", "1": "180_degree"},
    },
    # PP-OCRv5 detectors and recognizers.
    "PP-OCRv5_mobile_det": _PPOCRV5_MOBILE_DET_CONFIG,
    "PP-OCRv5_server_det": _PPOCRV5_SERVER_DET_CONFIG,
    "PP-OCRv5_mobile_rec": _PPOCRV5_MOBILE_REC_CONFIG,
    "PP-OCRv5_server_rec": _PPOCRV5_SERVER_REC_CONFIG,
    # SLANet and SLANet_plus carry identical config values.
    "SLANet": {
        "model_type": "slanet",
        "backbone_config": {
            "model_type": "pp_lcnet",
            "scale": 1,
            "out_features": ["stage2", "stage3", "stage4", "stage5"],
            "out_indices": [2, 3, 4, 5],
        },
        "post_conv_out_channels": 96,
        "out_channels": 50,
        "hidden_size": 256,
        "max_text_length": 500,
    },
    "SLANet_plus": {
        "model_type": "slanet",
        "backbone_config": {
            "model_type": "pp_lcnet",
            "scale": 1,
            "out_features": ["stage2", "stage3", "stage4", "stage5"],
            "out_indices": [2, 3, 4, 5],
        },
        "post_conv_out_channels": 96,
        "out_channels": 50,
        "hidden_size": 256,
        "max_text_length": 500,
    },
    # SLANeXt_wired and SLANeXt_wireless carry identical config values.
    "SLANeXt_wired": {
        "model_type": "slanext",
        "vision_config": {
            "image_size": 512,
            "output_channels": 256,
            "num_channels": 3,
            "patch_size": 16,
            "hidden_act": "gelu",
            "layer_norm_eps": 1e-6,
            "attention_dropout": 0.0,
            "qkv_bias": True,
            "use_abs_pos": True,
            "use_rel_pos": True,
            "window_size": 14,
            "hidden_size": 768,
            "num_hidden_layers": 12,
            "num_attention_heads": 12,
            "global_attn_indexes": [2, 5, 8, 11],
            "mlp_dim": 3072,
        },
        "post_conv_in_channels": 256,
        "post_conv_out_channels": 512,
        "out_channels": 50,
        "hidden_size": 512,
        "max_text_length": 500,
        "loc_reg_num": 8,
    },
    "SLANeXt_wireless": {
        "model_type": "slanext",
        "vision_config": {
            "image_size": 512,
            "output_channels": 256,
            "num_channels": 3,
            "patch_size": 16,
            "hidden_act": "gelu",
            "layer_norm_eps": 1e-6,
            "attention_dropout": 0.0,
            "qkv_bias": True,
            "use_abs_pos": True,
            "use_rel_pos": True,
            "window_size": 14,
            "hidden_size": 768,
            "num_hidden_layers": 12,
            "num_attention_heads": 12,
            "global_attn_indexes": [2, 5, 8, 11],
            "mlp_dim": 3072,
        },
        "post_conv_in_channels": 256,
        "post_conv_out_channels": 512,
        "out_channels": 50,
        "hidden_size": 512,
        "max_text_length": 500,
        "loc_reg_num": 8,
    },
    # Document layout models with hand-written configs.
    "PP-DocLayoutV2": _DOCLAYOUTV2_CONFIG,
    "PP-DocLayoutV3": _DOCLAYOUTV3_CONFIG,
    # RT-DETR family: built by _rtdetr_config and differing only in the label
    # dicts passed in. Both table-cell detectors receive the same labels.
    "RT-DETR-L_wired_table_cell_det": _rtdetr_config(
        _TABLE_CELL_LABELS,
        _TABLE_CELL_LABEL2ID,
    ),
    "RT-DETR-L_wireless_table_cell_det": _rtdetr_config(
        _TABLE_CELL_LABELS,
        _TABLE_CELL_LABEL2ID,
    ),
    "PP-DocLayout_plus-L": _rtdetr_config(
        _DOC_LAYOUT_PLUS_ID2LABEL,
        _DOC_LAYOUT_PLUS_LABEL2ID,
    ),
    "PP-DocBlockLayout": _rtdetr_config(
        _DOC_BLOCK_LABELS,
        _DOC_BLOCK_LABEL2ID,
    ),
    "UVDoc": _UVDOC_CONFIG,
    "PP-FormulaNet-L": _PP_FORMULANET_CONFIG_L,
    "PP-FormulaNet_plus-L": _PP_FORMULANET_CONFIG_PLUS_L,
    # NOTE(review): the top-level "architectures" names
    # GotOcr2ForConditionalGeneration while auto_map points at the
    # PPChart2Table classes. Presumably this mirrors the published config;
    # confirm before changing either.
    "PP-Chart2Table": {
        "model_type": "pp_chart2table",
        "architectures": ["GotOcr2ForConditionalGeneration"],
        "auto_map": {
            "AutoConfig": "configuration_pp_chart2table.PPChart2TableConfig",
            "AutoModel": "modeling_pp_chart2table.PPChart2TableForConditionalGeneration",
        },
        "freeze_vision_tower": False,
        "image_seq_length": 256,
        "use_im_start_end": True,
        "bos_token_id": 151643,
        "eos_token_id": 151643,
        "im_start_token": 151857,
        "im_end_token": 151858,
        "image_token_index": 151859,
        "vocab_size": 151860,
        "initializer_range": 0.02,
        "max_window_layers": 21,
        "tie_word_embeddings": True,
        "dtype": "bfloat16",
        "use_cache": True,
        "use_sliding_window": False,
        "vision_config": {
            "architectures": ["PPChart2TableVisionModel"],
            "auto_map": {
                "AutoConfig": "configuration_pp_chart2table.PPChart2TableConfig",
                "AutoModel": "modeling_pp_chart2table.PPChart2TableVisionModel",
            },
            "num_hidden_layers": 12,
            "hidden_size": 768,
            "output_channels": 256,
            "hidden_act": "gelu",
            "num_channels": 3,
            "image_size": 1024,
            "mlp_ratio": 4.0,
            "mlp_dim": 3072,
            "layer_norm_eps": 1e-05,
            "num_attention_heads": 12,
            "patch_size": 16,
            "qkv_bias": True,
            "use_rel_pos": True,
            "use_abs_pos": True,
            "global_attn_indexes": [2, 5, 8, 11],
            "window_size": 14,
            "attention_dropout": 0.0,
        },
        "text_config": {
            "architectures": ["PPChart2TableTextModel"],
            "auto_map": {
                "AutoConfig": "configuration_qwen2.Qwen2Config",
                "AutoModel": "modeling_qwen2.Qwen2Model",
            },
            "hidden_size": 1024,
            "hidden_act": "silu",
            "num_hidden_layers": 24,
            "num_attention_heads": 16,
            "num_key_value_heads": 16,
            "intermediate_size": 2816,
            "attention_dropout": 0.0,
            "sliding_window": 32768,
            "rms_norm_eps": 1e-06,
            "vocab_size": 151860,
            "max_position_embeddings": 32768,
            "rope_parameters": {
                "rope_theta": 1000000.0,
                "rope_type": "default",
            },
        },
    },
}
