o
    *j/A                     @   s6  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d d	lmZmZm Z  d d
l!m"Z" e" Z#dgZ$G dd de%Z&G dd deZ'ej(e j)ej*dG dd deZ+			dddZ,dS )    N)	roi_align)Models)
TorchModel)MODELS)FPNTrans)LayoutRobertaModelLayoutRobertaPreTrainedModel)TransformerDecoderTransformerDecoderLayer)ModeKeys	ModelFileTasks)
get_loggerVLDocForDocVLEmbeddingc                   @   s   e Zd Zdd ZdS )GeoVLDocModelOutputsc                 C   s"   || _ || _|| _|| _|| _d S )Ntext_featurestext_mm_featuresblock_vis_featuresblock_vis_mm_featuresimage_mm_features)selfr   r   r   r   r    r   j/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/vldoc/model.py__init__"   s
   	
zGeoVLDocModelOutputs.__init__N)__name__
__module____qualname__r   r   r   r   r   r       s    r   c                       sZ   e Zd Zd
 fdd	ZdefddZ																		ddd	Z  ZS )GeoVLDocModelFc                    s   t  | || _|| _t| jdd r'| jjd dkr!t|| _nt|| _nt|| _t| jj	dd| _
tddg| _td| jj| _t| jj| jj| jjdd	}t|d| _t| jj| jj| jjdd	}t|d| _|   d S )
Narchitecturesr   r   F)Zimg_sizeZ	inner_vit      T)Z	self_attn)superr   confighard_negtive_samplinggetattrr   r   text_encoderr   Z
image_sizevisual_encodernnZAdaptiveAvgPool2dpoolZLinearZhidden_size
vis_linearr
   Znum_attention_headsZintermediate_sizer	   cross_modal_textcross_modal_visualZinit_weights)r   r#   r$   Zcross_modal_text_layerZcross_modal_visual_layer	__class__r   r   r   8   s<   
zGeoVLDocModel.__init__	ckpt_pathc                 C   sF   t j|dd}i }| D ]\}}|dd}|||< q| | d S )NcpuZmap_locationgeo_vl_doc_model. )torchloaditemsreplaceZload_state_dict)r   r/   
state_dictZstate_dict_newkvr   r   r   from_pretrainedZ   s   
zGeoVLDocModel.from_pretrainedNc           +      K   s  |j \}}|d ur|n| jj}||d< | jjd dkr/| j|f||||	|
||||d	|}n| j|f||||	|
||||d	|}|d d \}}|j \}}tjd||jd|d	||}|||f }|j \}}}| 
|}tjd||jd|d	||d}t||fd|| d	j|d
 jd} |d
 jtjkrt|d
 tj| tjd|d
 dd d}!|!j|d
 jd}!nt|d
 | tjd|d
 dd d}!|!dd||d}!| |!}!|!|d }!| |d
 dd}"| |"d}"t|"|!fd}#t|df|j}$t|$|fd}%d| dk}&d|% dk}'| j|dd|#dd|&|'d}(| j|#dd|dd|'|&d})|(dd}(|)dd})|)d d dd f }*t||(|!|*|)dS )NZ	line_bboxr   r   )	bboxattention_masktoken_type_idsposition_ids	head_maskinputs_embedsoutput_attentionsoutput_hidden_statesreturn_dict   )devicer       Zfeat_ms)dtypeg     @@)Zspatial_scaler!   )ZtgtZmemoryZtgt_key_padding_maskZmemory_key_padding_maskr   )shaper#   Zuse_return_dictr   r&   r4   ZarangerF   Zreshapeexpandr'   Z	unsqueezecattorI   Zfloat16r   Zfloat32sizeZsqueezer*   r)   Zonesr+   Z	transposer,   r   )+r   	input_idsimager<   bbox_4p_normalizedr=   first_token_idxesfirst_token_idxes_maskr>   r?   r@   rA   encoder_hidden_statesencoder_attention_maskpast_key_values	use_cacherB   rC   rD   kwargsZ
batch_sizeZseq_lenZoutputsZsequence_outputZpooled_output_Z	num_firstZB_batch_dimZfeature_bboxZ	block_numZ
visual_outZ
batch_idxsZbatch_idx_with_bboxZblk_vis_featuresZfull_img_featuresZvis_inpsZglb_feat_attnZvis_maskZnew_attention_maskZnew_vis_maskZtext_mm_featZvis_mm_featr   r   r   r   forwardb   s   










zGeoVLDocModel.forward)FNNNNNNNNNNNNNNNNNN)r   r   r   r   strr;   rZ   __classcell__r   r   r-   r   r   6   s,    "	r   )module_namec                       sT   e Zd ZdZdef fddZ																		dddZ  ZS )	r   z
    Generate multi-modal document embeddings in segment-level and token-level.

    Args:
        model_dir:
            the path in model hub, e.g., 'damo/multi-modal_convnext-roberta-base_vldoc-embedding'
    	model_dirc           	   	      sF  t  j|d|i| ddlm} tj|d}td	| tj
|s(J ||| _t| j| _tj|tj}tj
|sDJ | j| td	| ddlm} tj|tj}||| _tj rwd	ttjd	dnd
| _tj r| j| j td	ttjd	d d S | j  td d S )Nr_   r   )LayoutRobertaConfigzconfig.jsonzLoading config file from {}zLoading model from {})VLDocXLMTokenizerzcuda:{}Z
LOCAL_RANKr0   z%Use GPU {} for finetuning & inferencez"Use CPU for finetuning & inference)r"   r   ;modelscope.models.multi_modal.vldoc.modeling_layout_robertar`   ospathjoinloggerinfoformatexistsZfrom_json_filer#   r   	doc_modelr   ZTORCH_MODEL_FILEr;   Z0modelscope.models.multi_modal.vldoc.tokenizationra   ZTOKENIZER_FOLDER	tokenizerr4   cudaZis_availableintenvirongetrF   rM   float)	r   r_   argsrX   r`   Zmodel_cfg_pathZ
model_pathra   Ztokenizer_pathr-   r   r   r   	  s6   


zVLDocForDocVLEmbedding.__init__Nc                 K   s   | j di d|d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d||}t|j|jdS )a  
        Args:
            - input_ids: :math:`(B, T, E)`, the input tokens, where B is the batch size,
              T is the max token size, E is the embedding dimension.
            - image: :math:`(B, C, H, W)`, normalized images.
            - bbox: :math:`(B, T, 4)`, segment boxes denoted by top-left and bottom-right
              vertexes whose values are normalized to [0, 1000).
            - bbox_4p_normalized: :math:`(B, T, 8)`, word boxes denoted by 4 vertexes, whose
              values are normalized to [0, 1).
            - attention_mask: :math:`(B, T)`, mask for input tokens, where 0 means masked.
            - first_token_idxes: :math:`(B, S)`, indexes of the corresponding first tokens
              of all segments, where S is the max segment size.
            - first_token_idxes_mask: :math:`(B, S)`, mask for segments, where 0 means masked.
        Optional:
            - line_rank_id: :math:`(B, T)`, orders of segments.
            - line_rank_inner_id: :math:`(B, T)`, BIE-like tags.

        To be more specific, please refer to the class `TextLayoutSerializer` in
          `modelscope/models/multi_modal/vldoc/processing.py`.
        rO   rP   r<   rQ   r=   rR   rS   r>   r?   r@   rA   rT   rU   rV   rW   rB   rC   rD   )Zimg_embeddingZtext_embeddingNr   )rj   dictr   r   )r   rO   rP   r<   rQ   r=   rR   rS   r>   r?   r@   rA   rT   rU   rV   rW   rB   rC   rD   rX   Zvldoc_outputsr   r   r   rZ   *  sT   
)	
zVLDocForDocVLEmbedding.forwardr[   )r   r   r   __doc__r\   r   rZ   r]   r   r   r-   r   r      s,    "robertac                    s  d u rt j|ddg }g }t }|dkrFtt|D ]&}|| }	d }
|	dr7|	dd}
t	|
}	|
rE|
||  |
|
 qt||D ]\}}
||
< qKg g g  tdd  d urn_d fdd		d}t| d
stdd  D rd}| |d tdkrtd| jj tdkrtd| jj t dkrtd| jjd | S )Nr0   r1   rt   zroberta.zgeo_vl_doc_model.text_encoder.	_metadatar3   c              	      sh   d u ri n	 |d d i }| ||d  | j D ]\}}|d ur1||| d  q d S )NrG   T.)ro   Z_load_from_state_dictZ_modulesr6   )moduleprefixZlocal_metadatanamechild
error_msgsr5   metadataZmissing_keysr8   Zunexpected_keysr   r   r5     s   z$init_pretrained_weight.<locals>.loadZgeo_vl_doc_modelc                 s   s    | ]}| d V  qdS )r2   N)
startswith).0sr   r   r   	<genexpr>  s    

z)init_pretrained_weight.<locals>.<genexpr>r2   )rx   r   z7Weights of {} not initialized from pretrained model: {}z0Weights from pretrained model not used in {}: {}z*Error(s) in loading state_dict for {}:
	{}z
	)r3   )r4   r5   listkeysrangelenr~   r7   copydeepcopyappendzippopr%   ru   hasattranyrf   rg   rh   r.   r   RuntimeErrorre   )modelZpretrained_model_pathr8   	cache_dirZinit_backboneZold_keysZnew_keysZstate_dict_keysikeyZnew_keyold_keyZstart_prefixr   r{   r   init_pretrained_weightn  sf   


	r   )NNrt   )-r   loggingmathrc   resysjsonr4   Ztorch.distributeddistributeddistZtorch.nnr(   Ztorchvision.opsr   Zmodelscope.metainfor   Zmodelscope.modelsr   Zmodelscope.models.builderr   Z2modelscope.models.multi_modal.vldoc.conv_fpn_transr   rb   r   r   Z5modelscope.models.multi_modal.vldoc.transformer_localr	   r
   Zmodelscope.utils.constantr   r   r   Zmodelscope.utils.loggerr   rf   __all__objectr   r   Zregister_moduleZdocument_vl_embeddingZvldocr   r   r   r   r   r   <module>   s<    Jq