o
    *j=                     @   s  d dl mZmZ d dlZd dlZd dlZd dlmZ d dl	m  m
Z d dlmZ d dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z m!Z! d dl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( e# Z)dgZ*ej+e!j,ej-dG dd deZ.dS )    )AnyDictN)Image)BertWordPieceTokenizer)Compose	NormalizeResizeToTensor)Models)
TorchModel)MODELS)
OutputKeys)	LoadImage)	ModelFileTasks)
get_logger   )TEAMBertWrapperCLIPVisionWrapper
CrossLayerTEAMForMultiModalSimilarity)module_namec                       sj   e Zd Zd fdd	Zdd Zdeeef deeef fdd	Zd
eeef deeef fddZ	  Z
S )r   r   c                    s   t  j|||d| td|ddd}d |j_t }t||d|tj	d| _
| j
  || _| jdkrPtj rP| j
d	| j td
| j nd| _td td|tjdd| _| jjdd tdd}ttdtjdt |g| _d S )N)	model_dir	device_idz{}/text_config.jsoni   i   )Zconfig_jsonZfeat_dimZ	token_dimz{}/{})Z
pretrainedr   cuda:{}zUse GPU: {}zUse CPU for inferenceF)Z	lowercase   )
max_length)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)   r   )interpolation)super__init__r   formatZbertclsr   r   r   ZTORCH_MODEL_BIN_FILEmodelevalr   torchcudaZis_availabletologgerinfor   Z
VOCAB_FILEtext_tokenizerZenable_truncationr   r   r   r   ZBICUBICr	   img_preprocessor)selfr   r   argskwargsZ
text_modelZimage_modelZnorm_op	__class__ n/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/team/team_model.pyr"      sJ   

z$TEAMForMultiModalSimilarity.__init__c                 C   sz   | j |}d}td|f }td|f}|j|j}}t||ddt|f< t||ddt|f< ||fS )Nr   r   r   )	r,   encoder'   ZzeroslongZidsZattention_maskZtensorlen)r.   text_strtokensZ
max_tokenstext_ids_tensortext_mask_tensorZtext_idsZ	text_maskr3   r3   r4   tokenize_textC   s   z)TEAMForMultiModalSimilarity.tokenize_textinputreturnc                 C   s  t   d|v r@|d d ur@|d }t|}| |d }| jdkr-|d| j}| j	d d |\}}}}|
  }nd\}}d|v r|d d ur|d }t|tr_| |\}}	n	tdt| | jdkr|d| j}|	d| j}	| j	||	d \}
}}}|

  }
nd\}}	|d ur|	d ur|d ur| j||	|d  }nd }tj|tj|
tj|i}|W  d    S 1 sw   Y  d S )NZimg)N.r   r   )NNtextztext should be str, but got )r'   Zno_gradr   Zconvert_to_imgr-   r   r)   r#   r%   Zget_featurecpunumpy
isinstancestrr<   	TypeErrortypeZget_cross_scoreitemr   ZIMG_EMBEDDINGZTEXT_EMBEDDINGZSCORES)r.   r=   Z	input_imgZ
img_tensor_Zimage_featureZimage_tensorsr8   r:   r;   Ztext_featureZtext_tensorsZscoreoutputr3   r3   r4   forwardM   sf   





$z#TEAMForMultiModalSimilarity.forwardinputsc                 C   s   |S )Nr3   )r.   rJ   r3   r3   r4   postprocess~   s   z'TEAMForMultiModalSimilarity.postprocess)r   )__name__
__module____qualname__r"   r<   r   rC   r   rI   rK   __classcell__r3   r3   r1   r4   r      s
    %"
*1)/typingr   r   Zcv2rA   npr'   Ztorch.nnnnZtorch.nn.functionalZ
functionalFZPILr   Z
tokenizersr   Ztorchvision.transformsr   r   r   r	   Zmodelscope.metainfor
   Zmodelscope.models.baser   Zmodelscope.models.builderr   Zmodelscope.outputsr   Zmodelscope.preprocessorsr   Zmodelscope.utils.constantr   r   Zmodelscope.utils.loggerr   utilsr   r   r   r   r*   __all__Zregister_moduleZmulti_modal_similarityZteamr   r3   r3   r3   r4   <module>   s*   