o
    *jR                     @   s   d Z ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ e ZdgZejeje	jdG dd deZdS )z% Generative Multimodal Model Wrapper.    )AnyDictN)
transforms)Models)
TorchModel)MODELS)	RLEGModel)
OutputKeys)	LoadImage)	ModelFileTasks)
get_loggerRLEGForMultiModalEmbedding)module_namec                       sT   e Zd ZdZd fdd	Zdd Zdd Zd	eee	f d
eee	f fddZ
  ZS )r   z Generative multi-modal model for multi-modal embedding.
    The model is trained by representation learning with embedding generation.
    Inputs could be image or text or both of them.
    Outputs could be features of input image or text,
    r   c                    s   t  j|||d| t|d| _td|tj}| j	| | j
  || _| jdkrHtj rH| jd| j td| j nd| _td ttd	t td
dg| _d S )N)	model_dir	device_id)r   z{}/{}r   cuda:{}zUse GPU: {}zUse CPU for inference)   r   )g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)super__init__r   modeltorchloadformatr   ZTORCH_MODEL_BIN_FILEZload_state_dictevalr   cudaZis_availabletologgerinfoTZComposeZResizeZToTensorZ	Normalizeimg_preprocessor)selfr   r   argskwargsZpretrained_params	__class__ h/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/rleg/rleg.pyr      s0   

z#RLEGForMultiModalEmbedding.__init__c                 C   sD   |d u rd S t |}| |d }| jdkr |d| j}|S )N)N.r   r   )r
   Zconvert_to_imgr!   r   r   r   )r"   Z	input_imgZ
img_tensorr'   r'   r(   parse_image5   s   

z&RLEGForMultiModalEmbedding.parse_imagec                 C   sj   |d u s
t |dkrd S t|tr| j|}n	tdt| | jdkr/|d	| j}|
ddS )Nr   ztext should be str, but got r      r   )len
isinstancestrr   tokenize	TypeErrortyper   r   r   view)r"   Ztext_strZtext_ids_tensorr'   r'   r(   
parse_text>   s   

z%RLEGForMultiModalEmbedding.parse_textinputreturnc              	   C   sz   | d| dd }| d| dd }| |}| |}| ||}tj| dd tj| dd tj| dd i}|S )NimageZimgtexttxtZimage_featureZtext_featurecaption)getr)   r2   r   r	   ZIMG_EMBEDDINGZTEXT_EMBEDDINGZCAPTION)r"   r3   Zimage_inputZ
text_inputr5   r6   outoutputr'   r'   r(   forwardJ   s   

z"RLEGForMultiModalEmbedding.forward)r   )__name__
__module____qualname____doc__r   r)   r2   r   r-   r   r<   __classcell__r'   r'   r%   r(   r      s    	*)r@   typingr   r   r   Ztorchvisionr   r    Zmodelscope.metainfor   Zmodelscope.models.baser   Zmodelscope.models.builderr   Z(modelscope.models.multi_modal.rleg.modelr   Zmodelscope.outputsr	   Zmodelscope.preprocessorsr
   Zmodelscope.utils.constantr   r   Zmodelscope.utils.loggerr   r   __all__Zregister_moduleZ generative_multi_modal_embeddingZrlegr   r'   r'   r'   r(   <module>   s$   