o
    *j                     @   s  d Z ddlmZ ddlmZmZ ddlZddlZ	ddl
Z
ddlmZ ddlm  mZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% e% Z&dgZ'ej(e#j)ej*dG dd deZ+dS )z% Generative Multimodal Model Wrapper.    N)AnyDict)Image)
transforms)Models)
TorchModel)MODELS)	GEMMModel)
OutputKeys)	LoadImage)	ModelFileTasks)
get_loggerGEMMForMultiModalEmbedding)module_namec                       sT   e Zd ZdZd fdd	Zdd Zdd Zd	eee	f d
eee	f fddZ
  ZS )r   z Generative multi-modal model for multi-modal embedding
    Inputs could be image or text or both of them.
    Outputs could be features of input image or text,
    image caption could also be produced when image is available.
    r   c              	      s   t  j|||d| t|d| _td|tj}| j	| | j
  || _| jdkrHtj rH| jd| j td| j nd| _td ttd	td	t td
dg| _d S )N)	model_dir	device_id)r   z{}/{}r   cuda:{}zUse GPU: {}zUse CPU for inference   )g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)super__init__r	   
gemm_modeltorchloadformatr   ZTORCH_MODEL_BIN_FILEZload_state_dictevalr   cudaZis_availabletologgerinfoTZComposeZResizeZ
CenterCropZToTensorZ	Normalizeimg_preprocessor)selfr   r   argskwargsZpretrained_params	__class__ n/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/gemm/gemm_model.pyr   %   s2   

z#GEMMForMultiModalEmbedding.__init__c                 C   sD   |d u rd S t |}| |d }| jdkr |d| j}|S )N)N.r   r   )r   Zconvert_to_imgr"   r   r   r   )r#   Z	input_imgZ
img_tensorr(   r(   r)   parse_image<   s   

z&GEMMForMultiModalEmbedding.parse_imagec                 C   sj   |d u s
t |dkrd S t|tr| j|}n	tdt| | jdkr/|d	| j}|
ddS )Nr   ztext should be str, but got r      r   )len
isinstancestrr   tokenize	TypeErrortyper   r   r   view)r#   Ztext_strZtext_ids_tensorr(   r(   r)   
parse_textE   s   

z%GEMMForMultiModalEmbedding.parse_textinputreturnc           
   	   C   s   | d| dd }| d| dd }| dd }| |}| |}|du p+|dk}| |||}tj| dd tj| d	d tj| d
d i}	|	S )NimageZimgtexttxt
captioningT Zimage_featureZtext_featurecaption)getr*   r3   r   r
   ZIMG_EMBEDDINGZTEXT_EMBEDDINGZCAPTION)
r#   r4   Zimage_inputZ
text_inputZcaptioning_inputr6   r7   r9   outoutputr(   r(   r)   forwardQ   s   

z"GEMMForMultiModalEmbedding.forward)r   )__name__
__module____qualname____doc__r   r*   r3   r   r.   r   r?   __classcell__r(   r(   r&   r)   r      s    	*),rC   Zos.pathpathZosptypingr   r   jsonnumpynpr   Ztorch.nnnnZtorch.nn.functionalZ
functionalFZPILr   Ztorchvisionr   r!   Zmodelscope.metainfor   Zmodelscope.models.baser   Zmodelscope.models.builderr   Z,modelscope.models.multi_modal.gemm.gemm_baser	   Zmodelscope.outputsr
   Zmodelscope.preprocessorsr   Zmodelscope.utils.constantr   r   Zmodelscope.utils.loggerr   r   __all__Zregister_moduleZ generative_multi_modal_embeddingZgemmr   r(   r(   r(   r)   <module>   s0   