o
    *j                     @   s   d Z ddlZddlZddlZddlm  mZ ddlmZ ddlm	Z	m
Z
 G dd dejZG dd dejZG d	d
 d
ejZdS )z* Generative Multimodal Model Architecture.    N)nn)	gemm_base	tokenizerc                       s*   e Zd ZdZ fddZdddZ  ZS )ImageEncoderz4Image Feature Encoder
    ViT Style Transformer
    c              	      sB   t    |d d \}}}}}tj|||||d |dd| _d S )N   @   F)Zinput_resolutionZ
patch_sizewidthlayersheadsZ
output_dimZuse_gc)super__init__r   ZVisualTransformervisual)selfconfigsZ	embed_dimZimage_resolutionZvision_layersZvision_widthZvision_patch_size	__class__ i/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/rleg/model.pyr      s   


zImageEncoder.__init__Fc                 C   sJ   |  |}|d d dd d d f }|d d dd d f }|r#||fS |S )N   r   )r   )r   imagereturn_tokensfeaturestokens	embeddingr   r   r   forward%   s   
zImageEncoder.forwardF)__name__
__module____qualname____doc__r   r   __classcell__r   r   r   r   r      s    r   c                       s4   e Zd ZdZ fddZd
ddZddd	Z  ZS )TextEncoderz4Text Feature Encoder
    BERT style transformer
    c                    s   t    |dd  \}}}}}tj|||| |d| _t||| _t	t
||| _t|| _t	t
||d | _d S )N)r   r	   r
   Z	attn_maskr   )r   r   r   ZTransformerbuild_attention_masktransformerr   Z	Embeddingtoken_embedding	Parametertorchemptypositional_embeddingZ	LayerNormln_finaltext_projection)r   r   Zcontext_lengthZ
vocab_sizeZmodel_widthZmodel_headsZmodel_layersr   r   r   r   1   s$   




zTextEncoder.__init__Nc                 C   s   t ||d }|d |S )Ng     r   )r'   onesZtriu_)r   Z
seq_lengthmaskr   r   r   r#   D   s   
z TextEncoder.build_attention_maskFc                 C   s|   |  |}|| j }|ddd}| |}|ddd}| |}|t|jd |jdddf | j	 }|r<||fS |S )Nr   r      )dim.)
r%   r)   Zpermuter$   r*   r'   ZarangeshapeZargmaxr+   )r   textr   xr   r   r   r   r   I   s   



zTextEncoder.forwardNr   )r   r   r   r   r   r#   r   r    r   r   r   r   r!   ,   s
    
r!   c                       sR   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Ze	
 dddZ  ZS )	RLEGModelz Generative multi-modal model, trained with RLEG method.
    It takes image or text or both of them as input, and produce
    the corresponding features of inputs.
    c                    s   t    td|ddd}t| }W d    n1 s!w   Y  t| d }|| }t	j
|d}t|| _t|| _t|| _ttg | _d S )Nz{}/encoder_config.jsonrzutf-8)encodingr   zbpe_vocab_16e6.txt.gz)r   r   openformatjsonloadsreadlistkeysospathjoinr   ZSimpleTokenizerr   image_encoderr!   text_encoderr   r&   r'   r,   Zlogit_scale)r   Z	model_dirfZmodel_configZ
model_nameZconfig_argsZbpe_pathr   r   r   r   \   s   



zRLEGModel.__init__c                 C   s   t | j |gd }|S )Nr   )r   Zclip_tokenize)r   Ztext_strZtext_tensorr   r   r   tokenizek   s   zRLEGModel.tokenizec                 C      |  |}tj|ddd}|S Nr.   r/   )pr0   )rC   F	normalize)r   r2   featurer   r   r   encode_texto      
zRLEGModel.encode_textc                 C   rF   rG   )rB   rI   rJ   )r   r   rK   r   r   r   encode_imaget   rM   zRLEGModel.encode_imagec                 C   s   |   }|S r4   )cpunumpy)r   Zfeatoutr   r   r   
parse_featy   s   zRLEGModel.parse_featNc                 C   sF   d\}}|dur|  | |}|dur|  | |}||d}|S )zW It takes image or text as input,
        and extracts the features as output.
        NNN)Zimage_featuretext_feature)rR   rN   rL   )r   r   r2   Zimg_featurerT   rQ   r   r   r   r   }   s   zRLEGModel.forwardrS   )r   r   r   r   r   rE   rL   rN   rR   r'   Zno_gradr   r    r   r   r   r   r5   V   s    r5   )r   r?   r:   r'   Ztorch.nn.functionalr   Z
functionalrI   Z"modelscope.models.multi_modal.gemmr   r   Moduler   r!   r5   r   r   r   r   <module>   s   *