""" Generative Multimodal Model Architecture. """
import json
import os
from collections import OrderedDict
from typing import Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import LayerNorm

from modelscope.models.multi_modal.gemm.tokenizer import (SimpleTokenizer,
                                                          clip_tokenize)


class Bottleneck(nn.Module):
    """ ResNet style bottleneck module
    From https://github.com/openai/CLIP/blob/main/clip/model.py
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        # Anti-aliasing: an average pool runs before any strided convolution.
        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        self.stride = stride

        if stride > 1 or inplanes != planes * Bottleneck.expansion:
            # Downsampling layer is prepended with an avgpool, and the
            # subsequent convolution has stride 1.
            self.downsample = nn.Sequential(
                OrderedDict([('-1', nn.AvgPool2d(stride)),
                             ('0',
                              nn.Conv2d(
                                  inplanes,
                                  planes * self.expansion,
                                  1,
                                  stride=1,
                                  bias=False)),
                             ('1', nn.BatchNorm2d(planes * self.expansion))]))

    def forward(self, x: torch.Tensor):
        identity = x

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)
        return out
class QuickGELU(nn.Module):
    """ A quick version of GELU module
    From https://github.com/openai/CLIP/blob/main/clip/model.py
    """

    def forward(self, x: torch.Tensor):
        return x * torch.sigmoid(1.702 * x)
ZdejfddZ	  Z
S )ResidualAttentionBlockzz Multihead attention block with residual link
    Adapted from https://github.com/openai/CLIP/blob/main/clip/model.py
    Nd_modeln_head	attn_maskc              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )NZc_fcr
   Zgeluc_proj)r   r   r   ZMultiheadAttentionattnr   ln_1r%   r   Linearr9   mlpln_2r>   )r&   r<   r=   r>   r(   r*   r+   r   V   s   



zResidualAttentionBlock.__init__r,   c                 C   s   | j d ur| j j|j|jdnd | _ | j }|d ur4|jd |jd kr4| j d |jd d |jd f }| j|||d|dd S )Ndtypedevicer   F)need_weightsr>   )r>   torF   rG   shaper@   )r&   r,   r>   r*   r*   r+   	attentiond   s   
"z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S r-   )rK   rA   rC   rD   r:   r*   r*   r+   r0   m   s   zResidualAttentionBlock.forwardr-   )r2   r3   r4   r5   intr6   r7   r   rK   r0   r8   r*   r*   r(   r+   r;   Q   s    	r;   c                       sN   e Zd ZdZ		ddedededejdef
 fd	d
ZdejfddZ	  Z
S )Transformerzh Transformer encoder module
    Adapted from https://github.com/openai/CLIP/blob/main/clip/model.py
    NFwidthlayersheadsr>   use_gcc                    sB   t    || _| _|| _tj fddt|D  | _d S )Nc                    s   g | ]}t  qS r*   )r;   ).0_r>   rP   rN   r*   r+   
<listcomp>   s    
z(Transformer.__init__.<locals>.<listcomp>)	r   r   rQ   rN   rO   r   r%   range	resblocks)r&   rN   rO   rP   r>   rQ   r(   rT   r+   r   x   s   
zTransformer.__init__r,   c                 C   s
   |  |S r-   )rW   r:   r*   r*   r+   r0      s   
zTransformer.forward)NF)r2   r3   r4   r5   rL   r6   r7   boolr   r0   r8   r*   r*   r(   r+   rM   s   s     rM   c                	       s>   e Zd ZdZ	ddedededef fddZd	d
class AttentionPool2d(nn.Module):
    """ Pool layer with attention module
    Adapted from https://github.com/openai/CLIP/blob/main/clip/model.py
    """

    def __init__(self,
                 spacial_dim: int,
                 embed_dim: int,
                 num_heads: int,
                 output_dim: int = None):
        super().__init__()
        self.positional_embedding = nn.Parameter(
            torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        # NCHW -> (HW)NC, then prepend the mean token as an extra query.
        x = x.reshape(x.shape[0], x.shape[1],
                      x.shape[2] * x.shape[3]).permute(2, 0, 1)
        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)
        x = x + self.positional_embedding[:, None, :].to(x.dtype)
        x, _ = F.multi_head_attention_forward(
            query=x,
            key=x,
            value=x,
            embed_dim_to_check=x.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=torch.cat(
                [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False)
        # Keep all tokens (mean token first) in NLD layout.
        return x.permute(1, 0, 2).contiguous()
Zd
d Z  ZS )CrossAttentionz Cross attention module with query and context as input
    Adapted from https://github.com/lucidrains/CoCa-pytorch/blob/main/coca_pytorch/coca_pytorch.py
    N@      Fr
   )context_dimdim_headrP   parallel_ffff_multnorm_contextc          
   	      s   t    || _|d | _|| }|d u r|n|}t|| _|r$t|nt | _tj	||dd| _
tj	||d dd| _tj	||dd| _|| }	|rettj	||	d ddt tj	|	|dd| _d S d | _d S )N      Fr   r^   )r   r   rP   scaler   normr   r   context_normrB   to_qto_kvto_outr%   ZSwiGLUff)
r&   rf   rx   ry   rP   rz   r{   r|   Z	inner_dimZff_inner_dimr(   r*   r+   r      s.   
	


zCrossAttention.__init__c           	      C   s   |  |}| |}| |}||jd |jd | jddddd }|| j }| 	|j
ddd\}}td||}||jddd	 }|jdd}td
||}|dddd |jd |jd d}| |}| jdurz|| | }|S )z
        einstein notation
        b - batch
        h - heads
        n, i, j - sequence length (base sequence length, source, target)
        d - feature dimension
        r   r   rl   r^   r   rh   zb h i d, b j d -> b h i jTre   zb h i j, b j d -> b h i dN)r   r   r   viewrJ   rP   ro   rt   r~   r   chunkr6   ZeinsumZamaxsoftmaxrn   r   r   )	r&   r,   contextqkvsimr@   r/   r*   r*   r+   r0      s2   
	





zCrossAttention.forward)r2   r3   r4   r5   r   r0   r8   r*   r*   r(   r+   ru      s    ru   c                       s8   e Zd ZdZ		d fdd	ZdddZd	d
class ModifiedResNet(nn.Module):
    """ Modified ResNet backbone
    From https://github.com/openai/CLIP/blob/main/clip/model.py
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    def __init__(self,
                 layers,
                 output_dim,
                 heads,
                 input_resolution=224,
                 width=64):
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # the 3-layer stem
        self.conv1 = nn.Conv2d(
            3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width // 2)
        self.conv2 = nn.Conv2d(
            width // 2, width // 2, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(width // 2)
        self.conv3 = nn.Conv2d(
            width // 2, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.AvgPool2d(2)
        self.relu = nn.ReLU(inplace=True)

        # residual layers; _inplanes is mutated while the layers are built
        self._inplanes = width
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        embed_dim = width * 32  # the ResNet feature dimension
        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim,
                                        heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        layers = [Bottleneck(self._inplanes, planes, stride)]
        self._inplanes = planes * Bottleneck.expansion
        for _ in range(1, blocks):
            layers.append(Bottleneck(self._inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):

        def stem(x):
            for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
                             (self.conv3, self.bn3)]:
                x = self.relu(bn(conv(x)))
            x = self.avgpool(x)
            return x

        x = stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.attnpool(x)
        return x
class VisualTransformer(nn.Module):
    """ ViT transformer backbone
    From https://github.com/openai/CLIP/blob/main/clip/model.py
    """

    def __init__(self, input_resolution: int, patch_size: int, width: int,
                 layers: int, heads: int, output_dim: int, use_gc: bool):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        self.conv1 = nn.Conv2d(
            in_channels=3,
            out_channels=width,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False)

        scale = width**-0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn(
            (input_resolution // patch_size)**2 + 1, width))
        self.ln_pre = LayerNorm(width)
        self.transformer = Transformer(width, layers, heads, use_gc=use_gc)
        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x: torch.Tensor):
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        z = torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
        x = torch.cat([self.class_embedding.to(x.dtype) + z, x], dim=1)
        x = x + self.positional_embedding.to(x.dtype)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        # Keep all tokens so that callers can use both the class token and
        # the patch tokens.
        x = self.ln_post(x)
        if self.proj is not None:
            x = x @ self.proj
        return x
zVisualTransformer.forward)r2   r3   r4   r5   rL   rX   r   r6   r7   r0   r8   r*   r*   r(   r+   r   :  s    r   c                   @   s   e Zd ZdZdededeeeeeef ef dedededed	ed
ededefddZdddZ	e
dd ZdddZdddZdd ZdS )GEVLa   Generative vision-language model
    Support learning from both generative and contrastive loss.
    Given image and text input, it could output the features of
    image and text respectively. Furthermore, caption could also
    be produced when image input is available.
    r[   image_resolutionvision_layersvision_widthvision_patch_sizecontext_length
vocab_sizetransformer_widthtransformer_headstransformer_layersrQ   c              	   C   s  t j|  || _|| _|| _t|ttfr'|d d }t	|||||d| _
n|d }t|||||||d| _
t||
|	|  |d| _|| _t ||| _t t| j|| _t|| _t t||| _t jj| j| jjd d t t||| _t tg td | _ t|d	|	| | j| j | j|d| _!t "t|t #||t j#||d
d| _$t tg tt| | _%t t|| _&| jj'| j$d _'| j&| j$d _&t t(| j|| _)t*|dd| _+t|| _,d S )Nr   rv   )rO   r]   rP   r   rN   )r   r   rN   rO   rP   r]   rQ   )rN   rO   rP   r>   rQ   r}   )Zstdg$I$I,@r
   Fr   rl   T)rf   r|   )-r   Moduler   r   Zvis_token_size	tokenizer
isinstancetuplelistr   visualr   rM   build_attention_maskr   r   Z	Embeddingtoken_embeddingr_   r6   emptyra   r   ln_finalvis_token_projectioninitZnormal_rN   text_projectionZonesnplogZlogit_scaledecoderr%   rB   	to_logitsgen_logit_scaler   rs   r`   img_queriesru   img_attn_poolimg_attn_pool_norm)r&   r[   r   r   r   r   r   r   r   r   r   rQ   r   Zvision_headsr*   r*   r+   r   m  s   
	




zGEVL.__init__Nr   c                 C   sb   |d u r| j n|}t||}|tttjj |d |dkr/d|d |d |f< |S )Nr   r   )	r   r6   r   Zfill_ZtensorZfinfoZfloat16minZtriu_)r&   Z
seq_lengthZprefix_lengthmaskr*   r*   r+   r     s   
zGEVL.build_attention_maskc                 C   s   | j jjjS r-   )r   r   rs   rF   r   r*   r*   r+   rF     s   z
GEVL.dtypeFc                 C   sd   |  |}|d d dd d f }||jdddd }|r0|d d dd d d f | j }||fS |S )Nr   rl   r^   Trf   prg   r   )r   r   r   )r&   imagereturn_tokensZimage_outputsimage_featuresimage_tokensr*   r*   r+   encode_image  s   
 zGEVL.encode_imagec                 C   s   |  |}|| jd |jd d d f  }|ddd}| |}|ddd}| |}|t|jd |jdddf | j	 }||j
dddd }|rS|}||fS |S )	Nr   r   r^   rl   rh   .Tr   )r   ra   rJ   ro   r   r   r6   Zarangeargmaxr   r   )r&   textr   r,   text_featurestext_tokensr*   r*   r+   encode_text  s$   
 

zGEVL.encode_textc                 C   s  | j |dd\}}| j|jd dd}| ||}| |}| jjd }| jjd }|j|jd dt	j
d| }|}	g }
t| jD ]g}| j|dd\}}t	j||gdd	}	| |	ddd
 }| |dd df }tj| j | dd}t	j|dt	|  dd	}t||kst|dkr n|
| t	j||dd gdd	}qBt	j|
dd	dd}g }|D ]}g }|D ]	}|t| q| j|}| }|| q||d fS )NT)r   r   rl   z<|startoftext|>z<|endoftext|>r   )rF   )Zaxisr^   .rh   g       @)r   r   expandrJ   r   r   r   encoderZnew_onesr6   longrV   r   r   rp   r   ro   rt   r   rr   r   r   expr   Z	rand_likerL   r   decodestrip)r&   r   r   r   r   Zimg_token_featuresZ	sot_tokenZ	eot_tokenZ
text_inputZinput_tokensZpred_tokensZtext_idxr   r   Zout_embsZ
gen_logitsZprobspredZpred_text_tokensZ	text_listZ
out_tokenstokensr,   Zout_textr*   r*   r+   image_to_text  sT   



zGEVL.image_to_textNr   )F)r2   r3   r4   r5   rL   r   r   rX   r   r   propertyrF   r   r   r   r*   r*   r*   r+   r   e  s:    


I	


class GEMMModel(nn.Module):
    """ Generative multi-modal model, wrapper of GEVL module.
    It takes image or text or both of them as input, and output
    features of input or caption when image input is available.
    """

    def __init__(self, model_dir):
        super().__init__()
        with open(
                '{}/encoder_config.json'.format(model_dir), 'r',
                encoding='utf-8') as f:
            model_config = json.loads(f.read())
        model_name = list(model_config.keys())[0]
        config_args = model_config[model_name]
        bpe_path = os.path.join(model_dir, 'bpe_vocab_16e6.txt.gz')
        self.tokenizer = SimpleTokenizer(bpe_path)
        self.model = GEVL(*[*config_args, self.tokenizer])

    def tokenize(self, text_str):
        text_tensor = clip_tokenize(self.tokenizer, [text_str])[0]
        return text_tensor

    def parse_feat(self, feat):
        out = feat.cpu().numpy()
        return out

    @torch.no_grad()
    def forward(self, image=None, text=None, captioning=True):
        img_feature, text_feature, caption = None, None, None
        if captioning and image is not None:
            caption, img_feature = self.model.image_to_text(image)
            img_feature = self.parse_feat(img_feature)
        elif image is not None:
            img_feature = self.parse_feat(self.model.encode_image(image))
        if text is not None:
            text_feature = self.parse_feat(self.model.encode_text(text))
        out = {
            'image_feature': img_feature,
            'text_feature': text_feature,
            'caption': caption
        }
        return out