o
    *ju                     @   s  d dl Z d dlZd dlmZ d dlmZmZ d dlZd dl	Z	d dl
m  mZ d dl	mZ ddlmZ G dd dejZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZeddddddedddddddZdddZedkre ZdS dS )    N)OrderedDict)TupleUnion)nn   )ViMc                       s$   e Zd Zdejf fddZ  ZS )	LayerNormxc                    s$   |j }t |tj}||S N)dtypesuperforwardtypetorchZfloat32)selfr	   	orig_typeret	__class__ p/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/cv/vision_middleware/backbone.pyr      s   
zLayerNorm.forward)__name__
__module____qualname__r   Tensorr   __classcell__r   r   r   r   r      s    r   c                   @   s   e Zd ZdejfddZdS )	QuickGELUr	   c                 C   s   |t d|  S )NgZd;?)r   Zsigmoidr   r	   r   r   r   r      s   zQuickGELU.forwardN)r   r   r   r   r   r   r   r   r   r   r      s    r   c                       sT   e Zd Z	ddededejf fddZdejfdd	Zdejd
efddZ	  Z
S )ResidualAttentionBlockNd_modeln_head	attn_maskc              
      s   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _t | _t | _d S )NZc_fc   ZgeluZc_proj)r   __init__r   ZMultiheadAttentionattnr   ln_1Z
Sequentialr   ZLinearr   mlpln_2r!   r   vim_attvim_mlp)r   r   r    r!   r   r   r   r#   !   s   


zResidualAttentionBlock.__init__r	   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )N)r   deviceF)Zneed_weightsr!   r   )r!   tor   r*   r$   r   r   r   r   	attention3   s   
z ResidualAttentionBlock.attention	task_namec                 C   sT   |  |}|| | }|| || }| |}|| | }|| || }|S r
   )r%   r,   r(   r'   r&   r)   )r   r	   r-   Z
x_normed_1Z
x_normed_2r   r   r   r   :   s   

zResidualAttentionBlock.forwardr
   )r   r   r   intr   r   r#   r,   strr   r   r   r   r   r   r      s    r   c                	       sD   e Zd Z	ddedededejf fddZdejfd	d
Z  ZS )TransformerNwidthlayersheadsr!   c                    s<   t    | _|| _t fddt|D | _d S )Nc                    s   g | ]}t  qS r   )r   ).0_r!   r3   r1   r   r   
<listcomp>P   s    
z(Transformer.__init__.<locals>.<listcomp>)r   r#   r1   r2   r   Z
ModuleListrange	resblocks)r   r1   r2   r3   r!   r   r6   r   r#   H   s   
zTransformer.__init__r	   c           	      K   sD   |  \}}}g }t| jD ]\}}||fi |}|| q|S r
   )size	enumerater9   append)	r   r	   kwargsLBDfeaturesiZblkr   r   r   r   U   s   zTransformer.forwardr
   )	r   r   r   r.   r   r   r#   r   r   r   r   r   r   r0   F   s    r0   c                       sN   e Zd ZdZ	ddedededededef fd	d
ZdejfddZ  Z	S )VisionTransformeraf  
    The Vision Transformer (ViT) model
    Args:
        - input_resolution (int): shape of input image
        - patch_width (int): size of patch tokens
        - width (int): feature channels
        - layers (int): number of transformer layers
        - heads (int): number of multi-head attention
        - output_dim (int): output feature channels
       input_resolution
patch_sizer1   r2   r3   
output_dimc                    s   t    || _tjd|||dd| _|d }t|t| | _	t|t|| d d | | _
t|| _|| | _t|||| _t|| _t|t|| | _|| _d S )N   F)Zin_channelsZout_channelsZkernel_sizeZstrideZbiasg         r   )r   r#   rE   r   ZConv2dconv1	Parameterr   Zrandnclass_embeddingpositional_embeddingr   ln_preZpatch_per_sider0   transformerln_postprojrG   )r   rE   rF   r1   r2   r3   rG   scaler   r   r   r#   j   s(   






zVisionTransformer.__init__r	   c           	   	   K   sN  |  |}|d}|d}||jd |jd d}|ddd}| j|jddd|dd}t	j
||gdd}|| j|j }| |}|ddd}| j|fi |}|d }|ddd}| |d d dd d f }| jd ur~|| j }g }|D ]}||dd d d d d f ddd|d|| q|| |S )Nr   rI   r   )dim)rJ   r:   ZreshapeshapeZpermuterL   r+   r   repeatr   catrM   rN   rO   rP   rQ   r<   )	r   r	   r=   r?   PZ	cls_tokenZx_per_layerZoutputsoutputr   r   r   r      s:   






"
zVisionTransformer.forward)rD   )
r   r   r   __doc__r.   r#   r   r   r   r   r   r   r   r   rC   ^   s"    rC         i      )rE   rF   r1   r2   r3       )vit_b16_224Zvit_b32_224r_   c                 C   s$   t |  }tdi |}|| |S )z build a ViT + ViM model
        Args:
            arch: name of backbone
            pretrained: weights of pretrained model
    Nr   )
model_dictrC   Zload_state_dict)archZ
pretrainedZ
model_argsmodelr   r   r   build_backbone   s   
rc   __main__)r_   N)mathoscollectionsr   typingr   r   numpynpr   Ztorch.nn.functionalr   Z
functionalFvimr   r   Moduler   r   r0   rC   dictr`   rc   r   rb   r   r   r   r   <module>   s*   'L

