o
    *j"@                     @   s  d dl mZ d dlmZmZ d dlZd dlZd dlm	  m
Z d dlm	Z	 G dd de	jZG dd de	jZG d	d
 d
e	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZde	jfddZdd ZdS )    )OrderedDict)TupleUnionN)nnc                       s2   e Zd ZdZd fdd	ZdejfddZ  ZS )	
Bottleneck      c                    s2  t    tj||ddd| _t|| _tjdd| _tj||dddd| _	t|| _
tjdd| _|dkr=t|nt | _tj||| j ddd| _t|| j | _tjdd| _d | _|| _|dksp||tj krttdt|fd	tj||| j dddd
fdt|| j fg| _d S d S )Nr   F)biasTZinplace   )paddingr	   z-10)strider	   1)super__init__r   Conv2dconv1BatchNorm2dbn1ReLUrelu1conv2bn2relu2	AvgPool2dZIdentityavgpool	expansionconv3bn3relu3
downsampler   r   
Sequentialr   )selfZinplanesplanesr   	__class__ t/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/cv/text_driven_segmentation/model.pyr      s:   

zBottleneck.__init__xc                 C   st   |}|  | | |}| | | |}| |}| | |}| j	d ur/| 	|}||7 }| 
|}|S N)r   r   r   r   r   r   r   r   r   r!   r    )r#   r)   identityoutr'   r'   r(   forward2   s   



zBottleneck.forwardr   )	__name__
__module____qualname__r   r   torchTensorr-   __classcell__r'   r'   r%   r(   r      s    "r   c                	       s:   e Zd Z	d
dedededef fddZdd	 Z  ZS )AttentionPool2dNspacial_dim	embed_dim	num_heads
output_dimc                    st   t    tt|d d ||d  | _t||| _t||| _	t||| _
t||p2|| _|| _d S )N   r   g      ?)r   r   r   	Parameterr2   randnpositional_embeddingLineark_projq_projv_projc_projr8   )r#   r6   r7   r8   r9   r%   r'   r(   r   D   s   

zAttentionPool2d.__init__c              	   C   s   |j ddddd}tj|jddd|gdd}|| jd d d d d f |j }tj	di d|d d d	|d
|d|j
d d| jd| jjd| jjd| jjdd dt| jj| jj| jjgdd dd ddddd| jjd| jjddd| jdd\}}|dS )Nr:   )Z	start_dimr   r   TdimZkeepdimrD   querykeyvalueZembed_dim_to_checkr8   Zq_proj_weightZk_proj_weightZv_proj_weightin_proj_weightin_proj_biasbias_kbias_vZadd_zero_attnFZ	dropout_pZout_proj_weightZout_proj_biasZuse_separate_proj_weighttrainingneed_weightsr'   )flattenpermuter2   catmeanr=   todtypeFZmulti_head_attention_forwardshaper8   r@   weightr?   rA   r	   rB   rN   Zsqueeze)r#   r)   _r'   r'   r(   r-   R   sZ   $

	

zAttentionPool2d.forwardr*   )r/   r0   r1   intr   r-   r4   r'   r'   r%   r(   r5   B   s    r5   c                       s8   e Zd ZdZ		d fdd	ZdddZd	d
 Z  ZS )ModifiedResNeta  
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
       @   c                    sR  t    || _|| _tjd|d ddddd| _t|d | _tj	dd| _
tj|d |d dddd| _t|d | _tj	dd| _tj|d |dddd| _t|| _tj	dd| _td| _|| _| ||d	 | _| j|d |d dd
| _| j|d |d dd
| _| j|d |d dd
| _|d }t|d |||| _d S )Nr   r:   r   F)kernel_sizer   r   r	   Tr
   )r^   r   r	   r   )r   r          )r   r   r9   input_resolutionr   r   r   r   r   r   r   r   r   r   r   r   r    r   r   	_inplanes_make_layerlayer1layer2layer3layer4r5   attnpool)r#   layersr9   headsra   widthr7   r%   r'   r(   r   v   s8   


zModifiedResNet.__init__r   c                 C   sH   t | j||g}|t j | _td|D ]}|t | j| qtj| S )Nr   )r   rb   r   rangeappendr   r"   )r#   r$   blocksr   ri   rY   r'   r'   r(   rc      s
   
zModifiedResNet._make_layerc                    sZ    fdd}|  jjj}||} |} |} |} |} |}|S )Nc                    sP       | }    | }    | }  	| } | S r*   )
r   r   r   r   r   r   r    r   r   r   )r)   r#   r'   r(   stem   s
   
z$ModifiedResNet.forward.<locals>.stem)	typer   rX   rU   rd   re   rf   rg   rh   )r#   r)   rp   r'   ro   r(   r-      s   




zModifiedResNet.forward)r\   r]   r.   )r/   r0   r1   __doc__r   rc   r-   r4   r'   r'   r%   r(   r[   n   s    
$	r[   c                       s(   e Zd ZdZdejf fddZ  ZS )	LayerNormz*Subclass torch's LayerNorm to handle fp16.r)   c                    s$   |j }t |tj}||S r*   )rU   r   r-   rq   r2   Zfloat32)r#   r)   	orig_typeretr%   r'   r(   r-      s   
zLayerNorm.forward)r/   r0   r1   rr   r2   r3   r-   r4   r'   r'   r%   r(   rs      s    rs   c                   @   s   e Zd ZdejfddZdS )	QuickGELUr)   c                 C   s   |t d|  S )NgZd;?)r2   Zsigmoidr#   r)   r'   r'   r(   r-         zQuickGELU.forwardN)r/   r0   r1   r2   r3   r-   r'   r'   r'   r(   rv      s    rv   c                       sP   e Zd Z	ddededejf fddZdejfdd	Zdejfd
dZ  Z	S )ResidualAttentionBlockNd_modeln_head	attn_maskc              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )Nc_fcr   ZgelurB   )r   r   r   MultiheadAttentionattnrs   ln_1r"   r   r>   rv   mlpln_2r|   )r#   rz   r{   r|   r%   r'   r(   r      s   



zResidualAttentionBlock.__init__r)   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )NrU   deviceF)rO   r|   r   )r|   rT   rU   r   r   rw   r'   r'   r(   	attention   s   
z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S r*   )r   r   r   r   rw   r'   r'   r(   r-      s   zResidualAttentionBlock.forwardr*   )
r/   r0   r1   rZ   r2   r3   r   r   r-   r4   r'   r'   r%   r(   ry      s    ry   c                       s.   e Zd Zd fdd	ZdejfddZ  ZS )TransformerNc                    s<   t    | _|| _tj fddt|D  | _d S )Nc                    s   g | ]}t  qS r'   )ry   ).0rY   r|   rj   rk   r'   r(   
<listcomp>   s    
z(Transformer.__init__.<locals>.<listcomp>)r   r   rk   ri   r   r"   rl   	resblocks)r#   rk   ri   rj   r|   r%   r   r(   r      s   
zTransformer.__init__r)   c                 C   s
   |  |S r*   )r   rw   r'   r'   r(   r-      s   
zTransformer.forwardr*   )r/   r0   r1   r   r2   r3   r-   r4   r'   r'   r%   r(   r      s    	r   c                       sF   e Zd Zdedededededef fddZd	ejfd
dZ  ZS )VisionTransformerra   
patch_sizerk   ri   rj   r9   c                    s   t    || _|| _tjd|||dd| _|d }t|t	| | _
t|t	|| d d | | _t|| _t|||| _t|| _t|t	|| | _d S )Nr   F)Zin_channelsZout_channelsr^   r   r	         r:   r   )r   r   ra   r9   r   r   r   r;   r2   r<   class_embeddingr=   rs   ln_prer   transformerln_postproj)r#   ra   r   rk   ri   rj   r9   scaler%   r'   r(   r      s&   




zVisionTransformer.__init__r)   c                 C   s   |  |}||jd |jd d}|ddd}| j|j}tj|jd d|jd |j|j	d}tj
|| |gdd}|| j|j }| |}|ddd}| |}|ddd}| |d d dd d f }| jd urv|| j }|S )Nr   r   rI   r:   r   rE   )r   ZreshaperW   rQ   r   rT   rU   r2   Zzerosr   rR   r=   r   r   r   r   )r#   r)   x1Zx2r'   r'   r(   r-     s&   




zVisionTransformer.forward)	r/   r0   r1   rZ   r   r2   r3   r-   r4   r'   r'   r%   r(   r      s    r   c                       s   e Zd Zdededeeeeeef ef dededededed	ed
ef fddZdd Zdd Ze	dd Z
dd Zdd Zdd Z  ZS )CLIPr7   image_resolutionvision_layersvision_widthvision_patch_sizecontext_length
vocab_sizetransformer_widthtransformer_headstransformer_layersc                    s   t    || _t|ttfr |d d }t|||||d| _n|d }t||||||d| _t	||
|	| 
 d| _|| _t||| _tt| j|| _t|| _tt||| _ttg td | _|   d S )Nr`   r]   )ri   r9   rj   ra   rk   )ra   r   rk   ri   rj   r9   )rk   ri   rj   r|   g$I$I,@)r   r   r   
isinstancetuplelistr[   visualr   r   build_attention_maskr   r   r   Z	Embeddingtoken_embeddingr;   r2   emptyr=   rs   ln_finaltext_projectionZonesnploglogit_scaleinitialize_parameters)r#   r7   r   r   r   r   r   r   r   r   r   Zvision_headsr%   r'   r(   r   &  sJ   



zCLIP.__init__c           	      C   s  t jj| jjdd t jj| jdd t| jtr|| jj	d urW| jj	j
jd }t jj| jj	jj|d t jj| jj	jj|d t jj| jj	jj|d t jj| jj	j
j|d | jj| jj| jj| jjfD ]}| D ]\}}|drzt j| qkqe| jjd d| jj d  }| jjd }d| jj d }| jjD ]-}t jj|jj|d t jj|jjj|d t jj|jjj|d t jj|jj
j|d q| jd urt jj| j| jjd d d S d S )Ng{Gz?)stdg{Gz?r   z
bn3.weightr:   ) r   initZnormal_r   rX   r=   r   r   r[   rh   rB   Zin_featuresr@   r?   rA   rd   re   rf   rg   Znamed_parametersendswithZzeros_r   rk   ri   r   r   rJ   Zout_projr   r}   r   )	r#   r   Zresnet_blocknameparamZproj_stdZattn_stdZfc_stdblockr'   r'   r(   r   \  s@   



zCLIP.initialize_parametersc                 C   s,   t | j| j}|td |d |S )Nz-infr   )r2   r   r   Zfill_floatZtriu_)r#   maskr'   r'   r(   r   ~  s   
zCLIP.build_attention_maskc                 C   s   | j jjjS r*   )r   r   rX   rU   ro   r'   r'   r(   rU     s   z
CLIP.dtypec                 C   s   |  || jS r*   )r   rq   rU   )r#   imager'   r'   r(   encode_image  rx   zCLIP.encode_imagec                 C   s   |  || j}|| j| j }|ddd}| |}|ddd}| || j}|t|j	d |j
ddf | j }|S )Nr   r   r:   rI   rE   )r   rq   rU   r=   rQ   r   r   r2   ZarangerW   Zargmaxr   )r#   textr)   r'   r'   r(   encode_text  s   

zCLIP.encode_textc                 C   sb   |  |}| |}||jddd }||jddd }| j }|| |  }| }||fS )Nr   TrC   )r   r   Znormr   expt)r#   r   r   Zimage_featuresZtext_featuresr   Zlogits_per_imageZlogits_per_textr'   r'   r(   r-     s   


zCLIP.forward)r/   r0   r1   rZ   r   r   r   r   r   propertyrU   r   r   r-   r4   r'   r'   r%   r(   r   $  s:    	
6"
r   modelc                 C   s   dd }|  | dS )z+Convert applicable model parameters to fp16c                 S   s   t | tjtjtjfr | jj | j_| jd ur | jj | j_t | tj	rGg dd dD dddD ]}t
| |}|d urF|j |_q5dD ]}t| |r_t
| |}|d ur_|j |_qId S )Nc                 S   s   g | ]}| d qS )Z_proj_weightr'   )r   sr'   r'   r(   r     s    zEconvert_weights.<locals>._convert_weights_to_fp16.<locals>.<listcomp>)inqkvrK   rL   rM   )r   r   )r   r   ZConv1dr   r>   rX   dataZhalfr	   r~   getattrhasattr)ZllattrZtensorr   r'   r'   r(   _convert_weights_to_fp16  s2   



z1convert_weights.<locals>._convert_weights_to_fp16N)apply)r   r   r'   r'   r(   convert_weights  s   r   c                  C   s*   t dddddddddd
} t|  |  S )	Ni   r\      i   r`   M   i   r_   )r   r   eval)r   r'   r'   r(   build_model  s   r   )collectionsr   typingr   r   numpyr   r2   Ztorch.nn.functionalr   Z
functionalrV   Moduler   r5   r[   rs   rv   ry   r   r   r   r   r   r'   r'   r'   r(   <module>   s$   5,I	1 