o
    *j                     @   s   d dl Z d dlZd dlmZ d dlm  mZ d dlmZm	Z	 d dl
mZ G dd dejZG dd dejZG dd	 d	ejZedddZdS )    N)DropPathtrunc_normal_)register_modelc                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	Blocka   ConvNeXt Block. There are two equivalent implementations:
    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
    We use (2) as we find it slightly faster in PyTorch

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
            ư>c                    s   t    tj||dd|d| _t|dd| _t|d| | _t	 | _
td| || _|dkr>tj|t| dd	nd | _|d
krLt|| _d S t | _d S )N      )kernel_sizepaddinggroupsr   )eps   r   T)Zrequires_gradr   )super__init__nnConv2ddwconv	LayerNormnormLinearpwconv1ZGELUactpwconv2	Parametertorchonesgammar   ZIdentity	drop_path)selfdimr   layer_scale_init_value	__class__ m/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/vldoc/convnext.pyr      s0   



zBlock.__init__c                 C   s|   |}|  |}|dddd}| |}| |}| |}| |}| jd ur-| j| }|dddd}|| | }|S )Nr      r	      )r   Zpermuter   r   r   r   r   r   )r   xinputr$   r$   r%   forward)   s   






zBlock.forward)r   r   __name__
__module____qualname____doc__r   r*   __classcell__r$   r$   r"   r%   r      s    r   c                       sD   e Zd ZdZdg dg dddf fdd	Zd	d
 Zdd Z  ZS )ConvNeXta   ConvNeXt
        A PyTorch impl of : `A ConvNet for the 2020s`  -
          https://arxiv.org/pdf/2201.03545.pdf
    Args:
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
        dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
        head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
    r	   r	   r	   	   r	   `      i  i   r   r   c           	   
      s(  t    t | _ttj|d dddtd ddd}| j| t	dD ]"tt dddtj d  d	d	d}| j| q*t | _
d
d td|t|D d t	dD ]"tj fddt	| D  }| j
|  | 7  qf| _| | j d S )Nr   r   )r
   Zstrider   channels_first)r   data_formatr	   r'   r&   c                 S   s   g | ]}|  qS r$   )item).0r(   r$   r$   r%   
<listcomp>`   s    z%ConvNeXt.__init__.<locals>.<listcomp>c                    s&   g | ]}t   |  d qS ))r    r   r!   )r   )r:   jcurdimsZdp_ratesir!   r$   r%   r;   e   s    
)r   r   r   Z
ModuleListdownsample_layersZ
Sequentialr   r   appendrangestagesr   Zlinspacesumr?   apply_init_weights)	r   Zin_chansdepthsr?   Zdrop_path_rater!   stemZdownsample_layerZstager"   r=   r%   r   G   s4   



zConvNeXt.__init__c                 C   s8   t |tjtjfrt|jdd tj|jd d S d S )Ng{Gz?)Zstdr   )	
isinstancer   r   r   r   weightinitZ	constant_bias)r   mr$   r$   r%   rG   t   s   zConvNeXt._init_weightsc                 C   s@   g }t dD ]}| j| |}| j| |}|| qt|S )Nr   )rC   rA   rD   rB   tuple)r   r(   Zxsr@   r$   r$   r%   r*   y   s   zConvNeXt.forward)r,   r-   r.   r/   r   rG   r*   r0   r$   r$   r"   r%   r1   9   s    -r1   c                       s.   e Zd ZdZ		d fdd	Zdd Z  ZS )	r   aF   LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
    with shape (batch_size, channels, height, width).
    r   channels_lastc                    sT   t    tt|| _tt|| _|| _	|| _
| j
dvr$t|f| _d S )N)rP   r7   )r   r   r   r   r   r   rK   ZzerosrM   r   r8   NotImplementedErrornormalized_shape)r   rR   r   r8   r"   r$   r%   r      s   

zLayerNorm.__init__c                 C   s   | j dkrt|| j| j| j| jS | j dkrN|jddd}|| djddd}|| t	
|| j  }| jd d d d f | | jd d d d f  }|S d S )NrP   r7   r'   T)Zkeepdimr&   )r8   FZ
layer_normrR   rK   rM   r   meanpowr   sqrt)r   r(   usr$   r$   r%   r*      s   

,zLayerNorm.forward)r   rP   r+   r$   r$   r"   r%   r      s    r   Fc                 K   s    t dg dg dd|}|S )Nr2   r4   )rH   r?   r$   )r1   )Z
pretrainedZin_22kkwargsmodelr$   r$   r%   convnext_tiny   s   r[   )FF)osr   Ztorch.nnr   Ztorch.nn.functionalZ
functionalrS   Ztimm.models.layersr   r   Ztimm.models.registryr   Moduler   r1   r   r[   r$   r$   r$   r%   <module>   s   ,L 