o
    *j#                     @   s  d dl Z d dlmZ d dlZd dlZd dlmZ d dl	m  m
Z d dlmZmZ G dd dejZd/ddZd	d
 Zdd Zdd Zdd ZG dd dejZG dd dejZG dd dejeZG dd dejZG dd dejZG dd deZG dd dejZdd  ZG d!d" d"ejZG d#d$ d$ejZ G d%d& d&ejZ!G d'd( d(e!Z"G d)d* d*ejZ#G d+d, d,eZ$G d-d. d.eZ%dS )0    N)abstractmethod)PretrainedConfigPreTrainedModelc                       s   e Zd Z fddZ  ZS )	GroupNormc                    s   t t| | |jS N)superr   forwardfloattypedtypeselfx	__class__ t/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/guided_diffusion/unet.pyr      s   zGroupNorm.forward)__name__
__module____qualname__r   __classcell__r   r   r   r   r      s    r   '  c                 C   s   |d }t t| t jd|t jd | j| jd}| dddf  |d  }t j	t 
|t |gdd}|d rRt j	|t |ddddf gdd}|S )	aY  
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
       r   )startendr   )deviceNdim   )thexpmathlogZarangefloat32tor   r	   catcossinZ
zeros_like)	timestepsr   Z
max_periodhalfZfreqsargsZ	embeddingr   r   r   timestep_embedding   s   
r,   c                 C   L   t | tjtjtjfr"| jj | j_| jdur$| jj | j_dS dS dS )z/
    Convert primitive modules to float16.
    N)	
isinstancennConv1dConv2dConv3dweightdatar*   biasZllr   r   r   convert_module_to_f16+      
r7   c                 C   r-   )zP
    Convert primitive modules to float32, undoing convert_module_to_f16().
    N)	r.   r/   r0   r1   r2   r3   r4   r	   r5   r6   r   r   r   convert_module_to_f325   r8   r9   c                 O   sV   | dkrt j|i |S | dkrt j|i |S | dkr$t j|i |S td|  )z4
    Create a 1D, 2D, or 3D convolution module.
    r   r      zunsupported dimensions: )r/   r0   r1   r2   
ValueError)dimsr+   kwargsr   r   r   conv_nd?   s   r>   c                 C   s4   |rt |t | }tj| t|g|R  S | | S )a  
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.
    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    )tupleZCheckpointFunctionapplylen)funcZinputsparamsflagr+   r   r   r   
checkpointL   s   
rE   c                	       s>   e Zd ZdZ	ddedededef fddZd	d
 Z  ZS )AttentionPool2dzS
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
    Nspacial_dim	embed_dimnum_heads_channels
output_dimc                    sp   t    tt||d d |d  | _td|d| d| _td||p'|d| _	|| | _
t| j
| _d S )Nr   r         ?r:   )r   __init__r/   	Parameterr    Zrandnpositional_embeddingr>   qkv_projc_proj	num_headsQKVAttention	attention)r   rG   rH   rI   rJ   r   r   r   rL   b   s   

zAttentionPool2d.__init__c                 C   s   |j ^}}}|||d}tj|jddd|gdd}|| jd d d d d f |j }| |}| 	|}| 
|}|d d d d df S )Nr   T)r   Zkeepdimr   r   )shapereshaper    r&   meanrN   r%   r   rO   rS   rP   )r   r   bcZ_spatialr   r   r   r   q   s   $


zAttentionPool2d.forwardr   )r   r   r   __doc__intrL   r   r   r   r   r   r   rF   ]   s    	rF   c                   @   s   e Zd ZdZedd ZdS )TimestepBlockzT
    Any module where forward() takes timestep embeddings as a second argument.
    c                 C   s   dS )zJ
        Apply the module to `x` given `emb` timestep embeddings.
        Nr   r   r   embr   r   r   r      s    zTimestepBlock.forwardN)r   r   r   rY   r   r   r   r   r   r   r[   |   s    r[   c                   @   s   e Zd ZdZdd ZdS )TimestepEmbedSequentialzt
    A sequential module that passes timestep embeddings to the children that
    support it as an extra input.
    c                 C   s,   | D ]}t |tr|||}q||}q|S r   )r.   r[   )r   r   r]   layerr   r   r   r      s
   

zTimestepEmbedSequential.forwardN)r   r   r   rY   r   r   r   r   r   r^      s    r^   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )	UpsampleaB  
    An upsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    r   Nc                    sJ   t    || _|p|| _|| _|| _|r#t|| j| jddd| _d S d S )Nr:   r   padding)r   rL   channelsout_channelsuse_convr<   r>   conv)r   rd   rf   r<   re   r   r   r   rL      s   

zUpsample.__init__c                 C   st   |j d | jks
J | jdkr(tj||j d |j d d |j d d fdd}ntj|ddd}| jr8| |}|S )Nr   r:   r      nearestmode)Zscale_factorrk   )rT   rd   r<   Finterpolaterf   rg   r   r   r   r   r      s   
$
zUpsample.forwardr   Nr   r   r   rY   rL   r   r   r   r   r   r   ra      s    	
ra   c                       r`   )	
DownsampleaE  
    A downsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    r   Nc                    s|   t    || _|p|| _|| _|| _|dkrdnd}|r,t|| j| jd|dd| _d S | j| jks4J tj	||d| _d S )Nr:   r   )r   r   r   r   )striderc   )Zkernel_sizerq   )
r   rL   rd   re   rf   r<   r>   opr/   Z	AvgPool2d)r   rd   rf   r<   re   rq   r   r   r   rL      s"   

zDownsample.__init__c                 C   s   |j d | jks
J | |S )Nr   )rT   rd   rr   r   r   r   r   r      s   
zDownsample.forwardrn   ro   r   r   r   r   rp      s    	rp   c                       s@   e Zd ZdZ							d fdd	Zdd Zd	d
 Z  ZS )ResBlocka  
    A residual block that can optionally change the number of channels.

    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    NFr   c                    s  t    || _|| _|| _|p|| _|| _|| _|| _t	
td|t	 t||| jddd| _|	p4|
| _|	rGt|d|| _t|d|| _n|
rXt|d|| _t|d|| _nt	  | _| _t	
t	 t	||rod| j n| j| _t	
td| jt	 t	j|dt|| j| jddd| _t	j| jd j | j|krt	 | _d S |rt||| jddd| _d S t||| jd| _d S )	N    r:   r   rb   Fr   )pr   )r   rL   rd   emb_channelsdropoutre   rf   use_checkpointuse_scale_shift_normr/   
Sequentialr   SiLUr>   	in_layersupdownra   h_updx_updrp   ZIdentityLinear
emb_layersZDropout
out_layersinitzeros_r3   skip_connection)r   rd   rv   rw   re   rf   ry   r<   rx   updownr   r   r   rL      s\   







zResBlock.__init__c                 C   s   t | j||f|  | jS )a	  
        Apply the block to a Tensor, conditioned on a timestep embedding.

        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        rE   _forward
parametersrx   r\   r   r   r   r   ,  s   zResBlock.forwardc                 C   s  | j r#| jd d | jd }}||}| |}| |}||}n| |}| ||j}t|jt|jk rI|d }t|jt|jk s;| j	rr| j
d | j
dd  }}tj|ddd\}	}
||d|	  |
 }||}n	|| }| 
|}| || S )Nr   ).Nr   r   r   r   )r}   r|   r~   r   r   r
   r   rA   rT   ry   r   r    chunkr   )r   r   r]   Zin_restZin_convhZemb_outZout_normZout_restscaleshiftr   r   r   r   7  s&   





zResBlock._forward)NFFr   FFFr   r   r   rY   rL   r   r   r   r   r   r   r   rs      s    Ars   c                       s:   e Zd ZdZ				d fdd	Zdd Zd	d
 Z  ZS )AttentionBlocka  
    An attention block that allows spatial positions to attend to each other.

    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    r   r   Fc                    s   t    || _|dkr|| _n|| dks J d| d| || | _|| _td|| _td||d d| _|rAt	| j| _
nt| j| _
td||d| _tj| jj d S )Nr   r   zq,k,v channels z' is not divisible by num_head_channels rt   r   r:   )r   rL   rd   rQ   rx   r   normr>   qkvrR   rS   QKVAttentionLegacyproj_outr/   r   r   r3   )r   rd   rQ   num_head_channelsrx   use_new_attention_orderr   r   r   rL   V  s    

zAttentionBlock.__init__c                 C   s   t | j|f|  | jS r   r   r   r   r   r   r   u  s   zAttentionBlock.forwardc                 C   sV   |j ^}}}|||d}| | |}| |}| |}|| j||g|R  S )Nr   )rT   rU   r   r   rS   r   )r   r   rW   rX   spatialr   r   r   r   r   r   y  s   

zAttentionBlock._forward)r   r   FFr   r   r   r   r   r   N  s    
r   c                 C   sL   |d j ^}}}tt|}d| |d  | }|  jt|g7  _dS )a(  
    A counter for the `thop` package to count the operations in an
    attention operation.
    Meant to be used like:
        macs, params = thop.profile(
            model,
            inputs=(inputs, timestamps),
            custom_ops={QKVAttention: QKVAttention.count_flops},
        )
    r   r   N)rT   rZ   npprodZ	total_opsr    ZDoubleTensor)model_xyrW   rX   r   Znum_spatialZ
matmul_opsr   r   r   count_flops_attn  s   r   c                       4   e Zd ZdZ fddZdd Zedd Z  ZS )r   zi
    A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
    c                       t    || _d S r   r   rL   n_headsr   r   r   r   r   rL        

zQKVAttentionLegacy.__init__c                 C   s   |j \}}}|d| j  dksJ |d| j  }||| j |d |j|dd\}}}dtt| }	td||	 ||	 }
tj|
	 dd
|
j}
td|
|}||d|S )z
        Apply QKV attention.

        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        r:   r   r   r   bct,bcs->btsr   bts,bcs->bct)rT   r   rU   splitr"   sqrtr    einsumsoftmaxr	   r
   r   r   r   bswidthlengthchqkvr   r3   ar   r   r   r     s   zQKVAttentionLegacy.forwardc                 C      t | ||S r   r   r   r   r   r   r   r   count_flops     zQKVAttentionLegacy.count_flops	r   r   r   rY   rL   r   staticmethodr   r   r   r   r   r   r     s    r   c                       r   )rR   zP
    A module which performs QKV attention and splits in a different order.
    c                    r   r   r   r   r   r   r   rL     r   zQKVAttention.__init__c              	   C   s   |j \}}}|d| j  dksJ |d| j  }|jddd\}}}dtt| }	td||	 || j ||||	 || j ||}
tj|
	 dd
|
j}
td|
||| j ||}||d|S )z
        Apply QKV attention.

        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        r:   r   r   r   r   r   r   )rT   r   r   r"   r   r    r   viewr   r	   r
   r   rU   r   r   r   r   r     s   zQKVAttention.forwardc                 C   r   r   r   r   r   r   r   r     r   zQKVAttention.count_flopsr   r   r   r   r   rR     s    rR   c                       sV   e Zd ZdZ															d fd
d	Zdd Zdd ZdddZ  ZS )	UNetModela  
    The full UNet model with attention and timestep embedding.

    :param in_channels: channels in the input Tensor.
    :param model_channels: base channel count for the model.
    :param out_channels: channels in the output Tensor.
    :param num_res_blocks: number of residual blocks per downsample.
    :param attention_resolutions: a collection of downsample rates at which
        attention will take place. May be a set, list, or tuple.
        For example, if this contains 4, then at 4x downsampling, attention
        will be used.
    :param dropout: the dropout probability.
    :param channel_mult: channel multiplier for each level of the UNet.
    :param conv_resample: if True, use learned convolutions for upsampling and
        downsampling.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param num_classes: if specified (as an int), then this model will be
        class-conditional with `num_classes` classes.
    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
    :param num_heads: the number of attention heads in each attention layer.
    :param num_head_channels: if specified, ignore num_heads and instead use
                               a fixed channel width per attention head.
    :param num_heads_upsample: works with num_heads to set a different number
                               of heads for upsampling. Deprecated.
    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
    :param resblock_updown: use residual blocks for up/downsampling.
    :param use_new_attention_order: use a different attention pattern for potentially
                                    increased efficiency.
    r   r   r   rh      Tr   NFr   r   c                     s  t    |dkr|}|| _|| _|| _|| _|| _|| _|| _|| _	|	| _
|| _|| _|r1tjntj| _|| _|| _|| _|d }tt||t t||| _| jd ur`t||| _t|d |  }}ttt|
||dddg| _|| _ |g}d}t!|D ]~\}}t"|D ]<}t#|||t|| |
||dg}t|| }||v r|$t%|||||d | j$t|  |  j |7  _ |$| q|t&|d kr|}| j$t|rt#|||||
||d	d
nt'||	|
|d |}|$| |d9 }|  j |7  _ qtt#||||
||dt%|||||dt#||||
||d| _(|  j |7  _ tg | _)t*t!|d d d D ]s\}}t"|d D ]g}|+ }t#|| ||t|| |
||dg}t|| }||v rt|$t%|||||d |r||kr|}|$|rt#|||||
||d	dnt,||	|
|d |d }| j)$t|  |  j |7  _ qEq;tt-d|t t|
||ddd| _.tj/0| j.d j1 d S )Nr   rh   r   r:   r   rb   re   r<   rx   ry   rx   rQ   r   r   Tre   r<   rx   ry   r   r<   re   r   r<   rx   ry   )re   r<   rx   ry   r   rt   )2r   rL   
image_sizein_channelsmodel_channelsre   num_res_blocksattention_resolutionsrw   channel_multconv_resamplenum_classesrx   r    float16r$   r   rQ   r   num_heads_upsampler/   rz   r   r{   
time_embedZ	Embedding	label_embrZ   
ModuleListr^   r>   input_blocks_feature_size	enumeraterangers   appendr   rA   rp   middle_blockoutput_blockslistpopra   r   outr   r   r3   ) r   r   r   r   re   r   r   rw   r   r   r<   r   rx   use_fp16rQ   r   r   ry   resblock_updownr   time_embed_dimr   Zinput_chinput_block_chansdslevelmult_layersout_chiZichr   r   r   rL     sJ  





	



	)zUNetModel.__init__c                 C   (   | j t | jt | jt dS z<
        Convert the torso of the model to float16.
        N)r   r@   r7   r   r   r   r   r   r   convert_to_fp16     zUNetModel.convert_to_fp16c                 C   r   z<
        Convert the torso of the model to float32.
        N)r   r@   r9   r   r   r   r   r   r   convert_to_fp32  r   zUNetModel.convert_to_fp32c                 C   s   |du| j duksJ dg }| t|| j}| j dur/|j|jd fks(J || | }|| j}| jD ]}|||}|	| q8| 
||}| jD ]}tj|| gdd}|||}qN||j}| |S )a  
        Apply the model to an input batch.

        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :param y: an [N] Tensor of labels, if class-conditional.
        :return: an [N x C x ...] Tensor of outputs.
        Nz<must specify y if and only if the model is class-conditionalr   r   r   )r   r   r,   r   rT   r   r
   r   r   r   r   r   r    r&   r   r   )r   r   r)   r   hsr]   r   moduler   r   r   r     s,   	





zUNetModel.forward)r   r   Tr   NFFr   r   r   FFFr   	r   r   r   rY   rL   r   r   r   r   r   r   r   r   r     s(    & ;r   c                       s.   e Zd ZdZ fddZd fdd	Z  ZS )SuperResModelz
    A UNetModel that performs super-resolution.

    Expects an extra kwarg `low_res` to condition on a low-resolution image.
    c                    s$   t  j||d g|R i | d S )Nr   )r   rL   )r   r   r   r+   r=   r   r   r   rL     s   $zSuperResModel.__init__Nc           	         sJ   |j \}}}}tj|||fdd}tj||gdd}t j||fi |S )NZbilinearrj   r   r   )rT   rl   rm   r    r&   r   r   )	r   r   r)   Zlow_resr=   r   Z
new_heightZ	new_widthZ	upsampledr   r   r   r     s   
zSuperResModel.forwardr   ro   r   r   r   r   r     s    r   c                       sT   e Zd ZdZ														d fd
d	Zdd Zdd Zdd Z  ZS )EncoderUNetModelz^
    The half UNet model with attention and timestep embedding.

    For usage, see UNet.
    r   r   Tr   Fr   r   adaptivec                    sL  t    |dkr|}|| _|| _|| _|| _|| _|| _|| _|	| _	|| _
|r+tjntj| _|| _|| _|| _|d }tt||t t||| _t|d | }ttt|
||dddg| _|| _|g}d}t|D ]}\}}t|D ]<}t|||t|| |
||dg}t|| }||v r| t!|||||d | j t|  |  j|7  _| | qz|t"|d kr|}| j t|rt|||||
||d	d
nt#||	|
|d |}| | |d9 }|  j|7  _qrtt||||
||dt!|||||dt||||
||d| _$|  j|7  _|| _%|dkrBtt&d|t t'dt|
||dt( | _)tj*+| j)d j, d S |dkrd|dksNJ tt&d|t t-|| |||| _)d S |dkrtt| jdt. td| j| _)d S |dkrtt| jdt&ddt td| j| _)d S t/d| d)Nr   rh   r   r:   r   rb   r   r   Tr   r   r   r   r   rt   )r   r   rS   r   i   Z
spatial_v2zUnexpected z pooling)0r   rL   r   r   re   r   r   rw   r   r   rx   r    r   r$   r   rQ   r   r   r/   rz   r   r{   r   rZ   r   r^   r>   r   r   r   r   rs   r   r   rA   rp   r   poolr   ZAdaptiveAvgPool2dZFlattenr   r   r   r3   rF   ZReLUNotImplementedError)r   r   r   r   re   r   r   rw   r   r   r<   rx   r   rQ   r   r   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rL     s  




	







zEncoderUNetModel.__init__c                 C      | j t | jt dS r   )r   r@   r7   r   r   r   r   r   r        z EncoderUNetModel.convert_to_fp16c                 C   r   r   )r   r@   r9   r   r   r   r   r   r     r   z EncoderUNetModel.convert_to_fp32c                 C   s   |  t|| j}g }|| j}| jD ]}|||}| jdr.|||jj	dd q| 
||}| jdrT|||jj	dd tj|dd}| |S ||j}| |S )z
        Apply the model to an input batch.

        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :return: an [N x K] Tensor of outputs.
        r   )r   r:   r   r   )Zaxis)r   r,   r   r
   r   r   r   
startswithr   rV   r   r    r&   r   )r   r   r)   r]   resultsr   r   r   r   r   r     s"   




zEncoderUNetModel.forward)r   r   Tr   FFr   r   r   FFFr   r   r   r   r   r   r     s(     $r   c                       sD   e Zd Zdddddg dddd	d
ddddddd
f fdd	Z  ZS )
UNetConfigi   r:         r   )   rt   @   g        )rK   r   r   r   r   rh   rh   NFTrh   r   r   c                    s|   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _t jdi | d S )Nr   )r   r   r   re   r   r   rw   r   r   rx   r   rQ   r   r   ry   r   r   r   rL   )r   r   r   r   re   r   r   rw   r   r   rx   r   rQ   r   r   ry   r   r   r=   r   r   r   rL     s$   zUNetConfig.__init__)r   r   r   rL   r   r   r   r   r   r     s&    r   c                       s2   e Zd ZeZ fddZdddZdd Z  ZS )	HFUNetModelc                    s   t  | tdi d|jd|jd|jd|jd|jd|jd|j	d|j
d	|jd
|jd|jd|jd|jd|jd|jd|jd|j| _d S )Nr   r   r   re   r   r   rw   r   r   rx   r   rQ   r   r   ry   r   r   r   )r   rL   r   r   r   r   re   r   r   rw   r   r   rx   r   rQ   r   r   ry   r   r   r   )r   configr   r   r   rL     sH   	

zHFUNetModel.__init__Nc                 C   s   | j |||S r   )r   r   )r   r   r)   r   r   r   r   r     s   zHFUNetModel.forwardc                 C   s.   | j jt | j jt | j jt dS r   )r   r   r@   r7   r   r   r   r   r   r   r     s   zHFUNetModel.convert_to_fp16r   )	r   r   r   r   Zconfig_classrL   r   r   r   r   r   r   r   r     s
    
r   )r   )&r"   abcr   numpyr   Ztorchr    Ztorch.nnr/   Ztorch.nn.functionalZ
functionalrl   Ztransformersr   r   r   r,   r7   r9   r>   rE   ModulerF   r[   rz   r^   ra   rp   rs   r   r   r   rR   r   r   r   r   r   r   r   r   r   <module>   s>   


!"t4"$   T)