o
    *jE#                    @   s  d dl Z d dlZd dlmZ d dlZd dlmZ d dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZ dgZdZdSd	d
Zdd Zdd Zdd Zdd Zdd ZG dd dejZG dd dejZejddZG dd dejZG dd dejZ G dd  d ejZ!d!d" Z"G d#d$ d$ejZ#G d%d& d&ejZ$G d'd( d(ejZ%G d)d* d*ejZ&G d+d, d,ejZ'G d-d. d.ejZ(G d/d0 d0ejZ)G d1d2 d2ejZ*G d3d4 d4ejZ+G d5d6 d6ejZ,G d7d8 d8ejZ-G d9d: d:ejZ.G d;d< d<ejZ/G d=d dejZ0G d>d? d?ejZ1G d@dA dAejZ2G dBdC dCejZ3G dDdE dEejZ4G dFdG dGejZ5G dHdI dIejZ6G dJdK dKejZ7e8dLkrd dMl9m:Z: e0e:j;e:j<e:j=e:j>e:j?e:j@e:jAe:jBe:jCe:jDe:jEd e:jFde:jGdNZHeIeJeKdOdP eHL D dQ dR dS dS )T    N)partial	rearrange)checkpoint_wrapper)RotaryEmbedding)einsumUNetSD_temporalTc                    sP   |d u r }i } fdd|   D } |   D ]\}}| |}|||< q|S )Nc                    s   i | ]\}} |v r||qS  r	   ).0keyvalueprefixr	   t/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/videocomposer/unet_sd.py
<dictcomp>   s    zload_Block.<locals>.<dictcomp>)itemsreplace)stater   
new_prefix
state_dictr   r   Znew_keyr	   r   r   
load_Block   s   
r   c              	      s   i }|j  |j}|j}|j} fdddg| D } fdd|d g|d d d  D }g }d}	t| dd}
||
 t| d	d}
||
 t| d
d}
||
 d}t| d| d| dd}
||
 |d7 }|  tt|d d |dd  D ]}\}\}}t	|D ]r}d}d}t| d| d| d| d| d}
||
 |d7 }d}|	|v rt| d| d| d| d| d}
||
 |}|d7 }|| |t
|d kr||d krt| ddd}
||
 || |	d }	|d7 }qqd}t| d| d}
||
 |d7 }t| dd| d}
||
 |d7 }t	|jD ]}|d7 }q)t| dd| d}
||
 |d7 }d}tt|d d |dd  D ]\}\}}t	|d D ]}d}d}t| d| d| d| d| d}
||
 |d7 }|d7 }|	|v rt| d| d| d| d| d}
||
 |d7 }|d7 }t	|jD ]}|d7 }q|t
|d kr||krt| d| d| d| d| d}
||
 |d7 }|d7 }|	d9 }	|d7 }qaqUt| dd}
||
 |S )Nc                       g | ]} | qS r	   r	   r
   udimr	   r   
<listcomp>.       z1load_2d_pretrained_state_dict.<locals>.<listcomp>   c                    r   r	   r	   r   r   r	   r   r   /   r         ?Ztime_embeddingr   Zy_embeddingZcontext_embeddingr   zencoder.z.0)r   r   .   zencoder.{encoder_idx}zencoder.{encoder_idx}.0       @zmiddle.zmiddle.1zmiddle.2zdecoder.head)unet_dimunet_res_blocksunet_dim_multunet_attn_scalesr   updateappend	enumerateziprangelentemporal_attn_times)r   cfgZnew_state_dictnum_res_blocksdim_multattn_scalesenc_dimsdec_dimsshortcut_dimsscaler   Zencoder_idxiin_dimout_dimjidxZidx_Z
middle_idx_Zdecoder_idxr	   r   r   load_2d_pretrained_state_dict$   s   &





*




%


*



#
r>   c              	   C   s   |d }|   } t| tdt|| | }tjt|t	|gdd}|d dkrEtj|t
|d d d df gdd}|S )Nr"   i'  r   r   r   )floattorchouterpowarangetodivcatcossinZ
zeros_like)Z	timestepsr   ZhalfZsinusoidxr	   r	   r   sinusoidal_embedding   s   (rJ   c                 C   s   | d uS Nr	   )rI   r	   r	   r   exists   s   rL   c                 C   s   t | r| S t|r| S |S rK   )rL   callable)valdr	   r	   r   default   s   rP   c                 C   sf   |dkrt j| |t jdS |dkrt j| |t jdS t j| |d dd|k }| r1d|d< |S )Nr   devicedtyper   rR   F)r@   onesboolzerosr?   Zuniform_all)shapeZprobrR   maskr	   r	   r   prob_mask_like   s   r[   c                       s8   e Zd Zd
 fdd	Ze		dddZdd	 Z  ZS )RelativePositionBias          c                    s(   t    || _|| _t||| _d S rK   )super__init__num_bucketsmax_distancennZ	Embeddingrelative_attention_bias)selfheadsrb   rc   	__class__r	   r   ra      s   
zRelativePositionBias.__init__c                 C   s   d}|  }|d }||dk   | 7 }t|}|d }||k }|t| | t||  ||     }t|t||d }|t|||7 }|S )Nr   r"   r   )	longr@   abslogr?   mathminZ	full_likewhere)Zrelative_positionrb   rc   retnZ	max_exactZis_smallZval_if_larger	   r	   r   _relative_position_bucket   s(   
z.RelativePositionBias._relative_position_bucketc                 C   s`   t j|t j|d}t j|t j|d}t|dt|d }| j|| j| jd}| |}t|dS )N)rS   rR   zj -> 1 jzi -> i 1)rb   rc   zi j h -> h i j)r@   rC   rj   r   rr   rb   rc   re   )rf   rq   rR   Zq_posZk_posZrel_posZ	rp_bucketvaluesr	   r	   r   forward   s   

zRelativePositionBias.forward)r]   r^   r_   )r^   r_   )__name__
__module____qualname__ra   staticmethodrr   rt   __classcell__r	   r	   rh   r   r\      s    r\   c                       s8   e Zd ZdZ						d fdd	Zdd	d
Z  ZS )SpatialTransformera  
    Transformer block for image-like data.
    First, project the input (aka embedding)
    and reshape to b, t, d.
    Then apply standard transformer action.
    Finally, reshape to image
    NEW: use_linear for more efficiency instead of the 1x1 convs
    r           NFTc
           
   	      s   t    t rt ts g || _ tjjd|ddd| _	|s1tj
|dddd| _nt|| _t fdd	t|D | _|s]ttj
|dddd| _n	tt|| _|| _d S )
Nr^   ư>TZ
num_groupsZnum_channelsepsZaffiner   r   kernel_sizestridepaddingc                    s&   g | ]}t  | d qS ))dropoutcontext_dimdisable_self_attn
checkpointBasicTransformerBlockr
   rO   r   d_headr   r   	inner_dimn_headsuse_checkpointr	   r   r     s    z/SpatialTransformer.__init__.<locals>.<listcomp>)r`   ra   rL   
isinstancelistin_channelsr@   rd   	GroupNormnormConv2dproj_inLinear
ModuleListr-   transformer_blockszero_moduleproj_out
use_linear)
rf   r   r   r   depthr   r   r   r   r   rh   r   r   ra     s2   






zSpatialTransformer.__init__c           
      C   s   t |ts|g}|j\}}}}|}| |}| js| |}t|d }| jr-| |}t| j	D ]\}}	|	||| d}q2| jrG| 
|}t|d||d }| jsY| 
|}|| S )Nzb c h w -> b (h w) ccontextzb (h w) c -> b c h whw)r   r   rY   r   r   r   r   
contiguousr+   r   r   )
rf   rI   r   bcr   r   x_inr8   blockr	   r	   r   rt   /  s$   





zSpatialTransformer.forward)r   r{   NFFTrK   ru   rv   rw   __doc__ra   rt   ry   r	   r	   rh   r   rz      s    *rz   ZATTN_PRECISIONfp32c                       s0   e Zd Z				d	 fdd	Zd
ddZ  ZS )CrossAttentionNr]   @   r{   c                    s   t    || }t||}|d | _|| _tj||dd| _tj||dd| _tj||dd| _	t
t||t|| _d S )N      Fbias)r`   ra   rP   r7   rg   rd   r   to_qto_kto_v
SequentialDropoutto_out)rf   	query_dimr   rg   dim_headr   r   rh   r	   r   ra   J  s   



zCrossAttention.__init__c           
         s2  | j  | |}t||}| |}| |}t fdd|||f\}}}tdkrUtjddd |	 |	 }}t
d||| j }W d    n1 sOw   Y  n
t
d||| j }~~t|rt|d}t|jj }t|d	 d
}|| | |jdd}t
d||}	t|	d d
}	| |	S )Nc                       t | d dS )Nzb n (h d) -> (b h) n dr   r   tr   r	   r   <lambda>f      z(CrossAttention.forward.<locals>.<lambda>r   Fcuda)enabledZdevice_typezb i d, b j d -> b i jzb ... -> b (...)zb j -> (b h) () jr   r   r   zb i j, b j d -> b i dz(b h) n d -> b n (h d))rg   r   rP   r   r   map_ATTN_PRECISIONr@   Zautocastr?   r   r7   rL   r   finforS   maxrepeatZmasked_fill_softmaxr   )
rf   rI   r   rZ   qkvsimZmax_neg_valueoutr	   r   r   rt   ^  s2   






zCrossAttention.forward)Nr]   r   r{   )NNru   rv   rw   ra   rt   ry   r	   r	   rh   r   r   H  s    r   c                       s<   e Zd Z					d fdd	ZdddZdd	d
Z  ZS )r   r{   NTFc	           
         s   t    t}	|| _|	||||| jr|nd d| _t|||d| _|	|||||d| _t	|| _
t	|| _t	|| _|| _d S )N)r   rg   r   r   r   )r   glu)r   r   rg   r   r   )r`   ra   r   r   attn1FeedForwardffattn2rd   	LayerNormnorm1norm2norm3r   )
rf   r   r   r   r   r   Zgated_ffr   r   Zattn_clsrh   r	   r   ra     s,   
	
zBasicTransformerBlock.__init__c                 C   s   t | j||f|  | j S rK   )r   _forward
parametersrf   rI   r   r	   r	   r   forward_  s   zBasicTransformerBlock.forward_c                 C   sR   | j | || jr|nd d| }| j| ||d| }| | || }|S )Nr   )r   r   r   r   r   r   r   r   r	   r	   r   rt     s   zBasicTransformerBlock.forward)r{   NTTFrK   )ru   rv   rw   ra   r   rt   ry   r	   r	   rh   r   r     s    
r   c                       $   e Zd Z fddZdd Z  ZS )GEGLUc                    s    t    t||d | _d S )Nr"   )r`   ra   rd   r   proj)rf   Zdim_indim_outrh   r	   r   ra     s   
zGEGLU.__init__c                 C   s&   |  |jddd\}}|t| S )Nr"   r   r   )r   chunkFZgelu)rf   rI   Zgater	   r	   r   rt     s   zGEGLU.forwardr   r	   r	   rh   r   r     s    r   c                 C   s   |   D ]}|   q| S )z<
    Zero out the parameters of a module and return it.
    )r   detachZzero_)modulepr	   r	   r   r     s   r   c                       s&   e Zd Zd	 fdd	Zdd Z  ZS )
r   N   Fr{   c                    sh   t    t|| }t||}|stt||t nt||}t|t	|t||| _
d S rK   )r`   ra   intrP   rd   r   r   ZGELUr   r   net)rf   r   r   Zmultr   r   r   Z
project_inrh   r	   r   ra     s   




zFeedForward.__init__c                 C   s
   |  |S rK   )r   rf   rI   r	   r	   r   rt     s   
zFeedForward.forward)Nr   Fr{   r   r	   r	   rh   r   r     s    
r   c                       0   e Zd ZdZ			d	 fdd	Zdd Z  ZS )
UpsampleaA  
    An upsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    r"   Nr   c                    sJ   t    || _|p|| _|| _|| _|r#tj| j| jd|d| _d S d S )N   r   )	r`   ra   channelsout_channelsuse_convdimsrd   r   conv)rf   r   r   r   r   r   rh   r	   r   ra     s   

zUpsample.__init__c                 C   st   |j d | jks
J | jdkr(tj||j d |j d d |j d d fdd}ntj|ddd}| jr8| |}|S )Nr   r   r"   r   nearest)mode)Zscale_factorr   )rY   r   r   r   interpolater   r   r   r	   r	   r   rt     s   
$
zUpsample.forwardr"   Nr   r   r	   r	   rh   r   r     s    r   c                       sB   e Zd ZdZ								d fdd	Zdd	 Zd
d Z  ZS )ResBlocka  
    A residual block that can optionally change the number of channels.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    NFr"   Tc                    s  t    || _|| _|| _|p|| _|| _|| _|
| _t	
t	d|t	 t	j|| jddd| _|p5|	| _|rHt|d|| _t|d|| _n|	rYt|d|| _t|d|| _nt	  | _| _t	
t	 t	||rpd| j n| j| _t	
t	d| jt	 t	j|dtt	j| j| jddd| _| j|krt	 | _n|rt||| jddd| _n	t	|| jd| _| jrt| j| jd|d	| _d S d S )
Nr^   r   r   r   Fr"   )r   皙?)r   use_image_dataset)r`   ra   r   emb_channelsr   r   r   use_scale_shift_normuse_temporal_convrd   r   r   SiLUr   	in_layersupdownr   h_updx_upd
DownsampleIdentityr   
emb_layersr   r   
out_layersskip_connectionZconv_ndTemporalConvBlock_v2temopral_conv)rf   r   r   r   r   r   r   r   upZdownr   r   rh   r	   r   ra     sj   






zResBlock.__init__c                 C   s   |  |||S )a  
        Apply the block to a Tensor, conditioned on a timestep embedding.
        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        )r   )rf   rI   emb
batch_sizer	   r	   r   rt   O  s   zResBlock.forwardc                 C   s0  | j r#| jd d | jd }}||}| |}| |}||}n| |}| ||j}t|jt|jk rI|d }t|jt|jk s;| j	rr| j
d | j
dd  }}	tj|ddd\}
}||d|
  | }|	|}n	|| }| 
|}| || }| jrt|d|d}| |}t|d	}|S )
Nr   ).Nr   r   r"   r   (b f) c h w -> b c f h wr   b c f h w -> (b f) c h w)r   r   r   r   r   typerS   r.   rY   r   r   thr   r  r   r   r  )rf   rI   r  r  Zin_restZin_convr   Zemb_outZout_normZout_restr7   shiftr	   r	   r   r   X  s0   







zResBlock._forward)NFFr"   FFTF)ru   rv   rw   r   ra   rt   r   ry   r	   r	   rh   r   r     s    G	r   c                       r   )
r   aD  
    A downsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    r"   Nr   c                    s|   t    || _|p|| _|| _|| _|dkrdnd}|r,tj| j| jd||d| _d S | j| jks4J t	|||d| _d S )Nr   r"   )r   r"   r"   r   r   )r   r   )
r`   ra   r   r   r   r   rd   r   opZavg_pool_nd)rf   r   r   r   r   r   r   rh   r	   r   ra   ~  s    

zDownsample.__init__c                 C   s   |j d | jks
J | |S Nr   )rY   r   r  r   r	   r	   r   rt     s   
zDownsample.forwardr   r   r	   r	   rh   r   r   u  s    r   c                       s&   e Zd Z fddZdddZ  ZS )Resamplec                    s0   |dv sJ t t|   || _|| _|| _d S )N)noneupsample
downsample)r`   r  ra   r9   r:   r   )rf   r9   r:   r   rh   r	   r   ra     s
   
zResample.__init__Nc                 C   sh   | j dkr|d usJ tj||jdd  dd}|S | j dkr2tj|tdd |jdd  D d}|S )	Nr  r   )sizer   r  c                 s   s    | ]}|d  V  qdS )r"   Nr	   r   r	   r	   r   	<genexpr>  s    z#Resample.forward.<locals>.<genexpr>)Zoutput_size)r   r   r   rY   Zadaptive_avg_pool2dtuple)rf   rI   	referencer	   r	   r   rt     s   

zResample.forwardrK   r   r	   r	   rh   r   r    s    r  c                       s.   e Zd Z			d	 fdd	Zd
ddZ  ZS )ResidualBlockTr  r{   c                    s   t t|   || _|| _|| _|| _|| _t	t
d|t tj||ddd| _t|||| _t	t t||r@|d n|| _t	t
d|t t|tj||ddd| _||krft nt||d| _tj| jd j d S )Nr^   r   r   r   r"   r   )r`   r  ra   r9   	embed_dimr:   r   r   rd   r   r   r   r   layer1r  resampler   	embeddingr   layer2r   shortcutinitzeros_weight)rf   r9   r  r:   r   r   r   rh   r	   r   ra     s2   zResidualBlock.__init__Nc                 C   s   |  ||}| jd |  | jd d ||}| |dd|j}| jrJ|jddd\}}| jd |d|  | }| jdd  |}n	|| }| |}|| 	| }|S )Nr   r"   r   r   r   )
r  r  r  	unsqueezer
  rS   r   r   r  r  )rf   rI   er  identityr7   r  r	   r	   r   rt     s   $
zResidualBlock.forward)Tr  r{   rK   r   r	   r	   rh   r   r    s     r  c                       s(   e Zd Zd fdd	ZdddZ  ZS )AttentionBlockNc                    s   |r|| n|}|| }|| |ksJ t t|   || _|| _|| _|| _t|d| _	t
d|| _t
||d d| _|d urLt
||d | _t
||d| _t
j| jj d S )Ng      пr^   r   r   r"   )r`   r&  ra   r   r   	num_headshead_dimrm   rB   r7   rd   r   r   r   to_qkvr   
context_kvr   r   r!  r"  )rf   r   r   r'  r(  rh   r	   r   ra     s   zAttentionBlock.__init__c                 C   s&  |}g |  | j| jR \}}}}}}	| |}| |||d |	|| jddd\}
}}|dura| ||d|d |		ddddjddd\}}t
j||gdd}t
j||gdd}t
|
dd| j || j }tj|dd}t
||dd}|||||}| |}|| S )	zGx:       [B, C, H, W].
            context: [B, L, C] or None.
        r   r   r   Nr   r"   r   r  )r  r'  r(  r   r)  viewr   r*  ZreshapeZpermuter@   rF   matmulZ	transposer7   r   r   r   )rf   rI   r   r%  r   r   r   r   rq   rO   r   r   r   ZckZcvattnr	   r	   r   rt     s,   &
.
 
zAttentionBlock.forwardNNNrK   r   r	   r	   rh   r   r&    s    r&  c                       s8   e Zd Z					d	 fdd	Z			d
ddZ  ZS )TemporalAttentionBlockr   r^   NFc                    s~   t    || }|| |ksJ || _|| _|d | _|| _|| }td|| _|| _	t
||d | _t
||| _d S )Nr   r^   r   )r`   ra   r   use_sim_maskr7   rg   rd   r   r   
rotary_embr   r)  r   )rf   r   rg   r   r1  r   r0  Z
hidden_dimrh   r	   r   ra     s   

zTemporalAttentionBlock.__init__c                 C   s  |}|j d |j d |j}}}| |}t|d}| |jddd}	t|rA| rA|	d }
| |
}t|d|d}|| S t|	d	 d
| j	d}t|	d d
| j	d}t|	d d
| j	d}|| j
 }t| jru| j|}| j|}td||}t|r|| }|d u r|d ur|d d d d d f |d d d d d f  }|dd}|| t|jj }n:t|r|  stj||f|tjd}tj||tjd}tt|dt|dt|d}|| t|jj }| jrtjtj||f|tjdd	d}|| t|jj }||jddd  }|jdd}td||}t|d}| |}t|d|d}| jrH|d	|  }|S || }|S )Nr"   r  zb c f h w -> b (h w) f cr   r   r   zb (h w) f c -> b c f h wr   r   z... n (h d) -> ... h n dr   z!... h i d, ... h j d -> ... h i jrQ   zb -> b 1 1 1 1zi j -> 1 1 1 i j)ZdiagonalT)r   Zkeepdimz!... h i j, ... h j d -> ... h i dz... h n d -> ... n (h d))rY   rR   r   r   r)  r   rL   rX   r   rg   r7   r1  Zrotate_queries_or_keysr@   r   r#  masked_fillr   rS   r   rU   rV   eyero   r0  ZtrilZamaxr   r   r   )rf   rI   pos_biasfocus_present_mask
video_maskr%  rq   heightrR   qkvrs   r   r   r   r   r   rZ   Zattend_all_maskZattend_self_maskZsim_maskr-  r	   r	   r   rt   .  sj   




,


zTemporalAttentionBlock.forward)r   r^   NFFr.  r   r	   r	   rh   r   r/    s    r/  c                       s<   e Zd ZdZ								d fdd	Zdd	d
Z  ZS )TemporalTransformerz
    Transformer block for image-like data.
    First, project the input (aka embedding)
    and reshape to b, t, d.
    Then apply standard transformer action.
    Finally, reshape to image
    r   r{   NFTc                    s  t    || _|
| _d| _| jrd  t ts g || _ tj	j
d|ddd| _|s;t	j|dddd| _nt	|| _| jrLt	tt| _t	 fd	d
t|D | _|sptt	j|dddd| _ntt	|| _| jrt	tt| _|| _d S )NFr^   r|   Tr}   r   r   r   c              
      s$   g | ]}t  | d qS ))r   r   r   r   r   r   r   r   r   r   r   r	   r   r     s    z0TemporalTransformer.__init__.<locals>.<listcomp>)r`   ra   multiply_zeroonly_self_attZuse_adaptorr   r   r   r@   rd   r   r   ZConv1dr   r   framesZ
adaptor_inr   r-   r   r   r   Zadaptor_outr   )rf   r   r   r   r   r   r   r   r   r   r<  r;  rh   r:  r   ra     sD   




	
zTemporalTransformer.__init__c                 C   s  | j rd }t|ts|g}|j\}}}}}|}| |}| js+t|d }| |}| jr=t|d| j	d }| |}| j r_t|d }t
| jD ]\}	}
|
|}qLt|d|d }nFt|d|d }t
| jD ]7\}	}
t||	 d| j	d ||	< t|D ] }t||	 | d	|| | j	 | j	d
 }|
|| |d||< qqm| jr| |}t|d||d }| jst|d }| |}t|d|||d }| jrd| | }|S || }|S )Nzb c f h w -> (b h w) c fz(b f) c h w -> b (h w) f c)fzbhw c f -> bhw f cz(b hw) f c -> b hw f cr  z(b hw) c f -> b hw f cz(b f) l con -> b f l conzf l con -> (f r) l con)rr>  r   zb (h w) f c -> b f c h wr   zb hw f c -> (b hw) c fz(b h w) c f -> b c f h w)r   r   r   r{   )r<  r   r   rY   r   r   r   r   r   r=  r+   r   r-   r   r   r;  )rf   rI   r   r   r   r>  r   r   r   r8   r   r;   Zcontext_i_jr	   r	   r   rt     sr   








zTemporalTransformer.forward)r   r{   NFFTTFrK   r   r	   r	   rh   r   r9    s    4r9  c                       s:   e Zd Z						d
 fdd	Z			ddd	Z  ZS )TemporalAttentionMultiBlockr   r^   NFr   c                    s6   t    t fddt|D | _d S )Nc              	      s   g | ]}t  qS r	   )r/  )r
   r=   r   r   rg   r1  r   r0  r	   r   r     s    
z8TemporalAttentionMultiBlock.__init__.<locals>.<listcomp>)r`   ra   rd   r   r-   
att_layers)rf   r   rg   r   r1  r   r0  r/   rh   rA  r   ra     s   

z$TemporalAttentionMultiBlock.__init__c                 C   s   | j D ]	}|||||}q|S rK   )rB  )rf   rI   r4  r5  r6  layerr	   r	   r   rt     s   
z#TemporalAttentionMultiBlock.forward)r   r^   NFFr   r.  r   r	   r	   rh   r   r@    s    r@  c                       ,   e Zd Z			d fdd	Zdd Z  ZS )	InitTemporalConvBlockNr{   Fc                    s   t t|   |d u r|}|| _|| _|| _ttd|t	 t
|tj||ddd| _tj| jd j tj| jd j d S Nr^   )r   r   r   )r   r   r   r   r   )r`   rE  ra   r9   r:   r   rd   r   r   r   r   Conv3dr   r   r!  r"  r   rf   r9   r:   r   r   rh   r	   r   ra     s   zInitTemporalConvBlock.__init__c                 C   s0   |}|  |}| jr|d|  }|S || }|S Nr   )r   r   rf   rI   r%  r	   r	   r   rt   ,  s   
zInitTemporalConvBlock.forwardNr{   Fr   r	   r	   rh   r   rE    s    rE  c                       rD  )	TemporalConvBlockNr{   Fc                    s   t t|   |d u r|}|| _|| _|| _ttd|t	 tj
||ddd| _ttd|t	 t|tj
||ddd| _tj| jd j tj| jd j d S rF  )r`   rL  ra   r9   r:   r   rd   r   r   r   rG  conv1r   conv2r   r!  r"  r   rH  rh   r	   r   ra   8  s    zTemporalConvBlock.__init__c                 C   s:   |}|  |}| |}| jr|d|  }|S || }|S rI  )rM  rN  r   rJ  r	   r	   r   rt   P  s   

zTemporalConvBlock.forwardrK  r   r	   r	   rh   r   rL  6  s    rL  c                       rD  )	r  Nr{   Fc                    s  t t|   |d u r|}|| _|| _|| _ttd|t	 tj
||ddd| _ttd|t	 t|tj
||ddd| _ttd|t	 t|tj
||ddd| _ttd|t	 t|tj
||ddd| _tj| jd j tj| jd j d S rF  )r`   r  ra   r9   r:   r   rd   r   r   r   rG  rM  r   rN  conv3conv4r   r!  r"  r   rH  rh   r	   r   ra   ]  s0   zTemporalConvBlock_v2.__init__c                 C   sN   |}|  |}| |}| |}| |}| jr!|d|  }|S || }|S )Nr{   )rM  rN  rO  rP  r   rJ  r	   r	   r   rt   {  s   



zTemporalConvBlock_v2.forwardrK  r   r	   r	   rh   r   r  [  s    r  c                       s   e Zd Zdddddddg dddd	g d
dddddddddddddgddddf fdd	Z															dddZ	dddZ  ZS )r      i      r]      )r   r"   r   r   Nr   r   )      ?g      ?g      ?Tr   r   FrT  textrZ   c           -         s

   d }|
r|
n d }
t t|   || _|| _|| _|| _ | _|| _|| _	|| _
|| _|| _|| _|	| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _d}d} d}! fdddg|	 D }" fdd|	d g|	d d d  D }#g }$d	}%t |d
r|j!r|j!}&nd}&t"#t"$ |t"% t"$||| _&t"#t"$ddt"% t"$dd| _'d| jv r	t"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _*t+d|||d|d|&d| _,d| jv rMt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _-t+d|||d|d|&d| _.d| jv rt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _/t+d|||d|d|&d| _0d| jv r|rt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |ddddnd | _1t+d|||d|d|&d| _2d| jv rt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _3t+d|||d|d|&d| _4d| jv rbt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _5t+d|||d|d|&d| _6d| jv rt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _7t+d|||d|d|&d| _8t9|| _|rt:st;t<d|| _=t>|
dd| _?| jrt"#t"$ |t"% t"$||| _@t"jAB| j@d jC t"jAB| j@d jD t"E | _F|jGr	t"# | _Ht"Et"j(| j|  dddg}'nt"#t"j(| j| | jddd| _Ht"Et"j(| j dddg}'|rLt:r>|'ItJ |
|| ||!||d n|'ItK |
|| j=||d | jFI|' |$I  tLtM|"d d |"dd  D ]\}(\}}tN|D ]})t"EtO||||d|dg}*|%|v r|*ItP||| |d| j	ddd  | jrt:r|*ItJ||| || ||!||d n|*ItK||
|| j=|||d! |}| jFI|* |$I| |(tQ|	d kr|)|d krtR|dd|d"}+|$I| |%d# }%| jFI|+ qpqft"EtO|||d|d$tP||| |d| j	ddd g| _S| jr;t:r+| jSItJ||| || ||!||d n| jSItK||
|| j=|||d! | jSItO|||dd% t"E | _TtLtM|#d d |#dd  D ]\}(\}}tN|d D ]y})t"EtO||$U  |||d|d$g}*|%|v r|*ItP||| |ddddd  | jrt:r|*ItJ||| || ||!||d n|*ItK||
|| j=|||d! |}|(tQ|	d kr|)|krtV|dd#|d"},|%d#9 }%|*I|, | jTI|* qgq[t"#t"Wd|t"% t"j(|| jddd| _Xt"jAB| jXd jC d S )&Nr   r^   Fr   c                    r   r	   r	   r   r   r	   r   r     r   z,UNetSD_temporal.__init__.<locals>.<listcomp>c                    r   r	   r	   r   r   r	   r   r     r   r   r    adapter_transformer_layersi   Zdepthmapr   r   )r_   r_   r"   r  皙?)rg   r   
dim_head_k
dim_head_vdropout_attemlp_dimdropout_ffnr   motioncannyrZ   sketchsingle_sketchlocal_image)rg   rc   r   )r   r   r   r   r;  )r1  r/   r   )r   r   r   T)r   r   r   r   )r1  r   r0  r/   )r   r   r#   )r   r   )r   )Yr`   r   ra   zero_yblack_image_featurer0   r9   r   y_dimr   hist_dim
concat_dimr  r:   r2   r'  r(  r1   r3   r   r/   temporal_attentionr   r   use_fps_conditionr0  training
inpaintingvideo_compositionsmisc_dropout
p_all_zero
p_all_keephasattrrV  rd   r   r   r   
time_embedpre_image_conditionr   ZAdaptiveAvgPool2ddepth_embeddingTransformer_v2depth_embedding_aftermotion_embeddingmotion_embedding_aftercanny_embeddingcanny_embedding_aftermasked_embeddingmask_embedding_aftersketch_embeddingsketch_embedding_aftersingle_sketch_embeddingsingle_sketch_embedding_afterlocal_image_embeddinglocal_image_embedding_afterDropPathUSE_TEMPORAL_TRANSFORMERr   rn   r1  r\   time_rel_pos_biasfps_embeddingr   r!  r"  r   r   input_blocksresume	pre_imager*   r9  r@  r+   r,   r-   r   rz   r.   r   middle_blockoutput_blockspopr   r   r   )-rf   r0   r9   r   rd  r   re  rf  r:   r2   r'  r(  r1   r3   r   r   r/   rg  r   r   rh  r0  rl  ri  rj  rk  rm  rn  rb  rc  r  Zuse_linear_in_temporalZtransformer_depthZdisabled_sar4   r5   r6   r7   rV  Z
init_blockr8   r;   r   r  r  rh   r   r   ra     sR   &








	


		
 
:




		
6zUNetSD_temporal.__init__r{   r   c           %         s  | j s|	d u sJ d|j\ }}}}|j | _|dkr'd }d|| d < nt| fdd}| jrCtsC| j|jd |jd}nd }tj	 tj
d|j}tj	 tj
d|j}| jrt | jk  }t | jk  }t }d	||d| < d	|||||  < ||@  rJ t| j||d
}| | j|||}|d urt|d}| |}|jd }| t|d d}t|d |d}||| }|d urt|d}| |}|jd }| t|d d}t|d |d}||| }|d urXt|d}| |}|jd }| t|d d}t|d |d}t| jdrR| jj rR| jrRt | jj!k }|d d d d d d f }|"|# d}|| }n||| }|
d urt|
d}
| $|
}
|
jd }| %t|
d d}
t|
d |d}
|||
 }|d urt|d}| &|}|jd }| 't|d d}t|d |d}||| }|d urt|d}| (|}|jd }| )t|d d}t|d |d}||| }|	d urt|	d}	| *|	}	|	jd }| +t|	d d}	t|	d |d}	|||	 }tj,||gdd}t|d}| -|}t|d d}| j.r>|d ur>| /t0|| j1| 2t0|| j1 }n	| /t0|| j1}| d| j3} |d urb||}!tj,| |!gdd} n| j45 dd}!tj,| |!gdd} |d ur|| 6|}"tj,| |"gdd} |j7|dd}| j7|dd} t|d}g }#| j8D ]}$| 9|$||| |||}|#:| q| j;D ]}$| 9|$||| |||}q| j<D ]&}$tj,||#= gdd}| j9|$||| |||t>|#dkr|#d nd d}q| ?|}t|d d}|S )Nzinpainting is not supportedr   Fc                      s   t  fdS )NrT   )r[   r	   batchrR   prob_focus_presentr	   r   r   e  s    z)UNetSD_temporal.forward.<locals>.<lambda>r"   rT   rS   Tzerokeepr	  z(b f) c h w -> (b h w) f cr  z(b h w) f c -> b c f h wr   r   p_zero_motion_aloner   r   r  )Zrepeatsr   r   )r  )@rj  rY   rR   r  rP   rg  r  r  r@   rW   rV   rD   ri  randrm  sumrn  randpermanyr   rl  Z	new_zerosrf  r   rr  rt  r  r  ru  rv  ro  r0   r  Zp_zero_motionr2  r   rw  rx  r{  r|  r}  r~  ry  rz  rF   r  rh  rp  rJ   r   r  r   rb  r   rq  Zrepeat_interleaver  _forward_singler*   r  r  r  r.   r   )%rf   rI   r   yr   imager]  ra  r`  Zmaskedr^  r_  Z	histogramZfpsr6  r5  r  Zmask_last_frame_numr   r>  r   r   r  r  r  ZnzeroZnkeepindexrl  concatZmotion_dr$  r   Z	y_contextZimage_contextZxsr   r	   r  r   rt   D  sD  






































zUNetSD_temporal.forwardc	           
      C   s  t |tr| jrt|n|}| }||||}|S t |tr5| jr&t|n|}| }|||| j}|S t |trJ| jrAt|n|}|||}|S t |trl| jrVt|n|}t	|d| jd}|||}t	|d}|S t |t
r| jrxt|n|}|||}|S t |tr| jrt|n|}|||}|S t |tr|||}|S t |tr||}|S t |tr||}|S t |tr|||}|S t |tr| jrt|n|}t	|d| jd}|||||}t	|d}|S t |tr| jrt|n|}t	|d| jd}|||||}t	|d}|S t |tr0| jrt|n|}t	|d| jd}||}t	|d}|S t |trS| jr>t|n|}t	|d| jd}||}t	|d}|S t |tjrn|D ]}	| |	|||||||}q\|S ||}|S )Nr  r  r	  )r   r  r   r   r   r   r  rz   r9  r   r   r   r   r   r   r  r/  r@  rE  rL  rd   r   r  )
rf   r   rI   r$  r   r  r5  r6  r  r   r	   r	   r   r    s   
	?
:

6


0

,

(

&
$
"

 




zUNetSD_temporal._forward_single)NNNNNNNNNNNNNr{   r   rK   )ru   rv   rw   ra   rt   r  ry   r	   r	   rh   r   r     sh       @
 `c                       r   )PreNormattentionc                        t    t|| _|| _d S rK   r`   ra   rd   r   r   fnrf   r   r  rh   r	   r   ra   l     

zPreNormattention.__init__c                 K   s   | j | |fi || S rK   r  r   rf   rI   kwargsr	   r	   r   rt   q     zPreNormattention.forwardr   r	   r	   rh   r   r  j      r  c                       r   )PreNormattention_qkvc                    r  rK   r  r  rh   r	   r   ra   w  r  zPreNormattention_qkv.__init__c                 K   s,   | j | || || |fi || S rK   r  )rf   r   r   r   r  r	   r	   r   rt   |  s   ,zPreNormattention_qkv.forwardr   r	   r	   rh   r   r  u  r  r  c                       &   e Zd Zd fdd	Zdd Z  ZS )		Attentionr]   r   r{   c                    s   t    || }|dko||k }|| _|d | _tjdd| _tj||d dd| _|r>t	t||t
|| _d S t | _d S )Nr   r   r   r   r   Fr   )r`   ra   rg   r7   rd   Softmaxattendr   r)  r   r   r   r   rf   r   rg   r   r   r   Zproject_outrh   r	   r   ra     s   



zAttention.__init__c           
         s   g |j | jR \}}} | |jddd}t fdd|\}}}td||| j }| |}td||}	t|	d}	| 	|	S )	Nr   r   r   c                    r   )Nb n (h d) -> b h n dr   r   r   r   r	   r   r     r   z#Attention.forward.<locals>.<lambda>b h i d, b h j d -> b h i jb h i j, b h j d -> b h i db h n d -> b n (h d))
rY   rg   r)  r   r   r   r7   r  r   r   )
rf   rI   r=   r8  r   r   r   dotsr-  r   r	   r   r   rt     s   


zAttention.forwardr]   r   r{   r   r	   r	   rh   r   r    s    r  c                       r  )	Attention_qkvr]   r   r{   c                    s   t    || }|dko||k }|| _|d | _tjdd| _tj||dd| _tj||dd| _	tj||dd| _
|rNtt||t|| _d S t | _d S )Nr   r   r   r   Fr   )r`   ra   rg   r7   rd   r  r  r   r   r   r   r   r   r   r   r  rh   r	   r   ra     s    



zAttention_qkv.__init__c           
      C   s   g |j | jR \}}}}|j d }| |}| |}| |}t|d|d}t|d||d}t|d||d}td||| j }| |}td||}	t|	d}	| 	|	S )Nr   r  r   r  r  r  r  )
rY   rg   r   r   r   r   r   r7   r  r   )
rf   r   r   r   r=   r   Zbkr  r-  r   r	   r	   r   rt     s   






zAttention_qkv.forwardr  r   r	   r	   rh   r   r    s    r  c                       r   )PostNormattentionc                    r  rK   r  r  rh   r	   r   ra     r  zPostNormattention.__init__c                 K   s   |  | j|fi || S rK   )r   r  r  r	   r	   r   rt     r  zPostNormattention.forwardr   r	   r	   rh   r   r    r  r  c                       s6   e Zd Z								d
 fdd	Zdd	 Z  ZS )rs  r]         rW  r   c	           
         s`   t    tg | _|| _t|D ]}	| jtt|t	||||dt
|||dg qd S )N)rg   r   r   )r   )r`   ra   rd   r   layersr   r-   r*   r  r  r   )
rf   rg   r   rX  rY  rZ  r[  r\  r   r=   rh   r	   r   ra     s&   
	zTransformer_v2.__init__c                 C   sf   | j d d D ]\}}||}||| }q| jdkr1| j dd  D ]\}}||}||| }q"|S r  )r  r   )rf   rI   r-  r   r	   r	   r   rt     s   
zTransformer_v2.forward)r]   r  r  r  rW  r  rW  r   r   r	   r	   rh   r   rs    s    rs  c                       s8   e Zd ZdZ fddZdddddZdd	 Z  ZS )
r  zSDropPath but without rescaling and supports optional all-zero and/or all-keep.
    c                    s   t t|   || _d S rK   )r`   r  ra   r   )rf   r   rh   r	   r   ra     s   
zDropPath.__init__Nr  c          
         s  j st|dkr|d S |S |d }|d}t|jk  }|j|tjd}|d ur2d||< |d ur:d||< t	|d }|t
t|d |  }|d ur`tj|t	|d gdd}|| d |< t fdd|D }	t|dkr|	d S |	S )	Nr   r   r  Fr   r{   c                 3   s     | ]}|  | V  qd S rK   )	broadcastr   Z
multiplierrf   r	   r   r    s    z#DropPath.forward.<locals>.<genexpr>)ri  r.   r  r@   r  r   r  Znew_onesrV   ro   r  rF   r  )
rf   r  r  argsrI   r   rq   rZ   r  outputr	   r  r   rt     s$   

zDropPath.forwardc                 C   s<   | d| dksJ | dfd|jd   }||S )Nr   )r   r   )r  ndimr+  )rf   srcdstrY   r	   r	   r   r    s   
zDropPath.broadcast)ru   rv   rw   r   ra   rt   r  ry   r	   r	   rh   r   r    s
    r  __main__)r0   )r9   r   rd  r   r:   r2   r'  r(  r1   r3   r   r/   r   r   rh  c                 c   s    | ]	\}}|  V  qd S rK   )Znumel)r
   r   r   r	   r	   r   r  5  s    r  i   zM parametersrK   )Mrm   os	functoolsr   r@   Ztorch.nnrd   Ztorch.nn.functionalZ
functionalr   Zeinopsr   Zfairscale.nn.checkpointr   Zrotary_embedding_torchr   r   __all__r  r   r>   rJ   rL   rP   r[   Moduler\   rz   environgetr   r   r   r   r   r   r   r   r   r  r  r&  r/  r9  r@  rE  rL  r  r   r  r  r  r  r  rs  r  ru   configr0   Zunet_in_dimr%   Z
unet_y_dimZunet_context_dimZunet_out_dimr'   Zunet_num_headsZunet_head_dimr&   r(   Zunet_dropoutr   rh  modelprintr   r  Znamed_parametersr	   r	   r	   r   <module>   s   
 ,J9.	%}%17ot!%.     f&&
*