o
    *jS                     @   s  d dl Z d dlmZmZ d dl mZ d dlmZmZ d dlZd dl	Z	d dl
Z
d dlm  mZ d dlmZ d dlmZ d dlm  m  m  mZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z: ddl;m<Z< dgZ=ej>e:j?ejdG dd deZ@dS )    N)copydeepcopy)path)AnyDict)	rearrange)Models)
TorchModel)MODELS)pidinet_bsdsketch_simplification_gan)AutoencoderKL)FrozenOpenCLIPEmbedderFrozenOpenCLIPVisualEmbedder)GaussianDiffusionbeta_schedule)get_first_stage_encodingmake_masked_imagesprepare_model_kwargssave_with_model_kwargs)UNetSD_temporal)Config)find_free_port
setup_seed	to_device)
OutputKeys)
load_image)	ModelFileTasks   )cfgVideoComposer)module_namec                       s6   e Zd ZdZ fddZdeeef fddZ  Z	S )r!   a$  
    task for video composer.

    Attributes:
        sd_model: denosing model using in this task.
        diffusion: diffusion model for DDIM.
        autoencoder: decode the latent representation into visual space with VQGAN.
        clip_encoder: encode the text into text embedding.
    c                    sV  t  j|d|i| tj rtdntd| _|dd| _|dd}|dd	}|d
d}tddd||d}t	
|j tt	j}tt	j}	t	jd||	  |	  t	_t	jtt	j t	_t	| _	dtjvrudtjd< t tjd< ttdd| j	_ttdd| j	_t| j	j |dd| _|dd| _|dd| _|dd| _|dg d| _ | j	j| _!t"dtj#$||d| _%| j%&| j| _%t'dtj#$||d| _(| j(j)&| j ddd d!d!d"g d#d$g d%d&
}
t*|
dtj#$||d'| _+| %d(, | _-| (| j(j./d}t0|}| j+1  | j+2 D ]}d|_3q| j+  t4dNi d)| j	d*| j	j5d+| j	j6d,| j	j7d-| j	j8d.| j	j9d/| j	j:d0| j	j;d1| j	j<d2| j	j=d3| j	j>d4| j	j?d5| j	j@d6| j	jAd7| j	jBd8| j	jCd9| j	jDd:| j	jEd| j	j d;| j	jFd<| j	jGd=| j	jGd>| j-d?|&| j| _)| j	jHr| j	jIrtJ| j	d@r| j	jKrt	jILdAdB }tMtj#$| jNt	jI}dCdD |O D }| j)jP|ddE nt	jILdAdB }| j)jPtjMtj#$| jN|ddFddE tjQ  n
tRdG| j	jI dHtSdI| j	jTdJdKdL}tU|| j	jV| j	jW| j	jXddM| _YdS )Oa8  
        Args:
            model_dir (`str` or `os.PathLike`)
                Can be either:
                    - A string, the *model id* of a pretrained model hosted inside a model repo on modelscope
                      or modelscope.cn. Valid model ids can be located at the root-level, like `bert-base-uncased`,
                      or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                    - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g,
                      `./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set to
                      `True`.
        	model_dircudacpuduration   clip_checkpointzopen_clip_pytorch_model.binsd_checkpointzv2-1_512-ema-pruned.ckptcfg_file_namezexp06_text_depths_vs_style.yamlTN)loadcfg_dictZ	cfg_levelr#   r*   r   ZMASTER_ADDR	localhostZMASTER_PORTZRANKZ
WORLD_SIZEr   
read_imageF
read_styleread_sketchsave_origin_videovideo_compositions)textmaskdepthmapsketchmotionimagelocal_imagesingle_sketchZpenultimate)layer
pretrained            )r      r=   r=   rA           )
Zdouble_zZ
z_channels
resolutionZin_channelsZout_chchZch_multnum_res_blocksZattn_resolutionsdropout)Z	ckpt_path r    Zin_dimZ
concat_dimdimZy_dimZcontext_dimZout_dimZdim_multZ	num_headsZhead_dimrE   Zattn_scalesrF   temporal_attentiontemporal_attn_timesuse_checkpointuse_fps_conditionuse_sim_maskmisc_dropout
p_all_zeroZ
p_all_keepzero_yblack_image_featuretext_to_video_pretrain/c                 S   s   i | ]\}}d |vr||qS )zinput_blocks.0.0 ).0keyprU   rU   /var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/multi_modal/videocomposer/videocomposer_model.py
<dictcomp>   s    z*VideoComposer.__init__.<locals>.<dictcomp>)strict)Zmap_locationzThe checkpoint file z
 is wrong Z	linear_sdg_QK?g~jt?)Z	init_betaZ	last_beta)betas	mean_typevar_type	loss_typeZrescale_timestepsrU   )Zsuper__init__torchr$   Zis_availabledevicepopr&   r   r    updater,   lenZ
frame_lensZfeature_framerates
max_framesZbatch_sizesstr
batch_sizeosenvironr   intgetenvZpmi_rankZpmi_world_sizer   seedr.   r/   r0   r1   r2   viz_numr   r   joinclip_encodertor   clip_encoder_visualmodelr   autoencoderdetachrP   Zblack_image	unsqueeze
zeros_likeeval
parametersZrequires_gradr   Zunet_in_dimZunet_concat_dimZunet_dimZ
unet_y_dimZunet_context_dimZunet_out_dimZunet_dim_multZunet_num_headsZunet_head_dimZunet_res_blocksZunet_attn_scalesZunet_dropoutrI   rJ   rK   rL   rM   rN   rO   resumeZresume_checkpointhasattrrR   splitr+   r#   itemsZload_state_dictZempty_cache
ValueErrorr   Znum_timestepsr   r]   r^   r_   	diffusion)selfr#   argskwargsr(   r)   r*   Z_cfgl1l2ZddconfigrQ   paramZcheckpoint_namessr\   	__class__rU   rY   ra   7   s@  








	
zVideoComposer.__init__inputc           ;         s	  d }| j r|d }t|}t|g}d }| jr$| jj}t|}t|g}d }| jr/t|d }d| jv rLtj	d| j
d djtjd | j}d| jv rTt  d| jv rt| j
ddd	 d| j}	t| j
dd
 d| j}
t| jjdddd| j}t| jjdddd| j}d }| j  |d }| jjdkr| jjr|d }|d }|d }|d }|d }tj| jjg| jj tj| jd}n%|d }|d }|d }|d }|d }tj| jjg| jj tj| jd}t|}t |d}g }d| jjv rt |d}g }d| jjv r,t!|"d#d|}t |d}g }d| jjv rm|j$d }|j$d }| jj rS|%d&||ddd' }n|d d d df ( &d|ddd}t |d|d}|j$d }t |d}t |d}tj)||j$d | jj* dd}tj)||j$d | jj* dd}t+  g }|D ]}| j,-|} t.| / }!|0|! qtj1|dd}t |d|d}g }"d| jjv r|D ]%}#||#"d#djtjd }$|$| jj2 3d| jj4}$|"0|$ qtj1|"dd}"t |"d|d}"g }%d| jjv r@|D ] }#t |#( d}#t5 fdd|#D }&t |&d }&|%0|& qtj1|%dd}%t |%d|d}%g }'d| jjv r|}(| jjr]|&|ddd' })|)g}(|(D ]}#|	|#"|#|}*d!|
d!|*  }*|'0|* q_tj1|'dd}'t |'d|d}'g }+d"| jjv r|'( d d d d d df &dd|dd}+W d    n	1 sw   Y  | 6|/ },|,( }-g }.d#| jjv rt+ 4 | jjr| 7| j78|%d' %d}.|.( }/n|9d}| 7|%d}.|.( }/W d    n	1 sw   Y  t+  t:;  t<j=| jj>d$ | jj?rN|j$\}0}1}2}3}4tj@| jA|1|3|4f| jd%}5|5jB|2dd&}5t |5d'| jAd}5|5C }5n
tD|d | jA }5|-d | jA tE|dkrgd n|d | jA tE|.dkrvd n|/d | jA tE|"dkrd n|"d | jA tE|%dkrd n|%d | jA tE|'dkrd n|'d | jA tE|dkrd n|d | jA tE|dkrd n|d | jA tE|+dkrd n|+d | jA |d | jA d(
| jjFs| jG&| jAddn	tH|-d | jA tE|dkrd n|d | jA tE|.dkrd n	tH|/d | jA tE|"dkrd n|"d | jA tE|%dkr.d n|%d | jA tE|'dkr=d n|'d | jA tE|dkrLd n|d | jA tE|dkr[d n|d | jA tE|+dkrjd n|+d | jA |d | jA d(
g}6| jjI}7|5( }8tJ|7|6| jjFd)}9| jKjL|8| j |9d*| jjMd+d,}:tN|9|:| j,|| jAd||| j| jOd-
 W d    n	1 sw   Y  W d    n	1 sw   Y  |:PtjQR | jd.S )/NZstyle_imager5   T)r<   r#   F)Zmemory_formatcannyr6   )r<   Zvanilla_cnn)r<   r   rT   Zcap_txtZ	ref_frame
video_data	misc_datar4   mv_data)Zdtyperc   zb f c h w -> b c f h wr7   g      ?r9   r   )bzb f c h w -> (b f) c h w)rH   z(b f) c h w -> b c f h wzk c h w -> k h w cc                    s   g | ]} |qS rU   rU   )rV   Zmisc_imgZcanny_detectorrU   rY   
<listcomp>Q  s    z)VideoComposer.forward.<locals>.<listcomp>zk h w c-> k c h wg      ?r:   r8   )enabled)rc   )ZrepeatsrH   z(b f) c h w->b c f h w)
yr9   r8   depthr   r6   Zmaskedr7   r:   fps)partial_keysfull_model_kwargsrL   g      "@rB   )noisert   model_kwargsZguide_scaleddim_timestepseta)
r   r   ru   Z	ori_videoro   stepcapspaletter    r&   )ZvideoZ
video_path)Sr.   r   Zmisc_transformsr0   r    Zsketch_pathr/   r2   modelsZmidas_v3r#   ry   Zrequires_grad_rr   rb   Zchannels_lastZhalfrc   ZCannyDetectorr   r   ZtensorZsketch_meanviewZ
sketch_stdrt   rg   Zuse_image_datasetZfeature_framerateri   longr   r   r   subZdiv_shaperw   repeatr$   clonechunk
chunk_sizeZno_gradru   encoder   rv   appendcatZ	depth_stdZclamp_Zdepth_clampstackrq   rs   
preprocessZsqueezepynvmlZnvmlInitampZautocastZuse_fp16Zshare_noiseZrandnro   Zrepeat_interleave
contiguousZ
randn_likerf   rL   rP   rx   Z	guidancesr   r   Zddim_sample_loopr   r   r&   typeZfloat32r%   );r   r   Zframe_inZ	image_keyframeZframe_sketchZ
sketch_keyZframe_styleZmidasZpidinetZcleanerZ	pidi_meanZpidi_stdr   r   Zref_imgsr   r   r4   r   r   Zmisc_backupsZmv_data_videoZmasked_videoZimage_localZ
frames_numZbs_vd_localZbs_vdZvideo_data_listZmisc_data_listZdecode_dataZvd_dataZencoder_posteriortmpZ
depth_dataZ	misc_imgsr   Z
canny_dataZcanny_conditionZsketch_dataZsketch_listZsketch_repeatr6   Zsingle_sketch_datar   Zy0Zy_visualZ	y_visual0r   cfhwr   r   r   Znoise_motionr   Zvideo_outputrU   r   rY   forward   sJ  




















;






>izVideoComposer.forward)
__name__
__module____qualname____doc__ra   r   rh   r   r   __classcell__rU   rU   r   rY   r!   *   s
    
 )Arj   r   r   r   Zosptypingr   r   Z	open_clipr   rb   Ztorch.cuda.ampr$   r   Ztorch.nnnnZeinopsr   Z2modelscope.models.multi_modal.videocomposer.modelsr   Zmulti_modalZvideocomposerZmodelscope.metainfor   Zmodelscope.modelsr	   Zmodelscope.models.builderr
   Z<modelscope.models.multi_modal.videocomposer.annotator.sketchr   r   Z7modelscope.models.multi_modal.videocomposer.autoencoderr   Z0modelscope.models.multi_modal.videocomposer.clipr   r   Z5modelscope.models.multi_modal.videocomposer.diffusionr   r   Z5modelscope.models.multi_modal.videocomposer.ops.utilsr   r   r   r   Z3modelscope.models.multi_modal.videocomposer.unet_sdr   Z8modelscope.models.multi_modal.videocomposer.utils.configr   Z7modelscope.models.multi_modal.videocomposer.utils.utilsr   r   r   Zmodelscope.outputsr   Zmodelscope.preprocessors.imager   Zmodelscope.utils.constantr   r   configr    __all__Zregister_moduleZtext_to_video_synthesisr!   rU   rU   rU   rY   <module>   s>   