o
    *j                     @   s  d dl Z d dlmZ d dlmZ d dlZd dlmZ d dl	m
Z
mZ d dlmZ d dlmZ dgZejejejd	G d
d de
ZG dd deddZdd Zdd Zdd Zdd ZdMddZdNddZdOddZdd ZdMd d!ZdMd"d#ZdMd$d%Zi d&fd'd(Z i d)fd*d+Z!dddi d,fd-d.Z"dPd1d2Z#dQd4d5Z$d6d7 Z%	8		dRd9d:Z&d;d< Z'	=dSd>d=Z(			 		dTd?d@Z)dAdB Z*dCdD Z+dEdF Z,dZ-dGdH Z.G dIdJ dJej/j0j1j2Z3dKdL Z4dS )U    N)
namedtuple)Dict)Models)ModelTensor)MODELS)TasksCsanmtForTranslation)module_namec                       s   e Zd Z fddZ			d#deeef deeef deeef deeef deeef f
d	d
Zdeeef deeef fddZ	dd Z
d$ddZdd Zd%ddZi dfddZ		d&ddZdd Zdd Zi dfdd Zd!d" Z  ZS )'r	   c                    s.   t  j|g|R i | || _t| j dS )zK
        Args:
            params (dict): the model configuration.
        N)super__init__paramsprint)selfZ	model_dirargskwargs	__class__ i/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/csanmt/translation.pyr      s   zCsanmtForTranslation.__init__Ninputlabelprefix
prefix_hitreturnc           	      C   st   |du r-t jjd | |||d| j\}}W d   n1 s#w   Y  ||dS | ||\}}||dS )a  return the result by the model

        Args:
            input: the preprocessed input source sequence
            label: the ground truth target data for model training
            prefix: the preprocessed input target prefix sequence for interactive translation
            prefix_hit: the preprocessed target prefix subword vector for interactive translation

        Returns:
            output_seqs: output sequence of target ids
        NNmtModel)
input_widsprefix_widsr   )output_seqsoutput_scores)train_oploss)tfcompatv1variable_scopebeam_searchr   transformer_model_train_fn)	r   r   r   r   r   r   r   r    r!   r   r   r   __call__   s"   
zCsanmtForTranslation.__call__c                 C   s   dS )z
        Run the forward pass for a model.

        Args:
            input (Dict[str, Tensor]): the dict of the model inputs for the forward method

        Returns:
            Dict[str, Tensor]: output from the model forward pass
        Nr   )r   r   r   r   r   forwardA   s   
zCsanmtForTranslation.forwardc                 C   s  |d }|d }t jjjd|d t jd}|d rBt jjjdt jjjd t jjjd	||g|d
}W d    n1 s<w   Y  n#t jjd t jjjd	||g|d
}W d    n1 s`w   Y  t jjd|g}t j|t j	dd d d df }t 
||gd}	t jt |	dt jd}
|
d d d df }t j|ddgddggdd}t |t |	t j}||d  }|d dkrt|}t |t |d}t j||}t|d}|d dkrt jj||d d}t||||}||fS )Nsrc_vocab_sizehidden_size              ࿩dtypeshared_source_target_embeddingShared_EmbeddingreuseWeightsinitializerZSource_EmbeddingZencoder_input_bias   r   tensorpaddingsZconstant_values      ?position_info_typeabsolute   maskingresidual_dropoutZrate)r"   r#   r$   random_normal_initializerfloat32r%   
AUTO_REUSEget_variable
zeros_likeint64concatcast	not_equalpadgatherint32add_timing_signalmultiplyexpand_dimsnnbias_addattention_biasdropouttransformer_encoder)r   featuresr   r*   r+   r6   Zsrc_embeddingZsrc_biaseos_paddingZsrc_seqZsrc_maskZshift_src_maskencoder_inputencoder_self_attention_biasencoder_outputr   r   r   encoding_graphM   sf   

 

z#CsanmtForTranslation.encoding_graphc                 C   s  |d }t jjjd|d t jd}d }|d r|d }d}n|dkr)|d }d	}n|d
kr4|d }d}ntdt jjj|t jjjd t jjjd||g|d}W d    n1 s[w   Y  t j	|t j
dd d d df }	t ||	gd}
t jt |
dt jd}|d d d df }t j|ddgddggdd}t |t |
t j}||d  }t |t |d}t|d}|d dkrt jj||d d}t||||}|S )Nr+   r,   r-   r.   r0   r*   ZShared_Semantic_EmbeddingsourceZSource_Semantic_Embeddingtargettrg_vocab_sizeZTarget_Semantic_Embeddingzerror: no right name specified.r2   r4   r5   r7   r   r8   r9   r<   r?   r@   rA   rB   )r"   r#   r$   rC   rD   
ValueErrorr%   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rP   rQ   rT   rR   rU   transformer_semantic_encoder)r   rW   r   namer+   r6   scopeZ
vocab_sizeZembedding_matrX   Z	input_seqZ
input_maskZshift_input_maskrY   rZ   r[   r   r   r   semantic_encoding_graph   s\   
 
z,CsanmtForTranslation.semantic_encoding_graphc                 C   s@   d}d}|d rd }d }| j |||d}| j |||d}||fS )Nr]   r^   r0   rb   )rd   )r   rW   labelsr   source_nametarget_namefeature_outputlabel_outputr   r   r    build_contrastive_training_graph   s   z5CsanmtForTranslation.build_contrastive_training_graph-q=c                    sl   |d  |d  d dksJ  fdd}g }|||}| ||| t| ks/J tj|ddS )Nnum_of_samplesetar?   r   c              	      s   ||  }t jt |t jt |ddd  t jt |dddt jt |ddd d  }g }t d D ]+}t jt j	|dd| d t jt j	|ddd  }| ||  }|
| q;|S )Nr?   T)input_tensoraxisZkeepdimsr   r,         ?)r"   mathdivideabs
reduce_minZ
reduce_maxrangerandomnormalshapeappend)Zx_vectorZy_vectorZbias_vectorZw_rRiomegasampleKepsilonrn   r   r   get_samples   s2   
z7CsanmtForTranslation.MGMC_sampling.<locals>.get_samplesrp   )extendlenr"   rI   )r   Zx_embeddingZy_embeddingr   r   r   ZALL_SAMPLESr   r   r   MGMC_sampling   s   
z"CsanmtForTranslation.MGMC_samplingc              
   C   s  |d }|d }t jjjd|d t jd}|d rBt jjjdt jjjd t jjjd	||g|d
}	W d    n1 s<w   Y  n#t jjd t jjjd	||g|d
}	W d    n1 s`w   Y  t j|t j	dd d d df }
t 
||
gd}t jt |dt jd}|d d d df }t j|ddgddggdd}t |	t |t j}||d 9 }tt j|dd d}t j|ddgddgddggdd d d dd d f }|d dkrt|}t jj|dd|d   d}t||||d d ||d\}}| ||}|d }d|d  t j|d t jd }t jt |t j|||d}t ||j}t jj|t |d| }t j|dt j|d }|S )Nr_   r+   r,   r-   r.   r0   r1   r2   r4   r5   Target_Embeddingr7   r   r8   r9   r<   rq   causalr:   r;   r=   r>   rr   rA   rB   
states_key
states_valembedding_augmentationr   Z
confidence)depthon_value	off_value)logitsrf   ro   )r"   r#   r$   rC   rD   r%   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rT   rz   rO   rR   rU   transformer_decoder
predictionZone_hotr/   Z!softmax_cross_entropy_with_logitsZstop_gradient
reduce_sum)r   r[   rZ   rf   r   r   r_   r+   r6   trg_embeddingrX   trg_seqZtrg_maskZshift_trg_maskdecoder_inputdecoder_self_attention_biasdecoder_outputattention_weightsr   r   r   Zsoft_targetsmaskZxentropyr!   r   r   r   decoding_graph   s   
 



z#CsanmtForTranslation.decoding_graphc           
      C   s   |  ||\}}d }|d ur:|d ur:| |||}t||d ddg}t||d dddg}t||d dg}| j|||||d}	|	S )Nrm   r7   )r   )r\   r   r"   tiler   )
r   rW   rf   r   Zfeature_embeddingZlabel_embeddingr[   rZ   r   r!   r   r   r   build_training_graph)  s0   z)CsanmtForTranslation.build_training_graphc                    s  t | j}tjjjd|d | jd | jd }tjjj }t| t	| jd || j}tj
|tjd}| jd dkrGtjjj|}n)| jd d	krdtjjjj|| jd
 | jd | jd d}ntjjjd t  t|| jd }dd tjtj dd k  fdd fddd tjtjdd k fddfddddkrt }	t}
n g}	g}
dkrdd tD }ndg}g }g }t|D ]\}}t| tjjjtjj |dkrdnd dg tdd |f M | |	| |
| | j\}}| |	| |
| | j||}|| tjjjd! || d"d tjj! D }|j"||dd#}|| W d    n	1 sVw   Y  W d    n	1 sfw   Y  W d    n	1 svw   Y  qt#|t$| }t%|}|d$krt&t'| \}}t(||\}}t'||}|j)|tjjj d%}||fW  d    S 1 sw   Y  d S )&Nr   r5   num_gpusgradient_clip_normlearning_rate)valuer/   	optimizerZsgdZadamZ
adam_beta1Z
adam_beta2Zadam_epsilon)r   Zbeta1Zbeta2r   zoptimizer not supportedZupdate_cyclec                 S   s6   | }t |D ]}tj|| gdd}q|d |f }|S )Nr   r   )rw   r"   rI   )inputsr   outputsr}   r   r   r   	fill_gpuse  s
   zBCsanmtForTranslation.transformer_model_train_fn.<locals>.fill_gpusrq   r   c                      s
    S Nr   r   )rW   r   r   r   r   <lambda>n     
 zACsanmtForTranslation.transformer_model_train_fn.<locals>.<lambda>c                          S r   r   r   )rW   r   r   r   o      )predZtrue_fnZfalse_fnc                      s
    S r   r   r   )r   rf   r   r   r   r   r  r   c                      r   r   r   r   )rf   r   r   r   s  r   c                 S   s   g | ]}d | qS )zgpu:%dr   ).0dr   r   r   
<listcomp>}  s    zCCsanmtForTranslation.transformer_model_train_fn.<locals>.<listcomp>zcpu:0Tr2   z%s_%dZGPUzmle_loss_{}c                 S   s$   g | ]}d |j vrd|j vr|qS )ZSemantic_Embeddingmini_xlm_encoderre   )r   vr   r   r   r     s    

)var_listcolocate_gradients_with_opsr,   )global_step)*get_initializerr   r"   r#   r$   r%   trainZget_global_stepr   get_learning_rate_decayconvert_to_tensorrD   ZGradientDescentOptimizerZAdamOptimizerlogginginfosysexitMultiStepOptimizercondrz   shard_featuresrw   	enumeratedeviceZget_variable_scope
name_scoperk   r   r{   summaryZscalarformatZtrainable_variablescompute_gradientsadd_nr   average_gradientslistzipZclip_by_global_normapply_gradients)r   rW   rf   r6   r   r   r   r   optZfeature_shardsZlabel_shardsZdevicesZmulti_gradsZsharded_lossesr}   r   ri   rj   Zmle_lossZtrainable_vars_listgrads_and_varsZ
total_lossgradsr   _r    r   )rW   r   rf   r   r   r'   H  s   










 

&z/CsanmtForTranslation.transformer_model_train_fnc           	      C   s   |d }|d }|d r5|d rdnd}t jjj|dd t jjd	}W d    n1 s/w   Y  n
t jjd
t|g}t j|dd d }t |d|g}t j||dd}t |t 	||ggd}|S )Nr+   r_   $shared_embedding_and_softmax_weightsr0   r1   r   Tr2   r4   Softmaxrq   r8   Ztranspose_br   )
r"   r#   r$   r%   rF   Ztgt_vocab_sizerz   reshapematmulrI   )	r   r   r   r+   r_   embedding_scopeweightsrz   r   r   r   r   r     s(   
zCsanmtForTranslation.predictionFc	              
   C   sz  |d }	|d }
t jjjd|
d t jd}|d rBt jjjdt jjjd t jjjd	|	|
g|d
}W d    n1 s<w   Y  n#t jjd t jjjd	|	|
g|d
}W d    n1 s`w   Y  t |t 	|t j
}||
d 9 }tt j|dd d}t j|ddgddgddggdd d d dd d f }|d dkrt|}|s|d d dd d d f }|d d d d dd d d f }t||||||||d\}}|s|d d dd d f }|d d dd d f }n|}|}|d r|d rdnd}t jjj|dd t jjd	}W d    n	1 sw   Y  n
t jjd|	|
g}t j||dd}t j|}||||fS )Nr_   r+   r,   r-   r.   r0   r1   r2   r4   r5   r   r<   rq   r7   r   r   r   r8   r=   r>   r   r   Tr   r   )r"   r#   r$   rC   rD   r%   rE   rF   rM   rJ   rN   rT   rz   rL   rO   r   r   rR   Zlog_softmax)r   r[   ri   rZ   r   r   r   r   	is_prefixr_   r+   r6   r   r   r   r   r   Zdecoder_output_lastZattention_weights_lastr   r   r   Zlog_probr   r   r   inference_func  s   	

	

z#CsanmtForTranslation.inference_funcc           !         s$  
d 
d 
d 
d 	
d 
d |d }d|v r)|d }|d	 }nd }d }t |d
 t|}t|}|d urPt t|t j}t|}|
\d}
d r`d }j|
|dfddt	D fddt	D t	D ]}| 	t 
d d g | 	t 
d d g qfddt	D fddt	D d}|d urtt j|t dgd
gdd}	t |	d }t|	}
fddt	D }fddt	D }j|
||
dd\}}fddt	D fddt	D t|}t ||d d dd d f t |d d dd d f t jj }t |
d d d df t t t |dt jdgd}	t|	}	t j|	t dgd
gdd}	n	t dgd
}	t dgt jjgd   g}t |dg}t |}|	}t gt jj}t t gd
t j}t|	||ff|||fd}	
f
dd  fd!d"} fd#d$}t jd
d%d}tt 
g d&t 
d d gt 
d d gffd'dt	D fd(dt	D ft 
d d gt 
g d&t 
d d gfd}t j||||gt 
g |gdd)d*}|d }|jd
 }|jd }|jd
 }|jd }|jd } |	d d g |	d d g t jjt j|dd+||}t jjt j|dd+| |} |d d d d |d df }|| fS ),N	beam_sizer_   r+   num_decoder_layerslp_ratemax_decoded_trg_lenr   r   r   r   r]   r0   re   c                       g | ]}t  d gdqS r   r,   r"   fillr   layer
batch_sizer+   r   r   r          z4CsanmtForTranslation.beam_search.<locals>.<listcomp>c                    r   r   r   r   r   r   r   r   $  r   c                       g | ]	}t |  qS r   tile_to_beam_sizer   )r   r   r   r   r   -      c                    r   r   r   r   )r   r   r   r   r   1  r   r7   r?   r   r8   c                       g | ]}t  | qS r   merge_first_two_dimsr   r   r   r   r   ;      
c                    r   r   r   r   r   r   r   r   ?  r   Tr   r   c                       g | ]
}t |  qS r   split_first_two_dimsr   r   r   step_states_keyr   r   r   N      
c                    r   r   r   r   r   r   step_states_valr   r   r   S  r   r,   r   statefinishc           "   
      sd  |j d d \}}|j\t|}fddtD }fddtD }j	
|||dd\}}t|}t|d| }	fddtD fddtD td	tj	| d
 tj
d d }
|	|
 }t|d g}tjj|d d\}}| }| }t||}tj|d d d d d df t|dgdd}td d
gtdtj}tj||gdd}t|d}|tj	|tj
dtj
j  }tj|\} t| }t|  t| }tj|d d d d d df t|dgdd}td
gtdtj}tj||gdd} fddtD } fddtD }||
 }|j\}}}|dtj	|tj
d tj
j  }tj||gd
d}tj||gd
d}tj|\}}t||}td
gtdtj}tj||gdd}tj||gd
d} t| |} t|||f||f|| |fd}!| d
 |!fS )Nr?   c                    r   r   r   r   r   r   r   r   }  r   zOCsanmtForTranslation.beam_search.<locals>._beam_search_step.<locals>.<listcomp>c                    r   r   r   r   r   r   r   r     r   Fr   c                    r   r   r   r   r   r   r   r     r   c                    r   r   r   r   r   r   r   r     r         @r7   r.         @r8   )kr   r   c                    r   r   	gather_2dr   )alive_indicesnext_states_keyr   r   r     r   c                    r   r   r   r   )r  next_states_valr   r   r     r   rr   r   )r   r   r   rw   r   r   r"   rQ   powrJ   rD   r   rR   Ztop_kr  rI   r   constantrN   equalminr   BeamSearchState)"timer   Zseqs	log_probs	flat_seqsflat_states_keyflat_states_valstep_log_probsstep_attn_weightsZcurr_log_probsZlength_penaltyZcurr_scoresZ
top_scoresZtop_indicesZbeam_indicesZsymbol_indicesZcandidate_seqsZpad_seqsflagsalive_scoresZalive_symbols
alive_seqsZalive_states_keyZalive_states_valZalive_log_probsZprev_fin_flagsZprev_fin_seqsZprev_fin_scoresZstep_fin_scores	fin_flags
fin_scoresZfin_indicesfin_seqs	new_state)
r   r   r[   rZ   ri   r   r   r   r   r_   )r  r  r  r   r   r   r   r   _beam_search_stepx  s   












z;CsanmtForTranslation.beam_search.<locals>._beam_search_stepc                    s   |j d }|jd }|jd }tdtjtjd d  }|d d df | }tj|tj|tjd dd}dtjtj|ddtjd }|tjj| 7 }tj	t
||d	}	tt| t|	}
|
S )
Nr7   r   r?   r   r.   r   ro   rp   rr   r   )r   r   r"   r  rJ   rD   rv   
reduce_anyr  Z
reduce_allgreaterlogical_andlessZlogical_not)tsr  Zfinished_flagsZfinished_scoresZmax_lpZbest_alive_scoreZworst_finished_scoreZadd_maskZbound_is_metr   )r   r   r   r   _is_finished  s8   


z6CsanmtForTranslation.beam_search.<locals>._is_finishedc                    s    | |}|S r   r   )r  r  Zouts)r  r   r   _loop_fn  s   
z2CsanmtForTranslation.beam_search.<locals>._loop_fnr
  NNNc                       g | ]}t d d d  gqS r   r"   TensorShaper   r+   r   r   r     r   c                    r#  r   r$  r   r&  r   r   r     r   F)r   bodyZ	loop_varsshape_invariantsZparallel_iterationsZ	back_propr  ) r"   rz   r   r   rJ   rN   r\   rd   rw   	set_shaper%  rI   r   r   whereZ	ones_likerD   r  rQ   Zargmaxr   r  r   rG   boolr	  Z
while_loopr   r   r#   r$   r  )!r   rW   r   Z	src_inputr   r   rg   r   Zfixed_lengthZ	init_seqsr  r  r  r  r  r  Zinit_log_probsZinit_scoresr  r  r  r   r   r!  r
  r(  r   Zfinal_stater  r  Zfinal_flagsZ
final_seqsZfinal_scoresr   )r  r   r   r[   rZ   ri   r+   r   r   r   r   r   r   r   r   r   r_   r   r&     s6  






"
u






z CsanmtForTranslation.beam_searchr"  r   )rl   NN)__name__
__module____qualname__r   r   strr   r+  r(   r)   r\   rd   rk   r   r   r   r'   r   r   r&   __classcell__r   r   r   r   r	      sB    





"#
20
%
O
e
Cc                   @   s   e Zd ZdS )r	  N)r-  r.  r/  r   r   r   r   r	  6  s    r	  r   c                 C   s0   t j| dd} dg| jj }||d< t | |S )z#Tiles a given tensor by beam_size. r7   r   )r"   rQ   rz   ndimsr   )r:   r   Z	tile_dimsr   r   r   r   ;  s   r   c                 C   sp   t | } | jjd u rt | S | j }t | }g }tt|D ]}|| }|d u r0|| }|| q"|S r   )r"   r   rz   dimsas_listrw   r   r{   )xZstatic_shapeZdynamic_shaperetr}   dimr   r   r   infer_shapeD  s   



r8  c                 C   s,   t | }|g|g |dd   }t| |S Nr7   )r8  r"   r   )r:   Zdim_0Zdim_1rz   	new_shaper   r   r   r   W  s   r   c                 C   s2   t | }|d  |d 9  < |d t| |S )Nr   r7   )r8  popr"   r   )r:   rz   r   r   r   r   ]  s   
r   c                 C   sd   t | d }t |d }t || | }t |||g}t j||gdd}t j| ||d}|S )z Gather the 2nd dimension given indices
    :param params: A tensor with shape [batch_size, M, ...]
    :param indices: A tensor with shape [batch_size, N]
    :param name: An optional string
    :return: A tensor with shape [batch_size, N, ...]
    r   r7   r8   r   re   )r"   rz   rw   r   stackZ	gather_nd)r   indicesrb   r   Z
range_sizeZ	batch_posoutputr   r   r   r  d  s   r  Tc                 C   s  t jjj|d| g|d t| ttfs| g} dd | D }t| t|kr*tdt j	t 
| d d d |ggdd}d	d | D } g }|rht|}t 	| d
} ||g}	t jjd|	}
|t | |
 n&tt|D ]}|| |g}	d| }t jj||	}
|t | | |
 qnt |}|r|g}	t jjd|	}t j||}t ||}|W  d    S 1 sw   Y  d S )Nlinear)default_namevaluesr/   c                 S   s   g | ]}|  d  qS r8   )	get_shape)r   itemr   r   r   r   {  s    zlinear.<locals>.<listcomp>z inputs and input_size unmatched!r   r8   r   c                 S   s"   g | ]}t |d |jd  gqS rB  )r"   r   rz   )r   inpr   r   r   r     s   " r7   matrixz	matrix_%dbias)r"   r#   r$   r%   
isinstancer   tupler   RuntimeErrorrI   rz   sumrF   r{   r   rw   r   rR   rS   r   )r   Zoutput_sizerG  rI   r/   rc   Z
input_sizeZoutput_shaperesultsrz   rF  r}   rb   r>  r   r   r   r?  u  s@   

$r?  ư>c           
      C   s   t jjj|d| g|dP |   d }t jjjd|gt  d}t jjjd|gt  d}t 	| dd}t 	t 
| | dd}| | t jj||  }	|	| | W  d    S 1 s_w   Y  d S )N
layer_norm)r@  rA  r3   r8   Zlayer_norm_scaler5   Zlayer_norm_offsetT)r"   r#   r$   r%   rC  r4  rF   Zones_initializerZzeros_initializerreduce_meanZsquareZrsqrt)
r   r   rb   r3   Zchannel_sizescaleoffsetmeanZvarianceZnorm_inputsr   r   r   rN    s"   

$rN  c                 C   s,   |r|dkr| S |dkrt | S td| )NnonerN  Unknown mode %s)rN  r`   )r5  moder   r   r   _layer_process  s
   rV  c                 C   s(   |r|dk rt jj|d| d}| | S )Nrr   r7   rB   )r"   rR   rU   )r5  y	keep_probr   r   r   _residual_fn  s   rY  c              	   C   s   |d }d|d  }t jjj|d| |gd^ t jjd t||dd}t j|}W d    n1 s6w   Y  |rK|dk rKt jj|d| d	}t jjd
 t||dd}W d    n1 sdw   Y  | | W  d    S 1 sww   Y  d S )Nr+   rr   relu_dropoutembedding_augmentation_layerr@  rA  input_layerTr7   rB   output_layerr"   r#   r$   r%   r?  rR   ZrelurU   )r5  r   r   rb   r+   rX  hiddenr>  r   r   r   r[    s$   $r[  c              	   C   s   |d }|d }d|d  }t jjj|d| gd\ t jjd t| |dd}t j|}W d    n1 s9w   Y  |rN|dk rNt jj|d	| d
}t jjd t||dd}W d    n1 sgw   Y  |W  d    S 1 sxw   Y  d S )Nfilter_sizer+   rr   rZ  Z	ffn_layerr\  r]  Tr7   rB   r^  r_  )r5  r   rb   ra  r+   rX  r`  r>  r   r   r   transformer_ffn_layer  s"   $rb  encoderc                 C   s\  |d }|d }|d }|d }|d }	|d }
|d }| }t |d}t jjj|t jjjd	t t|D ]b}t jjd
| O |d dkrK|d nd }tt||
d ||||||	|dd
\}}t	||d| }t||}t
t||
|}t	||d| }t||}t ||}W d    n1 sw   Y  q5t||
W  d    S 1 sw   Y  d S )Nnum_encoder_layersr+   	num_headsrA   attention_dropoutlayer_preproclayer_postprocr?   r2   layer_%dr=   relativemax_relative_disencoder_self_attentionrk  rb   rr   )r"   rQ   r#   r$   r%   rE   rw   multihead_attentionrV  rY  rb  rP   )rY   rZ   r   r   rb   rd  r+   re  rA   rf  rg  rh  r5  r   rk  owr   r   r   rV     sP   



$rV   r   c                 C   s  |d }|d }|d }|d }|d }	|d }
|d }| }t |d}t jjj|t jjjd	 t|D ]Z}t jjd
| G |d }tt||
d ||||||	|dd
\}}t	||d| }t||}t
t||
|}t	||d| }t||}t ||}W d    n1 sw   Y  q5t jjjdt jjjd	" t j|ddt j|dd }tt j|dd|dd}W d    n1 sw   Y  t||
W  d    S 1 sw   Y  d S )NZnum_semantic_encoder_layersr+   re  rA   rf  rg  rh  r?   r2   ri  rk  rl  rm  rr   Zpooling_layerr7   r  r   T)r"   rQ   r#   r$   r%   rE   rw   rn  rV  rY  rb  rP   r   r?  )rY   rZ   r   r   rb   rd  r+   re  rA   rf  rg  rh  r5  r   rk  ro  rp  r>  r   r   r   ra     sf   




$ra   decoderc	                 C   s  |d }	|d }
|d }|d }|d }|d }|d }| }t jjj|t jjjd t|	D ]}t jjd	| ~ |d
 dkrE|d nd }|d urYt|t|||}t||}tt||d ||
|
|
||||||dd\}}t	||d| }t||}tt|||||
|
|
|||dd
\}}t	||d| }t||}t
t|||}t	||d| }t||}W d    n1 sw   Y  q/t|||fW  d    S 1 sw   Y  d S )Nr   r+   re  rA   rf  rg  rh  r2   ri  r=   rj  rk  Zdecoder_self_attention)r   r   r   rk  rb   rr   Zencdec_attentionrm  )r"   r#   r$   r%   rE   rw   r[  rV  rn  rY  rb  )r   r[   r   Zencoder_decoder_attention_biasr   r   r   r   rb   r   r+   re  rA   rf  rg  rh  r5  r   rk  ro  rp  r   r   r   r   H  s~   	





-$r   rr        @c              	   C   s  t | d }t | d }t t |t j}|d }tt|t| t |t jd  }|t t t |t j|   }t 	|dt 	|d }	t j
t |	t |	gdd}
t |
ddgdt jj|dgg}
t |
d||g}
| t |
| j S )Nr7   r?   r   r   )r"   rz   rJ   rw   rD   rs   logfloatexprQ   rI   sincosrL   r#   r$   modr   r/   )r5  Zmin_timescaleZmax_timescalelengthZchannelspositionZnum_timescalesZlog_timescale_incrementZinv_timescalesZscaled_timesignalr   r   r   rO     s"   &$rO       ec                 C   s   |d u rt j}|t jkr|j}|dkr&| }d| | }t t |dd}n+|dkrK| }t jt ||gddd}|d|  }t |dd||g}ntd| t 	||S )Nr@   rr   r7   r   r8   r   rT  )
r"   rD   r  rQ   ZlinalgZ	band_partr   r   r`   rJ   )r   rU  infr/   r   r6  ry  Zlower_triangler   r   r   rT     s"   
rT   c           	      C   s   |}|   j}| jj}|d }|d d |g |r|| nd g }t| tt| d d |dggd}|| d|d gdd td|d D  |g }t	||S )Nr8   r   r7   c                 S   s   g | ]}|qS r   r   )r   r}   r   r   r   r     s    zsplit_heads.<locals>.<listcomp>)
rC  r3  rz   r2  r"   r   rI   r)  rw   	transpose)	r5  re  n	old_shaper2  lastr:  r6  permr   r   r   split_heads  s   
$*
*r  r,   c              	   C   s  t jjj|d| ||gd t | }|d |d |d |d f\}}	}
}t |d }t |d }|d ur@|d |d }}|d u rMt j| |d	d
}n9t j| |d	d
}t t | g d|
||	 |g} t | t |g d}t t |g d||	|
|g}|| }|d ur||7 }t jj	|dd}|dkrt j
|d| }|d u rt |||fW  d    S t ||}t t |g d|
||	 |g}t ||}t t |g d||	|
|g}|| }t t |g d||	|
|g}||fW  d    S 1 sw   Y  d S )Ndot_product_attentionr\  r   r7   r?      rpr_krpr_vTr   )r?   r   r7   r  )r   r?   r7   )r7   r   r?   r   re   r,   rr   )r"   r#   r$   r%   rz   r   r   r~  rR   ZsoftmaxrU   )qr   r   rG  dropout_raterb   rprZq_shapebsZhdZlqZdkZlkZdvr  r  r   Zlogits_part1Zlogits_part2r   Zoutputs_part1Zoutputs_part2r   r   r   r   r    sd   
$%
&r  c                 C   s   t | g d} |  j}|dd  \}}|d d |r"|r"|| nd g }t | t t | d d dggd} | | | S )N)r   r?   r7   r  r8   r   )r"   r~  rC  r3  r   rI   rz   r)  )r5  r  abr:  r   r   r   combine_heads  s   
"(
r  
create_rprc           	      C   s   t |G t t |ddg}t t |ddg}|| }|| }t |d}t |d| }|| d d d f }t | |}|W  d    S 1 sOw   Y  d S )Nr8   r7   r   r?   )r"   r   r   rw   maximumminimumrM   )	Zorginal_varlength_q	length_kvrk  rb   ZidxsZidysZidsr  r   r   r   r    s   $c              	   C   sb  || dkrt dt|f || dkrt dt|f tjjj|d| |gd |d u rIt| |d | dddd	}tj||||gdd
\}}}n t| |dddd	}t||| dddd	}tj|||gdd
\}}|d ur|tj	||
 |gdd
 }||
< |	d urtj	|	|
 |gdd
 }|	|
< t
||}t
||}t
||}|| }||d 9 }t|d }t|d }|d u r|d urtjjdd| d || g}tjjdd| d || g}t||||}t||||}||d}t||||||d\}}n
t|||||\}}t|}t|d}t||dddd	}||fW  d    S 1 s*w   Y  d S )Nr   zFKey size (%d) must be divisible by the number of attention heads (%d).zHValue size (%d) must be divisible by the number of attention heads (%d).rn  r\  r?   TZqkv_transform)rc   r   Zq_transformZkv_transformr7   r-   r  r  )r  r  )r  Zoutput_transform)r`   Zkey_sizeZ
value_sizer"   r#   r$   r%   r?  splitrI   r  rz   rF   r  r  r  rO  )ZqueriesZmemoriesrG  Z	key_depthZvalue_depthZoutput_depthre  r  r   r   r   rk  rb   combinedr  r   r   Zkey_depth_per_headr  r  r  r  r  r5  rp  r   r   r   rn    s   




&rn  c                 C   s   | d dkr| d }t jj| |S | d dkr#t jjd| d S | d dkr5t jjj| d dddS | d d	krGt jjj| d dddS td
| d  )Nr6   uniformZinitializer_scalery   r,   Znormal_unit_scalingZfan_avg)rU  distributionZuniform_unit_scalingzUnrecognized initializer: %s)r"   r#   r$   Zrandom_uniform_initializerrC   Zvariance_scaling_initializerr`   )r   Zmax_valr   r   r   r   m  s(   
r   c                 C   s   |d dv r4t j|t jd}t j|d t jd}|d d }|t |d |d  |d d  }| | S |d d	krNt jjjt j|t jd|d
 |d S |d dkrV| S t	d)NZlearning_rate_decay)Zlinear_warmup_rsqrt_decayZnoamr.   warmup_stepsr+   r-   r7   g      piecewise_constantZlearning_rate_boundariesZlearning_rate_valuesrS  zUnknown learning_rate_decay)
r"   rJ   rD   r  r#   r$   r   r  rN   r`   )r   r   r   stepr  Z
multiplierZdecayr   r   r   r     s    

r   c           
      C   sv   g }t |  D ]2}g }|D ]\}}t|d}|| qtjd|d}t|d}|d d }||f}	||	 q|S )Nr   )rp   rA  r7   )r   r"   rQ   r{   rI   rO  )
Ztower_gradsZaverage_gradsZgrad_and_varsr   gr   Z
expanded_ggradr   Zgrad_and_varr   r   r   r     s   r   c                 C   s   t d u r| S t j| t jjdS )N)compression)_ENGINEZ	allreduceZCompressionZfp16)r:   r   r   r   
all_reduce  s   r  c                       sT   e Zd Z			 d fdd	Zdd Zdejjjj	j
dddfdd	Zdd
dZ  ZS )r   r7   Fc                    s2   t t| || || _|| _tj|dd| _d S )Nr  re   )r   r   r   
_optimizer_stepr"   r   _step_t)r   r   r  use_lockingrb   r   r   r   r     s   zMultiStepOptimizer.__init__c                 C   sl   t | jd $ |d u r|W  d    S t|t jr!t |}t|W  d    S 1 s/w   Y  d S )NZ
_Allreduce)r"   r   _namerH  IndexedSlicesr   r  )r   r:   r   r   r   _all_reduce  s   
$zMultiStepOptimizer._all_reduceNc                    s&  j ||||||}tt| \}}jdkr(fdd|D }tt||S t|dd d}	jjdkr9dndd|	d	}
g }t||D ]E\}}|d
j t	|t
jret
j |j|jjd n	t
j |jd  fdd} fdd}t
t
|
d||}|| qFtt||S )Nr7   c                    s   g | ]}  |qS r   )r  )r   r  )r   r   r   r     s    z8MultiStepOptimizer.compute_gradients.<locals>.<listcomp>c                 S   s   | j S r   re   )r5  r   r   r   r     s    z6MultiStepOptimizer.compute_gradients.<locals>.<lambda>)keyr   iter)initial_valuerb   Zcolocate_withgrad_accr  c                      r   r   r   r   )r  r   r   	_acc_grad  s   z7MultiStepOptimizer.compute_gradients.<locals>._acc_gradc                      s     j S r   )r  r  r   r  r   r   r   	_avg_grad  s   z7MultiStepOptimizer.compute_gradients.<locals>._avg_grad)r  r   r   r   r  r  Z_create_non_slot_variableZ_zeros_slotr  rH  r"   r  Zscatter_addr=  rA  _use_lockingZ
assign_addr   r  r{   )r   r!   r   Zgate_gradientsZaggregation_methodr   Z	grad_lossr   r   Z	first_variter_varZ	new_gradsr  varr  r  r   r  r   r     s@   
z$MultiStepOptimizer.compute_gradientsc           	         s   j dkrjj| dS tt| \fdd} fdd}dt }tt	|d||}t
|g |jt|d jjd	}W d    n1 sYw   Y  tj||g S )
Nr7   re   c                      s
   t j  S r   )r"   groupr   )r   r   r   _pass_gradients  s   
z;MultiStepOptimizer.apply_gradients.<locals>._pass_gradientsc                     s   j t } t| g( g }D ]}|d}||jt|j	d qtj
| }W d    n1 s<w   Y  tj
| |g S )Nr  r  )r  r   r   r"   control_dependenciesZget_slotr{   assignrG   r  r  )opZzero_opsr  r  Zzero_opr   r   rb   r   r   r   r   _apply_gradients  s    	z<MultiStepOptimizer.apply_gradients.<locals>._apply_gradientsr  r   r  )r  r  r   r   r   Z_get_non_slot_variabler"   Zget_default_graphr   r  r  r  rx  r  r  r  )	r   r   r   rb   r  r  r  Z	update_opZiter_opr   r  r   r     s$   
z"MultiStepOptimizer.apply_gradients)r7   Fr   r,  )r-  r.  r/  r   r  r"   r#   r$   r   	OptimizerZGATE_OPr   r   r1  r   r   r   r   r     s    

2r   c                    s   t | } t | d  g }t d- tD ] }|t t t jj	
 | fdd fdd qW d    n1 sCw   Y  t j| |ddS )Nr   z/cpu:0c                      s     d S r9  r   r   r   num_datashardsr   r   r   #  s    z shard_features.<locals>.<lambda>c                      s     S r   r   r   r  r   r   r   $  s    r   )r"   r   rz   r   rw   r{   r   r  r#   r$   rx  r  )r5  r  Zsize_splitsr}   r   r  r   r     s$   
	r   r   )TNN)rM  NN)rr   rr  )r|  N)r,   NN)r  )NNr   NN)5rs   collectionsr   typingr   Z
tensorflowr"   Zmodelscope.metainfor   Zmodelscope.models.baser   r   Zmodelscope.models.builderr   Zmodelscope.utils.constantr   __all__Zregister_moduletranslationr	   r	  r   r8  r   r   r  r?  rN  rV  rY  r[  rb  rV   ra   r   rO   rT   r  r  r  r  rn  r   r   r   r  r  r#   r$   r   r  r   r   r   r   r   r   <module>   s         
)	


+
	


.
6

C

=

Ol