o
    *j                     @   s*  d dl Z d dlmZmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZmZmZ d dlmZmZmZmZmZmZmZmZmZ d dlm Z  d dl!m"Z# d d	lm$Z$ d
Z%d
Z&e'dZ(G dd deZ)G dd deZ*G dd deZ+dd Z,dddZ-dd Z.dd Z/dS )    N)AnyDictListOptionalTuple)utils)	fsdp_wrap)FairseqEncoderFairseqEncoderDecoderModelFairseqIncrementalDecoderregister_modelregister_model_architecture)	AdaptiveSoftmax	BaseLayerFairseqDropoutLayerDropModuleList	LayerNormPositionalEmbeddingSinusoidalPositionalEmbeddingTransformerDecoderLayerTransformerEncoderLayer)checkpoint_wrapper)quant_noise)Tensori   g    חAc                       s  e Zd ZdZ fddZedd Zedd Zed+d	d
Z	edd Z
edd Z				d,dededee dee fddZejj	d+deeeeeeee  f  f dedeeeef  fddZ	d-deeee f deeeeee f f defd d!Z	d-deeee f deeeeee f f defd"d#Zd$eeef fd%d&Zdeeeee f  fd'd(Zdeeeeee f f fd)d*Z  ZS ).
CanmtModela%  

    Args:
        encoder (TransformerEncoder): the encoder
        decoder (TransformerDecoder): the decoder

    The CanmtModel provides the following named architectures and
    command-line arguments:

    .. argparse::
        :ref: fairseq.models.transformer_parser
        :prog:
    c                    s0   t  || || _d| _|| _|| _|| _d S )NT)super__init__argsZsupports_align_argsencoderdecodersecond_decoder)selfr   r   r   r    	__class__ h/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/canmt/canmt_model.pyr   -   s   
zCanmtModel.__init__c                 C   s  | j dt dd | j dtddd | j dtdd	d | j d
dtddd | j dtddd | j dtddd | j dtddd | j dtddd | j dtddd | j dddd | j dddd | j dtdd d | j d!tdd"d | j d#tdd$d | j d%tdd&d | j d'tdd(d | j d)dd*d | j d+dd,d | j d-tdd.d | j d/dd0d | j d1dd2d | j d3d4dd5d6 | j d7d8d9d:f | j d;tdd<d | j d=dd>d | j d?dd@d | j dAddBd | j dCddDd | j dEd4ddFd6 | j dGd4ddHd6 | j dItddJdKdL | j dMtddJdNdL | j dOdPdQdR | j dSdPdQdR | j dTtddJdUdL | j dVtddWdXdL | j dYtddJdZdL | j d[tdtd\dL dPS )]z+Add model-specific arguments to the parser.z--activation-fnzactivation function to use)choiceshelpz	--dropoutDzdropout probability)typemetavarr'   z--attention-dropoutz)dropout probability for attention weightsz--activation-dropoutz--relu-dropoutz,dropout probability after activation in FFN.z--encoder-embed-pathZSTRz%path to pre-trained encoder embeddingz--encoder-embed-dimNzencoder embedding dimensionz--encoder-ffn-embed-dimz#encoder embedding dimension for FFNz--encoder-layersznum encoder layersz--encoder-attention-headsznum encoder attention headsz--encoder-normalize-before
store_truez)apply layernorm before each encoder block)actionr'   z--encoder-learned-posz0use learned positional embeddings in the encoderz--decoder-embed-pathz%path to pre-trained decoder embeddingz--decoder-embed-dimzdecoder embedding dimensionz--decoder-ffn-embed-dimz#decoder embedding dimension for FFNz--decoder-layersznum decoder layersz--decoder-attention-headsznum decoder attention headsz--decoder-learned-posz0use learned positional embeddings in the decoderz--decoder-normalize-beforez)apply layernorm before each decoder blockz--decoder-output-dimzPdecoder output dimension (extra linear layer if different from decoder embed dimz"--share-decoder-input-output-embedz)share decoder input and output embeddingsz--share-all-embeddingszWshare encoder, decoder and output embeddings (requires shared dictionary and embed dim)z --no-token-positional-embeddingsFz?if set, disables positional embeddings (outside self attention))defaultr-   r'   z--adaptive-softmax-cutoffEXPRzacomma separated list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion)r*   r'   z--adaptive-softmax-dropoutz6sets adaptive softmax dropout for the tail projectionsz--layernorm-embeddingzadd layernorm to embeddingz--no-scale-embeddingzif True, dont scale embeddingsz--checkpoint-activationszicheckpoint activations at each layer, which saves GPU memory usage at the cost of some additional computez--offload-activationszUcheckpoint activations at each layer, then save to gpu.Sets --checkpoint-activations.z--no-cross-attentionzdo not perform cross-attentionz--cross-self-attentionzperform cross+self-attentionz--encoder-layerdropr   z!LayerDrop probability for encoder)r)   r*   r.   r'   z--decoder-layerdropz!LayerDrop probability for decoderz--encoder-layers-to-keepNz=which layers to *keep* when pruning as a comma-separated list)r.   r'   z--decoder-layers-to-keepz--quant-noise-pqz0iterative PQ quantization noise at training timez--quant-noise-pq-block-size   z1block size of quantization noise at training timez--quant-noise-scalarzBscalar quantization noise and scalar quantization at training timez--min-params-to-wrapad  minimum number of params for a layer to be wrapped with FSDP() when training with --ddp-backend=fully_sharded. Smaller values will improve memory efficiency, but may make torch.distributed communication less efficient due to smaller input sizes. This option is set to 0 (i.e., always wrap) when --checkpoint-activations or --offload-activations are passed.)add_argumentr   Zget_available_activation_fnsfloatstrintDEFAULT_MIN_PARAMS_TO_WRAP)parserr$   r$   r%   add_args5   s  
zCanmtModel.add_argsc                 C   sr  t | |jrt|jd|_|jrt|jd|_t|dddu r't|_	t|dddu r2t
|_|j|j}}|jrk||krDtd|j|jkrNtd|jr[|j|jkr[td| |||j|j}|}d|_n| |||j|j}| |||j|j}t|d	d
rd|_| |||}| |||}| |||}	|jst|dt}
t||
d}t||
d}| ||||	S )zBuild a new model instance.,max_source_positionsNmax_target_positionsz3--share-all-embeddings requires a joined dictionaryzP--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dimz?--share-all-embeddings not compatible with --decoder-embed-pathToffload_activationsFmin_params_to_wrapZmin_num_params)base_architectureencoder_layers_to_keeplensplitencoder_layersdecoder_layers_to_keepdecoder_layersgetattrDEFAULT_MAX_SOURCE_POSITIONSr9   DEFAULT_MAX_TARGET_POSITIONSr:   Z	vocab_srcZ	vocab_tgtshare_all_embeddings
ValueErrorencoder_embed_dimdecoder_embed_dimdecoder_embed_pathencoder_embed_pathbuild_embedding share_decoder_input_output_embedcheckpoint_activationsbuild_encoderbuild_decoderr5   r   )clsr   tasksrc_dicttgt_dictZencoder_embed_tokensZdecoder_embed_tokensr   r   r    r<   r$   r$   r%   build_model   sh   zCanmtModel.build_modelNc           	      C   s<   t |}| }t|||}|rt|}t||| |S N)r@   pad	Embeddingr   Zparse_embeddingZload_embedding)	rS   r   
dictionary	embed_dimpathnum_embeddingspadding_idxZembZ
embed_dictr$   r$   r%   rN   9  s   
zCanmtModel.build_embeddingc                 C   s   t |||S rX   )TransformerEncoder)rS   r   rU   embed_tokensr$   r$   r%   rQ   E  s   zCanmtModel.build_encoderc                 C   s   t |||t|dddS )Nno_cross_attentionF)no_encoder_attn)TransformerDecoderrE   )rS   r   rV   ra   r$   r$   r%   rR   I  s   
zCanmtModel.build_decoderTFreturn_all_hiddensfeatures_onlyalignment_layeralignment_headsc	              
   C   s   | j |||d}	| j||	|||||d}
| j|d|d||||d}|d d }|d d }|g|gd	}| j|||||d|d}|
||fS )
z
        Run the forward pass for an encoder-decoder model.

        Copied from the base class, but without ``**kwargs``,
        which are not supported by TorchScript.
        )src_lengthsre   )encoder_outrf   rg   rh   ri   re   NT)rj   rf   full_context_alignmentrg   rh   ri   re      
last_layerself_attn_padding_mask)rj   encoder_padding_mask)r   r   r    )r!   
src_tokensri   prev_output_tokensprev_src_tokensre   rf   rg   rh   rj   decoder_outZdecoder_out_redecoder_out_tensorZdecoder_paddingZdecoder_kvsZsrc_outr$   r$   r%   forwardR  sN   


	zCanmtModel.forward
net_output	log_probssamplec                 C   s   |  |||S )z@Get normalized probabilities (or log probs) from a net's output.)Zget_normalized_probs_scriptable)r!   rv   rw   rx   r$   r$   r%   get_normalized_probs  s   zCanmtModel.get_normalized_probs      ?encoder_outsincremental_statestemperaturec                 C   s  d }|}| j j|||d}d }t|}|dkrN|d d urNt|d tr)|d }n|d d }	t|	tr7|	}n|	d ur?|	d }|d urN|d d dd d f }|d d d dd d d f ||dkred n|d f}
| j|
dd d}|d d dd d f }|d d }|||fS )	N)rj   incremental_staterl   attnr   Trw   rx   rm   )r   ru   r@   
isinstancer   div_ry   r!   tokensr{   r|   r}   rj   rs   r   Zdecoder_lenZattn_holderZdecoder_out_tupleZprobsrt   r$   r$   r%   forward_decoder  s8   

"
zCanmtModel.forward_decoderc                 C   s  d }|}| j j||d}d }t|}|dkrM|d d urMt|d tr(|d }n|d d }	t|	tr6|	}n|	d ur>|	d }|d urM|d d dd d f }|d d d dd d d f ||dkrdd n|d f}
| j|
dd d}|d d dd d f }|d d }||||fS )	N)rj   rl   r   r   r   Tr   rm   )r    ru   r@   r   r   r   ry   r   r$   r$   r%   forward_decoder_src  s4   

"zCanmtModel.forward_decoder_src	net_inputc                 C   s   dd |  D }| j|S )Nc                 S   s.   i | ]\}}|d kr|dkr|dkr||qS )rq   rr   sourcesr$   ).0kvr$   r$   r%   
<dictcomp>  s
    z.CanmtModel.forward_encoder.<locals>.<dictcomp>)itemsr   Zforward_torchscript)r!   r   Zencoder_inputr$   r$   r%   forward_encoder  s   zCanmtModel.forward_encoderc                 C   s   |dusJ | j ||S )  
        Reorder encoder output according to *new_order*.

        Args:
            encoder_out: output from the ``forward()`` method
            new_order (LongTensor): desired order

        Returns:
            *encoder_out* rearranged according to *new_order*
        N)r   reorder_encoder_out)r!   r{   	new_orderr$   r$   r%   r     s   zCanmtModel.reorder_encoder_outc                 C   s   | j || d S rX   )r   Z#reorder_incremental_state_scripting)r!   r|   r   r$   r$   r%   reorder_incremental_state  s   z$CanmtModel.reorder_incremental_staterX   )TFNN)rz   ) __name__
__module____qualname____doc__r   staticmethodr7   classmethodrW   rN   rQ   rR   boolr   r4   ru   torchjitexportr   r   r   r3   r   ry   r2   r   r   r   r   r   __classcell__r$   r$   r"   r%   r      sv    
 J
9

	
<
-
%r   c                       s   e Zd ZdZ fddZdd Z	ddeej fdd	Z			
	ddeej de
deej fddZ		
	ddeej de
deej fddZejjdeeee f fddZdd Zdd Z  ZS )r`   aI  
    Transformer encoder consisting of *args.encoder_layers* layers. Each layer
    is a :class:`TransformerEncoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): encoding dictionary
        embed_tokens (torch.nn.Embedding): input embedding
    c                    sp   _ t | dtdg t jjj	d_
 j_|j}|j_ j_|_ jr4dnt|_ jsHt j|j jdnd _t dd}t ddr_t||d	_nd _ jsz jd
krzttj||dd j j _!nd _!jdkrt"jd_#nt$g _#j#% fddt& j'D  t(j#_) j*rt||d	_+d S d _+d S )Nversion   module_namerz   Zlearnedr   Flayernorm_embeddingr   r   bias        pc                    s   g | ]}  qS r$   )build_encoder_layer)r   ir   r!   r$   r%   
<listcomp>=  s    
z/TransformerEncoder.__init__.<locals>.<listcomp>),r   r   r   register_bufferr   r   r   dropoutr#   r   dropout_moduleencoder_layerdropembedding_dimr_   r9   ra   no_scale_embeddingmathsqrtembed_scaleno_token_positional_embeddingsr   encoder_learned_posembed_positionsrE   r   r   adaptive_inputquant_noise_pqapply_quant_noise_nnLinearquant_noise_pq_block_sizer   r   layers
ModuleListextendrangerB   r@   
num_layersencoder_normalize_before
layer_norm)r!   r   r[   ra   r\   r   r"   r   r%   r     sX   
	

zTransformerEncoder.__init__c                 C   sT   t |}t|dd}|rt|dd}t||d}|s t|dtnd}t||d}|S NrP   Fr;   )offload_to_cpur<   r   r=   )r   rE   r   r5   r   )r!   r   layer
checkpointr   r<   r$   r$   r%   r   G  s   z&TransformerEncoder.build_encoder_layerNtoken_embeddingc                 C   sr   |d u r	|  |}| j|  }}| jd ur|| | }| jd ur&| |}| |}| jd ur5| |}||fS rX   )ra   r   r   r   r   r   )r!   rp   r   xZembedr$   r$   r%   forward_embeddingS  s   






z$TransformerEncoder.forward_embeddingFri   re   token_embeddingsc                 C   s   |  ||||S )  
        Args:
            src_tokens (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            src_lengths (torch.LongTensor): lengths of each source sentence of
                shape `(batch)`
            return_all_hiddens (bool, optional): also return all of the
                intermediate hidden states (default: False).
            token_embeddings (torch.Tensor, optional): precomputed embeddings
                default `None` will recompute embeddings

        Returns:
            dict:
                - **encoder_out** (Tensor): the last encoder layer's output of
                  shape `(src_len, batch, embed_dim)`
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`
                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
                  of shape `(batch, src_len, embed_dim)`
                - **encoder_states** (List[Tensor]): all intermediate
                  hidden states of shape `(src_len, batch, embed_dim)`.
                  Only populated if *return_all_hiddens* is True.
        )forward_scriptable)r!   rp   ri   re   r   r$   r$   r%   ru   c  s   zTransformerEncoder.forwardc                 C   s   | | j}|jjdkp| }| ||\}}|r&|d|d|  }|dd}g }	|r5|		| | j
D ]}
|
||r@|ndd}|rQ|	dusLJ |		| q8| jdur\| |}|g|g|g|	g g dS )r   Zxlarl   r   r   N)ro   rj   ro   encoder_embeddingencoder_statesrp   ri   )eqr_   devicer)   anyr   Z	unsqueezeZtype_as	transposeappendr   r   )r!   rp   ri   re   r   ro   Zhas_padsr   r   r   r   r$   r$   r%   r     s>   




z%TransformerEncoder.forward_scriptablerj   c                 C   s$  t |d dkrg }n|d d d|g}t |d dkr!g }n|d d d|g}t |d dkr7g }n|d d d|g}t |d dkrMg }n|d d d|g}t |d dkrcg }n|d d d|g}|d }t |dkrt|D ]\}	}
|
d|||	< q|||||||d	S )
r   rj   r   rl   ro   r   rp   ri   r   r   )r@   Zindex_select	enumerate)r!   rj   r   Znew_encoder_outZnew_encoder_padding_maskZnew_encoder_embeddingrp   ri   r   idxstater$   r$   r%   r     sB   z&TransformerEncoder.reorder_encoder_outc                 C       | j du r| jS t| j| j jS )z.Maximum input length supported by the encoder.N)r   r9   minmax_positionsr!   r$   r$   r%   r     
   
z TransformerEncoder.max_positionsc                 C   s   t | jtr#d|}||v rtd| ||= td|d|< t| jD ]}| j	| 
|d|| q(d|}t||tdgd dk r\d	| _d
| _tdg||< |S )@Upgrade a (possibly old) state dict for new versions of fairseq.{}.embed_positions.weightszdeleting {0}rl    {}.embed_positions._float_tensorz{}.layers.{}
{}.versionr      NF)r   r   r   formatprintr   FloatTensorr   r   r   upgrade_state_dict_namedr   itemgetr   r   	normalize)r!   
state_dictnameweights_keyr   version_keyr$   r$   r%   r     s&   


"z+TransformerEncoder.upgrade_state_dict_namedrX   )NFN)r   r   r   r   r   r   r   r   r   r   r   ru   r   r   r   r   r3   r   r   r   r   r   r$   r$   r"   r%   r`     s@    
5

$
G<r`   c                       sz  e Zd ZdZ		d  fdd	Zdd Zd!dd	Z								d"d
eee	e
e f  deee	ee	ee f f  dededee dee dee defddZ				d#d
eee	e
e f  deee	ee	ee f f  dedee dee f
ddZ	 				d#d
eee	e
e f  deee	ee	ee f f  dedee dee f
ddZdd Zdd Zdd Zdd Z  ZS )$rd   a  
    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`TransformerDecoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
    FNc           	         s   _ t | dtdg td_t j	j
jd_ j_ j_|j} j}|_ j_|j_ j_|_ jrHdnt|_ jsf jdkrfttj ||dd j j!_"nd _"||krtt ||ddnd _# j$st%j|j j&dnd _'t( d	d}t( d
drt)||d_*nd _*t( dd_+jdkrt,jd_-nt.g _-j-/ fddt0 j1D  t2j-_3 j4rt( ddst)||d_5nd _5|jkr j6st |jddnd _7d _8|_9j9d u r: || d S d S )Nr   r   r   r   rz   Fr   r   r   r   r   cross_self_attentionr   r   c                    s   g | ]}  qS r$   )build_decoder_layer)r   _r   rc   r!   r$   r%   r   r  s    
z/TransformerDecoder.__init__.<locals>.<listcomp>Zno_decoder_final_norm);r   r   r   r   r   r   empty_future_maskr   r   r#   r   r   decoder_layerdroprO   share_input_output_embedr   rK   r\   decoder_output_dimoutput_embed_dimr_   r:   ra   r   r   r   r   r   r   r   r   r   r   r   project_in_dimr   r   decoder_learned_posr   rE   r   r   r   r   r   r   r   r   rD   r@   r   decoder_normalize_beforer   tie_adaptive_weightsproject_out_dimadaptive_softmaxoutput_projectionbuild_output_projection)	r!   r   r[   ra   rc   r  Zinput_embed_dimr\   r   r"   r   r%   r   3  s   
	

zTransformerDecoder.__init__c              	   C   s   |j d ur#tt|| jtj|j td|j|jr|nd |j	|j
d| _n7| jr@tj| jjjd | jjjd dd| _| jj| j_ntj| jt|dd| _tjj| jjd| jd d t|d	d}t|D ]}| j|d |j |d  t| qdd S )
N)r)   )r   Zadaptive_inputsfactorZtie_projrl   r   Fr         ࿩meanZstdZbase_layers)adaptive_softmax_cutoffr   r@   r   r   Zeval_str_listr4   adaptive_softmax_dropoutr   Zadaptive_softmax_factorZtie_adaptive_projr  r   r   r   ra   weightshaper  initnormal_rE   r   r   insertrD   r   )r!   r   r[   ra   Znum_base_layersr   r$   r$   r%   r    sD   


z*TransformerDecoder.build_output_projectionc                 C   sV   t ||}t|dd}|rt|dd}t||d}|s!t|dtnd}t||d}|S r   )r   rE   r   r5   r   )r!   r   rc   r   r   r   r<   r$   r$   r%   r     s   
z&TransformerDecoder.build_decoder_layerrj   r~   rf   rk   rg   rh   ri   re   c
                 C   s0   | j ||||||d\}
}|s| |
}
|
|fS )a\  
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (optional): output from the encoder, used for
                encoder-side attention, should be of size T x B x C
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`
            features_only (bool, optional): only return features without
                applying output layer (default: False).
            full_context_alignment (bool, optional): don't apply
                auto-regressive mask to self-attention (default: False).

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        )rj   r~   rk   rg   rh   )extract_featuresoutput_layer)r!   rq   rj   r~   rf   rk   rg   rh   ri   re   r   extrar$   r$   r%   ru     s    
	
zTransformerDecoder.forwardc                 C   s   |  ||||||S rX   )extract_features_scriptable)r!   rq   rj   r~   rk   rg   rh   r$   r$   r%   r    s   
z#TransformerDecoder.extract_featuresc                 C   s  |  \}}|du r| jd }d}	d}
|dur8t|d dkr8|d d }	|	  d |ks8J d| d|	j |durJt|d dkrJ|d d }
d}| jdurX| j||d}|durt|ddd	df }|durt|ddd	df }| j| | }| jdur| |}| jdur| |}|dur||7 }| j	dur| 	|}| 
|}|dd}d}| js|| j r|| j}d}|g}t| jD ]?\}}|du r|s| |}nd}|||	|
|||t||kt||kd
\}}}|| |dur	||kr	| |}q|dur |dur|d| }|jdd}| jdur+| |}|}|dd}| jdur>| |}||g|||dfS )a  
        Similar to *forward* but only return features.

        Includes several features from "Jointly Learning to Align and
        Translate with Transformer Models" (Garg et al., EMNLP 2019).

        Args:
            full_context_alignment (bool, optional): don't apply
                auto-regressive mask to self-attention (default: False).
            alignment_layer (int, optional): return mean alignment over
                heads at this layer (default: last layer).
            alignment_heads (int, optional): only average alignment over
                this many heads (default: all heads).

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with any model-specific outputs
        Nrl   rj   r   zExpected enc.shape == (t, z	, c) got ro   )r~   r   )self_attn_maskrn   Z	need_attnZneed_head_weights)dim)r   inner_statesrm   rn   )sizer   r@   r  r   r   ra   r   r   r   r   r   r   r   r_   r   r   r   buffered_future_maskr   r   r2   tor  r   r   )r!   rq   rj   r~   rk   rg   rh   bsslenencZpadding_maskZ	positionsr   rn   r   r  r   r   r  Z
layer_attnZself_attn_hiddenrm   r$   r$   r%   r    s   


















z.TransformerDecoder.extract_features_scriptablec                 C   s   | j du r
| |S |S )z(Project features to the vocabulary size.N)r  r  )r!   featuresr$   r$   r%   r  t  s   

zTransformerDecoder.output_layerc                 C   r   )z/Maximum output length supported by the decoder.N)r   r:   r   r   r   r$   r$   r%   r   |  r   z TransformerDecoder.max_positionsc                 C   sz   | d}| j ddks| jj|jkr| j d|k r+ttt||gd| _| j|| _| jd |d |f S )Nr   rl   )	r  r   r   r   Ztriur   Zfill_with_neg_infZzerosr  )r!   Ztensorr  r$   r$   r%   r    s   
z'TransformerDecoder.buffered_future_maskc                 C   sB  t | jtrd|}||v r||= td|d|< | d|vrD| jr,| d}n| d}||v rD|| || d< | jsD||= t| jD ]1}ddd	d
}|	 D ]$\}}dD ]}	d||||	}
|
|v rx||
 |d||||	< ||
= q[qUqId|}t
||tdgd dkrd| _d| _tdg||< |S )r   r   rl   r   z.output_projection.weightz.embed_tokens.weightz
.embed_outZself_attn_layer_normZencoder_attn_layer_normZfinal_layer_norm)012)r
  r   z{}.layers.{}.layer_norms.{}.{}z{}.layers.{}.{}.{}r   r   r   NF)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r!   r   r   r   Zembed_out_keyr   Zlayer_norm_mapoldnewmr   r   r$   r$   r%   r     sV   


	"z+TransformerDecoder.upgrade_state_dict_named)FN)F)NNFFNNNF)NFNN)r   r   r   r   r   r  r   r   r   r3   r   r   r   r4   r   ru   r  r  r  r   r  r   r   r$   r$   r"   r%   rd   &  s    U
!

	

2





y
rd   c                 C   s@   t j| ||d}t jj|jd|d d t j|j| d |S )N)r_   r   r  r  )r   rZ   r  r  r
  	constant_)r^   r   r_   r"  r$   r$   r%   rZ     s   rZ   Tc                 C   s4   t | ||}t j|j |rt j|jd |S )Nr   )r   r   r  Zxavier_uniform_r
  r#  r   )Zin_featuresZout_featuresr   r"  r$   r$   r%   r     s
   r   c                 C   sH  t | dd | _t | dd| _t | dd| _t | dd| _t | dd	| _t | d
d| _t | dd| _t | dd | _t | d| j| _	t | d| j| _
t | dd| _t | dd	| _t | dd| _t | dd| _t | dd| _t | dd| _t | dd| _t | dd| _t | dd | _t | dd| _t | dd| _t | dd| _t | d d| _t | d!d| _t | d"d| _t | d#d| _t | d$| j	| _t | d%| j	| _t | d&d| _t | d'd| _t | d(d| _t | d)d| _ t | d*d| _!| j!rd+| _ t | d,d | _"t | d-d | _#t | d.d| _$t | d/d| _%t | d0d| _&t | d1d	| _'t | d2d| _(d S )3NrM   rJ   i   encoder_ffn_embed_dimi   rB      encoder_attention_headsr0   r   Fr   rL   rK   decoder_ffn_embed_dimrD   decoder_attention_headsr   r   attention_dropoutr   activation_dropoutactivation_fnZrelur   g?r  r	  r   rO   rH   r   r   rb   r   r   decoder_input_dimr   r   r   rP   r;   Tr?   rC   r   r   r   r   quant_noise_scalar))rE   rM   rJ   r$  rB   r&  r   r   rL   rK   r'  rD   r(  r   r   r)  r*  r+  r   r  r	  rO   rH   r   r   rb   r   r   r,  r   r   r   rP   r;   r?   rC   r   r   r   r   r-  r   r$   r$   r%   r>     s   r>   c                 C   s   t | dd| _t | dd| _t | dd| _t | dd| _t | d	d
| _t | dd
| _t | dd| _t | dd| _t | dd| _	t | dd| _
t | dd| _t | dd| _t | dd| _t|  d S )NrJ   i   r$  i   r&     rB      r   Tr   rD   r   rK   r'  r(  r)  g{Gz?r*  r   )rE   rJ   r$  r&  rB   r   r   rD   rK   r'  r(  r)  r*  r   r>   r.  r$   r$   r%   transformer_deep  s$   r1  )T)0r   typingr   r   r   r   r   numpyr   Ztorch.nnr   Zfairseqr   Zfairseq.distributedr   Zfairseq.modelsr	   r
   r   r   r   Zfairseq.modulesr   r   r   r   r   r   r   r   r   Z&fairseq.modules.checkpoint_activationsr   Zfairseq.modules.quant_noiser   r   r   rF   rG   r4   r5   r   r`   rd   rZ   r   r>   r1  r$   r$   r$   r%   <module>   s<   ,   l  !   
;