o
    *j                     @   s   d Z ddlmZmZmZ ddlZddlmZ dd Zdej	d	e
fd
dZdeejj d	eejjejjf fddZdeejj d	eejjejjf fddZdee d	ee fddZdS )z-
Copyright 2020 The Microsoft DeepSpeed Team
    )DictListTupleN   )MoEc                 C   s@   d}d}|   D ]\}}t|trd}|j} ||fS q||fS )NFr   T)Znamed_modules
isinstancer   num_experts)mZhas_moer   _module r   h/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/nlp/gpt_moe/moe/utils.pyhas_moe_layers   s   
r   paramreturnc                 C   s   t | dr
| js
dS dS )N	allreduceTF)hasattrr   )r   r   r   r   is_moe_param   s   r   paramsc                 C   s:   g g }}| D ]}t |r|| q|| q||fS )N)r   append)r   Zshared_paramsZexpert_paramspr   r   r   *split_params_into_shared_and_expert_params   s   
r   groupc                 C   sV   g }g }| D ] }|j dur&t|r||j |j q||j |j q||fS )a  Split grad of parameters into grads of non-expert params
    and grads of expert params. This is useful while computing
    grad-norms for clipping and overflow detection

        group (List[torch.nn.Parameter]):
    Args:
            The group of parameters to split

    Returns:
        Tuple[List[torch.nn.Parameter], List[torch.nn.Parameter]]:
        list of gradients for non MoE params, list of gradients of MoE params
    N)Zgradr   r   toZdtype)r   Zexpert_gradsZshared_gradsr   r   r   r   0split_params_grads_into_shared_and_expert_params)   s   
r   param_groupsc                 C   s  t | tr
t| } nt | tr| g} nt | ts!tdt|  t }| D ]}|d D ]}t|r8||j	 q,q&t|}i }| D ]O}i ||d < |D ]D}i ||d  |< |||d  | d< d||d  | d< |
 D ]!}|dkr|dkrg ||d  | |< qn|| ||d  | |< qnqLqB| D ]'}g }|d D ]}t|r||d  |j	 d | q|| q||d< q| D ]\}}	|	 D ]	\}
}| | qqt| S )zSplit parameters into different MoE groups for optimizer

    Args:
        param_groups (Tuple[Dict]):
            The list of parameter groups to split

    Returns:
        Tuple[Dict]:
        list of MoE/non-MoE groups for optimizer
    zUnknown param group type of r   nameTZmoe)r   tuplelistdict
ValueErrortypesetr   add
group_namekeysr   items)r   Zdata_parallel_group_namesZparam_groupr   Z	group_moekeyZori_key
new_paramskvZk1Zv1r   r   r   4split_params_into_different_moe_groups_for_optimizerC   sb   






r+   )__doc__typingr   r   r   Ztorchlayerr   r   ZTensorboolr   nn	Parameterr   r   r+   r   r   r   r   <module>   s*    



