o
    *j9                     @   st  d Z ddlZddlmZmZmZ ddlZddlm	Z	m
Z
 ddlmZ eeejf Zeh dZdZd	Zd
ee defddZdeeef defddZdee dedededee f
ddZdededededef
ddZdedefddZdd Zdd  Zdedefd!d"Zdeeef fd#d$Zd%d& Zd'edefd(d)Z d*edefd+d,Z!d-d. Z"d/d0 Z#d1d2 Z$d3d4 Z%dS )5z+Feature processing logic for multimer data     N)IterableListMutableMapping)msa_pairingresidue_constants   )correct_template_restypes>!   num_symall_atom_maskentity_maskZasym_lenZ all_crops_all_chains_residue_idsmsa_maskdeletion_meanZall_crops_all_chains_positionsZall_crops_all_chains_maskasym_id
seq_lengthtemplate_aatypeZall_chains_entity_ids	entity_idnum_templatesall_atom_positionstemplate_all_atom_positionsZresidue_index	bert_maskmsadeletion_matrixaatypecluster_bias_mask
resolutionZ
queue_sizeseq_masksym_idassembly_num_chainstemplate_sum_probstemplate_all_atom_masknum_alignments
msa_chainsZmem_peak   i   chainsreturnc                 C   s&   t ttdd | D }|dkS )z@Checks if a list of chains represents a homomer/monomer example.c                 S   s&   g | ]}t |d  |d  dk qS )r   r   )npunique).0chain r)   x/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/science/unifold/data/process_multimer.py
<listcomp>I   s    z*_is_homomer_or_monomer.<locals>.<listcomp>r   )lenr%   r&   concatenate)r#   Znum_unique_chainsr)   r)   r*   _is_homomer_or_monomerD   s   
r.   all_chain_featuresc                 C   s\   t |  | }t| }|rtj|d}t|}t|t|td}tj||td}t	|}|S )zRuns processing on features to augment, pair and merge.

    Args:
        all_chain_features: A MutableMap of dictionaries of features for each chain.

    Returns:
        A dictionary of features.
    )r#   msa_crop_sizepair_msa_sequencesmax_templates)np_chains_listr2   r3   )
process_unmerged_featuresr.   r   Zcreate_paired_featuresZdeduplicate_unpaired_sequencescrop_chainsMSA_CROP_SIZEMAX_TEMPLATESZmerge_chain_featuresprocess_final)r/   r4   r2   
np_exampler)   r)   r*   pair_and_mergeP   s.   
r;   chains_listr1   r2   r3   c                 C   s,   g }| D ]}t ||||d}|| q|S )ak  Crops the MSAs for a set of chains.

    Args:
        chains_list: A list of chains to be cropped.
        msa_crop_size: The total number of sequences to crop from the MSA.
        pair_msa_sequences: Whether we are operating in sequence-pairing mode.
        max_templates: The maximum templates to use per chain.

    Returns:
        The chains cropped.
    r0   )_crop_single_chainappend)r<   r1   r2   r3   Zcropped_chainsr(   Zcropped_chainr)   r)   r*   r6   u   s   r6   r(   c                 C   s  | d }|r@| d }t ||d }| d d|ddf }t t j|tjkdd}t ||}t || d}	t ||	}nt ||}d	| v oK|}
|
r[| d	 jd }t ||}| D ]E}|d
d }|tj	v rz| | d|ddf | |< q]|tj
v rd
|v r|r| | d|ddf | |< q]| | d|ddf | |< q]t j|t jd| d< |
rt j|t jd| d< |rt j|t jd| d< | S )z'Crops msa sequences to `msa_crop_size`.r    Znum_alignments_all_seq   msa_all_seqNr   Zaxisr   r   Z_all_seqdtyper   )r%   minimumsumanyr   ZMSA_GAP_IDXmaximumshapesplitZTEMPLATE_FEATURESZMSA_FEATURESasarrayint32)r(   r1   r2   r3   Zmsa_sizeZmsa_size_all_seqZmsa_crop_size_all_seqr@   Znum_non_gapped_pairsZmax_msa_crop_sizeZinclude_templatesr   Ztemplates_crop_sizekZk_splitr)   r)   r*   r=      sN   



r=   r:   c                 C   s   t | } t| } t| } | S )zCFinal processing steps in data pipeline, after merging and pairing.)_make_seq_mask_make_msa_mask_filter_featuresr:   r)   r)   r*   r9      s   r9   c                 C   s   | d dk tj| d< | S )Nr   r   r   )astyper%   float32rP   r)   r)   r*   rM      s   rM   c                 C   sD   t j| d t jd| d< | d dkt j}| d  |d 9  < | S )z:Mask features are all ones, but will later be zero-padded.r   rB   r   r   r   N)r%   Z	ones_likeZint8rQ   )r:   r   r)   r)   r*   rN      s   rN   c                 C   s   dd |   D S )z4Filters features of example to only those requested.c                 S   s   i | ]\}}|t v r||qS r)   )REQUIRED_FEATURES)r'   rL   vr)   r)   r*   
<dictcomp>   s    z$_filter_features.<locals>.<dictcomp>)itemsrP   r)   r)   r*   rO      s   rO   c                 C   s   t | }| D ]R}d|v rtj|dtjd|d< d|v r*tj|dtjd|d< tj|d dd|d< d	|vrQtj|d
  }||d< tt	|j
dg |d	< t||d< q| D ]}|d dktj|d< q[dS )z;Postprocessing stage for per-chain features before merging.deletion_matrix_intrB   r   Zdeletion_matrix_int_all_seqZdeletion_matrix_all_seqr   rA   r   r   r   r
      r   r   r   N)r,   r%   rJ   poprR   meanr   ZSTANDARD_ATOM_MASKzeroslistrH   rQ   rK   )r/   Z
num_chainschain_featuresr
   r)   r)   r*   r5      s<   


r5   c                 C   sX   t d| ft jt d| ddft jt dt jt d| dft jdS )Nr   %   rX   )r   r   )r   r   r   r   )r%   r[   rQ   Zint64rR   )Zn_resr)   r)   r*   empty_template_feats	  s
   r_   monomer_featuresc                 C   s  | d j d dkr| t| d j d  i }h d}|  D ]S\}}||v r2tj|d |jd}n1|dkrBtj|ddtj	}n!|dkrR|j d dkrQt
|}n|dkrYd	}n
|d
krc|tj}|drn|tj}|||< qd| v r| dtj| d< |d |S )z;Reshapes and modifies monomer features for multimer models.r   r   r   >   r    sequenceZdomain_namer   rB   rA   Ztemplate_all_atom_masksr   r   Z_maskrW   r   r   )rH   updater_   rV   r%   rJ   rC   ZargmaxrQ   rK   r   Zuint8endswithrR   rY   )r`   Z	convertedZunnecessary_leading_dim_featsZfeature_namefeaturer)   r)   r*   convert_monomer_features  s@   

rf   numc                 C   sd   | dkrt d|  d| d } g }| dkr-|t| d td  | d d } | dksd|S )a`  Encodes a number as a string, using reverse spreadsheet style naming.

    Args:
        num: A positive integer.

    Returns:
        A string that encodes the positive integer using reverse spreadsheet style,
        naming e.g. 1 = A, 2 = B, ..., 27 = AA, 28 = BA, 29 = CA, ... This is the
        usual way to encode chain IDs in mmCIF files.
    r   z$Only positive integers allowed, got .r      A )
ValueErrorr>   chrordjoin)rg   outputr)   r)   r*   int_id_to_str_id@  s   
rq   c                 C   s   i }t t}| D ]#}d|v sJ t|d }||vr#t|d ||< |||  | q	g }d}| D ]D\}}t|}	t|ddD ]5\}
}|d }|t	| |d< |
t	| |d< |t	| |d< |	t	| |d< |d7 }|| qCq5|S )	a  Add features to distinguish between chains.

    Args:
        all_chain_features: A dictionary which maps chain_id to a dictionary of
            features for each chain.

    Returns:
        all_chain_features: A dictionary which maps strings of the form
            `<seq_id>_<sym_id>` to the corresponding chain features. E.g. two
            chains from a homodimer would have keys A_1 and A_2. Two chains from a
            heterodimer would have keys A_1 and B_1.
    ra   r   )startr   r   r   r   r	   )
collectionsdefaultdictr\   strr,   r>   rV   	enumerater%   Zones)r/   Zseq_to_entity_idZgrouped_chainsr]   seqZnew_all_chain_featuresZchain_idr   Zgroup_chain_featuresr	   r   r   r)   r)   r*   add_assembly_featuresV  s,   
	rx   c                 C   sj   t | } | d jd }||k r3dD ]}t| | d|| fdf| |< qt| d d|| ff| d< | S )Nr   r   )r   r   r   r   r!   )r   r   r   )dictrH   r%   pad)r:   Zmin_num_seqZnum_seqZfeatr)   r)   r*   pad_msa}  s   

r{   c                 C   s:   t | d} g d}|D ]}|| v r| | d| |< q| S )Ni   )r    r   r   r   r   rb   )r{   Zreshape)r:   Zno_dim_keysrL   r)   r)   r*   post_process  s   
r|   c           
      C   sr   t dd | D }g }t|D ]\}}t||vr|| qtj| || gdd}tj||| gdd}	||	fS )Nc                 S   s   g | ]}t |qS r)   )tuple)r'   mr)   r)   r*   r+     s    zmerge_msas.<locals>.<listcomp>r   rA   )setrv   r}   r>   r%   r-   )
r   Zdel_matZnew_msaZnew_del_matZcur_msa_setZnew_rowsisZret_msaZret_del_matr)   r)   r*   
merge_msas  s   
r   )&__doc__rs   typingr   r   r   numpyr%   Z&modelscope.models.science.unifold.datar   r   utilsr   ru   ZndarrayZFeatureDict	frozensetrS   r8   r7   boolr.   r;   intr6   r=   r9   rM   rN   rO   r5   r_   rf   rq   rx   r{   r|   r   r)   r)   r)   r*   <module>   s`   %

%

 
4#*'