o
    *j|-                     @   s   d Z ddlZddlmZmZmZmZmZmZ ddl	Z
ddlmZ ddlmZ ddlmZmZmZ ddlmZmZmZmZ eee
jf Zeejejf Zded	ed
edefddZdeej  defddZ!dededede"deeef f
ddZ#G dd dZ$dS )z@Functions for building the input features for the unifold model.    N)AnyMappingMutableMappingOptionalSequenceUnion)logging)residue_constants)msa_identifiersparsers	templates)hhblitshhsearch	hmmsearch	jackhmmersequencedescriptionnum_resreturnc                 C   s   i }t j| t jdd|d< tj|ftjd|d< tj|dgtjd|d< tjt	|tjd|d< tj|g| tjd|d	< tj| dgtjd|d
< |S )z/Constructs a feature dict of sequence features.T)r   mappingZmap_unknown_to_xZaatypeZdtypeZbetween_segment_residuesutf-8Zdomain_nameZresidue_indexZ
seq_lengthr   )
r	   Zsequence_to_onehotZrestype_order_with_xnpZzerosint32arrayencodeobject_range)r   r   r   features r   o/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/science/unifold/msa/pipeline.pymake_sequence_features    s$   



r!   msasc                 C   s*  | st dg }g }g }t }t| D ]E\}}|s!t d| dt|jD ]1\}}||v r/q&|| |dd |D  ||j|  t|j	| }	||	j
d q&qt| d jd }
t|}i }tj|tjd|d	< tj|tjd|d
< tj|g|
 tjd|d< tj|tjd|d< |S )z*Constructs a feature dict of MSA features.z"At least one MSA must be provided.zMSA z$ must contain at least one sequence.c                 S   s   g | ]}t j| qS r   )r	   ZHHBLITS_AA_TO_ID).0resr   r   r    
<listcomp>F   s    z%make_msa_features.<locals>.<listcomp>r   r   r   Zdeletion_matrix_intmsanum_alignmentsZmsa_species_identifiers)
ValueErrorset	enumerate	sequencesaddappenddeletion_matrixr
   Zget_identifiersZdescriptionsZ
species_idr   lenr   r   r   r   )r"   Zint_msar.   Zspecies_idsZseen_sequencesZ	msa_indexr&   Zsequence_indexr   identifiersr   r'   r   r   r   r    make_msa_features4   sH   



r1   input_fasta_pathmsa_out_path
msa_formatuse_precomputed_msasc                 C   s   |rt j|s.| |d }t|d}|||  W d   |S 1 s'w   Y  |S td| t|ddd}|| i}W d   |S 1 sMw   Y  |S )z:Runs an MSA tool, checking if output already exists first.r   wNzReading MSA from file %srr   encoding)	ospathexistsqueryopenwriter   warningread)Z
msa_runnerr2   r3   r4   r5   resultfr   r   r    run_msa_toolX   s   

rD   c                   @   s   e Zd ZdZ			ddedededed	ee d
ee dee dee dedejde	de
de
de	fddZdededefddZdededefddZdS )DataPipelinez:Runs the alignment tools and assembles the input features.  '  Fjackhmmer_binary_pathhhblits_binary_pathuniref90_database_pathmgnify_database_pathbfd_database_pathuniclust30_database_pathsmall_bfd_database_pathuniprot_database_pathtemplate_searchertemplate_featurizeruse_small_bfdmgnify_max_hitsuniref_max_hitsr5   c                 C   s   || _ tj||d| _|rtj||d| _n
tj|||gd| _tj||d| _tj||d| _	|	| _
|
| _|| _|| _|| _dS )zInitializes the data pipeline.)binary_pathZdatabase_path)rU   Z	databasesN)_use_small_bfdr   Z	Jackhmmerjackhmmer_uniref90_runnerjackhmmer_small_bfd_runnerr   ZHHBlitshhblits_bfd_uniclust_runnerjackhmmer_mgnify_runnerjackhmmer_uniprot_runnerrP   rQ   rS   rT   r5   )selfrH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   r5   r   r   r    __init__n   s6   

zDataPipeline.__init__r2   msa_output_dirr   c                 C   s  t |dd}| }W d   n1 sw   Y  t|\}}t|dkr/td| d|d }|d }t|}	tj|d}
t	| j
||
d	| j}tj|d
}t	| j||d	| j}|d	 }tj|| jd}t|}t|}| jjd	kr| j|}n| jjdkrt|}| j|}n	td| jj tj|d| jj }t |d}|| W d   n1 sw   Y  t|d	 }|j| jd}t|d	 }|j| jd}| jj||d}| jrtj|d}t	| j||d	| j}t|d	 }ntj|d}t	| j||d| j}t|d }| jj ||d}t!|||	d}t"|||f}t#$dt| t#$dt| t#$dt| t#$d|d d  t#$d|j%d j&d  i |||j%S )z@Runs alignment tools on the input sequence and creates features.r   r8   N   z&More than one input sequence found in .r   zuniref90_hits.stostozmgnify_hits.sto)Zmax_sequencesZa3mz$Unrecognized template input format: z	pdb_hits.r6   Zmax_seqs)Zoutput_stringinput_sequencezsmall_bfd_hits.stozbfd_uniclust_hits.a3m)Zquery_sequencehits)r   r   r   z Uniref90 MSA size: %d sequences.zBFD MSA size: %d sequences.zMGnify MSA size: %d sequences.z,Final (deduplicated) MSA size: %d sequences.r'   zbTotal number of templates (NB: this can include bad templates and is later filtered to top 4): %d.Ztemplate_domain_names)'r>   rA   r   Zparse_fastar/   r(   r:   r;   joinrD   rW   r5   rZ   Ztruncate_stockholm_msarT   Zdeduplicate_stockholm_msaZ'remove_empty_columns_from_stockholm_msarP   Zinput_formatr=   Zconvert_stockholm_to_a3mZoutput_formatr?   parse_stockholmtruncaterS   Zget_template_hitsrV   rX   rY   Z	parse_a3mrQ   Zget_templatesr!   r1   r   infor   shape)r\   r2   r^   rC   Zinput_fasta_strZ
input_seqsZinput_descsrc   Zinput_descriptionr   Zuniref90_out_pathZjackhmmer_uniref90_resultZmgnify_out_pathZjackhmmer_mgnify_resultZmsa_for_templatesZpdb_templates_resultZuniref90_msa_as_a3mZpdb_hits_out_pathZuniref90_msaZ
mgnify_msaZpdb_template_hitsZbfd_out_pathZjackhmmer_small_bfd_resultZbfd_msaZhhblits_bfd_uniclust_resultZtemplates_resultZsequence_featuresZmsa_featuresr   r   r    process   s   


zDataPipeline.processc                 C   sJ   t j|d}t| j||d| j}t|d }|jdd}t	|g}|S )Nzuniprot_hits.stora   iP  rb   )
r:   r;   re   rD   r[   r5   r   rf   rg   r1   )r\   r2   r^   Zuniprot_pathZuniprot_resultr&   Zall_seq_dictr   r   r    process_uniprot  s   
zDataPipeline.process_uniprotN)rF   rG   F)__name__
__module____qualname____doc__strr   TemplateSearcherr   ZTemplateHitFeaturizerboolintr]   FeatureDictrj   rk   r   r   r   r    rE   k   sX    	

+
trE   )%ro   r:   typingr   r   r   r   r   r   numpyr   Zabslr   Z&modelscope.models.science.unifold.datar	   Z%modelscope.models.science.unifold.msar
   r   r   Z+modelscope.models.science.unifold.msa.toolsr   r   r   r   rp   Zndarrayrt   ZHHSearchZ	Hmmsearchrq   rs   r!   ZMsar1   rr   rD   rE   r   r   r   r    <module>   s8    

$

