o
    *jF                     @   s|  d Z ddlZddlZddlZddlZddlmZmZmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZ eZeeef ZejjZeZeee	e f Zejdd	G d
d dZejdd	G dd dZejdd	G dd dZejdd	G dd dZejdd	G dd dZejdd	G dd dZ G dd de!Z"dedede	eeef  fddZ#dedededeeeeef f fddZ$ej%d d!d"dd#d$ed%ed&e&de fd'd(Z'ej%d d!d"dd#d$ed%ed&e&de fd)d*Z(d+edefd,d-Z)d.Z*dedefd/d0Z+dedefd1d2Z,dede	e fd3d4Z-deeef deee	e f fd5d6Z.d7ede&fd8d9Z/dS ):zParses the mmCIF file format.    N)AnyMappingOptionalSequenceTuple)logging)PDB)SCOPData)MMCIFParserT)frozenc                   @   s   e Zd ZU eed< eed< dS )MonomeridnumN__name__
__module____qualname__str__annotations__int r   r   l/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/models/science/unifold/msa/mmcif.pyr   "   s   
 r   c                   @   sN   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< d	S )
AtomSiteresidue_nameauthor_chain_idmmcif_chain_idauthor_seq_nummmcif_seq_numinsertion_codehetatm_atom	model_numNr   r   r   r   r   r   *   s   
 r   c                   @   s&   e Zd ZU eed< eed< eed< dS )ResiduePositionchain_idresidue_numberr   Nr   r   r   r   r   r!   7   s   
 r!   c                   @   s2   e Zd ZU ee ed< eed< eed< eed< dS )ResidueAtPositionpositionname
is_missinghetflagN)r   r   r   r   r!   r   r   boolr   r   r   r   r$   >   s
   
 r$   c                   @   sz   e Zd ZU dZeed< eed< eed< ee	e
f ed< ee	eeef f ed< eed< ee	e	f ed< ee	ef ed	< d
S )MmcifObjectap  Representation of a parsed mmCIF file.

    Contains:
        file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
            files being processed.
        header: Biopython header.
        structure: Biopython structure.
        chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
            {'A': 'ABCDEFG'}
        seqres_to_structure: Dict; for each chain_id contains a mapping between
            SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,  1: ResidueAtPosition, ...}}
        raw_string: The raw string used to construct the MmcifObject.
    file_idheader	structurechain_to_seqresseqres_to_structure
raw_stringmmcif_to_author_chain_idvalid_chainsN)r   r   r   __doc__r   r   	PdbHeaderPdbStructurer   ChainIdSeqResr   r$   r   r   r   r   r   r*   F   s   
 r*   c                   @   s6   e Zd ZU dZee ed< eee	e	f e
f ed< dS )ParsingResultzReturned by the parse function.

    Contains:
        mmcif_object: A MmcifObject, may be None if no chain could be successfully
            parsed.
        errors: A dict mapping (file_id, chain_id) to any exception generated.
    mmcif_objecterrorsN)r   r   r   r3   r   r*   r   r   r   r   r   r   r   r   r   r8   `   s   
 r8   c                   @   s   e Zd ZdZdS )
ParseErrorz;An error indicating that an mmCIF file could not be parsed.N)r   r   r   r3   r   r   r   r   r;   n   s    r;   prefixparsed_inforeturnc                    sp   g  g |  D ]\}}|| r | | qtfddD s-J d   fddt D S )an  Extracts loop associated with a prefix from mmCIF data as a list.

    Reference for loop_ in mmCIF:
        http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html

    Args:
        prefix: Prefix shared by each of the data items in the loop.
            e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
            _entity_poly_seq.mon_id. Should include the trailing period.
        parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
            parser.

    Returns:
        Returns a list of dicts; each dict represents 1 entry from an mmCIF loop.
    c                    s    g | ]}t |t  d  kqS )r   )len.0Zxsdatar   r   
<listcomp>   s    z&mmcif_loop_to_list.<locals>.<listcomp>z2mmCIF error: Not all loops are the same length: %sc                    s   g | ]	}t t |qS r   )dictzipr@   )colsr   r   rD      s    )items
startswithappendallrF   )r<   r=   keyvaluer   )rG   rC   r   mmcif_loop_to_listr   s   



rN   indexc                    s   t | |} fdd|D S )ag  Extracts loop associated with a prefix from mmCIF data as a dictionary.

    Args:
        prefix: Prefix shared by each of the data items in the loop.
            e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
            _entity_poly_seq.mon_id. Should include the trailing period.
        index: Which item of loop data should serve as the key.
        parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
            parser.

    Returns:
        Returns a dict of dicts; each dict represents 1 entry from an mmCIF loop,
        indexed by the index column.
    c                    s   i | ]}|  |qS r   r   )rA   entryrO   r   r   
<dictcomp>   s    z&mmcif_loop_to_dict.<locals>.<dictcomp>)rN   )r<   rO   r=   entriesr   rQ   r   mmcif_loop_to_dict   s   
rT      F)typed)catch_all_errorsr+   mmcif_stringrW   c              
   C   s   i }zXt dd}|j}| D ]\}}t|ts|g||< qt|}t|d}	|	s3td| dfdiW S i }
t|D ]}|j	dkrAq9|j
|
|j< q9t| |ddd||
|	d}t||d	W S  ty{ } z||| df< |sk td|d	W  Y d}~S d}~ww )
  Entry point, parses an mmcif_string.

    Args:
        file_id: A string identifier for this file. Should be unique within the
            collection of files being processed.
        mmcif_string: Contents of an mmCIF file.
        catch_all_errors: If True, all exceptions are caught and error messages are
            returned as part of the ParsingResult. If False exceptions will be allowed
            to propagate.

    Returns:
        A ParsingResult.
    TZQUIETr=   N %No protein chains found in this file.1r+   r,   r-   r.   r/   r0   r1   r2   r9   r:   )r
   _mmcif_dictrH   
isinstancelist_get_header_get_protein_chainsr8   _get_atom_site_listr    r   r   r*   	Exception)r+   rX   rW   r:   parserr=   rL   rM   r,   r2   r1   atomr9   er   r   r   
fast_parse   sP   




rk   c           !   
   C   s  i }z t jdd}t|}|d|}t|}|j}| D ]\}	}
t|
t	s.|
g||	< q t
|}t|d}|sDtd| dfdiW S dd | D }i }i }t|D ]]}|jd	kr]qU|j||j< |j|v rd
}|jdkr||jdv rwd}nd|j }|j}t|jsd
}t|jt|j|d}t|j||j  }||ji }t||jd|d||< |||j< qU| D ]$\}}|| }|| }t|D ]\}}||vrtd|jdd
d||< qqi }| D ].\}}|| }g }|D ]}tj|jd}|t |dkr|nd qd!|}|||< qt"| |||||||d}t||dW S  t#yF }  z| || df< |s6 td|dW  Y d} ~ S d} ~ ww )rY   TrZ   r\   r[   Nr]   c                 S   s$   i | ]\}}|t d d |D qS )c                 S   s   g | ]}|j qS r   )r   rA   monomerr   r   r   rD     s    z$parse.<locals>.<dictcomp>.<listcomp>min)rA   r"   seqr   r   r   rR     s    zparse.<locals>.<dictcomp>r^    ZHETATM)ZHOHZWATWZH_)r"   r#   r   F)r%   r&   r'   r(   X   r_   r`   )$r   r
   ioStringIOZget_structure_get_first_modelra   rH   rb   rc   rd   re   r8   rf   r    r   r   r   r   r   _is_setr!   r   r   r   getr$   	enumerater   r	   Zprotein_letters_3to1rJ   r?   joinr*   rg   )!r+   rX   rW   r:   rh   handleZfull_structureZfirst_model_structurer=   rL   rM   r,   r2   Zseq_start_numr1   Zseq_to_structure_mappingsri   r(   r   r%   Zseq_idxcurrentr"   seq_infoZauthor_chainZcurrent_mappingidxrm   Zauthor_chain_to_sequencerp   coder9   rj   r   r   r   parse   s   



	










r   r-   c                 C   s   t |  S )z1Returns the first model in a Biopython structure.)nextZ
get_models)r-   r   r   r   rw   n  s   rw      c                 C   s   | d }t |S )z!Returns the oldest revision date.*_pdbx_audit_revision_history.revision_datern   )r=   Zrevision_datesr   r   r   get_release_datev  s   r   c              	   C   s   i }t d| }ddd |D |d< d| v rt| |d< ntd| d	  d
|d< dD ]&}|| v rRz| | d }t||d< W q, tyQ   td| |  Y q,w q,|S )zFReturns a basic header containing method, release date and resolution.z_exptl.,c                 S   s   g | ]}|d    qS )z_exptl.method)lower)rA   Z
experimentr   r   r   rD     s    z_get_header.<locals>.<listcomp>Zstructure_methodr   Zrelease_datez$Could not determine release_date: %sz	_entry.idg        
resolution)z_refine.ls_d_res_highz _em_3d_reconstruction.resolutionz_reflns.d_resolution_highr   zInvalid resolution format: %s)rN   r{   r   r   warningfloat
ValueErrordebug)r=   r,   ZexperimentsZres_keyZraw_resolutionr   r   r   rd   |  s.   
rd   c                 C   s@   dd t | d | d | d | d | d | d | d	 | d
 D S )zGReturns list of atom sites; contains data not present in the structure.c                 S   s   g | ]}t | qS r   )r   )rA   siter   r   r   rD     s    z'_get_atom_site_list.<locals>.<listcomp>z_atom_site.label_comp_idz_atom_site.auth_asym_idz_atom_site.label_asym_idz_atom_site.auth_seq_idz_atom_site.label_seq_idz_atom_site.pdbx_PDB_ins_codez_atom_site.group_PDBz_atom_site.pdbx_PDB_model_num)rF   r[   r   r   r   rf     s   rf   c                    s   t d| }tt}|D ]}||d  t|d t|d d qtdd|  t d| }tt}|D ]}|d	 }|d
 }|| | q4i }	| D ]\}}
|| }t	 fdd|
D rh|D ]}|
|	|< qaqL|	S )zExtracts polymer information for protein chains only.

    Args:
        parsed_info: _mmcif_dict produced by the Biopython parser.

    Returns:
        A dict mapping mmcif chain id to a list of Monomers.
    z_entity_poly_seq.z_entity_poly_seq.entity_idz_entity_poly_seq.mon_idz_entity_poly_seq.num)r   r   z_chem_comp.z_chem_comp.idz_struct_asym.z_struct_asym.idz_struct_asym.entity_idc                    s   g | ]}d  |j  d v qS )Zpeptidez_chem_comp.type)r   rl   Z
chem_compsr   r   rD     s    z'_get_protein_chains.<locals>.<listcomp>)
rN   collectionsdefaultdictrc   rJ   r   r   rT   rH   any)r=   Zentity_poly_seqsZpolymersZentity_poly_seqZstruct_asymsZentity_to_mmcif_chainsZstruct_asymr"   Z	entity_idr2   r~   Z	chain_idsr   r   r   re     s8   





re   rC   c                 C   s   | dvS )zFReturns False if data is a special mmCIF character indicating 'unset'.).?r   rB   r   r   r   rx     s   rx   )0r3   r   dataclasses	functoolsru   typingr   r   r   r   r   Zabslr   ZBior   ZBio.Datar	   ZBio.PDB.MMCIFParserr
   r   r6   r4   	Structurer5   r7   Z	MmCIFDict	dataclassr   r   r!   r$   r*   r8   rg   r;   rN   rT   	lru_cacher)   rk   r   rw   Z-_MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDEr   rd   rf   re   rx   r   r   r   r   <module>   s   







B !
4