o
    0j9                     @   s   d dl Z d dlZd dlmZmZ d dlmZ d dlmZ ddl	m
Z
mZ ddlmZ edr:d d	lmZ d d
lmZ edrJd dlmZ d dlmZ e
dddG dd deedZdS )    N)ABCabstractmethod)List)logging   )class_requires_depsis_dep_available)AutoRegisterABCMetaClassZ	langchainDocument)RecursiveCharacterTextSplitterzlangchain-community)vectorstores)FAISSzlangchain-text-splittersc                       s  e Zd ZdZdZdZ fddZedd Zedd	 Z	d
e
fddZde
d
efddZded
e
fddZde
d
efddZdg dfdee
 dedee
 d
dfddZddd
e
fddZde
d
dfd d!Z	"	#	$d*d%ee
 ddd&ed'ed(ed
e
fd)d	Z	  ZS )+BaseRetrieverzBase RetrieverTZPADDLEX_VECTOR_STOREc                    s   t    d| _d| _dS )z*Initializes an instance of base retriever.N)super__init__
model_name	embeddingself	__class__ v/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/paddlex/inference/pipelines/components/retriever/base.pyr   (   s   

zBaseRetriever.__init__c                 C      t d)z
        Declaration of an abstract method. Subclasses are expected to
        provide a concrete implementation of generate_vector_database.
        zCThe method `generate_vector_database` has not been implemented yet.NotImplementedErrorr   r   r   r   generate_vector_database.      z&BaseRetriever.generate_vector_databasec                 C   r   )z
        Declaration of an abstract method. Subclasses are expected to
        provide a concrete implementation of similarity_retrieval.
        z?The method `similarity_retrieval` has not been implemented yet.r   r   r   r   r   similarity_retrieval8   r   z"BaseRetriever.similarity_retrievalreturnc                 C   s   | j S )zt
        Get the model name used for generating vectors.

        Returns:
            str: The model name.
        )r   r   r   r   r   get_model_nameB   s   zBaseRetriever.get_model_namesc                 C   s   | | jS )z
        Check if the given string starts with the vector store prefix.

        Args:
            s (str): The input string to check.

        Returns:
            bool: True if the string starts with the vector store prefix, False otherwise.
        )
startswithVECTOR_STORE_PREFIX)r   r"   r   r   r   is_vector_storeK   s   
zBaseRetriever.is_vector_storevector_store_bytesc                 C   s   | j t|d S )z
        Encode the vector store bytes into a base64 string prefixed with a specific prefix.

        Args:
            vector_store_bytes (bytes): The bytes to encode.

        Returns:
            str: The encoded string with the prefix.
        ascii)r$   base64	b64encodedecode)r   r&   r   r   r   encode_vector_storeW   s   
z!BaseRetriever.encode_vector_storevector_store_strc                 C   s   t |t| jd S )a  
        Decodes the vector store string by removing the prefix and decoding the base64 encoded string.

        Args:
            vector_store_str (str): The vector store string with a prefix.

        Returns:
            bytes: The decoded vector store data.
        N)r(   	b64decodelenr$   )r   r,   r   r   r   decode_vector_storee   s   
z!BaseRetriever.decode_vector_storei,  )	
u   。z

 	text_list
block_size
separatorsr   c                 C   s\   t |d|d}|d|}dd |D }ztj|| jd}W |S  ty-   d}Y |S w )a  
        Generates a vector database from a list of texts.

        Args:
            text_list (list[str]): A list of texts to generate the vector database from.
            block_size (int): The size of each chunk to split the text into.
            separators (list[str]): A list of separators to use when splitting the text.

        Returns:
            FAISS: The generated vector database.

        Raises:
            ValueError: If an unsupported API type is configured.
           )
chunk_sizeZchunk_overlapr5   r0   c                 S   s   g | ]}t |d qS )Zpage_contentr
   ).0textr   r   r   
<listcomp>   s    z:BaseRetriever.generate_vector_database.<locals>.<listcomp>)Z	documentsr   N)r   Z
split_textjoinr   Zfrom_documentsr   
ValueError)r   r3   r4   r5   Ztext_splitterZtextsZ
all_splitsvectorstorer   r   r   r   q   s   r>   c                 C   s$   |du r	| j }|S | | }|S )z
        Encode the vector store serialized to bytes.

        Args:
            vectorstore (FAISS): The vector store to be serialized and encoded.

        Returns:
            str: The encoded vector store.
        N)r$   r+   Zserialize_to_bytes)r   r>   r   r   r   encode_vector_store_to_bytes   s
   
z*BaseRetriever.encode_vector_store_to_bytesc                 C   sJ   |  |s	td| |}|dkrtd dS tjj|| jdd}|S )a  
        Decode a vector store from bytes according to the specified API type.

        Args:
            vectorstore (str): The serialized vector store string.

        Returns:
            FAISS: Deserialized vector store object.

        Raises:
            ValueError: If the retrieved vector store is not for PaddleX
            or if an unsupported API type is specified.
        z-The retrieved vectorstore is not for PaddleX.    z5The retrieved vectorstore is empty,will empty vector.NT)Z
embeddingsZallow_dangerous_deserialization)	r%   r=   r/   r   warningr   r   Zdeserialize_from_bytesr   )r   r>   Zvectorr   r   r   decode_vector_store_from_bytes   s   


z,BaseRetriever.decode_vector_store_from_bytes      ?     query_text_list
sleep_timetopkmin_charactersc                 C   s   d}|du r|S |D ]?}|}t | |j||d}	dd |	D }
t|
dd d}
|
ddd	 D ]\}}|d
krHt|t| |krD n||7 }q0q
|S )a  
        Retrieve similar contexts based on a list of query texts.

        Args:
            query_text_list (list[str]): A list of query texts to search for similar contexts.
            vectorstore (FAISS): The vector store where to perform the similarity search.
            sleep_time (float): The time to sleep between each query, in seconds. Default is 0.5.
            topk (int): The number of results to retrieve per query. Default is 2.
            min_characters (int): The minimum number of characters required for text processing, defaults to 3500.
        Returns:
            str: A concatenated string of all unique contexts found.
        r2   N)kc                 S   s   g | ]	\}}|j |fqS r   r8   )r9   Zdocumentscorer   r   r   r;      s    z6BaseRetriever.similarity_retrieval.<locals>.<listcomp>c                 S   s   | d S )N   r   )xr   r   r   <lambda>   s    z4BaseRetriever.similarity_retrieval.<locals>.<lambda>)keyg)timesleepZ'similarity_search_with_relevance_scoressortedr.   )r   rF   r>   rG   rH   rI   Zall_CZ
query_textZQUESTIONdocscontextr:   rK   r   r   r   r      s    
)rC   rD   rE   )__name__
__module____qualname____doc__Z_BaseRetriever__is_baser$   r   r   r   r   strr!   boolr%   bytesr+   r/   r   intr?   rB   float__classcell__r   r   r   r   r       sT    
	
		
#"r   )	metaclass)r(   rQ   abcr   r   typingr   Zpaddlex.utilsr   Z
utils.depsr   r   Zutils.subclass_registerr	   Zlangchain_core.documentsr   Zlangchain_text_splittersr   Zlangchain_communityr   Z langchain_community.vectorstoresr   r   r   r   r   r   <module>   s   
