o
    *j                     @   sh   d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ ddlmZ G d	d
 d
eZdS )    )AnyDictN)Image)
transforms)
load_image)ModeKeys   )OfaBasePreprocessorc                       s   e Zd ZdZejf fdd	Zdeee	f deee	f fddZ
deee	f deee	f fdd	Zdeee	f deee	f fd
dZ  ZS )&OfaVisualQuestionAnsweringPreprocessorz5
    OFA preprocessor for question answer tasks.
    c              	      sf   t t| j|||g|R i | tdd tj| j| jftjjdt	 tj
| j| jdg| _dS )zpreprocess the data

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path,
            mode: preprocessor mode (model mode)
        c                 S   s
   |  dS )NRGB)convert)image r   w/var/www/html/Deteccion_Ine/venv/lib/python3.10/site-packages/modelscope/preprocessors/ofa/visual_question_answering.py<lambda>#   s   
 zAOfaVisualQuestionAnsweringPreprocessor.__init__.<locals>.<lambda>)interpolation)meanstdN)superr
   __init__r   ZComposeZResizeZpatch_image_sizeZInterpolationModeZBICUBICZToTensorZ	Normalizer   r   patch_resize_transform)selfcfgZ	model_dirmodeargskwargs	__class__r   r   r      s$   
z/OfaVisualQuestionAnsweringPreprocessor.__init__datareturnc                 C   s    | j tjkr| |S | |S )N)r   r   ZTRAIN_build_train_sample_build_infer_sample)r   r   r   r   r   __call__+   s   

z/OfaVisualQuestionAnsweringPreprocessor.__call__c                 C   s  |  |}| jd|d ddd}| jdkr,t| j|g}t|dd | jg}n<| jdkrGt|d	 |g}t|dd | jg}n!| jd
krft|d	 dd |g}t|dd | jg}nt| j	j
|dt| d < ||d< ||d< | jdurtt|t| jf }t|t| d }tt|t| d t|D ]}| j g|||   }	| j|	}
d|| |
< q||d< |S )a&  
        Building training samples.

        step 1. Preprocess the data using the logic of `_build_infer_sample`
            and make sure the label data in the result.
        step 2. Preprocessing the label data to generate `target` and `prev_output_token`.
            - add blank in the front out label data and tokenize it as `target` item.
            - if `prompt_type` is `None`, add the bos token as previous output tokens,
            add eos tokens as target items.
            - if `prompt_type` is `src`, concatenate source text input with target item as
            previous output tokens, remove the bos token and add eos token as target items.
            - if `prompt_type` is `prev_output`, just like the `prompt_type` is src, the
            difference is that it will remove the eos token in source text input in this
            setting.
            - padding the source item as final target item.
        step 3. Add constraint mask.

        Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `image`
                `text` and `label`.
        Return:
            A dict object, contains source text input, patch images, patch masks
            with `Tensor([True])`, decoder prompt, label, target previous output tokens
            and constraint mask.
        z {}labelF)Zadd_bosZadd_eosnoner   Nsrcsourceprev_outputZprev_output_tokenstargetTconstraint_mask)r!   tokenize_textformatprompt_typetorchcatbos_itemZeos_itemNotImplementedError	tokenizerZpad_token_idlenZconstraint_trieZzerosZtgt_dictboolrangeZbostolistZget_next_layer)r   r   sampleZtgt_itemZprev_output_itemZtarget_itemr*   Z	start_idxiZconstraint_prefix_tokenZconstraint_nodesr   r   r   r    1   sJ   




z:OfaVisualQuestionAnsweringPreprocessor._build_train_samplec                 C   s   |  || jd  }| |}|| jd  }| || j}|ds&|d n|}| d| }| jdkr9| j}n| jdkrA|}n| jdkrM|dd	 }nt	||t
d
g|d}d| jv ro| jd |v ro|| jd  |d< |S )a<  
        Building inference samples.

        step 1. Preprocessing image input for model's image input.
            - get pillow image from data.
            - do some transforms to the pillow image, such as resize, normalize etc.
        step 2. Preprocessing the text input for model's text input.
            - add blank in the front of input text.
            - tokenize the result above as source text input.
        step 3. Calculating the decoder prompt.
            - if `prompt_type` is `None`, using bos token.
            - if `prompt_type` is `src`, using source text input
            - if `prompt_type` is `prev_output`, using source text input without eos token.
        step 4. Whether or not to add label data which refer to an answer to the question
            in this task.

        Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `image`
                `text`.
        Return:
            A dict object, contains source text input, patch images, patch masks
            with `Tensor([True])`, decoder prompt and label.
        r   text? r$   r%   r'   Nr(   T)r&   patch_imageZ
patch_maskdecoder_promptZanswerr#   )Zget_img_pilZ
column_mapr   Zpre_questionZmax_src_lengthendswithr+   r-   r0   r1   r.   Ztensor)r   r   r   r<   r9   Zinputsr=   r7   r   r   r   r!   o   s*   




z:OfaVisualQuestionAnsweringPreprocessor._build_infer_sample)__name__
__module____qualname____doc__r   Z	INFERENCEr   r   strr   r"   r    r!   __classcell__r   r   r   r   r
      s    ""*>r
   )typingr   r   r.   ZPILr   Ztorchvisionr   Zmodelscope.preprocessors.imager   Zmodelscope.utils.constantr   baser	   r
   r   r   r   r   <module>   s   