Spaces:

hkunlp
/

Binder

Runtime error

File size: 20,610 Bytes

f6f97d8

"""
Build NSQL generation prompt.
Two main parts:
1) PromptBuilder makes prompt for calling codex to generate NSQL(Binder-SQL).
2) OpenAIQAPromptBuilder makes prompt for calling codex to generate QA answers.
"""

import random
from typing import Dict, Tuple
import pandas as pd
import copy

from utils.errors import DuplicateColumnsError
from utils.mmqa.image_stuff import get_caption_map
from retrieval.retrieve_pool import QAItem

from utils.normalizer import prepare_df_for_neuraldb_from_table


def _create_table_prompt(df: pd.DataFrame, title: str):
    """
    Return the CREATE TABLE clause as prompt.

    Every column of ``df`` becomes one tab-indented ``<name> <type>`` line.
    The SQL type is derived from the pandas dtype of the column:
    integer -> ``int``, float -> ``real``, datetime -> ``datetime`` and
    everything else -> ``text``.

    :param df: table whose columns are described.
    :param title: table name placed right after ``CREATE TABLE``.
    :return: the clause, closed by ``)`` and a trailing newline.
    :raises DuplicateColumnsError: when a column name occurs more than once;
        ``df[header]`` is then a DataFrame, which has no ``dtype`` attribute.
    """
    column_defs = []
    for header in df.columns:
        try:
            dtype = df[header].dtype
        except AttributeError as e:
            raise DuplicateColumnsError(e) from e

        # Use the pandas dtype predicates instead of string comparison:
        # `dtype == 'datetime64'` is never true for pandas' unit-carrying
        # datetime dtypes (e.g. datetime64[ns]), and exact 'int64'/'float64'
        # checks miss the other integer/float widths.
        if pd.api.types.is_integer_dtype(dtype):
            column_type = 'int'
        elif pd.api.types.is_float_dtype(dtype):
            column_type = 'real'
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            column_type = 'datetime'
        else:
            column_type = 'text'
        column_defs.append('\t{} {}'.format(header, column_type))

    if not column_defs:
        # A table without columns keeps the historical single-line form.
        return 'CREATE TABLE {}()\n'.format(title)
    return 'CREATE TABLE {}(\n{})\n'.format(title, ',\n'.join(column_defs))


class PromptBuilder(object):
    def __init__(self, args):
        self.args = args
        self.prompt_style = args.prompt_style
        random.seed(args.seed)

    def _select_x_prompt(self, df: pd.DataFrame, num_rows: int,
                         few_shot_demonstration=True):
        """
        Return the first X rows table contents as prompt.
        """
        if self.prompt_style == 'create_table_select_full_table':
            string = '/*\nAll rows of the table:\nSELECT * FROM w;\n'
        elif self.prompt_style == 'create_table_select_3':
            string = '/*\n{} example rows:\nSELECT * FROM w LIMIT {};\n'.format(num_rows, num_rows)
        elif self.prompt_style == 'create_table_select_3_hidden':
            string = '/*\n{} example rows:\n'.format(num_rows)
        elif few_shot_demonstration is True and self.prompt_style in \
                ["create_table_select_3_full_table",
                 "create_table_select_3_full_table_w_gold_passage_image",
                 "create_table_select_3_full_table_w_all_passage_image"]:
            string = '/*\n{} example rows:\nSELECT * FROM w LIMIT {};\n'.format(num_rows, num_rows)
        elif few_shot_demonstration is False and self.prompt_style in \
                ["create_table_select_3_full_table",
                 "create_table_select_3_full_table_w_gold_passage_image",
                 "create_table_select_3_full_table_w_all_passage_image"]:
            string = '/*\nAll rows of the table:\nSELECT * FROM w;\n'
        else:
            raise ValueError(f"Select x prompt style {self.prompt_style} is not supported.")

        for column_id, header in enumerate(df.columns):
            string += str(header)
            if column_id != len(df.columns) - 1:
                string += '\t'
        string += '\n'

        for row_id, row in df.iloc[:num_rows].iterrows():
            for column_id, header in enumerate(df.columns):
                string += str(row[header])
                if column_id != len(df.columns) - 1:
                    string += '\t'
            string += '\n'
        string += '*/\n'

        return string

    def _passage_prompt(self, passages, only_title, db_style_prompt=True):
        """
        Return the passage prompt.
        """
        if not db_style_prompt:
            string = "Passages: "
            for passage in passages:
                if only_title:
                    string += passage['title'] + ';; '
                else:
                    string += passage['title'] + f" ({passage['text']})" + ';; '
            string = string.rstrip(';; ')
            string += '\n'
            return string
        else:
            if len(passages) == 0:
                return ""
            passage_table_prompt = ""
            _header = []
            _rows = [[]]
            for passage in passages:
                _header.append(passage['title'])
                _rows[0].append(passage['text'])
            passage_table = prepare_df_for_neuraldb_from_table({"header": _header, "rows": _rows})
            passage_table_prompt += _create_table_prompt(passage_table, "Passages")
            if not only_title:
                passage_table_prompt += self._select_x_prompt(
                    df=passage_table,
                    num_rows=passage_table.shape[0]
                )
            return passage_table_prompt

    def _image_prompt(self, images, only_title, db_style_prompt=True):
        """
        Return the image prompt.
        """
        if not db_style_prompt:
            string = "Images: "
            for image in images:
                if only_title:
                    string += image['title'] + ';;'
                else:
                    string += image['title'] + f" ({image['caption']})" + ';; '
            string = string.rstrip(';; ')
            string += '\n'
            return string
        else:
            if len(images) == 0:
                return ""
            image_table_prompt = ""
            _header = []
            _rows = [[]]
            for image in images:
                _header.append(image['title'])
                _rows[0].append(image['caption'])
            image_table = prepare_df_for_neuraldb_from_table({"header": _header, "rows": _rows})
            image_table_prompt += _create_table_prompt(image_table, "Images")
            if not only_title:
                image_table_prompt += self._select_x_prompt(
                    df=image_table,
                    num_rows=image_table.shape[0]
                )
            return image_table_prompt

    def _pick_target_columns(self, df, strategy):
        """
        Pick the controllable target columns for generation.
        """
        if strategy == 'random':
            return random.choice(list(df.columns) + ['*'])
        elif strategy == 'traverse':
            raise NotImplementedError
        else:
            return ValueError

    def _pick_operators(self, df, strategy):
        """
        Pick the controllable operators for generation.
        """
        candidate_operators = ['none', 'count', 'max', 'min', 'sum']
        if strategy == 'random':
            return random.choice(candidate_operators)
        elif strategy == 'traverse':
            raise NotImplementedError
        else:
            return ValueError

    def _pick_nested_levels(self, df, strategy):
        """
        Pick the controllable(maybe) nested levels for generation.
        """
        if strategy == 'fixed':
            return 2
        elif strategy == 'random':
            raise NotImplementedError
        elif strategy == 'traverse':
            raise NotImplementedError
        else:
            raise ValueError

    def build_one_shot_prompt(
            self,
            prompt_type: Tuple,
            table: pd.DataFrame,
            question: str,
            answer_text: str,
            nsql: str,
            passages: Dict = None,
            images: Dict = None,
            title: str = None,
            only_title: bool = False,
            **kwargs
    ):
        """
        Build one-shot prompt with table-question-nsql.
        """
        one_shot_prompt = ""
        if self.prompt_style == 'create_table_select_full_table':
            one_shot_prompt += _create_table_prompt(table, title)
            one_shot_prompt += self._select_x_prompt(
                df=table,
                num_rows=table.shape[0]
            )
        elif self.prompt_style in ['create_table_select_3_full_table', 'create_table_select_3']:
            one_shot_prompt += _create_table_prompt(table, title)
            one_shot_prompt += self._select_x_prompt(
                df=table,
                num_rows=3,
            )
        elif self.prompt_style == 'create_table':
            one_shot_prompt += _create_table_prompt(table, title)
        elif self.prompt_style == 'no_table':
            # No table input, to test Codex QA with only internal knowledge
            pass
        elif self.prompt_style in ['create_table_select_3_full_table_w_all_passage_image']:
            assert passages is not None and images is not None
            one_shot_prompt += _create_table_prompt(table, title)
            one_shot_prompt += self._select_x_prompt(
                df=table,
                num_rows=3,
            )
            all_passages, all_images = [], []
            caption_map = get_caption_map()

            for passage_idx in range(len(passages['id'])):
                all_passages.append({
                    'id': passages['id'][passage_idx],
                    'title': passages['title'][passage_idx],
                    'url': passages['url'][passage_idx],
                    'text': passages['text'][passage_idx]
                })

            for image_idx in range(len(images['id'])):
                all_images.append({
                    "id": images['id'][image_idx],
                    "title": images['title'][image_idx],
                    "url": images['url'][image_idx],
                    "path": images['path'][image_idx],
                    "pic": images['pic'][image_idx],
                    "caption": caption_map[images['id'][image_idx]]
                })

            one_shot_prompt += self._passage_prompt(
                passages=all_passages,
                only_title=only_title
            )
            one_shot_prompt += self._image_prompt(
                images=all_images,
                only_title=only_title
            )
        else:
            raise ValueError('{} is not supported.'.format(self.prompt_style))

        # question and nsql pairs
        if prompt_type == ('question', 'nsql'):
            one_shot_prompt += 'Q: {}\n'.format(question)
            one_shot_prompt += 'NeuralSQL: {}\n'.format(nsql)
        elif prompt_type == ('question', 'sql'):
            one_shot_prompt += 'Q: {}\n'.format(question)
            one_shot_prompt += 'SQL: {}\n'.format(nsql)
        elif prompt_type == ('question', 'answer'):
            one_shot_prompt += 'Q: {}\n'.format(question)
            one_shot_prompt += 'A: {}\n'.format(', '.join(answer_text))
        else:
            raise ValueError(f'Prompt type {prompt_type} is not supported.')

        return one_shot_prompt

    def build_generate_prompt(
            self,
            generate_type: Tuple,
            table: pd.DataFrame,
            question: str = None,
            passages: Dict = None,
            images: Dict = None,
            title: str = None,
            only_title: bool = False,
            supporting_context: Dict = None,
            **kwargs
    ):
        """
        Build the prompt of the generation sample.
        """
        generate_prompt = ""

        # task instruction
        if generate_type == ('answer',):
            generate_prompt += """\n-- Answer the question based on the given table below.\n\n"""
        elif generate_type == ('nsql',):
            generate_prompt += """\n-- Parse the question into NeuralSQL based on the given table below.\n\n"""
        elif generate_type == ('sql',):
            generate_prompt += """\n-- Parse the question into SQL based on the given table below.\n\n"""
        elif generate_type == ('npython',):
            generate_prompt += """\n-- Parse the question into NeuralPython based on the given table below.\n\n"""
        elif generate_type == ('python',):
            generate_prompt += """\n-- Parse the question into Python based on the given table below.\n\n"""
        else:
            generate_prompt += """\n-- Generate NeuralSQL and question pairs based on the given table below.\n\n"""

        # table prompt
        if self.prompt_style in ['create_table_select_full_table', 'create_table_select_3_full_table']:
            generate_prompt += _create_table_prompt(table, title)
            generate_prompt += self._select_x_prompt(
                df=table,
                num_rows=table.shape[0],
                few_shot_demonstration=False
            )
        elif self.prompt_style in ['create_table_select_3']:
            generate_prompt += _create_table_prompt(table, title)
            generate_prompt += self._select_x_prompt(
                df=table,
                num_rows=3,
                few_shot_demonstration=False
            )
        elif self.prompt_style == 'create_table':
            generate_prompt += _create_table_prompt(table, title)
        elif self.prompt_style == 'no_table':
            # No table input, to test Codex QA with only internal knowledge
            pass
        elif self.prompt_style in ['create_table_select_3_full_table_w_all_passage_image']:
            assert passages is not None and images is not None
            generate_prompt += _create_table_prompt(table, title)
            generate_prompt += self._select_x_prompt(
                df=table,
                num_rows=table.shape[0],
                few_shot_demonstration=False
            )
            all_passages, all_images = [], []
            caption_map = get_caption_map()

            for passage_idx in range(len(passages['id'])):
                all_passages.append({
                    'id': passages['id'][passage_idx],
                    'title': passages['title'][passage_idx],
                    'url': passages['url'][passage_idx],
                    'text': passages['text'][passage_idx]
                })

            for image_idx in range(len(images['id'])):
                all_images.append({
                    "id": images['id'][image_idx],
                    "title": images['title'][image_idx],
                    "url": images['url'][image_idx],
                    "path": images['path'][image_idx],
                    "pic": images['pic'][image_idx],
                    "caption": caption_map[images['id'][image_idx]]
                })

            generate_prompt += self._passage_prompt(
                passages=all_passages,
                only_title=only_title
            )
            generate_prompt += self._image_prompt(
                images=all_images,
                only_title=only_title
            )
        elif self.prompt_style in ['create_table_select_3_full_table_w_gold_passage_image']:
            assert passages is not None and images is not None
            generate_prompt += _create_table_prompt(table, title)
            generate_prompt += self._select_x_prompt(
                df=table,
                num_rows=table.shape[0],
                few_shot_demonstration=False
            )
            gold_passages, gold_images = [], []
            caption_map = get_caption_map()
            for doc_id, doc_part in zip(supporting_context['doc_id'], supporting_context['doc_part']):
                if doc_part == 'text':
                    passage_idx = passages['id'].index(doc_id)
                    gold_passages.append({
                        'id': passages['id'][passage_idx],
                        'title': passages['title'][passage_idx],
                        'url': passages['url'][passage_idx],
                        'text': passages['text'][passage_idx]
                    })
                elif doc_part == 'image':
                    image_idx = images['id'].index(doc_id)
                    gold_images.append({
                        "id": images['id'][image_idx],
                        "title": images['title'][image_idx],
                        "url": images['url'][image_idx],
                        "path": images['path'][image_idx],
                        "pic": images['pic'][image_idx],
                        "caption": caption_map[doc_id]
                    })
            generate_prompt += self._passage_prompt(
                passages=gold_passages,
                only_title=only_title
            )
            generate_prompt += self._image_prompt(
                images=gold_images,
                only_title=only_title
            )
        else:
            raise ValueError('{} is not supported.'.format(self.prompt_style))

        # determine the target to generate
        if generate_type == ('answer',):
            generate_prompt += 'Q: {}\n'.format(question)
            generate_prompt += 'A: '
        elif generate_type == ('nsql',):
            generate_prompt += 'Q: {}\n'.format(question)
            generate_prompt += 'NeuralSQL: '
        elif generate_type == ('sql',):
            generate_prompt += 'Q: {}\n'.format(question)
            generate_prompt += 'SQL: '
        elif generate_type == ('npython',):
            generate_prompt += 'Q: {}\n'.format(question)
            generate_prompt += 'NeuralPython: '
        elif generate_type == ('python',):
            generate_prompt += 'Q: {}\n'.format(question)
            generate_prompt += 'Python: '
        else:
            raise ValueError(f'Generate type {generate_type} is not supported.')

        return generate_prompt


class OpenAIQAPromptBuilder(object):
    """
    Static helpers that render a table and a QA item as prompt text.
    """

    @staticmethod
    def _table_lines(table, drop_row_id=True):
        """
        Return ``(header_line, row_lines)``: the tab-joined header and one
        tab-joined line per row of ``table``.

        A leading ``row_id`` column is removed from header and rows when
        ``drop_row_id`` is true. ``table`` itself is not modified.
        """
        header = table['header']
        rows = table['rows']
        if drop_row_id and len(header) > 0 and header[0] == "row_id":
            # Slicing builds new lists, so the caller's table stays intact.
            header = header[1:]
            rows = [row[1:] for row in rows]
        header_line = "\t".join(str(name) for name in header)
        row_lines = ["\t".join(str(cell) for cell in row) for row in rows]
        return header_line, row_lines

    @staticmethod
    def table2codex_prompt(table, table_title=None, drop_row_id=True, ):
        """
        Render ``table`` (a dict with ``'header'`` and ``'rows'``) as a
        tab-separated block inside ``/* ... */``.

        :param table: dict with a ``'header'`` list and a ``'rows'`` list of
            lists.
        :param table_title: when truthy, a ``Table: <title>`` line is put in
            front of the block.
        :param drop_row_id: drop the first column when it is named
            ``row_id``.
        :return: the prompt text, without a trailing newline.
        """
        header_line, row_lines = OpenAIQAPromptBuilder._table_lines(table, drop_row_id)
        prompt_str = 'Table: {}\n'.format(table_title) if table_title else ''
        prompt_str += "/*\n"
        prompt_str += header_line + "\n"
        prompt_str += '\n'.join(row_lines) + "\n"
        prompt_str += "*/"
        return prompt_str

    @staticmethod
    def build_one_shot_prompt(
            item: "QAItem",
            answer_split_token: str = ';',
            verbose: bool = False,
            prompting_method='new_db',
            db_mapping_token="😅"
    ) -> str:
        """
        Build one-shot QA prompt.

        :param item: QA item; read attributes are ``qa_question``
            (``"<type>@<question>"`` with type ``map`` or ``ans``), ``table``,
            ``title`` and ``qa_answer``.
        :param answer_split_token: separator between the answer items.
        :param verbose: not used; kept for interface compatibility.
        :param prompting_method: ``'basic'`` writes the answers on an ``A:``
            line. ``'new_db'`` answers ``map`` questions by repeating the
            table with the answer appended to every row.
        :param db_mapping_token: separator between a row and its answer in
            the ``'new_db'`` table.
        :return: the prompt text.
        :raises ValueError: when the QA type is neither ``map`` nor ``ans``.
        """
        assert prompting_method in ['basic', 'new_db']
        # Split on the first '@' only: the question text may contain '@'.
        qa_type, qa_question = item.qa_question.split('@', 1)
        db_prompt = OpenAIQAPromptBuilder.table2codex_prompt(item.table, item.title)
        prompt = "Give a database as shown below:\n{}\n\n".format(db_prompt)

        if qa_type == "map":
            prompt += "Q: Answer question \"{}\" row by row.".format(qa_question)
            assert answer_split_token is not None
            if prompting_method == 'basic':
                prompt += " The answer should be a list split by '{}' and have {} items in total.".format(
                    answer_split_token, len(item.table['rows']))
                prompt += "\nA: {}\n\n".format(f'{answer_split_token}'.join(item.qa_answer))
            else:
                # Take the lines from the table itself rather than slicing the
                # rendered prompt, whose line offsets depend on whether a
                # title line is present.
                header_line, row_lines = OpenAIQAPromptBuilder._table_lines(item.table)
                assert len(row_lines) == len(
                    item.qa_answer), "answer items and table rows must be in the same number, check annotations"
                db_prompt_lines_with_answer = ["/*", header_line]
                for row_line, qa_answer_item in zip(row_lines, item.qa_answer):
                    db_prompt_lines_with_answer.append(
                        "{}{}{}".format(row_line, db_mapping_token, qa_answer_item))
                db_prompt_lines_with_answer.append("*/")
                prompt += "\n{}\n".format("\n".join(db_prompt_lines_with_answer))
        elif qa_type == "ans":
            prompt += "Q: Answer question \"{}\" for the table.".format(qa_question)
            prompt += " "
            answer_line = "\nA: {}\n".format(f'{answer_split_token}'.join(item.qa_answer))
            # The 'basic' method ends the demonstration with a blank line.
            prompt += answer_line + ("\n" if prompting_method == 'basic' else "")
        else:
            raise ValueError("The QA type is not supported!")

        return prompt