{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split the SQL training set into subsets\n",
    "\n",
    "Reads `../train-data/sql_train.tsv` and writes filtered copies into the same directory: short queries, queries grouped by the table named after `FROM`, and queries with / without a `JOIN`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR = Path(\"../train-data\")\n",
    "MAX_SHORT_QUERY_CHARS = 90  # queries strictly shorter than this are exported as \"short\"\n",
    "FROM_TABLES = (\"game\", \"team\", \"other_stats\")  # each table gets its own export file\n",
    "\n",
    "\n",
    "def write_tsv(frame: pd.DataFrame, filename: str) -> None:\n",
    "    \"\"\"Write `frame` to `DATA_DIR / filename` as a tab-separated file, omitting the index.\"\"\"\n",
    "    frame.to_csv(DATA_DIR / filename, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(DATA_DIR / \"sql_train.tsv\", sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['natural_query', 'sql_query', 'result'], dtype='object')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## By character count\n",
    "\n",
    "Keep only the rows whose `sql_query` is shorter than `MAX_SHORT_QUERY_CHARS` characters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "short_queries = df[df['sql_query'].str.len() < MAX_SHORT_QUERY_CHARS]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "write_tsv(short_queries, f\"less_than_{MAX_SHORT_QUERY_CHARS}.tsv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## By table named after FROM\n",
    "\n",
    "Group the queries by the first identifier that follows the first `FROM` keyword and write one file per table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helper column: first identifier after the first FROM keyword (NaN when the query has none).\n",
    "# The leading \\b stops FROM from matching the tail of a longer identifier such as `date_from`,\n",
    "# in the same way the JOIN pattern further down is anchored with \\b.\n",
    "# NOTE: every file written after this cell also contains the `after_from` column.\n",
    "df['after_from'] = df['sql_query'].str.extract(r'\\bFROM\\s+(\\w+)', flags=re.IGNORECASE)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0 team\n",
       "1 game\n",
       "2 game\n",
       "3 game\n",
       "4 game\n",
       " ... \n",
       "1039 game\n",
       "1040 game\n",
       "1041 other_stats\n",
       "1042 other_stats\n",
       "1043 game\n",
       "Name: after_from, Length: 1044, dtype: object"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['after_from']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['team', 'game', 'other_stats'], dtype=object)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['after_from'].dropna().unique()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One export per table in FROM_TABLES: queries_from_game.tsv, queries_from_team.tsv, ...\n",
    "for table in FROM_TABLES:\n",
    "    table_queries = df[df['after_from'] == table]\n",
    "    write_tsv(table_queries, f\"queries_from_{table}.tsv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## With / without JOIN\n",
    "\n",
    "Split the queries on whether they contain the standalone word `JOIN` (case-insensitive)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boolean mask computed once and reused for both halves of the split.\n",
    "# na=False makes rows with a missing sql_query count as \"no JOIN\".\n",
    "has_join = df['sql_query'].str.contains(r'\\bJOIN\\b', case=False, na=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Queries that contain the word JOIN\n",
    "df_with_join = df[has_join]\n",
    "write_tsv(df_with_join, \"with_join.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Queries that do NOT contain the word JOIN\n",
    "df_without_join = df[~has_join]\n",
    "write_tsv(df_without_join, \"without_join.tsv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "CSCI544",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}