diff --git "a/src/notebook.ipynb" "b/src/notebook.ipynb" new file mode 100644--- /dev/null +++ "b/src/notebook.ipynb" @@ -0,0 +1,856 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "from transformers import pipeline\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, TextGenerationPipeline\n", + "import matplotlib.pyplot as plt\n", + "from dotenv import load_dotenv \n", + "import os\n", + "import re\n", + "from tqdm import tqdm\n", + " \n", + "import io\n", + "import sys\n", + "load_dotenv()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "deepseek_model = {\n", + " \"tokenizer\": AutoTokenizer.from_pretrained(\"deepseek-ai/deepseek-coder-7b-instruct-v1.5\"),\n", + " \"model\" : AutoModelForCausalLM.from_pretrained(\"deepseek-ai/deepseek-coder-7b-instruct-v1.5\")\n", + "}\n", + "\n", + "mistral_model = {\n", + " \"tokenizer\": AutoTokenizer.from_pretrained(\"mistralai/Mistral-7B-Instruct-v0.2\"),\n", + " \"model\" : AutoModelForCausalLM.from_pretrained(\"mistralai/Mistral-7B-Instruct-v0.2\")\n", + "}\n", + "\n", + "codellama_model = {\n", + " \"tokenizer\": AutoTokenizer.from_pretrained(\"codellama/CodeLlama-7b-Instruct-hf\"),\n", + " \"model\" : AutoModelForCausalLM.from_pretrained(\"codellama/CodeLlama-7b-Instruct-hf\")\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "libraries = \"\"\"\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "import seaborn as sns\n", + "from scipy import stats\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt_template = \"\"\"df is a dataframe that {description}. df has these columns: {columns}. Without explaining, write in a python code block the answer to this question: Print {question}\n", + "\"\"\"\n", + "prompt_template = \"\"\"df is a dataframe that {description}. df has these columns: {columns}. Write in a python code block the answer to this question: Print {question}. Just code, no explanation should be given.\n", + "\"\"\"\n", + "\n", + "prompt_template = \"\"\"df is a dataframe that {description}. df has these columns: {columns}. Write in a python code block the answer to this question: {question}. Just write code and print results, no explanation should be given.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_code(text):\n", + " try:\n", + " matches = []\n", + " # pattern = r\"```(.*?)```\"\n", + " pattern = r\"```python(.*?)```\"\n", + " if text:\n", + " matches = re.findall(pattern, text, re.DOTALL)\n", + " if matches:\n", + " return matches[0]\n", + " else:\n", + " raise Exception(\"Error extracting code: No match\")\n", + " except Exception as e:\n", + " raise Exception(\"Error extracting code: \",e) from e\n", + "\n", + "def generate_response(prompt,model_name):\n", + " try:\n", + " coder_model_prompt = [\n", + " {\"role\": \"user\", \"content\": prompt}\n", + " ]\n", + " encodeds = model_name[\"tokenizer\"].apply_chat_template(coder_model_prompt, return_tensors=\"pt\")\n", + "\n", + " model_inputs = encodeds.to(device)\n", + " model_name['model'].to(device)\n", + "\n", + " generated_ids = model_name['model'].generate(model_inputs, max_new_tokens=500, do_sample=False,temperature=0.1,repetition_penalty=1)\n", + " decoded = model_name[\"tokenizer\"].batch_decode(generated_ids)\n", + " return decoded[0].split('[/INST]')[-1].split('')[0]\n", + " except Exception as e:\n", + " raise Exception(\"Error generating: \",e) from e\n", + "\n", + "def execute(code,namespace):\n", + " try:\n", + " buffer = io.StringIO()\n", + " sys.stdout = buffer\n", + " exec(libraries+code,namespace)\n", + "\n", + " sys.stdout = sys.__stdout__\n", + "\n", + " return buffer.getvalue()\n", + "\n", + " except Exception as e:\n", + " raise Exception(\"Error executing: \",e) from e" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "filename = \"titanic.csv\"\n", + "df = pd.read_csv(f'./testing/datasets/{filename}')\n", + "answer1 = df[df['Name'] == \"Mr. Owen Harris Braund\"]['Fare'].iloc[0]\n", + "answer2 = df[df['Sex'] == \"female\"].shape[0]\n", + "answer3 = df['Age'].mean()\n", + "answer4 = df[df['Survived'] == 1].shape[0]\n", + "answer5 = df.loc[df['Fare'].idxmax(), 'Name']\n", + "answer6 = df['Fare'].sum()\n", + "answer7 = df.loc[df['Siblings/Spouses Aboard'].idxmax(), 'Name']\n", + "answer8 = df[df['Name'] == \"Miss. Laina Heikkinen\"]['Age'].iloc[0]\n", + "answer9 = df[df['Pclass'] == 1].shape[0]\n", + "answer10 = df['Fare'].mean()\n", + "answer11 = df[df['Siblings/Spouses Aboard'] == 0].shape[0]\n", + "answer12 = df.loc[df['Age'].idxmax(), 'Name']\n", + "answer13 = df[df['Survived'] == 0].shape[0]\n", + "answer14 = df['Fare'].min()\n", + "\n", + "TC1_questions = [\n", + " {'question': 'What is the fare for Mr. Owen Harris Braund?', 'answer': answer1},\n", + " {'question': 'How many females are in the dataset?', 'answer': answer2},\n", + " {'question': 'What is the average age of the passengers?', 'answer': answer3},\n", + " {'question': 'How many passengers survived?', 'answer': answer4},\n", + " {'question': 'Who paid the highest fare?', 'answer': answer5},\n", + " {'question': 'What is the total amount of fare paid?', 'answer': answer6},\n", + " {'question': 'Who is the passanger that has the highest number of siblings abroad ?', 'answer': answer7},\n", + " {\"question\": \"What is the age of Miss. Laina Heikkinen?\", \"answer\": answer8},\n", + " {\"question\": \"How many passengers are in the 1st class?\", \"answer\": answer9},\n", + " {\"question\": \"What is the average fare paid by passengers?\", \"answer\": answer10},\n", + " {\"question\": \"How many passengers have 0 siblings/spouses aboard?\", \"answer\": answer11},\n", + " {\"question\": \"Who is the oldest passenger in the dataset?\", \"answer\": answer12},\n", + " {\"question\": \"How many passengers did not survive?\", \"answer\": answer13},\n", + " {\"question\": \"What is the lowest fare paid in the dataset?\", \"answer\": answer14}\n", + "]\n", + "\n", + "TC1_description = 'contains the list of people that were on the titanic ship and some details such as age, sex, name and whether they survived or not'\n", + "TC1_columns = \"['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']\"\n", + "\n", + "TC1 = {\n", + " \"dataset\":filename,\n", + " \"description\":TC1_description,\n", + " \"columns\":TC1_columns,\n", + " \"questions\": TC1_questions,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "filename = \"onlinefoods.csv\"\n", + "df = pd.read_csv(f'./testing/datasets/{filename}')\n", + "answer1 = df.loc[df['Gender'] == \"Female\", 'Monthly Income'].iloc[0]\n", + "answer2 = (df['Occupation'] == \"Student\").sum()\n", + "answer3 = df['Age'].mean()\n", + "answer4 = (df['Feedback'] == \"Positive\").sum()\n", + "answer5= df[df['Gender'] == 'Male']['Educational Qualifications'].unique()\n", + "answer6 = (df['Marital Status'] == \"Single\").sum()\n", + "answer7 = df['Marital Status'].mode()[0]\n", + "answer8 = \"Yes\" if any(df['Family size'] > 3) else \"No\"\n", + "answer9 = df.loc[df['latitude'].idxmax(), 'Pin code']\n", + "answer10 = \"Yes\" if any((df['Feedback'] == \"Negative\") & (df['Monthly Income'] == \"Below Rs.10000\")) else \"No\"\n", + "\n", + "TC2_questions=[\n", + " {\"question\": \"What is the monthly income of the first female in the dataset?\", \"answer\": answer1},\n", + " {\"question\": \"How many students are there in the dataset?\", \"answer\": answer2},\n", + " {\"question\": \"What is the average age of participants?\", \"answer\": answer3},\n", + " {\"question\": \"How many participants provided positive feedback?\", \"answer\": answer4},\n", + " {\"question\": \"What is the educational qualification of the male participant?\", \"answer\": answer5},\n", + " {\"question\": \"How many participants are single?\", \"answer\": answer6},\n", + " {\"question\": \"What is the most common marital status in the dataset?\", \"answer\": answer7},\n", + " {\"question\": \"Are there any participants with a family size greater than 3?\", \"answer\": answer8},\n", + " {\"question\": \"What is the pin code for the location with the highest latitude?\", \"answer\": answer9},\n", + " {\"question\": \"Did any participant with negative feedback have a monthly income below Rs.10000?\", \"answer\": answer10}\n", + "]\n", + "\n", + "TC2_description = 'The dataset contains information collected from an online food ordering platform over a period of time.'\n", + "TC2_columns = \"['Age', 'Gender', 'Marital Status', 'Occupation', 'Monthly Income', 'Educational Qualifications', 'Family size', 'latitude', 'longitude', 'Pin code', 'Output', 'Feedback', 'Unnamed: 12']\"\n", + "\n", + "TC2 = {\n", + " \"dataset\":filename,\n", + " \"description\":TC2_description,\n", + " \"columns\":TC2_columns,\n", + " \"questions\": TC2_questions,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "filename = \"hw_200.csv\"\n", + "df = pd.read_csv(f'./testing/datasets/{filename}')\n", + "answer1 = df.loc[1, 'Weight(Pounds)']\n", + "answer2 = df[df['Height(Inches)'] > 70].shape[0]\n", + "answer3 = df['Height(Inches)'].mean()\n", + "answer4 = df[df['Weight(Pounds)'] < 120].shape[0]\n", + "answer5 = df.loc[df['Weight(Pounds)'].idxmax(), 'Index']\n", + "answer6 = df['Weight(Pounds)'].sum()\n", + "answer7 = df.loc[df['Height(Inches)'].idxmin(), 'Index']\n", + "answer8 = df.loc[3, 'Height(Inches)']\n", + "answer9 = df[df['Height(Inches)'] >= 65].shape[0]\n", + "answer10 = df['Weight(Pounds)'].mean()\n", + "answer11 = df[df['Weight(Pounds)'] == 100].shape[0]\n", + "answer12 = df.loc[df['Height(Inches)'].idxmax(), 'Index']\n", + "answer13 = df[df['Weight(Pounds)'] > 150].shape[0]\n", + "answer14 = df['Height(Inches)'].min()\n", + "\n", + "\n", + "TC3_description =\"height and weight for 200 individuals\"\n", + "TC3_columns = \"['Index', 'Height(Inches)', 'Weight(Pounds)']\"\n", + "\n", + "TC3_questions = [\n", + " {'question': 'What is the weight of the person at index 1?', 'answer': answer1},\n", + " {'question': 'How many people are taller than 70 inches?', 'answer': answer2},\n", + " {'question': 'What is the average height of the people?', 'answer': answer3},\n", + " {'question': 'How many people weigh less than 120 pounds?', 'answer': answer4},\n", + " {'question': 'Who has the highest weight?', 'answer': answer5},\n", + " {'question': 'What is the total weight of all people?', 'answer': answer6},\n", + " {'question': 'Who is the shortest person?', 'answer': answer7},\n", + " {\"question\": \"What is the height of the person at index 3?\", \"answer\": answer8},\n", + " {\"question\": \"How many people are at least 65 inches tall?\", \"answer\": answer9},\n", + " {\"question\": \"What is the average weight of the people?\", \"answer\": answer10},\n", + " {\"question\": \"How many people weigh exactly 100 pounds?\", \"answer\": answer11},\n", + " {\"question\": \"Who is the tallest person?\", \"answer\": answer12},\n", + " {\"question\": \"How many people weigh more than 150 pounds?\", \"answer\": answer13},\n", + " {\"question\": \"What is the lowest height recorded in the dataset?\", \"answer\": answer14}\n", + "]\n", + "\n", + "TC3 = {\n", + " \"dataset\":filename,\n", + " \"description\":TC3_description,\n", + " \"columns\":TC3_columns,\n", + " \"questions\": TC3_questions,\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "filename = \"Monthly_Counts_of_Deaths_by_Select_Causes__2014-2019.csv\"\n", + "df = pd.read_csv(f'./testing/datasets/{filename}')\n", + "answer1 = df[(df['Jurisdiction of Occurrence'] == \"United States\") & (df['Year'] == 2014)]['Malignant Neoplasms'].sum()\n", + "answer2 = df[df['Year'] == 2015]['Natural Cause'].sum()\n", + "answer3 = df['Diabetes Mellitus'].mean()\n", + "answer4 = df[(df['Year'] == 2015) & (df['Month'] == 1)]['Intentional Self-Harm (Suicide)'].sum()\n", + "answer5 = df[(df['Year'] == 2014)]['Accidents (Unintentional Injuries)'].idxmax()\n", + "answer6 = df['All Cause'].sum()\n", + "answer7 = df.groupby('Year')['Diseases of Heart'].sum().idxmin()\n", + "answer8 = df[df['Month'] == 1]['Drug Overdose'].sum()\n", + "answer9 = df[df['Chronic Lower Respiratory Diseases'] > 10000].shape[0]\n", + "answer10 = df.groupby('Year')['Cerebrovascular Diseases'].mean()\n", + "answer11 = df[df['Year'] == 2014]['Motor Vehicle Accidents'].sum()\n", + "answer12 = df.groupby('Jurisdiction of Occurrence')['Alzheimer Disease'].sum().idxmax()\n", + "answer13 = df[df['Year'] == 2015]['Assault (Homicide)'].sum()\n", + "answer14 = df['Influenza and Pneumonia'].min()\n", + "answers = [answer1, answer2, answer3, answer4, answer5, answer6, answer7, answer8, answer9, answer10, answer11, answer12, answer13, answer14]\n", + "\n", + "TC4_description =\"Monthly counts of death by select causes.\"\n", + "TC4_columns = \"['Jurisdiction of Occurrence', 'Year', 'Month', 'All Cause', 'Natural Cause', 'Septicemia', 'Malignant Neoplasms', 'Diabetes Mellitus', 'Alzheimer Disease', 'Influenza and Pneumonia', 'Chronic Lower Respiratory Diseases', 'Other Diseases of Respiratory System', 'Nephritis, Nephrotic Syndrome, and Nephrosis', 'Symptoms, Signs, and Abnormal Clinical and Laboratory Findings, Not Elsewhere Classified', 'Diseases of Heart', 'Cerebrovascular Diseases', 'Accidents (Unintentional Injuries)', 'Motor Vehicle Accidents', 'Intentional Self-Harm (Suicide)', 'Assault (Homicide)', 'Drug Overdose']\"\n", + "\n", + "TC4_questions = [\n", + " {'question': 'How many deaths were caused by \"Malignant Neoplasms\" in the United States in 2014?', 'answer': answer1},\n", + " {'question': 'What is the total number of \"Natural Cause\" deaths recorded in 2015?', 'answer': answer2},\n", + " {'question': 'What is the average number of deaths by \"Diabetes Mellitus\" across all years and months?', 'answer': answer3},\n", + " {'question': 'How many deaths were reported as \"Intentional Self-Harm (Suicide)\" in January 2015?', 'answer': answer4},\n", + " {'question': 'Which month in 2014 had the highest number of \"Accidents (Unintentional Injuries)\"?', 'answer': answer5},\n", + " {'question': 'What is the total number of all recorded deaths in the dataset?', 'answer': answer6},\n", + " {'question': 'Which year had the lowest \"Diseases of Heart\" deaths?', 'answer': answer7},\n", + " {\"question\": \"What were the total \\\"Drug Overdose\\\" deaths in January across all years?\", \"answer\": answer8},\n", + " {\"question\": \"How many months have recorded more than 10,000 deaths from \\\"Chronic Lower Respiratory Diseases\\\"?\", \"answer\": answer9},\n", + " {\"question\": \"What is the average number of deaths due to \\\"Cerebrovascular Diseases\\\" each year?\", \"answer\": answer10},\n", + " {\"question\": \"How many \\\"Motor Vehicle Accidents\\\" deaths were there in 2014?\", \"answer\": answer11},\n", + " {\"question\": \"Which jurisdiction recorded the highest \\\"Alzheimer Disease\\\" deaths in the dataset?\", \"answer\": answer12},\n", + " {\"question\": \"How many \\\"Assault (Homicide)\\\" deaths were reported in 2015?\", \"answer\": answer13},\n", + " {\"question\": \"What is the minimum number of \\\"Influenza and Pneumonia\\\" deaths recorded in any month?\", \"answer\": answer14}\n", + "]\n", + "\n", + "TC4 = {\n", + " \"dataset\":filename,\n", + " \"description\":TC4_description,\n", + " \"columns\":TC4_columns,\n", + " \"questions\": TC4_questions,\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def test(qa,description,columns,dataset):\n", + " data = []\n", + " df = pd.read_csv(f'./testing/datasets/{dataset}')\n", + " namespace = {'df': df}\n", + " try: \n", + " full_response= None\n", + " extracted_code= None\n", + " execution= None\n", + " error = None\n", + " question = qa['question']\n", + " answer = qa['answer']\n", + " prompt = prompt_template.format(description=description,columns=columns,question=question)\n", + " try:\n", + " full_response = generate_response(prompt,mistral_model)\n", + " extracted_code = extract_code(full_response)\n", + " execution = execute(extracted_code,namespace)\n", + " \n", + " except Exception as e:\n", + " error = e\n", + "\n", + " data.append({ \n", + " 'question': question,\n", + " 'prompt':prompt,\n", + " 'full_response': full_response,\n", + " 'extracted_code': extracted_code,\n", + " 'execution': execution,\n", + " 'answer':answer,\n", + " 'error': error\n", + " })\n", + " \n", + " return data\n", + " \n", + " except Exception as e:\n", + " print(e)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/zidane/anaconda3/envs/proj_env/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:492: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.1` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n", + " warnings.warn(\n", + "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n" + ] + } + ], + "source": [ + "data = test(TC1[\"questions\"][3],TC1[\"description\"],TC1[\"columns\"],TC1[\"dataset\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'question': 'What is the average age of the passengers?',\n", + " 'prompt': \"df is a dataframe that contains the list of people that were on the titanic ship and some details such as age, sex, name and whether they survived or not. df has these columns: ['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']. Without explaining, write in a python code block the answer to this question: Print What is the average age of the passengers?\\n\",\n", + " 'full_response': ' ```python\\n# Assuming \\'df\\' is your DataFrame object\\naverage_age = df[\\'Age\\'].mean()\\nprint(\"The average age of the passengers is:\", average_age)\\n```',\n", + " 'extracted_code': '\\n# Assuming \\'df\\' is your DataFrame object\\naverage_age = df[\\'Age\\'].mean()\\nprint(\"The average age of the passengers is:\", average_age)\\n',\n", + " 'execution': 'The average age of the passengers is: 29.471443066516347\\n',\n", + " 'answer': 29.471443066516347,\n", + " 'error': None}]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "data = [] \n", + "\n", + "for TC in [TC1,TC2,TC3,TC4]:\n", + " filename = TC[\"dataset\"]\n", + " df = pd.read_csv(f'./testing/datasets/{filename}')\n", + " namespace = {'df': df}\n", + " for qa in tqdm(TC['questions']):\n", + " try: \n", + " full_response= None\n", + " extracted_code= None\n", + " execution= None\n", + " error = None\n", + " inference_time = 0\n", + " question = qa['question']\n", + " answer = qa['answer']\n", + " prompt = prompt_template.format(description=TC['description'],columns=TC['columns'],question=question)\n", + " try:\n", + " start_time = time.time() \n", + " full_response = generate_response(prompt,deepseek_model)\n", + " end_time = time.time()\n", + " extracted_code = extract_code(full_response)\n", + " execution = execute(extracted_code,namespace)\n", + " inference_time = end_time - start_time\n", + " except Exception as e:\n", + " error = e\n", + "\n", + " data.append({\n", + " 'dataset':filename,\n", + " 'question': question,\n", + " 'prompt':prompt,\n", + " 'full_response': full_response,\n", + " 'extracted_code': extracted_code,\n", + " 'error': error,\n", + " 'inference time':inference_time,\n", + " 'answer':answer,\n", + " 'execution': execution,\n", + " \n", + " })\n", + " \n", + " except Exception as e:\n", + " print(e)\n", + " continue\n", + "\n", + "testdf = pd.DataFrame(data)\n", + "testdf.to_excel('results.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "testdf = pd.DataFrame(data)\n", + "testdf.to_excel('results.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt1 = \"\"\"df is a dataframe that {description}. df has these columns: {columns}. Without explaining, write in a python code block the answer to this question: Print {question}\n", + "\"\"\"\n", + "prompt2 = \"\"\"df is a dataframe that {description}. df has these columns: {columns}. Write in a python code block the answer to this question: Print {question}. Just code, no explanation should be given.\n", + "\"\"\"\n", + "\n", + "prompt3 = \"\"\"df is a dataframe that {description}. df has these columns: {columns}. Write in a python code block the answer to this question: {question}. Just write code and print results, no explanation should be given.\n", + "\"\"\" " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Load each DataFrame from Excel files\n", + "results_codellama_prompt3 = pd.read_excel('./testing/results_codellama.xlsx')\n", + "results_deepseek_prompt1 = pd.read_excel('./testing/results_deepseek_prompt1.xlsx')\n", + "results_deepseek_prompt2 = pd.read_excel('./testing/results_deepseek_prompt2.xlsx')\n", + "results_deepseek_prompt3 = pd.read_excel('./testing/results_deepseek_prompt3.xlsx')\n", + "results_mistral_prompt3 = pd.read_excel('./testing/results_mistral_prompt3.xlsx')\n", + "results_mistral_prompt1 = pd.read_excel('./testing/results_mistral_prompt1.xlsx')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Evaluation\n", + "True 34\n", + "False 18\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results_mistral_prompt1['Evaluation'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "results_codellama= pd.read_excel('./testing/results_codellama_prompt3.xlsx')\n", + "results_deepseek = pd.read_excel('./testing/results_deepseek_prompt3.xlsx')\n", + "results_mistral = pd.read_excel('./testing/results_mistral_prompt1.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Normalize 'evaluation' column in each DataFrame\n", + "dataframes = {\n", + " 'Codellama': results_codellama,\n", + " 'Deepseek': results_deepseek,\n", + " 'Mistral': results_mistral,\n", + "}\n", + "\n", + "# Create a DataFrame to store the counts\n", + "evaluation_counts = pd.DataFrame()\n", + "\n", + "for name, df in dataframes.items():\n", + " count = df['Evaluation'].value_counts().rename_axis('Evaluation').reset_index(name=name)\n", + " if evaluation_counts.empty:\n", + " evaluation_counts = count\n", + " else:\n", + " evaluation_counts = pd.merge(evaluation_counts, count, on='Evaluation', how='outer')\n", + "\n", + "# Set index for plotting\n", + "evaluation_counts.set_index('Evaluation', inplace=True)\n", + "\n", + "# Plotting\n", + "evaluation_counts.plot(kind='bar', figsize=(10, 6),color=['#1f77b4', '#ff7f0e', '#2ca02c'])\n", + "plt.title('Comparison of Evaluation Counts Across Datasets')\n", + "plt.xlabel('Evaluation')\n", + "plt.ylabel('Counts')\n", + "plt.xticks(rotation=0)\n", + "plt.grid(True)\n", + "plt.legend(title='Dataset')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CodellamaDeepseekMistral
Evaluation
False30818
True224434
\n", + "
" + ], + "text/plain": [ + " Codellama Deepseek Mistral\n", + "Evaluation \n", + "False 30 8 18\n", + "True 22 44 34" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluation_counts" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def calculate_true_ratio(df, column_name):\n", + " if column_name in df.columns:\n", + " value_counts = df[column_name].value_counts(normalize=True)\n", + " return value_counts.get(True, 0) # Returns 0 if True is not found\n", + " return None\n", + "\n", + "# Calculate ratios\n", + "true_ratios = {\n", + " 'Codellama': calculate_true_ratio(results_codellama, 'Evaluation'),\n", + " 'Deepseek': calculate_true_ratio(results_deepseek, 'Evaluation'),\n", + " 'Mistral': calculate_true_ratio(results_mistral, 'Evaluation'),\n", + "}\n", + "\n", + "# Prepare the plot\n", + "fig, ax = plt.subplots()\n", + "ax.bar(true_ratios.keys(), true_ratios.values(), color=['#1f77b4', '#ff7f0e', '#2ca02c'])\n", + "ax.set_ylabel('True/Total Ratio')\n", + "ax.set_title('True/Total Ratio of Evaluation Across DataFrames')\n", + "ax.set_ylim([0, 1]) # Ensure the y-axis starts at 0 and ends at 1\n", + "plt.xticks(rotation=45)\n", + "plt.tight_layout()\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "average_inference_time = {\n", + " 'Codellama':results_codellama['inference time'][results_codellama['inference time'] != 0].mean(),\n", + " 'Deepseek':results_deepseek['inference time'][results_deepseek['inference time'] != 0].mean(),\n", + " 'Mistral':results_mistral['inference time'][results_mistral['inference time'] != 0].mean()\n", + "}\n", + "fig, ax = plt.subplots()\n", + "ax.bar(average_inference_time.keys(), average_inference_time.values(), color=['#1f77b4', '#ff7f0e', '#2ca02c'])\n", + "ax.set_ylabel('Average Inference Time (excluding zeros)')\n", + "ax.set_title('Comparison of Average Inference Times')\n", + "plt.xticks(rotation=45)\n", + "plt.tight_layout() # Adjust layout to make room for rotated x-labels\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Codellama ': 0.4230769230769231,\n", + " 'Deepseek': 0.8461538461538461,\n", + " 'Mistral': 0.6538461538461539}" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "true_ratios" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Codellama': 2.2758194498113684,\n", + " 'Deepseek': 3.4430570507049563,\n", + " 'Mistral': 3.781105268833249}" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "average_inference_time" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "ratio_division = {key: true_ratios[key] / average_inference_time[key] for key in true_ratios if key in average_inference_time}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Codellama': 0.18590091718918644,\n", + " 'Deepseek': 0.2457565569471463,\n", + " 'Mistral': 0.17292460996408962}" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ratio_division" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}