{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Question/answer generation from Drive documents\n",
    "\n",
    "Loads the `.csv`, `.pdf` and `.txt` files found in a Google Drive folder, asks an OpenAI chat model to generate one question/answer pair per file, and saves the results to `question_and_answer_list.json`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "9CrIbR0AK3d1"
   },
   "outputs": [],
   "source": [
    "# Uncomment on a fresh runtime if these packages are missing.\n",
    "# !pip install openai sentence-transformers\n",
    "# !pip install langchain\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install typing_extensions\n"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import getpass\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "\n",
    "import openai\n",
    "import torch\n",
    "from langchain_community.document_loaders import TextLoader, PyPDFLoader, CSVLoader, DirectoryLoader\n",
    "from langchain_community.embeddings.sentence_transformer import (\n",
    "    SentenceTransformerEmbeddings,\n",
    ")\n",
    "from langchain_community.vectorstores import Chroma\n",
    "from transformers import AutoModel"
   ],
   "metadata": {
    "id": "xOFM83MoLQ-B"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mount Google Drive"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "from google.colab import drive\n",
    "\n",
    "# The mount point is relative, so it resolves under the current working directory.\n",
    "# NOTE(review): `directory` in the configuration cell assumes that directory is /content - confirm.\n",
    "drive.mount('new_articles')"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "WMvNDl83M7Xb"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Values a reader may want to tune.\n",
    "OPENAI_MODEL = \"gpt-3.5-turbo\"\n",
    "TEMPERATURE = 0.3\n",
    "directory = \"/content/new_articles/MyDrive/new_articles\"\n",
    "OUTPUT_PATH = \"question_and_answer_list.json\"\n",
    "\n",
    "# Keys of every saved record, and the placeholder used for a missing field.\n",
    "OUTPUT_KEYS = (\"Question\", \"Answer\", \"Reference_article\", \"Reference_text\")\n",
    "NOT_PROVIDED = \"Not provided\"\n",
    "\n",
    "# Never store API keys in the notebook: read the key from the environment,\n",
    "# or prompt for it (without echo) when the variable is not set.\n",
    "# NOTE(review): an earlier revision of this notebook contained a literal key;\n",
    "# treat that key as leaked and revoke it.\n",
    "openai.api_key = os.environ.get(\"OPENAI_API_KEY\") or getpass.getpass(\"OpenAI API key: \")"
   ],
   "metadata": {},
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load documents"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Loader class used for each supported file extension.\n",
    "LOADERS_BY_EXTENSION = {\n",
    "    \".csv\": CSVLoader,\n",
    "    \".pdf\": PyPDFLoader,\n",
    "    \".txt\": TextLoader,\n",
    "}\n",
    "\n",
    "\n",
    "def document_loader(directory):\n",
    "    \"\"\"Load every supported file found directly inside `directory`.\n",
    "\n",
    "    Files are matched by extension (case-insensitive) against\n",
    "    LOADERS_BY_EXTENSION. Anything else, including sub-directories, is skipped.\n",
    "\n",
    "    Args:\n",
    "        directory: Path of the folder to scan (not recursive).\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each loaded filename to the value returned by its\n",
    "        loader's `load()` call.\n",
    "    \"\"\"\n",
    "    documents = {}\n",
    "    # Sorted so that the processing order is the same on every run.\n",
    "    for filename in sorted(os.listdir(directory)):\n",
    "        file_path = os.path.join(directory, filename)\n",
    "        lowered = filename.lower()\n",
    "        loader_class = next(\n",
    "            (cls for ext, cls in LOADERS_BY_EXTENSION.items() if lowered.endswith(ext)),\n",
    "            None,\n",
    "        )\n",
    "        if loader_class is None or not os.path.isfile(file_path):\n",
    "            # Skip rather than `break`: one unsupported entry must not stop\n",
    "            # the remaining files from being loaded.\n",
    "            continue\n",
    "        documents[filename] = loader_class(file_path).load()\n",
    "    return documents\n"
   ],
   "metadata": {
    "id": "QxVY8IyNL3Zp"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate and parse question/answer pairs"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Matches a field label at the start of a line. Case is ignored and `_` or a\n",
    "# space is accepted inside the label, because the prompt asks for\n",
    "# `Reference_Article` while the saved records use `Reference_article`.\n",
    "LABEL_PATTERN = re.compile(\n",
    "    r\"^\\s*(Question|Answer|Reference[_ ]article|Reference[_ ]text)\\s*:\\s*\",\n",
    "    re.IGNORECASE | re.MULTILINE,\n",
    ")\n",
    "\n",
    "\n",
    "def parse_qa_response(text):\n",
    "    \"\"\"Extract the labelled fields from a model reply.\n",
    "\n",
    "    A field's value is everything between its label and the next label (or\n",
    "    the end of the reply), so values may span several lines and the fields\n",
    "    may appear on any line.\n",
    "\n",
    "    Args:\n",
    "        text: Reply text; `None` is treated as an empty reply.\n",
    "\n",
    "    Returns:\n",
    "        dict with exactly the keys in OUTPUT_KEYS; fields that are missing\n",
    "        or empty are set to NOT_PROVIDED.\n",
    "    \"\"\"\n",
    "    text = text or \"\"\n",
    "    parsed = dict.fromkeys(OUTPUT_KEYS, NOT_PROVIDED)\n",
    "    matches = list(LABEL_PATTERN.finditer(text))\n",
    "    for index, match in enumerate(matches):\n",
    "        # Normalise e.g. `Reference_Article` / `reference text` to `Reference_article`.\n",
    "        key = match.group(1).replace(\" \", \"_\").capitalize()\n",
    "        if parsed[key] != NOT_PROVIDED:\n",
    "            continue  # keep the first non-empty occurrence of a label\n",
    "        end = matches[index + 1].start() if index + 1 < len(matches) else len(text)\n",
    "        value = text[match.end():end].strip()\n",
    "        if value:\n",
    "            parsed[key] = value\n",
    "    return parsed\n",
    "\n",
    "\n",
    "def generate_qa(filename, document):\n",
    "    \"\"\"Ask the chat model for one question/answer pair about a loaded file.\n",
    "\n",
    "    Only the first element of `document` is sent to the model.\n",
    "\n",
    "    Args:\n",
    "        filename: Name of the source file, quoted in the prompt.\n",
    "        document: Non-empty sequence returned by a loader's `load()`.\n",
    "\n",
    "    Returns:\n",
    "        dict as produced by `parse_qa_response`.\n",
    "    \"\"\"\n",
    "    doc = document[0].page_content\n",
    "    response = openai.chat.completions.create(\n",
    "        model=OPENAI_MODEL,\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": f\"Generate one Question, Answer,Reference_Article:(use {filename}), Reference_Text from(use block of text which you've used to generate answer {doc})\"},\n",
    "        ],\n",
    "        temperature=TEMPERATURE,\n",
    "    )\n",
    "    return parse_qa_response(response.choices[0].message.content)\n"
   ],
   "metadata": {},
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run and save results"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "documents = document_loader(directory)\n",
    "\n",
    "JSON_DATA = []\n",
    "for filename, document in documents.items():\n",
    "    if not document:\n",
    "        # A loader can return an empty list; indexing it would raise IndexError.\n",
    "        print(f\"Skipping {filename}: no content loaded\")\n",
    "        continue\n",
    "    JSON_DATA.append(generate_qa(filename, document))\n",
    "\n",
    "with open(OUTPUT_PATH, \"w\", encoding=\"utf-8\") as json_file:\n",
    "    json.dump(JSON_DATA, json_file, indent=2)\n",
    "\n",
    "print(f\"JSON data saved to {OUTPUT_PATH}\")\n",
    "\n",
    "print(JSON_DATA)\n"
   ],
   "metadata": {
    "id": "LO9imR5SMA1u"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}