{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ebeba428",
   "metadata": {},
   "source": [
    "# ✅ RAG JuJutsu PoC (Notebook with Joblib, FAISS, ChatGPT API)\n",
    "\n",
    "Small retrieval-augmented-generation demo: chunk a PDF, embed the chunks\n",
    "with the OpenAI embeddings API, index them with FAISS, persist with joblib,\n",
    "and compare chat answers with and without retrieved context."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bdfd3c8",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Use %pip (not !pip) so packages install into this kernel's environment.\n",
    "# openai>=1.0 is required: the cells below use the v1 client API.\n",
    "%pip install --quiet \"openai>=1.0\" langchain faiss-cpu PyPDF2 joblib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49ee7721",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports and configuration in one cell so Restart & Run All works.\n",
    "import faiss\n",
    "import joblib\n",
    "import numpy as np\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from openai import OpenAI\n",
    "from PyPDF2 import PdfReader\n",
    "\n",
    "# The client reads OPENAI_API_KEY from the environment; never hardcode keys.\n",
    "client = OpenAI()\n",
    "\n",
    "EMBED_MODEL = \"text-embedding-3-small\"\n",
    "CHAT_MODEL = \"gpt-3.5-turbo\"\n",
    "MODEL_PATH = \"rag_model.joblib\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8109b626-0179-43e2-b924-65afe9af1e4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_pdf_chunks(pdf_path, chunk_size=500, chunk_overlap=50):\n",
    "    \"\"\"Extract the text of a PDF and split it into overlapping chunks.\n",
    "\n",
    "    Args:\n",
    "        pdf_path: path to the PDF file.\n",
    "        chunk_size: target chunk length in characters.\n",
    "        chunk_overlap: characters shared between consecutive chunks.\n",
    "\n",
    "    Returns:\n",
    "        List of text chunks.\n",
    "    \"\"\"\n",
    "    reader = PdfReader(pdf_path)\n",
    "    # extract_text() may return None for image-only pages; treat those as empty.\n",
    "    raw_text = \"\\n\".join(page.extract_text() or \"\" for page in reader.pages)\n",
    "    splitter = RecursiveCharacterTextSplitter(\n",
    "        chunk_size=chunk_size, chunk_overlap=chunk_overlap\n",
    "    )\n",
    "    return splitter.split_text(raw_text)\n",
    "\n",
    "chunks = load_pdf_chunks(\"JuJutsu-Contexto-Significado-Conexiones-Historia.pdf\")\n",
    "print(f\"Loaded {len(chunks)} chunks\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "371c637e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_openai_embeddings(texts, batch_size=100):\n",
    "    \"\"\"Embed a list of texts with the OpenAI v1 embeddings API.\n",
    "\n",
    "    Requests are batched (one call per `batch_size` texts) instead of one\n",
    "    call per text. Returns a float32 ndarray because FAISS requires float32.\n",
    "    \"\"\"\n",
    "    vectors = []\n",
    "    for start in range(0, len(texts), batch_size):\n",
    "        batch = texts[start:start + batch_size]\n",
    "        response = client.embeddings.create(model=EMBED_MODEL, input=batch)\n",
    "        vectors.extend(item.embedding for item in response.data)\n",
    "    return np.asarray(vectors, dtype=\"float32\")\n",
    "\n",
    "embeddings = get_openai_embeddings(chunks)\n",
    "index = faiss.IndexFlatL2(embeddings.shape[1])\n",
    "index.add(embeddings)\n",
    "\n",
    "# A raw FAISS index is a SWIG-wrapped object and does not pickle reliably;\n",
    "# serialize it to a byte buffer first so joblib can persist it safely.\n",
    "joblib.dump((chunks, faiss.serialize_index(index)), MODEL_PATH)\n",
    "print(f\"Chunks and index serialized to {MODEL_PATH}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28ce4963",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the persisted artifacts (the inverse of the cell above).\n",
    "chunks, index_bytes = joblib.load(MODEL_PATH)\n",
    "index = faiss.deserialize_index(index_bytes)\n",
    "print(f\"Chunks and index loaded from {MODEL_PATH}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51a89e77",
   "metadata": {},
   "outputs": [],
   "source": [
    "def search(query, k=3):\n",
    "    \"\"\"Return the k stored chunks closest to `query` in embedding space.\"\"\"\n",
    "    response = client.embeddings.create(model=EMBED_MODEL, input=query)\n",
    "    query_vec = np.asarray([response.data[0].embedding], dtype=\"float32\")\n",
    "    _scores, indices = index.search(query_vec, k)\n",
    "    # FAISS pads results with -1 when fewer than k vectors are indexed.\n",
    "    return [chunks[i] for i in indices[0] if i != -1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34315775",
   "metadata": {},
   "outputs": [],
   "source": [
    "def chat_no_rag(question):\n",
    "    \"\"\"Baseline answer: the model sees only the question, no context.\"\"\"\n",
    "    response = client.chat.completions.create(\n",
    "        model=CHAT_MODEL,\n",
    "        messages=[{\"role\": \"user\", \"content\": question}],\n",
    "        temperature=0.5,\n",
    "        max_tokens=200,\n",
    "    )\n",
    "    return response.choices[0].message.content\n",
    "\n",
    "def chat_with_rag(question, retrieved_chunks):\n",
    "    \"\"\"Answer the question using the retrieved chunks as context.\"\"\"\n",
    "    context = \"\\n\".join(retrieved_chunks)\n",
    "    prompt = f\"Usa el siguiente contexto para responder la pregunta:\\n\\n{context}\\n\\nPregunta: {question}\"\n",
    "    response = client.chat.completions.create(\n",
    "        model=CHAT_MODEL,\n",
    "        messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "        temperature=0.3,\n",
    "        max_tokens=200,\n",
    "    )\n",
    "    return response.choices[0].message.content\n",
    "\n",
    "def chat_with_rag_enhanced(question, retrieved_chunks):\n",
    "    \"\"\"Answer with retrieved context plus a role-priming prompt and lower temperature.\"\"\"\n",
    "    context = \"\\n\".join(retrieved_chunks)\n",
    "    prompt = (\n",
    "        \"Eres un experto en historia marcial. \"\n",
    "        \"Usa el siguiente contexto histórico para responder con precisión y detalle.\\n\\n\"\n",
    "        f\"Contexto:\\n{context}\\n\\n\"\n",
    "        f\"Pregunta: {question}\\nRespuesta:\"\n",
    "    )\n",
    "    response = client.chat.completions.create(\n",
    "        model=CHAT_MODEL,\n",
    "        messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "        temperature=0.2,\n",
    "        max_tokens=200,\n",
    "    )\n",
    "    return response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "900dfdfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example query: compare the three answering strategies side by side.\n",
    "query = \"¿Cuál es el origen del JuJutsu en Japón?\"\n",
    "retrieved = search(query)\n",
    "\n",
    "print(\"🔹 Sin RAG:\")\n",
    "print(chat_no_rag(query))\n",
    "\n",
    "print(\"\\n🔹 Con RAG:\")\n",
    "print(chat_with_rag(query, retrieved))\n",
    "\n",
    "print(\"\\n🔹 Con RAG + Prompt mejorado:\")\n",
    "print(chat_with_rag_enhanced(query, retrieved))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}