# main.py
import os
import uuid
from typing import List
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse, HTMLResponse
from fastapi.staticfiles import StaticFiles
import pinecone
import openai
from dotenv import load_dotenv
import PyPDF2
import io

# Load environment variables from .env file
load_dotenv()

# Initialize FastAPI app
app = FastAPI()

# Mount static files
app.mount("/static", StaticFiles(directory="static"), name="static")

# Configure a single shared OpenAI client (reads OPENAI_API_KEY loaded above)
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Pinecone configuration
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")  # legacy setting; unused by the v3+ Pinecone client
INDEX_NAME = "main"
VECTOR_DIM = 3072  # Dimension of 'text-embedding-3-large' embeddings

# Initialize Pinecone and create the index if it does not already exist
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=VECTOR_DIM,
        metric='cosine',
        # The v3+ client requires a spec; the cloud/region values here are
        # assumptions, so adjust them to match your Pinecone project.
        spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(INDEX_NAME)

# In-memory store for bot metadata (demo only; contents are lost on restart)
bots = {}

def generate_gpt4o_mini_response(context: str, query: str) -> str:
    """
    Generate a response using OpenAI's GPT model.
    Uses the chat completions API with the latest model.
    """
    client = openai.OpenAI()
    messages = [
        {"role": "system", "content": "You are a helpful assistant that answers questions based on the given context."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {query}"}
    ]
    
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # You can also use "gpt-4" if you have access
        messages=messages,
        temperature=0.7
    )
    return response.choices[0].message.content.strip()

@app.post("/upload-documents")
async def upload_documents(files: List[UploadFile] = File(...)):
    """
    Accepts file uploads, processes PDFs and other text documents,
    generates embeddings using OpenAI, stores them in Pinecone,
    and returns a unique botid.
    """
    botid = str(uuid.uuid4())
    bots[botid] = {"vectors": []}

    for file in files:
        # Read file content
        content = await file.read()
        
        # Process different file types
        if file.filename.lower().endswith('.pdf'):
            # Handle PDF files
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(content))
            text = ""
            for page in pdf_reader.pages:
                # extract_text() may return None for pages with no extractable text
                text += (page.extract_text() or "") + "\n"
        else:
            # Handle other text files
            text = content.decode('utf-8', errors='ignore')

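        # NOTE: a whole document can exceed the embedding model's input limit
        # (8,191 tokens for text-embedding-3-large), so chunking the text
        # before embedding would be more robust for large files.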
        # Generate embedding using OpenAI
        embedding_response = client.embeddings.create(
            input=text,
            model="text-embedding-3-large"
        )
        vector = embedding_response.data[0].embedding

        # Create a unique ID for this vector
        vector_id = f"{botid}_{file.filename}_{uuid.uuid4()}"

        # Upsert the vector into Pinecone with metadata including the text content
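        # NOTE: Pinecone limits metadata to roughly 40 KB per vector, so
        # storing the full text here can fail for large documents.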
        index.upsert(vectors=[(vector_id, vector, {
            "botid": botid, 
            "filename": file.filename,
            "text": text
        })])
        bots[botid]["vectors"].append(vector_id)

    return {"botid": botid}

@app.post("/query")
async def query_endpoint(botid: str = Form(...), query: str = Form(...)):
    """
    Accepts a botid and user query, retrieves relevant vectors from Pinecone,
    and returns a response generated using GPT-4o-mini proxy.
    """
    client = openai.OpenAI()
    
    # Generate embedding for the query using OpenAI
    query_embedding_response = client.embeddings.create(
        input=query,
        model="text-embedding-3-large"
    )
    query_vector = query_embedding_response.data[0].embedding

    # Query Pinecone for similar vectors associated with the given botid
    response = index.query(
        vector=query_vector,
        top_k=5,
        filter={"botid": {"$eq": botid}},
        include_metadata=True
    )

    # Process the response matches
    matches = response.matches if hasattr(response, 'matches') else []
    
    # If no matches found, the bot doesn't exist or has no content
    if not matches:
        return JSONResponse(status_code=404, content={"error": "No content found for this bot"})

    results = []
    relevant_texts = []
    
    for match in matches:
        if hasattr(match, 'metadata') and match.metadata:
            filename = match.metadata.get('filename', 'Unknown file')
            text = match.metadata.get('text', '')
            score = match.score if hasattr(match, 'score') else 0.0
            
            results.append({
                "filename": filename,
                "score": score
            })
            if text:
                relevant_texts.append(text)

    # Create context from available results and texts
    context = ""
    if results:
        context += "Relevant files: " + ", ".join([r["filename"] for r in results]) + "\n\n"
    if relevant_texts:
        context += "Content from relevant documents:\n" + "\n---\n".join(relevant_texts)
    else:
        context = "No relevant content found"

    # Use GPT-4o-mini to generate an answer grounded in the retrieved context
    answer = generate_gpt4o_mini_response(context, query)

    return {"response": answer, "matches": results}

@app.get("/", response_class=HTMLResponse)
async def root():
    with open("index.html") as f:
        return f.read()

@app.get("/bot.html", response_class=HTMLResponse)
async def bot_page():
    with open("bot.html") as f:
        return f.read()

@app.get("/embed-code")
async def generate_embed_code(botid: str):
    """
    Generates and returns a dynamic embed code snippet for the provided botid.
    """
    base_url = "https://poemsforaphrodite-bot.hf.space"
    embed_snippet = f"""
<!-- SmartlyQ Chatbot Embed Code -->
<div id="smartlyq-chatbot-container"></div>
<script src="{base_url}/static/chatbot.js?botid={botid}"></script>
"""
    return {"embed_code": embed_snippet}