import streamlit as st
import openai
import fitz  # PyMuPDF
from sklearn.metrics.pairwise import cosine_similarity
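# NOTE: this script targets the legacy OpenAI Python SDK (openai<1.0), which
# exposes openai.Embedding.create and openai.ChatCompletion.create as used
# below. In openai>=1.0 these module-level calls were removed in favor of a
# client-based API, so pin the dependency accordingly.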
# Function to extract text from the uploaded PDF file
def extract_pdf_text(pdf_file):
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page in doc:
        text += page.get_text("text")
    return text
# Function to get embeddings for the text
def get_embeddings(texts):
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=texts
    )
    embeddings = [item['embedding'] for item in response['data']]
    return embeddings
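# For "text-embedding-ada-002", each embedding returned above is a
# 1,536-dimensional vector, so every query/chunk cosine similarity computed
# below compares vectors of the same fixed length.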
# Function to get the most relevant context from the PDF for the query
def get_relevant_context(pdf_text, query, num_contexts=3):
    # Split the PDF text into fixed-size chunks for better matching
    pdf_text_chunks = [pdf_text[i:i + 1500] for i in range(0, len(pdf_text), 1500)]
    # Get embeddings for the document chunks and for the query
    pdf_embeddings = get_embeddings(pdf_text_chunks)
    query_embedding = get_embeddings([query])[0]
    # Compute cosine similarity between the query and each document chunk
    similarities = cosine_similarity([query_embedding], pdf_embeddings)
    # Indices of the top-scoring chunks, ordered from most to least similar
    top_indices = similarities[0].argsort()[-num_contexts:][::-1]
    # Combine the top context pieces into a single context string
    relevant_context = " ".join([pdf_text_chunks[i] for i in top_indices])
    return relevant_context
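# Illustrative call, using the names defined in this script: with a loaded
# paper's text, get_relevant_context(pdf_text, "What benchmarks are reported?")
# returns the 3 roughly 1,500-character chunks whose embeddings lie closest
# to the query's embedding.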
# Function to generate a response from the chat model
def generate_response(context, question):
    messages = [
        {"role": "system", "content": "You are a helpful assistant with expert knowledge of GPT-4."},
        {"role": "user", "content": f"Context: {context}\nQuestion: {question}"}
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  # Chat model used to answer questions about the paper
        messages=messages,
        max_tokens=1200,
        temperature=0.7,
    )
    return response['choices'][0]['message']['content'].strip()
# Function to handle irrelevant questions
def is_irrelevant_question(question):
    irrelevant_keywords = ["life", "love", "meaning", "future", "philosophy"]
    return any(keyword in question.lower() for keyword in irrelevant_keywords)
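# Example of the keyword check above:
#   is_irrelevant_question("What is the meaning of life?")   -> True
#   is_irrelevant_question("What is GPT-4's context window?") -> False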
# Streamlit UI
def main():
    st.title("📄 GPT-4 Research Paper Chatbot")
    st.write("💬 Ask any question related to the GPT-4 paper, and I'll try to answer it!")

    # User input: OpenAI API key
    openai_api_key = st.text_input("🔑 Enter your OpenAI API Key:", type="password")

    if openai_api_key:
        openai.api_key = openai_api_key
        st.success("API Key successfully set!")

        # Upload the PDF file
        pdf_file = st.file_uploader("📂 Upload GPT-4 Research Paper PDF", type="pdf")
        if pdf_file is not None:
            # Extract text from the uploaded PDF
            pdf_text = extract_pdf_text(pdf_file)
            st.write("✅ PDF content loaded successfully! Start asking questions.")

            # User input: the question they want to ask
            question = st.text_input("Ask your question:")
            if question:
                # Deflect questions outside the paper's scope
                if is_irrelevant_question(question):
                    st.write("Sorry, I don't know the answer to that question. I'm an expert on GPT-4 only.")
                else:
                    # Get the most relevant context from the document
                    relevant_context = get_relevant_context(pdf_text, question)
                    # Generate the response from the chat model
                    answer = generate_response(relevant_context, question)
                    # Display the answer
                    st.write(f"🤖 Answer: {answer}")
    else:
        st.warning("⚠️ Please enter your OpenAI API Key to use the chatbot.")

if __name__ == "__main__":
    main()
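# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py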