import os
import shutil

import gradio as gr
from huggingface_hub import HfApi, whoami

# NOTE: these langchain import paths target pre-0.1.0 langchain releases;
# newer releases moved these classes into langchain_community / langchain_openai.
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
openai_api_key = os.environ.get("OPENAI_API_KEY")
hf_api = HfApi()
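
# Wraps PDF loading, chunking, FAISS indexing, and a conversational retrieval
# chain behind a single object.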
class AdvancedPdfChatbot:
    def __init__(self, openai_api_key):
        # os.environ values must be strings; only set the key when one was provided
        if openai_api_key:
            os.environ["OPENAI_API_KEY"] = openai_api_key
        self.embeddings = OpenAIEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.llm = ChatOpenAI(temperature=0.5, model_name='gpt-4o', max_tokens=3000)
        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        self.db = None  # FAISS index, built once a PDF has been processed
        self.qa_chain = None
        self.pdf_path = None
        self.template = """
You are a file-based knowledge assistant that interacts with users in a conversational, ChatGPT-like manner. Your only source of knowledge is the user-uploaded file(s), such as PDFs; you do not rely on general knowledge or the internet. You extract, analyze, and synthesize information directly from the content of the provided file(s).

**1. Personality and Tone**
- Be polite, clear, and professional.
- Use formal, academic language when the context requires it.
- Provide concise, well-structured responses, and maintain a helpful and supportive tone.

**2. Core Capabilities**
- Extract and summarize key information from the provided file.
- Answer user questions based on the content of the file.
- Provide in-depth analysis, explanations, and references to the file's content.
- Suggest relevant sections, chapters, or pages where specific information can be found.
- Offer guidance on how to interpret and understand the file's contents.

**3. Knowledge and Scope**
- Your knowledge is limited to the content of the uploaded file(s).
- Do not answer questions unrelated to the file's content unless explicitly requested.
- If a user asks about something not found in the file, say that the information is not available.

**4. Interaction Rules**
- Cite the document's content specifically, including page numbers, sections, or headings where available.
- If a question is ambiguous, politely ask for more details.
- Keep explanations short and clear, but be ready to go deeper if asked.
- Never make up information. If something is not in the file, state clearly that it cannot be found.

**5. Context Awareness**
- Remember the content of the file for the duration of the session.
- Ground every response in the file's content with logical, evidence-backed reasoning.
- If multiple files are uploaded, specify which file each piece of information comes from.

**6. Technical Details**
- Summarize content into concise answers, organized with bullet points, lists, or structured paragraphs.
- When asked for a summary, focus on key points, main arguments, and essential takeaways.
- When asked about a section or heading, search for the relevant text within the file.
- Do not answer beyond the scope of the file, and do not guess.

**7. Example Usage**
User: "Can you summarize the main argument from the introduction of the file?"
Response: "Sure! The introduction discusses [key points] and highlights the central argument that [main idea]. This can be found on page 2 under the heading 'Introduction'."

User: "Where can I find the definition of 'symbolic interactionism' in the document?"
Response: "The definition of 'symbolic interactionism' appears on page 12 under the subheading 'Key Theoretical Concepts'."

User: "Explain the concept of 'cognitive dissonance' as it is presented in the document."
Response: "In the document, 'cognitive dissonance' is defined as [definition from the file]. It appears in the context of [brief explanation] and can be found on page 15 under the section 'Theoretical Foundations'."

NOTE: A DESCRIBE or SUMMARY request should always return an overall summary of the document, well organized and covering its topics in detail.

**End of Prompt**

Context: {context}
Question: {question}
Answer:
"""
        self.prompt = PromptTemplate(template=self.template, input_variables=["context", "question"])
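
    # Load the PDF, split it into overlapping chunks, and build a FAISS index over them.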
    def load_and_process_pdf(self, pdf_path):
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        texts = self.text_splitter.split_documents(documents)
        self.db = FAISS.from_documents(texts, self.embeddings)
        self.pdf_path = pdf_path
        self.setup_conversation_chain()
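
    # Wire the LLM, the FAISS retriever, and the conversation memory into one chain.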
    def setup_conversation_chain(self):
        self.qa_chain = ConversationalRetrievalChain.from_llm(
            self.llm,
            retriever=self.db.as_retriever(),
            memory=self.memory,
            combine_docs_chain_kwargs={"prompt": self.prompt},
        )
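
    # Answer a query against the indexed PDF via the retrieval chain.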
    def chat(self, query):
        if not self.qa_chain:
            return "Please upload a PDF first."
        result = self.qa_chain({"question": query})
        return result['answer']

    def get_pdf_path(self):
        # Return the stored PDF path, if one has been processed.
        if self.pdf_path:
            return self.pdf_path
        return "No PDF uploaded yet."

# Initialize the chatbot
pdf_chatbot = AdvancedPdfChatbot(openai_api_key)
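
# Resolve a per-user storage folder from the Hugging Face login; whoami()
# raises when no token is available, so this returns None for anonymous users.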
def get_user_folder():
    try:
        user_info = whoami()
        username = user_info['name']
        user_folder = f"user_data/{username}"
        os.makedirs(user_folder, exist_ok=True)
        return user_folder
    except Exception:
        return None
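
# Save the uploaded file into the user's folder and index it for retrieval.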
def upload_pdf(pdf_file):
    if pdf_file is None:
        return "Please upload a PDF file."
    user_folder = get_user_folder()
    if user_folder is None:
        return "Please log in to upload a PDF."
    file_path = os.path.join(user_folder, os.path.basename(pdf_file.name))
    # gr.File hands back a temp-file reference; copy it into the user's folder
    # rather than re-reading a handle that may already be closed or exhausted.
    shutil.copy(pdf_file.name, file_path)
    pdf_chatbot.load_and_process_pdf(file_path)
    return file_path
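
# Gradio callback: answer the message, append the (user, bot) turn, clear the textbox.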
def respond(message, history):
    bot_message = pdf_chatbot.chat(message)
    history.append((message, bot_message))
    return "", history
def clear_chatbot():
    pdf_chatbot.memory.clear()
    return []

def get_pdf_path():
    return pdf_chatbot.get_pdf_path()

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Chatbot")
    with gr.Row():
        login_button = gr.LoginButton()
        user_info = gr.Markdown()
    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_button = gr.Button("Process PDF")
        upload_status = gr.Textbox(label="Upload Status")
    upload_button.click(upload_pdf, inputs=[pdf_upload], outputs=[upload_status])

    path_button = gr.Button("Get PDF Path")
    pdf_path_display = gr.Textbox(label="Current PDF Path")

    chatbot_interface = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    msg.submit(respond, inputs=[msg, chatbot_interface], outputs=[msg, chatbot_interface])
    clear.click(clear_chatbot, outputs=[chatbot_interface])
    path_button.click(get_pdf_path, outputs=[pdf_path_display])
    demo.load(lambda: gr.update(visible=True), outputs=[user_info], inputs=None)

if __name__ == "__main__":
    demo.launch()
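
# Local run (assumes an OPENAI_API_KEY and the langchain/gradio versions these imports target):
#   OPENAI_API_KEY=sk-... python app.py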