girishwangikar committed on
Commit
e729802
·
verified ·
1 Parent(s): d25a56b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -48
app.py CHANGED
@@ -1,71 +1,62 @@
1
  import os
2
  import gradio as gr
3
- from langchain_groq import ChatGroq
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain.chains.combine_documents import create_stuff_documents_chain
6
- from langchain_core.prompts import ChatPromptTemplate
7
- from langchain.chains import create_retrieval_chain
8
- from langchain_community.vectorstores import FAISS
9
- from langchain_community.document_loaders import PyPDFLoader
10
- from langchain_community.embeddings import HuggingFaceEmbeddings
11
  from dotenv import load_dotenv
12
- from pydantic import ConfigDict
13
 
14
- load_dotenv() # Load the GROQ API KEY
15
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
16
 
17
- # Configure Pydantic to allow arbitrary types
18
- config = ConfigDict(arbitrary_types_allowed=True)
 
19
 
20
- llm = ChatGroq(
21
- temperature=0,
22
- model_name='llama-3.1-8b-instant',
23
- groq_api_key=GROQ_API_KEY,
24
- model_config=config
25
- )
26
-
27
- prompt = ChatPromptTemplate.from_template("""
28
- Answer the questions based on the provided context only.
29
- Please provide the most accurate response based on the question
30
- <context>{context}</context>
31
- Question: {input}
32
- """)
33
-
34
- embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
35
-
36
- # Global variable to store the vector store
37
- vectors = None
38
 
39
  def clear_knowledge_base():
40
- global vectors
41
- vectors = None
42
  return "Knowledge base cleared."
43
 
44
  def process_pdf(file):
45
- global vectors
46
  if file is not None:
47
- loader = PyPDFLoader(file.name)
48
- docs = loader.load()
49
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
50
- final_documents = text_splitter.split_documents(docs)
51
- vectors = FAISS.from_documents(final_documents, embeddings)
52
- return "PDF processed and added to the knowledge base."
 
 
 
 
53
  return "No file uploaded."
54
 
55
  def process_question(question):
56
- global vectors
57
- if vectors is None:
58
  return "Please upload a PDF first.", "", 0
 
 
 
 
 
 
 
 
59
 
60
- document_chain = create_stuff_documents_chain(llm, prompt)
61
- retriever = vectors.as_retriever()
62
- retrieval_chain = create_retrieval_chain(retriever, document_chain)
63
- response = retrieval_chain.invoke({'input': question})
64
 
65
- context = "\n\n".join([doc.page_content for doc in response["context"]])
66
- confidence_score = sum([doc.metadata.get('score', 0) for doc in response["context"]]) / len(response["context"])
67
 
68
- return response['answer'], context, round(confidence_score, 2)
69
 
70
  CSS = """
71
  .duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}
 
1
  import os
2
  import gradio as gr
3
+ from transformers import pipeline
4
+ from sentence_transformers import SentenceTransformer
5
+ from pypdf import PdfReader
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ import numpy as np
 
 
 
8
  from dotenv import load_dotenv
 
9
 
10
+ load_dotenv()
11
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
12
 
13
+ # Initialize models
14
+ qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
15
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
16
 
17
+ # Global variable to store the document chunks and their embeddings
18
+ document_store = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  def clear_knowledge_base():
21
+ global document_store
22
+ document_store = []
23
  return "Knowledge base cleared."
24
 
25
  def process_pdf(file):
26
+ global document_store
27
  if file is not None:
28
+ reader = PdfReader(file.name)
29
+ text = ""
30
+ for page in reader.pages:
31
+ text += page.extract_text() + "\n"
32
+
33
+ # Simple text splitting (you might want to implement a more sophisticated method)
34
+ chunks = [text[i:i+1000] for i in range(0, len(text), 900)]
35
+
36
+ document_store = [(chunk, embedding_model.encode(chunk)) for chunk in chunks]
37
+ return f"PDF processed. {len(chunks)} chunks added to the knowledge base."
38
  return "No file uploaded."
39
 
40
  def process_question(question):
41
+ global document_store
42
+ if not document_store:
43
  return "Please upload a PDF first.", "", 0
44
+
45
+ question_embedding = embedding_model.encode(question)
46
+
47
+ # Find the most relevant chunks
48
+ similarities = [cosine_similarity([question_embedding], [doc_embedding])[0][0] for _, doc_embedding in document_store]
49
+ top_chunk_indices = np.argsort(similarities)[-3:][::-1] # Get top 3 most similar chunks
50
+
51
+ context = "\n".join([document_store[i][0] for i in top_chunk_indices])
52
 
53
+ # Use the QA model to get the answer
54
+ qa_result = qa_model(question=question, context=context)
 
 
55
 
56
+ answer = qa_result['answer']
57
+ confidence_score = qa_result['score']
58
 
59
+ return answer, context, round(confidence_score, 2)
60
 
61
  CSS = """
62
  .duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}