NaimaAqeel committed
Commit 6959bbb · verified · Parent: 03bc240

Update app.py

Files changed (1)
app.py +46 -17
app.py CHANGED
@@ -3,13 +3,13 @@ import gradio as gr
 import fitz  # PyMuPDF for PDF text extraction
 from docx import Document  # python-docx for DOCX text extraction
 from sentence_transformers import SentenceTransformer
-from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import HuggingFaceEmbeddings
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from nltk.tokenize import sent_tokenize
 import torch
 import pickle
 import nltk
+import faiss
+import numpy as np
 
 # Download NLTK punkt tokenizer data if not already downloaded
 nltk.download('punkt', quiet=True)
@@ -40,17 +40,46 @@ def extract_text_from_docx(docx_path):
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
 # Initialize the HuggingFaceEmbeddings for LangChain
-hf_embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
+# Since we're not using it directly for the index, initialization may be skipped here
 
 # Initialize the FAISS index
-index_path = "faiss_index.pkl"
-if os.path.exists(index_path):
-    with open(index_path, "rb") as f:
-        faiss_index = pickle.load(f)
-    print("Loaded FAISS index from faiss_index.pkl")
-else:
-    # Initialize FAISS index using LangChain
-    faiss_index = FAISS(embedding_function=hf_embeddings)
+class FAISSIndex:
+    def __init__(self, dimension):
+        self.dimension = dimension
+        self.index = faiss.IndexFlatL2(dimension)
+        self.sentences = []  # keep the raw text so searches can return it
+
+    def add_sentences(self, sentences, embeddings):
+        # FAISS expects a contiguous float32 matrix
+        embeddings = np.array(embeddings, dtype=np.float32)
+
+        # Embeddings and sentences must line up one-to-one
+        assert len(embeddings) == len(sentences), "Number of embeddings should match number of sentences"
+
+        # Add all embeddings in one batched call and remember their text
+        self.index.add(embeddings)
+        self.sentences.extend(sentences)
+
+    def similarity_search(self, query_embedding, k=5):
+        # Search the index and map result ids back to the stored sentences
+        query_embedding = np.array(query_embedding, dtype=np.float32)
+        D, I = self.index.search(query_embedding, k)
+        return [{"text": self.sentences[i], "score": float(d)} for i, d in zip(I[0], D[0]) if i != -1]
+
+    def __getstate__(self):
+        # Raw faiss indexes are SWIG objects and cannot be pickled directly,
+        # so serialize the index into a numpy buffer first
+        state = self.__dict__.copy()
+        state["index"] = faiss.serialize_index(self.index)
+        return state
+
+    def __setstate__(self, state):
+        state["index"] = faiss.deserialize_index(state["index"])
+        self.__dict__.update(state)
+
+# Initialize the FAISS index instance
+index_dimension = embedding_model.get_sentence_embedding_dimension()  # 384 for all-MiniLM-L6-v2
+faiss_index = FAISSIndex(index_dimension)
 
 def preprocess_text(text):
     sentences = sent_tokenize(text)
@@ -72,13 +101,14 @@ def upload_files(files):
     # Preprocess text
     sentences = preprocess_text(text)
 
-    # Encode sentences and add to FAISS index
+    # Encode sentences
     embeddings = embedding_model.encode(sentences)
-    for sentence, embedding in zip(sentences, embeddings):
-        faiss_index.add_sentence(sentence, embedding)
+
+    # Add sentences to FAISS index
+    faiss_index.add_sentences(sentences, embeddings)
 
     # Save the updated index
-    with open(index_path, "wb") as f:
+    with open("faiss_index.pkl", "wb") as f:
         pickle.dump(faiss_index, f)
 
     return {"message": "Files processed successfully"}
@@ -97,7 +127,7 @@ def process_and_query(state, files, question):
     question_embedding = embedding_model.encode([question])
 
     # Search the FAISS index for similar passages
-    retrieved_results = faiss_index.similarity_search(question, k=5)  # Retrieve top 5 passages
+    retrieved_results = faiss_index.similarity_search(question_embedding, k=5)  # Retrieve top 5 passages
     retrieved_passages = [result['text'] for result in retrieved_results]
 
     # Initialize RAG generator model
@@ -136,4 +166,3 @@ with gr.Blocks() as demo:
     query_button.click(fn=process_and_query, inputs=[query], outputs=query_output)
 
 demo.launch()
-
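
As a quick sanity check of the new class, a hypothetical round trip; the sentences and question below are made up for illustration and are not part of the commit:

    sentences = ["FAISS stores dense vectors.", "Gradio builds quick demos."]
    faiss_index.add_sentences(sentences, embedding_model.encode(sentences))

    # Query with a 2D (1, dim) embedding, exactly as process_and_query does
    query_embedding = embedding_model.encode(["What stores vectors?"])
    print(faiss_index.similarity_search(query_embedding, k=1))
    # -> [{'text': 'FAISS stores dense vectors.', 'score': ...}]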
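
One behaviour the commit drops: the removed LangChain branch reloaded a saved index from faiss_index.pkl at startup, while the new code always starts empty and only ever writes the pickle. A minimal sketch restoring that, assuming the same path and the pickle hooks in the class above:

    import os

    index_path = "faiss_index.pkl"
    if os.path.exists(index_path):
        with open(index_path, "rb") as f:
            faiss_index = pickle.load(f)  # __setstate__ rebuilds the faiss index
        print(f"Loaded FAISS index from {index_path}")
    else:
        faiss_index = FAISSIndex(index_dimension)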
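
IndexFlatL2 ranks passages by raw Euclidean distance. If cosine similarity is the intended metric (the commit does not say), the usual FAISS pattern is to normalize the embeddings and switch to an inner-product index; a sketch, not part of the commit:

    # Cosine similarity via normalized vectors + inner product
    index = faiss.IndexFlatIP(index_dimension)
    embeddings = embedding_model.encode(sentences, normalize_embeddings=True)
    index.add(np.array(embeddings, dtype=np.float32))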