Update app.py
app.py CHANGED
@@ -3,13 +3,13 @@ import gradio as gr
 import fitz  # PyMuPDF for PDF text extraction
 from docx import Document  # python-docx for DOCX text extraction
 from sentence_transformers import SentenceTransformer
-from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import HuggingFaceEmbeddings
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from nltk.tokenize import sent_tokenize
 import torch
 import pickle
 import nltk
+import faiss
+import numpy as np
 
 # Download NLTK punkt tokenizer data if not already downloaded
 nltk.download('punkt', quiet=True)
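Note on the import change: the commit drops LangChain's FAISS wrapper in favour of the faiss library used directly with numpy. As a point of reference (a minimal sketch, not part of the commit; the sample vectors are made up, and 384 is the output size of all-MiniLM-L6-v2), a flat L2 index is typically built and queried like this, with float32 arrays of shape (n, dim):

import faiss
import numpy as np

dim = 384                                    # all-MiniLM-L6-v2 produces 384-dim embeddings
index = faiss.IndexFlatL2(dim)               # exact (brute-force) L2 index

vectors = np.random.rand(10, dim).astype("float32")   # faiss expects float32, shape (n, dim)
index.add(vectors)                            # add all rows in one call

query = np.random.rand(1, dim).astype("float32")       # queries are also (n, dim)
distances, ids = index.search(query, 3)       # returns L2 distances and row ids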
@@ -40,17 +40,33 @@ def extract_text_from_docx(docx_path):
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
 # Initialize the HuggingFaceEmbeddings for LangChain
-
+# Since we're not using it directly for index, initialization may be skipped here
 
 # Initialize the FAISS index
-
-
-
-
-
-
-
-
+class FAISSIndex:
+    def __init__(self, dimension):
+        self.dimension = dimension
+        self.index = faiss.IndexFlatL2(dimension)
+
+    def add_sentences(self, sentences, embeddings):
+        # Ensure embeddings are numpy arrays
+        embeddings = np.array(embeddings)
+
+        # Check if embeddings and sentences have the same length
+        assert len(embeddings) == len(sentences), "Number of embeddings should match number of sentences"
+
+        # Add each sentence embedding to the index
+        for emb in embeddings:
+            self.index.add(np.expand_dims(emb, axis=0))
+
+    def similarity_search(self, query_embedding, k=5):
+        # Search for similar embeddings in the index
+        D, I = self.index.search(query_embedding, k)
+        return [{"text": str(i), "score": float(d)} for i, d in zip(I[0], D[0])]
+
+# Initialize the FAISS index instance
+index_dimension = 512  # Dimensionality of SentenceTransformer embeddings
+faiss_index = FAISSIndex(index_dimension)
 
 def preprocess_text(text):
     sentences = sent_tokenize(text)
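Two things worth flagging in the new FAISSIndex. First, index_dimension is hard-coded to 512, but all-MiniLM-L6-v2 returns 384-dimensional vectors, so adding them to IndexFlatL2(512) would fail; the dimension can be read from the model instead. Second, the index stores only vectors, so similarity_search returns FAISS row ids rendered as strings rather than the retrieved text that retrieved_passages later expects. A minimal sketch of one way to address both (hypothetical code, not what the commit does) keeps the original sentences next to the index:

class FAISSIndex:
    """Flat L2 index that keeps the original sentences so searches can return text."""

    def __init__(self, dimension):
        self.index = faiss.IndexFlatL2(dimension)
        self.sentences = []  # row i of the index corresponds to self.sentences[i]

    def add_sentences(self, sentences, embeddings):
        embeddings = np.asarray(embeddings, dtype="float32")
        assert len(embeddings) == len(sentences), "one embedding per sentence"
        self.index.add(embeddings)          # faiss accepts the whole (n, dim) batch at once
        self.sentences.extend(sentences)

    def similarity_search(self, query_embedding, k=5):
        query = np.asarray(query_embedding, dtype="float32")
        D, I = self.index.search(query, k)
        return [{"text": self.sentences[i], "score": float(d)} for i, d in zip(I[0], D[0])]

# The model's true output size can be read at runtime instead of hard-coding 512:
# embedding_model.get_sentence_embedding_dimension() returns 384 for all-MiniLM-L6-v2.

Since faiss takes a whole float32 batch in a single add() call, the per-row loop in the committed version is not needed, though it is not incorrect.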
@@ -72,13 +88,14 @@ def upload_files(files):
     # Preprocess text
     sentences = preprocess_text(text)
 
-    # Encode sentences
+    # Encode sentences
     embeddings = embedding_model.encode(sentences)
-
-
+
+    # Add sentences to FAISS index
+    faiss_index.add_sentences(sentences, embeddings)
 
     # Save the updated index
-    with open(
+    with open("faiss_index.pkl", "wb") as f:
         pickle.dump(faiss_index, f)
 
     return {"message": "Files processed successfully"}
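On persistence: FAISSIndex wraps a SWIG-backed faiss object, and depending on the faiss version pickling the whole instance may not round-trip cleanly. faiss ships its own serializer, which is the usual route. A hedged sketch (the file names and the sentences attribute come from the sketch above, not from the commit):

# Persist the raw faiss index with faiss's own serializer rather than pickle,
# and keep the sentence list in a separate pickle if needed.
faiss.write_index(faiss_index.index, "faiss_index.faiss")
with open("faiss_sentences.pkl", "wb") as f:
    pickle.dump(faiss_index.sentences, f)

# Reloading later:
index = faiss.read_index("faiss_index.faiss")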
@@ -97,7 +114,7 @@ def process_and_query(state, files, question):
     question_embedding = embedding_model.encode([question])
 
     # Search the FAISS index for similar passages
-    retrieved_results = faiss_index.similarity_search(
+    retrieved_results = faiss_index.similarity_search(question_embedding, k=5)  # Retrieve top 5 passages
     retrieved_passages = [result['text'] for result in retrieved_results]
 
     # Initialize RAG generator model
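For the query path: encode([question]) already returns a float32 array of shape (1, dim), which is what IndexFlatL2.search expects, so the new similarity_search call lines up shape-wise; whether result['text'] holds actual passages still depends on how rows are mapped back to sentences. A small usage sketch (the question is made up):

question_embedding = embedding_model.encode(["What does the uploaded document say about refunds?"])
results = faiss_index.similarity_search(question_embedding, k=5)
retrieved_passages = [r["text"] for r in results]   # real text only if the index stores sentences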
@@ -136,4 +153,3 @@ with gr.Blocks() as demo:
     query_button.click(fn=process_and_query, inputs=[query], outputs=query_output)
 
 demo.launch()
-