Update app.py
Browse files
app.py
CHANGED
@@ -3,6 +3,7 @@ import PyPDF2
|
|
3 |
import openai
|
4 |
import faiss
|
5 |
import os
|
|
|
6 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
from sklearn.metrics.pairwise import cosine_similarity
|
8 |
|
@@ -21,7 +22,7 @@ def get_embeddings(text, model="text-embedding-ada-002"):
|
|
21 |
|
22 |
# Function to search for similar content
|
23 |
# search_similar -- pre-change side of this diff; the "-" gutter mark below
# flags the index.search line that the commit replaces.
# Calls index.search with the query embedding wrapped in a plain Python list
# plus top_k, then pairs stored_texts[i] with distances[0][position] for each
# i in indices[0] and returns that list of (text, distance) tuples.
# NOTE(review): index is presumably a FAISS index (faiss is imported by this
# file) -- confirm. The "|" and "NN |" lines are diff-gutter residue, not code.
def search_similar(query_embedding, index, stored_texts, top_k=3):
|
24 |
-
distances, indices = index.search([query_embedding], top_k)
|
25 |
results = [(stored_texts[i], distances[0][idx]) for idx, i in enumerate(indices[0])]
|
26 |
return results
|
27 |
|
@@ -55,9 +56,12 @@ if openai_api_key:
|
|
55 |
# Generate embeddings for all chunks
|
56 |
embeddings = [get_embeddings(chunk) for chunk in chunks]
|
57 |
|
|
|
|
|
|
|
58 |
# Create a FAISS index for similarity search
|
59 |
-
index = faiss.IndexFlatL2(len(
|
60 |
-
index.add(
|
61 |
|
62 |
st.write("Course materials have been processed and indexed.")
|
63 |
|
|
|
3 |
import openai
|
4 |
import faiss
|
5 |
import os
|
6 |
+
import numpy as np
|
7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
8 |
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
|
|
|
22 |
|
23 |
# Function to search for similar content
|
24 |
def search_similar(query_embedding, index, stored_texts, top_k=3):
    """Return the stored texts closest to ``query_embedding``.

    Args:
        query_embedding: One embedding vector (a flat sequence of floats).
        index: Object exposing ``search(queries, k)`` that returns a
            ``(distances, indices)`` pair with one row per query, as a FAISS
            index does.
        stored_texts: Texts positioned so that ``stored_texts[i]`` belongs to
            vector ``i`` of the index.
        top_k: Maximum number of neighbours to request.

    Returns:
        A list of ``(text, distance)`` tuples in the order the index returned
        them. It holds fewer than ``top_k`` entries when the index could not
        supply that many neighbours.
    """
    # FAISS operates on float32 matrices; a list of Python floats would
    # otherwise become float64. The outer list makes a single-row batch.
    query = np.array([query_embedding], dtype="float32")
    distances, indices = index.search(query, top_k)
    # FAISS pads missing neighbours with label -1; indexing stored_texts with
    # -1 would silently return the last text, so those slots are dropped.
    return [
        (stored_texts[i], distances[0][rank])
        for rank, i in enumerate(indices[0])
        if i >= 0
    ]
|
28 |
|
|
|
56 |
# Generate embeddings for all chunks
|
57 |
embeddings = [get_embeddings(chunk) for chunk in chunks]
|
58 |
|
59 |
+
# Convert the list of embeddings into a NumPy array (shape: [num_chunks, embedding_size])
|
60 |
+
embeddings_np = np.array(embeddings).astype("float32")
|
61 |
+
|
62 |
# Create a FAISS index for similarity search
|
63 |
+
index = faiss.IndexFlatL2(len(embeddings_np[0])) # Use the length of the embedding vectors for the dimension
|
64 |
+
index.add(embeddings_np)
|
65 |
|
66 |
st.write("Course materials have been processed and indexed.")
|
67 |
|