Spaces:
Sleeping
Sleeping
Changing the embedding model
Browse files
app.py
CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
|
|
2 |
import os
|
3 |
from langchain_community.document_loaders import PyMuPDFLoader
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
-
from langchain_openai import
|
6 |
from langchain_qdrant import QdrantVectorStore
|
7 |
from langchain.prompts import ChatPromptTemplate
|
8 |
from langchain_core.output_parsers import StrOutputParser
|
@@ -10,6 +10,7 @@ from langchain_core.runnables import RunnablePassthrough
|
|
10 |
from qdrant_client import QdrantClient
|
11 |
from qdrant_client.http.models import Distance, VectorParams
|
12 |
from operator import itemgetter
|
|
|
13 |
|
14 |
# Set up API keys
|
15 |
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
|
@@ -20,6 +21,16 @@ pdf_links = [
|
|
20 |
"https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
|
21 |
]
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
@st.cache_resource
|
24 |
def load_and_process_pdfs(pdf_links):
|
25 |
documents = []
|
@@ -40,7 +51,7 @@ def load_and_process_pdfs(pdf_links):
|
|
40 |
def setup_vectorstore():
|
41 |
LOCATION = ":memory:"
|
42 |
COLLECTION_NAME = "AI_Ethics_Framework"
|
43 |
-
VECTOR_SIZE =
|
44 |
|
45 |
qdrant_client = QdrantClient(location=LOCATION)
|
46 |
|
@@ -50,11 +61,12 @@ def setup_vectorstore():
|
|
50 |
vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
|
51 |
)
|
52 |
|
53 |
-
# Create the vector store
|
|
|
54 |
qdrant_vector_store = QdrantVectorStore(
|
55 |
client=qdrant_client,
|
56 |
collection_name=COLLECTION_NAME,
|
57 |
-
embedding=
|
58 |
)
|
59 |
|
60 |
# Load and add documents
|
@@ -69,7 +81,6 @@ def create_rag_pipeline(_vector_store):
|
|
69 |
|
70 |
template = """
|
71 |
You are an expert AI assistant with deep knowledge of business, technology, and entrepreneurship. Your task is to provide accurate, insightful answers based solely on the given context. Follow these guidelines:
|
72 |
-
|
73 |
1. Analyze the question carefully to understand the core information being sought.
|
74 |
2. Thoroughly examine the provided context, identifying key relevant information.
|
75 |
3. Formulate a clear, concise answer that directly addresses the question.
|
@@ -79,13 +90,10 @@ def create_rag_pipeline(_vector_store):
|
|
79 |
7. If asked for an opinion or recommendation, base it strictly on insights from the context.
|
80 |
8. Use a confident, authoritative tone while maintaining accuracy.
|
81 |
9. If you cannot provide a clear answer to the question, reply with "I don't know".
|
82 |
-
|
83 |
Question:
|
84 |
{question}
|
85 |
-
|
86 |
Context:
|
87 |
{context}
|
88 |
-
|
89 |
Answer:
|
90 |
"""
|
91 |
|
|
|
2 |
import os
|
3 |
from langchain_community.document_loaders import PyMuPDFLoader
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
+
from langchain_openai import ChatOpenAI
|
6 |
from langchain_qdrant import QdrantVectorStore
|
7 |
from langchain.prompts import ChatPromptTemplate
|
8 |
from langchain_core.output_parsers import StrOutputParser
|
|
|
10 |
from qdrant_client import QdrantClient
|
11 |
from qdrant_client.http.models import Distance, VectorParams
|
12 |
from operator import itemgetter
|
13 |
+
from sentence_transformers import SentenceTransformer
|
14 |
|
15 |
# Set up API keys
|
16 |
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
|
|
|
21 |
"https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
|
22 |
]
|
23 |
|
24 |
+
class CustomHuggingFaceEmbeddings:
    """Adapter that exposes a SentenceTransformer model as an embeddings object.

    Provides the two methods ``embed_documents`` and ``embed_query``; both
    delegate to ``self.model.encode`` and convert the result with
    ``.tolist()`` so callers receive plain Python lists.
    """

    def __init__(self, model_name: str):
        """Load the sentence-transformers model identified by *model_name*.

        Args:
            model_name: Name or path handed directly to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts) -> list:
        """Embed a batch of texts.

        Args:
            texts: The texts to embed. Any iterable of strings is accepted;
                it is materialized into a list before encoding. A bare
                ``str`` is forwarded to the model unchanged, as before.

        Returns:
            ``self.model.encode(...).tolist()`` for the batch, or ``[]`` when
            the batch is empty.
        """
        if isinstance(texts, str):
            # Keep the historical pass-through for a single string argument.
            return self.model.encode(texts).tolist()

        # Materialize once so one-shot iterables (e.g. generators) reach the
        # model as a concrete list.
        batch = list(texts)
        if not batch:
            # Nothing to embed: skip the model call entirely.
            return []
        return self.model.encode(batch).tolist()

    def embed_query(self, text: str) -> list:
        """Embed a single query string.

        Args:
            text: The query text.

        Returns:
            ``self.model.encode(text).tolist()``.
        """
        return self.model.encode(text).tolist()
|
33 |
+
|
34 |
@st.cache_resource
|
35 |
def load_and_process_pdfs(pdf_links):
|
36 |
documents = []
|
|
|
51 |
def setup_vectorstore():
|
52 |
LOCATION = ":memory:"
|
53 |
COLLECTION_NAME = "AI_Ethics_Framework"
|
54 |
+
VECTOR_SIZE = 768 # Adjust this if your model's output size is different
|
55 |
|
56 |
qdrant_client = QdrantClient(location=LOCATION)
|
57 |
|
|
|
61 |
vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
|
62 |
)
|
63 |
|
64 |
+
# Create the vector store with your custom embeddings
|
65 |
+
custom_embeddings = CustomHuggingFaceEmbeddings("Technocoloredgeek/midterm-finetuned-embedding")
|
66 |
qdrant_vector_store = QdrantVectorStore(
|
67 |
client=qdrant_client,
|
68 |
collection_name=COLLECTION_NAME,
|
69 |
+
embedding=custom_embeddings
|
70 |
)
|
71 |
|
72 |
# Load and add documents
|
|
|
81 |
|
82 |
template = """
|
83 |
You are an expert AI assistant with deep knowledge of business, technology, and entrepreneurship. Your task is to provide accurate, insightful answers based solely on the given context. Follow these guidelines:
|
|
|
84 |
1. Analyze the question carefully to understand the core information being sought.
|
85 |
2. Thoroughly examine the provided context, identifying key relevant information.
|
86 |
3. Formulate a clear, concise answer that directly addresses the question.
|
|
|
90 |
7. If asked for an opinion or recommendation, base it strictly on insights from the context.
|
91 |
8. Use a confident, authoritative tone while maintaining accuracy.
|
92 |
9. If you cannot provide a clear answer to the question, reply with "I don't know".
|
|
|
93 |
Question:
|
94 |
{question}
|
|
|
95 |
Context:
|
96 |
{context}
|
|
|
97 |
Answer:
|
98 |
"""
|
99 |
|