Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import langchain
|
2 |
-
from langchain.embeddings import SentenceTransformerEmbeddings
|
|
|
3 |
from langchain_community.document_loaders import UnstructuredPDFLoader,UnstructuredWordDocumentLoader
|
4 |
from langchain.indexes import VectorstoreIndexCreator
|
5 |
from langchain.vectorstores import FAISS
|
@@ -19,7 +20,7 @@ import spaces
|
|
19 |
#import faiss as FAISS
|
20 |
|
21 |
from groq import Groq
|
22 |
-
from sentence_transformers import SentenceTransformer
|
23 |
|
24 |
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
25 |
|
@@ -38,11 +39,22 @@ text_splitter = RecursiveCharacterTextSplitter(
|
|
38 |
separators=["\n\n", "\n", " ", ""]
|
39 |
)
|
40 |
|
41 |
-
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
42 |
#embeddings = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
|
|
|
|
|
|
|
|
43 |
|
44 |
foo = Document(page_content='foo is fou!',metadata={"source":'foo source'})
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
def reset_database(ui_session_id):
|
48 |
session_id = f"PDFAISS-{ui_session_id}"
|
@@ -67,7 +79,7 @@ def is_duplicate(split_docs,db):
|
|
67 |
print(f"DUPLICATE: Treating: {split_docs[0].metadata['source'].split('/')[-1]}")
|
68 |
for i in range(min(3,len(split_docs))):
|
69 |
query = split_docs[i].page_content
|
70 |
-
docs =
|
71 |
_ , score = docs[0]
|
72 |
epsilon += score
|
73 |
print(f"DUPLICATE: epsilon: {epsilon}")
|
@@ -306,7 +318,7 @@ def ask_gpt(query, ui_session_id, history):
|
|
306 |
print(f"SESSION: {session_id} database does not exist")
|
307 |
return f"SESSION: {session_id} database does not exist","",""
|
308 |
|
309 |
-
docs =
|
310 |
|
311 |
documents = "\n\n*-*-*-*-*-*\n\n".join(f"Content: {doc.page_content}\n" for doc in docs)
|
312 |
system = f"# Instructions\nTake a deep breath and resonate step by step.\nYou are a helpful standard assistant. Your have only one mission and that consists in answering to the user input based on the **provided documents**. If the answer to the question that is asked by the user isn't contained in the **provided documents**, say so but **don't make up an answer**. I chose you because you can say 'I don't know' so please don't do like the other LLMs and don't define acronyms that aren\'t present in the following **PROVIDED DOCUMENTS** double check if it is present before answering. If some of the information can be useful for the user you can tell him.\nFinish your response by **ONE** follow up question that the provided documents could answer.\n\nThe documents are separated by the string \'*-*-*-*-*-*\'. Do not provide any explanations or details.\n\n# **Provided documents**: {documents}."
|
|
|
1 |
import langchain
|
2 |
+
#from langchain.embeddings import SentenceTransformerEmbeddings
|
3 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
4 |
from langchain_community.document_loaders import UnstructuredPDFLoader,UnstructuredWordDocumentLoader
|
5 |
from langchain.indexes import VectorstoreIndexCreator
|
6 |
from langchain.vectorstores import FAISS
|
|
|
20 |
#import faiss as FAISS
|
21 |
|
22 |
from groq import Groq
|
23 |
+
#from sentence_transformers import SentenceTransformer
|
24 |
|
25 |
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
26 |
|
|
|
39 |
separators=["\n\n", "\n", " ", ""]
|
40 |
)
|
41 |
|
42 |
+
#embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
43 |
#embeddings = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
44 |
+
# Shared embedding model for the app. This replaces the commented-out
# SentenceTransformerEmbeddings / SentenceTransformer variants above.
# NOTE(review): the device is hard-coded to CUDA — presumably this relies on
# the Space providing a GPU; confirm against the deployment hardware.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={'device': 'cuda'},
    # Vectors are passed through un-normalised.
    encode_kwargs={'normalize_embeddings': False},
)
|
47 |
+
|
48 |
|
49 |
foo = Document(page_content='foo is fou!',metadata={"source":'foo source'})
|
50 |
|
51 |
+
@spaces.GPU
def similarity_search(db, query, k=5):
    """Run ``db.similarity_search`` for *query* and return its result.

    The search is wrapped in a module-level function so that it can carry
    the ``@spaces.GPU`` decorator (presumably so the call runs with a GPU
    attached on the Space — confirm against the ``spaces`` package docs).

    Args:
        db: Object exposing ``similarity_search(query, k)``.
        query: Query passed through unchanged to ``db``.
        k: Second positional argument forwarded to ``db`` (defaults to 5).

    Returns:
        Whatever ``db.similarity_search(query, k)`` returns.
    """
    matches = db.similarity_search(query, k)
    return matches
|
54 |
+
|
55 |
+
@spaces.GPU
def similarity_search_with_score(db, query, k=1):
    """Run ``db.similarity_search_with_score`` for *query* and return its result.

    The search is wrapped in a module-level function so that it can carry
    the ``@spaces.GPU`` decorator (presumably so the call runs with a GPU
    attached on the Space — confirm against the ``spaces`` package docs).

    Args:
        db: Object exposing ``similarity_search_with_score(query, k)``.
        query: Query passed through unchanged to ``db``.
        k: Second positional argument forwarded to ``db`` (defaults to 1).

    Returns:
        Whatever ``db.similarity_search_with_score(query, k)`` returns; the
        duplicate check in this file unpacks the first element of the result
        as a ``(_, score)`` pair.
    """
    scored_matches = db.similarity_search_with_score(query, k)
    return scored_matches
|
58 |
|
59 |
def reset_database(ui_session_id):
|
60 |
session_id = f"PDFAISS-{ui_session_id}"
|
|
|
79 |
print(f"DUPLICATE: Treating: {split_docs[0].metadata['source'].split('/')[-1]}")
|
80 |
for i in range(min(3,len(split_docs))):
|
81 |
query = split_docs[i].page_content
|
82 |
+
docs = similarity_search_with_score(db,query,k=1)
|
83 |
_ , score = docs[0]
|
84 |
epsilon += score
|
85 |
print(f"DUPLICATE: epsilon: {epsilon}")
|
|
|
318 |
print(f"SESSION: {session_id} database does not exist")
|
319 |
return f"SESSION: {session_id} database does not exist","",""
|
320 |
|
321 |
+
docs = similarity_search(db,query, k=5)
|
322 |
|
323 |
documents = "\n\n*-*-*-*-*-*\n\n".join(f"Content: {doc.page_content}\n" for doc in docs)
|
324 |
system = f"# Instructions\nTake a deep breath and resonate step by step.\nYou are a helpful standard assistant. Your have only one mission and that consists in answering to the user input based on the **provided documents**. If the answer to the question that is asked by the user isn't contained in the **provided documents**, say so but **don't make up an answer**. I chose you because you can say 'I don't know' so please don't do like the other LLMs and don't define acronyms that aren\'t present in the following **PROVIDED DOCUMENTS** double check if it is present before answering. If some of the information can be useful for the user you can tell him.\nFinish your response by **ONE** follow up question that the provided documents could answer.\n\nThe documents are separated by the string \'*-*-*-*-*-*\'. Do not provide any explanations or details.\n\n# **Provided documents**: {documents}."
|