Almaatla committed
Commit a5d3f98 · verified · 1 Parent(s): fb42ef1

Update app.py

Files changed (1)
  1. app.py +17 -5
app.py CHANGED
@@ -1,5 +1,6 @@
 import langchain
-from langchain.embeddings import SentenceTransformerEmbeddings
+#from langchain.embeddings import SentenceTransformerEmbeddings
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.document_loaders import UnstructuredPDFLoader,UnstructuredWordDocumentLoader
 from langchain.indexes import VectorstoreIndexCreator
 from langchain.vectorstores import FAISS
@@ -19,7 +20,7 @@ import spaces
 #import faiss as FAISS
 
 from groq import Groq
-from sentence_transformers import SentenceTransformer
+#from sentence_transformers import SentenceTransformer
 
 tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
 
@@ -38,11 +39,22 @@ text_splitter = RecursiveCharacterTextSplitter(
     separators=["\n\n", "\n", " ", ""]
 )
 
-embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+#embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
 #embeddings = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2",
+                                   model_kwargs = {'device': 'cuda'},
+                                   encode_kwargs = {'normalize_embeddings': False})
+
 
 foo = Document(page_content='foo is fou!',metadata={"source":'foo source'})
 
+@spaces.GPU
+def similarity_search(db,query, k=5):
+    return db.similarity_search(query, k)
+
+@spaces.GPU
+def similarity_search_with_score(db,query,k=1):
+    return db.similarity_search_with_score(query,k)
 
 def reset_database(ui_session_id):
     session_id = f"PDFAISS-{ui_session_id}"
@@ -67,7 +79,7 @@ def is_duplicate(split_docs,db):
     print(f"DUPLICATE: Treating: {split_docs[0].metadata['source'].split('/')[-1]}")
     for i in range(min(3,len(split_docs))):
         query = split_docs[i].page_content
-        docs = db.similarity_search_with_score(query,k=1)
+        docs = similarity_search_with_score(db,query,k=1)
         _ , score = docs[0]
         epsilon += score
     print(f"DUPLICATE: epsilon: {epsilon}")
@@ -306,7 +318,7 @@ def ask_gpt(query, ui_session_id, history):
         print(f"SESSION: {session_id} database does not exist")
         return f"SESSION: {session_id} database does not exist","",""
 
-    docs = db.similarity_search(query, k=5)
+    docs = similarity_search(db,query, k=5)
 
     documents = "\n\n*-*-*-*-*-*\n\n".join(f"Content: {doc.page_content}\n" for doc in docs)
     system = f"# Instructions\nTake a deep breath and resonate step by step.\nYou are a helpful standard assistant. Your have only one mission and that consists in answering to the user input based on the **provided documents**. If the answer to the question that is asked by the user isn't contained in the **provided documents**, say so but **don't make up an answer**. I chose you because you can say 'I don't know' so please don't do like the other LLMs and don't define acronyms that aren\'t present in the following **PROVIDED DOCUMENTS** double check if it is present before answering. If some of the information can be useful for the user you can tell him.\nFinish your response by **ONE** follow up question that the provided documents could answer.\n\nThe documents are separated by the string \'*-*-*-*-*-*\'. Do not provide any explanations or details.\n\n# **Provided documents**: {documents}."
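In short, this commit replaces the removed SentenceTransformerEmbeddings / sentence_transformers usage with langchain_community.embeddings.HuggingFaceEmbeddings (loaded on 'cuda'), and routes the FAISS queries through small @spaces.GPU-decorated wrappers so the searches run on the Space's ZeroGPU worker. Below is a minimal sketch of how the new pieces fit together, assuming langchain_community, sentence-transformers and faiss-cpu are installed; the sample texts, the CPU device and the query are illustrative and not taken from the repository:

# Sketch under the assumptions stated above; in the Space itself the two search
# helpers are decorated with @spaces.GPU and the embedder uses device='cuda'.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},               # 'cuda' in the commit
    encode_kwargs={"normalize_embeddings": False},
)

# Illustrative index; the app builds its FAISS databases from uploaded documents.
db = FAISS.from_texts(["foo is fou!", "bar is elsewhere."], embeddings)

def similarity_search(db, query, k=5):
    return db.similarity_search(query, k)

def similarity_search_with_score(db, query, k=1):
    return db.similarity_search_with_score(query, k)

docs = similarity_search(db, "What is foo?", k=1)
print(docs[0].page_content)                       # -> 'foo is fou!'

scored = similarity_search_with_score(db, "What is foo?", k=1)
_, score = scored[0]                              # L2 distance; lower means closer
print(score)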