Spaces:
Pratik Dwivedi
committed
Commit · e4b7b4c
1 Parent(s): 5b0f27d
test new version

Browse files
- .gitignore +2 -0
- app.py +52 -72
- app_OG.py +76 -0
- requirements.txt +9 -2
.gitignore
ADDED
@@ -0,0 +1,2 @@
+.env
+.gitattributes
app.py
CHANGED
@@ -1,84 +1,64 @@
 import streamlit as st
-from
-import
-import

-

-
-
-
-model_file = "llama-2-7b-chat.Q3_K_M.gguf"
-print("registering models")
-prompter.model_catalog.register_gguf_model(your_model_name,hf_repo_name, model_file, prompt_wrapper="open_chat")
-your_model_name = "open_gpt4"
-hf_repo_name = "TheBloke/Open_Gpt4_8x7B-GGUF"
-model_file = "open_gpt4_8x7b.Q3_K_M.gguf"
-prompter.model_catalog.register_gguf_model(your_model_name,hf_repo_name, model_file, prompt_wrapper="open_chat")
-your_model_name = "phi2"
-hf_repo_name = "TheBloke/phi-2-GGUF"
-model_file = "phi-2.Q3_K_M.gguf"
-prompter.model_catalog.register_gguf_model(your_model_name,hf_repo_name, model_file, prompt_wrapper="open_chat")
-your_model_name = "mistral"
-hf_repo_name = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
-model_file = "mistral-7b-instruct-v0.2.Q3_K_M.gguf"
-prompter.model_catalog.register_gguf_model(your_model_name,hf_repo_name, model_file, prompt_wrapper="open_chat")
-return prompter

-
-
-
-
-
-
-
-
-
-
-
-st.success("Model Loaded!")

-

-

-
-
-
-
-pdf_file = file
-break
-print("loading Source...")
-source = prompter.add_source_document(data_path, pdf_file, query=None)

     for query in queries:
         st.subheader(f"Query: {query}")
-
-
-
-for r, response in enumerate(responses):
-st.write(query)
-st.write(re.sub("[\n]", " ", response["llm_response"]).strip())
-
     st.success("Responses generated!")
-
-# for query in queries:
-# st.subheader(f"Query: {query}")
-# with st.spinner("Generating response..."):
-# for file in os.listdir(data_path):
-# if file.endswith(".pdf"):
-# print("Found PDF file: ", file)
-# print("loading Source...")
-# source = prompter.add_source_document(data_path, file, query=None)
-# print("generating response...")
-# responses = prompter.prompt_with_source(query, prompt_name="just_the_facts", temperature=0.3)
-# print("response generated!")
-# for r, response in enumerate(responses):
-# print(query, ":", re.sub("[\n]"," ", response["llm_response"]).strip())
-# prompter.clear_source_materials()
-# st.write(query)
-# st.write(re.sub("[\n]"," ", response["llm_response"]).strip())
-# st.success("Response generated!")
-
 if __name__ == "__main__":
-    main()
 import streamlit as st
+# from dotenv import load_dotenv
+from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader, PyPDFLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceInstructEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain.chains import ConversationalRetrievalChain
+from langchain.llms import HuggingFaceHub
+from langchain.memory import ConversationBufferMemory

+# load_dotenv()

+def make_vectorstore(embeddings):
+    # use glob to find all the pdf files in the data folder in the base directory
+    loader = PyPDFDirectoryLoader("data")

+    # load the documents
+    documents = loader.load()
+
+    # split the documents into chunks of 1400 characters with 0 overlap
+    text_splitter = CharacterTextSplitter(chunk_size=1400, chunk_overlap=0)
+
+    # split the documents into chunks of 1400 characters with 0 overlap
+    texts = text_splitter.split_documents(documents)
+
+    # create a vector store from the documents
+    docsearch = FAISS.from_documents(texts, embeddings)

+    return docsearch
+
+def get_conversation(vectorstore):
+
+    # create a memory object to store the conversation history
+    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True,)
+
+    # create a conversational retrieval chain
+    conversation_chain = ConversationalRetrievalChain.from_chain_type(
+        llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512}, huggingfacehub_api_token="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"),
+        chain_type="stuff",
+        retriever=vectorstore.as_retriever(),
+        memory=memory)

+    return conversation_chain

+def get_response(conversation_chain, query):
+    # get the response
+    response = conversation_chain.run(query)
+    return response

+def main():
+    st.title("BetterZila RAG Enabled LLM")
+    embeddings = HuggingFaceInstructEmbeddings(repo_id="google/t5-v1_1-xl")
+    vectorstore = make_vectorstore(embeddings)
+    conversation_chain = get_conversation(vectorstore)
+    queries = ["Can you give me an example from history where the enemy was crushed totally from the book?", "What's the point of making myself less accessible?", "Can you tell me the story of Queen Elizabeth I from this 48 laws of power book?"]
     for query in queries:
         st.subheader(f"Query: {query}")
+        response = get_response(conversation_chain, query)
+        st.write(query)
+        st.write(response["llm_response"])
     st.success("Responses generated!")
+
 if __name__ == "__main__":
+    main()
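For comparison, a minimal sketch of the same chain wired through ConversationalRetrievalChain.from_llm, the constructor that app_OG.py below uses; the helper name build_chain and the hf_token parameter are illustrative assumptions, not part of this commit:

# Minimal sketch, not part of this commit: from_llm wiring (as in app_OG.py)
# with the same HuggingFaceHub settings as get_conversation above.
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
from langchain.memory import ConversationBufferMemory

def build_chain(vectorstore, hf_token):
    # hf_token would normally come from the environment rather than being hard-coded
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
        huggingfacehub_api_token=hf_token,
    )
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )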
app_OG.py
ADDED
@@ -0,0 +1,76 @@
+from PyPDF2 import PdfReader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.chat_models import ChatOpenAI
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+from langchain.llms import HuggingFaceHub
+
+def get_pdf_text(pdf_docs):
+    text = ""
+    for pdf in pdf_docs:
+        pdf_reader = PdfReader(pdf)
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+    return text
+
+
+def get_text_chunks(text):
+    text_splitter = CharacterTextSplitter(
+        separator="\n",
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len
+    )
+    chunks = text_splitter.split_text(text)
+    return chunks
+
+
+def get_vectorstore(text_chunks):
+    # embeddings = OpenAIEmbeddings()
+    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    return vectorstore
+
+
+def get_conversation_chain(vectorstore):
+    # llm = ChatOpenAI()
+    llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
+
+    memory = ConversationBufferMemory(
+        # memory_key='chat_history',
+        return_messages=True)
+    conversation_chain = ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        retriever=vectorstore.as_retriever(),
+        # memory=memory
+    )
+    return conversation_chain
+
+
+def main():
+
+    # if "conversation" not in st.session_state:
+    #     st.session_state.conversation = None
+    # if "chat_history" not in st.session_state:
+    #     st.session_state.chat_history = None
+
+    # st.header("Chat with multiple PDFs :books:")
+    user_question = input("Ask a question about your documents:")
+    if user_question:
+        print(user_question)
+
+    pdf_path = "data/2021-01-01-2021-01-31.pdf"
+    pdf_docs = [pdf_path]
+
+    raw_text = get_pdf_text(pdf_docs)
+    text_chunks = get_text_chunks(raw_text)
+    vectorstore = get_vectorstore(text_chunks)
+    conversation = get_conversation_chain(
+        vectorstore)
+    print(conversation)
+
+
+if __name__ == '__main__':
+    main()
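main() in app_OG.py stops at print(conversation); a minimal sketch of how the returned chain would typically be queried, assuming the standard question/chat_history call signature of ConversationalRetrievalChain (illustrative, not part of the commit):

# Sketch only: continues app_OG.py's main() after `conversation` is built.
# The chain is created with memory commented out, so chat history is passed explicitly.
result = conversation({"question": user_question, "chat_history": []})
print(result["answer"])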
requirements.txt
CHANGED
@@ -1,3 +1,10 @@
-
 streamlit
-PyPDF2
+transformers
+langchain
+langchain-community
+InstructorEmbedding
 streamlit
+PyPDF2
+sentence-transformers
+python-dotenv
+pypdf
+faiss-cpu