Pratik Dwivedi committed
Commit e4b7b4c · 1 Parent(s): 5b0f27d

test new version

Files changed (4)
  1. .gitignore +2 -0
  2. app.py +52 -72
  3. app_OG.py +76 -0
  4. requirements.txt +9 -2
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ .gitattributes
app.py CHANGED
@@ -1,84 +1,64 @@
  import streamlit as st
- from llmware.prompts import Prompt
- import io, os, re
- import PyPDF2

- def register_gguf_model():
-     prompter = Prompt()
-     your_model_name = "llama"
-     hf_repo_name = "TheBloke/Llama-2-7B-Chat-GGUF"
-     model_file = "llama-2-7b-chat.Q3_K_M.gguf"
-     print("registering models")
-     prompter.model_catalog.register_gguf_model(your_model_name, hf_repo_name, model_file, prompt_wrapper="open_chat")
-     your_model_name = "open_gpt4"
-     hf_repo_name = "TheBloke/Open_Gpt4_8x7B-GGUF"
-     model_file = "open_gpt4_8x7b.Q3_K_M.gguf"
-     prompter.model_catalog.register_gguf_model(your_model_name, hf_repo_name, model_file, prompt_wrapper="open_chat")
-     your_model_name = "phi2"
-     hf_repo_name = "TheBloke/phi-2-GGUF"
-     model_file = "phi-2.Q3_K_M.gguf"
-     prompter.model_catalog.register_gguf_model(your_model_name, hf_repo_name, model_file, prompt_wrapper="open_chat")
-     your_model_name = "mistral"
-     hf_repo_name = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
-     model_file = "mistral-7b-instruct-v0.2.Q3_K_M.gguf"
-     prompter.model_catalog.register_gguf_model(your_model_name, hf_repo_name, model_file, prompt_wrapper="open_chat")
-     return prompter

- def main():
-     st.title("BetterZila RAG Enabled LLM")
-     with st.spinner("Registering Models for use..."):
-         prompter = register_gguf_model()

-     data_path = "data/"

-     st.sidebar.subheader("Select Model")
-     model_name = st.sidebar.selectbox("Select Model", ["llama", "open_gpt4", "phi2", "mistral"])
-     with st.spinner("Loading Model..."):
-         prompter.load_model(model_name)
-     st.success("Model Loaded!")

-     queries = ['Can you give me an example from history where the enemy was crushed totally from the book?', "What's the point of making myself less accessible?", "Can you tell me the story of Queen Elizabeth I from this 48 laws of power book?"]

-     st.subheader("Query")

-     with st.spinner("Loading PDF file..."):
-         for file in os.listdir(data_path):
-             if file.endswith(".pdf"):
-                 print("Found PDF file: ", file)
-                 pdf_file = file
-                 break
-         print("loading Source...")
-         source = prompter.add_source_document(data_path, pdf_file, query=None)

      for query in queries:
          st.subheader(f"Query: {query}")
-         with st.spinner("Generating response..."):
-             responses = prompter.prompt_with_source(query, prompt_name="just_the_facts", temperature=0.3)

-         for r, response in enumerate(responses):
-             st.write(query)
-             st.write(re.sub("[\n]", " ", response["llm_response"]).strip())

      st.success("Responses generated!")

-     # for query in queries:
-     #     st.subheader(f"Query: {query}")
-     #     with st.spinner("Generating response..."):
-     #         for file in os.listdir(data_path):
-     #             if file.endswith(".pdf"):
-     #                 print("Found PDF file: ", file)
-     #                 print("loading Source...")
-     #                 source = prompter.add_source_document(data_path, file, query=None)
-     #                 print("generating response...")
-     #                 responses = prompter.prompt_with_source(query, prompt_name="just_the_facts", temperature=0.3)
-     #                 print("response generated!")
-     #                 for r, response in enumerate(responses):
-     #                     print(query, ":", re.sub("[\n]", " ", response["llm_response"]).strip())
-     #             prompter.clear_source_materials()
-     #             st.write(query)
-     #             st.write(re.sub("[\n]", " ", response["llm_response"]).strip())
-     # st.success("Response generated!")

  if __name__ == "__main__":
-     main()
 
  import streamlit as st
+ # from dotenv import load_dotenv
+ from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader, PyPDFLoader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.llms import HuggingFaceHub
+ from langchain.memory import ConversationBufferMemory

+ # load_dotenv()

+ def make_vectorstore(embeddings):
+     # load every PDF found in the data folder of the base directory
+     loader = PyPDFDirectoryLoader("data")

+     # load the documents
+     documents = loader.load()

+     # split the documents into chunks of 1400 characters with 0 overlap
+     text_splitter = CharacterTextSplitter(chunk_size=1400, chunk_overlap=0)
+     texts = text_splitter.split_documents(documents)

+     # create a vector store from the chunks
+     docsearch = FAISS.from_documents(texts, embeddings)

+     return docsearch

+ def get_conversation(vectorstore):
+     # create a memory object to store the conversation history
+     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

+     # create a conversational retrieval chain
+     conversation_chain = ConversationalRetrievalChain.from_llm(
+         llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512}, huggingfacehub_api_token="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"),
+         chain_type="stuff",
+         retriever=vectorstore.as_retriever(),
+         memory=memory)

+     return conversation_chain

+ def get_response(conversation_chain, query):
+     # run the chain on the query; a single-output chain returns the answer string
+     response = conversation_chain.run(query)
+     return response

+ def main():
+     st.title("BetterZila RAG Enabled LLM")
+     # instructor embedding model, as in app_OG.py
+     embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+     vectorstore = make_vectorstore(embeddings)
+     conversation_chain = get_conversation(vectorstore)
+     queries = ["Can you give me an example from history where the enemy was crushed totally from the book?", "What's the point of making myself less accessible?", "Can you tell me the story of Queen Elizabeth I from this 48 laws of power book?"]
      for query in queries:
          st.subheader(f"Query: {query}")
+         response = get_response(conversation_chain, query)
+         st.write(query)
+         st.write(response)
      st.success("Responses generated!")

  if __name__ == "__main__":
+     main()
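A quick way to sanity-check the new LangChain pipeline before running the Streamlit app is to query the FAISS index directly. A minimal sketch, not part of the commit, assuming the PDFs sit in `data/` and the `InstructorEmbedding`/`sentence-transformers` packages from requirements.txt are installed:

```python
# Retrieval smoke test for the new app.py pipeline (illustrative sketch).
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
documents = PyPDFDirectoryLoader("data").load()
texts = CharacterTextSplitter(chunk_size=1400, chunk_overlap=0).split_documents(documents)
store = FAISS.from_documents(texts, embeddings)

# Inspect the chunks the retriever would hand to the LLM for one of the app's queries.
for doc in store.similarity_search("What's the point of making myself less accessible?", k=3):
    print(doc.page_content[:200])
    print("---")
```

If the printed chunks look unrelated to the query, the chunking parameters are the first thing to tune before changing the LLM or the chain.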
app_OG.py ADDED
@@ -0,0 +1,76 @@
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.llms import HuggingFaceHub

+ def get_pdf_text(pdf_docs):
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text += page.extract_text()
+     return text

+ def get_text_chunks(text):
+     text_splitter = CharacterTextSplitter(
+         separator="\n",
+         chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len
+     )
+     chunks = text_splitter.split_text(text)
+     return chunks

+ def get_vectorstore(text_chunks):
+     # embeddings = OpenAIEmbeddings()
+     embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+     return vectorstore

+ def get_conversation_chain(vectorstore):
+     # llm = ChatOpenAI()
+     llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})

+     memory = ConversationBufferMemory(
+         # memory_key='chat_history',
+         return_messages=True)
+     conversation_chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=vectorstore.as_retriever(),
+         # memory=memory
+     )
+     return conversation_chain

+ def main():
+     # if "conversation" not in st.session_state:
+     #     st.session_state.conversation = None
+     # if "chat_history" not in st.session_state:
+     #     st.session_state.chat_history = None

+     # st.header("Chat with multiple PDFs :books:")
+     user_question = input("Ask a question about your documents:")
+     if user_question:
+         print(user_question)

+     pdf_path = "data/2021-01-01-2021-01-31.pdf"
+     pdf_docs = [pdf_path]

+     raw_text = get_pdf_text(pdf_docs)
+     text_chunks = get_text_chunks(raw_text)
+     vectorstore = get_vectorstore(text_chunks)
+     conversation = get_conversation_chain(vectorstore)
+     print(conversation)

+ if __name__ == '__main__':
+     main()
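For reference, `get_text_chunks` splits on newlines into chunks of at most 1000 characters with a 200-character overlap, so consecutive chunks share text and a fact that straddles a chunk boundary is still retrievable. A standalone sketch of that behavior with made-up input:

```python
from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# Hypothetical stand-in for text extracted from a PDF.
sample = "\n".join(f"Line {i}: a sentence extracted from the report." for i in range(40))

chunks = splitter.split_text(sample)
print(len(chunks), "chunks of lengths", [len(c) for c in chunks])

if len(chunks) > 1:
    # the overlap keeps whole lines, so chunk 1 starts by repeating
    # lines from the end of chunk 0
    print(chunks[1][:80])
```

Because the splitter only breaks on whole separator-delimited pieces, the overlap is approximate: it carries over complete lines until roughly 200 characters are repeated.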
requirements.txt CHANGED
@@ -1,3 +1,10 @@
- llmware
+ transformers
+ langchain
+ langchain-community
+ InstructorEmbedding
  streamlit
- PyPDF2
+ PyPDF2
+ sentence-transformers
+ python-dotenv
+ pypdf
+ faiss-cpu