Mattral committed on
Commit 6e04d14 · verified · 1 Parent(s): a431be9

Create app.py

Files changed (1)
  1. app.py +87 -0
app.py ADDED
@@ -0,0 +1,87 @@
+ import streamlit as st
+ import os
+ from huggingface_hub import InferenceClient
+ from langchain_community.document_loaders import PDFPlumberLoader
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_core.vectorstores import InMemoryVectorStore
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_community.embeddings import HuggingFaceEmbeddings  # embedding backend for the vector store
+
+ # Set up the Hugging Face model and token
+ model = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # swap in any instruct model of your choice from Hugging Face
+ access_token = os.getenv("HF_TOKEN")  # your Hugging Face API token
+ client = InferenceClient(model=model, token=access_token)
+
+ # Prompt template for response generation
+ template = """
+ You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
+ Question: {question}
+ Context: {context}
+ Answer:
+ """
+
+ # Directory to store uploaded PDFs
+ pdfs_directory = '../pdfs'
+ os.makedirs(pdfs_directory, exist_ok=True)
+
+ # Initialize the vector store for document indexing.
+ # InMemoryVectorStore requires an embedding model; a small
+ # sentence-transformers model is assumed here (any LangChain
+ # Embeddings implementation will do).
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+ vector_store = InMemoryVectorStore(embedding=embeddings)
+
+ # Save an uploaded PDF file to disk
+ def upload_pdf(file):
+     # os.path.join avoids the missing path separator that plain
+     # string concatenation would produce
+     with open(os.path.join(pdfs_directory, file.name), "wb") as f:
+         f.write(file.getbuffer())
+
+ # Load PDF content into LangChain documents
+ def load_pdf(file_path):
+     loader = PDFPlumberLoader(file_path)
+     documents = loader.load()
+     return documents
+
+ # Split text into manageable, overlapping chunks
+ def split_text(documents):
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200,
+         add_start_index=True
+     )
+     return text_splitter.split_documents(documents)
+
+ # Index document chunks in the vector store
+ def index_docs(documents):
+     vector_store.add_documents(documents)
+
+ # Retrieve the documents most relevant to a query
+ def retrieve_docs(query):
+     return vector_store.similarity_search(query)
+
+ # Generate an answer based on the retrieved documents
+ def answer_question(question, documents):
+     context = "\n\n".join(doc.page_content for doc in documents)
+     prompt = ChatPromptTemplate.from_template(template)
+     # InferenceClient is not a LangChain Runnable, so it cannot be piped
+     # onto the prompt with |; format the prompt and call the client's
+     # text-generation endpoint directly instead
+     return client.text_generation(
+         prompt.format(question=question, context=context),
+         max_new_tokens=256,
+     )
+
+ # Streamlit file uploader for a single PDF
+ uploaded_file = st.file_uploader(
+     "Upload PDF",
+     type="pdf",
+     accept_multiple_files=False
+ )
+
+ if uploaded_file:
+     # Save, load, split, and index the document
+     upload_pdf(uploaded_file)
+     documents = load_pdf(os.path.join(pdfs_directory, uploaded_file.name))
+     chunked_documents = split_text(documents)
+     index_docs(chunked_documents)
+
+     # User input for a question
+     question = st.chat_input()
+
+     if question:
+         st.chat_message("user").write(question)
+         related_documents = retrieve_docs(question)
+         answer = answer_question(question, related_documents)
+         st.chat_message("assistant").write(answer)