Ajeet001 committed on
Commit 2b0df9e · verified · 1 Parent(s): 5e15c1b

Upload 3 files

Files changed (3)
  1. app.py +89 -0
  2. interaction_log.json +15 -0
  3. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import streamlit as st
+ import yaml
+ from dotenv import load_dotenv
+ from src.document_loader import load_document
+ from src.chunking_embedding import setup_chunking_and_embedding
+ from src.vector_store import create_vectorstore
+ from langchain_groq import ChatGroq
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.memory import ConversationBufferWindowMemory
+ from src.utils import log_interaction
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ # Load configuration
+ config_path = os.path.join(os.getcwd(), "config", "config.yaml")  # Ensure correct path
+ with open(config_path, "r") as f:
+     config = yaml.safe_load(f)
+
+ # Get the Groq API key from environment variables (name matches the error message below)
+ groq_api_key = os.getenv("GROQ_API_KEY")
+ if not groq_api_key:
+     raise ValueError("GROQ_API_KEY not found. Please set it in the .env file.")
+
+ # Streamlit UI
+ st.set_page_config(page_title="Your Document AI Assistant", page_icon="📄", layout="centered")
+ st.title("🪅 Document AI Assistant!")
+
+ # Sidebar: document upload and pipeline settings
+ uploaded_file = st.sidebar.file_uploader(label="Upload your document (PDF/DOC/DOCX)", type=['pdf', 'doc', 'docx'])
+ loader_type = st.sidebar.selectbox('Choose a loader type', config['loaders'])
+ embedding_model = st.sidebar.selectbox('Choose an embedding model', config['embedding_models'])
+ chunking_strategy = st.sidebar.selectbox('Choose a chunking strategy', config['chunking_strategies'])
+ chunk_size = st.sidebar.number_input('Chunk Size', min_value=100, value=500, step=100)
+ chunk_overlap = st.sidebar.number_input('Chunk Overlap', min_value=0, value=100, step=100)
+ temperature = st.sidebar.slider('Temperature', min_value=0.0, max_value=1.0, value=0.0, step=0.1)
+ top_p = st.sidebar.slider('Top-p', min_value=0.0, max_value=1.0, value=0.9, step=0.1)
+
+ # Initialize session state for the conversation
+ if 'chat_history' not in st.session_state:
+     st.session_state.chat_history = []
+ if 'conversation_chain' not in st.session_state:
+     st.session_state.conversation_chain = None
+
+ # Process the uploaded file: persist it to a temp dir, then build the RAG pipeline
+ if uploaded_file is not None:
+     os.makedirs("temp", exist_ok=True)
+     file_path = os.path.join("temp", uploaded_file.name)
+     with open(file_path, "wb") as f:
+         f.write(uploaded_file.getbuffer())
+
+     documents = load_document(file_path, loader_type)
+     doc_chunks, embeddings = setup_chunking_and_embedding(documents, chunking_strategy, chunk_size, chunk_overlap, embedding_model)
+     vectorstore = create_vectorstore(doc_chunks, embeddings)
+
+     # Create the conversational retrieval chain
+     llm = ChatGroq(groq_api_key=groq_api_key, model_name='llama-3.3-70b-versatile', temperature=temperature)
+     retriever = vectorstore.as_retriever()
+     memory = ConversationBufferWindowMemory(k=5, memory_key="chat_history", return_messages=True)
+     st.session_state.conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)
+
+ # User input for asking a question
+ user_input = st.text_input("Ask a question:")
+ if st.button("Ask Question"):
+     if st.session_state.conversation_chain is None:
+         st.warning("Please upload a document before asking a question.")
+     elif user_input:
+         with st.chat_message("user"):
+             st.markdown(user_input)
+         st.session_state.chat_history.append({"role": "user", "content": user_input})
+
+         with st.chat_message("assistant"):
+             response = st.session_state.conversation_chain.invoke({"question": user_input})
+             assistant_response = response['answer']
+             st.markdown(assistant_response)
+         st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})
+
+         # Log the interaction along with the pipeline parameters used
+         params = {
+             "loader": loader_type,
+             "chunking_strategy": chunking_strategy,
+             "chunk_size": chunk_size,
+             "chunk_overlap": chunk_overlap,
+             "embedding_model": embedding_model,
+             "temperature": temperature,
+             "top_p": top_p
+         }
+         log_interaction(user_input, assistant_response, params)
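Note that app.py depends on a config/config.yaml (expected to define the loaders, embedding_models, and chunking_strategies lists shown in the sidebar) and on helper modules under src/, none of which are included in this commit. Below is a minimal sketch of what the three pipeline helpers might look like, assuming the langchain_community document loaders, the recursive text splitter, HuggingFace embeddings, and FAISS already listed in requirements.txt; only the function names and the "PyPDF"/"Recursive"/"HuggingFace" option values come from this repo, everything else is an assumption:

# Hypothetical sketch of src/document_loader.py, src/chunking_embedding.py,
# and src/vector_store.py (not the committed implementation).
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    PDFPlumberLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


def load_document(file_path, loader_type):
    # "PyPDF" is the only loader value visible in interaction_log.json;
    # the other keys are guesses based on requirements.txt (pymupdf, pdfplumber).
    loaders = {
        "PyPDF": PyPDFLoader,
        "PyMuPDF": PyMuPDFLoader,
        "PDFPlumber": PDFPlumberLoader,
    }
    return loaders[loader_type](file_path).load()


def setup_chunking_and_embedding(documents, chunking_strategy, chunk_size, chunk_overlap, embedding_model):
    # "Recursive" is the only strategy visible in the log, so it is used here
    # regardless of the chunking_strategy value.
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc_chunks = splitter.split_documents(documents)
    # "HuggingFace" is mapped to the default sentence-transformers model.
    embeddings = HuggingFaceEmbeddings()
    return doc_chunks, embeddings


def create_vectorstore(doc_chunks, embeddings):
    # FAISS is the likely backend given faiss-cpu in requirements.txt.
    return FAISS.from_documents(doc_chunks, embeddings)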
interaction_log.json ADDED
@@ -0,0 +1,15 @@
+ [
+     {
+         "question": "what is this document about?",
+         "response": "This document appears to be a sample PDF file containing a passage of Lorem Ipsum text, which is a placeholder text used to demonstrate the layout and formatting of a document. It doesn't seem to be about any specific topic or subject, but rather a filler text used for testing or demonstration purposes.",
+         "params": {
+             "loader": "PyPDF",
+             "chunking_strategy": "Recursive",
+             "chunk_size": 800,
+             "chunk_overlap": 100,
+             "embedding_model": "HuggingFace",
+             "temperature": 0.0,
+             "top_p": 0.9
+         }
+     }
+ ]
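src/utils.py is likewise absent from this commit, but the file above pins down its output format: a JSON array of question/response/params objects. A plausible sketch of log_interaction under that assumption (the log path and the read-append-rewrite approach are guesses):

# Hypothetical sketch of src/utils.py, inferred from interaction_log.json.
import json
import os

LOG_PATH = "interaction_log.json"


def log_interaction(question, response, params):
    # Load the existing log (if any), append the new record, rewrite the file.
    history = []
    if os.path.exists(LOG_PATH):
        with open(LOG_PATH, "r") as f:
            history = json.load(f)
    history.append({"question": question, "response": response, "params": params})
    with open(LOG_PATH, "w") as f:
        json.dump(history, f, indent=4)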
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ langchain
+ pypdf
+ python-dotenv
+ langchain_groq
+ langchain_community
+ streamlit
+ langchain_huggingface
+ faiss-cpu
+ pymupdf
+ pdfplumber
+ pdfminer
+ python-docx
+ python-doc
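With these dependencies installed (pip install -r requirements.txt) and GROQ_API_KEY set in a .env file, the app should start with: streamlit run app.py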