ajoy0071998 committed on
Commit
0e565ef
·
verified ·
1 Parent(s): cddf2e2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -0
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from sentence_transformers import SentenceTransformer
4
+ import faiss
5
+ import numpy as np
6
+ import subprocess
7
+ import json
8
+ from PyPDF2 import PdfReader
9
+ import re
10
+ import hashlib
11
+
12
# Mistral API key.
# SECURITY: a real key was previously hard-coded here and committed; a key
# published in source control must be treated as leaked and rotated.
# Read the key from the environment instead of embedding it in the file.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")

# Sentence-BERT encoder used for both document chunks and queries.
# "all-MiniLM-L6-v2" produces 384-dimensional embeddings (matches the
# default `dim` of create_faiss_index below).
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
17
+
18
# Build an empty FAISS index for the project's embeddings
def create_faiss_index(dim=384):
    """Return a new, empty FAISS index using exact L2 distance.

    dim: embedding dimensionality; 384 matches the all-MiniLM-L6-v2 model
    used elsewhere in this file.
    """
    index = faiss.IndexFlatL2(dim)
    return index
21
+
22
# In-memory cache for repeated queries, keyed by the SHA-256 hex digest of
# the question text (see the question-answer section below). Lives for the
# process lifetime only.
cache = {}
24
+
25
# Function to generate SBERT embeddings
def generate_sbert_embeddings(texts):
    """Encode a list of strings into SBERT embedding vectors."""
    return sbert_model.encode(texts)
28
+
29
# Function to query the Mistral chat-completions API
def query_mistral(prompt, MISTRAL_API_KEY):
    """Send `prompt` to the Mistral chat API and return the reply text.

    Returns the assistant message content on success, or a human-readable
    error string on failure (callers display whatever string comes back,
    so this function never raises for API/network problems).
    """
    # Use the stdlib HTTP client instead of shelling out to an external
    # `curl` binary: no system dependency, and we get a real timeout.
    import urllib.error
    import urllib.request

    payload = {
        "model": "mistral-large-latest",
        "messages": [{"role": "user", "content": prompt}],
    }
    request = urllib.request.Request(
        "https://api.mistral.ai/v1/chat/completions",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {MISTRAL_API_KEY}",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            response_json = json.loads(response.read().decode("utf-8"))
    except (urllib.error.URLError, OSError) as exc:
        return f"Error: {exc}"
    try:
        return response_json['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        return "Error parsing the LLM response."
51
+
52
# Function to split text into chunks based on sentences and word limits
def chunk_text_by_sentence(text, max_words=300):
    """Split `text` into chunks of whole sentences of at most ~max_words.

    Sentences are delimited by a period followed by whitespace. A single
    sentence longer than `max_words` becomes its own (oversized) chunk.
    Returns a list of non-empty chunk strings; [] for empty input.
    """
    sentences = re.split(r'(?<=\.)\s+', text)
    chunks, current_chunk, current_word_count = [], [], 0
    for sentence in sentences:
        if not sentence.strip():
            # Skip empty fragments (e.g. empty input text).
            continue
        word_count = len(sentence.split())
        if current_chunk and current_word_count + word_count > max_words:
            # Bug fix: only flush a *non-empty* chunk. The original appended
            # an empty string ("".join of []) whenever the very first
            # sentence already exceeded max_words.
            chunks.append(" ".join(current_chunk))
            current_chunk, current_word_count = [sentence], word_count
        else:
            current_chunk.append(sentence)
            current_word_count += word_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
67
+
68
# Initialize per-session project storage (st.session_state persists across
# Streamlit reruns within one browser session).
if "projects" not in st.session_state:
    st.session_state.projects = {}
71
+
72
# Sidebar: Project Management — create a named project workspace.
st.sidebar.header("Project Management")
project_name = st.sidebar.text_input("Enter Project Name:")
# The button must be rendered unconditionally (it was the left-hand operand
# of the original `and`), so call it before checking the name.
create_clicked = st.sidebar.button("Create Project")
if create_clicked and project_name:
    if project_name in st.session_state.projects:
        st.sidebar.warning("Project already exists.")
    else:
        new_project = {
            "path": f"projects/{project_name}",
            "texts": [],
            "embeddings": None,
            "index": create_faiss_index(),
        }
        st.session_state.projects[project_name] = new_project
        os.makedirs(new_project["path"], exist_ok=True)
        st.sidebar.success(f"Project '{project_name}' created!")
87
+
88
# List existing projects and let the user pick one to work on
projects = list(st.session_state.projects.keys())
selected_project = st.sidebar.selectbox("Select a Project", projects)

# Delete a project
if st.sidebar.button("Delete Selected Project") and selected_project:
    del st.session_state.projects[selected_project]
    st.sidebar.success(f"Project '{selected_project}' deleted!")
    # Bug fix: clear the local selection, otherwise the main section below
    # immediately looks up the just-deleted project on this same run and
    # raises KeyError.
    selected_project = None
96
+
97
# Main Section: per-project PDF ingestion and question answering
if selected_project:
    st.header(f"Manage Project: {selected_project}")

    project = st.session_state.projects[selected_project]
    project_path = project["path"]

    # File Upload for PDFs: extract text, chunk it, embed it, index it
    uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
    if uploaded_files:
        texts = []
        for uploaded_file in uploaded_files:
            pdf_reader = PdfReader(uploaded_file)
            # Pages with no extractable text (e.g. scanned images) are skipped.
            full_text = " ".join(page.extract_text() for page in pdf_reader.pages if page.extract_text())
            texts.extend(chunk_text_by_sentence(full_text))
        if texts:
            project["texts"].extend(texts)
            # Generate embeddings and store them in the project's FAISS index
            embeddings = generate_sbert_embeddings(texts)
            project["index"].add(np.array(embeddings).astype(np.float32))
            project["embeddings"] = embeddings
            st.success(f"Uploaded and processed {len(uploaded_files)} PDFs.")
        else:
            # Bug fix: the original unconditionally embedded/indexed `texts`,
            # calling index.add on an empty array when nothing extracted.
            st.warning("No extractable text found in the uploaded PDFs.")

    # Question-Answer Interface
    question = st.text_input("Enter your question:")
    if question:
        # Check cache for repeated queries (exact match on question text)
        question_hash = hashlib.sha256(question.encode()).hexdigest()
        if question_hash in cache:
            st.write("Cached Answer:")
            st.write(cache[question_hash])
        elif not project["texts"]:
            # Bug fix: searching an empty index yields -1 ids, which the
            # original then used to index project texts (texts[-1] / crash).
            st.warning("Please upload documents to this project before asking questions.")
        else:
            # Generate embedding for the question and retrieve top-5 chunks
            query_embedding = generate_sbert_embeddings([question])[0].reshape(1, -1).astype(np.float32)
            D, I = project["index"].search(query_embedding, k=5)
            # Bug fix: FAISS pads results with -1 when the index holds fewer
            # than k vectors; filter those out before indexing the text list.
            retrieved_texts = "\n".join(
                project["texts"][i] for i in I[0] if 0 <= i < len(project["texts"])
            )
            prompt = f" Only Based on the following information:\n{retrieved_texts}\nAnswer the question: {question}"
            mistral_answer = query_mistral(prompt, MISTRAL_API_KEY)
            cache[question_hash] = mistral_answer  # Cache the answer
            st.write("Answer:")
            st.write(mistral_answer)