Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import streamlit as st
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
import faiss
|
5 |
+
import numpy as np
|
6 |
+
import subprocess
|
7 |
+
import json
|
8 |
+
from PyPDF2 import PdfReader
|
9 |
+
import re
|
10 |
+
import hashlib
|
11 |
+
|
12 |
+
# Mistral API key.
# Read from the environment so the credential is never committed to the
# repository. Falls back to an empty string when the variable is not set.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
|
14 |
+
|
15 |
+
# Initialize SBERT model.
# Streamlit re-executes this script on every user interaction; caching the
# loader as a resource keeps a single model instance alive instead of
# constructing it again on each rerun.
@st.cache_resource
def _load_sbert_model():
    """Return the shared SentenceTransformer used for all embeddings."""
    return SentenceTransformer("all-MiniLM-L6-v2")


sbert_model = _load_sbert_model()
|
17 |
+
|
18 |
+
# Initialize Faiss index
|
19 |
+
def create_faiss_index(dim=384):
    """Build and return an empty flat L2 Faiss index.

    Args:
        dim: Dimensionality of the vectors the index will hold.
            NOTE(review): the default presumably matches the embedding
            size of the SBERT model used in this file -- confirm.

    Returns:
        A new ``faiss.IndexFlatL2`` instance with no vectors added.
    """
    flat_index = faiss.IndexFlatL2(dim)
    return flat_index
|
21 |
+
|
22 |
+
# Cache for repeated queries.
# A plain module-level dict would be recreated empty each time Streamlit
# re-executes the script, so the cache could never produce a hit. Keeping
# the dict in session state lets it survive reruns within a session.
if "answer_cache" not in st.session_state:
    st.session_state.answer_cache = {}
cache = st.session_state.answer_cache
|
24 |
+
|
25 |
+
# Function to generate SBERT embeddings
|
26 |
+
def generate_sbert_embeddings(texts):
    """Encode *texts* with the module-level SBERT model.

    Args:
        texts: The input passed straight through to ``sbert_model.encode``.

    Returns:
        Whatever ``sbert_model.encode`` returns for *texts*.
    """
    encoded = sbert_model.encode(texts)
    return encoded
|
28 |
+
|
29 |
+
# Function to query Mistral API using curl
|
30 |
+
def query_mistral(prompt, MISTRAL_API_KEY, timeout=60):
    """Send *prompt* to the Mistral chat-completions endpoint via curl.

    Args:
        prompt: Text sent as the single user message.
        MISTRAL_API_KEY: Bearer token placed in the Authorization header.
        timeout: Maximum number of seconds curl may spend on the request.

    Returns:
        The message content of the first choice on success. On any
        failure a string starting with "Error" is returned instead of
        raising, so callers can display it directly.
    """
    if not MISTRAL_API_KEY:
        # Fail fast instead of sending an unauthenticated request.
        return "Error: Mistral API key is not configured."

    payload = {
        "model": "mistral-large-latest",
        "messages": [{"role": "user", "content": prompt}],
    }
    curl_command = [
        "curl",
        # Suppress the progress meter but keep real error messages, so
        # stderr only contains text worth showing to the user.
        "--silent", "--show-error",
        "--max-time", str(timeout),
        "--location", "https://api.mistral.ai/v1/chat/completions",
        "--header", "Content-Type: application/json",
        "--header", "Accept: application/json",
        "--header", f"Authorization: Bearer {MISTRAL_API_KEY}",
        # Read the body from stdin: avoids argument-length limits for
        # long prompts and sends the bytes unmodified.
        "--data-binary", "@-",
    ]
    try:
        response = subprocess.run(
            curl_command,
            input=json.dumps(payload),
            capture_output=True,
            text=True,
            # Slightly above curl's own limit so curl reports the timeout
            # itself whenever it can.
            timeout=timeout + 5,
        )
    except FileNotFoundError:
        return "Error: curl executable not found."
    except subprocess.TimeoutExpired:
        return "Error: request to Mistral API timed out."

    if response.returncode != 0:
        return f"Error: {response.stderr}"

    try:
        response_json = json.loads(response.stdout)
        return response_json['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError, json.JSONDecodeError):
        # curl exits 0 even for HTTP error statuses, so the body may be
        # an error document without the expected structure.
        return "Error parsing the LLM response."
|
51 |
+
|
52 |
+
# Function to split text into chunks based on sentences and word limits
|
53 |
+
def chunk_text_by_sentence(text, max_words=300):
    """Split *text* into chunks of whole sentences with a word budget.

    Sentences are detected by a period followed by whitespace. Sentences
    are accumulated into a chunk until adding the next one would exceed
    *max_words*; that sentence then starts a new chunk. A single sentence
    longer than *max_words* is kept intact as its own chunk.

    Args:
        text: The text to split.
        max_words: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of non-empty chunk strings; empty when *text* contains no
        words.
    """
    sentences = re.split(r'(?<=\.)\s+', text)
    chunks, current_chunk, current_word_count = [], [], 0
    for sentence in sentences:
        word_count = len(sentence.split())
        if word_count == 0:
            # Empty or whitespace-only fragments carry no content.
            continue
        if current_word_count + word_count > max_words:
            # Only flush a chunk that actually holds sentences; otherwise
            # an oversize first sentence would emit an empty chunk.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk, current_word_count = [sentence], word_count
        else:
            current_chunk.append(sentence)
            current_word_count += word_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
|
67 |
+
|
68 |
+
# Initialize project storage.
# Session state is used so the projects persist across script reruns.
if "projects" not in st.session_state:
    st.session_state.projects = {}

# Sidebar: Project Management
st.sidebar.header("Project Management")
# Strip surrounding whitespace so a blank-looking name is rejected below.
project_name = st.sidebar.text_input("Enter Project Name:").strip()
if st.sidebar.button("Create Project") and project_name:
    if project_name not in st.session_state.projects:
        # The name is user input: keep only filesystem-safe characters so
        # the directory cannot escape "projects/" (e.g. via "../").
        safe_dir_name = re.sub(r"[^A-Za-z0-9_-]", "_", project_name)
        st.session_state.projects[project_name] = {
            "path": f"projects/{safe_dir_name}",
            "texts": [],
            "embeddings": None,
            "index": create_faiss_index(),
        }
        os.makedirs(st.session_state.projects[project_name]["path"], exist_ok=True)
        st.sidebar.success(f"Project '{project_name}' created!")
    else:
        st.sidebar.warning("Project already exists.")
|
87 |
+
|
88 |
+
# List existing projects
projects = list(st.session_state.projects.keys())
selected_project = st.sidebar.selectbox("Select a Project", projects)

# Delete a project
if st.sidebar.button("Delete Selected Project") and selected_project:
    st.session_state.projects.pop(selected_project, None)
    st.sidebar.success(f"Project '{selected_project}' deleted!")
    # The project no longer exists in session state; clear the selection
    # so later code in this run does not look up the deleted entry.
    selected_project = None
|
96 |
+
|
97 |
+
# Main Section
if selected_project:
    st.header(f"Manage Project: {selected_project}")
    project = st.session_state.projects[selected_project]

    # File Upload for PDFs
    project_path = project["path"]
    uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
    if uploaded_files:
        # The uploader keeps returning the same files on every rerun, so
        # remember which ones were already indexed for this project.
        # NOTE(review): files are identified by (name, size); two different
        # files sharing both would be treated as one -- confirm acceptable.
        processed_files = project.setdefault("processed_files", set())
        new_files = [
            uploaded_file
            for uploaded_file in uploaded_files
            if (uploaded_file.name, uploaded_file.size) not in processed_files
        ]

        texts = []
        for uploaded_file in new_files:
            pdf_reader = PdfReader(uploaded_file)
            # Extract each page once and drop pages without text.
            page_texts = (page.extract_text() for page in pdf_reader.pages)
            full_text = " ".join(page_text for page_text in page_texts if page_text)
            chunks = chunk_text_by_sentence(full_text)
            texts.extend(chunk for chunk in chunks if chunk.strip())
            processed_files.add((uploaded_file.name, uploaded_file.size))

        if texts:
            # Generate embeddings and store in Faiss
            embeddings = np.asarray(generate_sbert_embeddings(texts), dtype=np.float32)
            project["index"].add(embeddings)
            # Extend texts only after the vectors are added so list
            # positions stay aligned with the index ids.
            project["texts"].extend(texts)
            previous_embeddings = project["embeddings"]
            if previous_embeddings is None:
                project["embeddings"] = embeddings
            else:
                project["embeddings"] = np.vstack([previous_embeddings, embeddings])
        if new_files:
            st.success(f"Uploaded and processed {len(new_files)} PDFs.")

    # Question-Answer Interface
    question = st.text_input("Enter your question:")
    if question:
        index = project["index"]
        # Check cache for repeated queries. The key includes the project
        # and its current number of chunks so answers are neither shared
        # between projects nor reused after more documents are added.
        cache_key = f"{selected_project}\x00{len(project['texts'])}\x00{question}"
        question_hash = hashlib.sha256(cache_key.encode()).hexdigest()
        if question_hash in cache:
            st.write("Cached Answer:")
            st.write(cache[question_hash])
        elif index.ntotal == 0:
            st.warning("Upload at least one PDF with extractable text before asking a question.")
        else:
            # Generate embedding for the question
            query_embedding = np.asarray(
                generate_sbert_embeddings([question]), dtype=np.float32
            ).reshape(1, -1)
            # Never request more neighbours than the index holds; missing
            # results come back as -1, which would silently index from the
            # end of the texts list.
            k = min(5, index.ntotal)
            _distances, neighbor_ids = index.search(query_embedding, k)
            retrieved_texts = "\n".join(
                project["texts"][i] for i in neighbor_ids[0] if i >= 0
            )
            prompt = f" Only Based on the following information:\n{retrieved_texts}\nAnswer the question: {question}"
            mistral_answer = query_mistral(prompt, MISTRAL_API_KEY)
            # Do not cache error strings so a transient failure can be retried.
            if not mistral_answer.startswith("Error"):
                cache[question_hash] = mistral_answer  # Cache the answer
            st.write("Answer:")
            st.write(mistral_answer)
|