gmustafa413 committed on
Commit
03bf0d5
·
verified ·
1 Parent(s): a29fcf8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +230 -0
app.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from sentence_transformers import SentenceTransformer
4
+ import faiss
5
+ import numpy as np
6
+ import google.generativeai as genai
7
+ from datasets import load_dataset
8
+ from typing import List, Dict
9
+ from dotenv import load_dotenv
10
+
11
# Load environment variables
load_dotenv()

# Configuration
MODEL_NAME = "all-MiniLM-L6-v2"  # passed to SentenceTransformer below
GENAI_MODEL = "gemini-pro"  # NOTE(review): confirm this model id is still served by the Gemini API
# No trailing whitespace inside the literal: this value is shown in the UI and
# is parsed to derive the dataset repo id.
DATASET_LINK = "https://huggingface.co/datasets/midrees2806/7K_Dataset"  # Replace with your dataset link
CHUNK_SIZE = 500  # NOTE(review): not referenced anywhere in the visible code
TOP_K = 3  # number of nearest chunks requested from the index per query

# Initialize models
embedding_model = SentenceTransformer(MODEL_NAME)
23
+
24
class GroqRAGSystem:
    """Retrieval-augmented question answering over a Hugging Face dataset.

    Text chunks from the dataset are embedded with the module-level
    ``embedding_model`` and stored in a FAISS ``IndexFlatL2``.  Questions are
    answered by a Gemini model that is prompted with the nearest chunks only.

    Attributes are documented inline in ``__init__``.
    """

    # Environment variable names consulted, in order, for the Gemini API key.
    _API_KEY_ENV_VARS = ("GEMINI_API_KEY", "GOOGLE_API_KEY")

    # Path segments that follow a repo id in Hugging Face URLs and therefore
    # are never part of the dataset id itself.
    _RESERVED_URL_SEGMENTS = frozenset(
        {"tree", "blob", "viewer", "resolve", "discussions", "commits"}
    )

    def __init__(self):
        # FAISS index over the chunk embeddings; None until a dataset loads.
        self.index = None
        # Text chunks, positionally aligned with the vectors in ``self.index``.
        self.chunks = []
        # True only after a dataset has been fully embedded and indexed.
        self.dataset_loaded = False
        # The key is looked up by environment-variable NAME.  Never put the
        # key value itself in source: os.getenv() would treat it as a variable
        # name (and return None) while leaking the secret to anyone who can
        # read the repository.
        self.gemini_api_key = next(
            (value for value in map(os.getenv, self._API_KEY_ENV_VARS) if value),
            None,
        )
        if self.gemini_api_key:
            genai.configure(api_key=self.gemini_api_key)

    @staticmethod
    def _dataset_id_from_link(dataset_link: str) -> str:
        """Return the ``owner/name`` (or bare ``name``) repo id for a dataset link.

        Accepts a full ``.../datasets/<owner>/<name>`` URL (optionally followed
        by sub-pages such as ``/tree/main``) or an already-bare repo id.
        Surrounding whitespace, query strings and fragments are ignored.

        Raises:
            ValueError: if no repo id can be extracted.
        """
        path = dataset_link.strip().split("datasets/", 1)[-1]
        path = path.split("?", 1)[0].split("#", 1)[0]
        if "://" in path:
            # A URL that does not contain a "datasets/" component.
            raise ValueError("Invalid dataset URL format")

        repo_parts = []
        for segment in [part for part in path.split("/") if part][:2]:
            if segment in GroqRAGSystem._RESERVED_URL_SEGMENTS:
                break
            repo_parts.append(segment)
        if not repo_parts:
            raise ValueError("Invalid dataset URL format")
        return "/".join(repo_parts)

    @staticmethod
    def _extract_chunks(dataset) -> list:
        """Build the list of text chunks from whichever known columns exist.

        Column priority: ``context`` (de-duplicated), then ``text``, then
        ``question`` + ``answer`` pairs.

        Raises:
            ValueError: if none of the supported columns is present, or the
                chosen column yields no non-empty text.
        """
        features = dataset.features
        if 'context' in features:
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the chunk order (and thus the index) is reproducible.
            texts = dict.fromkeys(dataset['context'])
        elif 'text' in features:
            texts = dataset['text']
        elif 'question' in features and 'answer' in features:
            texts = [
                f"Q: {q}\nA: {a}"
                for q, a in zip(dataset['question'], dataset['answer'])
            ]
        else:
            raise ValueError("Unsupported dataset format")

        chunks = [str(text) for text in texts if text is not None and str(text).strip()]
        if not chunks:
            raise ValueError("Dataset contains no usable text")
        return chunks

    def load_dataset_from_link(self, dataset_link: str, progress=None) -> bool:
        """Load dataset from Hugging Face link and index it for retrieval.

        Args:
            dataset_link: Dataset URL or repo id.
            progress: Optional callable ``progress(fraction, desc=...)`` used
                to report progress, e.g. a ``gr.Progress`` instance that the
                calling event handler declared as a default argument.  When
                omitted, progress reporting is skipped.

        Returns:
            True on success.  On any failure a warning is shown, the
            previously loaded state is left untouched and False is returned.
        """
        # Imported locally under an alias so this method never resolves a
        # different module-level object that happens to be named load_dataset.
        from datasets import load_dataset as hf_load_dataset

        report = progress if progress is not None else (lambda *args, **kwargs: None)

        try:
            # Extract dataset name from URL
            dataset_name = self._dataset_id_from_link(dataset_link)

            report(0.1, desc="📦 Downloading dataset...")
            dataset = hf_load_dataset(dataset_name, split='train')

            report(0.5, desc="🔨 Processing dataset...")
            chunks = self._extract_chunks(dataset)

            report(0.7, desc="🧠 Creating embeddings...")
            embeddings = embedding_model.encode(chunks, show_progress_bar=False)
            index = faiss.IndexFlatL2(embeddings.shape[1])
            index.add(embeddings.astype('float32'))
        except Exception as e:
            # UI boundary: surface the problem as a toast and let the caller
            # decide what to display, instead of aborting the event.
            gr.Warning(f"Failed to load dataset: {str(e)}")
            return False

        # Commit only once everything succeeded so chunks and index can never
        # get out of sync after a failed reload.
        self.chunks = chunks
        self.index = index
        self.dataset_loaded = True
        report(1.0, desc="✅ Dataset loaded successfully!")
        return True

    def get_relevant_context(self, query: str) -> str:
        """Retrieve the most relevant chunks for ``query`` with their distances.

        Returns:
            The matches joined into one string, nearest first, or an empty
            string when nothing is indexed.
        """
        if self.index is None or not self.chunks:
            return ""

        query_embed = embedding_model.encode([query])
        distances, indices = self.index.search(
            query_embed.astype('float32'),
            k=min(TOP_K, len(self.chunks)),
        )

        context = []
        for distance, idx in zip(distances[0], indices[0]):
            # FAISS pads missing results with -1, which would otherwise index
            # the last chunk.
            if not 0 <= idx < len(self.chunks):
                continue
            # The index is an L2 index, so the value is a distance (smaller is
            # better), not a similarity score.
            context.append(
                f"🔍 Match {len(context) + 1} "
                f"(Distance: {distance:.2f}, lower is closer):\n{self.chunks[idx]}\n"
            )
        return "\n".join(context)

    def generate_response(self, query: str) -> str:
        """Generate response using only dataset context.

        Returns a user-facing message instead of raising when the dataset is
        not loaded, the API key is missing, or the model call fails.
        """
        if not self.dataset_loaded:
            return "⚠️ Please load the dataset first"
        if not self.gemini_api_key:
            return "🔑 Please set your Gemini API key in environment variables"

        context = self.get_relevant_context(query)

        prompt = f"""You are an expert AI assistant that answers STRICTLY based on the provided context.
Follow these rules:
1. Answer concisely using ONLY the context below
2. If the answer isn't in the context, say "I couldn't find this in the dataset"
3. Never make up information
4. For ambiguous questions, ask for clarification

Context:
{context}

Question: {query}

Answer:"""

        try:
            model = genai.GenerativeModel(GENAI_MODEL)
            response = model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"⚠️ Error generating response: {str(e)}"
107
+
108
# Initialize the RAG system
rag_system = GroqRAGSystem()

# Custom CSS for modern UI
css = """
.gradio-container {
    max-width: 900px !important;
    margin: auto !important;
    font-family: 'Inter', sans-serif;
}
.dark .gradio-container {
    background-color: #1e1e2e;
}
.message-user {
    background: #3b82f6;
    color: white;
    border-radius: 18px 18px 0 18px;
    padding: 12px;
    margin: 8px 0;
    max-width: 80%;
    margin-left: auto;
}
.message-bot {
    background: #f3f4f6;
    color: #111827;
    border-radius: 18px 18px 18px 0;
    padding: 12px;
    margin: 8px 0;
    max-width: 80%;
}
.dark .message-bot {
    background: #2d3748;
    color: #f7fafc;
}
.progress-bar {
    height: 6px !important;
}
"""

# Chat interface
with gr.Blocks(css=css, theme=gr.themes.Default()) as app:
    # Store chat history (list of (user, bot) message pairs)
    chat_history = gr.State([])

    gr.Markdown("UE-ChatBot")
    gr.Markdown(f"**Dataset:** {DATASET_LINK}")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## ⚙️ Configuration")
            dataset_url = gr.Textbox(
                label="Hugging Face Dataset URL",
                value=DATASET_LINK,
                interactive=True
            )
            load_btn = gr.Button("🚀 Load Dataset", variant="primary")
            status = gr.Markdown("ℹ️ Please load the dataset first")

        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="Chat History",
                bubble_full_width=False,
                avatar_images=(
                    "https://avatars.githubusercontent.com/u/1561194?v=4",  # User avatar
                    "https://huggingface.co/spaces/groq/Groq-LLM/resolve/main/groq_logo.png"  # Bot avatar
                )
            )
            query = gr.Textbox(
                label="Type your question...",
                placeholder="Ask about the dataset content",
                autofocus=True
            )
            submit_btn = gr.Button("📤 Submit", variant="primary")
            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

    # Event handlers
    # NOTE: this handler must not be called `load_dataset`. The `with` body
    # runs at module level, so that name would replace the function imported
    # from `datasets` that the RAG system calls at load time.
    def handle_load_dataset(dataset_url):
        """Load and index the dataset, returning the status line to display."""
        if rag_system.load_dataset_from_link(dataset_url):
            return "✅ Dataset loaded successfully!"
        return "❌ Failed to load dataset"

    def respond(query, history):
        """Answer `query` and return (chatbot value, stored history, cleared textbox)."""
        history = list(history or [])  # copy: never mutate the stored state in place
        if not query.strip():
            return history, history, ""

        response = rag_system.generate_response(query)
        history.append((query, response))
        return history, history, ""

    def clear_chat():
        """Reset both the visible chat and the stored history."""
        return [], []

    # Connect components
    load_btn.click(
        handle_load_dataset,
        inputs=dataset_url,
        outputs=status
    )

    # The stored history is written back on every turn so the chatbot and the
    # state cannot drift apart.
    submit_btn.click(
        respond,
        inputs=[query, chat_history],
        outputs=[chatbot, chat_history, query]
    )

    query.submit(
        respond,
        inputs=[query, chat_history],
        outputs=[chatbot, chat_history, query]
    )

    # Clearing must reset the state as well; otherwise the next message would
    # bring every earlier turn back.
    clear_btn.click(
        clear_chat,
        inputs=None,
        outputs=[chatbot, chat_history]
    )

# For Hugging Face Spaces
if __name__ == "__main__":
    app.launch(debug=True)