Wedyan2023 commited on
Commit
1e59223
·
verified ·
1 Parent(s): ba411cc

Create app104.py

Browse files
Files changed (1) hide show
  1. app104.py +1396 -0
app104.py ADDED
@@ -0,0 +1,1396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import os
4
+ import json
5
+ import base64
6
+ import random
7
+ from streamlit_pdf_viewer import pdf_viewer
8
+ from langchain.prompts import PromptTemplate
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from openai import OpenAI
12
+ from dotenv import load_dotenv
13
+ import warnings
14
+
15
# Silence every warning category for the whole process so library warnings
# do not clutter the app output.
warnings.filterwarnings('ignore')

# NOTE(review): the value read here is discarded, so this statement has no
# effect on the app — confirm whether it was meant to be stored or checked.
os.getenv("OAUTH_CLIENT_ID")


# Load environment variables and initialize the OpenAI client to use Hugging Face Inference API.
load_dotenv()
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1",
    api_key=os.environ.get('RAM')  # Hugging Face API token
)

# Create necessary directories.
# exist_ok=True replaces the former "check, then create" sequence, which could
# fail if the directory appeared between the check and the creation.
for dir_name in ['data', 'feedback']:
    os.makedirs(dir_name, exist_ok=True)
31
+
32
+ # Custom CSS
33
# Stylesheet injected once per script run; kept in a named constant so the
# markup is separated from the call that renders it.
_CUSTOM_CSS = """
<style>
.stButton > button {
width: 100%;
margin-bottom: 10px;
background-color: #4CAF50;
color: white;
border: none;
padding: 10px;
border-radius: 5px;
}
.task-button {
background-color: #2196F3 !important;
}
.stSelectbox {
margin-bottom: 20px;
}
.output-container {
padding: 20px;
border-radius: 5px;
border: 1px solid #ddd;
margin: 10px 0;
}
.status-container {
padding: 10px;
border-radius: 5px;
margin: 10px 0;
}
.sidebar-info {
padding: 10px;
background-color: #f0f2f6;
border-radius: 5px;
margin: 10px 0;
}
.feedback-button {
background-color: #ff9800 !important;
}
.feedback-container {
padding: 15px;
background-color: #f5f5f5;
border-radius: 5px;
margin: 15px 0;
}
</style>
"""

# unsafe_allow_html is required for the <style> element to be emitted as HTML.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
78
+
79
+ # Helper functions
80
def read_csv_with_encoding(file):
    """Read a CSV into a DataFrame, trying several text encodings in turn.

    Args:
        file: A filesystem path or a file-like object accepted by
            ``pandas.read_csv``.

    Returns:
        The DataFrame parsed with the first encoding that decodes the data
        without a ``UnicodeDecodeError``.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the data.
    """
    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']

    # Remember where a file-like object starts so that every attempt reads the
    # same bytes; a failed attempt leaves the stream part-way consumed.
    start_position = None
    if hasattr(file, 'seek') and hasattr(file, 'tell'):
        try:
            start_position = file.tell()
        except (OSError, ValueError):
            # Stream cannot report its position; fall back to reading as-is.
            start_position = None

    last_error = None
    for encoding in encodings:
        if start_position is not None:
            file.seek(start_position)
        try:
            return pd.read_csv(file, encoding=encoding)
        except UnicodeDecodeError as exc:
            last_error = exc

    # UnicodeDecodeError takes (encoding, object, start, end, reason); calling
    # it with a lone message raises TypeError instead of the intended error.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        "Failed to read file with any supported encoding",
    ) from last_error
88
+
89
+ #def save_feedback(feedback_data):
90
+ #feedback_file = 'feedback/user_feedback.csv'
91
+ #feedback_df = pd.DataFrame([feedback_data])
92
+
93
+ #if os.path.exists(feedback_file):
94
+ #feedback_df.to_csv(feedback_file, mode='a', header=False, index=False)
95
+ #else:
96
+ #feedback_df.to_csv(feedback_file, index=False)
97
+
98
def reset_conversation():
    """Empty the stored chat history and forget the chosen task.

    Both ``conversation`` and ``messages`` in the session state are replaced
    with fresh empty lists, and ``task_choice`` is removed when present.
    Always returns ``None``.
    """
    state = st.session_state
    for history_key in ("conversation", "messages"):
        state[history_key] = []
    if 'task_choice' in state:
        del state['task_choice']
    return None
104
+ #new 24 March
105
+ #user_input = st.text_input("Enter your prompt:")
106
+ ###########33
107
+
108
+ # Initialize session state variables
109
# Initialize session state variables.
# Each entry is (key, factory); the factory is called only when the key is
# missing, so every session gets its own fresh list / empty string.
_session_defaults = (
    ("messages", list),
    ("examples_to_classify", list),
    ("system_role", str),
)
for _state_key, _make_default in _session_defaults:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()



# Main app title
st.title("🤖🦙 Text Data Labeling and Generation App")
120
+ # def embed_pdf_sidebar(pdf_path):
121
+ # with open(pdf_path, "rb") as f:
122
+ # base64_pdf = base64.b64encode(f.read()).decode('utf-8')
123
+ # pdf_display = f"""
124
+ # <iframe src="data:application/pdf;base64,{base64_pdf}"
125
+ # width="100%" height="400" type="application/pdf"></iframe>
126
+ # """
127
+ # st.markdown(pdf_display, unsafe_allow_html=True)
128
+ #
129
+
130
+
131
+ # Sidebar settings
132
# NOTE(review): the original indentation was not available; the widgets below
# are assumed to live inside the sidebar — confirm against the deployed layout.
with st.sidebar:
    st.title("⚙️ Settings")

    st.markdown("### 📘Data Generation and Labeling Instructions")

    # Read the PDF up front and tolerate its absence: an unguarded open()
    # would abort the whole script run when the file is missing.
    try:
        with open("User instructions.pdf", "rb") as f:
            instructions_pdf = f.read()
    except OSError:
        instructions_pdf = None

    if instructions_pdf is None:
        st.warning("Instructions PDF is currently unavailable.")
    else:
        st.download_button(
            label="📄 Download Instructions PDF",
            data=instructions_pdf,
            file_name="User instructions.pdf",
            mime="application/pdf"
        )

    # Model used for every chat-completion request made by the app.
    selected_model = st.selectbox(
        "Select Model",
        ["meta-llama/Llama-3.3-70B-Instruct", "meta-llama/Llama-3.2-3B-Instruct","meta-llama/Llama-4-Scout-17B-16E-Instruct", "meta-llama/Meta-Llama-3-8B-Instruct",
         "meta-llama/Llama-3.1-70B-Instruct"],
        key='model_select'
    )

    # Sampling temperature: range 0.0-1.0, default 0.7.
    temperature = st.slider(
        "Temperature",
        0.0, 1.0, 0.7,
        help="Controls randomness in generation"
    )

    # reset_conversation runs as the click callback.
    st.button("🔄 New Conversation", on_click=reset_conversation)
    with st.container():
        st.markdown(f"""
<div class="sidebar-info">
<h4>Current Model: {selected_model}</h4>
<p><em>Note: Generated content may be inaccurate or false. Check important info.</em></p>
</div>
""", unsafe_allow_html=True)

# Link to the external feedback form, rendered as a full-width button.
feedback_url = "https://docs.google.com/forms/d/e/1FAIpQLSdZ_5mwW-pjqXHgxR0xriyVeRhqdQKgb5c-foXlYAV55Rilsg/viewform?usp=header"
st.sidebar.markdown(
    f'<a href="{feedback_url}" target="_blank"><button style="width: 100%;">Feedback Form</button></a>',
    unsafe_allow_html=True
)
176
+
177
+ # Display conversation
178
# Replay every stored message in a chat bubble matching its role.
for message in st.session_state.messages:
    bubble = st.chat_message(message["role"])
    with bubble:
        st.markdown(message["content"])

# Main content
# Until a task has been chosen, offer the two entry points side by side.
if 'task_choice' not in st.session_state:
    col1, col2 = st.columns(2)
    # (column, caption, widget key, tooltip, value stored as task_choice)
    task_buttons = (
        (col1, "📝 Data Generation", "gen_button", "Generate new data", "Data Generation"),
        (col2, "🏷️ Data Labeling", "label_button", "Label existing data", "Data Labeling"),
    )
    for column, caption, widget_key, tooltip, task_name in task_buttons:
        with column:
            clicked = st.button(caption, key=widget_key, help=tooltip)
            if clicked:
                st.session_state.task_choice = task_name
191
+
192
+ if "task_choice" in st.session_state:
193
+ if st.session_state.task_choice == "Data Generation":
194
+ st.header("📝 Data Generation")
195
+
196
+ # 1. Domain selection
197
+ domain_selection = st.selectbox("Domain", [
198
+ "Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"
199
+ ])
200
+
201
+ # 2. Handle custom domain input
202
+ custom_domain_valid = True # Assume valid until proven otherwise
203
+
204
+ if domain_selection == "Custom":
205
+ domain = st.text_input("Specify custom domain")
206
+ if not domain.strip():
207
+ st.error("Please specify a domain name.")
208
+ custom_domain_valid = False
209
+ else:
210
+ domain = domain_selection
211
+
212
+ # Classification type selection
213
+ classification_type = st.selectbox(
214
+ "Classification Type",
215
+ ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification"]
216
+ )
217
+ # Labels setup based on classification type
218
+ #labels = []
219
+ labels = []
220
+ labels_valid = False
221
+ errors = []
222
+
223
def validate_binary_labels(labels):
    """Check the two class names entered for binary classification.

    Args:
        labels: Sequence whose first two items are the class names as typed.
            Missing or ``None`` entries are treated as empty names.

    Returns:
        A list of human-readable error messages; empty when both names are
        present and differ (ignoring case and surrounding whitespace).
    """
    # Pad to two entries so a short sequence yields "required" errors
    # instead of raising IndexError.
    first, second = (
        (label or "").strip() for label in (list(labels) + ["", ""])[:2]
    )

    errors = []
    if not first:
        errors.append("First class name is required.")
    if not second:
        errors.append("Second class name is required.")
    # Only compare when both names are filled in; casefold() gives a caseless
    # comparison that also covers non-ASCII letters.
    if first and second and first.casefold() == second.casefold():
        errors.append("Class names must be different.")
    return errors
234
+
235
+ if classification_type == "Sentiment Analysis":
236
+ st.write("### Sentiment Analysis Labels (Fixed)")
237
+ col1, col2, col3 = st.columns(3)
238
+ with col1:
239
+ st.text_input("First class", "Positive", disabled=True)
240
+ with col2:
241
+ st.text_input("Second class", "Negative", disabled=True)
242
+ with col3:
243
+ st.text_input("Third class", "Neutral", disabled=True)
244
+ labels = ["Positive", "Negative", "Neutral"]
245
+
246
+ elif classification_type == "Binary Classification":
247
+ st.write("### Binary Classification Labels")
248
+ col1, col2 = st.columns(2)
249
+ with col1:
250
+ label_1 = st.text_input("First class", "Positive")
251
+ with col2:
252
+ label_2 = st.text_input("Second class", "Negative")
253
+
254
+ labels = [label_1, label_2]
255
+ errors = validate_binary_labels(labels)
256
+
257
+ if errors:
258
+ st.error("\n".join(errors))
259
+ else:
260
+ st.success("Binary class names are valid and unique!")
261
+
262
+
263
+ elif classification_type == "Multi-Class Classification":
264
+ st.write("### Multi-Class Classification Labels")
265
+
266
+ default_labels_by_domain = {
267
+ "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
268
+ "AG News": ["World", "Sports", "Business", "Sci/Tech"],
269
+ "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
270
+ "Food & Dining", "Local Experience", "Adventure Activities",
271
+ "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
272
+ "Luxury Tourism"],
273
+ "Restaurant reviews": ["Italian", "French", "American"],
274
+ "E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining",
275
+ "Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion",
276
+ "Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets",
277
+ "Books & Stationery","Toys & Games", "Sports & Fitness",
278
+ "Grocery & Gourmet Food","Watches & Accessories", "Baby Products"]
279
+ }
280
+
281
+ num_classes = st.slider("Number of classes", 3, 15, 3)
282
+
283
+ # Get defaults for selected domain, or empty list
284
+ defaults = default_labels_by_domain.get(domain, [])
285
+
286
+ labels = []
287
+ errors = []
288
+ cols = st.columns(3)
289
+
290
+ for i in range(num_classes):
291
+ with cols[i % 3]:
292
+ default_value = defaults[i] if i < len(defaults) else ""
293
+ label_input = st.text_input(f"Class {i+1}", default_value)
294
+ normalized_label = label_input.strip().title()
295
+
296
+ if not normalized_label:
297
+ errors.append(f"Class {i+1} name is required.")
298
+ else:
299
+ labels.append(normalized_label)
300
+
301
+ # Check for duplicates (case-insensitive)
302
+ if len(labels) != len(set(labels)):
303
+ errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).")
304
+
305
+ # Show validation results
306
+ if errors:
307
+ for error in errors:
308
+ st.error(error)
309
+ else:
310
+ st.success("All Labels names are valid and unique!")
311
+ labels_valid = not errors # Will be True only if there are no label errors
312
+
313
+ ##############
314
+ #new 22/4/2025
315
+ # add additional attributes
316
+ add_attributes = st.checkbox("Add additional attributes (optional)")
317
+ additional_attributes = []
318
+
319
+ if add_attributes:
320
+ num_attributes = st.slider("Number of attributes to add", 1, 5, 1)
321
+ for i in range(num_attributes):
322
+ st.markdown(f"#### Attribute {i+1}")
323
+ attr_name = st.text_input(f"Name of attribute {i+1}", key=f"attr_name_{i}")
324
+ attr_topics = st.text_input(f"Topics (comma-separated) for {attr_name}", key=f"attr_topics_{i}")
325
+ if attr_name and attr_topics:
326
+ topics_list = [topic.strip() for topic in attr_topics.split(",") if topic.strip()]
327
+ additional_attributes.append({"attribute": attr_name, "topics": topics_list})
328
+
329
+ ################
330
+
331
+ # Generation parameters
332
+ col1, col2 = st.columns(2)
333
+ with col1:
334
+ min_words = st.number_input("Min words", 1, 100, 20)
335
+ with col2:
336
+ max_words = st.number_input("Max words", min_words, 100, 50)
337
+
338
+ # Few-shot examples
339
+ use_few_shot = st.toggle("Use few-shot examples")
340
+ few_shot_examples = []
341
+ if use_few_shot:
342
+ num_examples = st.slider("Number of few-shot examples", 1, 10, 1)
343
+ for i in range(num_examples):
344
+ with st.expander(f"Example {i+1}"):
345
+ content = st.text_area(f"Content", key=f"few_shot_content_{i}")
346
+ label = st.selectbox(f"Label", labels, key=f"few_shot_label_{i}")
347
+ if content and label:
348
+ few_shot_examples.append({"content": content, "label": label})
349
+
350
+ num_to_generate = st.number_input("Number of examples", 1, 200, 10)
351
+ #sytem role after
352
+ # System role customization
353
+ #default_system_role = f"You are a professional {classification_type} expert, your role is to generate text examples for {domain} domain. Always generate unique diverse examples and do not repeat the generated data. The generated text should be between {min_words} to {max_words} words long."
354
+ # System role customization
355
+ default_system_role = (
356
+ f"You are a seasoned expert in {classification_type}, specializing in the {domain} domain. "
357
+ f" Your primary responsibility is to generate high-quality, diverse, and unique text examples "
358
+ f"tailored to this domain. Please ensure that each example adheres to the specified length "
359
+ f"requirements, ranging from {min_words} to {max_words} words, and avoid any repetition in the generated content."
360
+ )
361
+ system_role = st.text_area("Modify System Role (optional)",
362
+ value=default_system_role,
363
+ key="system_role_input")
364
+ st.session_state['system_role'] = system_role if system_role else default_system_role
365
+ # Labels initialization
366
+ #labels = []
367
+
368
+
369
+ user_prompt = st.text_area("User Prompt (optional)")
370
+
371
+ # Updated prompt template including system role
372
+ prompt_template = PromptTemplate(
373
+ input_variables=["system_role", "classification_type", "domain", "num_examples",
374
+ "min_words", "max_words", "labels", "user_prompt", "few_shot_examples", "additional_attributes"],
375
+ template=(
376
+ "{system_role}\n"
377
+ "- Use the following parameters:\n"
378
+ "- Generate {num_examples} examples\n"
379
+ "- Each example should be between {min_words} to {max_words} words long\n"
380
+ "- Use these labels: {labels}.\n"
381
+ "- Use the following additional attributes:\n"
382
+ "- {additional_attributes}\n"
383
+ "- Generate the examples in this format: 'Example text. Label: label'\n"
384
+ "- Do not include word counts or any additional information\n"
385
+ "- Always use your creativity and intelligence to generate unique and diverse text data\n"
386
+ "- In sentiment analysis, ensure that the sentiment classification is clearly identified as Positive, Negative, or Neutral. Do not leave the sentiment ambiguous.\n"
387
+ "- In binary sentiment analysis, classify text strictly as either Positive or Negative. Do not include or imply Neutral as an option.\n"
388
+ "- Write unique examples every time.\n"
389
+ "- DO NOT REPEAT your gnerated text. \n"
390
+ "- For each Output, describe it once and move to the next.\n"
391
+ "- List each Output only once, and avoid repeating details.\n"
392
+ "- Additional instructions: {user_prompt}\n\n"
393
+ "- Use the following examples as a reference in the generation process\n\n {few_shot_examples}. \n"
394
+ "- Think step by step, generate numbered examples, and check each newly generated example to ensure it has not been generated before. If it has, modify it"
395
+
396
+ )
397
+ )
398
+ # template=(
399
+ # "{system_role}\n"
400
+ # "- Use the following parameters:\n"
401
+ # "- Generate {num_examples} examples\n"
402
+ # "- Each example should be between {min_words} to {max_words} words long\n"
403
+ # "- Use these labels: {labels}.\n"
404
+ # "- Use the following additional attributes:\n"
405
+ # "{additional_attributes}\n"
406
+ # #"- Format each example like this: 'Example text. Label: [label]. Attribute1: [topic1]. Attribute2: [topic2]'\n"
407
+ # "- Generate the examples in this format: 'Example text. Label: label'\n"
408
+ # "- Additional instructions: {user_prompt}\n"
409
+ # "- Use these few-shot examples if provided:\n{few_shot_examples}\n"
410
+ # "- Think step by step and ensure examples are unique and not repeated."
411
+ # )
412
+ # )
413
+ ##########new 22/4/2025
414
+ formatted_attributes = "\n".join([
415
+ f"- {attr['attribute']}: {', '.join(attr['topics'])}" for attr in additional_attributes
416
+ ])
417
+ #######################
418
+
419
+ # Generate system prompt
420
+ system_prompt = prompt_template.format(
421
+ system_role=st.session_state['system_role'],
422
+ classification_type=classification_type,
423
+ domain=domain,
424
+ num_examples=num_to_generate,
425
+ min_words=min_words,
426
+ max_words=max_words,
427
+ labels=", ".join(labels),
428
+ user_prompt=user_prompt,
429
+ few_shot_examples="\n".join([f"{ex['content']}\nLabel: {ex['label']}" for ex in few_shot_examples]) if few_shot_examples else "",
430
+ additional_attributes=formatted_attributes
431
+ )
432
+
433
+
434
+ # Store system prompt in session state
435
+ st.session_state['system_prompt'] = system_prompt
436
+
437
+ # Display system prompt
438
+ st.write("System Prompt:")
439
+ st.text_area("Current System Prompt", value=st.session_state['system_prompt'],
440
+ height=400, disabled=True)
441
+
442
+
443
# Handle a click on "Generate Examples": validate the form, request the
# examples from the selected model, then parse them and offer downloads.
if st.button("🎯 Generate Examples"):
    # Gather every blocking problem first. Generation must not start while the
    # form is invalid; previously the warnings were shown but the request was
    # sent anyway.
    problems = []
    if domain_selection == "Custom" and not domain.strip():
        problems.append("Custom domain name is required.")
    if errors:
        # `errors` holds the label problems already reported next to the
        # label inputs (empty or duplicate class names).
        problems.append("Please fix the label errors before generating examples.")
    elif len(labels) != len(set(labels)):
        problems.append("Class names must be unique.")
    elif any(not lbl.strip() for lbl in labels):
        problems.append("All class labels must be filled in.")

    for problem in problems:
        st.warning(problem)

    if not problems:
        with st.spinner("Generating examples..."):
            try:
                stream = client.chat.completions.create(
                    model=selected_model,
                    messages=[{"role": "system", "content": st.session_state['system_prompt']}],
                    temperature=temperature,
                    stream=True,
                    max_tokens=80000,
                    top_p=0.9,
                )
                # Record the prompt before streaming so the history shows it
                # even when the stream fails part-way.
                st.session_state.messages.append({"role": "user", "content": system_prompt})
                response = st.write_stream(stream)
                st.session_state.messages.append({"role": "assistant", "content": response})

                # Always keep the latest response; storing it only when the
                # key was missing left the first response in place forever.
                st.session_state.response = response

                # Make sure the result slots exist even if nothing is parsed.
                for state_key, default in (
                    ("generated_examples", []),
                    ("generated_examples_csv", None),
                    ("generated_examples_json", None),
                ):
                    if state_key not in st.session_state:
                        st.session_state[state_key] = default

                # Parse response and generate examples list: each useful line
                # looks like "<text> Label: <label>"; split on the last
                # "Label:" so the text itself may contain that word.
                examples_list = []
                for line in response.split('\n'):
                    if not line.strip():
                        continue
                    parts = line.rsplit('Label:', 1)
                    if len(parts) != 2:
                        continue
                    text = parts[0].strip()
                    label = parts[1].strip()
                    if text and label:
                        examples_list.append({
                            'text': text,
                            'label': label,
                            'system_prompt': st.session_state.system_prompt,
                            'system_role': st.session_state.system_role,
                            'task_type': 'Data Generation',
                            'Use few-shot example?': 'Yes' if use_few_shot else 'No',
                        })

                if examples_list:
                    # Update session state with new data
                    st.session_state.generated_examples = examples_list

                    # Generate CSV and JSON data
                    df = pd.DataFrame(examples_list)
                    st.session_state.generated_examples_csv = df.to_csv(index=False).encode('utf-8')
                    st.session_state.generated_examples_json = json.dumps(examples_list, indent=2).encode('utf-8')

                    # NOTE(review): these buttons are rendered only during the
                    # run triggered by "Generate Examples"; confirm whether
                    # they should persist across later reruns.
                    st.download_button(
                        "📥 Download Generated Examples (CSV)",
                        st.session_state.generated_examples_csv,
                        "generated_examples.csv",
                        "text/csv",
                        key='download-csv-persistent'
                    )

                    # Separator between the two download options.
                    st.markdown("""
<div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
""", unsafe_allow_html=True)

                    st.download_button(
                        "📥 Download Generated Examples (JSON)",
                        st.session_state.generated_examples_json,
                        "generated_examples.json",
                        "application/json",
                        key='download-json-persistent'
                    )
                else:
                    # Tell the user why no download is offered instead of
                    # silently showing nothing.
                    st.info("No examples in the 'Example text. Label: label' format were found in the response.")

                # Clicking this button reruns the script, and on that rerun the
                # outer "Generate Examples" button reads False, so no code
                # placed under it here can execute. The former body (which
                # referenced an undefined `follow_up` and the removed
                # st.experimental_rerun()) is therefore dropped.
                st.button("Continue")

            except Exception as e:
                # Top-level boundary for the request: surface the failure in
                # the UI rather than crashing the script run.
                st.error("An error occurred during generation.")
                st.error(f"Details: {e}")
574
+
575
+
576
+ # Labeling process
577
+ elif st.session_state.task_choice == "Data Labeling":
578
+ st.header("🏷️ Data Labeling")
579
+
580
+ domain_selection = st.selectbox("Domain", ["Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"])
581
+ # 2. Handle custom domain input
582
+ custom_domain_valid = True # Assume valid until proven otherwise
583
+
584
+ if domain_selection == "Custom":
585
+ domain = st.text_input("Specify custom domain")
586
+ if not domain.strip():
587
+ st.error("Please specify a domain name.")
588
+ custom_domain_valid = False
589
+ else:
590
+ domain = domain_selection
591
+
592
+
593
+ # Classification type selection
594
+ classification_type = st.selectbox(
595
+ "Classification Type",
596
+ ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification", "Named Entity Recognition (NER)"]
597
+ )
598
+ #NNew edit
599
+ # Labels setup based on classification type
600
+ labels = []
601
+ labels_valid = False
602
+ errors = []
603
+
604
+ if classification_type == "Sentiment Analysis":
605
+ st.write("### Sentiment Analysis Labels (Fixed)")
606
+ col1, col2, col3 = st.columns(3)
607
+ with col1:
608
+ label_1 = st.text_input("First class", "Positive", disabled=True)
609
+ with col2:
610
+ label_2 = st.text_input("Second class", "Negative", disabled=True)
611
+ with col3:
612
+ label_3 = st.text_input("Third class", "Neutral", disabled=True)
613
+ labels = ["Positive", "Negative", "Neutral"]
614
+
615
+
616
+ elif classification_type == "Binary Classification":
617
+ st.write("### Binary Classification Labels")
618
+ col1, col2 = st.columns(2)
619
+
620
+ with col1:
621
+ label_1 = st.text_input("First class", "Positive")
622
+ with col2:
623
+ label_2 = st.text_input("Second class", "Negative")
624
+
625
+ errors = []
626
+ labels = [label_1.strip(), label_2.strip()]
627
+
628
+
629
+ # Strip and lower-case labels for validation
630
+ label_1 = labels[0].strip()
631
+ label_2 = labels[1].strip()
632
+
633
+ # Check for empty class names
634
+ if not label_1:
635
+ errors.append("First class name is required.")
636
+ if not label_2:
637
+ errors.append("Second class name is required.")
638
+
639
+ # Check for duplicates (case insensitive)
640
+ if label_1.lower() == label_2.lower() and label_1 and label_2:
641
+ errors.append("Class names must be different.")
642
+
643
+ # Show errors or success
644
+ if errors:
645
+ for error in errors:
646
+ st.error(error)
647
+ else:
648
+ st.success("Binary class names are valid and unique!")
649
+
650
+
651
+ elif classification_type == "Multi-Class Classification":
652
+ st.write("### Multi-Class Classification Labels")
653
+
654
+ default_labels_by_domain = {
655
+ "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
656
+ "AG News": ["World", "Sports", "Business", "Sci/Tech"],
657
+ "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
658
+ "Food & Dining", "Local Experience", "Adventure Activities",
659
+ "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
660
+ "Luxury Tourism"],
661
+ "Restaurant reviews": ["Italian", "French", "American"],
662
+ "E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining",
663
+ "Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion",
664
+ "Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets",
665
+ "Books & Stationery","Toys & Games", "Sports & Fitness",
666
+ "Grocery & Gourmet Food","Watches & Accessories", "Baby Products"]
667
+ }
668
+
669
+
670
+
671
+ # Ask user how many classes they want to define
672
+ num_classes = st.slider("Select the number of classes (labels)", min_value=3, max_value=10, value=3)
673
+
674
+ # Use default labels based on selected domain, if available
675
+ defaults = default_labels_by_domain.get(domain, [])
676
+
677
+ labels = []
678
+ errors = []
679
+ cols = st.columns(3) # For nicely arranged label inputs
680
+
681
+ for i in range(num_classes):
682
+ with cols[i % 3]: # Distribute inputs across columns
683
+ default_value = defaults[i] if i < len(defaults) else ""
684
+ label_input = st.text_input(f"Label {i + 1}", default_value)
685
+ normalized_label = label_input.strip().title()
686
+
687
+ if not normalized_label:
688
+ errors.append(f"Label {i + 1} is required.")
689
+ else:
690
+ labels.append(normalized_label)
691
+
692
+ # Check for duplicates (case-insensitive)
693
+ normalized_set = {label.lower() for label in labels}
694
+ if len(labels) != len(normalized_set):
695
+ errors.append("Label names must be unique (case-insensitive).")
696
+
697
+ # Show validation results
698
+ if errors:
699
+ for error in errors:
700
+ st.error(error)
701
+ else:
702
+ st.success("All label names are valid and unique!")
703
+
704
+ labels_valid = not errors # True if no validation errors
705
+
706
+ elif classification_type == "Named Entity Recognition (NER)":
707
+ # # NER entity options
708
+ # ner_entities = [
709
+ # "PERSON - Names of people, fictional characters, historical figures",
710
+ # "ORG - Companies, institutions, agencies, teams",
711
+ # "LOC - Physical locations (mountains, oceans, etc.)",
712
+ # "GPE - Countries, cities, states, political regions",
713
+ # "DATE - Calendar dates, years, centuries",
714
+ # "TIME - Times, durations",
715
+ # "MONEY - Monetary values with currency"
716
+ # ]
717
+ # selected_entities = st.multiselect(
718
+ # "Select entities to recognize",
719
+ # ner_entities,
720
+ # default=["PERSON - Names of people, fictional characters, historical figures",
721
+ # "ORG - Companies, institutions, agencies, teams",
722
+ # "LOC - Physical locations (mountains, oceans, etc.)",
723
+ # "GPE - Countries, cities, states, political regions",
724
+ # "DATE - Calendar dates, years, centuries",
725
+ # "TIME - Times, durations",
726
+ # "MONEY - Monetary values with currency"],
727
+ # key="ner_entity_selection"
728
+ # )
729
# Built-in NER few-shot demonstrations (added 22/4/2025). Each entry pairs a
# sentence with its expected annotations written as "TYPE: entity text".
use_few_shot = True
few_shot_examples = [
    {"content": "Mount Everest is the tallest mountain in the world.", "label": "LOC: Mount Everest"},
    {"content": "The President of the United States visited Paris last summer.", "label": "GPE: United States, GPE: Paris"},
    {"content": "Amazon is expanding its offices in Berlin.", "label": "ORG: Amazon, GPE: Berlin"},
    {"content": "J.K. Rowling wrote the Harry Potter books.", "label": "PERSON: J.K. Rowling"},
    {"content": "Apple was founded in California in 1976.", "label": "ORG: Apple, GPE: California, DATE: 1976"},
    {"content": "The Nile is the longest river in Africa.", "label": "LOC: Nile, GPE: Africa"},
    {"content": "He arrived at 3 PM for the meeting.", "label": "TIME: 3 PM"},
    {"content": "She bought the dress for $200.", "label": "MONEY: $200"},
    {"content": "The event is scheduled for July 4th.", "label": "DATE: July 4th"},
    {"content": "The World Health Organization is headquartered in Geneva.", "label": "ORG: World Health Organization, GPE: Geneva"},
]

st.write("### Named Entity Recognition (NER) Entities")

# Standard entity types, each written as "TYPE - human readable description".
ner_entities = [
    "PERSON - Names of people, fictional characters, historical figures",
    "ORG - Companies, institutions, agencies, teams",
    "LOC - Physical locations (mountains, oceans, etc.)",
    "GPE - Countries, cities, states, political regions",
    "DATE - Calendar dates, years, centuries",
    "TIME - Times, durations",
    "MONEY - Monetary values with currency",
]

# Optional user-defined entity types; a row is kept only once both its type
# and its description have been filled in.
custom_ner_entities = []
if st.checkbox("Add custom NER entities?"):
    num_custom_ner = st.slider("Number of custom NER entities", 1, 10, 1)
    for i in range(num_custom_ner):
        st.markdown(f"#### Custom Entity {i+1}")
        custom_type = st.text_input(f"Entity type {i+1}", key=f"custom_ner_type_{i}")
        custom_description = st.text_input(f"Description for {custom_type}", key=f"custom_ner_desc_{i}")
        if not (custom_type and custom_description):
            continue
        custom_ner_entities.append(f"{custom_type.upper()} - {custom_description}")

# Offer built-in and custom types together; the built-ins start selected.
all_ner_options = [*ner_entities, *custom_ner_entities]
selected_entities = st.multiselect(
    "Select entities to recognize",
    all_ner_options,
    default=ner_entities,
)

# The label is the part of each option that precedes the " - " separator.
labels = [option.partition(" - ")[0].strip() for option in selected_entities]

if not labels:
    # Fall back to a single type so the prompt never has an empty label list.
    st.warning("Please select at least one entity type.")
    labels = ["PERSON"]
786
+
787
+ ##########
788
+
789
+ # # Extract just the entity type (before the dash)
790
+ # labels = [entity.split(" - ")[0] for entity in selected_entities]
791
+
792
+ # if not labels:
793
+ # st.warning("Please select at least one entity type")
794
+ # labels = ["PERSON"] # Default if nothing selected
795
+
796
+
797
+
798
+
799
+
800
+ #New edit
801
+ # elif classification_type == "Multi-Class Classification":
802
+ # st.write("### Multi-Class Classification Labels")
803
+
804
+ # default_labels_by_domain = {
805
+ # "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
806
+ # "AG News": ["World", "Sports", "Business", "Sci/Tech"],
807
+ # "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
808
+ # "Food & Dining", "Local Experience", "Adventure Activities",
809
+ # "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
810
+ # "Luxury Tourism"],
811
+ # "Restaurant reviews": ["Italian", "French", "American"]
812
+ # }
813
+ # num_classes = st.slider("Number of classes", 3, 10, 3)
814
+
815
+ # # Get defaults for selected domain, or empty list
816
+ # defaults = default_labels_by_domain.get(domain, [])
817
+
818
+ # labels = []
819
+ # errors = []
820
+ # cols = st.columns(3)
821
+
822
+ # for i in range(num_classes):
823
+ # with cols[i % 3]:
824
+ # default_value = defaults[i] if i < len(defaults) else ""
825
+ # label_input = st.text_input(f"Class {i+1}", default_value)
826
+ # normalized_label = label_input.strip().title()
827
+
828
+ # if not normalized_label:
829
+ # errors.append(f"Class {i+1} name is required.")
830
+ # else:
831
+ # labels.append(normalized_label)
832
+
833
+ # # Check for duplicates (case-insensitive)
834
+ # if len(labels) != len(set(labels)):
835
+ # errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).")
836
+
837
+ # # Show validation results
838
+ # if errors:
839
+ # for error in errors:
840
+ # st.error(error)
841
+ # else:
842
+ # st.success("All Labels names are valid and unique!")
843
+ # labels_valid = not errors # Will be True only if there are no label errors
844
+
845
+
846
+
847
+
848
+ # else:
849
+ # num_classes = st.slider("Number of classes", 3, 23, 3, key="label_num_classes")
850
+ # labels = []
851
+ # cols = st.columns(3)
852
+ # for i in range(num_classes):
853
+ # with cols[i % 3]:
854
+ # label = st.text_input(f"Class {i+1}", f"Class_{i+1}", key=f"label_class_{i}")
855
+ # labels.append(label)
856
+
857
# Let the user supply their own few-shot examples. Each completed example is
# stored as a pre-formatted "<content>\nLabel: <label>" string.
use_few_shot = st.toggle("Use few-shot examples for labeling")
few_shot_examples = []
if use_few_shot:
    num_few_shot = st.slider("Number of few-shot examples", 1, 10, 1)
    for i in range(num_few_shot):
        with st.expander(f"Few-shot Example {i+1}"):
            content = st.text_area("Content", key=f"label_few_shot_content_{i}")
            label = st.selectbox("Label", labels, key=f"label_few_shot_label_{i}")
            # Skip examples that are still missing their text or their label.
            if not (content and label):
                continue
            few_shot_examples.append(f"{content}\nLabel: {label}")
867
+
868
# Collect the texts to label. Up to 20 examples get one input box each;
# larger batches are pasted into a single box, one example per line.
num_examples = st.number_input("Number of examples to classify", 1, 100, 1)

examples_to_classify = []
if num_examples > 20:
    examples_text = st.text_area(
        "Enter examples (one per line)",
        height=300,
        help="Enter each example on a new line",
    )
    if examples_text:
        stripped_lines = (raw_line.strip() for raw_line in examples_text.split('\n'))
        non_empty_lines = [entry for entry in stripped_lines if entry]
        # Keep at most the requested number of examples.
        examples_to_classify = non_empty_lines[:num_examples]
else:
    for i in range(num_examples):
        example = st.text_area(f"Example {i+1}", key=f"example_{i}")
        if example:
            examples_to_classify.append(example)
886
+
887
# System role customization: propose a default role sentence for the chosen
# task and domain, and let the user overwrite it.
default_system_role = " ".join([
    f"You are a highly skilled {classification_type} expert.",
    f"Your task is to accurately classify the provided text examples within the {domain} domain.",
    "Ensure that all classifications are precise, context-aware, and aligned with domain-specific standards and best practices.",
])
system_role = st.text_area(
    "Modify System Role (optional)",
    value=default_system_role,
    key="system_role_input",
)
# An emptied text box falls back to the default role.
st.session_state['system_role'] = system_role or default_system_role

user_prompt = st.text_area("User prompt (optional)", key="label_instructions")

# Prompt-ready strings: few-shot examples separated by blank lines, and the
# examples to label as a 1-based numbered list.
if few_shot_examples:
    few_shot_text = "\n\n".join(few_shot_examples)
else:
    few_shot_text = ""
examples_text = "\n".join(
    f"{position}. {ex}" for position, ex in enumerate(examples_to_classify, start=1)
)
906
+
907
# Pick the prompt wording for the selected task, then build one PromptTemplate
# from it. NER asks for entities grouped under each type; every other task
# asks for "text. Label: [label]" lines.
if classification_type == "Named Entity Recognition (NER)":
    prompt_input_variables = ["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"]
    prompt_template_text = (
        "{system_role}\n"
        "- You are an expert at Named Entity Recognition (NER) for domain: {domain}.\n"
        "- Use these entity types: {labels}.\n\n"
        "### Output Format:\n"
        "Return each example followed by the entities you found in this format:\n"
        "'Example text.\nEntity types:\n"
        "Then group the entities under each label like this:\n"
        "\nPERSON – Angela Merkel, John Smith\n"
        "ORG – Google, United Nations\n"
        "DATE – January 1st, 2023\n"
        "... and so on.\n\n"
        "Each new entities group should be in a new line.\n"
        "If entity type {labels} is not found, do not write it in your response.\n"
        "- Do NOT output them inline after the text.\n"
        "- Do NOT repeat the sentence.\n"
        "- If no entities are found for a type, skip it.\n"
        "- Keep the format consistent.\n\n"
        "User Instructions:\n{user_prompt}\n\n"
        "Few-shot Examples:\n{few_shot_examples}\n\n"
        "Examples to analyze:\n{examples}"
    )
else:
    prompt_input_variables = ["system_role", "classification_type", "labels", "few_shot_examples", "examples", "domain", "user_prompt"]
    prompt_template_text = (
        "{system_role}\n"
        "- Use the following instructions:\n"
        "- Use the following labels: {labels}.\n"
        "- Return the classified text followed by the label in this format: 'text. Label: [label]'\n"
        "- Do not provide any additional information or explanations\n"
        "- User prompt:\n {user_prompt}\n\n"
        "- Use user provided examples as guidence in the classification process:\n\n {few_shot_examples}\n"
        "- Examples to classify:\n{examples}\n\n"
        "- Think step by step then classify the examples"
    )

label_prompt_template = PromptTemplate(
    input_variables=prompt_input_variables,
    template=prompt_template_text,
)
1026
+
1027
# Normalise the few-shot examples into one prompt-ready string.
# Accepted shapes of `few_shot_examples`:
#   * str        -> used verbatim (already formatted)
#   * list[str]  -> already formatted entries, joined with a newline
#   * list[dict] -> each {"content", "label"} pair rendered as
#                   "<content>\n\nEntity types\n<label>", separated by blank lines
# Any other shape yields an empty string.
#
# The "Entity types" layout is applied only in the dict branch: indexing a
# plain-string example with ex['content'] raises TypeError, which is what
# happened whenever the examples were entered as formatted strings.
if isinstance(few_shot_examples, str):
    formatted_few_shot = few_shot_examples
elif isinstance(few_shot_examples, list) and all(isinstance(ex, str) for ex in few_shot_examples):
    # Also covers the empty list, which formats to "".
    formatted_few_shot = "\n".join(few_shot_examples)
elif isinstance(few_shot_examples, list) and all(
    isinstance(ex, dict) and 'content' in ex and 'label' in ex for ex in few_shot_examples
):
    formatted_few_shot = "\n\n".join(
        f"{ex['content']}\n\nEntity types\n{ex['label']}" for ex in few_shot_examples
    )
else:
    formatted_few_shot = ""
1056
+
1057
+ ###########
1058
# Render the prompt from the current inputs so it can be previewed before the
# labeling request is sent.
prompt_fields = {
    "system_role": st.session_state['system_role'],
    "classification_type": classification_type,
    "domain": domain,
    "examples": "\n".join(examples_to_classify),
    "labels": ", ".join(labels),
    "user_prompt": user_prompt,
    "few_shot_examples": formatted_few_shot,
}
system_prompt = label_prompt_template.format(**prompt_fields)

# Keep the rendered prompt in session state and show it read-only.
st.session_state['system_prompt'] = system_prompt
st.write("System Prompt:")
st.text_area(
    "System Prompt",
    value=st.session_state['system_prompt'],
    height=300,
    disabled=True,
)
1075
+
1076
+
1077
+
1078
# Handle a click on "Label Data":
#   1. render the prompt from the current inputs and stream the model's answer,
#   2. log the exchange in st.session_state.messages,
#   3. turn the answer into rows (one row holding the whole answer for NER,
#      one row per "<text> Label: <label>" line otherwise),
#   4. offer the rows as CSV / JSON downloads and show them as a table.
# Any exception raised while doing so is reported with st.error instead of
# aborting the script.
if st.button("🏷️ Label Data"):
    if examples_to_classify:
        with st.spinner("Labeling data..."):
            is_ner_task = classification_type == "Named Entity Recognition (NER)"

            # One format call serves both templates: the NER template simply
            # does not reference {classification_type}.
            system_prompt = label_prompt_template.format(
                classification_type=classification_type,
                system_role=st.session_state['system_role'],
                domain=domain,
                labels=", ".join(labels),
                few_shot_examples=few_shot_text,
                examples=examples_text,
                user_prompt=user_prompt,
            )
            try:
                stream = client.chat.completions.create(
                    model=selected_model,
                    messages=[{"role": "system", "content": system_prompt}],
                    temperature=temperature,
                    stream=True,
                    max_tokens=20000,
                    top_p=0.9,
                )
                # Log the request, render the streamed answer, then log it too.
                st.session_state.messages.append({"role": "user", "content": system_prompt})
                response = st.write_stream(stream)
                st.session_state.messages.append({"role": "assistant", "content": response})

                # Always record what was sent and received in this run, so the
                # exported rows never carry a prompt or response from an
                # earlier run.
                st.session_state.system_prompt = system_prompt
                st.session_state.response = response

                # Initialize session state variables if they don't exist
                if 'generated_examples' not in st.session_state:
                    st.session_state.generated_examples = []
                if 'generated_examples_csv' not in st.session_state:
                    st.session_state.generated_examples_csv = None
                if 'generated_examples_json' not in st.session_state:
                    st.session_state.generated_examples_json = None

                few_shot_flag = 'Yes' if use_few_shot else 'No'
                if is_ner_task:
                    # The grouped NER answer is kept whole in a single row.
                    task_type = 'Named Entity Recognition (NER)'
                    labeled_examples = [{
                        'ner_output': response.strip(),
                        'system_prompt': st.session_state.system_prompt,
                        'system_role': st.session_state.system_role,
                        'task_type': task_type,
                        'Use few-shot example?': few_shot_flag,
                    }]
                else:
                    task_type = 'Data Labeling'
                    labeled_examples = []
                    for line in response.split('\n'):
                        # Split on the last "Label:" so that text which itself
                        # contains "Label:" stays intact.
                        head, separator, tail = line.rpartition('Label:')
                        if not separator:
                            continue
                        text = head.strip()
                        label = tail.strip()
                        if text and label:
                            labeled_examples.append({
                                'text': text,
                                'label': label,
                                'system_prompt': st.session_state.system_prompt,
                                'system_role': st.session_state.system_role,
                                'task_type': task_type,
                                'Use few-shot example?': few_shot_flag,
                            })

                # Save and provide download options
                if labeled_examples:
                    st.session_state.labeled_examples = labeled_examples

                    df = pd.DataFrame(labeled_examples)
                    st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')
                    st.session_state.labeled_examples_json = json.dumps({
                        "metadata": {
                            "domain": domain,
                            "labels": labels,
                            "used_few_shot": use_few_shot,
                            # Report the task that actually ran rather than
                            # always claiming NER.
                            "task_type": task_type,
                            "timestamp": datetime.now().isoformat()
                        },
                        "examples": labeled_examples
                    }, indent=2).encode('utf-8')

                    # Download buttons
                    st.download_button(
                        "📥 Download Labeled Examples (CSV)",
                        st.session_state.labeled_examples_csv,
                        "labeled_examples.csv",
                        "text/csv",
                        key='download-labeled-csv'
                    )

                    st.markdown("""
                        <div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
                    """, unsafe_allow_html=True)

                    st.download_button(
                        "📥 Download Labeled Examples (JSON)",
                        st.session_state.labeled_examples_json,
                        "labeled_examples.json",
                        "application/json",
                        key='download-labeled-json'
                    )
                    # Display the labeled examples
                    st.markdown("##### 📋 Labeled Examples Preview")
                    st.dataframe(df, use_container_width=True)

                if st.button("Continue"):
                    # The follow-up radio (key "labeling_follow_up") is not
                    # rendered at the moment, so read its value from session
                    # state and default to labeling more data instead of
                    # referencing an undefined name.
                    follow_up = st.session_state.get("labeling_follow_up", "Label more data")
                    # st.experimental_rerun is absent from newer Streamlit
                    # releases; prefer st.rerun when it is available.
                    rerun = getattr(st, "rerun", None) or st.experimental_rerun
                    if follow_up == "Label more data":
                        st.session_state.examples_to_classify = []
                        rerun()
                    elif follow_up == "Data Generation":
                        # NOTE(review): this stores "Data Labeling" for the
                        # "Data Generation" choice - confirm the intended value.
                        st.session_state.task_choice = "Data Labeling"
                        rerun()

            except Exception as e:
                st.error("An error occurred during labeling.")
                st.error(f"Details: {e}")
    else:
        st.warning("Please enter at least one example to classify.")
1381
+
1382
+ #st.session_state.messages.append({"role": "assistant", "content": response})
1383
+
1384
+
1385
+
1386
+
1387
# Footer: a horizontal rule followed by a centred credit line rendered as HTML.
st.markdown("---")
footer_html = """
<div style='text-align: center'>
    <p>Made with ❤️ by Wedyan AlSakran 2025</p>
</div>
"""
st.markdown(footer_html, unsafe_allow_html=True)