Wedyan2023 commited on
Commit
4b87947
·
verified ·
1 Parent(s): a829e8a

Create app103.py

Browse files
Files changed (1) hide show
  1. app103.py +1424 -0
app103.py ADDED
@@ -0,0 +1,1424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import os
4
+ import json
5
+ import base64
6
+ import random
7
+ from streamlit_pdf_viewer import pdf_viewer
8
+ from langchain.prompts import PromptTemplate
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from openai import OpenAI
12
+ from dotenv import load_dotenv
13
+ import warnings
14
+
15
+ warnings.filterwarnings('ignore')
16
+
17
+ os.getenv("OAUTH_CLIENT_ID")
18
+
19
+
20
+ # Load environment variables and initialize the OpenAI client to use Hugging Face Inference API.
21
+ load_dotenv()
22
+ client = OpenAI(
23
+ base_url="https://api-inference.huggingface.co/v1",
24
+ api_key=os.environ.get('TOKEN2') # Hugging Face API token
25
+ )
26
+
27
+ # Create necessary directories
28
# Make sure the working directories exist before anything tries to use them.
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test and makes repeated script reruns a no-op.
for dir_name in ('data', 'feedback'):
    os.makedirs(dir_name, exist_ok=True)
31
+
32
+ # Custom CSS
33
+ st.markdown("""
34
+ <style>
35
+ .stButton > button {
36
+ width: 100%;
37
+ margin-bottom: 10px;
38
+ background-color: #4CAF50;
39
+ color: white;
40
+ border: none;
41
+ padding: 10px;
42
+ border-radius: 5px;
43
+ }
44
+ .task-button {
45
+ background-color: #2196F3 !important;
46
+ }
47
+ .stSelectbox {
48
+ margin-bottom: 20px;
49
+ }
50
+ .output-container {
51
+ padding: 20px;
52
+ border-radius: 5px;
53
+ border: 1px solid #ddd;
54
+ margin: 10px 0;
55
+ }
56
+ .status-container {
57
+ padding: 10px;
58
+ border-radius: 5px;
59
+ margin: 10px 0;
60
+ }
61
+ .sidebar-info {
62
+ padding: 10px;
63
+ background-color: #f0f2f6;
64
+ border-radius: 5px;
65
+ margin: 10px 0;
66
+ }
67
+ .feedback-button {
68
+ background-color: #ff9800 !important;
69
+ }
70
+ .feedback-container {
71
+ padding: 15px;
72
+ background-color: #f5f5f5;
73
+ border-radius: 5px;
74
+ margin: 15px 0;
75
+ }
76
+ </style>
77
+ """, unsafe_allow_html=True)
78
+
79
+ # Helper functions
80
def read_csv_with_encoding(file):
    """Read a CSV into a DataFrame, trying several text encodings in turn.

    Args:
        file: A filesystem path or a file-like object accepted by
            ``pandas.read_csv``.

    Returns:
        The ``DataFrame`` produced by the first encoding that decodes the
        input without raising ``UnicodeDecodeError``.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the input.
    """
    encodings = ('utf-8', 'latin1', 'iso-8859-1', 'cp1252')

    # Remember where a file-like input starts so it can be rewound between
    # attempts; paths and non-seekable streams simply skip the rewind.
    try:
        start_pos = file.tell()
    except (AttributeError, OSError):
        start_pos = None

    for encoding in encodings:
        try:
            return pd.read_csv(file, encoding=encoding)
        except UnicodeDecodeError:
            # The failed attempt may have consumed the stream; without a
            # rewind the next encoding would read from the wrong position.
            if start_pos is not None:
                file.seek(start_pos)

    # UnicodeDecodeError needs (encoding, object, start, end, reason);
    # calling it with a single message string raises TypeError instead.
    raise UnicodeDecodeError(
        'unknown', b'', 0, 1,
        "Failed to read file with any supported encoding",
    )
88
+
89
+ #def save_feedback(feedback_data):
90
+ #feedback_file = 'feedback/user_feedback.csv'
91
+ #feedback_df = pd.DataFrame([feedback_data])
92
+
93
+ #if os.path.exists(feedback_file):
94
+ #feedback_df.to_csv(feedback_file, mode='a', header=False, index=False)
95
+ #else:
96
+ #feedback_df.to_csv(feedback_file, index=False)
97
+
98
def reset_conversation():
    """Clear the stored chat history and forget the selected task.

    Resets ``messages`` and ``conversation`` in the Streamlit session state
    to empty lists and removes ``task_choice`` when it is present.
    Always returns ``None``.
    """
    state = st.session_state
    state.messages = []
    state.conversation = []
    if 'task_choice' in state:
        del state.task_choice
104
+ #new 24 March
105
+ #user_input = st.text_input("Enter your prompt:")
106
+ ###########33
107
+
108
+ # Initialize session state variables
109
# Seed each session-state key with an empty default the first time the
# script runs; keys that already exist are left untouched.
for _state_key, _make_default in (
    ("messages", list),
    ("examples_to_classify", list),
    ("system_role", str),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
115
+
116
+
117
+
118
+ # Main app title
119
+ st.title("🤖🦙 Text Data Labeling and Generation App")
120
+ # def embed_pdf_sidebar(pdf_path):
121
+ # with open(pdf_path, "rb") as f:
122
+ # base64_pdf = base64.b64encode(f.read()).decode('utf-8')
123
+ # pdf_display = f"""
124
+ # <iframe src="data:application/pdf;base64,{base64_pdf}"
125
+ # width="100%" height="400" type="application/pdf"></iframe>
126
+ # """
127
+ # st.markdown(pdf_display, unsafe_allow_html=True)
128
+ #
129
+
130
+
131
+ # Sidebar settings
132
+ with st.sidebar:
133
+ st.title("⚙️ Settings")
134
+
135
+
136
+ #this last code works
137
+ with st.sidebar:
138
+ st.markdown("### 📘Data Generation and Labeling Instructions")
139
+ #st.markdown("<h4 style='color: #4A90E2;'>📘 Instructions</h4>", unsafe_allow_html=True)
140
+ with open("User instructions.pdf", "rb") as f:
141
+ st.download_button(
142
+ label="📄 Download Instructions PDF",
143
+ data=f,
144
+ #file_name="instructions.pdf",
145
+ file_name="User instructions.pdf",
146
+ mime="application/pdf"
147
+ )
148
+
149
+ selected_model = st.selectbox(
150
+ "Select Model",
151
+ ["meta-llama/Llama-3.3-70B-Instruct", "meta-llama/Llama-3.2-3B-Instruct","meta-llama/Llama-4-Scout-17B-16E-Instruct", "meta-llama/Meta-Llama-3-8B-Instruct",
152
+ "meta-llama/Llama-3.1-70B-Instruct"],
153
+ key='model_select'
154
+ )
155
+
156
+ temperature = st.slider(
157
+ "Temperature",
158
+ 0.0, 1.0, 0.7,
159
+ help="Controls randomness in generation"
160
+ )
161
+
162
+ st.button("🔄 New Conversation", on_click=reset_conversation)
163
+ with st.container():
164
+ st.markdown(f"""
165
+ <div class="sidebar-info">
166
+ <h4>Current Model: {selected_model}</h4>
167
+ <p><em>Note: Generated content may be inaccurate or false. Check important info.</em></p>
168
+ </div>
169
+ """, unsafe_allow_html=True)
170
+
171
+ feedback_url = "https://docs.google.com/forms/d/e/1FAIpQLSdZ_5mwW-pjqXHgxR0xriyVeRhqdQKgb5c-foXlYAV55Rilsg/viewform?usp=header"
172
+ st.sidebar.markdown(
173
+ f'<a href="{feedback_url}" target="_blank"><button style="width: 100%;">Feedback Form</button></a>',
174
+ unsafe_allow_html=True
175
+ )
176
+
177
+ # Display conversation
178
+ for message in st.session_state.messages:
179
+ with st.chat_message(message["role"]):
180
+ st.markdown(message["content"])
181
+
182
+ # Main content
183
+ if 'task_choice' not in st.session_state:
184
+ col1, col2 = st.columns(2)
185
+ with col1:
186
+ if st.button("📝 Data Generation", key="gen_button", help="Generate new data"):
187
+ st.session_state.task_choice = "Data Generation"
188
+ with col2:
189
+ if st.button("🏷️ Data Labeling", key="label_button", help="Label existing data"):
190
+ st.session_state.task_choice = "Data Labeling"
191
+
192
+ if "task_choice" in st.session_state:
193
+ if st.session_state.task_choice == "Data Generation":
194
+ st.header("📝 Data Generation")
195
+
196
+ # 1. Domain selection
197
+ domain_selection = st.selectbox("Domain", [
198
+ "Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"
199
+ ])
200
+
201
+ # 2. Handle custom domain input
202
+ custom_domain_valid = True # Assume valid until proven otherwise
203
+
204
+ if domain_selection == "Custom":
205
+ domain = st.text_input("Specify custom domain")
206
+ if not domain.strip():
207
+ st.error("Please specify a domain name.")
208
+ custom_domain_valid = False
209
+ else:
210
+ domain = domain_selection
211
+
212
+ # Classification type selection
213
+ classification_type = st.selectbox(
214
+ "Classification Type",
215
+ ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification"]
216
+ )
217
+ # Labels setup based on classification type
218
+ #labels = []
219
+ labels = []
220
+ labels_valid = False
221
+ errors = []
222
+
223
def validate_binary_labels(labels):
    """Return the validation messages for a pair of binary class names.

    Args:
        labels: Sequence of class-name strings; the first two entries are
            the names being validated.

    Returns:
        A list of error strings, empty when both names are non-blank and
        differ after stripping whitespace and lower-casing.
    """
    cleaned = [name.strip().lower() for name in labels]
    required_messages = (
        (0, "First class name is required."),
        (1, "Second class name is required."),
    )

    problems = [
        message
        for position, message in required_messages
        if not labels[position].strip()
    ]

    # Only report a clash when every name is filled in; blank names are
    # already covered by the "required" messages.
    names_clash = cleaned[0] == cleaned[1]
    if names_clash and all(cleaned):
        problems.append("Class names must be different.")
    return problems
234
+
235
+ if classification_type == "Sentiment Analysis":
236
+ st.write("### Sentiment Analysis Labels (Fixed)")
237
+ col1, col2, col3 = st.columns(3)
238
+ with col1:
239
+ st.text_input("First class", "Positive", disabled=True)
240
+ with col2:
241
+ st.text_input("Second class", "Negative", disabled=True)
242
+ with col3:
243
+ st.text_input("Third class", "Neutral", disabled=True)
244
+ labels = ["Positive", "Negative", "Neutral"]
245
+
246
+ elif classification_type == "Binary Classification":
247
+ st.write("### Binary Classification Labels")
248
+ col1, col2 = st.columns(2)
249
+ with col1:
250
+ label_1 = st.text_input("First class", "Positive")
251
+ with col2:
252
+ label_2 = st.text_input("Second class", "Negative")
253
+
254
+ labels = [label_1, label_2]
255
+ errors = validate_binary_labels(labels)
256
+
257
+ if errors:
258
+ st.error("\n".join(errors))
259
+ else:
260
+ st.success("Binary class names are valid and unique!")
261
+
262
+
263
+ elif classification_type == "Multi-Class Classification":
264
+ st.write("### Multi-Class Classification Labels")
265
+
266
+ default_labels_by_domain = {
267
+ "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
268
+ "AG News": ["World", "Sports", "Business", "Sci/Tech"],
269
+ "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
270
+ "Food & Dining", "Local Experience", "Adventure Activities",
271
+ "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
272
+ "Luxury Tourism"],
273
+ "Restaurant reviews": ["Italian", "French", "American"],
274
+ "E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining",
275
+ "Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion",
276
+ "Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets",
277
+ "Books & Stationery","Toys & Games", "Sports & Fitness",
278
+ "Grocery & Gourmet Food","Watches & Accessories", "Baby Products"]
279
+ }
280
+
281
+ num_classes = st.slider("Number of classes", 3, 15, 3)
282
+
283
+ # Get defaults for selected domain, or empty list
284
+ defaults = default_labels_by_domain.get(domain, [])
285
+
286
+ labels = []
287
+ errors = []
288
+ cols = st.columns(3)
289
+
290
+ for i in range(num_classes):
291
+ with cols[i % 3]:
292
+ default_value = defaults[i] if i < len(defaults) else ""
293
+ label_input = st.text_input(f"Class {i+1}", default_value)
294
+ normalized_label = label_input.strip().title()
295
+
296
+ if not normalized_label:
297
+ errors.append(f"Class {i+1} name is required.")
298
+ else:
299
+ labels.append(normalized_label)
300
+
301
+ # Check for duplicates (case-insensitive)
302
+ if len(labels) != len(set(labels)):
303
+ errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).")
304
+
305
+ # Show validation results
306
+ if errors:
307
+ for error in errors:
308
+ st.error(error)
309
+ else:
310
+ st.success("All Labels names are valid and unique!")
311
+ labels_valid = not errors # Will be True only if there are no label errors
312
+
313
+ ##############
314
+ #new 22/4/2025
315
+ # add additional attributes
316
+ add_attributes = st.checkbox("Add additional attributes (optional)")
317
+ additional_attributes = []
318
+
319
+ if add_attributes:
320
+ num_attributes = st.slider("Number of attributes to add", 1, 5, 1)
321
+ for i in range(num_attributes):
322
+ st.markdown(f"#### Attribute {i+1}")
323
+ attr_name = st.text_input(f"Name of attribute {i+1}", key=f"attr_name_{i}")
324
+ attr_topics = st.text_input(f"Topics (comma-separated) for {attr_name}", key=f"attr_topics_{i}")
325
+ if attr_name and attr_topics:
326
+ topics_list = [topic.strip() for topic in attr_topics.split(",") if topic.strip()]
327
+ additional_attributes.append({"attribute": attr_name, "topics": topics_list})
328
+
329
+ ################
330
+
331
+ # Generation parameters
332
+ col1, col2 = st.columns(2)
333
+ with col1:
334
+ min_words = st.number_input("Min words", 1, 100, 20)
335
+ with col2:
336
+ max_words = st.number_input("Max words", min_words, 100, 50)
337
+
338
+ # Few-shot examples
339
+ use_few_shot = st.toggle("Use few-shot examples")
340
+ few_shot_examples = []
341
+ if use_few_shot:
342
+ num_examples = st.slider("Number of few-shot examples", 1, 10, 1)
343
+ for i in range(num_examples):
344
+ with st.expander(f"Example {i+1}"):
345
+ content = st.text_area(f"Content", key=f"few_shot_content_{i}")
346
+ label = st.selectbox(f"Label", labels, key=f"few_shot_label_{i}")
347
+ if content and label:
348
+ few_shot_examples.append({"content": content, "label": label})
349
+
350
+ num_to_generate = st.number_input("Number of examples", 1, 200, 10)
351
+ # system role after
352
+ # System role customization
353
+ #default_system_role = f"You are a professional {classification_type} expert, your role is to generate text examples for {domain} domain. Always generate unique diverse examples and do not repeat the generated data. The generated text should be between {min_words} to {max_words} words long."
354
+ # System role customization
355
+ default_system_role = (
356
+ f"You are a seasoned expert in {classification_type}, specializing in the {domain} domain. "
357
+ f" Your primary responsibility is to generate high-quality, diverse, and unique text examples "
358
+ f"tailored to this domain. Please ensure that each example adheres to the specified length "
359
+ f"requirements, ranging from {min_words} to {max_words} words, and avoid any repetition in the generated content."
360
+ )
361
+ system_role = st.text_area("Modify System Role (optional)",
362
+ value=default_system_role,
363
+ key="system_role_input")
364
+ st.session_state['system_role'] = system_role if system_role else default_system_role
365
+ # Labels initialization
366
+ #labels = []
367
+
368
+
369
+ user_prompt = st.text_area("User Prompt (optional)")
370
+
371
+ # Updated prompt template including system role
372
+ prompt_template = PromptTemplate(
373
+ input_variables=["system_role", "classification_type", "domain", "num_examples",
374
+ "min_words", "max_words", "labels", "user_prompt", "few_shot_examples", "additional_attributes"],
375
+ template=(
376
+ "{system_role}\n"
377
+ "- Use the following parameters:\n"
378
+ "- Generate {num_examples} examples\n"
379
+ "- Each example should be between {min_words} to {max_words} words long\n"
380
+ "- Use these labels: {labels}.\n"
381
+ "- Use the following additional attributes:\n"
382
+ "- {additional_attributes}\n"
383
+ "- Generate the examples in this format: 'Example text. Label: label'\n"
384
+ "- Do not include word counts or any additional information\n"
385
+ "- Always use your creativity and intelligence to generate unique and diverse text data\n"
386
+ "- In sentiment analysis, ensure that the sentiment classification is clearly identified as Positive, Negative, or Neutral. Do not leave the sentiment ambiguous.\n"
387
+ "- In binary sentiment analysis, classify text strictly as either Positive or Negative. Do not include or imply Neutral as an option.\n"
388
+ "- Write unique examples every time.\n"
389
+ "- DO NOT REPEAT your gnerated text. \n"
390
+ "- For each Output, describe it once and move to the next.\n"
391
+ "- List each Output only once, and avoid repeating details.\n"
392
+ "- Additional instructions: {user_prompt}\n\n"
393
+ "- Use the following examples as a reference in the generation process\n\n {few_shot_examples}. \n"
394
+ "- Think step by step, generate numbered examples, and check each newly generated example to ensure it has not been generated before. If it has, modify it"
395
+
396
+ )
397
+ )
398
+ # template=(
399
+ # "{system_role}\n"
400
+ # "- Use the following parameters:\n"
401
+ # "- Generate {num_examples} examples\n"
402
+ # "- Each example should be between {min_words} to {max_words} words long\n"
403
+ # "- Use these labels: {labels}.\n"
404
+ # "- Use the following additional attributes:\n"
405
+ # "{additional_attributes}\n"
406
+ # #"- Format each example like this: 'Example text. Label: [label]. Attribute1: [topic1]. Attribute2: [topic2]'\n"
407
+ # "- Generate the examples in this format: 'Example text. Label: label'\n"
408
+ # "- Additional instructions: {user_prompt}\n"
409
+ # "- Use these few-shot examples if provided:\n{few_shot_examples}\n"
410
+ # "- Think step by step and ensure examples are unique and not repeated."
411
+ # )
412
+ # )
413
+ ##########new 22/4/2025
414
+ formatted_attributes = "\n".join([
415
+ f"- {attr['attribute']}: {', '.join(attr['topics'])}" for attr in additional_attributes
416
+ ])
417
+ #######################
418
+
419
+ # Generate system prompt
420
+ system_prompt = prompt_template.format(
421
+ system_role=st.session_state['system_role'],
422
+ classification_type=classification_type,
423
+ domain=domain,
424
+ num_examples=num_to_generate,
425
+ min_words=min_words,
426
+ max_words=max_words,
427
+ labels=", ".join(labels),
428
+ user_prompt=user_prompt,
429
+ few_shot_examples="\n".join([f"{ex['content']}\nLabel: {ex['label']}" for ex in few_shot_examples]) if few_shot_examples else "",
430
+ additional_attributes=formatted_attributes
431
+ )
432
+
433
+
434
+ # Store system prompt in session state
435
+ st.session_state['system_prompt'] = system_prompt
436
+
437
+ # Display system prompt
438
+ st.write("System Prompt:")
439
+ st.text_area("Current System Prompt", value=st.session_state['system_prompt'],
440
+ height=400, disabled=True)
441
+
442
+
443
+ if st.button("🎯 Generate Examples"):
444
+ #
445
+ errors = []
446
+ if domain_selection == "Custom" and not domain.strip():
447
+ st.warning("Custom domain name is required.")
448
+ elif len(labels) != len(set(labels)):
449
+ st.warning("Class names must be unique.")
450
+ elif any(not lbl.strip() for lbl in labels):
451
+ st.warning("All class labels must be filled in.")
452
+ #else:
453
+ #st.success("Generating examples for domain: {domain}")
454
+
455
+ #if not custom_domain_valid:
456
+ #st.warning("Custom domain name is required.")
457
+ #elif not labels_valid:
458
+ #st.warning("Please fix the label errors before generating examples.")
459
+ #else:
460
+ # Proceed to generate examples
461
+ #st.success(f"Generating examples for domain: {domain}")
462
+
463
+ with st.spinner("Generating examples..."):
464
+ try:
465
+ stream = client.chat.completions.create(
466
+ model=selected_model,
467
+ messages=[{"role": "system", "content": st.session_state['system_prompt']}],
468
+ temperature=temperature,
469
+ stream=True,
470
+ max_tokens=80000,
471
+ top_p=0.9,
472
+ # repetition_penalty=1.2,
473
+ #frequency_penalty=0.5, # Discourages frequent words
474
+ #presence_penalty=0.6,
475
+ )
476
+ #st.session_state['system_prompt'] = system_prompt
477
+ #new 24 march
478
+ st.session_state.messages.append({"role": "user", "content": system_prompt})
479
+ # # ####################
480
+ response = st.write_stream(stream)
481
+ st.session_state.messages.append({"role": "assistant", "content": response})
482
+ # Initialize session state variables if they don't exist
483
+ if 'system_prompt' not in st.session_state:
484
+ st.session_state.system_prompt = system_prompt
485
+
486
+ if 'response' not in st.session_state:
487
+ st.session_state.response = response
488
+
489
+ if 'generated_examples' not in st.session_state:
490
+ st.session_state.generated_examples = []
491
+
492
+ if 'generated_examples_csv' not in st.session_state:
493
+ st.session_state.generated_examples_csv = None
494
+
495
+ if 'generated_examples_json' not in st.session_state:
496
+ st.session_state.generated_examples_json = None
497
+
498
+ # Parse response and generate examples list
499
+ examples_list = []
500
+ for line in response.split('\n'):
501
+ if line.strip():
502
+ parts = line.rsplit('Label:', 1)
503
+ if len(parts) == 2:
504
+ text = parts[0].strip()
505
+ label = parts[1].strip()
506
+ if text and label:
507
+ examples_list.append({
508
+ 'text': text,
509
+ 'label': label,
510
+ 'system_prompt': st.session_state.system_prompt,
511
+ 'system_role': st.session_state.system_role,
512
+ 'task_type': 'Data Generation',
513
+ 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
514
+ })
515
+
516
+ # example_dict = {
517
+ # 'text': text,
518
+ # 'label': label,
519
+ # 'system_prompt': st.session_state.system_prompt,
520
+ # 'system_role': st.session_state.system_role,
521
+ # 'task_type': 'Data Generation',
522
+ # 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
523
+ # }
524
+ # for attr in additional_attributes:
525
+ # example_dict[attr['attribute']] = random.choice(attr['topics'])
526
+
527
+ # examples_list.append(example_dict)
528
+
529
+
530
+ if examples_list:
531
+ # Update session state with new data
532
+ st.session_state.generated_examples = examples_list
533
+
534
+ # Generate CSV and JSON data
535
+ df = pd.DataFrame(examples_list)
536
+ st.session_state.generated_examples_csv = df.to_csv(index=False).encode('utf-8')
537
+ st.session_state.generated_examples_json = json.dumps(examples_list, indent=2).encode('utf-8')
538
+
539
+ # Vertical layout with centered "or" between buttons
540
+ st.download_button(
541
+ "📥 Download Generated Examples (CSV)",
542
+ st.session_state.generated_examples_csv,
543
+ "generated_examples.csv",
544
+ "text/csv",
545
+ key='download-csv-persistent'
546
+ )
547
+
548
+ # Add space and center the "or"
549
+ st.markdown("""
550
+ <div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
551
+ """, unsafe_allow_html=True)
552
+
553
+ st.download_button(
554
+ "📥 Download Generated Examples (JSON)",
555
+ st.session_state.generated_examples_json,
556
+ "generated_examples.json",
557
+ "application/json",
558
+ key='download-json-persistent'
559
+ )
560
+ # Display the labeled examples
561
+ st.markdown("##### 📋 Labeled Examples Preview")
562
+ st.dataframe(df, use_container_width=True)
563
+
564
+ if st.button("Continue"):
565
+ if follow_up == "Generate more examples":
566
+ st.experimental_rerun()
567
+ elif follow_up == "Data Labeling":
568
+ st.session_state.task_choice = "Data Labeling"
569
+ st.experimental_rerun()
570
+
571
+ except Exception as e:
572
+ st.error("An error occurred during generation.")
573
+ st.error(f"Details: {e}")
574
+
575
+
576
+ # Labeling process
577
+ elif st.session_state.task_choice == "Data Labeling":
578
+ st.header("🏷️ Data Labeling")
579
+
580
+ domain_selection = st.selectbox("Domain", ["Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"])
581
+ # 2. Handle custom domain input
582
+ custom_domain_valid = True # Assume valid until proven otherwise
583
+
584
+ if domain_selection == "Custom":
585
+ domain = st.text_input("Specify custom domain")
586
+ if not domain.strip():
587
+ st.error("Please specify a domain name.")
588
+ custom_domain_valid = False
589
+ else:
590
+ domain = domain_selection
591
+
592
+
593
+ # Classification type selection
594
+ classification_type = st.selectbox(
595
+ "Classification Type",
596
+ ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification", "Named Entity Recognition (NER)"]
597
+ )
598
+ #NNew edit
599
+ # Labels setup based on classification type
600
+ labels = []
601
+ labels_valid = False
602
+ errors = []
603
+
604
+ if classification_type == "Sentiment Analysis":
605
+ st.write("### Sentiment Analysis Labels (Fixed)")
606
+ col1, col2, col3 = st.columns(3)
607
+ with col1:
608
+ label_1 = st.text_input("First class", "Positive", disabled=True)
609
+ with col2:
610
+ label_2 = st.text_input("Second class", "Negative", disabled=True)
611
+ with col3:
612
+ label_3 = st.text_input("Third class", "Neutral", disabled=True)
613
+ labels = ["Positive", "Negative", "Neutral"]
614
+
615
+
616
+ elif classification_type == "Binary Classification":
617
+ st.write("### Binary Classification Labels")
618
+ col1, col2 = st.columns(2)
619
+
620
+ with col1:
621
+ label_1 = st.text_input("First class", "Positive")
622
+ with col2:
623
+ label_2 = st.text_input("Second class", "Negative")
624
+
625
+ errors = []
626
+ labels = [label_1.strip(), label_2.strip()]
627
+
628
+
629
+ # Strip and lower-case labels for validation
630
+ label_1 = labels[0].strip()
631
+ label_2 = labels[1].strip()
632
+
633
+ # Check for empty class names
634
+ if not label_1:
635
+ errors.append("First class name is required.")
636
+ if not label_2:
637
+ errors.append("Second class name is required.")
638
+
639
+ # Check for duplicates (case insensitive)
640
+ if label_1.lower() == label_2.lower() and label_1 and label_2:
641
+ errors.append("Class names must be different.")
642
+
643
+ # Show errors or success
644
+ if errors:
645
+ for error in errors:
646
+ st.error(error)
647
+ else:
648
+ st.success("Binary class names are valid and unique!")
649
+
650
+
651
+ elif classification_type == "Multi-Class Classification":
652
+ st.write("### Multi-Class Classification Labels")
653
+
654
+ default_labels_by_domain = {
655
+ "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
656
+ "AG News": ["World", "Sports", "Business", "Sci/Tech"],
657
+ "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
658
+ "Food & Dining", "Local Experience", "Adventure Activities",
659
+ "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
660
+ "Luxury Tourism"],
661
+ "Restaurant reviews": ["Italian", "French", "American"],
662
+ "E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining",
663
+ "Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion",
664
+ "Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets",
665
+ "Books & Stationery","Toys & Games", "Sports & Fitness",
666
+ "Grocery & Gourmet Food","Watches & Accessories", "Baby Products"]
667
+ }
668
+
669
+
670
+
671
+ # Ask user how many classes they want to define
672
+ num_classes = st.slider("Select the number of classes (labels)", min_value=3, max_value=10, value=3)
673
+
674
+ # Use default labels based on selected domain, if available
675
+ defaults = default_labels_by_domain.get(domain, [])
676
+
677
+ labels = []
678
+ errors = []
679
+ cols = st.columns(3) # For nicely arranged label inputs
680
+
681
+ for i in range(num_classes):
682
+ with cols[i % 3]: # Distribute inputs across columns
683
+ default_value = defaults[i] if i < len(defaults) else ""
684
+ label_input = st.text_input(f"Label {i + 1}", default_value)
685
+ normalized_label = label_input.strip().title()
686
+
687
+ if not normalized_label:
688
+ errors.append(f"Label {i + 1} is required.")
689
+ else:
690
+ labels.append(normalized_label)
691
+
692
+ # Check for duplicates (case-insensitive)
693
+ normalized_set = {label.lower() for label in labels}
694
+ if len(labels) != len(normalized_set):
695
+ errors.append("Label names must be unique (case-insensitive).")
696
+
697
+ # Show validation results
698
+ if errors:
699
+ for error in errors:
700
+ st.error(error)
701
+ else:
702
+ st.success("All label names are valid and unique!")
703
+
704
+ labels_valid = not errors # True if no validation errors
705
+
706
+ elif classification_type == "Named Entity Recognition (NER)":
707
+
708
+ #new 22/4/2025
709
+ #if classification_type == "Named Entity Recognition (NER)":
710
+ use_few_shot = True
711
+ #new 22/4/2025
712
+ few_shot_examples = [
713
+ {"content": "Mount Everest is the tallest mountain in the world.", "label": "LOC: Mount Everest"},
714
+ {"content": "The President of the United States visited Paris last summer.", "label": "GPE: United States, GPE: Paris"},
715
+ {"content": "Amazon is expanding its offices in Berlin.", "label": "ORG: Amazon, GPE: Berlin"},
716
+ {"content": "J.K. Rowling wrote the Harry Potter books.", "label": "PERSON: J.K. Rowling"},
717
+ {"content": "Apple was founded in California in 1976.", "label": "ORG: Apple, GPE: California, DATE: 1976"},
718
+ {"content": "The Nile is the longest river in Africa.", "label": "LOC: Nile, GPE: Africa"},
719
+ {"content": "He arrived at 3 PM for the meeting.", "label": "TIME: 3 PM"},
720
+ {"content": "She bought the dress for $200.", "label": "MONEY: $200"},
721
+ {"content": "The event is scheduled for July 4th.", "label": "DATE: July 4th"},
722
+ {"content": "The World Health Organization is headquartered in Geneva.", "label": "ORG: World Health Organization, GPE: Geneva"}
723
+ ]
724
+ ###########
725
+
726
# NER label picker: the user chooses among the built-in entity types and any
# custom types entered below; `labels` ends up holding the bare type names.
st.write("### Named Entity Recognition (NER) Entities")

# Built-in catalogue; every option is rendered as "TYPE - description".
ner_entities = [
    "PERSON - Names of people, fictional characters, historical figures",
    "ORG - Companies, institutions, agencies, teams",
    "LOC - Physical locations (mountains, oceans, etc.)",
    "GPE - Countries, cities, states, political regions",
    "DATE - Calendar dates, years, centuries",
    "TIME - Times, durations",
    "MONEY - Monetary values with currency",
]

# Optional user-defined types, stored in the same "TYPE - description" shape.
custom_ner_entities = []
if st.checkbox("Add custom NER entities?"):
    num_custom_ner = st.slider("Number of custom NER entities", 1, 10, 1)
    for slot in range(num_custom_ner):
        st.markdown(f"#### Custom Entity {slot+1}")
        custom_type = st.text_input(f"Entity type {slot+1}", key=f"custom_ner_type_{slot}")
        custom_description = st.text_input(f"Description for {custom_type}", key=f"custom_ner_desc_{slot}")
        # Rows missing either the type or the description are ignored.
        if not custom_type or not custom_description:
            continue
        custom_ner_entities.append(f"{custom_type.upper()} - {custom_description}")

# Built-in options first, then the custom ones; only built-ins are pre-selected.
all_ner_options = [*ner_entities, *custom_ner_entities]
selected_entities = st.multiselect(
    "Select entities to recognize",
    all_ner_options,
    default=ner_entities,
)

# Keep only the type name, i.e. the text before the first " - ".
labels = []
for option in selected_entities:
    type_name = option.partition(" - ")[0]
    labels.append(type_name.strip())

if len(labels) == 0:
    st.warning("Please select at least one entity type.")
    labels = ["PERSON"]
765
+
766
+ ##########
767
+
768
+ # # Extract just the entity type (before the dash)
769
+ # labels = [entity.split(" - ")[0] for entity in selected_entities]
770
+
771
+ # if not labels:
772
+ # st.warning("Please select at least one entity type")
773
+ # labels = ["PERSON"] # Default if nothing selected
774
+
775
+
776
+
777
+
778
+
779
+ #NNew edit
780
+ # elif classification_type == "Multi-Class Classification":
781
+ # st.write("### Multi-Class Classification Labels")
782
+
783
+ # default_labels_by_domain = {
784
+ # "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
785
+ # "AG News": ["World", "Sports", "Business", "Sci/Tech"],
786
+ # "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
787
+ # "Food & Dining", "Local Experience", "Adventure Activities",
788
+ # "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
789
+ # "Luxury Tourism"],
790
+ # "Restaurant reviews": ["Italian", "French", "American"]
791
+ # }
792
+ # num_classes = st.slider("Number of classes", 3, 10, 3)
793
+
794
+ # # Get defaults for selected domain, or empty list
795
+ # defaults = default_labels_by_domain.get(domain, [])
796
+
797
+ # labels = []
798
+ # errors = []
799
+ # cols = st.columns(3)
800
+
801
+ # for i in range(num_classes):
802
+ # with cols[i % 3]:
803
+ # default_value = defaults[i] if i < len(defaults) else ""
804
+ # label_input = st.text_input(f"Class {i+1}", default_value)
805
+ # normalized_label = label_input.strip().title()
806
+
807
+ # if not normalized_label:
808
+ # errors.append(f"Class {i+1} name is required.")
809
+ # else:
810
+ # labels.append(normalized_label)
811
+
812
+ # # Check for duplicates (case-insensitive)
813
+ # if len(labels) != len(set(labels)):
814
+ # errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).")
815
+
816
+ # # Show validation results
817
+ # if errors:
818
+ # for error in errors:
819
+ # st.error(error)
820
+ # else:
821
+ # st.success("All Labels names are valid and unique!")
822
+ # labels_valid = not errors # Will be True only if there are no label errors
823
+
824
+
825
+
826
+
827
+ # else:
828
+ # num_classes = st.slider("Number of classes", 3, 23, 3, key="label_num_classes")
829
+ # labels = []
830
+ # cols = st.columns(3)
831
+ # for i in range(num_classes):
832
+ # with cols[i % 3]:
833
+ # label = st.text_input(f"Class {i+1}", f"Class_{i+1}", key=f"label_class_{i}")
834
+ # labels.append(label)
835
+
836
# Optional few-shot examples: each expander collects one text and its label.
# Completed pairs are stored as a single string, content first, then a
# "Label: <label>" line.
use_few_shot = st.toggle("Use few-shot examples for labeling")
few_shot_examples = []
if use_few_shot:
    num_few_shot = st.slider("Number of few-shot examples", 1, 10, 1)
    for shot in range(num_few_shot):
        with st.expander(f"Few-shot Example {shot+1}"):
            content = st.text_area("Content", key=f"label_few_shot_content_{shot}")
            label = st.selectbox("Label", labels, key=f"label_few_shot_label_{shot}")
        # Skip slots the user has not filled in yet.
        if not (content and label):
            continue
        few_shot_examples.append(f"{content}\nLabel: {label}")
846
+
847
# Collect the texts to label. Up to 20 examples get one text area each;
# larger batches are pasted into a single area, one example per line.
num_examples = st.number_input("Number of examples to classify", 1, 100, 1)

examples_to_classify = []
if num_examples > 20:
    examples_text = st.text_area(
        "Enter examples (one per line)",
        height=300,
        help="Enter each example on a new line",
    )
    if examples_text:
        stripped_lines = (raw.strip() for raw in examples_text.split('\n'))
        # Blank lines are dropped and anything beyond the requested count is cut off.
        examples_to_classify = [line for line in stripped_lines if line][:num_examples]
else:
    for position in range(num_examples):
        example = st.text_area(f"Example {position+1}", key=f"example_{position}")
        if example:
            examples_to_classify.append(example)
865
+
866
+ #New Wedyan
867
+ #default_system_role = f"You are a professional {classification_type} expert, your role is to classify the provided text examples for {domain} domain."
868
+ # System role customization
869
# Default system role shown to the user, who may edit it in the text area below.
default_system_role = (
    f"You are a highly skilled {classification_type} expert."
    f" Your task is to accurately classify the provided text examples within the {domain} domain."
    " Ensure that all classifications are precise, context-aware, and aligned with domain-specific standards and best practices."
)
system_role = st.text_area(
    "Modify System Role (optional)",
    value=default_system_role,
    key="system_role_input",
)
# An emptied text area falls back to the default role.
st.session_state['system_role'] = system_role or default_system_role

user_prompt = st.text_area("User prompt (optional)", key="label_instructions")

# Few-shot examples are separated by blank lines; the texts to label become a
# 1-based numbered list.
few_shot_text = "\n\n".join(few_shot_examples)
examples_text = "\n".join(
    f"{number}. {ex}" for number, ex in enumerate(examples_to_classify, start=1)
)
885
+
886
# Build the prompt template for the selected task.
#
# Both templates are filled with: system_role, labels, few_shot_examples,
# examples, domain and user_prompt (the classification template additionally
# declares classification_type).
#
# Fix: the PERSON sample line previously ended with the invalid escape "\ n",
# which put a literal backslash-space-n into the prompt, and lacked the space
# before "[" that the ORG/DATE sample lines have. Earlier commented-out drafts
# of the NER template were removed.
if classification_type == "Named Entity Recognition (NER)":
    label_prompt_template = PromptTemplate(
        input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
        template=(
            "{system_role}\n"
            "- You are an expert at Named Entity Recognition (NER) for domain: {domain}.\n"
            "- Use these entity types: {labels}.\n\n"
            "### Output Format:\n"
            "Return each example followed by the entities you found in this format:\n"
            "'Example text.\n \n and in new line Entity types:\n"
            "Then group the entities under each label like this:\n"
            "\nPERSON – [Angela Merkel, John Smith]\n"
            "\nORG – [Google, United Nations]\n"
            "\nDATE – [January 1st, 2023]\n"
            "\n... and so on.\n\n"
            "and write each new entities group in a new line.\n"
            "If entity type {labels} is not found, do not write it in your response.\n"
            "- Do NOT output them inline after the text.\n"
            "- Do NOT repeat the sentence.\n"
            "- If no entities are found for a type, skip it.\n"
            "- Keep the format consistent.\n\n"
            "User Instructions:\n{user_prompt}\n\n"
            "Few-shot Examples:\n{few_shot_examples}\n\n"
            "Examples to analyze:\n{examples}"
        ),
    )
else:
    label_prompt_template = PromptTemplate(
        input_variables=["system_role", "classification_type", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
        template=(
            "{system_role}\n"
            "- Use the following instructions:\n"
            "- Use the following labels: {labels}.\n"
            "- Return the classified text followed by the label in this format: 'text. Label: [label]'\n"
            "- In sentiment classification, ensure the output clearly distinguishes between the three categories: Positive, Negative, and Neutral. Each classification should be unambiguous and accurately reflect the sentiment expressed in the text.\n"
            "- In binary sentiment classification, restrict the output to either Positive or Negative only. Do not classify or imply Neutral. If the sentiment is ambiguous or mixed, lean toward the dominant tone.\n"
            "- Do not provide any additional information or explanations\n"
            "- User prompt:\n {user_prompt}\n\n"
            "- Use user provided examples as guidence in the classification process:\n\n {few_shot_examples}\n"
            "- Examples to classify:\n{examples}\n\n"
            "- Think step by step then classify the examples"
        ),
    )
1008
+
1009
# Normalise `few_shot_examples` into one string for the prompt preview.
#
# Accepted shapes:
#   * str                      -> used as-is
#   * list of str              -> joined with newlines (an empty list gives "")
#   * list of {content, label} -> rendered as "<content>", a blank line,
#                                 "Entity types", then "<label>"
#   * anything else            -> ""
#
# Fix: the dict rendering used to run unconditionally after this dispatch,
# overwriting its result and raising TypeError for the plain-string examples
# collected from the few-shot widgets. It now runs only in the dict branch.
if isinstance(few_shot_examples, str):
    formatted_few_shot = few_shot_examples
elif isinstance(few_shot_examples, list) and all(isinstance(ex, str) for ex in few_shot_examples):
    formatted_few_shot = "\n".join(few_shot_examples)
elif isinstance(few_shot_examples, list) and all(
    isinstance(ex, dict) and 'content' in ex and 'label' in ex for ex in few_shot_examples
):
    formatted_few_shot = "\n\n".join(
        f"{ex['content']}\n\nEntity types\n{ex['label']}" for ex in few_shot_examples
    )
else:
    formatted_few_shot = ""
1038
+
1039
+ ###########
1040
# Render the prompt from the current widget values, remember it in session
# state and show it read-only.
#
# Fix: the examples are now passed as the numbered `examples_text` list, the
# same form that is sent when "Label Data" is pressed; the preview previously
# showed the raw, un-numbered lines.
system_prompt = label_prompt_template.format(
    system_role=st.session_state['system_role'],
    classification_type=classification_type,
    domain=domain,
    examples=examples_text,
    labels=", ".join(labels),
    user_prompt=user_prompt,
    few_shot_examples=formatted_few_shot,
)

st.session_state['system_prompt'] = system_prompt

st.write("System Prompt:")
st.text_area(
    "System Prompt",
    value=st.session_state['system_prompt'],
    height=300,
    disabled=True,
)
1057
+
1058
+
1059
+
1060
def _parse_classification_rows(response):
    """Return ``(text, label)`` pairs from lines shaped like ``<text> Label: <label>``.

    Lines without the ``Label:`` marker, or with an empty side, are skipped.
    """
    pairs = []
    for line in response.split('\n'):
        text, marker, label = line.rpartition('Label:')
        if marker and text.strip() and label.strip():
            pairs.append((text.strip(), label.strip()))
    return pairs


def _parse_ner_rows(response, inputs):
    """Return ``(text, entities)`` pairs extracted from an NER response.

    Three layouts are tried in order:

    1. Inline lines ``<text> Entities: <...>`` (the older output format).
    2. One ``Entity types`` block per input, matched to the inputs by position
       (the format the current NER prompt asks for).
    3. If the blocks cannot be aligned with the inputs, a single pair holding
       all inputs and the whole response, so the answer is still exportable.
    """
    inline = []
    for line in response.strip().split('\n'):
        text, marker, entities = line.rpartition('Entities:')
        if marker and text.strip() and entities.strip():
            inline.append((text.strip(), entities.strip()))
    if inline:
        return inline

    # Everything after each "Entity types" marker belongs to one example.
    blocks = [chunk.strip(" :*\n\t") for chunk in response.split("Entity types")[1:]]
    blocks = [chunk for chunk in blocks if chunk]
    if len(blocks) == len(inputs):
        pairs = []
        for idx, (text, block) in enumerate(zip(inputs, blocks)):
            if idx + 1 < len(inputs):
                # If the model echoed the next sentence, cut it off this block.
                cut = block.find(inputs[idx + 1].strip())
                if cut > 0:
                    block = block[:cut].rstrip()
            pairs.append((text.strip(), block))
        return pairs

    whole = response.strip()
    return [("\n".join(inputs), whole)] if whole else []


# "Label Data": send the prompt to the model, stream the answer, parse it into
# rows and offer the rows as CSV / JSON downloads plus a preview table.
#
# Changes compared with the previous version:
#   * The NER parser understands the "Entity types" layout the NER prompt
#     requests; before, it only matched "Entities:" and produced no rows.
#   * The prompt actually sent and the latest response are always written to
#     session state, so exported rows record what the model really received.
#   * One shared export path serves both task types; the metadata JSON that was
#     immediately overwritten (and always claimed NER) is gone.
#   * The nested "Continue" button was removed: it read an undefined
#     `follow_up`, called the removed `st.experimental_rerun`, and could never
#     fire because the outer button is False on the rerun its click triggers.
#   * A warning is shown when nothing could be parsed instead of silently
#     showing no downloads.
if st.button("🏷️ Label Data"):
    if not examples_to_classify:
        st.warning("Please enter at least one example to classify.")
    else:
        with st.spinner("Labeling data..."):
            is_ner = classification_type == "Named Entity Recognition (NER)"
            # The NER template does not use classification_type; the preview
            # above already passes it to both templates.
            system_prompt = label_prompt_template.format(
                system_role=st.session_state['system_role'],
                classification_type=classification_type,
                domain=domain,
                labels=", ".join(labels),
                few_shot_examples=few_shot_text,
                examples=examples_text,
                user_prompt=user_prompt,
            )
            try:
                stream = client.chat.completions.create(
                    model=selected_model,
                    messages=[{"role": "system", "content": system_prompt}],
                    temperature=temperature,
                    stream=True,
                    max_tokens=20000,
                    top_p=0.9,
                )
                # Log the exchange in the chat history.
                st.session_state.messages.append({"role": "user", "content": system_prompt})
                response = st.write_stream(stream)
                st.session_state.messages.append({"role": "assistant", "content": response})

                # Always record what was sent and received for this run.
                st.session_state.system_prompt = system_prompt
                st.session_state.response = response
                for state_key, initial in (
                    ('generated_examples', []),
                    ('generated_examples_csv', None),
                    ('generated_examples_json', None),
                ):
                    if state_key not in st.session_state:
                        st.session_state[state_key] = initial

                # Columns shared by every exported row.
                shared_fields = {
                    'system_prompt': st.session_state.system_prompt,
                    'system_role': st.session_state.system_role,
                    'task_type': 'Named Entity Recognition (NER)' if is_ner else 'Data Labeling',
                    'Use few-shot example?': 'Yes' if use_few_shot else 'No',
                }
                if is_ner:
                    labeled_examples = [
                        {'text': text, 'entities': entities, **shared_fields}
                        for text, entities in _parse_ner_rows(response, examples_to_classify)
                    ]
                else:
                    labeled_examples = [
                        {'text': text, 'label': label, **shared_fields}
                        for text, label in _parse_classification_rows(response)
                    ]

                if labeled_examples:
                    st.session_state.labeled_examples = labeled_examples
                    df = pd.DataFrame(labeled_examples)

                    st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')
                    st.session_state.labeled_examples_json = json.dumps(labeled_examples, indent=2).encode('utf-8')

                    st.download_button(
                        "📥 Download Labeled Examples (CSV)",
                        st.session_state.labeled_examples_csv,
                        "labeled_examples.csv",
                        "text/csv",
                        key='download-labeled-csv',
                    )

                    st.markdown(
                        """
                        <div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
                        """,
                        unsafe_allow_html=True,
                    )

                    st.download_button(
                        "📥 Download Labeled Examples (JSON)",
                        st.session_state.labeled_examples_json,
                        "labeled_examples.json",
                        "application/json",
                        key='download-labeled-json',
                    )

                    st.markdown("##### 📋 Labeled Examples Preview")
                    st.dataframe(df, use_container_width=True)
                else:
                    st.warning(
                        "The model response could not be parsed into labeled examples; "
                        "see the raw output above."
                    )

            except Exception as e:
                # UI boundary: show the failure to the user instead of crashing the app.
                st.error("An error occurred during labeling.")
                st.error(f"Details: {e}")
1409
+
1410
+ #st.session_state.messages.append({"role": "assistant", "content": response})
1411
+
1412
+
1413
+
1414
+
1415
# Page footer: a horizontal rule followed by a centred credit line.
footer_html = """
    <div style='text-align: center'>
    <p>Made with ❤️ by Wedyan AlSakran 2025</p>
    </div>
    """
st.markdown("---")
st.markdown(footer_html, unsafe_allow_html=True)