Wedyan2023 committed on
Commit b31be4a · verified
1 Parent(s): 1d87dfc

Delete app104.py

Files changed (1)
  1. app104.py +0 -1574
app104.py DELETED
@@ -1,1574 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import os
4
- import json
5
- import base64
6
- import random
7
- from streamlit_pdf_viewer import pdf_viewer
8
- from langchain.prompts import PromptTemplate
9
- from datetime import datetime
10
- from pathlib import Path
11
- from openai import OpenAI
12
- from dotenv import load_dotenv
13
- import warnings
14
-
15
- from transformers import AutoModelForCausalLM, AutoTokenizer
16
- import torch
17
-
18
- warnings.filterwarnings('ignore')
19
-
20
- os.getenv("OAUTH_CLIENT_ID")
21
-
22
-
23
- # Load environment variables and initialize the OpenAI client to use Hugging Face Inference API.
24
- load_dotenv()
25
- client = OpenAI(
26
- base_url="https://api-inference.huggingface.co/v1",
27
- api_key=os.environ.get('TOKEN2') # Hugging Face API token
28
- )
29
- ##########################################################
30
- # import streamlit as st
31
- # from transformers import AutoModelForCausalLM, AutoTokenizer
32
- # import torch
33
-
34
- # # Model selection dropdown
35
- # selected_model = st.selectbox(
36
- # "Select Model",
37
- # ["meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
38
- # "meta-llama/Llama-3.3-70B-Instruct",
39
- # "meta-llama/Llama-3.2-3B-Instruct",
40
- # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
41
- # "meta-llama/Meta-Llama-3-8B-Instruct",
42
- # "meta-llama/Llama-3.1-70B-Instruct"],
43
- # key='model_select'
44
- # )
45
-
46
- # @st.cache_resource # Cache the model to prevent reloading
47
- # def load_model(model_name):
48
- # try:
49
- # # Optimized model loading configuration
50
- # model = AutoModelForCausalLM.from_pretrained(
51
- # model_name,
52
- # torch_dtype=torch.float16, # Use half precision
53
- # device_map="auto", # Automatic device mapping
54
- # load_in_8bit=True, # Enable 8-bit quantization
55
- # low_cpu_mem_usage=True, # Optimize CPU memory usage
56
- # max_memory={0: "10GB"} # Limit GPU memory usage
57
- # )
58
-
59
- # tokenizer = AutoTokenizer.from_pretrained(
60
- # model_name,
61
- # padding_side="left",
62
- # truncation_side="left"
63
- # )
64
-
65
- # return model, tokenizer
66
-
67
- # except Exception as e:
68
- # st.error(f"Error loading model: {str(e)}")
69
- # return None, None
70
-
71
- # # Load the selected model with optimizations
72
- # if selected_model:
73
- # model, tokenizer = load_model(selected_model)
74
-
75
- # # Check if model loaded successfully
76
- # if model is not None:
77
- # st.success(f"Successfully loaded {selected_model}")
78
- # else:
79
- # st.warning("Please select a different model or check your hardware capabilities")
80
-
81
- # # Function to generate text
82
- # def generate_response(prompt, model, tokenizer):
83
- # try:
84
- # inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
85
-
86
- # with torch.no_grad():
87
- # outputs = model.generate(
88
- # inputs["input_ids"],
89
- # max_length=256,
90
- # num_return_sequences=1,
91
- # temperature=0.7,
92
- # do_sample=True,
93
- # pad_token_id=tokenizer.pad_token_id
94
- # )
95
-
96
- # response = tokenizer.decode(outputs[0], skip_special_tokens=True)
97
- # return response
98
-
99
- # except Exception as e:
100
- # return f"Error generating response: {str(e)}"
101
- ############################################################
102
-
103
- ####new
104
- # from openai import OpenAI
105
-
106
- # client = OpenAI(
107
- # base_url="https://router.huggingface.co/together/v1",
108
- # api_key=os.environ.get('TOKEN2'),
109
- # )
110
-
111
- # completion = client.chat.completions.create(
112
- # model="meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
113
- # messages=[
114
- # {
115
- # "role": "user",
116
- # "content": "What is the capital of France?"
117
- # }
118
- # ],
119
- # max_tokens=512,
120
- # )
121
-
122
- # print(completion.choices[0].message)
123
- #####
124
-
125
- # Create necessary directories
126
- for dir_name in ['data', 'feedback']:
127
- if not os.path.exists(dir_name):
128
- os.makedirs(dir_name)
129
-
130
- # Custom CSS
131
- st.markdown("""
132
- <style>
133
- .stButton > button {
134
- width: 100%;
135
- margin-bottom: 10px;
136
- background-color: #4CAF50;
137
- color: white;
138
- border: none;
139
- padding: 10px;
140
- border-radius: 5px;
141
- }
142
- .task-button {
143
- background-color: #2196F3 !important;
144
- }
145
- .stSelectbox {
146
- margin-bottom: 20px;
147
- }
148
- .output-container {
149
- padding: 20px;
150
- border-radius: 5px;
151
- border: 1px solid #ddd;
152
- margin: 10px 0;
153
- }
154
- .status-container {
155
- padding: 10px;
156
- border-radius: 5px;
157
- margin: 10px 0;
158
- }
159
- .sidebar-info {
160
- padding: 10px;
161
- background-color: #f0f2f6;
162
- border-radius: 5px;
163
- margin: 10px 0;
164
- }
165
- .feedback-button {
166
- background-color: #ff9800 !important;
167
- }
168
- .feedback-container {
169
- padding: 15px;
170
- background-color: #f5f5f5;
171
- border-radius: 5px;
172
- margin: 15px 0;
173
- }
174
- </style>
175
- """, unsafe_allow_html=True)
176
-
177
- # Helper functions
178
- def read_csv_with_encoding(file):
179
- encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
180
- for encoding in encodings:
181
- try:
182
- return pd.read_csv(file, encoding=encoding)
183
- except UnicodeDecodeError:
184
- continue
185
- raise UnicodeError("Failed to read file with any supported encoding")
186
-
187
- #def save_feedback(feedback_data):
188
- #feedback_file = 'feedback/user_feedback.csv'
189
- #feedback_df = pd.DataFrame([feedback_data])
190
-
191
- #if os.path.exists(feedback_file):
192
- #feedback_df.to_csv(feedback_file, mode='a', header=False, index=False)
193
- #else:
194
- #feedback_df.to_csv(feedback_file, index=False)
195
-
196
- def reset_conversation():
197
- st.session_state.conversation = []
198
- st.session_state.messages = []
199
- if 'task_choice' in st.session_state:
200
- del st.session_state.task_choice
201
- return None
202
- #new 24 March
203
- #user_input = st.text_input("Enter your prompt:")
204
- ###########33
205
-
206
- # Initialize session state variables
207
- if "messages" not in st.session_state:
208
- st.session_state.messages = []
209
- if "examples_to_classify" not in st.session_state:
210
- st.session_state.examples_to_classify = []
211
- if "system_role" not in st.session_state:
212
- st.session_state.system_role = ""
213
-
214
-
215
-
216
- # Main app title
217
- st.title("🤖🦙 Text Data Labeling and Generation App")
218
- # def embed_pdf_sidebar(pdf_path):
219
- # with open(pdf_path, "rb") as f:
220
- # base64_pdf = base64.b64encode(f.read()).decode('utf-8')
221
- # pdf_display = f"""
222
- # <iframe src="data:application/pdf;base64,{base64_pdf}"
223
- # width="100%" height="400" type="application/pdf"></iframe>
224
- # """
225
- # st.markdown(pdf_display, unsafe_allow_html=True)
226
- #
227
-
228
-
229
- # Sidebar settings
230
- with st.sidebar:
231
- st.title("⚙️ Settings")
232
-
233
-
234
- #this last code works
235
- with st.sidebar:
236
- st.markdown("### 📘Data Generation and Labeling Instructions")
237
- #st.markdown("<h4 style='color: #4A90E2;'>📘 Instructions</h4>", unsafe_allow_html=True)
238
- with open("User instructions.pdf", "rb") as f:
239
- st.download_button(
240
- label="📄 Download Instructions PDF",
241
- data=f,
242
- #file_name="instructions.pdf",
243
- file_name="User instructions.pdf",
244
- mime="application/pdf"
245
- )
246
-
247
- selected_model = st.selectbox(
248
- "Select Model",
249
- ["meta-llama/Llama-3.2-11B-Vision-Instruct","meta-llama/Meta-Llama-3-8B-Instruct-Turbo", "meta-llama/Llama-3.3-70B-Instruct", "meta-llama/Llama-3.2-3B-Instruct","meta-llama/Llama-4-Scout-17B-16E-Instruct", "meta-llama/Meta-Llama-3-8B-Instruct",
250
- "meta-llama/Llama-3.1-70B-Instruct"],
251
- key='model_select'
252
- )
253
-
254
- #################new oooo
255
-
256
- # # Model selection dropdown
257
- # selected_model = st.selectbox(
258
- # "Select Model",
259
- # [#"meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
260
- # "meta-llama/Llama-3.2-3B-Instruct",
261
- # "meta-llama/Llama-3.3-70B-Instruct",
262
- # "meta-llama/Llama-3.2-3B-Instruct",
263
- # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
264
- # "meta-llama/Meta-Llama-3-8B-Instruct",
265
- # "meta-llama/Llama-3.1-70B-Instruct"],
266
- # key='model_select'
267
- # )
268
-
269
- # @st.cache_resource # Cache the model to prevent reloading
270
- # def load_model(model_name):
271
- # try:
272
- # # Optimized model loading configuration
273
- # model = AutoModelForCausalLM.from_pretrained(
274
- # model_name,
275
- # torch_dtype=torch.float16, # Use half precision
276
- # device_map="auto", # Automatic device mapping
277
- # load_in_8bit=True, # Enable 8-bit quantization
278
- # low_cpu_mem_usage=True, # Optimize CPU memory usage
279
- # max_memory={0: "10GB"} # Limit GPU memory usage
280
- # )
281
-
282
- # tokenizer = AutoTokenizer.from_pretrained(
283
- # model_name,
284
- # padding_side="left",
285
- # truncation_side="left"
286
- # )
287
-
288
- # return model, tokenizer
289
-
290
- # except Exception as e:
291
- # st.error(f"Error loading model: {str(e)}")
292
- # return None, None
293
-
294
- # # Load the selected model with optimizations
295
- # if selected_model:
296
- # model, tokenizer = load_model(selected_model)
297
-
298
- # # Check if model loaded successfully
299
- # if model is not None:
300
- # st.success(f"Successfully loaded {selected_model}")
301
- # else:
302
- # st.warning("Please select a different model or check your hardware capabilities")
303
-
304
- # # Function to generate text
305
- # def generate_response(prompt, model, tokenizer):
306
- # try:
307
- # inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
308
-
309
- # with torch.no_grad():
310
- # outputs = model.generate(
311
- # inputs["input_ids"],
312
- # max_length=256,
313
- # num_return_sequences=1,
314
- # temperature=0.7,
315
- # do_sample=True,
316
- # pad_token_id=tokenizer.pad_token_id
317
- # )
318
-
319
- # response = tokenizer.decode(outputs[0], skip_special_tokens=True)
320
- # return response
321
-
322
- # except Exception as e:
323
- # return f"Error generating response: {str(e)}"
324
- # ################
325
-
326
- # model = AutoModelForCausalLM.from_pretrained(
327
- # "meta-llama/Meta-Llama-3-8B-Instruct",
328
- # torch_dtype=torch.float16, # Use half precision
329
- # device_map="auto", # Automatic device mapping
330
- # load_in_8bit=True # Load in 8-bit precision
331
- # )
332
- temperature = st.slider(
333
- "Temperature",
334
- 0.0, 1.0, 0.7,
335
- help="Controls randomness in generation"
336
- )
337
-
338
- st.button("🔄 New Conversation", on_click=reset_conversation)
339
- with st.container():
340
- st.markdown(f"""
341
- <div class="sidebar-info">
342
- <h4>Current Model: {selected_model}</h4>
343
- <p><em>Note: Generated content may be inaccurate or false. Check important info.</em></p>
344
- </div>
345
- """, unsafe_allow_html=True)
346
-
347
- feedback_url = "https://docs.google.com/forms/d/e/1FAIpQLSdZ_5mwW-pjqXHgxR0xriyVeRhqdQKgb5c-foXlYAV55Rilsg/viewform?usp=header"
348
- st.sidebar.markdown(
349
- f'<a href="{feedback_url}" target="_blank"><button style="width: 100%;">Feedback Form</button></a>',
350
- unsafe_allow_html=True
351
- )
352
-
353
- # Display conversation
354
- for message in st.session_state.messages:
355
- with st.chat_message(message["role"]):
356
- st.markdown(message["content"])
357
-
358
- # Main content
359
- if 'task_choice' not in st.session_state:
360
- col1, col2 = st.columns(2)
361
- with col1:
362
- if st.button("📝 Data Generation", key="gen_button", help="Generate new data"):
363
- st.session_state.task_choice = "Data Generation"
364
- with col2:
365
- if st.button("🏷️ Data Labeling", key="label_button", help="Label existing data"):
366
- st.session_state.task_choice = "Data Labeling"
367
-
368
- if "task_choice" in st.session_state:
369
- if st.session_state.task_choice == "Data Generation":
370
- st.header("📝 Data Generation")
371
-
372
- # 1. Domain selection
373
- domain_selection = st.selectbox("Domain", [
374
- "Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"
375
- ])
376
-
377
- # 2. Handle custom domain input
378
- custom_domain_valid = True # Assume valid until proven otherwise
379
-
380
- if domain_selection == "Custom":
381
- domain = st.text_input("Specify custom domain")
382
- if not domain.strip():
383
- st.error("Please specify a domain name.")
384
- custom_domain_valid = False
385
- else:
386
- domain = domain_selection
387
-
388
- # Classification type selection
389
- classification_type = st.selectbox(
390
- "Classification Type",
391
- ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification"]
392
- )
393
- # Labels setup based on classification type
394
- #labels = []
395
- labels = []
396
- labels_valid = False
397
- errors = []
398
-
399
- def validate_binary_labels(labels):
400
- errors = []
401
- normalized = [label.strip().lower() for label in labels]
402
-
403
- if not labels[0].strip():
404
- errors.append("First class name is required.")
405
- if not labels[1].strip():
406
- errors.append("Second class name is required.")
407
- if normalized[0] == normalized[1] and all(normalized):
408
- errors.append("Class names must be different.")
409
- return errors
410
-
411
- if classification_type == "Sentiment Analysis":
412
- st.write("### Sentiment Analysis Labels (Fixed)")
413
- col1, col2, col3 = st.columns(3)
414
- with col1:
415
- st.text_input("First class", "Positive", disabled=True)
416
- with col2:
417
- st.text_input("Second class", "Negative", disabled=True)
418
- with col3:
419
- st.text_input("Third class", "Neutral", disabled=True)
420
- labels = ["Positive", "Negative", "Neutral"]
421
-
422
- elif classification_type == "Binary Classification":
423
- st.write("### Binary Classification Labels")
424
- col1, col2 = st.columns(2)
425
- with col1:
426
- label_1 = st.text_input("First class", "Positive")
427
- with col2:
428
- label_2 = st.text_input("Second class", "Negative")
429
-
430
- labels = [label_1, label_2]
431
- errors = validate_binary_labels(labels)
432
-
433
- if errors:
434
- st.error("\n".join(errors))
435
- else:
436
- st.success("Binary class names are valid and unique!")
437
-
438
-
439
- elif classification_type == "Multi-Class Classification":
440
- st.write("### Multi-Class Classification Labels")
441
-
442
- default_labels_by_domain = {
443
- "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
444
- "AG News": ["World", "Sports", "Business", "Sci/Tech"],
445
- "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
446
- "Food & Dining", "Local Experience", "Adventure Activities",
447
- "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
448
- "Luxury Tourism"],
449
- "Restaurant reviews": ["Italian", "French", "American"],
450
- "E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining",
451
- "Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion",
452
- "Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets",
453
- "Books & Stationery","Toys & Games", "Sports & Fitness",
454
- "Grocery & Gourmet Food","Watches & Accessories", "Baby Products"]
455
- }
456
-
457
- num_classes = st.slider("Number of classes", 3, 15, 3)
458
-
459
- # Get defaults for selected domain, or empty list
460
- defaults = default_labels_by_domain.get(domain, [])
461
-
462
- labels = []
463
- errors = []
464
- cols = st.columns(3)
465
-
466
- for i in range(num_classes):
467
- with cols[i % 3]:
468
- default_value = defaults[i] if i < len(defaults) else ""
469
- label_input = st.text_input(f"Class {i+1}", default_value)
470
- normalized_label = label_input.strip().title()
471
-
472
- if not normalized_label:
473
- errors.append(f"Class {i+1} name is required.")
474
- else:
475
- labels.append(normalized_label)
476
-
477
- # Check for duplicates (case-insensitive)
478
- if len(labels) != len(set(labels)):
479
- errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).")
480
-
481
- # Show validation results
482
- if errors:
483
- for error in errors:
484
- st.error(error)
485
- else:
486
- st.success("All Labels names are valid and unique!")
487
- labels_valid = not errors # Will be True only if there are no label errors
488
-
489
- ##############
490
- #new 22/4/2025
491
- # add additional attributes
492
- add_attributes = st.checkbox("Add additional attributes (optional)")
493
- additional_attributes = []
494
-
495
- if add_attributes:
496
- num_attributes = st.slider("Number of attributes to add", 1, 5, 1)
497
- for i in range(num_attributes):
498
- st.markdown(f"#### Attribute {i+1}")
499
- attr_name = st.text_input(f"Name of attribute {i+1}", key=f"attr_name_{i}")
500
- attr_topics = st.text_input(f"Topics (comma-separated) for {attr_name}", key=f"attr_topics_{i}")
501
- if attr_name and attr_topics:
502
- topics_list = [topic.strip() for topic in attr_topics.split(",") if topic.strip()]
503
- additional_attributes.append({"attribute": attr_name, "topics": topics_list})
504
-
505
- ################
506
-
507
- # Generation parameters
508
- col1, col2 = st.columns(2)
509
- with col1:
510
- min_words = st.number_input("Min words", 1, 100, 20)
511
- with col2:
512
- max_words = st.number_input("Max words", min_words, 100, 50)
513
-
514
- # Few-shot examples
515
- use_few_shot = st.toggle("Use few-shot examples")
516
- few_shot_examples = []
517
- if use_few_shot:
518
- num_examples = st.slider("Number of few-shot examples", 1, 10, 1)
519
- for i in range(num_examples):
520
- with st.expander(f"Example {i+1}"):
521
- content = st.text_area(f"Content", key=f"few_shot_content_{i}")
522
- label = st.selectbox(f"Label", labels, key=f"few_shot_label_{i}")
523
- if content and label:
524
- few_shot_examples.append({"content": content, "label": label})
525
-
526
- num_to_generate = st.number_input("Number of examples", 1, 100, 10)
527
- #system role after
528
- # System role customization
529
- #default_system_role = f"You are a professional {classification_type} expert, your role is to generate text examples for {domain} domain. Always generate unique diverse examples and do not repeat the generated data. The generated text should be between {min_words} to {max_words} words long."
530
- # System role customization
531
- default_system_role = (
532
- f"You are a seasoned expert in {classification_type}, specializing in the {domain} domain. "
533
- f" Your primary responsibility is to generate high-quality, diverse, and unique text examples "
534
- f"tailored to this domain. Please ensure that each example adheres to the specified length "
535
- f"requirements, ranging from {min_words} to {max_words} words, and avoid any repetition in the generated content."
536
- )
537
- system_role = st.text_area("Modify System Role (optional)",
538
- value=default_system_role,
539
- key="system_role_input")
540
- st.session_state['system_role'] = system_role if system_role else default_system_role
541
- # Labels initialization
542
- #labels = []
543
-
544
-
545
- user_prompt = st.text_area("User Prompt (optional)")
546
-
547
- # Updated prompt template including system role
548
- prompt_template = PromptTemplate(
549
- input_variables=["system_role", "classification_type", "domain", "num_examples",
550
- "min_words", "max_words", "labels", "user_prompt", "few_shot_examples", "additional_attributes"],
551
- template=(
552
- "{system_role}\n"
553
- "- Use the following parameters:\n"
554
- "- Generate {num_examples} examples\n"
555
- "- Each example should be between {min_words} to {max_words} words long\n"
556
- "- Use these labels: {labels}.\n"
557
- "- Use the following additional attributes:\n"
558
- "- {additional_attributes}\n"
559
- "- Generate the examples in this format: 'Example text. Label: label'\n"
560
- "- Do not include word counts or any additional information\n"
561
- "- Always use your creativity and intelligence to generate unique and diverse text data\n"
562
- "- In sentiment analysis, ensure that the sentiment classification is clearly identified as Positive, Negative, or Neutral. Do not leave the sentiment ambiguous.\n"
563
- "- In binary sentiment analysis, classify text strictly as either Positive or Negative. Do not include or imply Neutral as an option.\n"
564
- "- Write unique examples every time.\n"
565
- "- DO NOT REPEAT your gnerated text. \n"
566
- "- For each Output, describe it once and move to the next.\n"
567
- "- List each Output only once, and avoid repeating details.\n"
568
- "- Additional instructions: {user_prompt}\n\n"
569
- "- Use the following examples as a reference in the generation process\n\n {few_shot_examples}. \n"
570
- "- Think step by step, generate numbered examples, and check each newly generated example to ensure it has not been generated before. If it has, modify it"
571
-
572
- )
573
- )
574
- # template=(
575
- # "{system_role}\n"
576
- # "- Use the following parameters:\n"
577
- # "- Generate {num_examples} examples\n"
578
- # "- Each example should be between {min_words} to {max_words} words long\n"
579
- # "- Use these labels: {labels}.\n"
580
- # "- Use the following additional attributes:\n"
581
- # "{additional_attributes}\n"
582
- # #"- Format each example like this: 'Example text. Label: [label]. Attribute1: [topic1]. Attribute2: [topic2]'\n"
583
- # "- Generate the examples in this format: 'Example text. Label: label'\n"
584
- # "- Additional instructions: {user_prompt}\n"
585
- # "- Use these few-shot examples if provided:\n{few_shot_examples}\n"
586
- # "- Think step by step and ensure examples are unique and not repeated."
587
- # )
588
- # )
589
- ##########new 22/4/2025
590
- formatted_attributes = "\n".join([
591
- f"- {attr['attribute']}: {', '.join(attr['topics'])}" for attr in additional_attributes
592
- ])
593
- #######################
594
-
595
- # Generate system prompt
596
- system_prompt = prompt_template.format(
597
- system_role=st.session_state['system_role'],
598
- classification_type=classification_type,
599
- domain=domain,
600
- num_examples=num_to_generate,
601
- min_words=min_words,
602
- max_words=max_words,
603
- labels=", ".join(labels),
604
- user_prompt=user_prompt,
605
- few_shot_examples="\n".join([f"{ex['content']}\nLabel: {ex['label']}" for ex in few_shot_examples]) if few_shot_examples else "",
606
- additional_attributes=formatted_attributes
607
- )
608
-
609
-
610
- # Store system prompt in session state
611
- st.session_state['system_prompt'] = system_prompt
612
-
613
- # Display system prompt
614
- st.write("System Prompt:")
615
- st.text_area("Current System Prompt", value=st.session_state['system_prompt'],
616
- height=400, disabled=True)
617
-
618
-
619
- if st.button("🎯 Generate Examples"):
620
- #
621
- errors = []
622
- if domain_selection == "Custom" and not domain.strip():
623
- st.warning("Custom domain name is required.")
624
- elif len(labels) != len(set(labels)):
625
- st.warning("Class names must be unique.")
626
- elif any(not lbl.strip() for lbl in labels):
627
- st.warning("All class labels must be filled in.")
628
- #else:
629
- #st.success("Generating examples for domain: {domain}")
630
-
631
- #if not custom_domain_valid:
632
- #st.warning("Custom domain name is required.")
633
- #elif not labels_valid:
634
- #st.warning("Please fix the label errors before generating examples.")
635
- #else:
636
- # Proceed to generate examples
637
- #st.success(f"Generating examples for domain: {domain}")
638
-
639
- with st.spinner("Generating examples..."):
640
- try:
641
- stream = client.chat.completions.create(
642
- model=selected_model,
643
- messages=[{"role": "system", "content": st.session_state['system_prompt']}],
644
- temperature=temperature,
645
- stream=True,
646
- #max_tokens=80000,
647
- max_tokens=4000,
648
- top_p=0.9,
649
- # repetition_penalty=1.2,
650
- #frequency_penalty=0.5, # Discourages frequent words
651
- #presence_penalty=0.6,
652
- )
653
- #st.session_state['system_prompt'] = system_prompt
654
- #new 24 march
655
- st.session_state.messages.append({"role": "user", "content": system_prompt})
656
- # # ####################
657
- response = st.write_stream(stream)
658
- st.session_state.messages.append({"role": "assistant", "content": response})
659
- # Initialize session state variables if they don't exist
660
- if 'system_prompt' not in st.session_state:
661
- st.session_state.system_prompt = system_prompt
662
-
663
- if 'response' not in st.session_state:
664
- st.session_state.response = response
665
-
666
- if 'generated_examples' not in st.session_state:
667
- st.session_state.generated_examples = []
668
-
669
- if 'generated_examples_csv' not in st.session_state:
670
- st.session_state.generated_examples_csv = None
671
-
672
- if 'generated_examples_json' not in st.session_state:
673
- st.session_state.generated_examples_json = None
674
-
675
- # Parse response and generate examples list
676
- examples_list = []
677
- for line in response.split('\n'):
678
- if line.strip():
679
- parts = line.rsplit('Label:', 1)
680
- if len(parts) == 2:
681
- text = parts[0].strip()
682
- label = parts[1].strip()
683
- if text and label:
684
- examples_list.append({
685
- 'text': text,
686
- 'label': label,
687
- 'system_prompt': st.session_state.system_prompt,
688
- 'system_role': st.session_state.system_role,
689
- 'task_type': 'Data Generation',
690
- 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
691
- })
692
-
693
- # example_dict = {
694
- # 'text': text,
695
- # 'label': label,
696
- # 'system_prompt': st.session_state.system_prompt,
697
- # 'system_role': st.session_state.system_role,
698
- # 'task_type': 'Data Generation',
699
- # 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
700
- # }
701
- # for attr in additional_attributes:
702
- # example_dict[attr['attribute']] = random.choice(attr['topics'])
703
-
704
- # examples_list.append(example_dict)
705
-
706
-
707
- if examples_list:
708
- # Update session state with new data
709
- st.session_state.generated_examples = examples_list
710
-
711
- # Generate CSV and JSON data
712
- df = pd.DataFrame(examples_list)
713
- st.session_state.generated_examples_csv = df.to_csv(index=False).encode('utf-8')
714
- st.session_state.generated_examples_json = json.dumps(examples_list, indent=2).encode('utf-8')
715
-
716
- # Vertical layout with centered "or" between buttons
717
- st.download_button(
718
- "📥 Download Generated Examples (CSV)",
719
- st.session_state.generated_examples_csv,
720
- "generated_examples.csv",
721
- "text/csv",
722
- key='download-csv-persistent'
723
- )
724
-
725
- # Add space and center the "or"
726
- st.markdown("""
727
- <div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
728
- """, unsafe_allow_html=True)
729
-
730
- st.download_button(
731
- "📥 Download Generated Examples (JSON)",
732
- st.session_state.generated_examples_json,
733
- "generated_examples.json",
734
- "application/json",
735
- key='download-json-persistent'
736
- )
737
- # # Display the labeled examples
738
- # st.markdown("##### 📋 Labeled Examples Preview")
739
- # st.dataframe(df, use_container_width=True)
740
-
741
- if st.button("Continue"):
742
- if follow_up == "Generate more examples":
743
- st.experimental_rerun()
744
- elif follow_up == "Data Labeling":
745
- st.session_state.task_choice = "Data Labeling"
746
- st.experimental_rerun()
747
-
748
- except Exception as e:
749
- st.error("An error occurred during generation.")
750
- st.error(f"Details: {e}")
751
-
752
-
753
- # Labeling Process
754
- elif st.session_state.task_choice == "Data Labeling":
755
- st.header("🏷️ Data Labeling")
756
-
757
- domain_selection = st.selectbox("Domain", ["Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"])
758
- # 2. Handle custom domain input
759
- custom_domain_valid = True # Assume valid until proven otherwise
760
-
761
- if domain_selection == "Custom":
762
- domain = st.text_input("Specify custom domain")
763
- if not domain.strip():
764
- st.error("Please specify a domain name.")
765
- custom_domain_valid = False
766
- else:
767
- domain = domain_selection
768
-
769
-
770
- # Classification type selection
771
- classification_type = st.selectbox(
772
- "Classification Type",
773
- ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification", "Named Entity Recognition (NER)"]
774
- )
775
- #NNew edit
776
- # Labels setup based on classification type
777
- labels = []
778
- labels_valid = False
779
- errors = []
780
-
781
- if classification_type == "Sentiment Analysis":
782
- st.write("### Sentiment Analysis Labels (Fixed)")
783
- col1, col2, col3 = st.columns(3)
784
- with col1:
785
- label_1 = st.text_input("First class", "Positive", disabled=True)
786
- with col2:
787
- label_2 = st.text_input("Second class", "Negative", disabled=True)
788
- with col3:
789
- label_3 = st.text_input("Third class", "Neutral", disabled=True)
790
- labels = ["Positive", "Negative", "Neutral"]
791
-
792
-
793
- elif classification_type == "Binary Classification":
794
- st.write("### Binary Classification Labels")
795
- col1, col2 = st.columns(2)
796
-
797
- with col1:
798
- label_1 = st.text_input("First class", "Positive")
799
- with col2:
800
- label_2 = st.text_input("Second class", "Negative")
801
-
802
- errors = []
803
- labels = [label_1.strip(), label_2.strip()]
804
-
805
-
806
- # Strip and lower-case labels for validation
807
- label_1 = labels[0].strip()
808
- label_2 = labels[1].strip()
809
-
810
- # Check for empty class names
811
- if not label_1:
812
- errors.append("First class name is required.")
813
- if not label_2:
814
- errors.append("Second class name is required.")
815
-
816
- # Check for duplicates (case insensitive)
817
- if label_1.lower() == label_2.lower() and label_1 and label_2:
818
- errors.append("Class names must be different.")
819
-
820
- # Show errors or success
821
- if errors:
822
- for error in errors:
823
- st.error(error)
824
- else:
825
- st.success("Binary class names are valid and unique!")
826
-
827
-
828
- elif classification_type == "Multi-Class Classification":
829
- st.write("### Multi-Class Classification Labels")
830
-
831
- default_labels_by_domain = {
832
- "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
833
- "AG News": ["World", "Sports", "Business", "Sci/Tech"],
834
- "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
835
- "Food & Dining", "Local Experience", "Adventure Activities",
836
- "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
837
- "Luxury Tourism"],
838
- "Restaurant reviews": ["Italian", "French", "American"],
839
- "E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining",
840
- "Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion",
841
- "Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets",
842
- "Books & Stationery","Toys & Games", "Sports & Fitness",
843
- "Grocery & Gourmet Food","Watches & Accessories", "Baby Products"]
844
- }
845
-
846
-
847
-
848
- # Ask user how many classes they want to define
849
- num_classes = st.slider("Select the number of classes (labels)", min_value=3, max_value=10, value=3)
850
-
851
- # Use default labels based on selected domain, if available
852
- defaults = default_labels_by_domain.get(domain, [])
853
-
854
- labels = []
855
- errors = []
856
- cols = st.columns(3) # For nicely arranged label inputs
857
-
858
- for i in range(num_classes):
859
- with cols[i % 3]: # Distribute inputs across columns
860
- default_value = defaults[i] if i < len(defaults) else ""
861
- label_input = st.text_input(f"Label {i + 1}", default_value)
862
- normalized_label = label_input.strip().title()
863
-
864
- if not normalized_label:
865
- errors.append(f"Label {i + 1} is required.")
866
- else:
867
- labels.append(normalized_label)
868
-
869
- # Check for duplicates (case-insensitive)
870
- normalized_set = {label.lower() for label in labels}
871
- if len(labels) != len(normalized_set):
872
- errors.append("Label names must be unique (case-insensitive).")
873
-
874
- # Show validation results
875
- if errors:
876
- for error in errors:
877
- st.error(error)
878
- else:
879
- st.success("All label names are valid and unique!")
880
-
881
- labels_valid = not errors # True if no validation errors
882
-
883
- elif classification_type == "Named Entity Recognition (NER)":
884
- # # NER entity options
885
- # ner_entities = [
886
- # "PERSON - Names of people, fictional characters, historical figures",
887
- # "ORG - Companies, institutions, agencies, teams",
888
- # "LOC - Physical locations (mountains, oceans, etc.)",
889
- # "GPE - Countries, cities, states, political regions",
890
- # "DATE - Calendar dates, years, centuries",
891
- # "TIME - Times, durations",
892
- # "MONEY - Monetary values with currency"
893
- # ]
894
- # selected_entities = st.multiselect(
895
- # "Select entities to recognize",
896
- # ner_entities,
897
- # default=["PERSON - Names of people, fictional characters, historical figures",
898
- # "ORG - Companies, institutions, agencies, teams",
899
- # "LOC - Physical locations (mountains, oceans, etc.)",
900
- # "GPE - Countries, cities, states, political regions",
901
- # "DATE - Calendar dates, years, centuries",
902
- # "TIME - Times, durations",
903
- # "MONEY - Monetary values with currency"],
904
- # key="ner_entity_selection"
905
- # )
906
- #new 22/4/2025
907
- #if classification_type == "Named Entity Recognition (NER)":
908
- use_few_shot = True
909
- #new 22/4/2025
910
- few_shot_examples = [
911
- {"content": "Mount Everest is the tallest mountain in the world.", "label": "LOC: Mount Everest"},
912
- {"content": "The President of the United States visited Paris last summer.", "label": "GPE: United States, GPE: Paris"},
913
- {"content": "Amazon is expanding its offices in Berlin.", "label": "ORG: Amazon, GPE: Berlin"},
914
- {"content": "J.K. Rowling wrote the Harry Potter books.", "label": "PERSON: J.K. Rowling"},
915
- {"content": "Apple was founded in California in 1976.", "label": "ORG: Apple, GPE: California, DATE: 1976"},
916
- {"content": "The Nile is the longest river in Africa.", "label": "LOC: Nile, GPE: Africa"},
917
- {"content": "He arrived at 3 PM for the meeting.", "label": "TIME: 3 PM"},
918
- {"content": "She bought the dress for $200.", "label": "MONEY: $200"},
919
- {"content": "The event is scheduled for July 4th.", "label": "DATE: July 4th"},
920
- {"content": "The World Health Organization is headquartered in Geneva.", "label": "ORG: World Health Organization, GPE: Geneva"}
921
- ]
922
- ###########
923
-
924
- st.write("### Named Entity Recognition (NER) Entities")
925
-
926
- # Predefined standard entities
927
- ner_entities = [
928
- "PERSON - Names of people, fictional characters, historical figures",
929
- "ORG - Companies, institutions, agencies, teams",
930
- "LOC - Physical locations (mountains, oceans, etc.)",
931
- "GPE - Countries, cities, states, political regions",
932
- "DATE - Calendar dates, years, centuries",
933
- "TIME - Times, durations",
934
- "MONEY - Monetary values with currency"
935
- ]
936
-
937
- # User can add custom NER types
938
- custom_ner_entities = []
939
- if st.checkbox("Add custom NER entities?"):
940
- num_custom_ner = st.slider("Number of custom NER entities", 1, 10, 1)
941
- for i in range(num_custom_ner):
942
- st.markdown(f"#### Custom Entity {i+1}")
943
- custom_type = st.text_input(f"Entity type {i+1}", key=f"custom_ner_type_{i}")
944
- custom_description = st.text_input(f"Description for {custom_type}", key=f"custom_ner_desc_{i}")
945
- if custom_type and custom_description:
946
- custom_ner_entities.append(f"{custom_type.upper()} - {custom_description}")
947
-
948
- # Combine built-in and custom NERs
949
- all_ner_options = ner_entities + custom_ner_entities
950
-
951
- selected_entities = st.multiselect(
952
- "Select entities to recognize",
953
- all_ner_options,
954
- default=ner_entities
955
- )
956
-
957
- # Extract entity type names (before the dash)
958
- labels = [entity.split(" - ")[0].strip() for entity in selected_entities]
959
-
960
- if not labels:
961
- st.warning("Please select at least one entity type.")
962
- labels = ["PERSON"]
963
-
964
- ##########
965
-
966
- # # Extract just the entity type (before the dash)
967
- # labels = [entity.split(" - ")[0] for entity in selected_entities]
968
-
969
- # if not labels:
970
- # st.warning("Please select at least one entity type")
971
- # labels = ["PERSON"] # Default if nothing selected
972
-
973
-
974
-
975
-
976
-
977
- #NNew edit
978
- # elif classification_type == "Multi-Class Classification":
979
- # st.write("### Multi-Class Classification Labels")
980
-
981
- # default_labels_by_domain = {
982
- # "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
983
- # "AG News": ["World", "Sports", "Business", "Sci/Tech"],
984
- # "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
985
- # "Food & Dining", "Local Experience", "Adventure Activities",
986
- # "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
987
- # "Luxury Tourism"],
988
- # "Restaurant reviews": ["Italian", "French", "American"]
989
- # }
990
- # num_classes = st.slider("Number of classes", 3, 10, 3)
991
-
992
- # # Get defaults for selected domain, or empty list
993
- # defaults = default_labels_by_domain.get(domain, [])
994
-
995
- # labels = []
996
- # errors = []
997
- # cols = st.columns(3)
998
-
999
- # for i in range(num_classes):
1000
- # with cols[i % 3]:
1001
- # default_value = defaults[i] if i < len(defaults) else ""
1002
- # label_input = st.text_input(f"Class {i+1}", default_value)
1003
- # normalized_label = label_input.strip().title()
1004
-
1005
- # if not normalized_label:
1006
- # errors.append(f"Class {i+1} name is required.")
1007
- # else:
1008
- # labels.append(normalized_label)
1009
-
1010
- # # Check for duplicates (case-insensitive)
1011
- # if len(labels) != len(set(labels)):
1012
- # errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).")
1013
-
1014
- # # Show validation results
1015
- # if errors:
1016
- # for error in errors:
1017
- # st.error(error)
1018
- # else:
1019
- # st.success("All Labels names are valid and unique!")
1020
- # labels_valid = not errors # Will be True only if there are no label errors
1021
-
1022
-
1023
-
1024
-
1025
- # else:
1026
- # num_classes = st.slider("Number of classes", 3, 23, 3, key="label_num_classes")
1027
- # labels = []
1028
- # cols = st.columns(3)
1029
- # for i in range(num_classes):
1030
- # with cols[i % 3]:
1031
- # label = st.text_input(f"Class {i+1}", f"Class_{i+1}", key=f"label_class_{i}")
1032
- # labels.append(label)
1033
-
1034
- use_few_shot = st.toggle("Use few-shot examples for labeling")
1035
- few_shot_examples = []
1036
- if use_few_shot:
1037
- num_few_shot = st.slider("Number of few-shot examples", 1, 10, 1)
1038
- for i in range(num_few_shot):
1039
- with st.expander(f"Few-shot Example {i+1}"):
1040
- content = st.text_area(f"Content", key=f"label_few_shot_content_{i}")
1041
- label = st.selectbox(f"Label", labels, key=f"label_few_shot_label_{i}")
1042
- if content and label:
1043
- few_shot_examples.append(f"{content}\nLabel: {label}")
1044
-
1045
- num_examples = st.number_input("Number of examples to classify", 1, 100, 1)
1046
-
1047
- examples_to_classify = []
1048
- if num_examples <= 10:
1049
- for i in range(num_examples):
1050
- example = st.text_area(f"Example {i+1}", key=f"example_{i}")
1051
- if example:
1052
- examples_to_classify.append(example)
1053
- else:
1054
- examples_text = st.text_area(
1055
- "Enter examples (one per line)",
1056
- height=300,
1057
- help="Enter each example on a new line"
1058
- )
1059
- if examples_text:
1060
- examples_to_classify = [ex.strip() for ex in examples_text.split('\n') if ex.strip()]
1061
- if len(examples_to_classify) > num_examples:
1062
- examples_to_classify = examples_to_classify[:num_examples]
1063
-
1064
- #New Wedyan
1065
- #default_system_role = f"You are a professional {classification_type} expert, your role is to classify the provided text examples for {domain} domain."
1066
- # System role customization
1067
- default_system_role = (f"You are a highly skilled {classification_type} expert."
1068
- f" Your task is to accurately classify the provided text examples within the {domain} domain."
1069
- f" Ensure that all classifications are precise, context-aware, and aligned with domain-specific standards and best practices."
1070
- )
1071
- system_role = st.text_area("Modify System Role (optional)",
1072
- value=default_system_role,
1073
- key="system_role_input")
1074
- st.session_state['system_role'] = system_role if system_role else default_system_role
1075
- # Labels initialization
1076
- #labels = []
1077
- ####
1078
-
1079
- user_prompt = st.text_area("User prompt (optional)", key="label_instructions")
1080
-
1081
- few_shot_text = "\n\n".join(few_shot_examples) if few_shot_examples else ""
1082
- examples_text = "\n".join([f"{i+1}. {ex}" for i, ex in enumerate(examples_to_classify)])
1083
-
1084
- # Customize prompt template based on classification type
1085
- if classification_type == "Named Entity Recognition (NER)":
1086
- # label_prompt_template = PromptTemplate(
1087
- # input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
1088
- # template=(
1089
- # "{system_role}\n"
1090
- # #"- You are a professional Named Entity Recognition (NER) expert in {domain} domain. Your role is to identify and extract the following entity types: {labels}.\n"
1091
- # "- For each text example provided, identify all entities of the requested types.\n"
1092
- # "- Use the following entities: {labels}.\n"
1093
- # "- Return each example followed by the entities you found in this format: 'Example text.\n \n Entities:\n [ENTITY_TYPE: entity text\n\n, ENTITY_TYPE: entity text\n\n, ...] or [No entities found]'\n"
1094
- # "- If no entities of the requested types are found, indicate 'No entities found' in this text.\n"
1095
- # "- Be precise about entity boundaries - don't include unnecessary words.\n"
1096
- # "- Do not provide any additional information or explanations.\n"
1097
- # "- Additional instructions:\n {user_prompt}\n\n"
1098
- # "- Use user few-shot examples as guidance if provided:\n{few_shot_examples}\n\n"
1099
- # "- Examples to analyze:\n{examples}\n\n"
1100
- # "Output:\n"
1101
- # )
1102
- # )
1103
- #new 22/4/2025
1104
- # label_prompt_template = PromptTemplate(
1105
- # input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
1106
- # template=(
1107
- # "{system_role}\n"
1108
- # "- You are performing Named Entity Recognition (NER) in the domain of {domain}.\n"
1109
- # "- Use the following entity types: {labels}.\n\n"
1110
- # "### Reasoning Steps:\n"
1111
- # "1. Read the example carefully.\n"
1112
- # "2. For each named entity mentioned, determine its meaning and role in the sentence.\n"
1113
- # "3. Think about the **context**: Is it a physical location (LOC)? A geopolitical region (GPE)? A person (PERSON)?\n"
1114
- # "4. Based on the definition of each label, assign the most **specific and correct** label.\n\n"
1115
- # "For example:\n"
1116
- # "- 'Mount Everest' → LOC (it's a mountain)\n"
1117
- # "- 'France' → GPE (it's a country)\n"
1118
- # "- 'Microsoft' → ORG\n"
1119
- # "- 'John Smith' → PERSON\n\n"
1120
- # "- Return each example followed by the entities you found in this format:\n"
1121
- # "'Example text.'\nEntities: [ENTITY_TYPE: entity text, ENTITY_TYPE: entity text, ...] or [No entities found]\n"
1122
- # "- If no entities of the requested types are found, return 'No entities found'.\n"
1123
- # "- Be precise about entity boundaries - don't include extra words.\n"
1124
- # "- Do not explain or justify your answers.\n\n"
1125
- # "Additional instructions:\n{user_prompt}\n\n"
1126
- # "Few-shot examples:\n{few_shot_examples}\n\n"
1127
- # "Examples to label:\n{examples}\n"
1128
- # "Output:\n"
1129
- # )
1130
- #)
1131
- # label_prompt_template = PromptTemplate(
1132
- # input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
1133
- # template=(
1134
- # "{system_role}\n"
1135
- # "- You are an expert at Named Entity Recognition (NER) for domain: {domain}.\n"
1136
- # "- Use these entity types: {labels}.\n\n"
1137
- # "### Output Format:\n"
1138
- # # "Return each example followed by the entities you found in this format: 'Example text.\n Entities:\n [ENTITY_TYPE: entity text\n\"
1139
- # "Return each example followed by the entities you found in this format: 'Example text.\n 'Entity types:\n "Then group the entities under each label like this:\n" "
1140
- # #"Then Start with this line exactly: 'Entity types\n'\n"
1141
- # #"Then group the entities under each label like this:\n"
1142
- # "\n PERSON – Angela Merkel, John Smith\n\n"
1143
- # "\ ORG – Google, United Nations\n\n"
1144
- # "\n DATE – January 1st, 2023\n\n"
1145
- # "\n ... and so on.\n\n"
1146
- # "If entity {labels} not found, do not write it in your response\n"
1147
- # "- Do NOT output them inline after the text.\n"
1148
- # "- Do NOT repeat the sentence.\n"
1149
- # "- If no entities are found for a type, skip it.\n"
1150
- # "- Keep the format consistent.\n\n"
1151
- # "User Instructions:\n{user_prompt}\n\n"
1152
- # "Few-shot Examples:\n{few_shot_examples}\n\n"
1153
- # "Examples to analyze:\n{examples}"
1154
- # )
1155
- # )
1156
-
1157
-
1158
- label_prompt_template = PromptTemplate(
1159
- input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
1160
- template=(
1161
- "{system_role}\n"
1162
- "- You are an expert at Named Entity Recognition (NER) for domain: {domain}.\n"
1163
- "- Use these entity types: {labels}.\n\n"
1164
- "### Output Format:\n"
1165
- "Return each example followed by the entities you found in this format:\n"
1166
- "'Example text.\nEntity types:\n"
1167
- "Then group the entities under each label like this:\n"
1168
- "\nPERSON – Angela Merkel, John Smith\n"
1169
- "ORG – Google, United Nations\n"
1170
- "DATE – January 1st, 2023\n"
1171
- "... and so on.\n\n"
1172
- "Each new entities group should be in a new line.\n"
1173
- "If entity type {labels} is not found, do not write it in your response.\n"
1174
- "- Do NOT output them inline after the text.\n"
1175
- "- Do NOT repeat the sentence.\n"
1176
- "- If no entities are found for a type, skip it.\n"
1177
- "- Keep the format consistent.\n\n"
1178
- "User Instructions:\n{user_prompt}\n\n"
1179
- "Few-shot Examples:\n{few_shot_examples}\n\n"
1180
- "Examples to analyze:\n{examples}"
1181
- )
1182
- )
1183
-
1184
- #######
1185
- else:
1186
- label_prompt_template = PromptTemplate(
1187
-
1188
- input_variables=["system_role", "classification_type", "labels", "few_shot_examples", "examples","domain", "user_prompt"],
1189
- template=(
1190
- #"- Let'\s think step by step:"
1191
- "{system_role}\n"
1192
- # "- You are a professional {classification_type} expert in {domain} domain. Your role is to classify the following examples using these labels: {labels}.\n"
1193
- "- Use the following instructions:\n"
1194
- "- Use the following labels: {labels}.\n"
1195
- "- Return the classified text followed by the label in this format: 'text. Label: [label]'\n"
1196
- "- Do not provide any additional information or explanations\n"
1197
- "- User prompt:\n {user_prompt}\n\n"
1198
- "- Use user provided examples as guidence in the classification process:\n\n {few_shot_examples}\n"
1199
- "- Examples to classify:\n{examples}\n\n"
1200
- "- Think step by step then classify the examples"
1201
- #"Output:\n"
1202
- ))
1203
-
1204
- # Check if few_shot_examples is already a formatted string
1205
- # Check if few_shot_examples is already a formatted string
1206
- if isinstance(few_shot_examples, str):
1207
- formatted_few_shot = few_shot_examples
1208
- # If it's a list of already formatted strings
1209
- elif isinstance(few_shot_examples, list) and all(isinstance(ex, str) for ex in few_shot_examples):
1210
- formatted_few_shot = "\n".join(few_shot_examples)
1211
- # If it's a list of dictionaries with 'content' and 'label' keys
1212
- elif isinstance(few_shot_examples, list) and all(isinstance(ex, dict) and 'content' in ex and 'label' in ex for ex in few_shot_examples):
1213
- formatted_few_shot = "\n".join([f"{ex['content']}\nLabel: {ex['label']}" for ex in few_shot_examples])
1214
- else:
1215
- formatted_few_shot = ""
1216
- # #new 22/4/2025
1217
- # few_shot_examples = [
1218
- # {"content": "Mount Everest is the tallest mountain in the world.", "label": "LOC: Mount Everest"},
1219
- # {"content": "The President of the United States visited Paris last summer.", "label": "GPE: United States, GPE: Paris"},
1220
- # {"content": "Amazon is expanding its offices in Berlin.", "label": "ORG: Amazon, GPE: Berlin"},
1221
- # {"content": "J.K. Rowling wrote the Harry Potter books.", "label": "PERSON: J.K. Rowling"},
1222
- # {"content": "Apple was founded in California in 1976.", "label": "ORG: Apple, GPE: California, DATE: 1976"},
1223
- # {"content": "The Nile is the longest river in Africa.", "label": "LOC: Nile, GPE: Africa"},
1224
- # {"content": "He arrived at 3 PM for the meeting.", "label": "TIME: 3 PM"},
1225
- # {"content": "She bought the dress for $200.", "label": "MONEY: $200"},
1226
- # {"content": "The event is scheduled for July 4th.", "label": "DATE: July 4th"},
1227
- # {"content": "The World Health Organization is headquartered in Geneva.", "label": "ORG: World Health Organization, GPE: Geneva"}
1228
- # ]
1229
- # ###########
1230
- # new 22/4/2025
1231
- #formatted_few_shot = "\n".join([f"{ex['content']}\nEntities: [{ex['label']}]" for ex in few_shot_examples])
1232
- formatted_few_shot = "\n\n".join([f"{ex['content']}\n\nEntity types\n{ex['label']}" for ex in few_shot_examples])
1233
-
1234
- ###########
1235
- system_prompt = label_prompt_template.format(
1236
- system_role=st.session_state['system_role'],
1237
- classification_type=classification_type,
1238
- domain=domain,
1239
- examples="\n".join(examples_to_classify),
1240
- labels=", ".join(labels),
1241
- user_prompt=user_prompt,
1242
- few_shot_examples=formatted_few_shot
1243
- )
1244
-
1245
- # Step 2: Store the system_prompt in st.session_state
1246
- st.session_state['system_prompt'] = system_prompt
1247
-
1248
- st.write("System Prompt:")
1249
- #st.code(system_prompt)
1250
- #st.code(st.session_state['system_prompt'])
1251
- st.text_area("System Prompt", value=st.session_state['system_prompt'], height=300, max_chars=None, key=None, help=None, disabled=True)
1252
-
1253
-
1254
-
1255
- if st.button("🏷️ Label Data"):
1256
- if examples_to_classify:
1257
- with st.spinner("Labeling data..."):
1258
- #Generate the system prompt based on classification type
1259
- if classification_type == "Named Entity Recognition (NER)":
1260
- system_prompt = label_prompt_template.format(
1261
- system_role=st.session_state['system_role'],
1262
- labels=", ".join(labels),
1263
- domain = domain,
1264
- few_shot_examples=few_shot_text,
1265
- examples=examples_text,
1266
- user_prompt=user_prompt
1267
- #new
1268
- #'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1269
- )
1270
- # if classification_type == "Named Entity Recognition (NER)":
1271
- # # Step 1: Split the full response by example
1272
- # raw_outputs = [block.strip() for block in response.strip().split("Entity types") if block.strip()]
1273
- # inputs = [ex.strip() for ex in examples_to_classify]
1274
-
1275
- # # Step 2: Match inputs with NER output blocks
1276
- # labeled_examples = []
1277
- # for i, (text, output_block) in enumerate(zip(inputs, raw_outputs)):
1278
- # labeled_examples.append({
1279
- # 'text': text,
1280
- # 'entities': f"Entity types\n{output_block.strip()}",
1281
- # 'system_prompt': st.session_state.system_prompt,
1282
- # 'system_role': st.session_state.system_role,
1283
- # 'task_type': 'Named Entity Recognition (NER)',
1284
- # 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1285
- # })
1286
-
1287
- # if classification_type == "Named Entity Recognition (NER)":
1288
- # # Step 1: Split the full response by example
1289
- # raw_outputs = [block.strip() for block in response.strip().split("Entity types") if block.strip()]
1290
- # inputs = [ex.strip() for ex in examples_to_classify]
1291
-
1292
- # # Step 2: Match inputs with NER output blocks
1293
- # labeled_examples = []
1294
- # for i, (text, output_block) in enumerate(zip(inputs, raw_outputs)):
1295
- # labeled_examples.append({
1296
- # 'text': text,
1297
- # 'entities': f"Entity types\n{output_block.strip()}",
1298
- # 'system_prompt': st.session_state.system_prompt,
1299
- # 'system_role': st.session_state.system_role,
1300
- # 'task_type': 'Named Entity Recognition (NER)',
1301
- # 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1302
- # })
1303
-
1304
-
1305
- # import re
1306
-
1307
- # if classification_type == "Named Entity Recognition (NER)":
1308
- # # Use regex to split on "Entity types" while keeping it attached to each block
1309
- # blocks = re.split(r"(Entity types)", response.strip())
1310
-
1311
- # # Recombine 'Entity types' with each block after splitting
1312
- # raw_outputs = [
1313
- # (blocks[i] + blocks[i+1]).strip()
1314
- # for i in range(1, len(blocks) - 1, 2)
1315
- # ]
1316
-
1317
- # inputs = [ex.strip() for ex in examples_to_classify]
1318
-
1319
- # labeled_examples = []
1320
- # for i, (text, output_block) in enumerate(zip(inputs, raw_outputs)):
1321
- # labeled_examples.append({
1322
- # 'text': text,
1323
- # 'entities': output_block,
1324
- # 'system_prompt': st.session_state.system_prompt,
1325
- # 'system_role': st.session_state.system_role,
1326
- # 'task_type': 'Named Entity Recognition (NER)',
1327
- # 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1328
- # })
1329
-
1330
-
1331
- else:
1332
- system_prompt = label_prompt_template.format(
1333
- classification_type=classification_type,
1334
- system_role=st.session_state['system_role'],
1335
- domain = domain,
1336
- labels=", ".join(labels),
1337
- few_shot_examples=few_shot_text,
1338
- examples=examples_text,
1339
- user_prompt=user_prompt
1340
- )
1341
- try:
1342
- stream = client.chat.completions.create(
1343
- model=selected_model,
1344
- messages=[{"role": "system", "content": system_prompt}],
1345
- temperature=temperature,
1346
- stream=True,
1347
- #max_tokens=20000,
1348
- max_tokens=4000,
1349
- top_p = 0.9,
1350
-
1351
- )
1352
- #new 24 March
1353
- # Append user message
1354
- st.session_state.messages.append({"role": "user", "content": system_prompt})
1355
- #################
1356
- response = st.write_stream(stream)
1357
- st.session_state.messages.append({"role": "assistant", "content": response})
1358
- # Display the labeled examples
1359
- # # Optional: If you want to add it as a chat-style message log
1360
- # preview_str = st.session_state.labeled_preview.to_markdown(index=False)
1361
- # st.session_state.messages.append({"role": "assistant", "content": f"Here is a preview of the labeled examples:\n\n{preview_str}"})
1362
-
1363
-
1364
- # # Stream response and append assistant message
1365
- # #14/4/2024
1366
- # response = st.write_stream(stream)
1367
- # st.session_state.messages.append({"role": "assistant", "content": response})
1368
-
1369
- # Initialize session state variables if they don't exist
1370
- if 'system_prompt' not in st.session_state:
1371
- st.session_state.system_prompt = system_prompt
1372
-
1373
- if 'response' not in st.session_state:
1374
- st.session_state.response = response
1375
-
1376
- if 'generated_examples' not in st.session_state:
1377
- st.session_state.generated_examples = []
1378
-
1379
- if 'generated_examples_csv' not in st.session_state:
1380
- st.session_state.generated_examples_csv = None
1381
-
1382
- if 'generated_examples_json' not in st.session_state:
1383
- st.session_state.generated_examples_json = None
1384
-
1385
-
1386
-
1387
-
1388
- # Save labeled examples to CSV
1389
- #new 14/4/2025
1390
- #labeled_examples = []
1391
- # if classification_type == "Named Entity Recognition (NER)":
1392
- # labeled_examples = []
1393
- # for line in response.split('\n'):
1394
- # if line.strip():
1395
- # parts = line.rsplit('Entities:', 1)
1396
- # if len(parts) == 2:
1397
- # text = parts[0].strip()
1398
- # entities = parts[1].strip()
1399
- # if text and entities:
1400
- # labeled_examples.append({
1401
- # 'text': text,
1402
- # 'entities': entities,
1403
- # 'system_prompt': st.session_state.system_prompt,
1404
- # 'system_role': st.session_state.system_role,
1405
- # 'task_type': 'Named Entity Recognition (NER)',
1406
- # 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1407
- # })
1408
-
1409
- #new 22/4/2025
1410
- labeled_examples = []
1411
- if classification_type == "Named Entity Recognition (NER)":
1412
- labeled_examples = [{
1413
- 'ner_output': response.strip(),
1414
- 'system_prompt': st.session_state.system_prompt,
1415
- 'system_role': st.session_state.system_role,
1416
- 'task_type': 'Named Entity Recognition (NER)',
1417
- 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1418
- }]
1419
-
1420
- ######
1421
-
1422
-
1423
- else:
1424
- labeled_examples = []
1425
- for line in response.split('\n'):
1426
- if line.strip():
1427
- parts = line.rsplit('Label:', 1)
1428
- if len(parts) == 2:
1429
- text = parts[0].strip()
1430
- label = parts[1].strip()
1431
- if text and label:
1432
- labeled_examples.append({
1433
- 'text': text,
1434
- 'label': label,
1435
- 'system_prompt': st.session_state.system_prompt,
1436
- 'system_role': st.session_state.system_role,
1437
- 'task_type': 'Data Labeling',
1438
- 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1439
- })
1440
- # Save and provide download options
1441
- if labeled_examples:
1442
- # Update session state
1443
- st.session_state.labeled_examples = labeled_examples
1444
-
1445
- # Convert to CSV and JSON
1446
- df = pd.DataFrame(labeled_examples)
1447
- #new 22/4/2025
1448
- # CSV
1449
- st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')
1450
-
1451
- # JSON
1452
- st.session_state.labeled_examples_json = json.dumps({
1453
- "metadata": {
1454
- "domain": domain,
1455
- "labels": labels,
1456
- "used_few_shot": use_few_shot,
1457
- "task_type": "Named Entity Recognition (NER)",
1458
- "timestamp": datetime.now().isoformat()
1459
- },
1460
- "examples": labeled_examples
1461
- }, indent=2).encode('utf-8')
1462
-
1463
- ############
1464
- # CSV
1465
- # st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')
1466
-
1467
- # # JSON
1468
- # st.session_state.labeled_examples_json = json.dumps({
1469
- # "metadata": {
1470
- # "domain": domain,
1471
- # "labels": labels,
1472
- # "used_few_shot": use_few_shot,
1473
- # "task_type": "Named Entity Recognition (NER)",
1474
- # "timestamp": datetime.now().isoformat()
1475
- # },
1476
- # "examples": labeled_examples
1477
- # }, indent=2).encode('utf-8')
1478
-
1479
- ########
1480
- # st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')
1481
- # st.session_state.labeled_examples_json = json.dumps(labeled_examples, indent=2).encode('utf-8')
1482
-
1483
- # Download buttons
1484
- st.download_button(
1485
- "📥 Download Labeled Examples (CSV)",
1486
- st.session_state.labeled_examples_csv,
1487
- "labeled_examples.csv",
1488
- "text/csv",
1489
- key='download-labeled-csv'
1490
- )
1491
-
1492
- st.markdown("""
1493
- <div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
1494
- """, unsafe_allow_html=True)
1495
-
1496
- st.download_button(
1497
- "📥 Download Labeled Examples (JSON)",
1498
- st.session_state.labeled_examples_json,
1499
- "labeled_examples.json",
1500
- "application/json",
1501
- key='download-labeled-json'
1502
- )
1503
- # Display the labeled examples
1504
- st.markdown("##### 📋 Labeled Examples Preview")
1505
- st.dataframe(df, use_container_width=True)
1506
- # Display section
1507
- #st.markdown("### 📋 Labeled Examples Preview")
1508
- #st.dataframe(st.session_state.labeled_preview, use_container_width=True)
1509
-
1510
-
1511
-
1512
- # if labeled_examples:
1513
- # df = pd.DataFrame(labeled_examples)
1514
- # csv = df.to_csv(index=False).encode('utf-8')
1515
- # st.download_button(
1516
- # "📥 Download Labeled Examples",
1517
- # csv,
1518
- # "labeled_examples.csv",
1519
- # "text/csv",
1520
- # key='download-labeled-csv'
1521
- # )
1522
- # # Add space and center the "or"
1523
- # st.markdown("""
1524
- # <div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
1525
- # """, unsafe_allow_html=True)
1526
-
1527
- # if labeled_examples:
1528
- # df = pd.DataFrame(labeled_examples)
1529
- # csv = df.to_csv(index=False).encode('utf-8')
1530
- # st.download_button(
1531
- # "📥 Download Labeled Examples",
1532
- # csv,
1533
- # "labeled_examples.json",
1534
- # "text/json",
1535
- # key='download-labeled-JSON'
1536
- # )
1537
-
1538
- # Add follow-up interaction options
1539
- #st.markdown("---")
1540
- #follow_up = st.radio(
1541
- #"What would you like to do next?",
1542
- #["Label more data", "Data Generation"],
1543
- # key="labeling_follow_up"
1544
- # )
1545
-
1546
- if st.button("Continue"):
1547
- if follow_up == "Label more data":
1548
- st.session_state.examples_to_classify = []
1549
- st.experimental_rerun()
1550
- elif follow_up == "Data Generation":
1551
- st.session_state.task_choice = "Data Generation"
1552
- st.experimental_rerun()
1553
-
1554
- except Exception as e:
1555
- st.error("An error occurred during labeling.")
1556
- st.error(f"Details: {e}")
1557
- else:
1558
- st.warning("Please enter at least one example to classify.")
1559
-
1560
- #st.session_state.messages.append({"role": "assistant", "content": response})
1561
-
1562
-
1563
-
1564
-
1565
- # Footer
1566
- st.markdown("---")
1567
- st.markdown(
1568
- """
1569
- <div style='text-align: center'>
1570
- <p>Made with ❤️ by Wedyan AlSakran 2025</p>
1571
- </div>
1572
- """,
1573
- unsafe_allow_html=True
1574
- )