"""Streamlit app for generating and labeling text data with Llama models.

Requests are sent through an OpenAI-compatible client pointed at the Hugging
Face Inference API; the API token is read from the ``TOKEN2`` environment
variable (optionally loaded from a ``.env`` file).
"""

import base64
import json
import os
import random
import warnings
from datetime import datetime
from pathlib import Path

import pandas as pd
import streamlit as st
import torch
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from openai import OpenAI
from streamlit_pdf_viewer import pdf_viewer
from transformers import AutoModelForCausalLM, AutoTokenizer

warnings.filterwarnings('ignore')

# NOTE: the original bare `os.getenv("OAUTH_CLIENT_ID")` expression discarded
# its result and had no effect, so it was dropped.

# Load environment variables and initialize the OpenAI client to use the
# Hugging Face Inference API.
load_dotenv()
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1",
    api_key=os.environ.get('TOKEN2'),  # Hugging Face API token
)

# NOTE: two duplicated, fully commented-out experiments lived in this region
# (loading the Llama checkpoints locally through transformers with 8-bit
# quantization, and a sample request against the Hugging Face router
# endpoint), together with commented-out `save_feedback` and
# `embed_pdf_sidebar` helpers. They were dead code and have been removed.

# Create necessary directories. `exist_ok=True` avoids the check-then-create
# race of a separate `os.path.exists` test.
for dir_name in ('data', 'feedback'):
    os.makedirs(dir_name, exist_ok=True)

# Custom CSS
# NOTE(review): the style payload is a single space in this revision — it looks
# like the markup was lost; confirm against the intended stylesheet.
st.markdown(""" """, unsafe_allow_html=True)


# Helper functions
def read_csv_with_encoding(file):
    """Read a CSV into a DataFrame, trying several text encodings in turn.

    Args:
        file: Anything ``pandas.read_csv`` accepts (a path or a file-like
            object). File-like objects are rewound before every attempt so a
            failed decode does not leave the next attempt reading mid-stream.

    Returns:
        The parsed ``pandas.DataFrame`` from the first encoding that decodes.

    Raises:
        UnicodeDecodeError: If none of the encodings can decode the file.
    """
    # NOTE: latin1 maps every byte value, so in practice the entries after it
    # are never reached; the list is kept as it was.
    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
    last_error = None
    for encoding in encodings:
        if hasattr(file, "seek"):
            file.seek(0)
        try:
            return pd.read_csv(file, encoding=encoding)
        except UnicodeDecodeError as exc:
            last_error = exc
    # UnicodeDecodeError takes (encoding, object, start, end, reason); the
    # original single-argument call raised TypeError instead.
    raise UnicodeDecodeError(
        "/".join(encodings), b"", 0, 0,
        "Failed to read file with any supported encoding",
    ) from last_error


def reset_conversation():
    """Clear the chat history and the chosen task so the app starts over.

    Used as the ``on_click`` callback of the "New Conversation" button.
    """
    st.session_state.conversation = []
    st.session_state.messages = []
    if 'task_choice' in st.session_state:
        del st.session_state.task_choice
    return None


# Initialize session state variables
for state_key, state_default in (
    ("messages", []),
    ("examples_to_classify", []),
    ("system_role", ""),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = state_default

# Main app title
st.title("🤖🦙 Text Data Labeling and Generation App")

# Sidebar settings
with st.sidebar:
    st.title("⚙️ Settings")

    st.markdown("### 📘Data Generation and Labeling Instructions")
    # Guard the read: a missing PDF should not take the whole app down.
    instructions_pdf = Path("User instructions.pdf")
    if instructions_pdf.is_file():
        st.download_button(
            label="📄 Download Instructions PDF",
            data=instructions_pdf.read_bytes(),
            file_name="User instructions.pdf",
            mime="application/pdf",
        )
    else:
        st.warning("Instructions PDF is not available.")

    selected_model = st.selectbox(
        "Select Model",
        [
            "meta-llama/Llama-3.2-11B-Vision-Instruct",
            "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
            "meta-llama/Llama-3.3-70B-Instruct",
            "meta-llama/Llama-3.2-3B-Instruct",
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            "meta-llama/Meta-Llama-3-8B-Instruct",
            "meta-llama/Llama-3.1-70B-Instruct",
        ],
        key='model_select',
    )

    temperature = st.slider(
        "Temperature", 0.0, 1.0, 0.7,
        help="Controls randomness in generation"
    )

    st.button("🔄 New Conversation", on_click=reset_conversation)

# NOTE(review): this container only renders a single space in this revision —
# presumably an info box whose markup was lost; confirm.
with st.container():
    st.markdown(""" """, unsafe_allow_html=True)

feedback_url = "https://docs.google.com/forms/d/e/1FAIpQLSdZ_5mwW-pjqXHgxR0xriyVeRhqdQKgb5c-foXlYAV55Rilsg/viewform?usp=header"
# The original rendered an empty string here, so the URL above was never shown.
st.sidebar.markdown(f"[📝 Feedback form]({feedback_url})")

# Display conversation
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Main content: let the user pick a task until one is stored in session state.
if 'task_choice' not in st.session_state:
    col1, col2 = st.columns(2)
    with col1:
        if st.button("📝 Data Generation", key="gen_button", help="Generate new data"):
            st.session_state.task_choice = "Data Generation"
    with col2:
        if st.button("🏷️ Data Labeling", key="label_button", help="Label existing data"):
            st.session_state.task_choice = "Data Labeling"

if "task_choice" in st.session_state:
    if st.session_state.task_choice == "Data Generation":
        st.header("📝 Data Generation")

        # 1. Domain selection
        domain_selection = st.selectbox("Domain", [
            "Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"
        ])

        # 2.
Handle custom domain input custom_domain_valid = True # Assume valid until proven otherwise if domain_selection == "Custom": domain = st.text_input("Specify custom domain") if not domain.strip(): st.error("Please specify a domain name.") custom_domain_valid = False else: domain = domain_selection # Classification type selection classification_type = st.selectbox( "Classification Type", ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification"] ) # Labels setup based on classification type #labels = [] labels = [] labels_valid = False errors = [] def validate_binary_labels(labels): errors = [] normalized = [label.strip().lower() for label in labels] if not labels[0].strip(): errors.append("First class name is required.") if not labels[1].strip(): errors.append("Second class name is required.") if normalized[0] == normalized[1] and all(normalized): errors.append("Class names must be different.") return errors if classification_type == "Sentiment Analysis": st.write("### Sentiment Analysis Labels (Fixed)") col1, col2, col3 = st.columns(3) with col1: st.text_input("First class", "Positive", disabled=True) with col2: st.text_input("Second class", "Negative", disabled=True) with col3: st.text_input("Third class", "Neutral", disabled=True) labels = ["Positive", "Negative", "Neutral"] elif classification_type == "Binary Classification": st.write("### Binary Classification Labels") col1, col2 = st.columns(2) with col1: label_1 = st.text_input("First class", "Positive") with col2: label_2 = st.text_input("Second class", "Negative") labels = [label_1, label_2] errors = validate_binary_labels(labels) if errors: st.error("\n".join(errors)) else: st.success("Binary class names are valid and unique!") elif classification_type == "Multi-Class Classification": st.write("### Multi-Class Classification Labels") default_labels_by_domain = { "News": ["Political", "Sports", "Entertainment", "Technology", "Business"], "AG News": ["World", "Sports", "Business", 
"Sci/Tech"], "Tourism": ["Accommodation", "Transportation", "Tourist Attractions", "Food & Dining", "Local Experience", "Adventure Activities", "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly", "Luxury Tourism"], "Restaurant reviews": ["Italian", "French", "American"], "E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining", "Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion", "Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets", "Books & Stationery","Toys & Games", "Sports & Fitness", "Grocery & Gourmet Food","Watches & Accessories", "Baby Products"] } num_classes = st.slider("Number of classes", 3, 15, 3) # Get defaults for selected domain, or empty list defaults = default_labels_by_domain.get(domain, []) labels = [] errors = [] cols = st.columns(3) for i in range(num_classes): with cols[i % 3]: default_value = defaults[i] if i < len(defaults) else "" label_input = st.text_input(f"Class {i+1}", default_value) normalized_label = label_input.strip().title() if not normalized_label: errors.append(f"Class {i+1} name is required.") else: labels.append(normalized_label) # Check for duplicates (case-insensitive) if len(labels) != len(set(labels)): errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).") # Show validation results if errors: for error in errors: st.error(error) else: st.success("All Labels names are valid and unique!") labels_valid = not errors # Will be True only if there are no label errors ############## #new 22/4/2025 # add additional attributes add_attributes = st.checkbox("Add additional attributes (optional)") additional_attributes = [] if add_attributes: num_attributes = st.slider("Number of attributes to add", 1, 5, 1) for i in range(num_attributes): st.markdown(f"#### Attribute {i+1}") attr_name = st.text_input(f"Name of attribute {i+1}", key=f"attr_name_{i}") attr_topics = st.text_input(f"Topics (comma-separated) for {attr_name}", 
key=f"attr_topics_{i}") if attr_name and attr_topics: topics_list = [topic.strip() for topic in attr_topics.split(",") if topic.strip()] additional_attributes.append({"attribute": attr_name, "topics": topics_list}) ################ # Generation parameters col1, col2 = st.columns(2) with col1: min_words = st.number_input("Min words", 1, 100, 20) with col2: max_words = st.number_input("Max words", min_words, 100, 50) # Few-shot examples use_few_shot = st.toggle("Use few-shot examples") few_shot_examples = [] if use_few_shot: num_examples = st.slider("Number of few-shot examples", 1, 10, 1) for i in range(num_examples): with st.expander(f"Example {i+1}"): content = st.text_area(f"Content", key=f"few_shot_content_{i}") label = st.selectbox(f"Label", labels, key=f"few_shot_label_{i}") if content and label: few_shot_examples.append({"content": content, "label": label}) num_to_generate = st.number_input("Number of examples", 1, 100, 10) #sytem role after # System role customization #default_system_role = f"You are a professional {classification_type} expert, your role is to generate text examples for {domain} domain. Always generate unique diverse examples and do not repeat the generated data. The generated text should be between {min_words} to {max_words} words long." # System role customization default_system_role = ( f"You are a seasoned expert in {classification_type}, specializing in the {domain} domain. " f" Your primary responsibility is to generate high-quality, diverse, and unique text examples " f"tailored to this domain. Please ensure that each example adheres to the specified length " f"requirements, ranging from {min_words} to {max_words} words, and avoid any repetition in the generated content." 
) system_role = st.text_area("Modify System Role (optional)", value=default_system_role, key="system_role_input") st.session_state['system_role'] = system_role if system_role else default_system_role # Labels initialization #labels = [] user_prompt = st.text_area("User Prompt (optional)") # Updated prompt template including system role prompt_template = PromptTemplate( input_variables=["system_role", "classification_type", "domain", "num_examples", "min_words", "max_words", "labels", "user_prompt", "few_shot_examples", "additional_attributes"], template=( "{system_role}\n" "- Use the following parameters:\n" "- Generate {num_examples} examples\n" "- Each example should be between {min_words} to {max_words} words long\n" "- Use these labels: {labels}.\n" "- Use the following additional attributes:\n" "- {additional_attributes}\n" "- Generate the examples in this format: 'Example text. Label: label'\n" "- Do not include word counts or any additional information\n" "- Always use your creativity and intelligence to generate unique and diverse text data\n" "- In sentiment analysis, ensure that the sentiment classification is clearly identified as Positive, Negative, or Neutral. Do not leave the sentiment ambiguous.\n" "- In binary sentiment analysis, classify text strictly as either Positive or Negative. Do not include or imply Neutral as an option.\n" "- Write unique examples every time.\n" "- DO NOT REPEAT your gnerated text. \n" "- For each Output, describe it once and move to the next.\n" "- List each Output only once, and avoid repeating details.\n" "- Additional instructions: {user_prompt}\n\n" "- Use the following examples as a reference in the generation process\n\n {few_shot_examples}. \n" "- Think step by step, generate numbered examples, and check each newly generated example to ensure it has not been generated before. 
If it has, modify it" ) ) # template=( # "{system_role}\n" # "- Use the following parameters:\n" # "- Generate {num_examples} examples\n" # "- Each example should be between {min_words} to {max_words} words long\n" # "- Use these labels: {labels}.\n" # "- Use the following additional attributes:\n" # "{additional_attributes}\n" # #"- Format each example like this: 'Example text. Label: [label]. Attribute1: [topic1]. Attribute2: [topic2]'\n" # "- Generate the examples in this format: 'Example text. Label: label'\n" # "- Additional instructions: {user_prompt}\n" # "- Use these few-shot examples if provided:\n{few_shot_examples}\n" # "- Think step by step and ensure examples are unique and not repeated." # ) # ) ##########new 22/4/2025 formatted_attributes = "\n".join([ f"- {attr['attribute']}: {', '.join(attr['topics'])}" for attr in additional_attributes ]) ####################### # Generate system prompt system_prompt = prompt_template.format( system_role=st.session_state['system_role'], classification_type=classification_type, domain=domain, num_examples=num_to_generate, min_words=min_words, max_words=max_words, labels=", ".join(labels), user_prompt=user_prompt, few_shot_examples="\n".join([f"{ex['content']}\nLabel: {ex['label']}" for ex in few_shot_examples]) if few_shot_examples else "", additional_attributes=formatted_attributes ) # Store system prompt in session state st.session_state['system_prompt'] = system_prompt # Display system prompt st.write("System Prompt:") st.text_area("Current System Prompt", value=st.session_state['system_prompt'], height=400, disabled=True) if st.button("🎯 Generate Examples"): # errors = [] if domain_selection == "Custom" and not domain.strip(): st.warning("Custom domain name is required.") elif len(labels) != len(set(labels)): st.warning("Class names must be unique.") elif any(not lbl.strip() for lbl in labels): st.warning("All class labels must be filled in.") #else: #st.success("Generating examples for domain: {domain}") #if 
not custom_domain_valid: #st.warning("Custom domain name is required.") #elif not labels_valid: #st.warning("Please fix the label errors before generating examples.") #else: # Proceed to generate examples #st.success(f"Generating examples for domain: {domain}") with st.spinner("Generating examples..."): try: stream = client.chat.completions.create( model=selected_model, messages=[{"role": "system", "content": st.session_state['system_prompt']}], temperature=temperature, stream=True, #max_tokens=80000, max_tokens=4000, top_p=0.9, # repetition_penalty=1.2, #frequency_penalty=0.5, # Discourages frequent words #presence_penalty=0.6, ) #st.session_state['system_prompt'] = system_prompt #new 24 march st.session_state.messages.append({"role": "user", "content": system_prompt}) # # #################### response = st.write_stream(stream) st.session_state.messages.append({"role": "assistant", "content": response}) # Initialize session state variables if they don't exist if 'system_prompt' not in st.session_state: st.session_state.system_prompt = system_prompt if 'response' not in st.session_state: st.session_state.response = response if 'generated_examples' not in st.session_state: st.session_state.generated_examples = [] if 'generated_examples_csv' not in st.session_state: st.session_state.generated_examples_csv = None if 'generated_examples_json' not in st.session_state: st.session_state.generated_examples_json = None # Parse response and generate examples list examples_list = [] for line in response.split('\n'): if line.strip(): parts = line.rsplit('Label:', 1) if len(parts) == 2: text = parts[0].strip() label = parts[1].strip() if text and label: examples_list.append({ 'text': text, 'label': label, 'system_prompt': st.session_state.system_prompt, 'system_role': st.session_state.system_role, 'task_type': 'Data Generation', 'Use few-shot example?': 'Yes' if use_few_shot else 'No', }) # example_dict = { # 'text': text, # 'label': label, # 'system_prompt': 
st.session_state.system_prompt, # 'system_role': st.session_state.system_role, # 'task_type': 'Data Generation', # 'Use few-shot example?': 'Yes' if use_few_shot else 'No', # } # for attr in additional_attributes: # example_dict[attr['attribute']] = random.choice(attr['topics']) # examples_list.append(example_dict) if examples_list: # Update session state with new data st.session_state.generated_examples = examples_list # Generate CSV and JSON data df = pd.DataFrame(examples_list) st.session_state.generated_examples_csv = df.to_csv(index=False).encode('utf-8') st.session_state.generated_examples_json = json.dumps(examples_list, indent=2).encode('utf-8') # Vertical layout with centered "or" between buttons st.download_button( "📥 Download Generated Examples (CSV)", st.session_state.generated_examples_csv, "generated_examples.csv", "text/csv", key='download-csv-persistent' ) # Add space and center the "or" st.markdown("""

. . . . . . or

""", unsafe_allow_html=True) st.download_button( "📥 Download Generated Examples (JSON)", st.session_state.generated_examples_json, "generated_examples.json", "application/json", key='download-json-persistent' ) # # Display the labeled examples # st.markdown("##### 📋 Labeled Examples Preview") # st.dataframe(df, use_container_width=True) if st.button("Continue"): if follow_up == "Generate more examples": st.experimental_rerun() elif follow_up == "Data Labeling": st.session_state.task_choice = "Data Labeling" st.experimental_rerun() except Exception as e: st.error("An error occurred during generation.") st.error(f"Details: {e}") # Lableing Process elif st.session_state.task_choice == "Data Labeling": st.header("🏷️ Data Labeling") domain_selection = st.selectbox("Domain", ["Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"]) # 2. Handle custom domain input custom_domain_valid = True # Assume valid until proven otherwise if domain_selection == "Custom": domain = st.text_input("Specify custom domain") if not domain.strip(): st.error("Please specify a domain name.") custom_domain_valid = False else: domain = domain_selection # Classification type selection classification_type = st.selectbox( "Classification Type", ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification", "Named Entity Recognition (NER)"] ) #NNew edit # Labels setup based on classification type labels = [] labels_valid = False errors = [] if classification_type == "Sentiment Analysis": st.write("### Sentiment Analysis Labels (Fixed)") col1, col2, col3 = st.columns(3) with col1: label_1 = st.text_input("First class", "Positive", disabled=True) with col2: label_2 = st.text_input("Second class", "Negative", disabled=True) with col3: label_3 = st.text_input("Third class", "Neutral", disabled=True) labels = ["Positive", "Negative", "Neutral"] elif classification_type == "Binary Classification": st.write("### Binary Classification Labels") col1, col2 = 
st.columns(2) with col1: label_1 = st.text_input("First class", "Positive") with col2: label_2 = st.text_input("Second class", "Negative") errors = [] labels = [label_1.strip(), label_2.strip()] # Strip and lower-case labels for validation label_1 = labels[0].strip() label_2 = labels[1].strip() # Check for empty class names if not label_1: errors.append("First class name is required.") if not label_2: errors.append("Second class name is required.") # Check for duplicates (case insensitive) if label_1.lower() == label_2.lower() and label_1 and label_2: errors.append("Class names must be different.") # Show errors or success if errors: for error in errors: st.error(error) else: st.success("Binary class names are valid and unique!") elif classification_type == "Multi-Class Classification": st.write("### Multi-Class Classification Labels") default_labels_by_domain = { "News": ["Political", "Sports", "Entertainment", "Technology", "Business"], "AG News": ["World", "Sports", "Business", "Sci/Tech"], "Tourism": ["Accommodation", "Transportation", "Tourist Attractions", "Food & Dining", "Local Experience", "Adventure Activities", "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly", "Luxury Tourism"], "Restaurant reviews": ["Italian", "French", "American"], "E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining", "Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion", "Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets", "Books & Stationery","Toys & Games", "Sports & Fitness", "Grocery & Gourmet Food","Watches & Accessories", "Baby Products"] } # Ask user how many classes they want to define num_classes = st.slider("Select the number of classes (labels)", min_value=3, max_value=10, value=3) # Use default labels based on selected domain, if available defaults = default_labels_by_domain.get(domain, []) labels = [] errors = [] cols = st.columns(3) # For nicely arranged label inputs for i in 
range(num_classes): with cols[i % 3]: # Distribute inputs across columns default_value = defaults[i] if i < len(defaults) else "" label_input = st.text_input(f"Label {i + 1}", default_value) normalized_label = label_input.strip().title() if not normalized_label: errors.append(f"Label {i + 1} is required.") else: labels.append(normalized_label) # Check for duplicates (case-insensitive) normalized_set = {label.lower() for label in labels} if len(labels) != len(normalized_set): errors.append("Label names must be unique (case-insensitive).") # Show validation results if errors: for error in errors: st.error(error) else: st.success("All label names are valid and unique!") labels_valid = not errors # True if no validation errors elif classification_type == "Named Entity Recognition (NER)": # # NER entity options # ner_entities = [ # "PERSON - Names of people, fictional characters, historical figures", # "ORG - Companies, institutions, agencies, teams", # "LOC - Physical locations (mountains, oceans, etc.)", # "GPE - Countries, cities, states, political regions", # "DATE - Calendar dates, years, centuries", # "TIME - Times, durations", # "MONEY - Monetary values with currency" # ] # selected_entities = st.multiselect( # "Select entities to recognize", # ner_entities, # default=["PERSON - Names of people, fictional characters, historical figures", # "ORG - Companies, institutions, agencies, teams", # "LOC - Physical locations (mountains, oceans, etc.)", # "GPE - Countries, cities, states, political regions", # "DATE - Calendar dates, years, centuries", # "TIME - Times, durations", # "MONEY - Monetary values with currency"], # key="ner_entity_selection" # ) #new 22/4/2025 #if classification_type == "Named Entity Recognition (NER)": use_few_shot = True #new 22/4/2025 few_shot_examples = [ {"content": "Mount Everest is the tallest mountain in the world.", "label": "LOC: Mount Everest"}, {"content": "The President of the United States visited Paris last summer.", "label": 
"GPE: United States, GPE: Paris"}, {"content": "Amazon is expanding its offices in Berlin.", "label": "ORG: Amazon, GPE: Berlin"}, {"content": "J.K. Rowling wrote the Harry Potter books.", "label": "PERSON: J.K. Rowling"}, {"content": "Apple was founded in California in 1976.", "label": "ORG: Apple, GPE: California, DATE: 1976"}, {"content": "The Nile is the longest river in Africa.", "label": "LOC: Nile, GPE: Africa"}, {"content": "He arrived at 3 PM for the meeting.", "label": "TIME: 3 PM"}, {"content": "She bought the dress for $200.", "label": "MONEY: $200"}, {"content": "The event is scheduled for July 4th.", "label": "DATE: July 4th"}, {"content": "The World Health Organization is headquartered in Geneva.", "label": "ORG: World Health Organization, GPE: Geneva"} ] ########### st.write("### Named Entity Recognition (NER) Entities") # Predefined standard entities ner_entities = [ "PERSON - Names of people, fictional characters, historical figures", "ORG - Companies, institutions, agencies, teams", "LOC - Physical locations (mountains, oceans, etc.)", "GPE - Countries, cities, states, political regions", "DATE - Calendar dates, years, centuries", "TIME - Times, durations", "MONEY - Monetary values with currency" ] # User can add custom NER types custom_ner_entities = [] if st.checkbox("Add custom NER entities?"): num_custom_ner = st.slider("Number of custom NER entities", 1, 10, 1) for i in range(num_custom_ner): st.markdown(f"#### Custom Entity {i+1}") custom_type = st.text_input(f"Entity type {i+1}", key=f"custom_ner_type_{i}") custom_description = st.text_input(f"Description for {custom_type}", key=f"custom_ner_desc_{i}") if custom_type and custom_description: custom_ner_entities.append(f"{custom_type.upper()} - {custom_description}") # Combine built-in and custom NERs all_ner_options = ner_entities + custom_ner_entities selected_entities = st.multiselect( "Select entities to recognize", all_ner_options, default=ner_entities ) # Extract entity type names 
# (before the dash)
# NOTE(review): the `labels` statements directly below look like the tail of the
# NER branch whose header lies before this chunk, and the rest of the section
# presumably sits inside an enclosing task branch. The code is shown flattened
# to column 0 -- re-indent to the surrounding nesting when merging.
labels = [entity.split(" - ")[0].strip() for entity in selected_entities]
if not labels:
    st.warning("Please select at least one entity type.")
    labels = ["PERSON"]

# ---------------------------------------------------------------------------
# Optional few-shot examples: each one is stored as "<content>\nLabel: <label>".
# ---------------------------------------------------------------------------
use_few_shot = st.toggle("Use few-shot examples for labeling")
few_shot_examples = []
if use_few_shot:
    num_few_shot = st.slider("Number of few-shot examples", 1, 10, 1)
    for i in range(num_few_shot):
        with st.expander(f"Few-shot Example {i+1}"):
            content = st.text_area("Content", key=f"label_few_shot_content_{i}")
            label = st.selectbox("Label", labels, key=f"label_few_shot_label_{i}")
            # Incomplete examples (no content or no label) are skipped.
            if content and label:
                few_shot_examples.append(f"{content}\nLabel: {label}")

# ---------------------------------------------------------------------------
# Examples to classify: one text area each for up to 10 examples, otherwise a
# single box with one example per line.
# ---------------------------------------------------------------------------
num_examples = st.number_input("Number of examples to classify", 1, 100, 1)
examples_to_classify = []
if num_examples <= 10:
    for i in range(num_examples):
        example = st.text_area(f"Example {i+1}", key=f"example_{i}")
        if example:
            examples_to_classify.append(example)
else:
    pasted_examples = st.text_area(
        "Enter examples (one per line)",
        height=300,
        help="Enter each example on a new line"
    )
    if pasted_examples:
        non_empty_lines = [ex.strip() for ex in pasted_examples.split('\n') if ex.strip()]
        # Extra lines beyond the requested count are dropped.
        examples_to_classify = non_empty_lines[:num_examples]

# ---------------------------------------------------------------------------
# System role (user-editable; an emptied box falls back to the default text).
# ---------------------------------------------------------------------------
default_system_role = (
    f"You are a highly skilled {classification_type} expert."
    f" Your task is to accurately classify the provided text examples within the {domain} domain."
    " Ensure that all classifications are precise, context-aware, and aligned with domain-specific standards and best practices."
)
system_role = st.text_area("Modify System Role (optional)", value=default_system_role, key="system_role_input")
st.session_state['system_role'] = system_role or default_system_role

user_prompt = st.text_area("User prompt (optional)", key="label_instructions")

# Text blocks substituted into the prompt template. Joining an empty list
# already yields "", so no separate empty-case handling is needed.
few_shot_text = "\n\n".join(few_shot_examples)
examples_text = "\n".join(f"{position}. {ex}" for position, ex in enumerate(examples_to_classify, start=1))

# ---------------------------------------------------------------------------
# Prompt template: NER gets a grouped "Entity types" output format, every other
# classification type gets the "text. Label: [label]" format.
# ---------------------------------------------------------------------------
if classification_type == "Named Entity Recognition (NER)":
    label_prompt_template = PromptTemplate(
        input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
        template=(
            "{system_role}\n"
            "- You are an expert at Named Entity Recognition (NER) for domain: {domain}.\n"
            "- Use these entity types: {labels}.\n\n"
            "### Output Format:\n"
            "Return each example followed by the entities you found in this format:\n"
            "'Example text.\nEntity types:\n"
            "Then group the entities under each label like this:\n"
            "\nPERSON – Angela Merkel, John Smith\n"
            "ORG – Google, United Nations\n"
            "DATE – January 1st, 2023\n"
            "... and so on.\n\n"
            "Each new entities group should be in a new line.\n"
            "If entity type {labels} is not found, do not write it in your response.\n"
            "- Do NOT output them inline after the text.\n"
            "- Do NOT repeat the sentence.\n"
            "- If no entities are found for a type, skip it.\n"
            "- Keep the format consistent.\n\n"
            "User Instructions:\n{user_prompt}\n\n"
            "Few-shot Examples:\n{few_shot_examples}\n\n"
            "Examples to analyze:\n{examples}"
        )
    )
else:
    label_prompt_template = PromptTemplate(
        input_variables=["system_role", "classification_type", "labels", "few_shot_examples", "examples","domain", "user_prompt"],
        template=(
            "{system_role}\n"
            "- Use the following instructions:\n"
            "- Use the following labels: {labels}.\n"
            "- Return the classified text followed by the label in this format: 'text. Label: [label]'\n"
            "- Do not provide any additional information or explanations\n"
            "- User prompt:\n {user_prompt}\n\n"
            "- Use user provided examples as guidence in the classification process:\n\n {few_shot_examples}\n"
            "- Examples to classify:\n{examples}\n\n"
            "- Think step by step then classify the examples"
        )
    )

# `few_shot_examples` is built above and only ever holds formatted strings, so
# the former str / list-of-dict branches could never run; a plain join suffices.
formatted_few_shot = "\n".join(few_shot_examples)
# ---------------------------------------------------------------------------
# Prompt preview, labeling run, export and footer.
# NOTE(review): shown flattened to column 0; the labeling part presumably sits
# inside an enclosing task branch opened earlier in the file -- re-indent to
# the surrounding nesting when merging.
# ---------------------------------------------------------------------------
is_ner = classification_type == "Named Entity Recognition (NER)"

# Build the prompt exactly once so that the preview, the value kept in session
# state (and written to the CSV/JSON rows) and the text sent to the model are
# the same string. `few_shot_examples` holds plain strings, so the pre-joined
# `few_shot_text` is used rather than indexing each example like a dict.
prompt_inputs = {
    "system_role": st.session_state['system_role'],
    "labels": ", ".join(labels),
    "domain": domain,
    "few_shot_examples": few_shot_text,
    "examples": examples_text,
    "user_prompt": user_prompt,
}
if not is_ner:
    # Passed only for the non-NER template, as in the original labeling call.
    prompt_inputs["classification_type"] = classification_type
system_prompt = label_prompt_template.format(**prompt_inputs)
st.session_state['system_prompt'] = system_prompt

st.write("System Prompt:")
st.text_area("System Prompt", value=st.session_state['system_prompt'], height=300, disabled=True)

if st.button("🏷️ Label Data"):
    if examples_to_classify:
        with st.spinner("Labeling data..."):
            try:
                stream = client.chat.completions.create(
                    model=selected_model,
                    messages=[{"role": "system", "content": system_prompt}],
                    temperature=temperature,
                    stream=True,
                    max_tokens=4000,
                    top_p=0.9,
                )

                # Log the exchange in the chat history.
                st.session_state.messages.append({"role": "user", "content": system_prompt})
                response = st.write_stream(stream)
                st.session_state.messages.append({"role": "assistant", "content": response})

                # Always keep the latest response (previously only the first
                # one of the session was ever stored).
                st.session_state.response = response
                for state_key, initial_value in (
                    ('generated_examples', []),
                    ('generated_examples_csv', None),
                    ('generated_examples_json', None),
                ):
                    if state_key not in st.session_state:
                        st.session_state[state_key] = initial_value

                few_shot_flag = 'Yes' if use_few_shot else 'No'
                labeled_examples = []
                if is_ner:
                    # NER output is grouped free text; keep it as a single record.
                    labeled_examples = [{
                        'ner_output': response.strip(),
                        'system_prompt': st.session_state.system_prompt,
                        'system_role': st.session_state.system_role,
                        'task_type': 'Named Entity Recognition (NER)',
                        'Use few-shot example?': few_shot_flag,
                    }]
                else:
                    # Expected line shape: "<text> Label: <label>"; split on the
                    # last "Label:" so the text itself may contain that word.
                    for line in response.split('\n'):
                        text, separator, label = line.rpartition('Label:')
                        text, label = text.strip(), label.strip()
                        if separator and text and label:
                            labeled_examples.append({
                                'text': text,
                                'label': label,
                                'system_prompt': st.session_state.system_prompt,
                                'system_role': st.session_state.system_role,
                                'task_type': 'Data Labeling',
                                'Use few-shot example?': few_shot_flag,
                            })

                # Save and provide download options
                if labeled_examples:
                    st.session_state.labeled_examples = labeled_examples
                    df = pd.DataFrame(labeled_examples)

                    st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')
                    st.session_state.labeled_examples_json = json.dumps({
                        "metadata": {
                            "domain": domain,
                            "labels": labels,
                            "used_few_shot": use_few_shot,
                            # Record the task that actually ran instead of a
                            # hard-coded NER label.
                            "task_type": classification_type,
                            "timestamp": datetime.now().isoformat()
                        },
                        "examples": labeled_examples
                    }, indent=2).encode('utf-8')

                    # Download buttons
                    st.download_button(
                        "📥 Download Labeled Examples (CSV)",
                        st.session_state.labeled_examples_csv,
                        "labeled_examples.csv",
                        "text/csv",
                        key='download-labeled-csv'
                    )
                    st.markdown("""

. . . . . . or

""", unsafe_allow_html=True)
                    st.download_button(
                        "📥 Download Labeled Examples (JSON)",
                        st.session_state.labeled_examples_json,
                        "labeled_examples.json",
                        "application/json",
                        key='download-labeled-json'
                    )

                    # Display the labeled examples
                    st.markdown("##### 📋 Labeled Examples Preview")
                    st.dataframe(df, use_container_width=True)

                # Follow-up options. The radio is restored because the
                # "Continue" handler reads `follow_up`, which was otherwise
                # undefined.
                # NOTE(review): these widgets live inside the "Label Data"
                # button branch, which is only entered on the run triggered by
                # that click -- consider moving them behind a session-state
                # flag so that "Continue" can actually be acted on.
                follow_up = st.radio(
                    "What would you like to do next?",
                    ["Label more data", "Data Generation"],
                    key="labeling_follow_up"
                )
                if st.button("Continue"):
                    if follow_up == "Label more data":
                        st.session_state.examples_to_classify = []
                        st.rerun()
                    elif follow_up == "Data Generation":
                        # NOTE(review): this selects "Data Labeling" although
                        # the option reads "Data Generation" -- confirm the
                        # intended task_choice value.
                        st.session_state.task_choice = "Data Labeling"
                        st.rerun()

            except Exception as e:
                st.error("An error occurred during labeling.")
                st.error(f"Details: {e}")
    else:
        st.warning("Please enter at least one example to classify.")

# Footer
st.markdown("---")
st.markdown(
    """

Made with ❤️ by Wedyan AlSakran 2025

""",
    unsafe_allow_html=True
)

📘 Instructions

Current Model: {selected_model}