Data_Generation_LabelingCopy

Running

App Files Files

Wedyan2023 commited on Nov 3, 2024

Commit

f45bbe4

verified ·

1 Parent(s): 07c4ad3

Create app8.py

Browse files

Files changed (1) hide show

app8.py +394 -0

app8.py ADDED Viewed

	@@ -0,0 +1,394 @@

+import streamlit as st
+import pandas as pd
+import os
+from datetime import datetime
+import random
+from pathlib import Path
+from openai import OpenAI
+from dotenv import load_dotenv
+from langchain_core.prompts import PromptTemplate
+# Load environment variables
+load_dotenv()
+##openai_api_key = os.getenv("OPENAI_API_KEY")
+# Initialize the client
+client = OpenAI(
+    base_url="https://api-inference.huggingface.co/v1",
+    api_key=os.environ.get('TOKEN2')  # Add your Huggingface token here
+)
+# Initialize OpenAI client
+##client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
+# Custom CSS for better appearance
+st.markdown("""
+<style>
+    .stButton > button {
+        width: 100%;
+        margin-bottom: 10px;
+        background-color: #4CAF50;
+        color: white;
+        border: none;
+        padding: 10px;
+        border-radius: 5px;
+    }
+    .task-button {
+        background-color: #2196F3 !important;
+    }
+    .stSelectbox {
+        margin-bottom: 20px;
+    }
+    .output-container {
+        padding: 20px;
+        border-radius: 5px;
+        border: 1px solid #ddd;
+        margin: 10px 0;
+    }
+    .status-container {
+        padding: 10px;
+        border-radius: 5px;
+        margin: 10px 0;
+    }
+    .sidebar-info {
+        padding: 10px;
+        background-color: #f0f2f6;
+        border-radius: 5px;
+        margin: 10px 0;
+    }
+</style>
+""", unsafe_allow_html=True)
+# Create data directories if they don't exist
+if not os.path.exists('data'):
+    os.makedirs('data')
+def read_csv_with_encoding(file):
+    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
+    for encoding in encodings:
+        try:
+            return pd.read_csv(file, encoding=encoding)
+        except UnicodeDecodeError:
+            continue
+    raise UnicodeDecodeError("Failed to read file with any supported encoding")
+def save_to_csv(data, filename):
+    df = pd.DataFrame(data)
+    df.to_csv(f'data/{filename}', index=False)
+    return df
+def load_from_csv(filename):
+    try:
+        return pd.read_csv(f'data/{filename}')
+    except:
+        return pd.DataFrame()
+# Define reset function
+def reset_conversation():
+    st.session_state.conversation = []
+    st.session_state.messages = []
+# Initialize session state
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+# Main app title
+st.title("🤖 Text Data Generation & Labeling App")
+# Sidebar settings
+with st.sidebar:
+    st.title("⚙️ Settings")
+    selected_model = st.selectbox(
+        "Select Model",
+        ["meta-llama/Meta-Llama-3-8B-Instruct"],
+        key='model_select'
+    )
+    temperature = st.slider(
+        "Temperature",
+        0.0, 1.0, 0.5,
+        help="Controls randomness in generation"
+    )
+    st.button("🔄 Reset Conversation", on_click=reset_conversation)
+    with st.container():
+        st.markdown("""
+        <div class="sidebar-info">
+            <h4>Current Model: {}</h4>
+            <p><em>Note: Generated content may be inaccurate or false.</em></p>
+        </div>
+        """.format(selected_model), unsafe_allow_html=True)
+# Main content
+col1, col2 = st.columns(2)
+with col1:
+    if st.button("📝 Data Generation", key="gen_button", help="Generate new data"):
+        st.session_state.task_choice = "Data Generation"
+with col2:
+    if st.button("🏷️ Data Labeling", key="label_button", help="Label existing data"):
+        st.session_state.task_choice = "Data Labeling"
+if "task_choice" in st.session_state:
+    if st.session_state.task_choice == "Data Generation":
+        st.header("📝 Data Generation")
+        classification_type = st.selectbox(
+            "Classification Type",
+            ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification"]
+        )
+        if classification_type == "Sentiment Analysis":
+            labels = ["Positive", "Negative", "Neutral"]
+        elif classification_type == "Binary Classification":
+            col1, col2 = st.columns(2)
+            with col1:
+                label_1 = st.text_input("First class", "Positive")
+            with col2:
+                label_2 = st.text_input("Second class", "Negative")
+            labels = [label_1, label_2] if label_1 and label_2 else ["Positive", "Negative"]
+        else:
+            num_classes = st.slider("Number of classes", 3, 10, 3)
+            labels = []
+            cols = st.columns(3)
+            for i in range(num_classes):
+                with cols[i % 3]:
+                    label = st.text_input(f"Class {i+1}", f"Class_{i+1}")
+                    labels.append(label)
+        domain = st.selectbox("Domain", ["Restaurant reviews", "E-commerce reviews", "Custom"])
+        if domain == "Custom":
+            domain = st.text_input("Specify custom domain")
+        col1, col2 = st.columns(2)
+        with col1:
+            min_words = st.number_input("Min words", 10, 90, 20)
+        with col2:
+            max_words = st.number_input("Max words", min_words, 90, 50)
+        use_few_shot = st.toggle("Use few-shot examples")
+        few_shot_examples = []
+        if use_few_shot:
+            num_examples = st.slider("Number of few-shot examples", 1, 5, 1)
+            for i in range(num_examples):
+                with st.expander(f"Example {i+1}"):
+                    content = st.text_area(f"Content", key=f"few_shot_content_{i}")
+                    label = st.selectbox(f"Label", labels, key=f"few_shot_label_{i}")
+                    if content and label:
+                        few_shot_examples.append({"content": content, "label": label})
+        num_to_generate = st.number_input("Number of examples", 1, 100, 10)
+        user_prompt = st.text_area("Additional instructions (optional)")
+        # Updated prompt template with word length constraints
+        prompt_template = PromptTemplate(
+            input_variables=["classification_type", "domain", "num_examples", "min_words", "max_words", "labels", "user_prompt"],
+            template=(
+                "You are a professional {classification_type} expert tasked with generating examples for {domain}.\n"
+                "Use the following parameters:\n"
+                "- Generate exactly {num_examples} examples\n"
+                "- Each example MUST be between {min_words} and {max_words} words long\n"
+                "- Use these labels: {labels}\n"
+                "- Generate the examples in this format: 'Example text. Label: [label]'\n"
+                "- Do not include word counts or any additional information\n"
+                "Additional instructions: {user_prompt}\n\n"
+                "Generate numbered examples:"
+            )
+        )
+        col1, col2 = st.columns(2)
+        with col1:
+            if st.button("🎯 Generate Examples"):
+                with st.spinner("Generating examples..."):
+                    system_prompt = prompt_template.format(
+                        classification_type=classification_type,
+                        domain=domain,
+                        num_examples=num_to_generate,
+                        min_words=min_words,
+                        max_words=max_words,
+                        labels=", ".join(labels),
+                        user_prompt=user_prompt
+                    )
+                    try:
+                        stream = client.chat.completions.create(
+                            model=selected_model,
+                            messages=[{"role": "system", "content": system_prompt}],
+                            temperature=temperature,
+                            stream=True,
+                            max_tokens=3000,
+                        )
+                        response = st.write_stream(stream)
+                        st.session_state.messages.append({"role": "assistant", "content": response})
+                    except Exception as e:
+                        st.error("An error occurred during generation.")
+                        st.error(f"Details: {e}")
+        with col2:
+            if st.button("🔄 Regenerate"):
+                st.session_state.messages = st.session_state.messages[:-1] if st.session_state.messages else []
+                with st.spinner("Regenerating examples..."):
+                    system_prompt = prompt_template.format(
+                        classification_type=classification_type,
+                        domain=domain,
+                        num_examples=num_to_generate,
+                        min_words=min_words,
+                        max_words=max_words,
+                        labels=", ".join(labels),
+                        user_prompt=user_prompt
+                    )
+                    try:
+                        stream = client.chat.completions.create(
+                            model=selected_model,
+                            messages=[{"role": "system", "content": system_prompt}],
+                            temperature=temperature,
+                            stream=True,
+                            max_tokens=3000,
+                        )
+                        response = st.write_stream(stream)
+                        st.session_state.messages.append({"role": "assistant", "content": response})
+                    except Exception as e:
+                        st.error("An error occurred during regeneration.")
+                        st.error(f"Details: {e}")
+    elif st.session_state.task_choice == "Data Labeling":
+        st.header("🏷️ Data Labeling")
+        classification_type = st.selectbox(
+            "Classification Type",
+            ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification"],
+            key="label_class_type"
+        )
+        if classification_type == "Sentiment Analysis":
+            labels = ["Positive", "Negative", "Neutral"]
+        elif classification_type == "Binary Classification":
+            col1, col2 = st.columns(2)
+            with col1:
+                label_1 = st.text_input("First class", "Positive", key="label_first")
+            with col2:
+                label_2 = st.text_input("Second class", "Negative", key="label_second")
+            labels = [label_1, label_2] if label_1 and label_2 else ["Positive", "Negative"]
+        else:
+            num_classes = st.slider("Number of classes", 3, 10, 3, key="label_num_classes")
+            labels = []
+            cols = st.columns(3)
+            for i in range(num_classes):
+                with cols[i % 3]:
+                    label = st.text_input(f"Class {i+1}", f"Class_{i+1}", key=f"label_class_{i}")
+                    labels.append(label)
+        use_few_shot = st.toggle("Use few-shot examples for labeling")
+        few_shot_examples = []
+        if use_few_shot:
+            num_few_shot = st.slider("Number of few-shot examples", 1, 5, 1)
+            for i in range(num_few_shot):
+                with st.expander(f"Few-shot Example {i+1}"):
+                    content = st.text_area(f"Content", key=f"label_few_shot_content_{i}")
+                    label = st.selectbox(f"Label", labels, key=f"label_few_shot_label_{i}")
+                    if content and label:
+                        few_shot_examples.append(f"{content}\nLabel: {label}")
+        num_examples = st.number_input("Number of examples to classify", 1, 100, 1)
+        examples_to_classify = []
+        if num_examples <= 20:
+            for i in range(num_examples):
+                example = st.text_area(f"Example {i+1}", key=f"example_{i}")
+                if example:
+                    examples_to_classify.append(example)
+        else:
+            examples_text = st.text_area(
+                "Enter examples (one per line)",
+                height=300,
+                help="Enter each example on a new line"
+            )
+            if examples_text:
+                examples_to_classify = [ex.strip() for ex in examples_text.split('\n') if ex.strip()]
+                if len(examples_to_classify) > num_examples:
+                    examples_to_classify = examples_to_classify[:num_examples]
+        user_prompt = st.text_area("Additional instructions (optional)", key="label_instructions")
+        # Updated prompt template for labeling
+        few_shot_text = "\n\n".join(few_shot_examples) if few_shot_examples else ""
+        examples_text = "\n".join(f"{i+1}. {ex}" for i, ex in enumerate(examples_to_classify))
+        label_prompt_template = PromptTemplate(
+    input_variables=["classification_type", "labels", "few_shot_examples", "examples", "user_prompt"],
+    template=(
+        "You are a professional {classification_type} expert. Classify the following examples using these labels: {labels}.\n"
+        "Instructions:\n"
+        "- Return ONLY the numbered example followed by its classification\n"
+        "- Use the format: 'Example text. Label: [label]'\n"
+        "- Do not provide explanations or justifications\n"
+        "{user_prompt}\n\n"
+        "Few-shot examples:\n{few_shot_examples}\n\n"
+        "Examples to classify:\n{examples}\n\n"
+        "Output:\n"
+    )
+)
+        col1, col2 = st.columns(2)
+        with col1:
+            if st.button("🏷️ Label Data"):
+                if examples_to_classify:
+                    with st.spinner("Labeling data..."):
+                        system_prompt = label_prompt_template.format(
+                            classification_type=classification_type,
+                            labels=", ".join(labels),
+                            few_shot_examples=few_shot_text,
+                            examples=examples_text,
+                            user_prompt=user_prompt
+                        )
+                        try:
+                            stream = client.chat.completions.create(
+                                model=selected_model,
+                                messages=[{"role": "system", "content": system_prompt}],
+                                temperature=temperature,
+                                stream=True,
+                                max_tokens=3000,
+                            )
+                            response = st.write_stream(stream)
+                            st.session_state.messages.append({"role": "assistant", "content": response})
+                        except Exception as e:
+                            st.error("An error occurred during labeling.")
+                            st.error(f"Details: {e}")
+                else:
+                    st.warning("Please enter at least one example to classify.")
+        with col2:
+            if st.button("🔄 Relabel"):
+                if examples_to_classify:
+                    st.session_state.messages = st.session_state.messages[:-1] if st.session_state.messages else []
+                    with st.spinner("Relabeling data..."):
+                        system_prompt = label_prompt_template.format(
+                            classification_type=classification_type,
+                            labels=", ".join(labels),
+                            few_shot_examples=few_shot_text,
+                            examples=examples_text,
+                            user_prompt=user_prompt
+                        )
+                        try:
+                            stream = client.chat.completions.create(
+                                model=selected_model,
+                                messages=[{"role": "system", "content": system_prompt}],
+                                temperature=temperature,
+                                stream=True,
+                                max_tokens=3000,
+                            )
+                            response = st.write_stream(stream)
+                            st.session_state.messages.append({"role": "assistant", "content": response})
+                        except Exception as e:
+                            st.error("An error occurred during relabeling.")
+                            st.error(f"Details: {e}")
+                else:
+                    st.warning("Please enter at least one example to classify.")
+if st.session_state.messages:
+    st.markdown("### Output:")
+    for message in st.session_state.messages[-1:]:
+        st.markdown(message["content"])