Update app110.py
Browse files
app110.py
CHANGED
@@ -47,124 +47,8 @@ completion = client.chat.completions.create(
|
|
47 |
)
|
48 |
|
49 |
print(completion.choices[0].message)
|
50 |
-
|
51 |
-
|
52 |
-
# from openai import OpenAI
|
53 |
-
|
54 |
-
# client = OpenAI(
|
55 |
-
# base_url="https://router.huggingface.co/together/v1",
|
56 |
-
# #api_key="hf_XXXXX",
|
57 |
-
# api_key=os.environ.get('TOKEN2'), # Hugging Face API token
|
58 |
-
# )
|
59 |
-
# #meta-llama/Meta-Llama-3-8B-Instruct
|
60 |
-
# completion = client.chat.completions.create(
|
61 |
-
# #model="meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
|
62 |
-
# model="meta-llama/Meta-Llama-3-8B-Instruct",
|
63 |
-
# messages=[
|
64 |
-
# {
|
65 |
-
# "role": "user",
|
66 |
-
# "content": "What is the capital of France?"
|
67 |
-
# }
|
68 |
-
# ],
|
69 |
-
# )
|
70 |
-
|
71 |
-
#print(completion.choices[0].message)
|
72 |
-
#####
|
73 |
-
##########################################################3
|
74 |
-
# import streamlit as st
|
75 |
-
# from transformers import AutoModelForCausalLM, AutoTokenizer
|
76 |
-
# import torch
|
77 |
-
|
78 |
-
# # Model selection dropdown
|
79 |
-
# selected_model = st.selectbox(
|
80 |
-
# "Select Model",
|
81 |
-
# ["meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
|
82 |
-
# "meta-llama/Llama-3.3-70B-Instruct",
|
83 |
-
# "meta-llama/Llama-3.2-3B-Instruct",
|
84 |
-
# "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
85 |
-
# "meta-llama/Meta-Llama-3-8B-Instruct",
|
86 |
-
# "meta-llama/Llama-3.1-70B-Instruct"],
|
87 |
-
# key='model_select'
|
88 |
-
# )
|
89 |
-
|
90 |
-
# @st.cache_resource # Cache the model to prevent reloading
|
91 |
-
# def load_model(model_name):
|
92 |
-
# try:
|
93 |
-
# # Optimized model loading configuration
|
94 |
-
# model = AutoModelForCausalLM.from_pretrained(
|
95 |
-
# model_name,
|
96 |
-
# torch_dtype=torch.float16, # Use half precision
|
97 |
-
# device_map="auto", # Automatic device mapping
|
98 |
-
# load_in_8bit=True, # Enable 8-bit quantization
|
99 |
-
# low_cpu_mem_usage=True, # Optimize CPU memory usage
|
100 |
-
# max_memory={0: "10GB"} # Limit GPU memory usage
|
101 |
-
# )
|
102 |
-
|
103 |
-
# tokenizer = AutoTokenizer.from_pretrained(
|
104 |
-
# model_name,
|
105 |
-
# padding_side="left",
|
106 |
-
# truncation_side="left"
|
107 |
-
# )
|
108 |
-
|
109 |
-
# return model, tokenizer
|
110 |
-
|
111 |
-
# except Exception as e:
|
112 |
-
# st.error(f"Error loading model: {str(e)}")
|
113 |
-
# return None, None
|
114 |
-
|
115 |
-
# # Load the selected model with optimizations
|
116 |
-
# if selected_model:
|
117 |
-
# model, tokenizer = load_model(selected_model)
|
118 |
-
|
119 |
-
# # Check if model loaded successfully
|
120 |
-
# if model is not None:
|
121 |
-
# st.success(f"Successfully loaded {selected_model}")
|
122 |
-
# else:
|
123 |
-
# st.warning("Please select a different model or check your hardware capabilities")
|
124 |
-
|
125 |
-
# # Function to generate text
|
126 |
-
# def generate_response(prompt, model, tokenizer):
|
127 |
-
# try:
|
128 |
-
# inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
129 |
-
|
130 |
-
# with torch.no_grad():
|
131 |
-
# outputs = model.generate(
|
132 |
-
# inputs["input_ids"],
|
133 |
-
# max_length=256,
|
134 |
-
# num_return_sequences=1,
|
135 |
-
# temperature=0.7,
|
136 |
-
# do_sample=True,
|
137 |
-
# pad_token_id=tokenizer.pad_token_id
|
138 |
-
# )
|
139 |
-
|
140 |
-
# response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
141 |
-
# return response
|
142 |
-
|
143 |
-
# except Exception as e:
|
144 |
-
# return f"Error generating response: {str(e)}"
|
145 |
-
############################################################
|
146 |
-
|
147 |
-
####new
|
148 |
-
# from openai import OpenAI
|
149 |
-
|
150 |
-
# client = OpenAI(
|
151 |
-
# base_url="https://router.huggingface.co/together/v1",
|
152 |
-
# api_key=os.environ.get('TOKEN2'),
|
153 |
-
# )
|
154 |
-
|
155 |
-
# completion = client.chat.completions.create(
|
156 |
-
# model="meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
|
157 |
-
# messages=[
|
158 |
-
# {
|
159 |
-
# "role": "user",
|
160 |
-
# "content": "What is the capital of France?"
|
161 |
-
# }
|
162 |
-
# ],
|
163 |
-
# max_tokens=512,
|
164 |
-
# )
|
165 |
-
|
166 |
-
# print(completion.choices[0].message)
|
167 |
-
#####
|
168 |
|
169 |
# Create necessary directories
|
170 |
for dir_name in ['data', 'feedback']:
|
@@ -228,14 +112,7 @@ def read_csv_with_encoding(file):
|
|
228 |
continue
|
229 |
raise UnicodeDecodeError("Failed to read file with any supported encoding")
|
230 |
|
231 |
-
#
|
232 |
-
#feedback_file = 'feedback/user_feedback.csv'
|
233 |
-
#feedback_df = pd.DataFrame([feedback_data])
|
234 |
-
|
235 |
-
#if os.path.exists(feedback_file):
|
236 |
-
#feedback_df.to_csv(feedback_file, mode='a', header=False, index=False)
|
237 |
-
#else:
|
238 |
-
#feedback_df.to_csv(feedback_file, index=False)
|
239 |
|
240 |
def reset_conversation():
|
241 |
st.session_state.conversation = []
|
@@ -259,16 +136,7 @@ if "system_role" not in st.session_state:
|
|
259 |
|
260 |
# Main app title
|
261 |
st.title("🤖🦙 Text Data Labeling and Generation App")
|
262 |
-
|
263 |
-
# with open(pdf_path, "rb") as f:
|
264 |
-
# base64_pdf = base64.b64encode(f.read()).decode('utf-8')
|
265 |
-
# pdf_display = f"""
|
266 |
-
# <iframe src="data:application/pdf;base64,{base64_pdf}"
|
267 |
-
# width="100%" height="400" type="application/pdf"></iframe>
|
268 |
-
# """
|
269 |
-
# st.markdown(pdf_display, unsafe_allow_html=True)
|
270 |
-
#
|
271 |
-
|
272 |
|
273 |
# Sidebar settings
|
274 |
with st.sidebar:
|
@@ -295,84 +163,7 @@ with st.sidebar:
|
|
295 |
key='model_select'
|
296 |
)
|
297 |
|
298 |
-
|
299 |
-
|
300 |
-
# # Model selection dropdown
|
301 |
-
# selected_model = st.selectbox(
|
302 |
-
# "Select Model",
|
303 |
-
# [#"meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
|
304 |
-
# "meta-llama/Llama-3.2-3B-Instruct",
|
305 |
-
# "meta-llama/Llama-3.3-70B-Instruct",
|
306 |
-
# "meta-llama/Llama-3.2-3B-Instruct",
|
307 |
-
# "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
308 |
-
# "meta-llama/Meta-Llama-3-8B-Instruct",
|
309 |
-
# "meta-llama/Llama-3.1-70B-Instruct"],
|
310 |
-
# key='model_select'
|
311 |
-
# )
|
312 |
-
|
313 |
-
# @st.cache_resource # Cache the model to prevent reloading
|
314 |
-
# def load_model(model_name):
|
315 |
-
# try:
|
316 |
-
# # Optimized model loading configuration
|
317 |
-
# model = AutoModelForCausalLM.from_pretrained(
|
318 |
-
# model_name,
|
319 |
-
# torch_dtype=torch.float16, # Use half precision
|
320 |
-
# device_map="auto", # Automatic device mapping
|
321 |
-
# load_in_8bit=True, # Enable 8-bit quantization
|
322 |
-
# low_cpu_mem_usage=True, # Optimize CPU memory usage
|
323 |
-
# max_memory={0: "10GB"} # Limit GPU memory usage
|
324 |
-
# )
|
325 |
-
|
326 |
-
# tokenizer = AutoTokenizer.from_pretrained(
|
327 |
-
# model_name,
|
328 |
-
# padding_side="left",
|
329 |
-
# truncation_side="left"
|
330 |
-
# )
|
331 |
-
|
332 |
-
# return model, tokenizer
|
333 |
-
|
334 |
-
# except Exception as e:
|
335 |
-
# st.error(f"Error loading model: {str(e)}")
|
336 |
-
# return None, None
|
337 |
-
|
338 |
-
# # Load the selected model with optimizations
|
339 |
-
# if selected_model:
|
340 |
-
# model, tokenizer = load_model(selected_model)
|
341 |
-
|
342 |
-
# # Check if model loaded successfully
|
343 |
-
# if model is not None:
|
344 |
-
# st.success(f"Successfully loaded {selected_model}")
|
345 |
-
# else:
|
346 |
-
# st.warning("Please select a different model or check your hardware capabilities")
|
347 |
-
|
348 |
-
# # Function to generate text
|
349 |
-
# def generate_response(prompt, model, tokenizer):
|
350 |
-
# try:
|
351 |
-
# inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
352 |
-
|
353 |
-
# with torch.no_grad():
|
354 |
-
# outputs = model.generate(
|
355 |
-
# inputs["input_ids"],
|
356 |
-
# max_length=256,
|
357 |
-
# num_return_sequences=1,
|
358 |
-
# temperature=0.7,
|
359 |
-
# do_sample=True,
|
360 |
-
# pad_token_id=tokenizer.pad_token_id
|
361 |
-
# )
|
362 |
-
|
363 |
-
# response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
364 |
-
# return response
|
365 |
-
|
366 |
-
# except Exception as e:
|
367 |
-
# return f"Error generating response: {str(e)}"
|
368 |
-
# ################
|
369 |
-
|
370 |
-
# model = AutoModelForCausalLM.from_pretrained(
|
371 |
-
# "meta-llama/Meta-Llama-3-8B-Instruct",
|
372 |
-
# torch_dtype=torch.float16, # Use half precision
|
373 |
-
# device_map="auto", # Automatic device mapping
|
374 |
-
# load_in_8bit=True # Load in 8-bit precision
|
375 |
-
# )
|
376 |
temperature = st.slider(
|
377 |
"Temperature",
|
378 |
0.0, 1.0, 0.7,
|
@@ -615,21 +406,7 @@ if "task_choice" in st.session_state:
|
|
615 |
|
616 |
)
|
617 |
)
|
618 |
-
|
619 |
-
# "{system_role}\n"
|
620 |
-
# "- Use the following parameters:\n"
|
621 |
-
# "- Generate {num_examples} examples\n"
|
622 |
-
# "- Each example should be between {min_words} to {max_words} words long\n"
|
623 |
-
# "- Use these labels: {labels}.\n"
|
624 |
-
# "- Use the following additional attributes:\n"
|
625 |
-
# "{additional_attributes}\n"
|
626 |
-
# #"- Format each example like this: 'Example text. Label: [label]. Attribute1: [topic1]. Attribute2: [topic2]'\n"
|
627 |
-
# "- Generate the examples in this format: 'Example text. Label: label'\n"
|
628 |
-
# "- Additional instructions: {user_prompt}\n"
|
629 |
-
# "- Use these few-shot examples if provided:\n{few_shot_examples}\n"
|
630 |
-
# "- Think step by step and ensure examples are unique and not repeated."
|
631 |
-
# )
|
632 |
-
# )
|
633 |
##########new 22/4/2025
|
634 |
formatted_attributes = "\n".join([
|
635 |
f"- {attr['attribute']}: {', '.join(attr['topics'])}" for attr in additional_attributes
|
@@ -669,16 +446,7 @@ if "task_choice" in st.session_state:
|
|
669 |
st.warning("Class names must be unique.")
|
670 |
elif any(not lbl.strip() for lbl in labels):
|
671 |
st.warning("All class labels must be filled in.")
|
672 |
-
|
673 |
-
#st.success("Generating examples for domain: {domain}")
|
674 |
-
|
675 |
-
#if not custom_domain_valid:
|
676 |
-
#st.warning("Custom domain name is required.")
|
677 |
-
#elif not labels_valid:
|
678 |
-
#st.warning("Please fix the label errors before generating examples.")
|
679 |
-
#else:
|
680 |
-
# Proceed to generate examples
|
681 |
-
#st.success(f"Generating examples for domain: {domain}")
|
682 |
|
683 |
with st.spinner("Generating examples..."):
|
684 |
try:
|
@@ -694,7 +462,7 @@ if "task_choice" in st.session_state:
|
|
694 |
#frequency_penalty=0.5, # Discourages frequent words
|
695 |
#presence_penalty=0.6,
|
696 |
)
|
697 |
-
|
698 |
#new 24 march
|
699 |
st.session_state.messages.append({"role": "user", "content": system_prompt})
|
700 |
# # ####################
|
@@ -734,18 +502,7 @@ if "task_choice" in st.session_state:
|
|
734 |
'Use few-shot example?': 'Yes' if use_few_shot else 'No',
|
735 |
})
|
736 |
|
737 |
-
|
738 |
-
# 'text': text,
|
739 |
-
# 'label': label,
|
740 |
-
# 'system_prompt': st.session_state.system_prompt,
|
741 |
-
# 'system_role': st.session_state.system_role,
|
742 |
-
# 'task_type': 'Data Generation',
|
743 |
-
# 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
|
744 |
-
# }
|
745 |
-
# for attr in additional_attributes:
|
746 |
-
# example_dict[attr['attribute']] = random.choice(attr['topics'])
|
747 |
-
|
748 |
-
# examples_list.append(example_dict)
|
749 |
|
750 |
|
751 |
if examples_list:
|
@@ -778,9 +535,9 @@ if "task_choice" in st.session_state:
|
|
778 |
"application/json",
|
779 |
key='download-json-persistent'
|
780 |
)
|
781 |
-
#
|
782 |
-
|
783 |
-
|
784 |
|
785 |
if st.button("Continue"):
|
786 |
if follow_up == "Generate more examples":
|
@@ -1003,78 +760,8 @@ if "task_choice" in st.session_state:
|
|
1003 |
|
1004 |
if not labels:
|
1005 |
st.warning("Please select at least one entity type.")
|
1006 |
-
labels = ["PERSON"]
|
1007 |
-
|
1008 |
-
##########
|
1009 |
-
|
1010 |
-
# # Extract just the entity type (before the dash)
|
1011 |
-
# labels = [entity.split(" - ")[0] for entity in selected_entities]
|
1012 |
-
|
1013 |
-
# if not labels:
|
1014 |
-
# st.warning("Please select at least one entity type")
|
1015 |
-
# labels = ["PERSON"] # Default if nothing selected
|
1016 |
-
|
1017 |
-
|
1018 |
-
|
1019 |
-
|
1020 |
-
|
1021 |
-
#NNew edit
|
1022 |
-
# elif classification_type == "Multi-Class Classification":
|
1023 |
-
# st.write("### Multi-Class Classification Labels")
|
1024 |
-
|
1025 |
-
# default_labels_by_domain = {
|
1026 |
-
# "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
|
1027 |
-
# "AG News": ["World", "Sports", "Business", "Sci/Tech"],
|
1028 |
-
# "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
|
1029 |
-
# "Food & Dining", "Local Experience", "Adventure Activities",
|
1030 |
-
# "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
|
1031 |
-
# "Luxury Tourism"],
|
1032 |
-
# "Restaurant reviews": ["Italian", "French", "American"]
|
1033 |
-
# }
|
1034 |
-
# num_classes = st.slider("Number of classes", 3, 10, 3)
|
1035 |
-
|
1036 |
-
# # Get defaults for selected domain, or empty list
|
1037 |
-
# defaults = default_labels_by_domain.get(domain, [])
|
1038 |
-
|
1039 |
-
# labels = []
|
1040 |
-
# errors = []
|
1041 |
-
# cols = st.columns(3)
|
1042 |
-
|
1043 |
-
# for i in range(num_classes):
|
1044 |
-
# with cols[i % 3]:
|
1045 |
-
# default_value = defaults[i] if i < len(defaults) else ""
|
1046 |
-
# label_input = st.text_input(f"Class {i+1}", default_value)
|
1047 |
-
# normalized_label = label_input.strip().title()
|
1048 |
-
|
1049 |
-
# if not normalized_label:
|
1050 |
-
# errors.append(f"Class {i+1} name is required.")
|
1051 |
-
# else:
|
1052 |
-
# labels.append(normalized_label)
|
1053 |
-
|
1054 |
-
# # Check for duplicates (case-insensitive)
|
1055 |
-
# if len(labels) != len(set(labels)):
|
1056 |
-
# errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).")
|
1057 |
-
|
1058 |
-
# # Show validation results
|
1059 |
-
# if errors:
|
1060 |
-
# for error in errors:
|
1061 |
-
# st.error(error)
|
1062 |
-
# else:
|
1063 |
-
# st.success("All Labels names are valid and unique!")
|
1064 |
-
# labels_valid = not errors # Will be True only if there are no label errors
|
1065 |
-
|
1066 |
-
|
1067 |
-
|
1068 |
-
|
1069 |
-
# else:
|
1070 |
-
# num_classes = st.slider("Number of classes", 3, 23, 3, key="label_num_classes")
|
1071 |
-
# labels = []
|
1072 |
-
# cols = st.columns(3)
|
1073 |
-
# for i in range(num_classes):
|
1074 |
-
# with cols[i % 3]:
|
1075 |
-
# label = st.text_input(f"Class {i+1}", f"Class_{i+1}", key=f"label_class_{i}")
|
1076 |
-
# labels.append(label)
|
1077 |
-
|
1078 |
use_few_shot = st.toggle("Use few-shot examples for labeling")
|
1079 |
few_shot_examples = []
|
1080 |
if use_few_shot:
|
@@ -1127,78 +814,8 @@ if "task_choice" in st.session_state:
|
|
1127 |
|
1128 |
# Customize prompt template based on classification type
|
1129 |
if classification_type == "Named Entity Recognition (NER)":
|
1130 |
-
|
1131 |
-
|
1132 |
-
# template=(
|
1133 |
-
# "{system_role}\n"
|
1134 |
-
# #"- You are a professional Named Entity Recognition (NER) expert in {domain} domain. Your role is to identify and extract the following entity types: {labels}.\n"
|
1135 |
-
# "- For each text example provided, identify all entities of the requested types.\n"
|
1136 |
-
# "- Use the following entities: {labels}.\n"
|
1137 |
-
# "- Return each example followed by the entities you found in this format: 'Example text.\n \n Entities:\n [ENTITY_TYPE: entity text\n\n, ENTITY_TYPE: entity text\n\n, ...] or [No entities found]'\n"
|
1138 |
-
# "- If no entities of the requested types are found, indicate 'No entities found' in this text.\n"
|
1139 |
-
# "- Be precise about entity boundaries - don't include unnecessary words.\n"
|
1140 |
-
# "- Do not provide any additional information or explanations.\n"
|
1141 |
-
# "- Additional instructions:\n {user_prompt}\n\n"
|
1142 |
-
# "- Use user few-shot examples as guidance if provided:\n{few_shot_examples}\n\n"
|
1143 |
-
# "- Examples to analyze:\n{examples}\n\n"
|
1144 |
-
# "Output:\n"
|
1145 |
-
# )
|
1146 |
-
# )
|
1147 |
-
#new 22/4/2025
|
1148 |
-
# label_prompt_template = PromptTemplate(
|
1149 |
-
# input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
|
1150 |
-
# template=(
|
1151 |
-
# "{system_role}\n"
|
1152 |
-
# "- You are performing Named Entity Recognition (NER) in the domain of {domain}.\n"
|
1153 |
-
# "- Use the following entity types: {labels}.\n\n"
|
1154 |
-
# "### Reasoning Steps:\n"
|
1155 |
-
# "1. Read the example carefully.\n"
|
1156 |
-
# "2. For each named entity mentioned, determine its meaning and role in the sentence.\n"
|
1157 |
-
# "3. Think about the **context**: Is it a physical location (LOC)? A geopolitical region (GPE)? A person (PERSON)?\n"
|
1158 |
-
# "4. Based on the definition of each label, assign the most **specific and correct** label.\n\n"
|
1159 |
-
# "For example:\n"
|
1160 |
-
# "- 'Mount Everest' → LOC (it's a mountain)\n"
|
1161 |
-
# "- 'France' → GPE (it's a country)\n"
|
1162 |
-
# "- 'Microsoft' → ORG\n"
|
1163 |
-
# "- 'John Smith' → PERSON\n\n"
|
1164 |
-
# "- Return each example followed by the entities you found in this format:\n"
|
1165 |
-
# "'Example text.'\nEntities: [ENTITY_TYPE: entity text, ENTITY_TYPE: entity text, ...] or [No entities found]\n"
|
1166 |
-
# "- If no entities of the requested types are found, return 'No entities found'.\n"
|
1167 |
-
# "- Be precise about entity boundaries - don't include extra words.\n"
|
1168 |
-
# "- Do not explain or justify your answers.\n\n"
|
1169 |
-
# "Additional instructions:\n{user_prompt}\n\n"
|
1170 |
-
# "Few-shot examples:\n{few_shot_examples}\n\n"
|
1171 |
-
# "Examples to label:\n{examples}\n"
|
1172 |
-
# "Output:\n"
|
1173 |
-
# )
|
1174 |
-
#)
|
1175 |
-
# label_prompt_template = PromptTemplate(
|
1176 |
-
# input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
|
1177 |
-
# template=(
|
1178 |
-
# "{system_role}\n"
|
1179 |
-
# "- You are an expert at Named Entity Recognition (NER) for domain: {domain}.\n"
|
1180 |
-
# "- Use these entity types: {labels}.\n\n"
|
1181 |
-
# "### Output Format:\n"
|
1182 |
-
# # "Return each example followed by the entities you found in this format: 'Example text.\n Entities:\n [ENTITY_TYPE: entity text\n\"
|
1183 |
-
# "Return each example followed by the entities you found in this format: 'Example text.\n 'Entity types:\n "Then group the entities under each label like this:\n" "
|
1184 |
-
# #"Then Start with this line exactly: 'Entity types\n'\n"
|
1185 |
-
# #"Then group the entities under each label like this:\n"
|
1186 |
-
# "\n PERSON – Angela Merkel, John Smith\n\n"
|
1187 |
-
# "\ ORG – Google, United Nations\n\n"
|
1188 |
-
# "\n DATE – January 1st, 2023\n\n"
|
1189 |
-
# "\n ... and so on.\n\n"
|
1190 |
-
# "If entity {labels} not found, do not write it in your response\n"
|
1191 |
-
# "- Do NOT output them inline after the text.\n"
|
1192 |
-
# "- Do NOT repeat the sentence.\n"
|
1193 |
-
# "- If no entities are found for a type, skip it.\n"
|
1194 |
-
# "- Keep the format consistent.\n\n"
|
1195 |
-
# "User Instructions:\n{user_prompt}\n\n"
|
1196 |
-
# "Few-shot Examples:\n{few_shot_examples}\n\n"
|
1197 |
-
# "Examples to analyze:\n{examples}"
|
1198 |
-
# )
|
1199 |
-
# )
|
1200 |
-
|
1201 |
-
|
1202 |
label_prompt_template = PromptTemplate(
|
1203 |
input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
|
1204 |
template=(
|
@@ -1257,20 +874,7 @@ if "task_choice" in st.session_state:
|
|
1257 |
formatted_few_shot = "\n".join([f"{ex['content']}\nLabel: {ex['label']}" for ex in few_shot_examples])
|
1258 |
else:
|
1259 |
formatted_few_shot = ""
|
1260 |
-
|
1261 |
-
# few_shot_examples = [
|
1262 |
-
# {"content": "Mount Everest is the tallest mountain in the world.", "label": "LOC: Mount Everest"},
|
1263 |
-
# {"content": "The President of the United States visited Paris last summer.", "label": "GPE: United States, GPE: Paris"},
|
1264 |
-
# {"content": "Amazon is expanding its offices in Berlin.", "label": "ORG: Amazon, GPE: Berlin"},
|
1265 |
-
# {"content": "J.K. Rowling wrote the Harry Potter books.", "label": "PERSON: J.K. Rowling"},
|
1266 |
-
# {"content": "Apple was founded in California in 1976.", "label": "ORG: Apple, GPE: California, DATE: 1976"},
|
1267 |
-
# {"content": "The Nile is the longest river in Africa.", "label": "LOC: Nile, GPE: Africa"},
|
1268 |
-
# {"content": "He arrived at 3 PM for the meeting.", "label": "TIME: 3 PM"},
|
1269 |
-
# {"content": "She bought the dress for $200.", "label": "MONEY: $200"},
|
1270 |
-
# {"content": "The event is scheduled for July 4th.", "label": "DATE: July 4th"},
|
1271 |
-
# {"content": "The World Health Organization is headquartered in Geneva.", "label": "ORG: World Health Organization, GPE: Geneva"}
|
1272 |
-
# ]
|
1273 |
-
# ###########
|
1274 |
# new 22/4/2025
|
1275 |
#formatted_few_shot = "\n".join([f"{ex['content']}\nEntities: [{ex['label']}]" for ex in few_shot_examples])
|
1276 |
formatted_few_shot = "\n\n".join([f"{ex['content']}\n\nEntity types\n{ex['label']}" for ex in few_shot_examples])
|
@@ -1308,69 +912,8 @@ if "task_choice" in st.session_state:
|
|
1308 |
few_shot_examples=few_shot_text,
|
1309 |
examples=examples_text,
|
1310 |
user_prompt=user_prompt
|
1311 |
-
|
1312 |
-
|
1313 |
-
)
|
1314 |
-
# if classification_type == "Named Entity Recognition (NER)":
|
1315 |
-
# # Step 1: Split the full response by example
|
1316 |
-
# raw_outputs = [block.strip() for block in response.strip().split("Entity types") if block.strip()]
|
1317 |
-
# inputs = [ex.strip() for ex in examples_to_classify]
|
1318 |
-
|
1319 |
-
# # Step 2: Match inputs with NER output blocks
|
1320 |
-
# labeled_examples = []
|
1321 |
-
# for i, (text, output_block) in enumerate(zip(inputs, raw_outputs)):
|
1322 |
-
# labeled_examples.append({
|
1323 |
-
# 'text': text,
|
1324 |
-
# 'entities': f"Entity types\n{output_block.strip()}",
|
1325 |
-
# 'system_prompt': st.session_state.system_prompt,
|
1326 |
-
# 'system_role': st.session_state.system_role,
|
1327 |
-
# 'task_type': 'Named Entity Recognition (NER)',
|
1328 |
-
# 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
|
1329 |
-
# })
|
1330 |
-
|
1331 |
-
# if classification_type == "Named Entity Recognition (NER)":
|
1332 |
-
# # Step 1: Split the full response by example
|
1333 |
-
# raw_outputs = [block.strip() for block in response.strip().split("Entity types") if block.strip()]
|
1334 |
-
# inputs = [ex.strip() for ex in examples_to_classify]
|
1335 |
-
|
1336 |
-
# # Step 2: Match inputs with NER output blocks
|
1337 |
-
# labeled_examples = []
|
1338 |
-
# for i, (text, output_block) in enumerate(zip(inputs, raw_outputs)):
|
1339 |
-
# labeled_examples.append({
|
1340 |
-
# 'text': text,
|
1341 |
-
# 'entities': f"Entity types\n{output_block.strip()}",
|
1342 |
-
# 'system_prompt': st.session_state.system_prompt,
|
1343 |
-
# 'system_role': st.session_state.system_role,
|
1344 |
-
# 'task_type': 'Named Entity Recognition (NER)',
|
1345 |
-
# 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
|
1346 |
-
# })
|
1347 |
-
|
1348 |
-
|
1349 |
-
# import re
|
1350 |
-
|
1351 |
-
# if classification_type == "Named Entity Recognition (NER)":
|
1352 |
-
# # Use regex to split on "Entity types" while keeping it attached to each block
|
1353 |
-
# blocks = re.split(r"(Entity types)", response.strip())
|
1354 |
-
|
1355 |
-
# # Recombine 'Entity types' with each block after splitting
|
1356 |
-
# raw_outputs = [
|
1357 |
-
# (blocks[i] + blocks[i+1]).strip()
|
1358 |
-
# for i in range(1, len(blocks) - 1, 2)
|
1359 |
-
# ]
|
1360 |
-
|
1361 |
-
# inputs = [ex.strip() for ex in examples_to_classify]
|
1362 |
-
|
1363 |
-
# labeled_examples = []
|
1364 |
-
# for i, (text, output_block) in enumerate(zip(inputs, raw_outputs)):
|
1365 |
-
# labeled_examples.append({
|
1366 |
-
# 'text': text,
|
1367 |
-
# 'entities': output_block,
|
1368 |
-
# 'system_prompt': st.session_state.system_prompt,
|
1369 |
-
# 'system_role': st.session_state.system_role,
|
1370 |
-
# 'task_type': 'Named Entity Recognition (NER)',
|
1371 |
-
# 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
|
1372 |
-
# })
|
1373 |
-
|
1374 |
|
1375 |
else:
|
1376 |
system_prompt = label_prompt_template.format(
|
@@ -1399,16 +942,7 @@ if "task_choice" in st.session_state:
|
|
1399 |
#################
|
1400 |
response = st.write_stream(stream)
|
1401 |
st.session_state.messages.append({"role": "assistant", "content": response})
|
1402 |
-
|
1403 |
-
# # Optional: If you want to add it as a chat-style message log
|
1404 |
-
# preview_str = st.session_state.labeled_preview.to_markdown(index=False)
|
1405 |
-
# st.session_state.messages.append({"role": "assistant", "content": f"Here is a preview of the labeled examples:\n\n{preview_str}"})
|
1406 |
-
|
1407 |
-
|
1408 |
-
# # Stream response and append assistant message
|
1409 |
-
# #14/4/2024
|
1410 |
-
# response = st.write_stream(stream)
|
1411 |
-
# st.session_state.messages.append({"role": "assistant", "content": response})
|
1412 |
|
1413 |
# Initialize session state variables if they don't exist
|
1414 |
if 'system_prompt' not in st.session_state:
|
@@ -1427,28 +961,6 @@ if "task_choice" in st.session_state:
|
|
1427 |
st.session_state.generated_examples_json = None
|
1428 |
|
1429 |
|
1430 |
-
|
1431 |
-
|
1432 |
-
# Save labeled examples to CSV
|
1433 |
-
#new 14/4/2025
|
1434 |
-
#labeled_examples = []
|
1435 |
-
# if classification_type == "Named Entity Recognition (NER)":
|
1436 |
-
# labeled_examples = []
|
1437 |
-
# for line in response.split('\n'):
|
1438 |
-
# if line.strip():
|
1439 |
-
# parts = line.rsplit('Entities:', 1)
|
1440 |
-
# if len(parts) == 2:
|
1441 |
-
# text = parts[0].strip()
|
1442 |
-
# entities = parts[1].strip()
|
1443 |
-
# if text and entities:
|
1444 |
-
# labeled_examples.append({
|
1445 |
-
# 'text': text,
|
1446 |
-
# 'entities': entities,
|
1447 |
-
# 'system_prompt': st.session_state.system_prompt,
|
1448 |
-
# 'system_role': st.session_state.system_role,
|
1449 |
-
# 'task_type': 'Named Entity Recognition (NER)',
|
1450 |
-
# 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
|
1451 |
-
# })
|
1452 |
|
1453 |
#new 22/4/2025
|
1454 |
labeled_examples = []
|
@@ -1504,25 +1016,6 @@ if "task_choice" in st.session_state:
|
|
1504 |
"examples": labeled_examples
|
1505 |
}, indent=2).encode('utf-8')
|
1506 |
|
1507 |
-
############
|
1508 |
-
# CSV
|
1509 |
-
# st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')
|
1510 |
-
|
1511 |
-
# # JSON
|
1512 |
-
# st.session_state.labeled_examples_json = json.dumps({
|
1513 |
-
# "metadata": {
|
1514 |
-
# "domain": domain,
|
1515 |
-
# "labels": labels,
|
1516 |
-
# "used_few_shot": use_few_shot,
|
1517 |
-
# "task_type": "Named Entity Recognition (NER)",
|
1518 |
-
# "timestamp": datetime.now().isoformat()
|
1519 |
-
# },
|
1520 |
-
# "examples": labeled_examples
|
1521 |
-
# }, indent=2).encode('utf-8')
|
1522 |
-
|
1523 |
-
########
|
1524 |
-
# st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')
|
1525 |
-
# st.session_state.labeled_examples_json = json.dumps(labeled_examples, indent=2).encode('utf-8')
|
1526 |
|
1527 |
# Download buttons
|
1528 |
st.download_button(
|
@@ -1547,45 +1040,12 @@ if "task_choice" in st.session_state:
|
|
1547 |
# Display the labeled examples
|
1548 |
st.markdown("##### 📋 Labeled Examples Preview")
|
1549 |
st.dataframe(df, use_container_width=True)
|
1550 |
-
|
1551 |
-
|
1552 |
-
|
|
|
1553 |
|
1554 |
|
1555 |
-
|
1556 |
-
# if labeled_examples:
|
1557 |
-
# df = pd.DataFrame(labeled_examples)
|
1558 |
-
# csv = df.to_csv(index=False).encode('utf-8')
|
1559 |
-
# st.download_button(
|
1560 |
-
# "📥 Download Labeled Examples",
|
1561 |
-
# csv,
|
1562 |
-
# "labeled_examples.csv",
|
1563 |
-
# "text/csv",
|
1564 |
-
# key='download-labeled-csv'
|
1565 |
-
# )
|
1566 |
-
# # Add space and center the "or"
|
1567 |
-
# st.markdown("""
|
1568 |
-
# <div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
|
1569 |
-
# """, unsafe_allow_html=True)
|
1570 |
-
|
1571 |
-
# if labeled_examples:
|
1572 |
-
# df = pd.DataFrame(labeled_examples)
|
1573 |
-
# csv = df.to_csv(index=False).encode('utf-8')
|
1574 |
-
# st.download_button(
|
1575 |
-
# "📥 Download Labeled Examples",
|
1576 |
-
# csv,
|
1577 |
-
# "labeled_examples.json",
|
1578 |
-
# "text/json",
|
1579 |
-
# key='download-labeled-JSON'
|
1580 |
-
# )
|
1581 |
-
|
1582 |
-
# Add follow-up interaction options
|
1583 |
-
#st.markdown("---")
|
1584 |
-
#follow_up = st.radio(
|
1585 |
-
#"What would you like to do next?",
|
1586 |
-
#["Label more data", "Data Generation"],
|
1587 |
-
# key="labeling_follow_up"
|
1588 |
-
# )
|
1589 |
|
1590 |
if st.button("Continue"):
|
1591 |
if follow_up == "Label more data":
|
|
|
47 |
)
|
48 |
|
49 |
print(completion.choices[0].message)
|
50 |
+
|
51 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
# Create necessary directories
|
54 |
for dir_name in ['data', 'feedback']:
|
|
|
112 |
continue
|
113 |
raise UnicodeDecodeError("Failed to read file with any supported encoding")
|
114 |
|
115 |
+
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
def reset_conversation():
|
118 |
st.session_state.conversation = []
|
|
|
136 |
|
137 |
# Main app title
|
138 |
st.title("🤖🦙 Text Data Labeling and Generation App")
|
139 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
|
141 |
# Sidebar settings
|
142 |
with st.sidebar:
|
|
|
163 |
key='model_select'
|
164 |
)
|
165 |
|
166 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
temperature = st.slider(
|
168 |
"Temperature",
|
169 |
0.0, 1.0, 0.7,
|
|
|
406 |
|
407 |
)
|
408 |
)
|
409 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
410 |
##########new 22/4/2025
|
411 |
formatted_attributes = "\n".join([
|
412 |
f"- {attr['attribute']}: {', '.join(attr['topics'])}" for attr in additional_attributes
|
|
|
446 |
st.warning("Class names must be unique.")
|
447 |
elif any(not lbl.strip() for lbl in labels):
|
448 |
st.warning("All class labels must be filled in.")
|
449 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
450 |
|
451 |
with st.spinner("Generating examples..."):
|
452 |
try:
|
|
|
462 |
#frequency_penalty=0.5, # Discourages frequent words
|
463 |
#presence_penalty=0.6,
|
464 |
)
|
465 |
+
|
466 |
#new 24 march
|
467 |
st.session_state.messages.append({"role": "user", "content": system_prompt})
|
468 |
# # ####################
|
|
|
502 |
'Use few-shot example?': 'Yes' if use_few_shot else 'No',
|
503 |
})
|
504 |
|
505 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
506 |
|
507 |
|
508 |
if examples_list:
|
|
|
535 |
"application/json",
|
536 |
key='download-json-persistent'
|
537 |
)
|
538 |
+
# Display the labeled examples
|
539 |
+
st.markdown("##### 📋 Labeled Examples Preview")
|
540 |
+
st.dataframe(df, use_container_width=True)
|
541 |
|
542 |
if st.button("Continue"):
|
543 |
if follow_up == "Generate more examples":
|
|
|
760 |
|
761 |
if not labels:
|
762 |
st.warning("Please select at least one entity type.")
|
763 |
+
labels = ["PERSON"]
|
764 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
765 |
use_few_shot = st.toggle("Use few-shot examples for labeling")
|
766 |
few_shot_examples = []
|
767 |
if use_few_shot:
|
|
|
814 |
|
815 |
# Customize prompt template based on classification type
|
816 |
if classification_type == "Named Entity Recognition (NER)":
|
817 |
+
|
818 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
819 |
label_prompt_template = PromptTemplate(
|
820 |
input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
|
821 |
template=(
|
|
|
874 |
formatted_few_shot = "\n".join([f"{ex['content']}\nLabel: {ex['label']}" for ex in few_shot_examples])
|
875 |
else:
|
876 |
formatted_few_shot = ""
|
877 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
878 |
# new 22/4/2025
|
879 |
#formatted_few_shot = "\n".join([f"{ex['content']}\nEntities: [{ex['label']}]" for ex in few_shot_examples])
|
880 |
formatted_few_shot = "\n\n".join([f"{ex['content']}\n\nEntity types\n{ex['label']}" for ex in few_shot_examples])
|
|
|
912 |
few_shot_examples=few_shot_text,
|
913 |
examples=examples_text,
|
914 |
user_prompt=user_prompt
|
915 |
+
|
916 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
917 |
|
918 |
else:
|
919 |
system_prompt = label_prompt_template.format(
|
|
|
942 |
#################
|
943 |
response = st.write_stream(stream)
|
944 |
st.session_state.messages.append({"role": "assistant", "content": response})
|
945 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
946 |
|
947 |
# Initialize session state variables if they don't exist
|
948 |
if 'system_prompt' not in st.session_state:
|
|
|
961 |
st.session_state.generated_examples_json = None
|
962 |
|
963 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
964 |
|
965 |
#new 22/4/2025
|
966 |
labeled_examples = []
|
|
|
1016 |
"examples": labeled_examples
|
1017 |
}, indent=2).encode('utf-8')
|
1018 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1019 |
|
1020 |
# Download buttons
|
1021 |
st.download_button(
|
|
|
1040 |
# Display the labeled examples
|
1041 |
st.markdown("##### 📋 Labeled Examples Preview")
|
1042 |
st.dataframe(df, use_container_width=True)
|
1043 |
+
|
1044 |
+
# Display section
|
1045 |
+
st.markdown("### 📋 Labeled Examples Preview")
|
1046 |
+
st.dataframe(st.session_state.labeled_preview, use_container_width=True)
|
1047 |
|
1048 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1049 |
|
1050 |
if st.button("Continue"):
|
1051 |
if follow_up == "Label more data":
|