Wedyan2023 committed on
Commit
d8fa7c4
·
verified ·
1 Parent(s): aebaf84

Update app110.py

Browse files
Files changed (1) hide show
  1. app110.py +24 -564
app110.py CHANGED
@@ -47,124 +47,8 @@ completion = client.chat.completions.create(
47
  )
48
 
49
  print(completion.choices[0].message)
50
- #######
51
- #####
52
- # from openai import OpenAI
53
-
54
- # client = OpenAI(
55
- # base_url="https://router.huggingface.co/together/v1",
56
- # #api_key="hf_XXXXX",
57
- # api_key=os.environ.get('TOKEN2'), # Hugging Face API token
58
- # )
59
- # #meta-llama/Meta-Llama-3-8B-Instruct
60
- # completion = client.chat.completions.create(
61
- # #model="meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
62
- # model="meta-llama/Meta-Llama-3-8B-Instruct",
63
- # messages=[
64
- # {
65
- # "role": "user",
66
- # "content": "What is the capital of France?"
67
- # }
68
- # ],
69
- # )
70
-
71
- #print(completion.choices[0].message)
72
- #####
73
- ##########################################################3
74
- # import streamlit as st
75
- # from transformers import AutoModelForCausalLM, AutoTokenizer
76
- # import torch
77
-
78
- # # Model selection dropdown
79
- # selected_model = st.selectbox(
80
- # "Select Model",
81
- # ["meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
82
- # "meta-llama/Llama-3.3-70B-Instruct",
83
- # "meta-llama/Llama-3.2-3B-Instruct",
84
- # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
85
- # "meta-llama/Meta-Llama-3-8B-Instruct",
86
- # "meta-llama/Llama-3.1-70B-Instruct"],
87
- # key='model_select'
88
- # )
89
-
90
- # @st.cache_resource # Cache the model to prevent reloading
91
- # def load_model(model_name):
92
- # try:
93
- # # Optimized model loading configuration
94
- # model = AutoModelForCausalLM.from_pretrained(
95
- # model_name,
96
- # torch_dtype=torch.float16, # Use half precision
97
- # device_map="auto", # Automatic device mapping
98
- # load_in_8bit=True, # Enable 8-bit quantization
99
- # low_cpu_mem_usage=True, # Optimize CPU memory usage
100
- # max_memory={0: "10GB"} # Limit GPU memory usage
101
- # )
102
-
103
- # tokenizer = AutoTokenizer.from_pretrained(
104
- # model_name,
105
- # padding_side="left",
106
- # truncation_side="left"
107
- # )
108
-
109
- # return model, tokenizer
110
-
111
- # except Exception as e:
112
- # st.error(f"Error loading model: {str(e)}")
113
- # return None, None
114
-
115
- # # Load the selected model with optimizations
116
- # if selected_model:
117
- # model, tokenizer = load_model(selected_model)
118
-
119
- # # Check if model loaded successfully
120
- # if model is not None:
121
- # st.success(f"Successfully loaded {selected_model}")
122
- # else:
123
- # st.warning("Please select a different model or check your hardware capabilities")
124
-
125
- # # Function to generate text
126
- # def generate_response(prompt, model, tokenizer):
127
- # try:
128
- # inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
129
-
130
- # with torch.no_grad():
131
- # outputs = model.generate(
132
- # inputs["input_ids"],
133
- # max_length=256,
134
- # num_return_sequences=1,
135
- # temperature=0.7,
136
- # do_sample=True,
137
- # pad_token_id=tokenizer.pad_token_id
138
- # )
139
-
140
- # response = tokenizer.decode(outputs[0], skip_special_tokens=True)
141
- # return response
142
-
143
- # except Exception as e:
144
- # return f"Error generating response: {str(e)}"
145
- ############################################################
146
-
147
- ####new
148
- # from openai import OpenAI
149
-
150
- # client = OpenAI(
151
- # base_url="https://router.huggingface.co/together/v1",
152
- # api_key=os.environ.get('TOKEN2'),
153
- # )
154
-
155
- # completion = client.chat.completions.create(
156
- # model="meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
157
- # messages=[
158
- # {
159
- # "role": "user",
160
- # "content": "What is the capital of France?"
161
- # }
162
- # ],
163
- # max_tokens=512,
164
- # )
165
-
166
- # print(completion.choices[0].message)
167
- #####
168
 
169
  # Create necessary directories
170
  for dir_name in ['data', 'feedback']:
@@ -228,14 +112,7 @@ def read_csv_with_encoding(file):
228
  continue
229
  raise UnicodeDecodeError("Failed to read file with any supported encoding")
230
 
231
- #def save_feedback(feedback_data):
232
- #feedback_file = 'feedback/user_feedback.csv'
233
- #feedback_df = pd.DataFrame([feedback_data])
234
-
235
- #if os.path.exists(feedback_file):
236
- #feedback_df.to_csv(feedback_file, mode='a', header=False, index=False)
237
- #else:
238
- #feedback_df.to_csv(feedback_file, index=False)
239
 
240
  def reset_conversation():
241
  st.session_state.conversation = []
@@ -259,16 +136,7 @@ if "system_role" not in st.session_state:
259
 
260
  # Main app title
261
  st.title("🤖🦙 Text Data Labeling and Generation App")
262
- # def embed_pdf_sidebar(pdf_path):
263
- # with open(pdf_path, "rb") as f:
264
- # base64_pdf = base64.b64encode(f.read()).decode('utf-8')
265
- # pdf_display = f"""
266
- # <iframe src="data:application/pdf;base64,{base64_pdf}"
267
- # width="100%" height="400" type="application/pdf"></iframe>
268
- # """
269
- # st.markdown(pdf_display, unsafe_allow_html=True)
270
- #
271
-
272
 
273
  # Sidebar settings
274
  with st.sidebar:
@@ -295,84 +163,7 @@ with st.sidebar:
295
  key='model_select'
296
  )
297
 
298
- #################new oooo
299
-
300
- # # Model selection dropdown
301
- # selected_model = st.selectbox(
302
- # "Select Model",
303
- # [#"meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
304
- # "meta-llama/Llama-3.2-3B-Instruct",
305
- # "meta-llama/Llama-3.3-70B-Instruct",
306
- # "meta-llama/Llama-3.2-3B-Instruct",
307
- # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
308
- # "meta-llama/Meta-Llama-3-8B-Instruct",
309
- # "meta-llama/Llama-3.1-70B-Instruct"],
310
- # key='model_select'
311
- # )
312
-
313
- # @st.cache_resource # Cache the model to prevent reloading
314
- # def load_model(model_name):
315
- # try:
316
- # # Optimized model loading configuration
317
- # model = AutoModelForCausalLM.from_pretrained(
318
- # model_name,
319
- # torch_dtype=torch.float16, # Use half precision
320
- # device_map="auto", # Automatic device mapping
321
- # load_in_8bit=True, # Enable 8-bit quantization
322
- # low_cpu_mem_usage=True, # Optimize CPU memory usage
323
- # max_memory={0: "10GB"} # Limit GPU memory usage
324
- # )
325
-
326
- # tokenizer = AutoTokenizer.from_pretrained(
327
- # model_name,
328
- # padding_side="left",
329
- # truncation_side="left"
330
- # )
331
-
332
- # return model, tokenizer
333
-
334
- # except Exception as e:
335
- # st.error(f"Error loading model: {str(e)}")
336
- # return None, None
337
-
338
- # # Load the selected model with optimizations
339
- # if selected_model:
340
- # model, tokenizer = load_model(selected_model)
341
-
342
- # # Check if model loaded successfully
343
- # if model is not None:
344
- # st.success(f"Successfully loaded {selected_model}")
345
- # else:
346
- # st.warning("Please select a different model or check your hardware capabilities")
347
-
348
- # # Function to generate text
349
- # def generate_response(prompt, model, tokenizer):
350
- # try:
351
- # inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
352
-
353
- # with torch.no_grad():
354
- # outputs = model.generate(
355
- # inputs["input_ids"],
356
- # max_length=256,
357
- # num_return_sequences=1,
358
- # temperature=0.7,
359
- # do_sample=True,
360
- # pad_token_id=tokenizer.pad_token_id
361
- # )
362
-
363
- # response = tokenizer.decode(outputs[0], skip_special_tokens=True)
364
- # return response
365
-
366
- # except Exception as e:
367
- # return f"Error generating response: {str(e)}"
368
- # ################
369
-
370
- # model = AutoModelForCausalLM.from_pretrained(
371
- # "meta-llama/Meta-Llama-3-8B-Instruct",
372
- # torch_dtype=torch.float16, # Use half precision
373
- # device_map="auto", # Automatic device mapping
374
- # load_in_8bit=True # Load in 8-bit precision
375
- # )
376
  temperature = st.slider(
377
  "Temperature",
378
  0.0, 1.0, 0.7,
@@ -615,21 +406,7 @@ if "task_choice" in st.session_state:
615
 
616
  )
617
  )
618
- # template=(
619
- # "{system_role}\n"
620
- # "- Use the following parameters:\n"
621
- # "- Generate {num_examples} examples\n"
622
- # "- Each example should be between {min_words} to {max_words} words long\n"
623
- # "- Use these labels: {labels}.\n"
624
- # "- Use the following additional attributes:\n"
625
- # "{additional_attributes}\n"
626
- # #"- Format each example like this: 'Example text. Label: [label]. Attribute1: [topic1]. Attribute2: [topic2]'\n"
627
- # "- Generate the examples in this format: 'Example text. Label: label'\n"
628
- # "- Additional instructions: {user_prompt}\n"
629
- # "- Use these few-shot examples if provided:\n{few_shot_examples}\n"
630
- # "- Think step by step and ensure examples are unique and not repeated."
631
- # )
632
- # )
633
  ##########new 22/4/2025
634
  formatted_attributes = "\n".join([
635
  f"- {attr['attribute']}: {', '.join(attr['topics'])}" for attr in additional_attributes
@@ -669,16 +446,7 @@ if "task_choice" in st.session_state:
669
  st.warning("Class names must be unique.")
670
  elif any(not lbl.strip() for lbl in labels):
671
  st.warning("All class labels must be filled in.")
672
- #else:
673
- #st.success("Generating examples for domain: {domain}")
674
-
675
- #if not custom_domain_valid:
676
- #st.warning("Custom domain name is required.")
677
- #elif not labels_valid:
678
- #st.warning("Please fix the label errors before generating examples.")
679
- #else:
680
- # Proceed to generate examples
681
- #st.success(f"Generating examples for domain: {domain}")
682
 
683
  with st.spinner("Generating examples..."):
684
  try:
@@ -694,7 +462,7 @@ if "task_choice" in st.session_state:
694
  #frequency_penalty=0.5, # Discourages frequent words
695
  #presence_penalty=0.6,
696
  )
697
- #st.session_state['system_prompt'] = system_prompt
698
  #new 24 march
699
  st.session_state.messages.append({"role": "user", "content": system_prompt})
700
  # # ####################
@@ -734,18 +502,7 @@ if "task_choice" in st.session_state:
734
  'Use few-shot example?': 'Yes' if use_few_shot else 'No',
735
  })
736
 
737
- # example_dict = {
738
- # 'text': text,
739
- # 'label': label,
740
- # 'system_prompt': st.session_state.system_prompt,
741
- # 'system_role': st.session_state.system_role,
742
- # 'task_type': 'Data Generation',
743
- # 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
744
- # }
745
- # for attr in additional_attributes:
746
- # example_dict[attr['attribute']] = random.choice(attr['topics'])
747
-
748
- # examples_list.append(example_dict)
749
 
750
 
751
  if examples_list:
@@ -778,9 +535,9 @@ if "task_choice" in st.session_state:
778
  "application/json",
779
  key='download-json-persistent'
780
  )
781
- # # Display the labeled examples
782
- # st.markdown("##### 📋 Labeled Examples Preview")
783
- # st.dataframe(df, use_container_width=True)
784
 
785
  if st.button("Continue"):
786
  if follow_up == "Generate more examples":
@@ -1003,78 +760,8 @@ if "task_choice" in st.session_state:
1003
 
1004
  if not labels:
1005
  st.warning("Please select at least one entity type.")
1006
- labels = ["PERSON"]
1007
-
1008
- ##########
1009
-
1010
- # # Extract just the entity type (before the dash)
1011
- # labels = [entity.split(" - ")[0] for entity in selected_entities]
1012
-
1013
- # if not labels:
1014
- # st.warning("Please select at least one entity type")
1015
- # labels = ["PERSON"] # Default if nothing selected
1016
-
1017
-
1018
-
1019
-
1020
-
1021
- #NNew edit
1022
- # elif classification_type == "Multi-Class Classification":
1023
- # st.write("### Multi-Class Classification Labels")
1024
-
1025
- # default_labels_by_domain = {
1026
- # "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
1027
- # "AG News": ["World", "Sports", "Business", "Sci/Tech"],
1028
- # "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
1029
- # "Food & Dining", "Local Experience", "Adventure Activities",
1030
- # "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
1031
- # "Luxury Tourism"],
1032
- # "Restaurant reviews": ["Italian", "French", "American"]
1033
- # }
1034
- # num_classes = st.slider("Number of classes", 3, 10, 3)
1035
-
1036
- # # Get defaults for selected domain, or empty list
1037
- # defaults = default_labels_by_domain.get(domain, [])
1038
-
1039
- # labels = []
1040
- # errors = []
1041
- # cols = st.columns(3)
1042
-
1043
- # for i in range(num_classes):
1044
- # with cols[i % 3]:
1045
- # default_value = defaults[i] if i < len(defaults) else ""
1046
- # label_input = st.text_input(f"Class {i+1}", default_value)
1047
- # normalized_label = label_input.strip().title()
1048
-
1049
- # if not normalized_label:
1050
- # errors.append(f"Class {i+1} name is required.")
1051
- # else:
1052
- # labels.append(normalized_label)
1053
-
1054
- # # Check for duplicates (case-insensitive)
1055
- # if len(labels) != len(set(labels)):
1056
- # errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).")
1057
-
1058
- # # Show validation results
1059
- # if errors:
1060
- # for error in errors:
1061
- # st.error(error)
1062
- # else:
1063
- # st.success("All Labels names are valid and unique!")
1064
- # labels_valid = not errors # Will be True only if there are no label errors
1065
-
1066
-
1067
-
1068
-
1069
- # else:
1070
- # num_classes = st.slider("Number of classes", 3, 23, 3, key="label_num_classes")
1071
- # labels = []
1072
- # cols = st.columns(3)
1073
- # for i in range(num_classes):
1074
- # with cols[i % 3]:
1075
- # label = st.text_input(f"Class {i+1}", f"Class_{i+1}", key=f"label_class_{i}")
1076
- # labels.append(label)
1077
-
1078
  use_few_shot = st.toggle("Use few-shot examples for labeling")
1079
  few_shot_examples = []
1080
  if use_few_shot:
@@ -1127,78 +814,8 @@ if "task_choice" in st.session_state:
1127
 
1128
  # Customize prompt template based on classification type
1129
  if classification_type == "Named Entity Recognition (NER)":
1130
- # label_prompt_template = PromptTemplate(
1131
- # input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
1132
- # template=(
1133
- # "{system_role}\n"
1134
- # #"- You are a professional Named Entity Recognition (NER) expert in {domain} domain. Your role is to identify and extract the following entity types: {labels}.\n"
1135
- # "- For each text example provided, identify all entities of the requested types.\n"
1136
- # "- Use the following entities: {labels}.\n"
1137
- # "- Return each example followed by the entities you found in this format: 'Example text.\n \n Entities:\n [ENTITY_TYPE: entity text\n\n, ENTITY_TYPE: entity text\n\n, ...] or [No entities found]'\n"
1138
- # "- If no entities of the requested types are found, indicate 'No entities found' in this text.\n"
1139
- # "- Be precise about entity boundaries - don't include unnecessary words.\n"
1140
- # "- Do not provide any additional information or explanations.\n"
1141
- # "- Additional instructions:\n {user_prompt}\n\n"
1142
- # "- Use user few-shot examples as guidance if provided:\n{few_shot_examples}\n\n"
1143
- # "- Examples to analyze:\n{examples}\n\n"
1144
- # "Output:\n"
1145
- # )
1146
- # )
1147
- #new 22/4/2025
1148
- # label_prompt_template = PromptTemplate(
1149
- # input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
1150
- # template=(
1151
- # "{system_role}\n"
1152
- # "- You are performing Named Entity Recognition (NER) in the domain of {domain}.\n"
1153
- # "- Use the following entity types: {labels}.\n\n"
1154
- # "### Reasoning Steps:\n"
1155
- # "1. Read the example carefully.\n"
1156
- # "2. For each named entity mentioned, determine its meaning and role in the sentence.\n"
1157
- # "3. Think about the **context**: Is it a physical location (LOC)? A geopolitical region (GPE)? A person (PERSON)?\n"
1158
- # "4. Based on the definition of each label, assign the most **specific and correct** label.\n\n"
1159
- # "For example:\n"
1160
- # "- 'Mount Everest' → LOC (it's a mountain)\n"
1161
- # "- 'France' → GPE (it's a country)\n"
1162
- # "- 'Microsoft' → ORG\n"
1163
- # "- 'John Smith' → PERSON\n\n"
1164
- # "- Return each example followed by the entities you found in this format:\n"
1165
- # "'Example text.'\nEntities: [ENTITY_TYPE: entity text, ENTITY_TYPE: entity text, ...] or [No entities found]\n"
1166
- # "- If no entities of the requested types are found, return 'No entities found'.\n"
1167
- # "- Be precise about entity boundaries - don't include extra words.\n"
1168
- # "- Do not explain or justify your answers.\n\n"
1169
- # "Additional instructions:\n{user_prompt}\n\n"
1170
- # "Few-shot examples:\n{few_shot_examples}\n\n"
1171
- # "Examples to label:\n{examples}\n"
1172
- # "Output:\n"
1173
- # )
1174
- #)
1175
- # label_prompt_template = PromptTemplate(
1176
- # input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
1177
- # template=(
1178
- # "{system_role}\n"
1179
- # "- You are an expert at Named Entity Recognition (NER) for domain: {domain}.\n"
1180
- # "- Use these entity types: {labels}.\n\n"
1181
- # "### Output Format:\n"
1182
- # # "Return each example followed by the entities you found in this format: 'Example text.\n Entities:\n [ENTITY_TYPE: entity text\n\"
1183
- # "Return each example followed by the entities you found in this format: 'Example text.\n 'Entity types:\n "Then group the entities under each label like this:\n" "
1184
- # #"Then Start with this line exactly: 'Entity types\n'\n"
1185
- # #"Then group the entities under each label like this:\n"
1186
- # "\n PERSON – Angela Merkel, John Smith\n\n"
1187
- # "\ ORG – Google, United Nations\n\n"
1188
- # "\n DATE – January 1st, 2023\n\n"
1189
- # "\n ... and so on.\n\n"
1190
- # "If entity {labels} not found, do not write it in your response\n"
1191
- # "- Do NOT output them inline after the text.\n"
1192
- # "- Do NOT repeat the sentence.\n"
1193
- # "- If no entities are found for a type, skip it.\n"
1194
- # "- Keep the format consistent.\n\n"
1195
- # "User Instructions:\n{user_prompt}\n\n"
1196
- # "Few-shot Examples:\n{few_shot_examples}\n\n"
1197
- # "Examples to analyze:\n{examples}"
1198
- # )
1199
- # )
1200
-
1201
-
1202
  label_prompt_template = PromptTemplate(
1203
  input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
1204
  template=(
@@ -1257,20 +874,7 @@ if "task_choice" in st.session_state:
1257
  formatted_few_shot = "\n".join([f"{ex['content']}\nLabel: {ex['label']}" for ex in few_shot_examples])
1258
  else:
1259
  formatted_few_shot = ""
1260
- # #new 22/4/2025
1261
- # few_shot_examples = [
1262
- # {"content": "Mount Everest is the tallest mountain in the world.", "label": "LOC: Mount Everest"},
1263
- # {"content": "The President of the United States visited Paris last summer.", "label": "GPE: United States, GPE: Paris"},
1264
- # {"content": "Amazon is expanding its offices in Berlin.", "label": "ORG: Amazon, GPE: Berlin"},
1265
- # {"content": "J.K. Rowling wrote the Harry Potter books.", "label": "PERSON: J.K. Rowling"},
1266
- # {"content": "Apple was founded in California in 1976.", "label": "ORG: Apple, GPE: California, DATE: 1976"},
1267
- # {"content": "The Nile is the longest river in Africa.", "label": "LOC: Nile, GPE: Africa"},
1268
- # {"content": "He arrived at 3 PM for the meeting.", "label": "TIME: 3 PM"},
1269
- # {"content": "She bought the dress for $200.", "label": "MONEY: $200"},
1270
- # {"content": "The event is scheduled for July 4th.", "label": "DATE: July 4th"},
1271
- # {"content": "The World Health Organization is headquartered in Geneva.", "label": "ORG: World Health Organization, GPE: Geneva"}
1272
- # ]
1273
- # ###########
1274
  # new 22/4/2025
1275
  #formatted_few_shot = "\n".join([f"{ex['content']}\nEntities: [{ex['label']}]" for ex in few_shot_examples])
1276
  formatted_few_shot = "\n\n".join([f"{ex['content']}\n\nEntity types\n{ex['label']}" for ex in few_shot_examples])
@@ -1308,69 +912,8 @@ if "task_choice" in st.session_state:
1308
  few_shot_examples=few_shot_text,
1309
  examples=examples_text,
1310
  user_prompt=user_prompt
1311
- #new
1312
- #'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1313
- )
1314
- # if classification_type == "Named Entity Recognition (NER)":
1315
- # # Step 1: Split the full response by example
1316
- # raw_outputs = [block.strip() for block in response.strip().split("Entity types") if block.strip()]
1317
- # inputs = [ex.strip() for ex in examples_to_classify]
1318
-
1319
- # # Step 2: Match inputs with NER output blocks
1320
- # labeled_examples = []
1321
- # for i, (text, output_block) in enumerate(zip(inputs, raw_outputs)):
1322
- # labeled_examples.append({
1323
- # 'text': text,
1324
- # 'entities': f"Entity types\n{output_block.strip()}",
1325
- # 'system_prompt': st.session_state.system_prompt,
1326
- # 'system_role': st.session_state.system_role,
1327
- # 'task_type': 'Named Entity Recognition (NER)',
1328
- # 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1329
- # })
1330
-
1331
- # if classification_type == "Named Entity Recognition (NER)":
1332
- # # Step 1: Split the full response by example
1333
- # raw_outputs = [block.strip() for block in response.strip().split("Entity types") if block.strip()]
1334
- # inputs = [ex.strip() for ex in examples_to_classify]
1335
-
1336
- # # Step 2: Match inputs with NER output blocks
1337
- # labeled_examples = []
1338
- # for i, (text, output_block) in enumerate(zip(inputs, raw_outputs)):
1339
- # labeled_examples.append({
1340
- # 'text': text,
1341
- # 'entities': f"Entity types\n{output_block.strip()}",
1342
- # 'system_prompt': st.session_state.system_prompt,
1343
- # 'system_role': st.session_state.system_role,
1344
- # 'task_type': 'Named Entity Recognition (NER)',
1345
- # 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1346
- # })
1347
-
1348
-
1349
- # import re
1350
-
1351
- # if classification_type == "Named Entity Recognition (NER)":
1352
- # # Use regex to split on "Entity types" while keeping it attached to each block
1353
- # blocks = re.split(r"(Entity types)", response.strip())
1354
-
1355
- # # Recombine 'Entity types' with each block after splitting
1356
- # raw_outputs = [
1357
- # (blocks[i] + blocks[i+1]).strip()
1358
- # for i in range(1, len(blocks) - 1, 2)
1359
- # ]
1360
-
1361
- # inputs = [ex.strip() for ex in examples_to_classify]
1362
-
1363
- # labeled_examples = []
1364
- # for i, (text, output_block) in enumerate(zip(inputs, raw_outputs)):
1365
- # labeled_examples.append({
1366
- # 'text': text,
1367
- # 'entities': output_block,
1368
- # 'system_prompt': st.session_state.system_prompt,
1369
- # 'system_role': st.session_state.system_role,
1370
- # 'task_type': 'Named Entity Recognition (NER)',
1371
- # 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1372
- # })
1373
-
1374
 
1375
  else:
1376
  system_prompt = label_prompt_template.format(
@@ -1399,16 +942,7 @@ if "task_choice" in st.session_state:
1399
  #################
1400
  response = st.write_stream(stream)
1401
  st.session_state.messages.append({"role": "assistant", "content": response})
1402
- # Display the labeled examples
1403
- # # Optional: If you want to add it as a chat-style message log
1404
- # preview_str = st.session_state.labeled_preview.to_markdown(index=False)
1405
- # st.session_state.messages.append({"role": "assistant", "content": f"Here is a preview of the labeled examples:\n\n{preview_str}"})
1406
-
1407
-
1408
- # # Stream response and append assistant message
1409
- # #14/4/2024
1410
- # response = st.write_stream(stream)
1411
- # st.session_state.messages.append({"role": "assistant", "content": response})
1412
 
1413
  # Initialize session state variables if they don't exist
1414
  if 'system_prompt' not in st.session_state:
@@ -1427,28 +961,6 @@ if "task_choice" in st.session_state:
1427
  st.session_state.generated_examples_json = None
1428
 
1429
 
1430
-
1431
-
1432
- # Save labeled examples to CSV
1433
- #new 14/4/2025
1434
- #labeled_examples = []
1435
- # if classification_type == "Named Entity Recognition (NER)":
1436
- # labeled_examples = []
1437
- # for line in response.split('\n'):
1438
- # if line.strip():
1439
- # parts = line.rsplit('Entities:', 1)
1440
- # if len(parts) == 2:
1441
- # text = parts[0].strip()
1442
- # entities = parts[1].strip()
1443
- # if text and entities:
1444
- # labeled_examples.append({
1445
- # 'text': text,
1446
- # 'entities': entities,
1447
- # 'system_prompt': st.session_state.system_prompt,
1448
- # 'system_role': st.session_state.system_role,
1449
- # 'task_type': 'Named Entity Recognition (NER)',
1450
- # 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1451
- # })
1452
 
1453
  #new 22/4/2025
1454
  labeled_examples = []
@@ -1504,25 +1016,6 @@ if "task_choice" in st.session_state:
1504
  "examples": labeled_examples
1505
  }, indent=2).encode('utf-8')
1506
 
1507
- ############
1508
- # CSV
1509
- # st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')
1510
-
1511
- # # JSON
1512
- # st.session_state.labeled_examples_json = json.dumps({
1513
- # "metadata": {
1514
- # "domain": domain,
1515
- # "labels": labels,
1516
- # "used_few_shot": use_few_shot,
1517
- # "task_type": "Named Entity Recognition (NER)",
1518
- # "timestamp": datetime.now().isoformat()
1519
- # },
1520
- # "examples": labeled_examples
1521
- # }, indent=2).encode('utf-8')
1522
-
1523
- ########
1524
- # st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')
1525
- # st.session_state.labeled_examples_json = json.dumps(labeled_examples, indent=2).encode('utf-8')
1526
 
1527
  # Download buttons
1528
  st.download_button(
@@ -1547,45 +1040,12 @@ if "task_choice" in st.session_state:
1547
  # Display the labeled examples
1548
  st.markdown("##### 📋 Labeled Examples Preview")
1549
  st.dataframe(df, use_container_width=True)
1550
- # Display section
1551
- #st.markdown("### 📋 Labeled Examples Preview")
1552
- #st.dataframe(st.session_state.labeled_preview, use_container_width=True)
 
1553
 
1554
 
1555
-
1556
- # if labeled_examples:
1557
- # df = pd.DataFrame(labeled_examples)
1558
- # csv = df.to_csv(index=False).encode('utf-8')
1559
- # st.download_button(
1560
- # "📥 Download Labeled Examples",
1561
- # csv,
1562
- # "labeled_examples.csv",
1563
- # "text/csv",
1564
- # key='download-labeled-csv'
1565
- # )
1566
- # # Add space and center the "or"
1567
- # st.markdown("""
1568
- # <div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
1569
- # """, unsafe_allow_html=True)
1570
-
1571
- # if labeled_examples:
1572
- # df = pd.DataFrame(labeled_examples)
1573
- # csv = df.to_csv(index=False).encode('utf-8')
1574
- # st.download_button(
1575
- # "📥 Download Labeled Examples",
1576
- # csv,
1577
- # "labeled_examples.json",
1578
- # "text/json",
1579
- # key='download-labeled-JSON'
1580
- # )
1581
-
1582
- # Add follow-up interaction options
1583
- #st.markdown("---")
1584
- #follow_up = st.radio(
1585
- #"What would you like to do next?",
1586
- #["Label more data", "Data Generation"],
1587
- # key="labeling_follow_up"
1588
- # )
1589
 
1590
  if st.button("Continue"):
1591
  if follow_up == "Label more data":
 
47
  )
48
 
49
  print(completion.choices[0].message)
50
+
51
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  # Create necessary directories
54
  for dir_name in ['data', 'feedback']:
 
112
  continue
113
  raise UnicodeDecodeError("Failed to read file with any supported encoding")
114
 
115
+ #
 
 
 
 
 
 
 
116
 
117
  def reset_conversation():
118
  st.session_state.conversation = []
 
136
 
137
  # Main app title
138
  st.title("🤖🦙 Text Data Labeling and Generation App")
139
+
 
 
 
 
 
 
 
 
 
140
 
141
  # Sidebar settings
142
  with st.sidebar:
 
163
  key='model_select'
164
  )
165
 
166
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  temperature = st.slider(
168
  "Temperature",
169
  0.0, 1.0, 0.7,
 
406
 
407
  )
408
  )
409
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  ##########new 22/4/2025
411
  formatted_attributes = "\n".join([
412
  f"- {attr['attribute']}: {', '.join(attr['topics'])}" for attr in additional_attributes
 
446
  st.warning("Class names must be unique.")
447
  elif any(not lbl.strip() for lbl in labels):
448
  st.warning("All class labels must be filled in.")
449
+
 
 
 
 
 
 
 
 
 
450
 
451
  with st.spinner("Generating examples..."):
452
  try:
 
462
  #frequency_penalty=0.5, # Discourages frequent words
463
  #presence_penalty=0.6,
464
  )
465
+
466
  #new 24 march
467
  st.session_state.messages.append({"role": "user", "content": system_prompt})
468
  # # ####################
 
502
  'Use few-shot example?': 'Yes' if use_few_shot else 'No',
503
  })
504
 
505
+
 
 
 
 
 
 
 
 
 
 
 
506
 
507
 
508
  if examples_list:
 
535
  "application/json",
536
  key='download-json-persistent'
537
  )
538
+ # Display the labeled examples
539
+ st.markdown("##### 📋 Labeled Examples Preview")
540
+ st.dataframe(df, use_container_width=True)
541
 
542
  if st.button("Continue"):
543
  if follow_up == "Generate more examples":
 
760
 
761
  if not labels:
762
  st.warning("Please select at least one entity type.")
763
+ labels = ["PERSON"]
764
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
765
  use_few_shot = st.toggle("Use few-shot examples for labeling")
766
  few_shot_examples = []
767
  if use_few_shot:
 
814
 
815
  # Customize prompt template based on classification type
816
  if classification_type == "Named Entity Recognition (NER)":
817
+
818
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
819
  label_prompt_template = PromptTemplate(
820
  input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
821
  template=(
 
874
  formatted_few_shot = "\n".join([f"{ex['content']}\nLabel: {ex['label']}" for ex in few_shot_examples])
875
  else:
876
  formatted_few_shot = ""
877
+
 
 
 
 
 
 
 
 
 
 
 
 
 
878
  # new 22/4/2025
879
  #formatted_few_shot = "\n".join([f"{ex['content']}\nEntities: [{ex['label']}]" for ex in few_shot_examples])
880
  formatted_few_shot = "\n\n".join([f"{ex['content']}\n\nEntity types\n{ex['label']}" for ex in few_shot_examples])
 
912
  few_shot_examples=few_shot_text,
913
  examples=examples_text,
914
  user_prompt=user_prompt
915
+
916
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
917
 
918
  else:
919
  system_prompt = label_prompt_template.format(
 
942
  #################
943
  response = st.write_stream(stream)
944
  st.session_state.messages.append({"role": "assistant", "content": response})
945
+
 
 
 
 
 
 
 
 
 
946
 
947
  # Initialize session state variables if they don't exist
948
  if 'system_prompt' not in st.session_state:
 
961
  st.session_state.generated_examples_json = None
962
 
963
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
964
 
965
  #new 22/4/2025
966
  labeled_examples = []
 
1016
  "examples": labeled_examples
1017
  }, indent=2).encode('utf-8')
1018
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1019
 
1020
  # Download buttons
1021
  st.download_button(
 
1040
  # Display the labeled examples
1041
  st.markdown("##### 📋 Labeled Examples Preview")
1042
  st.dataframe(df, use_container_width=True)
1043
+
1044
+ Display section
1045
+ st.markdown("### 📋 Labeled Examples Preview")
1046
+ st.dataframe(st.session_state.labeled_preview, use_container_width=True)
1047
 
1048
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1049
 
1050
  if st.button("Continue"):
1051
  if follow_up == "Label more data":