Spaces:
Sleeping
Sleeping
oscarwang2
committed on
Commit
•
ddecd6a
1
Parent(s):
601e197
Update app.py
Browse files
app.py
CHANGED
@@ -1,30 +1,39 @@
|
|
1 |
import pandas as pd
|
2 |
-
from groq import Groq
|
3 |
import os
|
4 |
import gradio as gr
|
5 |
import threading
|
6 |
import time
|
|
|
7 |
|
|
|
8 |
client = Groq()
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
file_index = 1
|
11 |
-
|
12 |
-
current_file = os.path.join(data_directory, f'data{file_index}.csv')
|
13 |
file_paths = [current_file]
|
14 |
combined_tokens = 0
|
15 |
-
update_interval = 1 # Update interval in seconds
|
16 |
-
|
17 |
-
# Ensure the data directory exists
|
18 |
-
if not os.path.exists(data_directory):
|
19 |
-
os.makedirs(data_directory)
|
20 |
|
|
|
21 |
def get_file_size(filename):
|
22 |
-
if os.path.isfile(filename)
|
23 |
-
return os.path.getsize(filename)
|
24 |
-
return 0
|
25 |
|
|
|
26 |
def generate_and_save_data():
|
27 |
global file_index, current_file, file_paths, combined_tokens
|
|
|
|
|
|
|
|
|
|
|
28 |
while True:
|
29 |
try:
|
30 |
# Generate a prompt
|
@@ -33,7 +42,7 @@ def generate_and_save_data():
|
|
33 |
messages=[
|
34 |
{
|
35 |
"role": "user",
|
36 |
-
"content": "give me a single prompt to prompt an ai model, simulating what users could want from you. ensure that it is diverse and high quality. for each, choose a random writing style (though it has to be a common one), random length and random clarity of the prompt. ensure that
|
37 |
}
|
38 |
],
|
39 |
temperature=1,
|
@@ -47,9 +56,9 @@ def generate_and_save_data():
|
|
47 |
prompt_tokens = 0
|
48 |
for chunk in completion:
|
49 |
content = chunk.choices[0].delta.content
|
50 |
-
if content
|
51 |
prompt += content
|
52 |
-
prompt_tokens += len(content.split())
|
53 |
|
54 |
# Use the generated prompt to query the model again
|
55 |
second_completion = client.chat.completions.create(
|
@@ -61,7 +70,7 @@ def generate_and_save_data():
|
|
61 |
}
|
62 |
],
|
63 |
temperature=1,
|
64 |
-
max_tokens=
|
65 |
top_p=1,
|
66 |
stream=True,
|
67 |
stop=None,
|
@@ -71,9 +80,9 @@ def generate_and_save_data():
|
|
71 |
response_tokens = 0
|
72 |
for chunk in second_completion:
|
73 |
content = chunk.choices[0].delta.content
|
74 |
-
if content
|
75 |
response += content
|
76 |
-
response_tokens += len(content.split())
|
77 |
|
78 |
# Update the combined token count
|
79 |
combined_tokens += (prompt_tokens + response_tokens)
|
@@ -86,36 +95,43 @@ def generate_and_save_data():
|
|
86 |
data = pd.DataFrame({"prompt": [prompt], "response": [response]})
|
87 |
|
88 |
# Check the size of the current file
|
89 |
-
if get_file_size(current_file) >=
|
90 |
file_index += 1
|
91 |
-
current_file = os.path.join(
|
92 |
file_paths.append(current_file)
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
# If the file exists, append without overwriting
|
98 |
-
if file_exists:
|
99 |
-
data.to_csv(current_file, mode='a', header=False, index=False)
|
100 |
else:
|
101 |
-
data
|
|
|
|
|
102 |
|
103 |
# Wait for the next update interval
|
104 |
-
time.sleep(
|
105 |
|
106 |
except Exception as e:
|
107 |
print(f"An error occurred: {e}. Retrying in 5 seconds...")
|
108 |
time.sleep(5)
|
109 |
|
|
|
110 |
def get_available_files():
|
111 |
return [f for f in file_paths if os.path.isfile(f)]
|
112 |
|
|
|
113 |
def update_file_list():
|
114 |
return gr.update(choices=get_available_files())
|
115 |
|
|
|
116 |
def update_token_count():
|
117 |
return combined_tokens
|
118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
# Start the data generation in a separate thread
|
120 |
thread = threading.Thread(target=generate_and_save_data)
|
121 |
thread.daemon = True
|
@@ -126,7 +142,8 @@ with gr.Blocks() as app:
|
|
126 |
gr.Markdown("## AI Prompt and Response Generator")
|
127 |
gr.Markdown("This app continuously generates AI prompts and responses, and writes them to CSV files.")
|
128 |
|
129 |
-
file_selector = gr.Dropdown(label="Select a data file to download", choices=get_available_files())
|
|
|
130 |
download_button = gr.File(label="Download Selected File")
|
131 |
|
132 |
def download_file(selected_file):
|
@@ -134,6 +151,7 @@ with gr.Blocks() as app:
|
|
134 |
|
135 |
refresh_button = gr.Button("Refresh File List")
|
136 |
refresh_button.click(update_file_list, outputs=file_selector)
|
|
|
137 |
file_selector.change(download_file, inputs=file_selector, outputs=download_button)
|
138 |
|
139 |
token_display = gr.Textbox(label="Combined Tokens", value=str(update_token_count()), interactive=False)
|
|
|
1 |
# Standard library
import os
import threading
import time

# Third-party
import gradio as gr
import pandas as pd
from groq import Groq

# Initialize Groq client
client = Groq()

# Constants
MAX_SIZE = 1.1 * 1024 * 1024 * 1024  # 1.1GB in bytes
DATA_DIRECTORY = 'data'
UPDATE_INTERVAL = 1  # Update interval in seconds

# Ensure the data directory exists
os.makedirs(DATA_DIRECTORY, exist_ok=True)

# Initialize variables
file_index = 1
current_file = os.path.join(DATA_DIRECTORY, f'data{file_index}.csv')
file_paths = [current_file]
combined_tokens = 0
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
+
def get_file_size(filename):
    """Return the size of *filename* in bytes, or 0 if it does not exist."""
    if os.path.isfile(filename):
        return os.path.getsize(filename)
    return 0
|
|
|
|
28 |
|
29 |
+
# Data generation and saving function
|
30 |
def generate_and_save_data():
|
31 |
global file_index, current_file, file_paths, combined_tokens
|
32 |
+
|
33 |
+
# Create the initial file if it doesn't exist
|
34 |
+
if not os.path.isfile(current_file):
|
35 |
+
pd.DataFrame(columns=["prompt", "response"]).to_csv(current_file, index=False)
|
36 |
+
|
37 |
while True:
|
38 |
try:
|
39 |
# Generate a prompt
|
|
|
42 |
messages=[
|
43 |
{
|
44 |
"role": "user",
|
45 |
+
"content": "give me a single prompt to prompt an ai model, simulating what users could want from you. ensure that it is diverse and high quality. for each, choose a random writing style (though it has to be a common one), random length and random clarity of the prompt. ensure that it is a single prompt, and just the prompt itself, nothing else. eg, don't close the prompt in quotation marks or say Here is a single prompt that meets your requirements or anything similar to that"
|
46 |
}
|
47 |
],
|
48 |
temperature=1,
|
|
|
56 |
prompt_tokens = 0
|
57 |
for chunk in completion:
|
58 |
content = chunk.choices[0].delta.content
|
59 |
+
if content:
|
60 |
prompt += content
|
61 |
+
prompt_tokens += len(content.split())
|
62 |
|
63 |
# Use the generated prompt to query the model again
|
64 |
second_completion = client.chat.completions.create(
|
|
|
70 |
}
|
71 |
],
|
72 |
temperature=1,
|
73 |
+
max_tokens=5000,
|
74 |
top_p=1,
|
75 |
stream=True,
|
76 |
stop=None,
|
|
|
80 |
response_tokens = 0
|
81 |
for chunk in second_completion:
|
82 |
content = chunk.choices[0].delta.content
|
83 |
+
if content:
|
84 |
response += content
|
85 |
+
response_tokens += len(content.split())
|
86 |
|
87 |
# Update the combined token count
|
88 |
combined_tokens += (prompt_tokens + response_tokens)
|
|
|
95 |
data = pd.DataFrame({"prompt": [prompt], "response": [response]})
|
96 |
|
97 |
# Check the size of the current file
|
98 |
+
if get_file_size(current_file) >= MAX_SIZE:
|
99 |
file_index += 1
|
100 |
+
current_file = os.path.join(DATA_DIRECTORY, f'data{file_index}.csv')
|
101 |
file_paths.append(current_file)
|
102 |
+
# Create the new file with headers
|
103 |
+
with open(current_file, 'w') as f:
|
104 |
+
data.to_csv(f, header=True, index=False)
|
|
|
|
|
|
|
|
|
105 |
else:
|
106 |
+
# Append data to the current file
|
107 |
+
with open(current_file, 'a') as f:
|
108 |
+
data.to_csv(f, header=False, index=False)
|
109 |
|
110 |
# Wait for the next update interval
|
111 |
+
time.sleep(UPDATE_INTERVAL)
|
112 |
|
113 |
except Exception as e:
|
114 |
print(f"An error occurred: {e}. Retrying in 5 seconds...")
|
115 |
time.sleep(5)
|
116 |
|
117 |
+
# Get available files
def get_available_files():
    """Return the tracked data-file paths that currently exist on disk."""
    return list(filter(os.path.isfile, file_paths))
|
120 |
|
121 |
+
# Update file list
def update_file_list():
    """Build a Gradio update that refreshes the dropdown's file choices."""
    available = get_available_files()
    return gr.update(choices=available)
|
124 |
|
125 |
+
# Update token count
def update_token_count():
    """Return the running total of prompt + response tokens generated so far."""
    return combined_tokens
|
128 |
|
129 |
+
# Display file content
def display_file_content(selected_file):
    """Load the selected CSV into a DataFrame for the viewer.

    Parameters:
        selected_file: path chosen in the dropdown; may be empty/None.

    Returns:
        The file's contents as a DataFrame, or an empty DataFrame when
        nothing is selected or the file no longer exists (e.g. it was
        rotated away by the writer thread) — the UI callback must never
        raise FileNotFoundError.
    """
    if not selected_file or not os.path.isfile(selected_file):
        return pd.DataFrame()
    return pd.read_csv(selected_file)
|
134 |
+
|
135 |
# Start the data generation in a separate thread
|
136 |
thread = threading.Thread(target=generate_and_save_data)
|
137 |
thread.daemon = True
|
|
|
142 |
gr.Markdown("## AI Prompt and Response Generator")
|
143 |
gr.Markdown("This app continuously generates AI prompts and responses, and writes them to CSV files.")
|
144 |
|
145 |
+
file_selector = gr.Dropdown(label="Select a data file to view and download", choices=get_available_files())
|
146 |
+
file_viewer = gr.DataFrame(label="CSV File Content")
|
147 |
download_button = gr.File(label="Download Selected File")
|
148 |
|
149 |
def download_file(selected_file):
|
|
|
151 |
|
152 |
refresh_button = gr.Button("Refresh File List")
|
153 |
refresh_button.click(update_file_list, outputs=file_selector)
|
154 |
+
file_selector.change(display_file_content, inputs=file_selector, outputs=file_viewer)
|
155 |
file_selector.change(download_file, inputs=file_selector, outputs=download_button)
|
156 |
|
157 |
token_display = gr.Textbox(label="Combined Tokens", value=str(update_token_count()), interactive=False)
|