oscarwang2 committed
Commit 50f4808
1 Parent(s): 0b023c6

Create app.py

Files changed (1)
  1. app.py +135 -0
app.py ADDED
@@ -0,0 +1,135 @@
import os
import threading
import time

import gradio as gr
import pandas as pd
from groq import Groq

# Groq() reads the GROQ_API_KEY environment variable for authentication.
client = Groq()

max_size = 1.1 * 1024 * 1024 * 1024  # 1.1 GB in bytes
file_index = 1
current_file = f'data{file_index}.csv'
file_paths = [current_file]
combined_tokens = 0

def get_file_size(filename):
    if os.path.isfile(filename):
        return os.path.getsize(filename)
    return 0

def generate_and_save_data():
    global file_index, current_file, file_paths, combined_tokens
    while True:
        try:
            # Generate a prompt
            completion = client.chat.completions.create(
                model="llama-3.1-70b-versatile",
                messages=[
                    {
                        "role": "user",
                        "content": (
                            "give me a single prompt to prompt an ai model, simulating what "
                            "users could want from you. ensure that it is diverse and high "
                            "quality. for each, choose a random writing style (though it has "
                            "to be a common one), random length and random clarity of the "
                            "prompt. ensure that it is a single prompt, and just the prompt "
                            "itself, nothing else. e.g., don't enclose the prompt in "
                            "quotation marks or say Here is a single prompt that meets your "
                            "requirements or anything similar to that"
                        )
                    }
                ],
                temperature=1,
                max_tokens=1024,
                top_p=1,
                stream=True,
                stop=None,
            )

            prompt = ""
            prompt_tokens = 0
            for chunk in completion:
                content = chunk.choices[0].delta.content or ""  # delta.content can be None
                prompt += content
                prompt_tokens += len(content.split())  # Approximating tokens as words for simplicity

            # Use the generated prompt to query the model again
            second_completion = client.chat.completions.create(
                model="llama-3.1-70b-versatile",
                messages=[
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                temperature=1,
                max_tokens=8000,
                top_p=1,
                stream=True,
                stop=None,
            )

            response = ""
            response_tokens = 0
            for chunk in second_completion:
                content = chunk.choices[0].delta.content or ""  # delta.content can be None
                response += content
                response_tokens += len(content.split())  # Approximating tokens as words for simplicity

            # Update the combined token count
            combined_tokens += prompt_tokens + response_tokens

            # Print the generated prompt and the response
            print("Generated prompt:", prompt)
            print("Response to the generated prompt:", response)

            # Create a DataFrame with the prompt and response
            data = pd.DataFrame({"prompt": [prompt], "response": [response]})

            # Rotate to a new file once the current one reaches the size limit
            if get_file_size(current_file) >= max_size:
                file_index += 1
                current_file = f'data{file_index}.csv'
                file_paths.append(current_file)

            # Append if the file already exists; otherwise create it with a header row
            if os.path.isfile(current_file):
                data.to_csv(current_file, mode='a', header=False, index=False)
            else:
                data.to_csv(current_file, mode='w', header=True, index=False)

        except Exception as e:
            print(f"An error occurred: {e}. Retrying in 5 seconds...")
            time.sleep(5)

def get_available_files():
    return [f for f in file_paths if os.path.isfile(f)]

def update_file_list():
    # gr.update works across Gradio versions; gr.Dropdown.update was removed in Gradio 4
    return gr.update(choices=get_available_files())

def update_token_count():
    return combined_tokens

# Start the data generation in a daemon thread so it exits with the app
thread = threading.Thread(target=generate_and_save_data)
thread.daemon = True
thread.start()

# Create Gradio interface
with gr.Blocks() as app:
    gr.Markdown("## AI Prompt and Response Generator")
    gr.Markdown("This app continuously generates AI prompts and responses, and writes them to CSV files.")

    file_selector = gr.Dropdown(label="Select a data file to download", choices=get_available_files())
    download_button = gr.File(label="Download Selected File")

    def download_file(selected_file):
        return selected_file

    refresh_button = gr.Button("Refresh File List")
    refresh_button.click(update_file_list, outputs=file_selector)
    file_selector.change(download_file, inputs=file_selector, outputs=download_button)

    token_display = gr.Textbox(label="Combined Tokens", value=str(update_token_count()), interactive=False)

    def update_token_display():
        return str(update_token_count())

    # Refresh the combined token count on demand
    token_refresh = gr.Button("Refresh Token Count")
    token_refresh.click(update_token_display, outputs=token_display)

app.launch()
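
For a quick sanity check once the app has been running, the generated CSVs can be read back with pandas. A minimal sketch, assuming `data1.csv` already exists in the working directory (the filename and the "prompt"/"response" columns come from the script above):

import pandas as pd

# Load the first data file written by the app
df = pd.read_csv("data1.csv")
print(len(df), "prompt/response pairs collected so far")
print(df.head())  # columns: "prompt", "response"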