import pandas as pd
import os
import gradio as gr
import threading
import time
from gradio_client import Client

# Initialize Gradio client
client = Client("Nymbo/Llama-3.1-405B-Instruct")

# Constants
MAX_SIZE = 1.1 * 1024 * 1024 * 1024  # 1.1GB in bytes
DATA_DIRECTORY = 'data'
UPDATE_INTERVAL = 1  # Update interval in seconds

# Ensure the data directory exists
os.makedirs(DATA_DIRECTORY, exist_ok=True)

# Initialize variables
file_index = 1
current_file = os.path.join(DATA_DIRECTORY, f'data{file_index}.csv')
file_paths = [current_file]
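# Running total of approximate tokens (word counts) across all generated prompt/response pairs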
combined_tokens = 0

# Helper function to get file size
def get_file_size(filename):
    return os.path.getsize(filename) if os.path.isfile(filename) else 0

# Data generation and saving function
def generate_and_save_data():
    global file_index, current_file, file_paths, combined_tokens
    
    # Create the initial file if it doesn't exist
    if not os.path.isfile(current_file):
        pd.DataFrame(columns=["prompt", "response"]).to_csv(current_file, index=False)

    while True:
        try:
            # Generate a prompt
            prompt_result = client.predict(
                message="give me a single prompt to prompt an ai model, simulating what users could want from you. ensure that it is diverse and high quality. for each, choose a random writing style (though it has to be a common one), random length and random clarity of the prompt. ensure that it is a single prompt, and just the prompt itself, nothing else. eg, don't close the prompt in quotation marks or say Here is a single prompt that meets your requirements or anything similar to that",
                system_message="",
                max_tokens=1024,
                temperature=1,
                top_p=1,
                api_name="/chat"
            )
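            # NOTE: assumes this Space's /chat endpoint returns a dict with a 'message' key;
            # if it returns a plain string, use the result directly instead of indexing it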
            prompt = prompt_result['message']
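            # Approximate the "token" count by whitespace-splitting (a word count, not model tokens)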
            prompt_tokens = len(prompt.split())

            # Use the generated prompt to query the model again
            response_result = client.predict(
                message=prompt,
                system_message="",
                max_tokens=5000,
                temperature=1,
                top_p=1,
                api_name="/chat"
            )
            response = response_result['message']
            response_tokens = len(response.split())

            # Update the combined token count
            combined_tokens += (prompt_tokens + response_tokens)

            # Print the generated prompt and the response
            print("Generated prompt:", prompt)
            print("Response to the generated prompt:", response)

            # Create a DataFrame with the prompt and response
            data = pd.DataFrame({"prompt": [prompt], "response": [response]})

            # Check the size of the current file
            if get_file_size(current_file) >= MAX_SIZE:
                file_index += 1
                current_file = os.path.join(DATA_DIRECTORY, f'data{file_index}.csv')
                file_paths.append(current_file)
                # Create the new file with headers
                data.to_csv(current_file, index=False)
            else:
                # Append data to the current file
                data.to_csv(current_file, mode='a', header=False, index=False)

            # Wait for the next update interval
            time.sleep(UPDATE_INTERVAL)

        except Exception as e:
            print(f"An error occurred: {e}. Retrying in 5 seconds...")
            time.sleep(5)

# Get available files
def get_available_files():
    return [f for f in file_paths if os.path.isfile(f)]

# Update file list
def update_file_list():
    return gr.update(choices=get_available_files())

# Update token count
def update_token_count():
    return combined_tokens

# Display file content
def display_file_content(selected_file):
    if selected_file:
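        # The generator thread may still be appending to this file; show whatever is on disk now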
        return pd.read_csv(selected_file)
    return pd.DataFrame()

# Start the data generation in a separate thread
thread = threading.Thread(target=generate_and_save_data)
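# Daemon thread: it is terminated automatically when the main Gradio process exits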
thread.daemon = True
thread.start()

# Create Gradio interface
with gr.Blocks() as app:
    gr.Markdown("## AI Prompt and Response Generator")
    gr.Markdown("This app continuously generates AI prompts and responses, and writes them to CSV files.")
    
    file_selector = gr.Dropdown(label="Select a data file to view and download", choices=get_available_files())
    file_viewer = gr.DataFrame(label="CSV File Content")
    download_button = gr.File(label="Download Selected File")
    
    def download_file(selected_file):
        return selected_file

    refresh_button = gr.Button("Refresh File List")
    refresh_button.click(update_file_list, outputs=file_selector)
    file_selector.change(display_file_content, inputs=file_selector, outputs=file_viewer)
    file_selector.change(download_file, inputs=file_selector, outputs=download_button)
    
    token_display = gr.Textbox(label="Combined Tokens", value=str(update_token_count()), interactive=False)
    
    def update_token_display():
        return str(update_token_count())

    # Refresh the combined token count on demand
    token_refresh = gr.Button("Refresh Token Count")
    token_refresh.click(update_token_display, outputs=token_display)

app.launch()