File size: 5,861 Bytes
c4aca3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pandas as pd
import json
import os
import random

def generate_sample_datasets(output_dir="./sample_datasets", seed=None):
    """
    Generate sample datasets in CSV, JSONL, and text formats for testing the Gemma fine-tuning UI

    Five hand-written instruction/response pairs are extended with 20
    templated pairs built from randomly chosen topics and question starters.
    The resulting 25 records are then written under nine file layouts
    (four CSV, three JSONL, two plain text) that differ only in column/key
    names and text framing.

    Parameters:
    output_dir (str): Directory to save the sample datasets (created if it
        does not exist)
    seed (int | None): Optional seed for the templated pairs. When given, a
        private random.Random(seed) is used so repeated calls write
        identical files and the global random state is left untouched.
        When None (the default), the module-level random generator is used,
        exactly as before.

    Returns:
    list[str]: File names (not full paths) of the generated datasets, in the
        order they were written
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Both the `random` module and a `random.Random` instance expose
    # `choice`, so either can serve as the source of randomness below.
    rng = random if seed is None else random.Random(seed)

    # Sample data for instruction-response pairs
    instruction_data = [
        {"instruction": "Explain the concept of machine learning in simple terms.",
         "response": "Machine learning is when computers learn from examples rather than being explicitly programmed. It's like teaching a child by showing examples instead of giving exact rules."},
        {"instruction": "What are the health benefits of regular exercise?",
         "response": "Regular exercise improves cardiovascular health, strengthens muscles and bones, helps control weight, reduces stress and anxiety, improves sleep quality, and increases overall energy levels."},
        {"instruction": "How do I make a basic pasta sauce?",
         "response": "For a basic tomato pasta sauce, sauté minced garlic and onions in olive oil until soft. Add canned crushed tomatoes, salt, pepper, and Italian herbs. Simmer for 20-30 minutes, stirring occasionally. Add basil at the end for fresh flavor."},
        {"instruction": "Summarize the water cycle.",
         "response": "The water cycle is the continuous movement of water on, above, and below Earth's surface. It involves evaporation (water turning to vapor), condensation (vapor forming clouds), precipitation (rain or snow), and collection (water returning to bodies of water)."},
        {"instruction": "What is the difference between a simile and a metaphor?",
         "response": "A simile compares two things using 'like' or 'as' (e.g., 'as brave as a lion'). A metaphor directly states that one thing is another (e.g., 'he is a lion in battle'). Both are figurative language techniques used to create vivid descriptions."}
    ]

    # Generate more instruction-response pairs
    topics = ["history", "science", "literature", "cooking", "technology", "health", "travel", "sports", "music", "art"]
    question_starters = ["Explain", "Describe", "How to", "What is", "Why does", "Compare", "Summarize", "List ways to", "Define", "Analyze"]
    subjects = ['concepts', 'principles', 'ideas', 'techniques', 'methods']

    for _ in range(20):
        # Draw order (topic, starter, subject) is kept stable so that a
        # caller who seeds the global generator gets the same records as
        # with earlier versions of this function.
        topic = rng.choice(topics)
        starter = rng.choice(question_starters)
        subject = rng.choice(subjects)
        instruction = f"{starter} {topic.lower()} {subject}"
        response = f"This is a sample response about {topic} that would be more detailed in a real dataset. It would contain multiple sentences explaining {topic} concepts in depth."
        instruction_data.append({"instruction": instruction, "response": response})

    # "Q: ...\nA: ..." framing shared by the text-only CSV and the paragraph file
    qa_blocks = [f"Q: {item['instruction']}\nA: {item['response']}" for item in instruction_data]

    # Map of file name -> payload. The payload type depends on the extension:
    # DataFrame for .csv, list of dicts for .jsonl, str for .txt.
    datasets = {}

    # 1. Create CSV in instruction-response format
    datasets["instruction_response.csv"] = pd.DataFrame(instruction_data)

    # 2. Create CSV in input-output format
    datasets["input_output.csv"] = pd.DataFrame(
        [{"input": item["instruction"], "output": item["response"]} for item in instruction_data]
    )

    # 3. Create CSV in text-only format
    datasets["text_only.csv"] = pd.DataFrame([{"text": block} for block in qa_blocks])

    # 4. Create CSV with non-standard format
    datasets["custom_format.csv"] = pd.DataFrame(
        [{"question": item["instruction"], "answer": item["response"]} for item in instruction_data]
    )

    # 5. Create JSONL in instruction-response format
    datasets["instruction_response.jsonl"] = instruction_data

    # 6. Create JSONL in prompt-completion format
    datasets["prompt_completion.jsonl"] = [
        {"prompt": item["instruction"], "completion": item["response"]} for item in instruction_data
    ]

    # 7. Create JSONL with non-standard format
    datasets["custom_format.jsonl"] = [
        {"query": item["instruction"], "result": item["response"]} for item in instruction_data
    ]

    # 8. Create text format (paragraphs)
    datasets["paragraphs.txt"] = "\n\n".join(qa_blocks)

    # 9. Create text format (single examples per line)
    datasets["single_lines.txt"] = "\n".join(
        f"{item['instruction']} => {item['response']}" for item in instruction_data
    )

    # Save all datasets
    for filename, data in datasets.items():
        file_path = os.path.join(output_dir, filename)

        if filename.endswith('.csv'):
            data.to_csv(file_path, index=False)
        elif filename.endswith('.jsonl'):
            with open(file_path, 'w', encoding='utf-8') as f:
                for item in data:
                    f.write(json.dumps(item) + '\n')
        elif filename.endswith('.txt'):
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(data)

    print(f"Sample datasets generated in {output_dir}")
    return list(datasets.keys())

# if __name__ == "__main__":
#     # Generate sample datasets
#     generated_files = generate_sample_datasets()
#     print(f"Generated {len(generated_files)} sample dataset files:")
#     for file in generated_files:
#         print(f" - {file}")