remiai3 committed (verified)
Commit 6e793bd · Parent: dc96979

Upload 9 files

Files changed (9)
  1. Figure_1.png +0 -0
  2. all_in_one.py +115 -0
  3. app.py +39 -0
  4. custom_dataset.jsonl +50 -0
  5. document.txt +92 -0
  6. finetune_codegen.py +129 -0
  7. requirements.txt +7 -0
  8. templates/index.html +77 -0
  9. test.py +34 -0
Figure_1.png ADDED
all_in_one.py ADDED
@@ -0,0 +1,115 @@
+ import os
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
+ from datasets import load_from_disk
+ import matplotlib.pyplot as plt
+
+ # Set Hugging Face token (replace with your actual token)
+ os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"  # Replace with your HF_TOKEN
+
+ # Download model and tokenizer
+ model_name = "Salesforce/codegen-350M-multi"
+ local_model_path = "./codegen_model"
+ tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
+ # Load in full precision: float16 weights are unreliable for CPU-only training
+ # (finetune_codegen.py drops torch_dtype=torch.float16 for the same reason).
+ model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=local_model_path)
+
+ # Set padding token
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Move model to CPU
+ device = torch.device("cpu")
+ model.to(device)
+
+ # Load custom dataset (a dataset previously saved to disk; see the note after
+ # this file for converting custom_dataset.jsonl into that format)
+ dataset_path = "./custom_dataset"
+ dataset = load_from_disk(dataset_path)
+
+ # Tokenize dataset
+ def tokenize_function(examples):
+     inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
+     return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)
+
+ tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])
+
+ # Data collator for language modeling
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ # Define training arguments
+ training_args = TrainingArguments(
+     output_dir="./finetuned_codegen",
+     overwrite_output_dir=True,
+     num_train_epochs=5,  # Increased epochs for better fine-tuning
+     per_device_train_batch_size=1,
+     gradient_accumulation_steps=4,
+     save_steps=500,
+     save_total_limit=2,
+     logging_steps=100,
+     learning_rate=5e-5,
+     fp16=False,
+     no_cuda=True,
+     dataloader_pin_memory=False,
+ )
+
+ # Custom callback to store training loss
+ class LossCallback(TrainerCallback):
+     def __init__(self):
+         self.losses = []
+
+     def on_log(self, args, state, control, logs=None, **kwargs):
+         if logs and "loss" in logs:
+             self.losses.append(logs["loss"])
+
+ loss_callback = LossCallback()
+
+ # Initialize Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset,
+     data_collator=data_collator,
+     callbacks=[loss_callback],
+ )
+
+ # Start fine-tuning
+ print("Starting fine-tuning...")
+ trainer.train()
+
+ # Save fine-tuned model
+ model.save_pretrained("./finetuned_codegen")
+ tokenizer.save_pretrained("./finetuned_codegen")
+
+ # Plot training loss
+ plt.plot(loss_callback.losses, label="Training Loss")
+ plt.xlabel("Steps")
+ plt.ylabel("Loss")
+ plt.title("Fine-Tuning Loss Curve")
+ plt.legend()
+ plt.savefig("./finetuned_codegen/loss_plot.png")
+ plt.show()
+
+ print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png")
+
+ # Test fine-tuned model
+ print("\nTesting fine-tuned model...")
+ prompts = [
+     "Write a Python program to print 'Hello, World!'",
+     "Write a Python function to add two numbers.",
+     "Write a Python function to subtract two numbers.",
+     "Write a Python function to calculate factorial of a number",
+     "Write a Python function to check if a number is prime",
+     "Write a Python function to reverse a string"
+ ]
+
+ for prompt in prompts:
+     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
+     outputs = model.generate(
+         **inputs,
+         max_length=200,
+         num_return_sequences=1,
+         pad_token_id=tokenizer.eos_token_id,
+         do_sample=True,
+         temperature=0.7,
+         top_p=0.9,
+     )
+     generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-'*50}")
app.py ADDED
@@ -0,0 +1,39 @@
+ from flask import Flask, render_template, request
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ app = Flask(__name__)
+
+ # Load fine-tuned model and tokenizer
+ model_path = "./finetuned_codegen"
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ # Load in full precision: float16 inference is not reliably supported on CPU
+ # (finetune_codegen.py drops torch_dtype=torch.float16 for the same reason).
+ model = AutoModelForCausalLM.from_pretrained(model_path)
+
+ # Set padding token
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Move model to CPU
+ device = torch.device("cpu")
+ model.to(device)
+
+ @app.route("/", methods=["GET", "POST"])
+ def index():
+     generated_code = ""
+     prompt = ""
+     if request.method == "POST":
+         prompt = request.form["prompt"]
+         inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
+         outputs = model.generate(
+             **inputs,
+             max_length=200,
+             num_return_sequences=1,
+             pad_token_id=tokenizer.eos_token_id,
+             do_sample=True,
+             temperature=0.7,
+             top_p=0.9,
+         )
+         generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return render_template("index.html", generated_code=generated_code, prompt=prompt)
+
+ if __name__ == "__main__":
+     app.run(debug=True)
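Once app.py is running, the endpoint can be smoke-tested from a second terminal. A minimal sketch (assuming the requests package, which is not in requirements.txt, and the default Flask address http://127.0.0.1:5000):

    import requests

    # POST a prompt to the form handler; the generated code comes back
    # embedded in the HTML rendered by templates/index.html.
    resp = requests.post(
        "http://127.0.0.1:5000/",
        data={"prompt": "Write a Python function to add two numbers."},
    )
    print(resp.status_code)  # expect 200
    print("add_numbers" in resp.text)  # may vary: decoding is sampled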
custom_dataset.jsonl ADDED
@@ -0,0 +1,50 @@
+ {"prompt": "Write a Python program to print 'Hello, World!'", "code": "print('Hello, World!')"}
+ {"prompt": "Write a Python function to add two numbers.", "code": "def add_numbers(a, b):\n    return a + b"}
+ {"prompt": "Write a Python function to subtract two numbers.", "code": "def subtract_numbers(a, b):\n    return a - b"}
+ {"prompt": "Write a Python function to multiply two numbers.", "code": "def multiply_numbers(a, b):\n    return a * b"}
+ {"prompt": "Write a Python function to divide two numbers.", "code": "def divide_numbers(a, b):\n    if b == 0:\n        return 'Error: Division by zero'\n    return a / b"}
+ {"prompt": "Write a Python function to calculate the area of a rectangle.", "code": "def rectangle_area(length, width):\n    return length * width"}
+ {"prompt": "Write a Python function to calculate the circumference of a circle.", "code": "import math\ndef circle_circumference(radius):\n    return 2 * math.pi * radius"}
+ {"prompt": "Write a Python function to calculate the area of a circle.", "code": "import math\ndef circle_area(radius):\n    return math.pi * radius**2"}
+ {"prompt": "Write a Python function to convert Celsius to Fahrenheit.", "code": "def celsius_to_fahrenheit(celsius):\n    return (celsius * 9/5) + 32"}
+ {"prompt": "Write a Python function to convert Fahrenheit to Celsius.", "code": "def fahrenheit_to_celsius(fahrenheit):\n    return (fahrenheit - 32) * 5/9"}
+ {"prompt": "Write a Python function to check if a number is even.", "code": "def is_even(n):\n    return n % 2 == 0"}
+ {"prompt": "Write a Python function to check if a number is odd.", "code": "def is_odd(n):\n    return n % 2 != 0"}
+ {"prompt": "Write a Python function to find the maximum of two numbers.", "code": "def find_max(a, b):\n    if a > b:\n        return a\n    else:\n        return b"}
+ {"prompt": "Write a Python function to find the minimum of two numbers.", "code": "def find_min(a, b):\n    if a < b:\n        return a\n    else:\n        return b"}
+ {"prompt": "Write a Python function to calculate the sum of numbers in a list.", "code": "def sum_list(numbers):\n    total = 0\n    for num in numbers:\n        total += num\n    return total"}
+ {"prompt": "Write a Python function to calculate the average of numbers in a list.", "code": "def average_list(numbers):\n    if not numbers:\n        return 0\n    return sum(numbers) / len(numbers)"}
+ {"prompt": "Write a Python function to find the largest number in a list.", "code": "def find_largest(numbers):\n    if not numbers:\n        return None\n    largest = numbers[0]\n    for num in numbers:\n        if num > largest:\n            largest = num\n    return largest"}
+ {"prompt": "Write a Python function to find the smallest number in a list.", "code": "def find_smallest(numbers):\n    if not numbers:\n        return None\n    smallest = numbers[0]\n    for num in numbers:\n        if num < smallest:\n            smallest = num\n    return smallest"}
+ {"prompt": "Write a Python function to reverse a string.", "code": "def reverse_string(s):\n    return s[::-1]"}
+ {"prompt": "Write a Python function to check if a string is a palindrome.", "code": "def is_palindrome(s):\n    return s == s[::-1]"}
+ {"prompt": "Write a Python function to count the number of vowels in a string.", "code": "def count_vowels(s):\n    vowels = 'aeiouAEIOU'\n    count = 0\n    for char in s:\n        if char in vowels:\n            count += 1\n    return count"}
+ {"prompt": "Write a Python function to convert a string to uppercase.", "code": "def to_uppercase(s):\n    return s.upper()"}
+ {"prompt": "Write a Python function to convert a string to lowercase.", "code": "def to_lowercase(s):\n    return s.lower()"}
+ {"prompt": "Write a Python function to find the length of a string.", "code": "def string_length(s):\n    return len(s)"}
+ {"prompt": "Write a Python function to check if a list is empty.", "code": "def is_list_empty(lst):\n    return len(lst) == 0"}
+ {"prompt": "Write a Python function to append an element to a list.", "code": "def append_to_list(lst, element):\n    lst.append(element)\n    return lst"}
+ {"prompt": "Write a Python function to remove an element from a list.", "code": "def remove_from_list(lst, element):\n    if element in lst:\n        lst.remove(element)\n    return lst"}
+ {"prompt": "Write a Python function to sort a list of numbers in ascending order.", "code": "def sort_list_ascending(numbers):\n    return sorted(numbers)"}
+ {"prompt": "Write a Python function to sort a list of numbers in descending order.", "code": "def sort_list_descending(numbers):\n    return sorted(numbers, reverse=True)"}
+ {"prompt": "Write a Python function to find the index of an element in a list.", "code": "def find_index(lst, element):\n    try:\n        return lst.index(element)\n    except ValueError:\n        return -1"}
+ {"prompt": "Write a Python function to check if an element exists in a list.", "code": "def element_exists(lst, element):\n    return element in lst"}
+ {"prompt": "Write a Python function to get the first element of a list.", "code": "def get_first_element(lst):\n    if lst:\n        return lst[0]\n    else:\n        return None"}
+ {"prompt": "Write a Python function to get the last element of a list.", "code": "def get_last_element(lst):\n    if lst:\n        return lst[-1]\n    else:\n        return None"}
+ {"prompt": "Write a Python function to create a dictionary from two lists (keys and values).", "code": "def create_dictionary(keys, values):\n    return dict(zip(keys, values))"}
+ {"prompt": "Write a Python function to get a value from a dictionary by its key.", "code": "def get_dictionary_value(d, key):\n    return d.get(key)"}
+ {"prompt": "Write a Python function to add a new key-value pair to a dictionary.", "code": "def add_to_dictionary(d, key, value):\n    d[key] = value\n    return d"}
+ {"prompt": "Write a Python function to remove a key-value pair from a dictionary.", "code": "def remove_from_dictionary(d, key):\n    if key in d:\n        del d[key]\n    return d"}
+ {"prompt": "Write a Python function to check if a key exists in a dictionary.", "code": "def key_exists_in_dictionary(d, key):\n    return key in d"}
+ {"prompt": "Write a Python program to get user input and print it.", "code": "user_input = input('Enter something: ')\nprint('You entered:', user_input)"}
+ {"prompt": "Write a Python function to greet a user by name.", "code": "def greet_user(name):\n    return f'Hello, {name}!'"}
+ {"prompt": "Write a Python function to calculate the square of a number.", "code": "def square_number(n):\n    return n * n"}
+ {"prompt": "Write a Python function to calculate the cube of a number.", "code": "def cube_number(n):\n    return n ** 3"}
+ {"prompt": "Write a Python function to check if a year is a leap year.", "code": "def is_leap_year(year):\n    if (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0):\n        return True\n    else:\n        return False"}
+ {"prompt": "Write a Python function to count occurrences of a character in a string.", "code": "def count_character(s, char):\n    return s.count(char)"}
+ {"prompt": "Write a Python function to find the absolute value of a number.", "code": "def absolute_value(n):\n    return abs(n)"}
+ {"prompt": "Write a Python function to generate a sequence of numbers.", "code": "def generate_sequence(start, end, step):\n    return list(range(start, end, step))"}
+ {"prompt": "Write a Python function to check if a list contains duplicates.", "code": "def has_duplicates(lst):\n    return len(lst) != len(set(lst))"}
+ {"prompt": "Write a Python function to get the current date.", "code": "from datetime import date\ndef get_current_date():\n    return date.today()"}
+ {"prompt": "Write a Python function to get the current time.", "code": "from datetime import datetime\ndef get_current_time():\n    return datetime.now().time()"}
+ {"prompt": "Write a Python function to simulate a simple coin flip (Heads or Tails).", "code": "import random\ndef coin_flip():\n    return random.choice(['Heads', 'Tails'])"}
document.txt ADDED
@@ -0,0 +1,92 @@
+ Text-to-Code Generator using CodeGen-350M-Multi
+ ===============================================
+
+ This project provides a text-to-code generator using a fine-tuned Salesforce/codegen-350M-multi
+ model, designed to run on low-end laptops (8GB RAM, CPU-only) so that students can experiment with
+ AI model development. The model is fine-tuned on a custom dataset and includes a Flask web interface
+ for easy interaction. All resources are open-source under the Apache-2.0 license, with attribution
+ to the original model by Salesforce.
+
+ Do's and Setup Process
+ ----------------------
+ 1. **System Requirements**:
+    - Laptop with at least 8GB RAM and 2GB free disk space.
+    - Windows, macOS, or Linux (CPU-only, no GPU required).
+    - Internet connection for the initial model download.
+
+ 2. **Install Python**:
+    - Use Python 3.10.9. Download from https://www.python.org/downloads/release/python-3109/.
+    - Verify installation: `python --version`.
+
+ 3. **Clone or Download Repository**:
+    - Download the project files from the Hugging Face repository:
+      https://huggingface.co/remiai3/text-to-code-using-codegen-project.
+    - Extract the files to a folder (e.g., `text-to-code-codegen`).
+
+ 4. **Set Up Virtual Environment**:
+    - Open a terminal in the project folder.
+    - Create a virtual environment: `python -m venv venv`.
+    - Activate it:
+      - Windows: `venv\Scripts\activate`
+      - macOS/Linux: `source venv/bin/activate`
+
+ 5. **Install Dependencies**:
+    - Run: `pip install -r requirements.txt`.
+    - Required libraries: torch, transformers, datasets, accelerate, protobuf, matplotlib, flask.
+      NOTE: If matplotlib 3.7.2 is not compatible with your setup, remove its version pin. Likewise,
+      if any other library conflicts with your Python version or with previously installed packages,
+      remove the version pins from requirements.txt and install the libraries by name only; pip will
+      then install a default compatible version of each library.
+
+ 6. **Prepare Custom Dataset**:
+    - Ensure the `custom_dataset.jsonl` file exists in the project folder.
+    - Format: Each line is a JSON object with `prompt` (natural language) and `code` (Python code).
+    - Example:
+      {"prompt": "Write a Python program to print 'Hello, World!'", "code": "print('Hello, World!')"}
+      {"prompt": "Write a Python function to add two numbers.", "code": "def add_numbers(a, b):\n    return a + b"}
+
+ 7. **Run the Model**:
+    - Option 1: Run the full pipeline (download, fine-tune, test):
+      - Update `all_in_one.py` with your Hugging Face token (`HF_TOKEN`).
+      - Run: `python all_in_one.py`.
+      - This downloads the model, fine-tunes it, tests it, and generates a loss plot.
+    - Option 2: Test the fine-tuned model directly:
+      - Run: `python test.py` to test with sample prompts.
+    - Option 3: Use the web interface:
+      - Run: `python app.py`.
+      - Open a browser and go to `http://127.0.0.1:5000`.
+
+ 8. **Using the AI Model**:
+    - **Command Line Testing**: Use `test.py` to input prompts and generate Python code.
+    - **Web Interface**: Use the Flask app (`app.py`) to enter prompts via a browser and view generated code.
+    - Example prompts:
+      - "Write a Python function to calculate factorial of a number"
+      - "Write a Python function to check if a number is prime"
+    - Output is saved in `./finetuned_codegen/loss_plot.png` (loss plot) and `./finetuned_codegen`
+      (model weights).
+
+ 9. **Model Details**:
+    - Model: Salesforce/codegen-350M-multi (Apache-2.0 license).
+    - Source: https://huggingface.co/Salesforce/codegen-350M-multi.
+    - Fine-tuned on a custom dataset for Python code generation.
+    - Attribution: This project uses the Salesforce CodeGen model, fine-tuned by remiai3 for
+      educational purposes.
+
+ 10. **Troubleshooting**:
+    - Ensure ~2GB of free disk space for model weights.
+    - If memory issues occur, reduce the dataset size or batch size in `all_in_one.py`.
+    - Check terminal output for errors and ensure all files (`custom_dataset.jsonl`,
+      `finetuned_codegen`) are in place.
+
+ 11. **Contributing**:
+    - Add more examples to `custom_dataset.jsonl` to improve model performance.
+    - Share feedback or improvements via the Hugging Face repository:
+      https://huggingface.co/remiai3.
+
+ Attribution
+ -----------
+ This project is built using the Salesforce/codegen-350M-multi model, licensed under Apache-2.0.
+ The fine-tuned model and resources are provided by remiai3 for free educational use to help students
+ learn and experiment with AI models.
finetune_codegen.py ADDED
@@ -0,0 +1,129 @@
+ import os
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
+ from datasets import load_dataset
+ import matplotlib.pyplot as plt
+
+ # Set Hugging Face token (replace with your actual token)
+ os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
+
+ # Recommended for download stability, if you had issues before
+ os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600"  # 10 minutes timeout
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # Enable robust downloader
+
+ # Download model and tokenizer
+ model_name = "Salesforce/codegen-350M-multi"
+ local_model_path = "./codegen_model"
+
+ print(f"Attempting to download/load tokenizer from {model_name} to {local_model_path}...")
+ tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
+ print("Tokenizer loaded.")
+
+ print(f"Attempting to download/load model from {model_name} to {local_model_path}...")
+ # Removed torch_dtype=torch.float16 as it's typically for GPU and might not help on CPU,
+ # and could even cause unexpected behavior on some CPU setups.
+ model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=local_model_path)
+ print("Model loaded.")
+
+ # Set padding token
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Move model to CPU
+ device = torch.device("cpu")
+ model.to(device)
+ print(f"Model moved to {device}.")
+
+ # Load custom dataset from JSONL file
+ dataset_file = "custom_dataset.jsonl"
+ print(f"Loading dataset from {dataset_file}...")
+ dataset = load_dataset('json', data_files=dataset_file, split='train')
+ print("Dataset loaded.")
+ print(f"Dataset size: {len(dataset)} examples.")
+ print(f"First example of dataset: {dataset[0]}")  # Print first example to check data format
+
+ # Tokenize dataset
+ def tokenize_function(examples):
+     inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
+     # --- REDUCED MAX_LENGTH TO SAVE MEMORY ---
+     return tokenizer(inputs, truncation=True, padding="max_length", max_length=64)  # Try 64 or even 32 if 128 is too much
+
+ print("Tokenizing dataset...")
+ tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])
+ print("Dataset tokenized.")
+ print(f"First tokenized example: {tokenized_dataset[0]}")
+
+ # Data collator for language modeling
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ # Define training arguments
+ training_args = TrainingArguments(
+     output_dir="./finetuned_codegen",
+     overwrite_output_dir=True,
+     num_train_epochs=3,
+     # --- AGGRESSIVELY REDUCED BATCH SIZE AND GRADIENT ACCUMULATION FOR CPU ---
+     per_device_train_batch_size=1,
+     gradient_accumulation_steps=1,  # No accumulation, true batch size of 1
+     save_steps=500,
+     save_total_limit=2,
+     logging_steps=10,  # Log more frequently to see if it starts moving
+     learning_rate=5e-5,
+     fp16=False,  # Keep False for CPU
+     use_cpu=True,  # Use this instead of no_cuda=True (requires transformers >= 4.32; on the pinned 4.31.0, use no_cuda=True)
+     dataloader_pin_memory=False,  # Disable pin_memory for CPU
+     report_to="none",  # Disable reporting to avoid potential hangs
+     gradient_checkpointing=True,  # Keep this, it helps with memory on CPU too
+     max_grad_norm=1.0,
+ )
+
+ # Custom callback to store training loss
+ class LossCallback(TrainerCallback):
+     def __init__(self):
+         self.losses = []
+         self.log_steps = []
+
+     def on_log(self, args, state, control, logs=None, **kwargs):
+         if logs and "loss" in logs:
+             self.losses.append(logs["loss"])
+             self.log_steps.append(state.global_step)
+             print(f"Step {state.global_step}: Loss = {logs['loss']:.4f}")
+
+ loss_callback = LossCallback()
+
+ # Initialize Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset,
+     data_collator=data_collator,
+     callbacks=[loss_callback],
+ )
+
+ # Start fine-tuning
+ print("Starting fine-tuning...")
+ print("WARNING: Training on CPU will be extremely slow. The 0% progress bar might take a very long time to update.")
+ print("Please monitor your system's RAM and CPU usage.")
+ trainer.train()
+ print("Fine-tuning finished.")
+
+ # Save fine-tuned model
+ model.save_pretrained("./finetuned_codegen")
+ tokenizer.save_pretrained("./finetuned_codegen")
+ print("Model fine-tuned and saved to ./finetuned_codegen.")
+
+ # Plot training loss
+ if loss_callback.losses:
+     plt.figure(figsize=(10, 6))
+     plt.plot(loss_callback.log_steps, loss_callback.losses, label="Training Loss")
+     plt.xlabel("Steps")
+     plt.ylabel("Loss")
+     plt.title("Fine-Tuning Loss Curve")
+     plt.legend()
+     plt.grid(True)
+     plot_path = "./finetuned_codegen/loss_plot.png"
+     plt.savefig(plot_path)
+     print(f"Loss plot saved to {plot_path}")
+     plt.show()
+ else:
+     print("No training losses recorded to plot.")
+
+ print("Fine-tuning script finished execution.")
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ torch==2.0.1
+ transformers==4.31.0
+ datasets==2.14.4
+ accelerate==0.21.0
+ protobuf==4.23.4
+ matplotlib==3.7.2
+ flask==2.3.2
templates/index.html ADDED
@@ -0,0 +1,77 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>Text-to-Code Generator</title>
+     <style>
+         body {
+             font-family: Arial, sans-serif;
+             margin: 0;
+             padding: 20px;
+             background-color: #f4f4f9;
+         }
+         .container {
+             max-width: 800px;
+             margin: auto;
+             background: white;
+             padding: 20px;
+             border-radius: 8px;
+             box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+         }
+         h1 {
+             text-align: center;
+             color: #333;
+         }
+         textarea {
+             width: 100%;
+             height: 100px;
+             margin-bottom: 10px;
+             padding: 10px;
+             border-radius: 4px;
+             border: 1px solid #ccc;
+         }
+         button {
+             padding: 10px 20px;
+             background-color: #007bff;
+             color: white;
+             border: none;
+             border-radius: 4px;
+             cursor: pointer;
+         }
+         button:hover {
+             background-color: #0056b3;
+         }
+         pre {
+             background-color: #f8f9fa;
+             padding: 10px;
+             border-radius: 4px;
+             overflow-x: auto;
+         }
+         .attribution {
+             margin-top: 20px;
+             font-size: 0.9em;
+             color: #555;
+             text-align: center;
+         }
+     </style>
+ </head>
+ <body>
+     <div class="container">
+         <h1>Text-to-Code Generator</h1>
+         <form method="POST">
+             <textarea name="prompt" placeholder="Enter a prompt (e.g., 'Write a Python function to calculate factorial')" required>{{ prompt }}</textarea>
+             <button type="submit">Generate Code</button>
+         </form>
+         {% if generated_code %}
+         <h2>Generated Code:</h2>
+         <pre>{{ generated_code }}</pre>
+         {% endif %}
+         <div class="attribution">
+             <p>Built with the fine-tuned <a href="https://huggingface.co/Salesforce/codegen-350M-multi" target="_blank">Salesforce/codegen-350M-multi</a> model.</p>
+             <p>Developed by <a href="https://huggingface.co/remiai3" target="_blank">remiai3</a> for educational use. Licensed under Apache-2.0.</p>
+             <p>Free resources for students to experiment with AI model development.</p>
+         </div>
+     </div>
+ </body>
+ </html>
test.py ADDED
@@ -0,0 +1,34 @@
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ # Load fine-tuned model and tokenizer
+ model_path = "./finetuned_codegen"
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ # Load in full precision: float16 inference is not reliably supported on CPU
+ # (finetune_codegen.py drops torch_dtype=torch.float16 for the same reason).
+ model = AutoModelForCausalLM.from_pretrained(model_path)
+
+ # Set padding token
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Move model to CPU
+ device = torch.device("cpu")
+ model.to(device)
+
+ # Test prompts (edit freely; prompts from the dataset also work)
+ prompts = [
+     "Write a Python program to print 'Hello, <your name or any other text>!'"
+ ]
+
+ # Generate code for each prompt
+ for prompt in prompts:
+     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
+     outputs = model.generate(
+         **inputs,
+         max_length=200,
+         num_return_sequences=1,
+         pad_token_id=tokenizer.eos_token_id,
+         do_sample=True,
+         temperature=0.7,
+         top_p=0.9,
+     )
+     generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-'*50}")