Upload 9 files
- Figure_1.png +0 -0
- all_in_one.py +115 -0
- app.py +39 -0
- custom_dataset.jsonl +50 -0
- document.txt +92 -0
- finetune_codegen.py +129 -0
- requirements.txt +7 -0
- templates/index.html +77 -0
- test.py +34 -0
Figure_1.png
ADDED
all_in_one.py
ADDED
@@ -0,0 +1,115 @@
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
from datasets import load_dataset
import matplotlib.pyplot as plt

# Set Hugging Face token (replace with your actual token)
os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"  # Replace with your HF_TOKEN

# Download model and tokenizer
model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
# Load in full precision: float16 weights are meant for GPU inference and can
# fail during CPU training (many half-precision kernels are not implemented on CPU).
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=local_model_path)

# Set padding token (CodeGen has no dedicated pad token)
tokenizer.pad_token = tokenizer.eos_token

# Move model to CPU
device = torch.device("cpu")
model.to(device)

# Load the custom dataset shipped with this project (JSON Lines format)
dataset = load_dataset("json", data_files="custom_dataset.jsonl", split="train")

# Tokenize dataset: each training example is the prompt followed by its code
def tokenize_function(examples):
    inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])

# Data collator for causal language modeling (mlm=False disables masked-LM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./finetuned_codegen",
    overwrite_output_dir=True,
    num_train_epochs=5,  # Increased epochs for better fine-tuning
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=5e-5,
    fp16=False,  # keep False for CPU training
    no_cuda=True,
    dataloader_pin_memory=False,
)

# Custom callback to store training loss
class LossCallback(TrainerCallback):
    def __init__(self):
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            self.losses.append(logs["loss"])

loss_callback = LossCallback()

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)

# Start fine-tuning
print("Starting fine-tuning...")
trainer.train()

# Save fine-tuned model
model.save_pretrained("./finetuned_codegen")
tokenizer.save_pretrained("./finetuned_codegen")

# Plot training loss
plt.plot(loss_callback.losses, label="Training Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Fine-Tuning Loss Curve")
plt.legend()
plt.savefig("./finetuned_codegen/loss_plot.png")
plt.show()

print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png")

# Test fine-tuned model
print("\nTesting fine-tuned model...")
prompts = [
    "Write a Python program to print 'Hello, World!'",
    "Write a Python function to add two numbers.",
    "Write a Python function to subtract two numbers.",
    "Write a Python function to calculate factorial of a number",
    "Write a Python function to check if a number is prime",
    "Write a Python function to reverse a string"
]

for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model.generate(
        **inputs,
        max_length=200,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-'*50}")
app.py
ADDED
@@ -0,0 +1,39 @@
from flask import Flask, render_template, request
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

app = Flask(__name__)

# Load fine-tuned model and tokenizer
model_path = "./finetuned_codegen"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Load in full precision: half-precision inference is unreliable on CPU
model = AutoModelForCausalLM.from_pretrained(model_path)

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Move model to CPU
device = torch.device("cpu")
model.to(device)

@app.route("/", methods=["GET", "POST"])
def index():
    generated_code = ""
    prompt = ""
    if request.method == "POST":
        prompt = request.form["prompt"]
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        outputs = model.generate(
            **inputs,
            max_length=200,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )
        generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return render_template("index.html", generated_code=generated_code, prompt=prompt)

if __name__ == "__main__":
    app.run(debug=True)
custom_dataset.jsonl
ADDED
@@ -0,0 +1,50 @@
{"prompt": "Write a Python program to print 'Hello, World!'", "code": "print('Hello, World!')"}
{"prompt": "Write a Python function to add two numbers.", "code": "def add_numbers(a, b):\n    return a + b"}
{"prompt": "Write a Python function to subtract two numbers.", "code": "def subtract_numbers(a, b):\n    return a - b"}
{"prompt": "Write a Python function to multiply two numbers.", "code": "def multiply_numbers(a, b):\n    return a * b"}
{"prompt": "Write a Python function to divide two numbers.", "code": "def divide_numbers(a, b):\n    if b == 0:\n        return 'Error: Division by zero'\n    return a / b"}
{"prompt": "Write a Python function to calculate the area of a rectangle.", "code": "def rectangle_area(length, width):\n    return length * width"}
{"prompt": "Write a Python function to calculate the circumference of a circle.", "code": "import math\ndef circle_circumference(radius):\n    return 2 * math.pi * radius"}
{"prompt": "Write a Python function to calculate the area of a circle.", "code": "import math\ndef circle_area(radius):\n    return math.pi * radius**2"}
{"prompt": "Write a Python function to convert Celsius to Fahrenheit.", "code": "def celsius_to_fahrenheit(celsius):\n    return (celsius * 9/5) + 32"}
{"prompt": "Write a Python function to convert Fahrenheit to Celsius.", "code": "def fahrenheit_to_celsius(fahrenheit):\n    return (fahrenheit - 32) * 5/9"}
{"prompt": "Write a Python function to check if a number is even.", "code": "def is_even(n):\n    return n % 2 == 0"}
{"prompt": "Write a Python function to check if a number is odd.", "code": "def is_odd(n):\n    return n % 2 != 0"}
{"prompt": "Write a Python function to find the maximum of two numbers.", "code": "def find_max(a, b):\n    if a > b:\n        return a\n    else:\n        return b"}
{"prompt": "Write a Python function to find the minimum of two numbers.", "code": "def find_min(a, b):\n    if a < b:\n        return a\n    else:\n        return b"}
{"prompt": "Write a Python function to calculate the sum of numbers in a list.", "code": "def sum_list(numbers):\n    total = 0\n    for num in numbers:\n        total += num\n    return total"}
{"prompt": "Write a Python function to calculate the average of numbers in a list.", "code": "def average_list(numbers):\n    if not numbers:\n        return 0\n    return sum(numbers) / len(numbers)"}
{"prompt": "Write a Python function to find the largest number in a list.", "code": "def find_largest(numbers):\n    if not numbers:\n        return None\n    largest = numbers[0]\n    for num in numbers:\n        if num > largest:\n            largest = num\n    return largest"}
{"prompt": "Write a Python function to find the smallest number in a list.", "code": "def find_smallest(numbers):\n    if not numbers:\n        return None\n    smallest = numbers[0]\n    for num in numbers:\n        if num < smallest:\n            smallest = num\n    return smallest"}
{"prompt": "Write a Python function to reverse a string.", "code": "def reverse_string(s):\n    return s[::-1]"}
{"prompt": "Write a Python function to check if a string is a palindrome.", "code": "def is_palindrome(s):\n    return s == s[::-1]"}
{"prompt": "Write a Python function to count the number of vowels in a string.", "code": "def count_vowels(s):\n    vowels = 'aeiouAEIOU'\n    count = 0\n    for char in s:\n        if char in vowels:\n            count += 1\n    return count"}
{"prompt": "Write a Python function to convert a string to uppercase.", "code": "def to_uppercase(s):\n    return s.upper()"}
{"prompt": "Write a Python function to convert a string to lowercase.", "code": "def to_lowercase(s):\n    return s.lower()"}
{"prompt": "Write a Python function to find the length of a string.", "code": "def string_length(s):\n    return len(s)"}
{"prompt": "Write a Python function to check if a list is empty.", "code": "def is_list_empty(lst):\n    return len(lst) == 0"}
{"prompt": "Write a Python function to append an element to a list.", "code": "def append_to_list(lst, element):\n    lst.append(element)\n    return lst"}
{"prompt": "Write a Python function to remove an element from a list.", "code": "def remove_from_list(lst, element):\n    if element in lst:\n        lst.remove(element)\n    return lst"}
{"prompt": "Write a Python function to sort a list of numbers in ascending order.", "code": "def sort_list_ascending(numbers):\n    return sorted(numbers)"}
{"prompt": "Write a Python function to sort a list of numbers in descending order.", "code": "def sort_list_descending(numbers):\n    return sorted(numbers, reverse=True)"}
{"prompt": "Write a Python function to find the index of an element in a list.", "code": "def find_index(lst, element):\n    try:\n        return lst.index(element)\n    except ValueError:\n        return -1"}
{"prompt": "Write a Python function to check if an element exists in a list.", "code": "def element_exists(lst, element):\n    return element in lst"}
{"prompt": "Write a Python function to get the first element of a list.", "code": "def get_first_element(lst):\n    if lst:\n        return lst[0]\n    else:\n        return None"}
{"prompt": "Write a Python function to get the last element of a list.", "code": "def get_last_element(lst):\n    if lst:\n        return lst[-1]\n    else:\n        return None"}
{"prompt": "Write a Python function to create a dictionary from two lists (keys and values).", "code": "def create_dictionary(keys, values):\n    return dict(zip(keys, values))"}
{"prompt": "Write a Python function to get a value from a dictionary by its key.", "code": "def get_dictionary_value(d, key):\n    return d.get(key)"}
{"prompt": "Write a Python function to add a new key-value pair to a dictionary.", "code": "def add_to_dictionary(d, key, value):\n    d[key] = value\n    return d"}
{"prompt": "Write a Python function to remove a key-value pair from a dictionary.", "code": "def remove_from_dictionary(d, key):\n    if key in d:\n        del d[key]\n    return d"}
{"prompt": "Write a Python function to check if a key exists in a dictionary.", "code": "def key_exists_in_dictionary(d, key):\n    return key in d"}
{"prompt": "Write a Python program to get user input and print it.", "code": "user_input = input('Enter something: ')\nprint('You entered:', user_input)"}
{"prompt": "Write a Python function to greet a user by name.", "code": "def greet_user(name):\n    return f'Hello, {name}!'"}
{"prompt": "Write a Python function to calculate the square of a number.", "code": "def square_number(n):\n    return n * n"}
{"prompt": "Write a Python function to calculate the cube of a number.", "code": "def cube_number(n):\n    return n ** 3"}
{"prompt": "Write a Python function to check if a year is a leap year.", "code": "def is_leap_year(year):\n    if (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0):\n        return True\n    else:\n        return False"}
{"prompt": "Write a Python function to count occurrences of a character in a string.", "code": "def count_character(s, char):\n    return s.count(char)"}
{"prompt": "Write a Python function to find the absolute value of a number.", "code": "def absolute_value(n):\n    return abs(n)"}
{"prompt": "Write a Python function to generate a sequence of numbers.", "code": "def generate_sequence(start, end, step):\n    return list(range(start, end, step))"}
{"prompt": "Write a Python function to check if a list contains duplicates.", "code": "def has_duplicates(lst):\n    return len(lst) != len(set(lst))"}
{"prompt": "Write a Python function to get the current date.", "code": "from datetime import date\ndef get_current_date():\n    return date.today()"}
{"prompt": "Write a Python function to get the current time.", "code": "from datetime import datetime\ndef get_current_time():\n    return datetime.now().time()"}
{"prompt": "Write a Python function to simulate a simple coin flip (Heads or Tails).", "code": "import random\ndef coin_flip():\n    return random.choice(['Heads', 'Tails'])"}
document.txt
ADDED
@@ -0,0 +1,92 @@
Text-to-Code Generator using CodeGen-350M-Multi
=============================================

This project provides a text-to-code generator using a fine-tuned Salesforce/codegen-350M-multi
model, designed to run on low-end laptops (8GB RAM, CPU-only) for students to experiment with AI
model development. The model is fine-tuned on a custom dataset and includes a Flask web interface
for easy interaction. All resources are open-source under the Apache-2.0 license, with attribution
to the original model by Salesforce.


Do's and Setup Process
---------------------
1. **System Requirements**:
   - Laptop with at least 8GB RAM and 2GB free disk space.
   - Windows, macOS, or Linux (CPU-only, no GPU required).
   - Internet connection for initial model download.

2. **Install Python**:
   - Use Python 3.10.9. Download from https://www.python.org/downloads/release/python-3109/.
   - Verify installation: `python --version`.

3. **Clone or Download Repository**:
   - Download the project files from the Hugging Face repository:
     https://huggingface.co/remiai3/text-to-code-using-codegen-project.
   - Extract files to a folder (e.g., `text-to-code-codegen`).

4. **Set Up Virtual Environment**:
   - Open a terminal in the project folder.
   - Create a virtual environment: `python -m venv venv`.
   - Activate it:
     - Windows: `venv\Scripts\activate`
     - macOS/Linux: `source venv/bin/activate`

5. **Install Dependencies**:
   - Run: `pip install -r requirements.txt`.
   - Required libraries: torch, transformers, datasets, accelerate, protobuf, matplotlib, flask.
     NOTE: If matplotlib 3.7.2 (or any other pinned version) is incompatible with your Python
     version or with previously installed packages, remove the version pins from
     `requirements.txt` and reinstall by library name only; pip will then resolve a default
     compatible version of each library, as shown below.

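   For example, a version-free requirements file (an illustrative fallback; the pinned
   file shipped with the project is the default) would contain only:

       torch
       transformers
       datasets
       accelerate
       protobuf
       matplotlib
       flask
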
6. **Prepare Custom Dataset**:
   - Ensure the `custom_dataset.jsonl` file exists in the project folder.
   - Format: Each line is a JSON object with `prompt` (natural language) and `code` (Python code);
     a quick format check is sketched after this item.
   - Example:
     {"prompt": "Write a Python program to print 'Hello, World!'", "code": "print('Hello, World!')"}
     {"prompt": "Write a Python function to add two numbers.", "code": "def add_numbers(a, b):\n    return a + b"}

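   A minimal sketch to verify the dataset before fine-tuning (illustrative only;
   this script is not part of the repository):

       import json

       # Every line must be valid JSON with "prompt" and "code" fields.
       with open("custom_dataset.jsonl", "r", encoding="utf-8") as f:
           for i, line in enumerate(f, start=1):
               record = json.loads(line)  # raises an error on malformed JSON
               assert "prompt" in record and "code" in record, f"line {i} is missing a field"
       print("Dataset format looks valid.")
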
7. **Run the Model**:
   - Option 1: Run the full pipeline (download, fine-tune, test):
     - Update `all_in_one.py` with your Hugging Face token (`HF_TOKEN`).
     - Run: `python all_in_one.py`.
     - This downloads the model, fine-tunes it, tests it, and generates a loss plot.
   - Option 2: Test the fine-tuned model directly:
     - Run: `python test.py` to test with sample prompts.
   - Option 3: Use the web interface:
     - Run: `python app.py`.
     - Open a browser and go to `http://127.0.0.1:5000`.

8. **Using the AI Model**:
   - **Command Line Testing**: Use `test.py` to input prompts and generate Python code.
   - **Web Interface**: Use the Flask app (`app.py`) to enter prompts via a browser and view generated code.
   - Example prompts:
     - "Write a Python function to calculate factorial of a number"
     - "Write a Python function to check if a number is prime"
   - Output is saved in `./finetuned_codegen/loss_plot.png` (loss plot) and `./finetuned_codegen`
     (model weights).

9. **Model Details**:
   - Model: Salesforce/codegen-350M-multi (Apache-2.0 license).
   - Source: https://huggingface.co/Salesforce/codegen-350M-multi.
   - Fine-tuned on a custom dataset for Python code generation.
   - Attribution: This project uses the Salesforce CodeGen model, fine-tuned by remiai3 for
     educational purposes.

10. **Troubleshooting**:
    - Ensure ~2GB disk space for model weights.
    - If memory issues occur, reduce the dataset size or the batch size in `all_in_one.py`
      or `finetune_codegen.py`; see the sketch after this item.
    - Check terminal output for errors and ensure all files (`custom_dataset.jsonl`,
      `finetuned_codegen`) are in place.

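    For example, to fine-tune on a smaller subset of the data (an illustrative
    one-liner using the standard `datasets` API; it is not in the shipped scripts):

        # Add right after the dataset is loaded in the fine-tuning script:
        dataset = dataset.select(range(20))  # keep only the first 20 examples
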
11. **Contributing**:
    - Add more examples to `custom_dataset.jsonl` to improve model performance.
    - Share feedback or improvements via the Hugging Face repository:
      https://huggingface.co/remiai3.

Attribution
-----------
This project is built using the Salesforce/codegen-350M-multi model, licensed under Apache-2.0.
The fine-tuned model and resources are provided by remiai3 for free educational use to help students
learn and experiment with AI models.
finetune_codegen.py
ADDED
@@ -0,0 +1,129 @@
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
from datasets import load_dataset
import matplotlib.pyplot as plt

# Set Hugging Face token (replace with your actual token)
os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

# Recommended for download stability, if you had issues before
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600"  # 10-minute timeout
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # enable the robust downloader (requires the hf_transfer package)

# Download model and tokenizer
model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"

print(f"Attempting to download/load tokenizer from {model_name} to {local_model_path}...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
print("Tokenizer loaded.")

print(f"Attempting to download/load model from {model_name} to {local_model_path}...")
# torch_dtype=torch.float16 is omitted: half precision is typically for GPUs and
# can cause unexpected behavior or errors on some CPU setups.
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=local_model_path)
print("Model loaded.")

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Move model to CPU
device = torch.device("cpu")
model.to(device)
print(f"Model moved to {device}.")

# Load custom dataset from JSONL file
dataset_file = "custom_dataset.jsonl"
print(f"Loading dataset from {dataset_file}...")
dataset = load_dataset("json", data_files=dataset_file, split="train")
print("Dataset loaded.")
print(f"Dataset size: {len(dataset)} examples.")
print(f"First example of dataset: {dataset[0]}")  # print first example to check data format

# Tokenize dataset
def tokenize_function(examples):
    inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
    # Reduced max_length to save memory; try 64 or even 32 if 128 is too much.
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=64)

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])
print("Dataset tokenized.")
print(f"First tokenized example: {tokenized_dataset[0]}")

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./finetuned_codegen",
    overwrite_output_dir=True,
    num_train_epochs=3,
    # Aggressively reduced batch size and gradient accumulation for CPU
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,  # no accumulation, true batch size of 1
    save_steps=500,
    save_total_limit=2,
    logging_steps=10,  # log frequently to confirm training is progressing
    learning_rate=5e-5,
    fp16=False,  # keep False for CPU
    use_cpu=True,  # replaces no_cuda=True on newer transformers; with the pinned 4.31.0, use no_cuda=True instead
    dataloader_pin_memory=False,  # disable pin_memory for CPU
    report_to="none",  # disable reporting to avoid potential hangs
    gradient_checkpointing=True,  # helps with memory on CPU too
    max_grad_norm=1.0,
)

# Custom callback to store training loss
class LossCallback(TrainerCallback):
    def __init__(self):
        self.losses = []
        self.log_steps = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            self.losses.append(logs["loss"])
            self.log_steps.append(state.global_step)
            print(f"Step {state.global_step}: Loss = {logs['loss']:.4f}")

loss_callback = LossCallback()

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)

# Start fine-tuning
print("Starting fine-tuning...")
print("WARNING: Training on CPU will be extremely slow. The 0% progress bar might take a very long time to update.")
print("Please monitor your system's RAM and CPU usage.")
trainer.train()
print("Fine-tuning finished.")

# Save fine-tuned model
model.save_pretrained("./finetuned_codegen")
tokenizer.save_pretrained("./finetuned_codegen")
print("Model fine-tuned and saved to ./finetuned_codegen.")

# Plot training loss
if loss_callback.losses:
    plt.figure(figsize=(10, 6))
    plt.plot(loss_callback.log_steps, loss_callback.losses, label="Training Loss")
    plt.xlabel("Steps")
    plt.ylabel("Loss")
    plt.title("Fine-Tuning Loss Curve")
    plt.legend()
    plt.grid(True)
    plot_path = "./finetuned_codegen/loss_plot.png"
    plt.savefig(plot_path)
    print(f"Loss plot saved to {plot_path}")
    plt.show()
else:
    print("No training losses recorded to plot.")

print("Fine-tuning script finished execution.")
requirements.txt
ADDED
@@ -0,0 +1,7 @@
torch==2.0.1
transformers==4.31.0
datasets==2.14.4
accelerate==0.21.0
protobuf==4.23.4
matplotlib==3.7.2
flask==2.3.2
templates/index.html
ADDED
@@ -0,0 +1,77 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Text-to-Code Generator</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f4f4f9;
        }
        .container {
            max-width: 800px;
            margin: auto;
            background: white;
            padding: 20px;
            border-radius: 8px;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
        }
        h1 {
            text-align: center;
            color: #333;
        }
        textarea {
            width: 100%;
            height: 100px;
            margin-bottom: 10px;
            padding: 10px;
            border-radius: 4px;
            border: 1px solid #ccc;
        }
        button {
            padding: 10px 20px;
            background-color: #007bff;
            color: white;
            border: none;
            border-radius: 4px;
            cursor: pointer;
        }
        button:hover {
            background-color: #0056b3;
        }
        pre {
            background-color: #f8f9fa;
            padding: 10px;
            border-radius: 4px;
            overflow-x: auto;
        }
        .attribution {
            margin-top: 20px;
            font-size: 0.9em;
            color: #555;
            text-align: center;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>Text-to-Code Generator</h1>
        <form method="POST">
            <textarea name="prompt" placeholder="Enter a prompt (e.g., 'Write a Python function to calculate factorial')" required>{{ prompt }}</textarea>
            <button type="submit">Generate Code</button>
        </form>
        {% if generated_code %}
        <h2>Generated Code:</h2>
        <pre>{{ generated_code }}</pre>
        {% endif %}
        <div class="attribution">
            <p>Built with the fine-tuned <a href="https://huggingface.co/Salesforce/codegen-350M-multi" target="_blank">Salesforce/codegen-350M-multi</a> model.</p>
            <p>Developed by <a href="https://huggingface.co/remiai3" target="_blank">remiai3</a> for educational use. Licensed under Apache-2.0.</p>
            <p>Free resources for students to experiment with AI model development.</p>
        </div>
    </div>
</body>
</html>
test.py
ADDED
@@ -0,0 +1,34 @@
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load fine-tuned model and tokenizer
model_path = "./finetuned_codegen"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Load in full precision: half-precision inference is unreliable on CPU
model = AutoModelForCausalLM.from_pretrained(model_path)

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Move model to CPU
device = torch.device("cpu")
model.to(device)

# Test prompts (including dataset prompts; edit freely)
prompts = [
    "Write a Python program to print 'Hello, your name or any other thing!'"
]

# Generate code for each prompt
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    outputs = model.generate(
        **inputs,
        max_length=200,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-'*50}")