---
base_model:
- unsloth/Llama-3.2-1B-Instruct
library_name: transformers
language:
- en
license: cc0-1.0
tags:
- unsloth
---

# A !!!!!disclaimer uh. For now, the experimentation does not lead me anywhere due to the limited resources that I have, and I do not recommend downloading this model. Working on working on it.

PEFT Finnegan-tuned LLaMA 3.2-1B-Instruct on part of a Finnegans Wake dataset.

Space: https://huggingface.co/spaces/genaforvena/huivam_finnegans_spaceship

## Iteration 3:

Realized I was doing it all wrong, so this time I used https://huggingface.co/unsloth/Llama-3.2-1B-Instruct and the Colab available from there. Only the dataset changed.

My Colab is here: https://colab.research.google.com/drive/1JrqcU9idXXR3Wru5mw2e6Uh2TKJWwu7U?usp=sharing

The only difference: I created the dataset like below.
```
from unsloth.chat_templates import get_chat_template  # For chat template formatting
import json
import random
from transformers import AutoTokenizer
from datasets import Dataset, load_dataset

# Configuration
INPUT_FILE = "finnegans_30.txt"  # Path to your Finnegans Wake text file
OUTPUT_FILE = "finnegans_wake_dataset.jsonl"  # Local file to save the dataset
CHUNK_SIZE = 24  # Tokens per chunk

# Load the tokenizer (in the Unsloth Colab it comes from FastLanguageModel.from_pretrained)
tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct")

# Apply the chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",  # Use the LLaMA-3.1 chat template
)

# Load the text
with open(INPUT_FILE, "r", encoding="utf-8") as file:
    text = file.read()

# Tokenize the text
tokens = tokenizer.encode(text, truncation=False, add_special_tokens=False)

# Split tokens into chunks
chunks = [tokens[i:i + CHUNK_SIZE] for i in range(0, len(tokens), CHUNK_SIZE)]

# Prepare dataset in conversational format
dataset = []
for chunk in chunks:
    chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
    if len(chunk_text) < 2:  # random.sample below needs at least two positions
        continue

    # Split the chunk into three parts at two random split points
    split_points = sorted(random.sample(range(len(chunk_text)), 2))
    context = chunk_text[:split_points[0]]
    instruction = chunk_text[split_points[0]:split_points[1]]
    response = chunk_text[split_points[1]:]

    # Format as a conversation
    conversation = [
        {"role": "user", "content": f"### GIVEN THE CONTEXT: {context} ### INSTRUCTION: {instruction}"},
        {"role": "assistant", "content": response},
    ]

    # Add to dataset
    dataset.append({"conversations": conversation})

# Save dataset locally as a .jsonl file
with open(OUTPUT_FILE, "w", encoding="utf-8") as file:
    for item in dataset:
        json.dump(item, file)
        file.write("\n")

print(f"Dataset saved locally to {OUTPUT_FILE}")

# Render each conversation into a single training string via the chat template
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

# Rebuild as a Dataset and apply the formatting function
dataset = Dataset.from_dict({"conversations": [d["conversations"] for d in dataset]})
formatted_dataset = dataset.map(formatting_prompts_func, batched=True, remove_columns=["conversations"])

# Save the formatted dataset
formatted_dataset.to_json("formatted_finnegans_wake_dataset.jsonl")
print("Formatted dataset saved to formatted_finnegans_wake_dataset.jsonl")

# Load the formatted dataset using load_dataset
dataset = load_dataset("json", data_files="formatted_finnegans_wake_dataset.jsonl", split="train")
```
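
For context, a minimal sketch of how the resulting dataset plugs into the Unsloth training setup from the Colab above; the LoRA and trainer hyperparameters here are illustrative assumptions, not the Colab's exact values:

```
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from trl import SFTTrainer

# Load the base model plus its tokenizer via Unsloth (4-bit, fits a small GPU);
# this tokenizer replaces the standalone AutoTokenizer load used above
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=2048,  # assumed context budget
    load_in_4bit=True,
)

# Attach LoRA adapters for PEFT (r / alpha / target modules are the usual
# Unsloth notebook defaults, assumed here)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

# Train on the "text" column produced by formatting_prompts_func above
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=60,  # assumed short demo run
        learning_rate=2e-4,
        output_dir="outputs",
    ),
)
trainer.train()
```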

## Iteration 2 (Fail):

Dataset: same (forgot to save the config with the new dataset).