Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -83,6 +83,22 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
83 |
model = AutoModelForCausalLM.from_pretrained(model_name)
|
84 |
|
85 |
# Load the OpenWebText dataset using streaming (No download required)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
dataset = load_dataset("Skylion007/openwebtext", split="train[:5%]") # Load 5% to avoid streaming issues
|
87 |
|
88 |
# Tokenization function
|
|
|
83 |
model = AutoModelForCausalLM.from_pretrained(model_name)
|
84 |
|
85 |
# Load the OpenWebText dataset using streaming (No download required)
|
86 |
+
|
87 |
+
# Custom Dataset (Predefined Q&A Pairs for Project Expo)
|
88 |
+
custom_data = [
|
89 |
+
{"prompt": "Who are you?", "response": "I am Eva, a virtual voice assistant."},
|
90 |
+
{"prompt": "What is your name?", "response": "I am Eva, how can I help you?"},
|
91 |
+
{"prompt": "What can you do?", "response": "I can assist with answering questions, searching the web, and much more!"},
|
92 |
+
{"prompt": "Who invented the computer?", "response": "Charles Babbage is known as the father of the computer."},
|
93 |
+
{"prompt": "Tell me a joke.", "response": "Why don’t scientists trust atoms? Because they make up everything!"},
|
94 |
+
{"prompt": "Who is the Prime Minister of India?", "response": "The current Prime Minister of India is Narendra Modi."},
|
95 |
+
{"prompt": "Who created you?", "response": "I was created by an expert team specializing in AI fine-tuning and web development."}, {"prompt": "Can you introduce yourself?", "response": "I am Eva, your AI assistant, designed to assist and provide information."}
|
96 |
+
]
|
97 |
+
|
98 |
+
# Convert custom dataset to Hugging Face Dataset
|
99 |
+
dataset_custom = load_dataset("json", data_files={"train": custom_data})
|
100 |
+
|
101 |
+
# Merge with OpenWebText dataset
|
102 |
dataset = load_dataset("Skylion007/openwebtext", split="train[:5%]") # Load 5% to avoid streaming issues
|
103 |
|
104 |
# Tokenization function
|