File size: 3,353 Bytes
71c0be4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import os
import re

# unsloth is kept ahead of transformers/peft, as in the original import order.
import unsloth
import fitz  # PyMuPDF for PDF extraction
import gradio as gr
from datasets import Dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import BitsAndBytesConfig
from transformers import pipeline
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages joined by newlines, with leading and
        trailing whitespace removed.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text.strip()
def preprocess_text(text):
    """Collapse each run of whitespace to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
# Load the training corpus: raw PDF text first, then whitespace-normalized.
pdf_text = extract_text_from_pdf("new-american-standard-bible.pdf")
clean_text = preprocess_text(pdf_text)

# Read the Hugging Face token from environment variables.
hf_token = os.getenv("access_token")
# An empty value is as unusable as a missing one, so reject both up front
# with the same actionable message instead of handing it to login().
if not hf_token:
    raise ValueError("'access_token' is not set. Add it as a secret variable in Hugging Face Spaces.")

# Log in to Hugging Face
login(token=hf_token)
# Alternative base checkpoints: "meta-llama/Llama-2-7b-hf" or
# "meta-llama/Llama-2-7b-chat-hf".
model_name = "unsloth/llama-2-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set a padding token manually: reuse EOS as PAD.
# (Alternative: tokenizer.add_special_tokens({'pad_token': '[PAD]'}).)
tokenizer.pad_token = tokenizer.eos_token

# Create dataset.
# Tokenization truncates every example to 512 tokens, so a single example
# holding the whole document would train on its first 512 tokens only.
# Split the text into fixed-size word windows so all of it reaches the model.
# NOTE(review): 256 words per window is a heuristic meant to stay under the
# 512-token limit for English text; tune it against real token counts.
WORDS_PER_CHUNK = 256
words = clean_text.split()
chunks = [
    " ".join(words[start:start + WORDS_PER_CHUNK])
    for start in range(0, len(words), WORDS_PER_CHUNK)
]
# Keep the original single-example shape when no words were extracted.
data = {"text": chunks or [clean_text]}
dataset = Dataset.from_dict(data)
# Tokenization function
def tokenize_function(examples):
    """Tokenize text and attach causal-LM labels.

    Args:
        examples: Mapping with a "text" entry holding a string or a list of
            strings (the latter when called through a batched ``map``).

    Returns:
        The tokenizer output, padded/truncated to 512 tokens, with an added
        "labels" entry that mirrors "input_ids" except that padded positions
        are set to -100.
    """
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

    def mask_padding(ids, mask):
        # -100 is the label value the loss ignores. The attention mask is
        # used because PAD shares its id with EOS, so padding cannot be
        # identified by token id alone.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]
    if input_ids and isinstance(input_ids[0], list):
        # Batched call: one id list per example.
        tokens["labels"] = [
            mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single-example call: one flat id list.
        tokens["labels"] = mask_padding(input_ids, attention_mask)
    return tokens


tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Load LLaMA 2 model in 4-bit mode to save memory.
# The quantization settings go through BitsAndBytesConfig; passing
# load_in_4bit straight to from_pretrained is deprecated in transformers.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)
# Apply LoRA (efficient fine-tuning)
# The base model is loaded quantized, so run PEFT's k-bit preparation step
# before the adapters are attached.
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=8,  # Low-rank parameter
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Applies only to attention layers
    lora_dropout=0.05,
    # Declares the task so PEFT wraps the model with its causal-LM class.
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
# Training hyper-parameters.
# Evaluation stays disabled by relying on the default strategy ("no"). The
# explicit `evaluation_strategy` keyword was renamed to `eval_strategy` and is
# rejected by newer transformers releases, so it is omitted here to work on
# both sides of the rename.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=1,  # Reduce batch size for memory efficiency
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)
# Wire the model, training arguments, tokenized data and tokenizer together.
# NOTE(review): newer transformers releases prefer `processing_class` over
# the `tokenizer` keyword; confirm against the pinned version.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_datasets,
    "tokenizer": tokenizer,
}
trainer = Trainer(**trainer_kwargs)
def perform_training():
    """Run the fine-tuning loop on the module-level trainer."""
    trainer.train()
# Run the fine-tuning, then persist the model and the tokenizer side by side.
perform_training()
fine_tuned_dir = "./fine_tuned_llama2"
for artifact in (model, tokenizer):
    artifact.save_pretrained(fine_tuned_dir)

# CHATBOT START
# NOTE(review): the pipeline loads the model again from disk rather than
# reusing the in-memory one; confirm there is enough memory headroom.
chatbot = pipeline("text-generation", model=fine_tuned_dir)
def chatbot_response(prompt):
    """Generate text for *prompt* with the module-level pipeline.

    Args:
        prompt: User text passed to the text-generation pipeline.

    Returns:
        The "generated_text" field of the first generated sequence.
    """
    # max_new_tokens bounds only the continuation. max_length=100 also
    # counted the prompt's tokens, leaving long prompts little or no room
    # for a reply.
    result = chatbot(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)
    return result[0]["generated_text"]
# Expose chatbot_response through a text-in/text-out Gradio interface and
# start serving it. The launch line previously ended in a stray " |", which
# is not valid Python.
iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
iface.launch()