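"""Inference script for a fine-tuned customer support chatbot.

Loads the model and tokenizer from disk, tracks CPU/GPU memory usage during
loading and generation, and exposes a small demo plus an interactive CLI in main().
"""
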
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import psutil
import os
import time
from typing import Dict, Any
import numpy as np

class MemoryTracker:
    @staticmethod
    def get_memory_usage() -> Dict[str, float]:
        """Get current memory usage statistics."""
        process = psutil.Process(os.getpid())
        memory_info = process.memory_info()
        return {
            'rss': memory_info.rss / (1024 * 1024),  # RSS in MB
            'vms': memory_info.vms / (1024 * 1024),  # VMS in MB
            'gpu': torch.cuda.memory_allocated() / (1024 * 1024) if torch.cuda.is_available() else 0  # GPU memory in MB
        }

    @staticmethod
    def format_memory_stats(stats: Dict[str, float]) -> str:
        """Format memory statistics into a readable string."""
        return (f"RSS Memory: {stats['rss']:.2f} MB\n"
                f"Virtual Memory: {stats['vms']:.2f} MB\n"
                f"GPU Memory: {stats['gpu']:.2f} MB")

class CustomerSupportBot:
    def __init__(self, model_path="models/customer_support_gpt"):
        """
        Initialize the customer support bot with the fine-tuned model and memory tracking.

        Args:
            model_path (str): Path to the saved model and tokenizer
        """
        # Record initial memory state
        self.initial_memory = MemoryTracker.get_memory_usage()

        # Load tokenizer and track memory
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # GPT-style tokenizers often ship without a pad token; fall back to EOS so
        # generate() receives a valid pad_token_id.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.post_tokenizer_memory = MemoryTracker.get_memory_usage()

        # Load model and track memory
        self.model = AutoModelForCausalLM.from_pretrained(model_path)
        self.post_model_memory = MemoryTracker.get_memory_usage()

        # Run on CPU for now; change to "cuda" if torch.cuda.is_available() to use the GPU
        self.device = "cpu"
        self.model = self.model.to(self.device)
        self.post_device_memory = MemoryTracker.get_memory_usage()

        # Calculate memory deltas
        self.memory_deltas = {
            'tokenizer_load': {k: self.post_tokenizer_memory[k] - self.initial_memory[k]
                               for k in self.initial_memory},
            'model_load': {k: self.post_model_memory[k] - self.post_tokenizer_memory[k]
                           for k in self.initial_memory},
            'device_transfer': {k: self.post_device_memory[k] - self.post_model_memory[k]
                                for k in self.initial_memory}
        }

        # Initialize inference memory tracking
        self.inference_memory_stats = []

    def get_memory_report(self) -> str:
        """Generate a comprehensive memory usage report."""
        report = ["Memory Usage Report:"]
        report.append("\nModel Loading Memory Changes:")
        report.append("Tokenizer Loading:")
        report.append(MemoryTracker.format_memory_stats(self.memory_deltas['tokenizer_load']))
        report.append("\nModel Loading:")
        report.append(MemoryTracker.format_memory_stats(self.memory_deltas['model_load']))
        report.append("\nDevice Transfer:")
        report.append(MemoryTracker.format_memory_stats(self.memory_deltas['device_transfer']))

        if self.inference_memory_stats:
            avg_inference_memory = {
                k: np.mean([stats[k] for stats in self.inference_memory_stats])
                for k in self.inference_memory_stats[0]
            }
            report.append("\nAverage Inference Memory Usage:")
            report.append(MemoryTracker.format_memory_stats(avg_inference_memory))

        return "\n".join(report)

    def generate_response(self, instruction, max_length=100, temperature=0.7):
        """
        Generate a response for a given customer support instruction/query with memory tracking.

        Args:
            instruction (str): Customer's query or instruction
            max_length (int): Maximum total sequence length in tokens (prompt + generated response)
            temperature (float): Controls randomness in generation

        Returns:
            tuple: (Generated response, Memory usage statistics)
        """
        # Record pre-inference memory
        pre_inference_memory = MemoryTracker.get_memory_usage()

        # Format and tokenize input
        input_text = f"Instruction: {instruction}\nResponse:"
        inputs = self.tokenizer(input_text, return_tensors="pt")
        inputs = inputs.to(self.device)

        # Generate response and track memory
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=max_length,
                temperature=temperature,
                num_return_sequences=1,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                do_sample=True,
                top_p=0.95,
                top_k=50
            )
        inference_time = time.time() - start_time

        # Record post-inference memory
        post_inference_memory = MemoryTracker.get_memory_usage()

        # Calculate memory delta for this inference
        inference_memory_delta = {
            k: post_inference_memory[k] - pre_inference_memory[k]
            for k in pre_inference_memory
        }
        self.inference_memory_stats.append(inference_memory_delta)

        # Decode response and keep only the text after the "Response:" marker
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = response.split("Response:")[-1].strip()

        return response, {
            'memory_delta': inference_memory_delta,
            'inference_time': inference_time
        }
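
# Illustrative standalone usage (assumes a fine-tuned model exists at the default
# models/customer_support_gpt path):
#     bot = CustomerSupportBot()
#     reply, stats = bot.generate_response("Where is my order?", max_length=120)
#     print(reply, stats['inference_time'])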

def main():
    # Initialize the bot
    print("Initializing bot and tracking memory usage...")
    bot = CustomerSupportBot()
    print(bot.get_memory_report())

    # Example queries
    example_queries = [
        "How do I reset my password?",
        "What are your shipping policies?",
        "I want to return a product.",
    ]

    # Generate and print responses with memory stats
    print("\nCustomer Support Bot Demo:\n")
    for query in example_queries:
        print(f"Customer: {query}")
        response, stats = bot.generate_response(query)
        print(f"Bot: {response}")
        print(f"Inference Memory Delta: {MemoryTracker.format_memory_stats(stats['memory_delta'])}")
        print(f"Inference Time: {stats['inference_time']:.2f} seconds\n")

    # Interactive mode
    print("Enter your questions (type 'quit' to exit):")
    while True:
        query = input("\nYour question: ")
        if query.lower() == 'quit':
            break
        response, stats = bot.generate_response(query)
        print(f"Bot: {response}")
        print(f"Inference Memory Delta: {MemoryTracker.format_memory_stats(stats['memory_delta'])}")
        print(f"Inference Time: {stats['inference_time']:.2f} seconds")

    # Print final memory report
    print("\nFinal Memory Report:")
    print(bot.get_memory_report())


if __name__ == "__main__":
    main()