VenkateshRoshan committed
Commit a562c0d · 1 Parent(s): 45f8739

instance update

Files changed (3):
  1. app.py +4 -4
  2. src/deploy_sagemaker.py +3 -2
  3. src/infer.py +115 -20
app.py CHANGED
@@ -47,7 +47,7 @@ class CustomerSupportBot:
         print("Model and tokenizer loaded successfully.")
 
         # Move model to GPU if available
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = "cpu" #"cuda" if torch.cuda.is_available() else "cpu"
         self.model = self.model.to(self.device)
 
     def generate_response(self, message: str, max_length=100, temperature=0.7) -> str:
@@ -170,8 +170,8 @@ if __name__ == "__main__":
     demo = create_chat_interface()
     demo.launch(
         share=True,
-        # server_name="0.0.0.0", # Makes the server accessible from other machines
-        # server_port=7860, # Specify the port
+        server_name="0.0.0.0", # Makes the server accessible from other machines
+        server_port=7860, # Specify the port
         debug=True,
-        inline=False, server_port=6006
+        inline=False#, server_port=6006
     )
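
For reference, a minimal standalone sketch (not part of this commit) of what the updated launch() call does: bind to 0.0.0.0 so the app is reachable from outside the container, on Gradio's conventional port 7860 instead of the old inline/6006 setup. The echo() function and the gr.Interface below are placeholders standing in for the app's real create_chat_interface().

    import gradio as gr

    def echo(message: str) -> str:
        # Placeholder for the bot's real generate_response()
        return f"You said: {message}"

    demo = gr.Interface(fn=echo, inputs="text", outputs="text")
    demo.launch(
        share=True,
        server_name="0.0.0.0",  # listen on all interfaces, as in the change above
        server_port=7860,       # fixed, predictable port instead of the old 6006
        debug=True,
        inline=False,           # only matters when running inside a notebook
    )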
src/deploy_sagemaker.py CHANGED
@@ -31,14 +31,15 @@ def deploy_app(acc_id, region_name, role_arn, ecr_repo_name, endpoint_name="cust
     model = Model(
         image_uri=ecr_image,
         role=role_arn,
-        sagemaker_session=sagemaker_session
+        sagemaker_session=sagemaker_session,
+        entry_point="serve",
     )
 
     # Deploy model as a SageMaker endpoint
     logger.info(f"Starting deployment of Gradio app to SageMaker endpoint {endpoint_name}...")
     predictor = model.deploy(
         initial_instance_count=1,
-        instance_type="ml.g4dn.xlarge",
+        instance_type="ml.t3.large", #"ml.g4dn.xlarge",
         endpoint_name=endpoint_name
     )
     logger.info(f"Gradio app deployed successfully to endpoint: {endpoint_name}")
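
Side note: swapping the GPU instance (ml.g4dn.xlarge) for a CPU one (ml.t3.large) is consistent with the app.py and infer.py changes that pin self.device to "cpu". A rough sketch (not part of this commit) of calling the deployed endpoint follows; the endpoint name and the JSON payload shape are assumptions, since the request format depends on what the container's /invocations handler expects.

    import json
    import boto3

    runtime = boto3.client("sagemaker-runtime", region_name="us-east-1")  # example region

    response = runtime.invoke_endpoint(
        EndpointName="customer-support-bot",  # hypothetical; use the endpoint_name passed to deploy_app()
        ContentType="application/json",
        Body=json.dumps({"instruction": "How can I track my order?"}),  # assumed payload shape
    )
    print(json.loads(response["Body"].read()))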
src/infer.py CHANGED
@@ -1,41 +1,114 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
+import psutil
+import os
+import time
+from typing import Dict, Any
+import numpy as np
+
+class MemoryTracker:
+    @staticmethod
+    def get_memory_usage() -> Dict[str, float]:
+        """Get current memory usage statistics."""
+        process = psutil.Process(os.getpid())
+        memory_info = process.memory_info()
+
+        return {
+            'rss': memory_info.rss / (1024 * 1024),  # RSS in MB
+            'vms': memory_info.vms / (1024 * 1024),  # VMS in MB
+            'gpu': torch.cuda.memory_allocated() / (1024 * 1024) if torch.cuda.is_available() else 0  # GPU memory in MB
+        }
+
+    @staticmethod
+    def format_memory_stats(stats: Dict[str, float]) -> str:
+        """Format memory statistics into a readable string."""
+        return (f"RSS Memory: {stats['rss']:.2f} MB\n"
+                f"Virtual Memory: {stats['vms']:.2f} MB\n"
+                f"GPU Memory: {stats['gpu']:.2f} MB")
 
 class CustomerSupportBot:
     def __init__(self, model_path="models/customer_support_gpt"):
         """
-        Initialize the customer support bot with the fine-tuned model.
+        Initialize the customer support bot with the fine-tuned model and memory tracking.
 
         Args:
             model_path (str): Path to the saved model and tokenizer
         """
+        # Record initial memory state
+        self.initial_memory = MemoryTracker.get_memory_usage()
+
+        # Load tokenizer and track memory
         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+        self.post_tokenizer_memory = MemoryTracker.get_memory_usage()
+
+        # Load model and track memory
         self.model = AutoModelForCausalLM.from_pretrained(model_path)
+        self.post_model_memory = MemoryTracker.get_memory_usage()
 
         # Move model to GPU if available
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = "cpu"#"cuda" if torch.cuda.is_available() else "cpu"
         self.model = self.model.to(self.device)
+        self.post_device_memory = MemoryTracker.get_memory_usage()
+
+        # Calculate memory deltas
+        self.memory_deltas = {
+            'tokenizer_load': {k: self.post_tokenizer_memory[k] - self.initial_memory[k]
+                               for k in self.initial_memory},
+            'model_load': {k: self.post_model_memory[k] - self.post_tokenizer_memory[k]
+                           for k in self.initial_memory},
+            'device_transfer': {k: self.post_device_memory[k] - self.post_model_memory[k]
+                                for k in self.initial_memory}
+        }
+
+        # Initialize inference memory tracking
+        self.inference_memory_stats = []
+
+    def get_memory_report(self) -> str:
+        """Generate a comprehensive memory usage report."""
+        report = ["Memory Usage Report:"]
+
+        report.append("\nModel Loading Memory Changes:")
+        report.append("Tokenizer Loading:")
+        report.append(MemoryTracker.format_memory_stats(self.memory_deltas['tokenizer_load']))
+
+        report.append("\nModel Loading:")
+        report.append(MemoryTracker.format_memory_stats(self.memory_deltas['model_load']))
+
+        report.append("\nDevice Transfer:")
+        report.append(MemoryTracker.format_memory_stats(self.memory_deltas['device_transfer']))
+
+        if self.inference_memory_stats:
+            avg_inference_memory = {
+                k: np.mean([stats[k] for stats in self.inference_memory_stats])
+                for k in self.inference_memory_stats[0]
+            }
+            report.append("\nAverage Inference Memory Usage:")
+            report.append(MemoryTracker.format_memory_stats(avg_inference_memory))
+
+        return "\n".join(report)
 
     def generate_response(self, instruction, max_length=100, temperature=0.7):
         """
-        Generate a response for a given customer support instruction/query.
+        Generate a response for a given customer support instruction/query with memory tracking.
 
         Args:
             instruction (str): Customer's query or instruction
             max_length (int): Maximum length of the generated response
-            temperature (float): Controls randomness in generation (higher = more random)
+            temperature (float): Controls randomness in generation
 
         Returns:
-            str: Generated response
+            tuple: (Generated response, Memory usage statistics)
         """
-        # Format input text the same way as during training
-        input_text = f"Instruction: {instruction}\nResponse:"
+        # Record pre-inference memory
+        pre_inference_memory = MemoryTracker.get_memory_usage()
 
-        # Tokenize input
+        # Format and tokenize input
+        input_text = f"Instruction: {instruction}\nResponse:"
         inputs = self.tokenizer(input_text, return_tensors="pt")
         inputs = inputs.to(self.device)
 
-        # Generate response
+        # Generate response and track memory
+        start_time = time.time()
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
@@ -48,18 +121,32 @@ class CustomerSupportBot:
                 top_p=0.95,
                 top_k=50
             )
+        inference_time = time.time() - start_time
 
-        # Decode and format response
-        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Record post-inference memory
+        post_inference_memory = MemoryTracker.get_memory_usage()
+
+        # Calculate memory delta for this inference
+        inference_memory_delta = {
+            k: post_inference_memory[k] - pre_inference_memory[k]
+            for k in pre_inference_memory
+        }
+        self.inference_memory_stats.append(inference_memory_delta)
 
-        # Extract only the response part
+        # Decode response
+        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
         response = response.split("Response:")[-1].strip()
 
-        return response
+        return response, {
+            'memory_delta': inference_memory_delta,
+            'inference_time': inference_time
+        }
 
 def main():
     # Initialize the bot
+    print("Initializing bot and tracking memory usage...")
     bot = CustomerSupportBot()
+    print(bot.get_memory_report())
 
     # Example queries
     example_queries = [
@@ -68,22 +155,30 @@ def main():
         "I want to return a product.",
     ]
 
-    # Generate and print responses
-    print("Customer Support Bot Demo:\n")
+    # Generate and print responses with memory stats
+    print("\nCustomer Support Bot Demo:\n")
     for query in example_queries:
         print(f"Customer: {query}")
-        response = bot.generate_response(query)
-        print(f"Bot: {response}\n")
-
+        response, stats = bot.generate_response(query)
+        print(f"Bot: {response}")
+        print(f"Inference Memory Delta: {MemoryTracker.format_memory_stats(stats['memory_delta'])}")
+        print(f"Inference Time: {stats['inference_time']:.2f} seconds\n")
+
     # Interactive mode
     print("Enter your questions (type 'quit' to exit):")
    while True:
         query = input("\nYour question: ")
         if query.lower() == 'quit':
             break
-
-        response = bot.generate_response(query)
+
+        response, stats = bot.generate_response(query)
         print(f"Bot: {response}")
+        print(f"Inference Memory Delta: {MemoryTracker.format_memory_stats(stats['memory_delta'])}")
+        print(f"Inference Time: {stats['inference_time']:.2f} seconds")
+
+    # Print final memory report
+    print("\nFinal Memory Report:")
+    print(bot.get_memory_report())
 
 if __name__ == "__main__":
     main()
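
Since generate_response() now returns a (response, stats) tuple instead of a plain string, callers such as app.py need to unpack it. A small usage sketch (not part of this commit; the src.infer import path is an assumption about the repo layout):

    from src.infer import CustomerSupportBot, MemoryTracker  # import path assumed

    bot = CustomerSupportBot(model_path="models/customer_support_gpt")
    print(bot.get_memory_report())  # load-time RSS/VMS/GPU deltas

    response, stats = bot.generate_response("Where is my order?", max_length=80)
    print(response)
    print(MemoryTracker.format_memory_stats(stats["memory_delta"]))
    print(f"Inference took {stats['inference_time']:.2f}s")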