Merge branch 'refs/heads/main' into aysu
# Conflicts:
# src/deepeval/base_task.py
# src/deepeval/reading_comp_mc.py
- src/deepeval/base_task.py +19 -13
- src/deepeval/commonsense_reasoning_task.py +6 -6
- src/deepeval/complex_reasoning.py +5 -5
- src/deepeval/deepeval_task_manager.py +7 -0
- src/deepeval/nli.py +6 -6
- src/deepeval/reading_comp_mc.py +7 -7
- src/deepeval/sentiment_analysis_task.py +1 -1
- src/deepeval/summarization_task.py +4 -4
- src/deepeval/turkish_general_knowledge_task.py +5 -5
- svc/router.py +14 -10
src/deepeval/base_task.py
CHANGED
@@ -6,6 +6,7 @@ import openai
 from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, LogitsProcessor
 import torch
 from typing import List
+from datetime import datetime
 load_dotenv()
 HF_TOKEN=os.getenv("HF_TOKEN")
 OPENAI_KEY = os.getenv("OPENAI_API_KEY")
@@ -16,7 +17,7 @@ class BaseTask(ABC):
     def __init__(self, dataset_repo, model_name):
         self.dataset_repo = dataset_repo
         self.dataset = self.load_dataset_from_hf()
-        self.device = "
+        self.device = "auto" if torch.cuda.is_available() else "cpu"
         self.model, self.tokenizer = self.get_cached_model(model_name, self.device)
         openai.api_key = OPENAI_KEY

@@ -32,12 +33,15 @@ class BaseTask(ABC):
     def load_model(model_name: str, device):
         """Loads model and tokenizer once and caches it."""
         print(f"Loading model: {model_name}")
+        start_time = datetime.now()
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             torch_dtype=torch.float16,
             device_map=device,
             token=HF_TOKEN,  # Replace with actual token
         )
+        end_time = datetime.now()
+        print(f"Model loaded in {(end_time - start_time).seconds} seconds.")
         print("Model loaded.")
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         return model, tokenizer
@@ -49,8 +53,8 @@ class BaseTask(ABC):
         self.tokenizer.pad_token = self.tokenizer.eos_token  # Use EOS token as PAD token

         inputs = self.tokenizer(msg, return_tensors="pt", padding=True, truncation=True)
-        input_ids = inputs.input_ids
-        attention_mask = inputs.attention_mask
+        input_ids = inputs.input_ids
+        attention_mask = inputs.attention_mask

         if self.model.config.pad_token_id is None:
             self.model.config.pad_token_id = self.tokenizer.eos_token_id
@@ -94,16 +98,16 @@ class BaseTask(ABC):
             {"role": "user", "content": f"{msg}"},
         ]
         formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-        print(formatted_chat)
+        #print(formatted_chat)
         inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
-        input_ids = inputs.input_ids
-        attention_mask = inputs.attention_mask
+        input_ids = inputs.input_ids
+        attention_mask = inputs.attention_mask

         # Generate the sequence of letters starting from 'A'
         letters = [chr(ord('A') + i) for i in range(len(choices))]  # Create option letters A, B, C, D, E, ...
         encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
         flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist]  # Flatten the list
-        print(flattened_encoded_choices)
+        #print(flattened_encoded_choices)

         allowed_tokens = flattened_encoded_choices
         allowed_tokens += self.get_chat_template_tokens()  # Get the special chat tokens
@@ -154,8 +158,8 @@ class BaseTask(ABC):
         )

         inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
-        input_ids = inputs.input_ids
-        attention_mask = inputs.attention_mask
+        input_ids = inputs.input_ids
+        attention_mask = inputs.attention_mask

         output = self.model.generate(
             input_ids,
@@ -186,13 +190,15 @@ class BaseTask(ABC):
         :return: Dataset
         """
         print("Loading dataset from Hugging Face.")
+        start_time = datetime.now()
         dataset= load_dataset(self.dataset_repo, token=HF_TOKEN, split="train")
         print("Dataset loaded.")

-        # Load
-
-
-
+        # Load 50 from each dataset
+        if len(dataset) > 50:
+            dataset = dataset.shuffle(seed=42).select(range(50))
+        end_time = datetime.now()
+        print(f"Dataset loaded in {(end_time - start_time).seconds} seconds.")
         return dataset

    @abstractmethod
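The hunks above feed flattened_encoded_choices into allowed_tokens so that generation is restricted to the option letters. For reference only, a minimal sketch of how such a restriction can be enforced with a custom LogitsProcessor; the class name and wiring below are illustrative assumptions, not the repo's actual implementation:

import torch
from transformers import LogitsProcessor, LogitsProcessorList

class RestrictToTokens(LogitsProcessor):
    # Illustrative only: mask every logit except an allowed set of token ids.
    def __init__(self, allowed_token_ids):
        self.allowed_token_ids = allowed_token_ids

    def __call__(self, input_ids, scores):
        masked = torch.full_like(scores, float("-inf"))
        masked[:, self.allowed_token_ids] = scores[:, self.allowed_token_ids]
        return masked

# Hypothetical usage with the names from the hunks above:
# processors = LogitsProcessorList([RestrictToTokens(allowed_tokens)])
# output = self.model.generate(input_ids, attention_mask=attention_mask,
#                              logits_processor=processors, max_new_tokens=2)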
src/deepeval/commonsense_reasoning_task.py
CHANGED
@@ -32,9 +32,9 @@ class CommonsenseReasoningTask(BaseTask):
             context = row["context"]

             # Prints for debugging
-            print(f"Choices: {choices}")
-            print("Type of choices:", type(choices))
-            print("Type of answer:", type(answer))
+            # print(f"Choices: {choices}")
+            # print("Type of choices:", type(choices))
+            # print("Type of answer:", type(answer))

             # Get answer index (starting from 0)
             if type(answer) == int:
@@ -62,9 +62,9 @@ class CommonsenseReasoningTask(BaseTask):
             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

             # Print answers
-            print(f"Correct Answer: {correct_answer_letter}")
-            print(f"Model Answer: {model_answer}")
-            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+            # print(f"Correct Answer: {correct_answer_letter}")
+            # print(f"Model Answer: {model_answer}")
+            # print(f"Model Answer Cleaned: {model_answer_cleaned}")

             # Check if correct based on metric
             if correct_answer_letter == model_answer_cleaned:
|
src/deepeval/complex_reasoning.py
CHANGED
@@ -33,8 +33,8 @@ class ComplexReasoningTask(BaseTask):
|
|
33 |
correct_answers.append(correct_answer_letter)
|
34 |
|
35 |
# Prints for debugging
|
36 |
-
print(f"Choices: {choices}")
|
37 |
-
print("Type of choices:", type(choices))
|
38 |
|
39 |
|
40 |
# Construct the prompt/message
|
@@ -50,9 +50,9 @@ class ComplexReasoningTask(BaseTask):
|
|
50 |
if correct_answer_letter == model_answer_cleaned:
|
51 |
true += 1
|
52 |
# Print answers
|
53 |
-
print(f"Correct Answer: {correct_answer_letter}")
|
54 |
-
print(f"Model Answer: {model_answer}")
|
55 |
-
print(f"Model Answer Cleaned: {model_answer_cleaned}")
|
56 |
|
57 |
print("Answers:", correct_answers)
|
58 |
print("Results:", responses)
|
|
|
33 |
correct_answers.append(correct_answer_letter)
|
34 |
|
35 |
# Prints for debugging
|
36 |
+
# print(f"Choices: {choices}")
|
37 |
+
# print("Type of choices:", type(choices))
|
38 |
|
39 |
|
40 |
# Construct the prompt/message
|
|
|
50 |
if correct_answer_letter == model_answer_cleaned:
|
51 |
true += 1
|
52 |
# Print answers
|
53 |
+
# print(f"Correct Answer: {correct_answer_letter}")
|
54 |
+
# print(f"Model Answer: {model_answer}")
|
55 |
+
# print(f"Model Answer Cleaned: {model_answer_cleaned}")
|
56 |
|
57 |
print("Answers:", correct_answers)
|
58 |
print("Results:", responses)
|
src/deepeval/deepeval_task_manager.py
CHANGED
@@ -22,6 +22,7 @@ from src.deepeval.sts import STSTask
 from src.deepeval.mmlu import MMLUTask
 from src.deepeval.bias import BiasTask
 from typing import List
+from datetime import datetime
 load_dotenv()
 HF_TOKEN=os.getenv("HF_TOKEN")

@@ -73,15 +74,21 @@ class DeepEvalTaskManager:
     def run_tasks(self):
         """Execute validated tasks in order."""
         results = {}
+        total_start_time = datetime.now()
         for task_name, task_method in self.tasks_to_run.items():
             try:
+                start_time = datetime.now()
                 print("Running task: ", task_name)
                 task_enum = getattr(Task, task_name)
                 task_value = task_enum.value
                 results[task_value] = task_method() # Call the stored method reference
+                end_time = datetime.now()
+                print(f"Task {task_name} completed in {(end_time - start_time).seconds} seconds.")
             except Exception as e:
                 print(f"Error At Task: {task_name} - {e}")
                 continue
+        total_end_time = datetime.now()
+        print(f"All tasks completed in {(total_end_time - total_start_time).seconds} seconds.")
         print("All tasks completed.")
         return results

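The same start/end datetime.now() pattern now appears in both base_task.py and deepeval_task_manager.py. If it keeps spreading, it could be pulled into a small helper; a hedged sketch, where the timed name is hypothetical and not part of the repo:

from datetime import datetime

def timed(label, fn):
    # Run fn(), report elapsed wall-clock seconds, and return its result.
    start = datetime.now()
    result = fn()
    print(f"{label} completed in {(datetime.now() - start).seconds} seconds.")
    return result

# e.g. results[task_value] = timed(task_name, task_method)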
src/deepeval/nli.py
CHANGED
@@ -36,9 +36,9 @@ class NLITask(BaseTask):


             # Prints for debugging
-            print(f"Choices: {choices}")
-            print("Type of choices:", type(choices))
-            print("Label:", label)
+            # print(f"Choices: {choices}")
+            # print("Type of choices:", type(choices))
+            # print("Label:", label)

             # Construct the prompt/message
             instruction = ""
@@ -53,9 +53,9 @@ class NLITask(BaseTask):
             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

             # Print answers
-            print(f"Correct Answer: {correct_answer_letter}")
-            print(f"Model Answer: {model_answer}")
-            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+            # print(f"Correct Answer: {correct_answer_letter}")
+            # print(f"Model Answer: {model_answer}")
+            # print(f"Model Answer Cleaned: {model_answer_cleaned}")

             # Check if correct based on metric
             if correct_answer_letter == model_answer_cleaned:
src/deepeval/reading_comp_mc.py
CHANGED
@@ -32,9 +32,9 @@ class ReadingComprehensionMCTask(BaseTask):
             question_about_the_text = row["question_about_the_text"]

             # Prints for debugging
-            print(f"Choices: {choices}")
-            print("Type of choices:", type(choices))
-            print("Type of answer:", type(answer))
+            # print(f"Choices: {choices}")
+            # print("Type of choices:", type(choices))
+            # print("Type of answer:", type(answer))

             # Get answer index (starting from 0)
             if type(answer) == int:
@@ -45,6 +45,7 @@ class ReadingComprehensionMCTask(BaseTask):
                 answer_index = answer_index - 1 # Because the answer is 1-indexed
             correct_answer_letter = chr(65 + answer_index)

+
             # Construct the prompt/message
             instruction = ""
             prompt = f"Paragraf:\n{text}\nSoru:{question_about_the_text}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
@@ -56,10 +57,9 @@ class ReadingComprehensionMCTask(BaseTask):
             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')

             # Print answers
-            print(f"Correct Answer: {correct_answer_letter}")
-            print(f"Model Answer: {model_answer}")
-            print(f"Model Answer Cleaned: {model_answer_cleaned}")
-            print(f"Result: {correct_answer_letter == model_answer_cleaned}")
+            # print(f"Correct Answer: {correct_answer_letter}")
+            # print(f"Model Answer: {model_answer}")
+            # print(f"Model Answer Cleaned: {model_answer_cleaned}")

             # Check if correct based on metric
             if correct_answer_letter == model_answer_cleaned:
src/deepeval/sentiment_analysis_task.py
CHANGED
@@ -23,7 +23,7 @@ class SentimentAnalysisTask(BaseTask):
             prompt = f"Verilen metin hangi duyguyu ifade ediyor? {sentence}\n {formatted_choices}"
             messages = prompt
             answer = self.generate_response_mcqa_multi_token(messages, choices=choices)
-            print("Answer:", answer)
+            #print("Answer:", answer)
             responses.append(answer)
             correct_answer_letter = "A" if row["sentiment"] == "positive" else "B" if row["sentiment"] == "negative" else "C" if row["sentiment"] == "neutral" else None
             model_answer_cleaned = answer.strip().replace('\n', '').replace(' ', '').upper()
src/deepeval/summarization_task.py
CHANGED
@@ -23,8 +23,8 @@ class SummarizationTask(BaseTask):
             )

             generated_summary = self.generate_response(prompt, max_new_tokens=200)
-            print(f"Text: {text_data}\n")
-            print(f"Summary: {generated_summary}\n")
+            # print(f"Text: {text_data}\n")
+            # print(f"Summary: {generated_summary}\n")
             test_case = LLMTestCase(input=text_data, actual_output=generated_summary)

             metric = SummarizationMetric(
@@ -33,8 +33,8 @@ class SummarizationTask(BaseTask):
             )
             metric.measure(test_case)

-            print(f"Reason: {metric.reason}")
-            print(f"Score Breakdown: {metric.score_breakdown}")
+            # print(f"Reason: {metric.reason}")
+            # print(f"Score Breakdown: {metric.score_breakdown}")
             results.append({
                 "index": i,
                 "score": metric.score,
src/deepeval/turkish_general_knowledge_task.py
CHANGED
@@ -24,8 +24,8 @@ class TurkishGeneralKnowledgeTask(BaseTask):
             answer_index = row["answer"] # Assuming it's zero-based index
             difficulty = row["difficulty"]

-            print(f"Choices: {choices}")
-            print("Type of choices:", type(choices))
+            # print(f"Choices: {choices}")
+            # print("Type of choices:", type(choices))
             # Categorize difficulty
             if difficulty <= 3:
                 category = 'easy'
@@ -44,15 +44,15 @@ class TurkishGeneralKnowledgeTask(BaseTask):
             #"""
             model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
             responses.append(model_answer)
-            print(f"Correct Answer: {choices[answer_index]}")
-            print(f"Model Answer: {model_answer}")
+            # print(f"Correct Answer: {choices[answer_index]}")
+            # print(f"Model Answer: {model_answer}")

             #TODO: Make the cleaning in the mcqa function
             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

             # Check if the answer is correct
             correct_answer_letter = chr(65 + answer_index)
-            print("Correct Answer Letter:", correct_answer_letter)
+            # print("Correct Answer Letter:", correct_answer_letter)

             if correct_answer_letter == model_answer_cleaned:
                 true += 1
svc/router.py
CHANGED
@@ -53,6 +53,7 @@ async def deep_eval_status():
 @router.get("/deepeval/hardware")
 def hardware_status():
     info = get_gpu_tier()
+    print("Hardware Response:", info)
     return info

 @router.post("/chat", response_model=TaskResponse)
@@ -155,17 +156,20 @@ def get_gpu_tier():
     if not torch.cuda.is_available():
         return {"gpu": "CPU", "tier": "cpu"}

-
+    device_count = torch.cuda.device_count()
+    gpu_names = [torch.cuda.get_device_name(i).lower() for i in range(device_count)]

-    #
-
-
+    # Count how many of each GPU type we care about
+    l4_count = sum("l4" in name and "l40s" not in name for name in gpu_names)
+    l40s_count = sum("l40s" in name for name in gpu_names)
+
+    if l4_count == device_count:
+        return {"gpu": "NVIDIA L4", "tier": f"l4x{l4_count}"}
+    elif l40s_count == device_count:
+        return {"gpu": "NVIDIA L40S", "tier": f"l40sx{l40s_count}"}
+    elif "t4" in gpu_names[0]:
         return {"gpu": "Tesla T4", "tier": "t4-medium"}
-    elif "
-        return {"gpu": "NVIDIA L4", "tier": "l4x1"}
-    elif "l40s" in gpu_name:
-        return {"gpu": "NVIDIA L40S", "tier": "l40sx1"}
-    elif "a10g" in gpu_name:
+    elif "a10g" in gpu_names[0]:
         return {"gpu": "NVIDIA A10G", "tier": "a10g"}
     else:
-        return {"gpu":
+        return {"gpu": gpu_names[0], "tier": "unknown"}
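Pieced together from the added lines, the new GPU detection reads roughly as follows; lines the diff truncates on the old side are omitted, so surrounding context may differ slightly:

import torch

def get_gpu_tier():
    if not torch.cuda.is_available():
        return {"gpu": "CPU", "tier": "cpu"}

    device_count = torch.cuda.device_count()
    gpu_names = [torch.cuda.get_device_name(i).lower() for i in range(device_count)]

    # Count how many of each GPU type we care about
    l4_count = sum("l4" in name and "l40s" not in name for name in gpu_names)
    l40s_count = sum("l40s" in name for name in gpu_names)

    if l4_count == device_count:
        return {"gpu": "NVIDIA L4", "tier": f"l4x{l4_count}"}
    elif l40s_count == device_count:
        return {"gpu": "NVIDIA L40S", "tier": f"l40sx{l40s_count}"}
    elif "t4" in gpu_names[0]:
        return {"gpu": "Tesla T4", "tier": "t4-medium"}
    elif "a10g" in gpu_names[0]:
        return {"gpu": "NVIDIA A10G", "tier": "a10g"}
    else:
        return {"gpu": gpu_names[0], "tier": "unknown"}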