ciyidogan committed on
Commit
c74c7e0
·
verified ·
1 Parent(s): 2c98904

Update intent_utils.py

Browse files
Files changed (1) hide show
  1. intent_utils.py +0 -110
intent_utils.py CHANGED
@@ -1,124 +1,14 @@
1
- import os
2
- import torch
3
- import json
4
- import shutil
5
  import re
6
- import traceback
7
- from datasets import Dataset
8
- from transformers import (
9
- AutoTokenizer,
10
- AutoModelForSequenceClassification,
11
- Trainer,
12
- TrainingArguments,
13
- default_data_collator,
14
- AutoConfig,
15
- )
16
- from log import log
17
- from core import llm_models
18
-
19
async def detect_intent(text, project_name):
    """Classify ``text`` with the intent model loaded for ``project_name``.

    Args:
        text: User utterance to classify.
        project_name: Key used to look up the project's entry in ``llm_models``.

    Returns:
        A ``(detected_intent, confidence)`` tuple: the label name whose id has
        the highest logit, and the largest softmax probability of the logits.

    Raises:
        Exception: If nothing is registered for ``project_name`` or the
            registered entry has no intent model.
        IndexError: If the predicted class id has no name in the label map.
    """
    llm_model_instance = llm_models.get(project_name)
    if not llm_model_instance or not llm_model_instance.intent_model:
        raise Exception(f"'{project_name}' için intent modeli yüklenmemiş.")

    tokenizer = llm_model_instance.intent_tokenizer
    model = llm_model_instance.intent_model
    label2id = llm_model_instance.intent_label2id

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Pure inference: disable autograd so no graph is built or retained
    # for every incoming request.
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_id = logits.argmax(dim=-1).item()
    confidence = logits.softmax(dim=-1).max().item()

    # label2id maps name -> id, so scan for the name owning the predicted id.
    detected_intent = next(
        (name for name, idx in label2id.items() if idx == predicted_id),
        None,
    )
    if detected_intent is None:
        # Same exception type the bare ``[...][0]`` lookup raised, but with a
        # message that says what went wrong.
        raise IndexError(f"predicted id {predicted_id} not found in label2id")

    return detected_intent, confidence
36
-
37
def background_training(project_name, intents, model_id, output_path, confidence_threshold):
    """Fine-tune a sequence classifier on ``intents`` and save it to disk.

    The function builds a labelled dataset from the intent examples, trains
    ``model_id`` for three epochs, logs a per-intent accuracy report measured
    on the training examples, and then writes the model, tokenizer and
    ``label2id.json`` into ``output_path``. Any exception is logged and its
    traceback printed; nothing is re-raised.

    Args:
        project_name: Project name, used only in log messages.
        intents: Iterable of dicts, each with a ``"name"`` and a list of
            ``"examples"`` strings.
        model_id: Identifier passed to the ``from_pretrained`` loaders.
        output_path: Directory for the trained artefacts. It is deleted and
            recreated if it already exists.
        confidence_threshold: Per-intent accuracy below which a warning is
            logged (a warning is also logged for intents with fewer than
            five examples).
    """
    try:
        log(f"🔧 Intent eğitimi başlatıldı (proje: {project_name})")
        texts, labels, label2id = [], [], {}
        for idx, intent in enumerate(intents):
            label2id[intent["name"]] = idx
            for ex in intent["examples"]:
                texts.append(ex)
                labels.append(idx)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        config = AutoConfig.from_pretrained(model_id)
        config.problem_type = "single_label_classification"
        config.num_labels = len(label2id)
        model = AutoModelForSequenceClassification.from_pretrained(model_id, config=config)

        # Every example is padded to the same fixed length so the rows can be
        # stacked into tensors without a padding collator.
        tokenized_data = {"input_ids": [], "attention_mask": [], "label": []}
        for example_text, example_label in zip(texts, labels):
            out = tokenizer(example_text, truncation=True, padding="max_length", max_length=128)
            tokenized_data["input_ids"].append(out["input_ids"])
            tokenized_data["attention_mask"].append(out["attention_mask"])
            tokenized_data["label"].append(example_label)

        tokenized = Dataset.from_dict(tokenized_data)
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

        # Start from a clean output directory.
        if os.path.exists(output_path):
            shutil.rmtree(output_path)
        os.makedirs(output_path, exist_ok=True)

        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_path,
                per_device_train_batch_size=4,
                num_train_epochs=3,
                logging_steps=10,
                save_strategy="no",
                report_to=[],
            ),
            train_dataset=tokenized,
            data_collator=default_data_collator,
        )
        trainer.train()

        log("🔧 Başarı raporu üretiliyor...")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # Switch off dropout; otherwise the report below measures a randomly
        # perturbed model rather than the one that will be saved.
        model.eval()

        input_ids_tensor = torch.tensor(tokenized_data["input_ids"])
        attention_mask_tensor = torch.tensor(tokenized_data["attention_mask"])

        # Evaluate in small batches so a large example set does not have to
        # fit through the model in a single forward pass.
        eval_batch_size = 32
        predictions = []
        with torch.no_grad():
            for start in range(0, len(labels), eval_batch_size):
                stop = start + eval_batch_size
                outputs = model(
                    input_ids=input_ids_tensor[start:stop].to(device),
                    attention_mask=attention_mask_tensor[start:stop].to(device),
                )
                predictions.extend(outputs.logits.argmax(dim=-1).tolist())

        # Invert the label map once instead of searching it for every example.
        id2label = {idx: name for name, idx in label2id.items()}
        counts, correct = {}, {}
        for pred, actual in zip(predictions, labels):
            intent_name = id2label[actual]
            counts[intent_name] = counts.get(intent_name, 0) + 1
            if pred == actual:
                correct[intent_name] = correct.get(intent_name, 0) + 1
        for intent_name, total in counts.items():
            accuracy = correct.get(intent_name, 0) / total
            log(f"📊 Intent '{intent_name}' doğruluk: {accuracy:.2f} — {total} örnek")
            if accuracy < confidence_threshold or total < 5:
                log(f"⚠️ Yetersiz performanslı intent: '{intent_name}' — Doğruluk: {accuracy:.2f}, Örnek: {total}")

        # Persist the trained model and tokenizer to disk.
        model.save_pretrained(output_path)
        tokenizer.save_pretrained(output_path)
        with open(os.path.join(output_path, "label2id.json"), "w", encoding="utf-8") as f:
            json.dump(label2id, f)

        log(f"✅ Intent eğitimi tamamlandı ve '{project_name}' için model disk üzerinde hazır.")

    except Exception as e:
        # Best-effort background job: report the failure instead of raising.
        log(f"❌ Intent eğitimi hatası: {e}")
        traceback.print_exc()
110
 
111
  def extract_parameters(variables_list, user_input):
112
  extracted_params = []
113
  for pattern in variables_list:
114
- # Örneğin: from_location:{Ankara} to_location:{İstanbul}
115
  regex = re.sub(r"(\w+):\{(.+?)\}", r"(?P<\1>.+?)", pattern)
116
  match = re.match(regex, user_input)
117
  if match:
118
  extracted_params = [{"key": k, "value": v} for k, v in match.groupdict().items()]
119
  break
120
 
121
- # Ek özel basit yakalama: iki şehir birden yazılırsa → sırayla atama
122
  if not extracted_params:
123
  city_pattern = r"(\bAnkara\b|\bİstanbul\b|\bİzmir\b)"
124
  cities = re.findall(city_pattern, user_input)
 
 
 
 
 
1
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  def extract_parameters(variables_list, user_input):
4
  extracted_params = []
5
  for pattern in variables_list:
 
6
  regex = re.sub(r"(\w+):\{(.+?)\}", r"(?P<\1>.+?)", pattern)
7
  match = re.match(regex, user_input)
8
  if match:
9
  extracted_params = [{"key": k, "value": v} for k, v in match.groupdict().items()]
10
  break
11
 
 
12
  if not extracted_params:
13
  city_pattern = r"(\bAnkara\b|\bİstanbul\b|\bİzmir\b)"
14
  cities = re.findall(city_pattern, user_input)