mashaelalbu committed · Commit 990142e · verified · 1 Parent(s): c4d7fee

Update app/utils.py

Files changed (1)
  1. app/utils.py +34 -71
app/utils.py CHANGED
@@ -1,4 +1,3 @@
-# utils.py (modified)
 import os
 import pandas as pd
 from transformers import AutoModel, AutoTokenizer
@@ -9,8 +8,6 @@ from transformers import BertTokenizer
 import nltk
 import requests
 import io
-from flask import current_app
-import base64
 
 logger = logging.getLogger(__name__)
 
@@ -108,129 +105,95 @@ class AllergyAnalyzer:
             nltk.data.find('tokenizers/punkt')
         except LookupError:
             nltk.download('punkt')
-        try:
-            nltk.data.find('tokenizers/punkt_tab')
-        except LookupError:
-            nltk.download('punkt_tab')
-
-        self.allergy_dict = self.load_allergy_data()
+
+        self.allergy_data = self.load_allergy_data()
         self.ocr_model = OCRModel()
 
     def load_allergy_data(self):
         """Load the allergy data from the Excel file"""
         try:
-            df = pd.read_excel(self.dataset_path)
+            df = pd.read_excel(self.dataset_path, header=None)
+
             allergy_dict = {}
 
             for index, row in df.iterrows():
-                allergy = row['Allergy']
-                ingredients = [str(ingredient).lower() for ingredient in row[1:] if pd.notna(ingredient)]
-                allergy_dict[allergy.lower()] = ingredients
+                allergy_name = row.iloc[0].strip().lower()
+                ingredients = [str(ingredient).strip().lower() for ingredient in row[1:] if pd.notna(ingredient)]
+                allergy_dict[allergy_name] = ingredients
 
             return allergy_dict
+
         except Exception as e:
             logger.error(f"Error loading allergy data: {str(e)}", exc_info=True)
-            return {}
 
     def tokenize_text(self, text):
         """Split the text into words"""
         tokens = nltk.word_tokenize(text)
         return [w.lower() for w in tokens if w.isalpha()]
 
-    def find_allergy_for_token(self, token):
-        """Find the allergies matching the token in the Excel file"""
-        results = []
-        for allergy, ingredients in self.allergy_dict.items():
+    def check_allergen_in_excel(self, token):
+        """Check whether the token exists in the Excel file"""
+        for allergy_name, ingredients in self.allergy_data.items():
             if token in ingredients:
-                results.append(allergy)
-        return results
+                return allergy_name  # Return the allergy name if the token is found
+        return None
 
     def check_allergy_risk(self, ingredient, api_key):
         """Query the Claude API about allergens"""
         prompt = f"""
-        You are a professional food safety expert specializing in allergen classification.
-        Please analyze the ingredient '{ingredient}' and determine which of the following major allergen categories it belongs to:
-        dairy, eggs, peanuts, soy, tree nuts, wheat, fish, shellfish, sesame.
-
-        Return only the allergen category name if found, or 'None' if not found.
-        Example responses: 'dairy', 'eggs', 'None', etc.
+        You are a professional food safety expert specializing in allergen classification and risk assessment. Analyze the ingredient '{ingredient}' and determine whether it poses any allergy risk. Respond with 'Yes' or 'No'.
        """
        url = "https://api.anthropic.com/v1/messages"
        headers = {
            "x-api-key": api_key,
            "content-type": "application/json",
-            "anthropic-version": "2023-06-01"
        }
 
        data = {
            "model": "claude-3-opus-20240229",
            "messages": [{"role": "user", "content": prompt}],
-            "max_tokens": 10
+            "max_tokens": 10,
        }
 
        try:
            response = requests.post(url, json=data, headers=headers)
-            json_response = response.json()
 
-            if "content" in json_response and isinstance(json_response["content"], list):
-                result = json_response["content"][0]["text"].strip().lower()
-                return result if result in self.allergy_dict else None
-            return None
+            response_json = response.json()
+
+            if "content" in response_json and isinstance(response_json["content"], list):
+                return response_json["content"][0]["text"].strip().lower() == 'yes'
+
        except Exception as e:
            logger.error(f"Error querying Claude API: {str(e)}")
-            return None
+
+        return False
 
     def analyze_image(self, image, claude_api_key=None):
-        """Analyze the image directly to detect allergens"""
+        """Analyze the image to detect allergens"""
        try:
            # Extract the text from the image
            extracted_text = self.ocr_model.process_image(image)
-            logger.info(f"Extracted text: {extracted_text}")
 
            # Convert the text into tokens
            tokens = self.tokenize_text(extracted_text)
-            detected_allergens = set()
-            database_matches = {}
-            claude_matches = {}
 
-            # Check each token against the database
+            detected_allergens = {}
+
            for token in tokens:
-                db_results = self.find_allergy_for_token(token)
+                allergy_from_excel = self.check_allergen_in_excel(token)
 
-                if db_results:
-                    for allergy in db_results:
-                        detected_allergens.add(allergy)
-                        if allergy not in database_matches:
-                            database_matches[allergy] = []
-                        database_matches[allergy].append(token)
-                elif claude_api_key:
-                    # If not found in the database, use the Claude API
-                    api_result = self.check_allergy_risk(token, claude_api_key)
-                    if api_result:
-                        detected_allergens.add(api_result)
-                        if api_result not in claude_matches:
-                            claude_matches[api_result] = []
-                        claude_matches[api_result].append(token)
+                if allergy_from_excel:
+                    detected_allergens[token] = allergy_from_excel
+
+                elif claude_api_key:  # If not found in the Excel file, call the Claude API
+                    is_allergen_risky = self.check_allergy_risk(token, claude_api_key)
+                    detected_allergens[token] = 'API Risk' if is_allergen_risky else 'Safe'
 
            return {
                "extracted_text": extracted_text,
-                "detected_allergens": list(detected_allergens),
-                "database_matches": database_matches,
-                "claude_matches": claude_matches,
-                "analyzed_tokens": tokens
+                "detected_allergens": detected_allergens,
+                "analyzed_tokens": tokens,
            }
-
+
        except Exception as e:
            logger.error(f"Error analyzing image: {str(e)}", exc_info=True)
-            return {
-                "error": str(e),
-                "detected_allergens": [],
-                "database_matches": {},
-                "claude_matches": {},
-                "analyzed_tokens": []
-            }
-
-    def get_allergen_list(self):
-        """Return the list of known allergens"""
-        return list(self.allergy_dict.keys())