mashaelalbu committed on
Commit
55ed0b4
·
verified ·
1 Parent(s): b6b208f

Update app/utils.py

Browse files
Files changed (1) hide show
  1. app/utils.py +95 -37
app/utils.py CHANGED
@@ -94,7 +94,7 @@ class OCRModel:
94
 
95
  except Exception as e:
96
  logger.error(f"Error in OCR processing: {str(e)}", exc_info=True)
97
- if os.path.exists(temp_image_path):
98
  os.remove(temp_image_path)
99
  return f"Error processing image: {str(e)}"
100
 
@@ -111,93 +111,151 @@ class AllergyAnalyzer:
111
  nltk.download('punkt_tab')
112
 
113
  self.allergy_data = self.load_allergy_data()
 
 
114
  self.ocr_model = OCRModel()
115
 
116
  def load_allergy_data(self):
117
  """تحميل بيانات الحساسيات من ملف Excel"""
118
  try:
119
- df = pd.read_excel(self.dataset_path, header=None)
 
120
 
121
  allergy_dict = {}
122
 
123
  for index, row in df.iterrows():
124
- allergy_name = row.iloc[0].strip().lower()
125
- ingredients = [str(ingredient).strip().lower() for ingredient in row[1:] if pd.notna(ingredient)]
126
- allergy_dict[allergy_name] = ingredients
 
127
 
 
 
 
 
 
 
 
 
 
 
128
  return allergy_dict
129
 
130
  except Exception as e:
131
  logger.error(f"Error loading allergy data: {str(e)}", exc_info=True)
 
132
 
133
  def tokenize_text(self, text):
134
  """تقسيم النص إلى كلمات"""
135
- tokens = nltk.word_tokenize(text)
136
- return [w.lower() for w in tokens if w.isalpha()]
 
 
 
 
137
 
138
  def check_allergen_in_excel(self, token):
139
  """التحقق من وجود التوكن في ملف الإكسل"""
140
- for allergy_name, ingredients in self.allergy_data.items():
141
- if token in ingredients:
142
- return allergy_name # Return the allergy name if token is found
143
- return None
 
 
 
 
 
 
 
144
 
145
  def check_allergy_risk(self, ingredient, api_key):
146
  """الاستعلام من Claude API عن الحساسيات"""
147
- prompt = f"""
148
- You are a professional food safety expert specializing in allergen classification and risk assessment. Analyze the ingredient '{ingredient}' and determine whether it poses any allergy risk. Respond with 'Yes' or 'No'.
149
- """
150
- url = "https://api.anthropic.com/v1/messages"
151
- headers = {
152
- "x-api-key": api_key,
153
- "content-type": "application/json",
154
- }
155
-
156
- data = {
157
- "model": "claude-3-opus-20240229",
158
- "messages": [{"role": "user", "content": prompt}],
159
- "max_tokens": 10,
160
- }
161
-
162
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  response = requests.post(url, json=data, headers=headers)
 
164
 
165
  response_json = response.json()
166
 
167
  if "content" in response_json and isinstance(response_json["content"], list):
168
- return response_json["content"][0]["text"].strip().lower() == 'yes'
 
 
 
 
169
 
170
  except Exception as e:
171
  logger.error(f"Error querying Claude API: {str(e)}")
172
 
173
- return False
174
 
175
  def analyze_image(self, image, claude_api_key=None):
176
  """تحليل الصورة للكشف عن الحساسيات"""
177
  try:
 
 
 
178
  # استخراج النص من الصورة
179
  extracted_text = self.ocr_model.process_image(image)
 
 
 
 
180
 
181
  # تحويل النص إلى tokens
182
  tokens = self.tokenize_text(extracted_text)
 
 
183
 
184
- detected_allergens = {}
 
185
 
186
  for token in tokens:
187
- allergy_from_excel = self.check_allergen_in_excel(token)
188
-
189
- if allergy_from_excel:
190
- detected_allergens[token] = allergy_from_excel
191
-
192
- elif claude_api_key: # إذا لم يُوجد في ملف الإكسل، استدعِ Claude API
193
- is_allergen_risky = self.check_allergy_risk(token, claude_api_key)
194
- detected_allergens[token] = 'API Risk' if is_allergen_risky else 'Safe'
 
 
 
 
 
 
 
195
 
196
  return {
197
  "extracted_text": extracted_text,
198
  "detected_allergens": detected_allergens,
 
 
199
  "analyzed_tokens": tokens,
 
200
  }
201
 
202
  except Exception as e:
203
  logger.error(f"Error analyzing image: {str(e)}", exc_info=True)
 
 
 
 
 
94
 
95
  except Exception as e:
96
  logger.error(f"Error in OCR processing: {str(e)}", exc_info=True)
97
+ if 'temp_image_path' in locals() and os.path.exists(temp_image_path):
98
  os.remove(temp_image_path)
99
  return f"Error processing image: {str(e)}"
100
 
 
111
  nltk.download('punkt_tab')
112
 
113
  self.allergy_data = self.load_allergy_data()
114
+ if self.allergy_data is None:
115
+ raise ValueError("Failed to load allergy data from dataset")
116
  self.ocr_model = OCRModel()
117
 
118
  def load_allergy_data(self):
119
  """تحميل بيانات الحساسيات من ملف Excel"""
120
  try:
121
+ # قراءة ملف الإكسل مع تحديد أن الصف الأول هو العناوين
122
+ df = pd.read_excel(self.dataset_path, header=0)
123
 
124
  allergy_dict = {}
125
 
126
  for index, row in df.iterrows():
127
+ # الحصول على اسم الحساسية من العمود الأول
128
+ allergy_name = str(row.iloc[0]).strip().lower()
129
+ if not allergy_name:
130
+ continue
131
 
132
+ # الحصول على المكونات من الأعمدة التالية
133
+ ingredients = []
134
+ for col in range(1, len(row)):
135
+ ingredient = str(row.iloc[col]).strip().lower()
136
+ if ingredient and ingredient != 'nan':
137
+ ingredients.append(ingredient)
138
+
139
+ allergy_dict[allergy_name] = ingredients
140
+
141
+ logger.info(f"Successfully loaded allergy data with {len(allergy_dict)} categories")
142
  return allergy_dict
143
 
144
  except Exception as e:
145
  logger.error(f"Error loading allergy data: {str(e)}", exc_info=True)
146
+ return None
147
 
148
  def tokenize_text(self, text):
149
  """تقسيم النص إلى كلمات"""
150
+ try:
151
+ tokens = nltk.word_tokenize(text)
152
+ return [w.lower() for w in tokens if w.isalpha()]
153
+ except Exception as e:
154
+ logger.error(f"Error tokenizing text: {str(e)}")
155
+ return []
156
 
157
  def check_allergen_in_excel(self, token):
158
  """التحقق من وجود التوكن في ملف الإكسل"""
159
+ try:
160
+ if not self.allergy_data:
161
+ return None
162
+
163
+ for allergy_name, ingredients in self.allergy_data.items():
164
+ if token in ingredients:
165
+ return allergy_name
166
+ return None
167
+ except Exception as e:
168
+ logger.error(f"Error checking allergen in Excel: {str(e)}")
169
+ return None
170
 
171
  def check_allergy_risk(self, ingredient, api_key):
172
  """الاستعلام من Claude API عن الحساسيات"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  try:
174
+ prompt = f"""
175
+ You are a professional food safety expert. Analyze the ingredient '{ingredient}' and determine which of these allergen categories it belongs to:
176
+ dairy, eggs, peanuts, soy, tree nuts, wheat, fish, shellfish, sesame.
177
+ Respond only with the category name or 'None' if not found.
178
+ """
179
+ url = "https://api.anthropic.com/v1/messages"
180
+ headers = {
181
+ "x-api-key": api_key,
182
+ "content-type": "application/json",
183
+ "anthropic-version": "2023-06-01"
184
+ }
185
+
186
+ data = {
187
+ "model": "claude-3-opus-20240229",
188
+ "messages": [{"role": "user", "content": prompt}],
189
+ "max_tokens": 10
190
+ }
191
+
192
  response = requests.post(url, json=data, headers=headers)
193
+ response.raise_for_status()
194
 
195
  response_json = response.json()
196
 
197
  if "content" in response_json and isinstance(response_json["content"], list):
198
+ result = response_json["content"][0]["text"].strip().lower()
199
+ # التحقق من أن النتيجة هي واحدة من الحساسيات المعروفة
200
+ if result in self.allergy_data:
201
+ return result
202
+ return None
203
 
204
  except Exception as e:
205
  logger.error(f"Error querying Claude API: {str(e)}")
206
 
207
+ return None
208
 
209
  def analyze_image(self, image, claude_api_key=None):
210
  """تحليل الصورة للكشف عن الحساسيات"""
211
  try:
212
+ if not self.allergy_data:
213
+ raise ValueError("Allergy data not loaded")
214
+
215
  # استخراج النص من الصورة
216
  extracted_text = self.ocr_model.process_image(image)
217
+ if extracted_text.startswith("Error processing image"):
218
+ raise ValueError(extracted_text)
219
+
220
+ logger.info(f"Extracted text: {extracted_text[:200]}...")
221
 
222
  # تحويل النص إلى tokens
223
  tokens = self.tokenize_text(extracted_text)
224
+ if not tokens:
225
+ raise ValueError("No tokens extracted from text")
226
 
227
+ database_matches = {}
228
+ claude_matches = {}
229
 
230
  for token in tokens:
231
+ # البحث أولاً في قاعدة البيانات
232
+ allergy = self.check_allergen_in_excel(token)
233
+ if allergy:
234
+ if allergy not in database_matches:
235
+ database_matches[allergy] = []
236
+ database_matches[allergy].append(token)
237
+ elif claude_api_key:
238
+ # إذا لم يُوجد في ملف الإكسل، استدعِ Claude API
239
+ allergy = self.check_allergy_risk(token, claude_api_key)
240
+ if allergy:
241
+ if allergy not in claude_matches:
242
+ claude_matches[allergy] = []
243
+ claude_matches[allergy].append(token)
244
+
245
+ detected_allergens = list(database_matches.keys()) + list(claude_matches.keys())
246
 
247
  return {
248
  "extracted_text": extracted_text,
249
  "detected_allergens": detected_allergens,
250
+ "database_matches": database_matches,
251
+ "claude_matches": claude_matches,
252
  "analyzed_tokens": tokens,
253
+ "success": True
254
  }
255
 
256
  except Exception as e:
257
  logger.error(f"Error analyzing image: {str(e)}", exc_info=True)
258
+ return {
259
+ "error": str(e),
260
+ "success": False
261
+ }