mashaelalbu committed on
Commit b1c0c81 · verified · 1 parent: 3aab4b6

Update app/utils.py

Files changed (1): app/utils.py (+93 −66)
app/utils.py CHANGED
@@ -1,3 +1,4 @@
+# utils.py
 import os
 import pandas as pd
 from transformers import AutoModel, AutoTokenizer
@@ -24,7 +25,6 @@ class OCRModel:
         try:
             logger.info("Initializing OCR model...")
 
-            # Attempt to load the model
             try:
                 self.tokenizer = AutoTokenizer.from_pretrained(
                     'stepfun-ai/GOT-OCR2_0',
@@ -61,19 +61,15 @@ class OCRModel:
         if image.mode != 'RGB':
             image = image.convert('RGB')
 
-        # Enhance contrast
         enhancer = ImageEnhance.Contrast(image)
         image = enhancer.enhance(1.5)
 
-        # Enhance sharpness
         enhancer = ImageEnhance.Sharpness(image)
         image = enhancer.enhance(1.5)
 
-        # Enhance brightness
         enhancer = ImageEnhance.Brightness(image)
         image = enhancer.enhance(1.2)
 
-        # Apply a smoothing filter
         image = image.filter(ImageFilter.SMOOTH)
 
         return image
@@ -85,18 +81,13 @@ class OCRModel:
         try:
             logger.info("Starting image processing")
 
-            # Preprocess the image
             processed_image = self.preprocess_image(image)
-
-            # Save the image temporarily for the model
             temp_image_path = "temp_ocr_image.jpg"
             processed_image.save(temp_image_path)
 
-            # Extract the text
             result = self.model.chat(self.tokenizer, temp_image_path, ocr_type='format')
             logger.info(f"Successfully extracted text: {result[:100]}...")
 
-            # Delete the temporary file
             if os.path.exists(temp_image_path):
                 os.remove(temp_image_path)
 
@@ -111,7 +102,6 @@ class OCRModel:
 class AllergyAnalyzer:
     def __init__(self, dataset_path):
         self.dataset_path = dataset_path
-        # Ensure NLTK data is downloaded
         try:
             nltk.data.find('tokenizers/punkt')
         except LookupError:
@@ -122,6 +112,7 @@ class AllergyAnalyzer:
             nltk.download('punkt_tab')
 
         self.allergy_dict = self.load_allergy_data()
+        self.ocr_model = OCRModel()
 
     def load_allergy_data(self):
         """Load allergy data from the Excel file"""
@@ -131,8 +122,8 @@ class AllergyAnalyzer:
 
             for index, row in df.iterrows():
                 allergy = row['Allergy']
-                ingredients = [ingredient for ingredient in row[1:] if pd.notna(ingredient)]
-                allergy_dict[allergy] = ingredients
+                ingredients = [str(ingredient).lower() for ingredient in row[1:] if pd.notna(ingredient)]
+                allergy_dict[allergy.lower()] = ingredients
 
             return allergy_dict
         except Exception as e:
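The rewritten load_allergy_data lines above lowercase both the dictionary keys and the ingredient values as the Excel sheet is read, so later lookups can be case-insensitive. check_database_allergens itself is not touched by this commit, so the helper below is only an illustrative sketch of how such a lowercased dictionary would typically be queried; the name find_allergens and its signature are hypothetical.

# Hypothetical lookup against the lowercased dictionary (not part of this commit)
def find_allergens(token, user_allergens, allergy_dict):
    token = str(token).lower()  # mirror the lowercasing applied in load_allergy_data
    matches = []
    for allergy in user_allergens:
        ingredients = allergy_dict.get(allergy.lower(), [])
        if token in ingredients:
            matches.append(allergy)
    return matches

# e.g. find_allergens("Casein", ["Milk"], {"milk": ["casein", "whey"]}) returns ["Milk"]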
@@ -152,28 +143,98 @@ class AllergyAnalyzer:
             results.append(allergy)
         return results
 
+    def analyze_image(self, image, user_allergens):
+        """Analyze the image directly to detect allergens"""
+        try:
+            # Extract the text from the image
+            extracted_text = self.ocr_model.process_image(image)
+            logger.info(f"Extracted text: {extracted_text}")
+
+            # Split the text into tokens
+            tokens = self.tokenize_text(extracted_text)
+            detected_allergens = set()
+            database_matches = {}
+            claude_matches = {}
+
+            # Check every token against the database
+            for token in tokens:
+                db_results = self.check_database_allergens(token, user_allergens)
+
+                if db_results:
+                    for allergy in db_results:
+                        detected_allergens.add(allergy)
+                        database_matches[allergy] = database_matches.get(allergy, []) + [token]
+                else:
+                    # If not found in the database, fall back to the Claude API
+                    claude_api_key = current_app.config.get('CLAUDE_API_KEY')
+                    if claude_api_key:
+                        for allergy in user_allergens:
+                            if self.check_claude_allergens(token, allergy, claude_api_key):
+                                detected_allergens.add(allergy)
+                                claude_matches[allergy] = claude_matches.get(allergy, []) + [token]
+
+            return {
+                "detected_allergens": list(detected_allergens),
+                "database_matches": database_matches,
+                "claude_matches": claude_matches,
+                "analyzed_tokens": tokens
+            }
+
+        except Exception as e:
+            logger.error(f"Error analyzing image: {str(e)}", exc_info=True)
+            return {
+                "detected_allergens": [],
+                "database_matches": {},
+                "claude_matches": {},
+                "analyzed_tokens": [],
+                "error": str(e)
+            }
+
     def check_claude_allergens(self, token, allergy, api_key):
         """Query the Claude API about allergens"""
-        prompt = f"""
-        You are a professional food safety expert. Analyze if '{token}' contains or is derived from {allergy}.
-
+        try:
+            # Prepare the image for the request
+            img_byte_arr = io.BytesIO()
+            image.save(img_byte_arr, format='JPEG')
+            img_byte_arr = img_byte_arr.getvalue()
+
+            prompt = f"""
+            Analyze if this product contains or is derived from {allergy}.
+            Focus on the ingredient: {token}.
             Respond ONLY with 'Yes' or 'No'. No explanations.
             """
-
-        url = "https://api.anthropic.com/v1/messages"
-        headers = {
-            "x-api-key": api_key,
-            "content-type": "application/json",
-            "anthropic-version": "2023-06-01"
-        }
-
-        data = {
-            "model": "claude-3-opus-20240229",
-            "messages": [{"role": "user", "content": prompt}],
-            "max_tokens": 10
-        }
-
-        try:
+
+            url = "https://api.anthropic.com/v1/messages"
+            headers = {
+                "x-api-key": api_key,
+                "content-type": "application/json",
+                "anthropic-version": "2023-06-01"
+            }
+
+            data = {
+                "model": "claude-3-opus-20240229",
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image",
+                                "source": {
+                                    "type": "base64",
+                                    "media_type": "image/jpeg",
+                                    "data": base64.b64encode(img_byte_arr).decode('utf-8')
+                                }
+                            },
+                            {
+                                "type": "text",
+                                "text": prompt
+                            }
+                        ]
+                    }
+                ],
+                "max_tokens": 10
+            }
+
             response = requests.post(url, json=data, headers=headers)
             json_response = response.json()
 
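The hunk above stops at json_response = response.json(); the unchanged lines that follow (not shown in this diff) presumably reduce Claude's reply to a boolean. A minimal sketch of that step, assuming the standard Anthropic Messages API response layout where the generated text sits under content[0]["text"]; the helper name parse_yes_no is hypothetical.

# Hypothetical post-processing of the Messages API response (not part of this commit)
def parse_yes_no(json_response):
    blocks = json_response.get("content", [])
    answer = blocks[0].get("text", "") if blocks else ""
    return answer.strip().lower().startswith("yes")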
@@ -183,38 +244,4 @@ Respond ONLY with 'Yes' or 'No'. No explanations.
 
         except Exception as e:
             logger.error(f"Error querying Claude API: {str(e)}")
-            return False
-
-    def analyze_text(self, text, user_allergens, claude_api_key=None):
-        """Analyze the text to detect allergens"""
-        detected_allergens = set()
-        database_matches = {}
-        claude_matches = {}
-        tokens = self.tokenize_text(text)
-
-        for token in tokens:
-            # Check the database first
-            db_results = self.check_database_allergens(token, user_allergens)
-
-            if db_results:
-                for allergy in db_results:
-                    detected_allergens.add(allergy)
-                    database_matches[allergy] = database_matches.get(allergy, []) + [token]
-            else:
-                # If not found in the database, fall back to the Claude API
-                if claude_api_key:
-                    for allergy in user_allergens:
-                        if self.check_claude_allergens(token, allergy, claude_api_key):
-                            detected_allergens.add(allergy)
-                            claude_matches[allergy] = claude_matches.get(allergy, []) + [token]
-
-        return {
-            "detected_allergens": list(detected_allergens),
-            "database_matches": database_matches,
-            "claude_matches": claude_matches,
-            "analyzed_tokens": tokens
-        }
-
-    def get_allergen_list(self):
-        """Get the list of known allergens"""
-        return list(self.allergy_dict.keys())
+            return
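Taken together, the commit moves OCR into AllergyAnalyzer (via the new self.ocr_model field) and replaces analyze_text with analyze_image, a single entry point that OCRs a label photo, checks each token against the Excel-backed dictionary, and falls back to the Claude API for unknown tokens. A rough usage sketch under stated assumptions: the dataset path and the PIL image below are placeholders, and because analyze_image reads CLAUDE_API_KEY from current_app.config, it is expected to run inside a Flask application context.

# Hypothetical caller; the route/view code is not part of this commit
from PIL import Image
from app.utils import AllergyAnalyzer

analyzer = AllergyAnalyzer("data/allergies.xlsx")   # dataset path is an assumption

image = Image.open("label_photo.jpg")               # product-label photo (placeholder)
report = analyzer.analyze_image(image, user_allergens=["milk", "peanut"])

print(report["detected_allergens"])   # allergens found by either source
print(report["database_matches"])     # tokens matched via the Excel dataset
print(report["claude_matches"])       # tokens confirmed via the Claude API fallback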