Spaces:
Sleeping
Sleeping
Update app/utils.py
Browse files- app/utils.py +95 -37
app/utils.py
CHANGED
@@ -94,7 +94,7 @@ class OCRModel:
|
|
94 |
|
95 |
except Exception as e:
|
96 |
logger.error(f"Error in OCR processing: {str(e)}", exc_info=True)
|
97 |
-
if os.path.exists(temp_image_path):
|
98 |
os.remove(temp_image_path)
|
99 |
return f"Error processing image: {str(e)}"
|
100 |
|
@@ -111,93 +111,151 @@ class AllergyAnalyzer:
|
|
111 |
nltk.download('punkt_tab')
|
112 |
|
113 |
self.allergy_data = self.load_allergy_data()
|
|
|
|
|
114 |
self.ocr_model = OCRModel()
|
115 |
|
116 |
def load_allergy_data(self):
|
117 |
"""تحميل بيانات الحساسيات من ملف Excel"""
|
118 |
try:
|
119 |
-
|
|
|
120 |
|
121 |
allergy_dict = {}
|
122 |
|
123 |
for index, row in df.iterrows():
|
124 |
-
|
125 |
-
|
126 |
-
|
|
|
127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
return allergy_dict
|
129 |
|
130 |
except Exception as e:
|
131 |
logger.error(f"Error loading allergy data: {str(e)}", exc_info=True)
|
|
|
132 |
|
133 |
def tokenize_text(self, text):
|
134 |
"""تقسيم النص إلى كلمات"""
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
137 |
|
138 |
def check_allergen_in_excel(self, token):
|
139 |
"""التحقق من وجود التوكن في ملف الإكسل"""
|
140 |
-
|
141 |
-
if
|
142 |
-
return
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
|
145 |
def check_allergy_risk(self, ingredient, api_key):
|
146 |
"""الاستعلام من Claude API عن الحساسيات"""
|
147 |
-
prompt = f"""
|
148 |
-
You are a professional food safety expert specializing in allergen classification and risk assessment. Analyze the ingredient '{ingredient}' and determine whether it poses any allergy risk. Respond with 'Yes' or 'No'.
|
149 |
-
"""
|
150 |
-
url = "https://api.anthropic.com/v1/messages"
|
151 |
-
headers = {
|
152 |
-
"x-api-key": api_key,
|
153 |
-
"content-type": "application/json",
|
154 |
-
}
|
155 |
-
|
156 |
-
data = {
|
157 |
-
"model": "claude-3-opus-20240229",
|
158 |
-
"messages": [{"role": "user", "content": prompt}],
|
159 |
-
"max_tokens": 10,
|
160 |
-
}
|
161 |
-
|
162 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
response = requests.post(url, json=data, headers=headers)
|
|
|
164 |
|
165 |
response_json = response.json()
|
166 |
|
167 |
if "content" in response_json and isinstance(response_json["content"], list):
|
168 |
-
|
|
|
|
|
|
|
|
|
169 |
|
170 |
except Exception as e:
|
171 |
logger.error(f"Error querying Claude API: {str(e)}")
|
172 |
|
173 |
-
return
|
174 |
|
175 |
def analyze_image(self, image, claude_api_key=None):
|
176 |
"""تحليل الصورة للكشف عن الحساسيات"""
|
177 |
try:
|
|
|
|
|
|
|
178 |
# استخراج النص من الصورة
|
179 |
extracted_text = self.ocr_model.process_image(image)
|
|
|
|
|
|
|
|
|
180 |
|
181 |
# تحويل النص إلى tokens
|
182 |
tokens = self.tokenize_text(extracted_text)
|
|
|
|
|
183 |
|
184 |
-
|
|
|
185 |
|
186 |
for token in tokens:
|
187 |
-
|
188 |
-
|
189 |
-
if
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
|
196 |
return {
|
197 |
"extracted_text": extracted_text,
|
198 |
"detected_allergens": detected_allergens,
|
|
|
|
|
199 |
"analyzed_tokens": tokens,
|
|
|
200 |
}
|
201 |
|
202 |
except Exception as e:
|
203 |
logger.error(f"Error analyzing image: {str(e)}", exc_info=True)
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
except Exception as e:
|
96 |
logger.error(f"Error in OCR processing: {str(e)}", exc_info=True)
|
97 |
+
if 'temp_image_path' in locals() and os.path.exists(temp_image_path):
|
98 |
os.remove(temp_image_path)
|
99 |
return f"Error processing image: {str(e)}"
|
100 |
|
|
|
111 |
nltk.download('punkt_tab')
|
112 |
|
113 |
self.allergy_data = self.load_allergy_data()
|
114 |
+
if self.allergy_data is None:
|
115 |
+
raise ValueError("Failed to load allergy data from dataset")
|
116 |
self.ocr_model = OCRModel()
|
117 |
|
118 |
def load_allergy_data(self):
    """Load the allergen lookup table from the Excel dataset.

    Reads ``self.dataset_path`` with the first row treated as the header.
    In every data row the first cell is the allergy category name and each
    remaining non-empty cell is an ingredient belonging to that category.
    Names and ingredients are stripped and lowercased.

    Returns:
        A dict mapping category name to a list of ingredient strings, or
        ``None`` when the dataset cannot be read or parsed (the error is
        logged).
    """
    try:
        # Read the Excel file, treating the first row as column headers.
        df = pd.read_excel(self.dataset_path, header=0)

        allergy_dict = {}

        for _, row in df.iterrows():
            # Category name comes from the first column. An empty cell is
            # NaN, and str(NaN) is the truthy string 'nan', so test the raw
            # cell first; otherwise a bogus 'nan' category would be stored.
            name_cell = row.iloc[0]
            if pd.isna(name_cell):
                continue
            allergy_name = str(name_cell).strip().lower()
            if not allergy_name or allergy_name == 'nan':
                continue

            # Ingredients come from every following column.
            ingredients = []
            for cell in row.iloc[1:]:
                if pd.isna(cell):
                    continue
                ingredient = str(cell).strip().lower()
                if ingredient and ingredient != 'nan':
                    ingredients.append(ingredient)

            allergy_dict[allergy_name] = ingredients

        logger.info(f"Successfully loaded allergy data with {len(allergy_dict)} categories")
        return allergy_dict

    except Exception as e:
        # Boundary handler: callers detect failure through the None return.
        logger.error(f"Error loading allergy data: {str(e)}", exc_info=True)
        return None
|
147 |
|
148 |
def tokenize_text(self, text):
    """Split text into lowercase, purely alphabetic word tokens.

    Returns an empty list (after logging the error) if tokenization fails.
    """
    try:
        words = []
        for piece in nltk.word_tokenize(text):
            # Keep only tokens made entirely of letters.
            if piece.isalpha():
                words.append(piece.lower())
        return words
    except Exception as e:
        logger.error(f"Error tokenizing text: {str(e)}")
        return []
|
156 |
|
157 |
def check_allergen_in_excel(self, token):
    """Look up a token in the allergy data loaded from the Excel dataset.

    The token is stripped and lowercased before comparison so that it
    matches the normalized ingredient strings regardless of the caller's
    casing or surrounding whitespace.

    Args:
        token: Candidate ingredient word.

    Returns:
        The name of the first allergy category whose ingredient list
        contains the token, or ``None`` when there is no match, no data is
        loaded, or the token is not a usable string.
    """
    if not self.allergy_data or not isinstance(token, str):
        return None

    needle = token.strip().lower()
    if not needle:
        return None

    try:
        for allergy_name, ingredients in self.allergy_data.items():
            if needle in ingredients:
                return allergy_name
    except (AttributeError, TypeError) as e:
        # Malformed allergy data (not a mapping of iterables): best effort.
        logger.error(f"Error checking allergen in Excel: {str(e)}")
    return None
|
170 |
|
171 |
def check_allergy_risk(self, ingredient, api_key, timeout=30):
    """Ask the Claude API which allergen category an ingredient belongs to.

    Args:
        ingredient: Ingredient word to classify.
        api_key: Anthropic API key; when empty or ``None`` no request is
            made and ``None`` is returned.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The lowercase category name when the model's answer is one of the
        keys of ``self.allergy_data``; otherwise ``None`` (including on any
        request or parsing error, which is logged).
    """
    if not api_key:
        return None

    try:
        prompt = f"""
        You are a professional food safety expert. Analyze the ingredient '{ingredient}' and determine which of these allergen categories it belongs to:
        dairy, eggs, peanuts, soy, tree nuts, wheat, fish, shellfish, sesame.
        Respond only with the category name or 'None' if not found.
        """
        url = "https://api.anthropic.com/v1/messages"
        headers = {
            "x-api-key": api_key,
            "content-type": "application/json",
            "anthropic-version": "2023-06-01"
        }

        data = {
            "model": "claude-3-opus-20240229",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 10
        }

        response = requests.post(url, json=data, headers=headers, timeout=timeout)
        response.raise_for_status()

        response_json = response.json()

        if "content" in response_json and isinstance(response_json["content"], list):
            # Tolerate a trailing period ("dairy.") in the model's reply.
            result = response_json["content"][0]["text"].strip().lower().rstrip(".")
            # Accept the answer only if it is one of the known categories.
            if result in self.allergy_data:
                return result

    except Exception as e:
        logger.error(f"Error querying Claude API: {str(e)}")

    return None
|
208 |
|
209 |
def analyze_image(self, image, claude_api_key=None):
    """Analyze an image and report the allergens found in its text.

    The text is extracted with ``self.ocr_model.process_image``, tokenized,
    and each token is looked up in the loaded allergy data. Tokens with no
    match there are sent to ``check_allergy_risk`` when ``claude_api_key``
    is provided.

    Args:
        image: Image passed through to the OCR model.
        claude_api_key: Optional API key enabling the Claude fallback.

    Returns:
        On success, a dict with ``extracted_text``, ``detected_allergens``
        (unique category names, database matches first), ``database_matches``
        and ``claude_matches`` (category -> matching tokens),
        ``analyzed_tokens`` and ``success: True``. On any failure, a dict
        with ``error`` (message) and ``success: False``.
    """
    try:
        if not self.allergy_data:
            raise ValueError("Allergy data not loaded")

        # Extract the text from the image.
        extracted_text = self.ocr_model.process_image(image)
        if extracted_text.startswith("Error processing image"):
            raise ValueError(extracted_text)

        logger.info(f"Extracted text: {extracted_text[:200]}...")

        # Convert the text into tokens.
        tokens = self.tokenize_text(extracted_text)
        if not tokens:
            raise ValueError("No tokens extracted from text")

        database_matches = {}
        claude_matches = {}
        # One API lookup per distinct token; repeated words reuse the answer.
        claude_cache = {}

        for token in tokens:
            # Look in the local database first.
            allergy = self.check_allergen_in_excel(token)
            if allergy:
                database_matches.setdefault(allergy, []).append(token)
                continue

            if not claude_api_key:
                continue

            # Not found in the Excel data: fall back to the Claude API.
            if token not in claude_cache:
                claude_cache[token] = self.check_allergy_risk(token, claude_api_key)
            allergy = claude_cache[token]
            if allergy:
                claude_matches.setdefault(allergy, []).append(token)

        # A category can be found by both sources; report it only once while
        # keeping database matches ahead of Claude matches.
        detected_allergens = list(dict.fromkeys([*database_matches, *claude_matches]))

        return {
            "extracted_text": extracted_text,
            "detected_allergens": detected_allergens,
            "database_matches": database_matches,
            "claude_matches": claude_matches,
            "analyzed_tokens": tokens,
            "success": True
        }

    except Exception as e:
        logger.error(f"Error analyzing image: {str(e)}", exc_info=True)
        return {
            "error": str(e),
            "success": False
        }
|