Spaces:
Sleeping
Sleeping
Update app/utils.py
Browse files- app/utils.py +34 -71
app/utils.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
# utils.py (معدل)
|
2 |
import os
|
3 |
import pandas as pd
|
4 |
from transformers import AutoModel, AutoTokenizer
|
@@ -9,8 +8,6 @@ from transformers import BertTokenizer
|
|
9 |
import nltk
|
10 |
import requests
|
11 |
import io
|
12 |
-
from flask import current_app
|
13 |
-
import base64
|
14 |
|
15 |
logger = logging.getLogger(__name__)
|
16 |
|
@@ -108,129 +105,95 @@ class AllergyAnalyzer:
|
|
108 |
nltk.data.find('tokenizers/punkt')
|
109 |
except LookupError:
|
110 |
nltk.download('punkt')
|
111 |
-
|
112 |
-
|
113 |
-
except LookupError:
|
114 |
-
nltk.download('punkt_tab')
|
115 |
-
|
116 |
-
self.allergy_dict = self.load_allergy_data()
|
117 |
self.ocr_model = OCRModel()
|
118 |
|
119 |
def load_allergy_data(self):
|
120 |
"""تحميل بيانات الحساسيات من ملف Excel"""
|
121 |
try:
|
122 |
-
df = pd.read_excel(self.dataset_path)
|
|
|
123 |
allergy_dict = {}
|
124 |
|
125 |
for index, row in df.iterrows():
|
126 |
-
|
127 |
-
ingredients = [str(ingredient).lower() for ingredient in row[1:] if pd.notna(ingredient)]
|
128 |
-
allergy_dict[
|
129 |
|
130 |
return allergy_dict
|
|
|
131 |
except Exception as e:
|
132 |
logger.error(f"Error loading allergy data: {str(e)}", exc_info=True)
|
133 |
-
return {}
|
134 |
|
135 |
def tokenize_text(self, text):
    """Split text into lower-cased, purely alphabetic word tokens."""
    words = []
    for word in nltk.word_tokenize(text):
        # Drop numbers, punctuation and mixed tokens such as "e471".
        if not word.isalpha():
            continue
        words.append(word.lower())
    return words
|
139 |
|
140 |
-
def
|
141 |
-
"""
|
142 |
-
|
143 |
-
for allergy, ingredients in self.allergy_dict.items():
|
144 |
if token in ingredients:
|
145 |
-
|
146 |
-
return
|
147 |
|
148 |
def check_allergy_risk(self, ingredient, api_key):
|
149 |
"""الاستعلام من Claude API عن الحساسيات"""
|
150 |
prompt = f"""
|
151 |
-
You are a professional food safety expert specializing in allergen classification.
|
152 |
-
Please analyze the ingredient '{ingredient}' and determine which of the following major allergen categories it belongs to:
|
153 |
-
dairy, eggs, peanuts, soy, tree nuts, wheat, fish, shellfish, sesame.
|
154 |
-
|
155 |
-
Return only the allergen category name if found, or 'None' if not found.
|
156 |
-
Example responses: 'dairy', 'eggs', 'None', etc.
|
157 |
"""
|
158 |
url = "https://api.anthropic.com/v1/messages"
|
159 |
headers = {
|
160 |
"x-api-key": api_key,
|
161 |
"content-type": "application/json",
|
162 |
-
"anthropic-version": "2023-06-01"
|
163 |
}
|
164 |
|
165 |
data = {
|
166 |
"model": "claude-3-opus-20240229",
|
167 |
"messages": [{"role": "user", "content": prompt}],
|
168 |
-
"max_tokens": 10
|
169 |
}
|
170 |
|
171 |
try:
|
172 |
response = requests.post(url, json=data, headers=headers)
|
173 |
-
json_response = response.json()
|
174 |
|
175 |
-
|
176 |
-
result = json_response["content"][0]["text"].strip().lower()
|
177 |
-
return result if result in self.allergy_dict else None
|
178 |
-
return None
|
179 |
|
|
|
|
|
|
|
180 |
except Exception as e:
|
181 |
logger.error(f"Error querying Claude API: {str(e)}")
|
182 |
-
|
|
|
183 |
|
184 |
def analyze_image(self, image, claude_api_key=None):
|
185 |
-
"""تحليل الصورة
|
186 |
try:
|
187 |
# استخراج النص من الصورة
|
188 |
extracted_text = self.ocr_model.process_image(image)
|
189 |
-
logger.info(f"Extracted text: {extracted_text}")
|
190 |
|
191 |
# تحويل النص إلى tokens
|
192 |
tokens = self.tokenize_text(extracted_text)
|
193 |
-
detected_allergens = set()
|
194 |
-
database_matches = {}
|
195 |
-
claude_matches = {}
|
196 |
|
197 |
-
|
|
|
198 |
for token in tokens:
|
199 |
-
|
200 |
|
201 |
-
if
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
elif claude_api_key:
|
208 |
-
# إذا لم توجد في قاعدة البيانات، نستخدم Claude API
|
209 |
-
api_result = self.check_allergy_risk(token, claude_api_key)
|
210 |
-
if api_result:
|
211 |
-
detected_allergens.add(api_result)
|
212 |
-
if api_result not in claude_matches:
|
213 |
-
claude_matches[api_result] = []
|
214 |
-
claude_matches[api_result].append(token)
|
215 |
|
216 |
return {
|
217 |
"extracted_text": extracted_text,
|
218 |
-
"detected_allergens":
|
219 |
-
"
|
220 |
-
"claude_matches": claude_matches,
|
221 |
-
"analyzed_tokens": tokens
|
222 |
}
|
223 |
-
|
224 |
except Exception as e:
|
225 |
logger.error(f"Error analyzing image: {str(e)}", exc_info=True)
|
226 |
-
return {
|
227 |
-
"error": str(e),
|
228 |
-
"detected_allergens": [],
|
229 |
-
"database_matches": {},
|
230 |
-
"claude_matches": {},
|
231 |
-
"analyzed_tokens": []
|
232 |
-
}
|
233 |
-
|
234 |
-
def get_allergen_list(self):
    """Return the known allergy names, i.e. the keys of ``self.allergy_dict``."""
    allergen_names = [name for name in self.allergy_dict]
    return allergen_names
|
|
|
|
|
1 |
import os
|
2 |
import pandas as pd
|
3 |
from transformers import AutoModel, AutoTokenizer
|
|
|
8 |
import nltk
|
9 |
import requests
|
10 |
import io
|
|
|
|
|
11 |
|
12 |
logger = logging.getLogger(__name__)
|
13 |
|
|
|
105 |
nltk.data.find('tokenizers/punkt')
|
106 |
except LookupError:
|
107 |
nltk.download('punkt')
|
108 |
+
|
109 |
+
self.allergy_data = self.load_allergy_data()
|
|
|
|
|
|
|
|
|
110 |
self.ocr_model = OCRModel()
|
111 |
|
112 |
def load_allergy_data(self):
    """Load the allergen lookup table from the Excel workbook.

    The workbook at ``self.dataset_path`` is read without a header row. In
    every row the first cell is taken as the allergy name and the remaining
    non-empty cells as its ingredients; all values are converted to strings,
    stripped and lower-cased.

    Returns:
        dict: ``{allergy_name: [ingredient, ...]}``. An empty dict is
        returned when the workbook cannot be read or parsed, so callers can
        always iterate the result.
    """
    try:
        df = pd.read_excel(self.dataset_path, header=None)

        allergy_dict = {}
        for _, row in df.iterrows():
            name_cell = row.iloc[0]
            # A blank first cell carries no allergy name; skip the row
            # instead of letting one bad row abort the whole load.
            if pd.isna(name_cell):
                continue
            # str() guards against numeric cells, which have no .strip().
            allergy_name = str(name_cell).strip().lower()
            if not allergy_name:
                continue
            ingredients = [
                str(ingredient).strip().lower()
                for ingredient in row.iloc[1:]
                if pd.notna(ingredient)
            ]
            allergy_dict[allergy_name] = ingredients

        return allergy_dict

    except Exception as e:
        logger.error(f"Error loading allergy data: {str(e)}", exc_info=True)
        # Fall back to an empty table rather than an implicit None.
        return {}
|
|
|
128 |
|
129 |
def tokenize_text(self, text):
    """Split text into lower-cased, purely alphabetic word tokens."""
    words = []
    for word in nltk.word_tokenize(text):
        # Drop numbers, punctuation and mixed tokens such as "e471".
        if not word.isalpha():
            continue
        words.append(word.lower())
    return words
|
133 |
|
134 |
+
def check_allergen_in_excel(self, token):
    """Look up a token in the allergen table held in ``self.allergy_data``.

    Args:
        token: Word to look up; it is compared as-is against the stored
            ingredient strings.

    Returns:
        The name of the first allergy whose ingredient list contains
        ``token``, or ``None`` when there is no match or no table is loaded.
    """
    # The table may be None if loading failed; treat that as "no entries".
    allergy_data = self.allergy_data or {}
    for allergy_name, ingredients in allergy_data.items():
        if token in ingredients:
            return allergy_name
    return None
|
140 |
|
141 |
def check_allergy_risk(self, ingredient, api_key):
    """Ask the Claude Messages API whether an ingredient is an allergy risk.

    Args:
        ingredient: Ingredient name interpolated into the prompt.
        api_key: Anthropic API key sent in the ``x-api-key`` header.

    Returns:
        bool: ``True`` only when the model's reply starts with "yes"
        (case-insensitive). HTTP errors, network failures, timeouts and
        unexpected payloads are logged and reported as ``False``.
    """
    prompt = f"""
        You are a professional food safety expert specializing in allergen classification and risk assessment. Analyze the ingredient '{ingredient}' and determine whether it poses any allergy risk. Respond with 'Yes' or 'No'.
        """
    url = "https://api.anthropic.com/v1/messages"
    headers = {
        "x-api-key": api_key,
        "content-type": "application/json",
        # The Messages API requires an explicit version header.
        "anthropic-version": "2023-06-01",
    }

    data = {
        "model": "claude-3-opus-20240229",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 10,
    }

    try:
        # A timeout keeps one stalled request from blocking the whole analysis.
        response = requests.post(url, json=data, headers=headers, timeout=30)
        # Surface 4xx/5xx responses in the log instead of parsing an error body.
        response.raise_for_status()
        response_json = response.json()

        content = response_json.get("content")
        if isinstance(content, list) and content:
            answer = str(content[0].get("text", "")).strip().lower()
            # Accept "yes", "Yes." and "Yes, ..." alike.
            return answer.startswith("yes")

    except Exception as e:
        logger.error(f"Error querying Claude API: {str(e)}")

    return False
|
170 |
|
171 |
def analyze_image(self, image, claude_api_key=None):
    """Run OCR on an image and classify each recognised word as an allergen.

    Every token is first looked up with ``check_allergen_in_excel``. Tokens
    not found there are sent to ``check_allergy_risk`` when a
    ``claude_api_key`` is supplied; without a key they are left out of the
    result.

    Args:
        image: Image object handed to ``self.ocr_model.process_image``.
        claude_api_key: Optional API key enabling the Claude fallback.

    Returns:
        dict: ``extracted_text`` (OCR output), ``detected_allergens``
        (token -> allergy name, ``'API Risk'`` or ``'Safe'``) and
        ``analyzed_tokens`` (all tokens in order). On failure the same keys
        are returned with empty values plus an ``error`` message.
    """
    try:
        # Extract the text from the image.
        extracted_text = self.ocr_model.process_image(image)

        # Turn the text into tokens.
        tokens = self.tokenize_text(extracted_text)

        detected_allergens = {}

        for token in tokens:
            # A repeated word already has its verdict; skipping it avoids
            # a second (billed) API round-trip for the same ingredient.
            if token in detected_allergens:
                continue

            allergy_from_excel = self.check_allergen_in_excel(token)

            if allergy_from_excel:
                detected_allergens[token] = allergy_from_excel

            elif claude_api_key:  # Not in the Excel table: ask the Claude API.
                is_allergen_risky = self.check_allergy_risk(token, claude_api_key)
                detected_allergens[token] = 'API Risk' if is_allergen_risky else 'Safe'

        return {
            "extracted_text": extracted_text,
            "detected_allergens": detected_allergens,
            "analyzed_tokens": tokens,
        }

    except Exception as e:
        logger.error(f"Error analyzing image: {str(e)}", exc_info=True)
        # Return a result-shaped dict so callers never receive None.
        return {
            "error": str(e),
            "extracted_text": "",
            "detected_allergens": {},
            "analyzed_tokens": [],
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|