Spaces:
Sleeping
Sleeping
Update app/utils.py
Browse files- app/utils.py +93 -66
app/utils.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import os
|
2 |
import pandas as pd
|
3 |
from transformers import AutoModel, AutoTokenizer
|
@@ -24,7 +25,6 @@ class OCRModel:
|
|
24 |
try:
|
25 |
logger.info("Initializing OCR model...")
|
26 |
|
27 |
-
# محاولة تحميل النموذج
|
28 |
try:
|
29 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
30 |
'stepfun-ai/GOT-OCR2_0',
|
@@ -61,19 +61,15 @@ class OCRModel:
|
|
61 |
if image.mode != 'RGB':
|
62 |
image = image.convert('RGB')
|
63 |
|
64 |
-
# تحسين التباين
|
65 |
enhancer = ImageEnhance.Contrast(image)
|
66 |
image = enhancer.enhance(1.5)
|
67 |
|
68 |
-
# تحسين الحدة
|
69 |
enhancer = ImageEnhance.Sharpness(image)
|
70 |
image = enhancer.enhance(1.5)
|
71 |
|
72 |
-
# تحسين السطوع
|
73 |
enhancer = ImageEnhance.Brightness(image)
|
74 |
image = enhancer.enhance(1.2)
|
75 |
|
76 |
-
# تطبيق فلتر لتليين الصورة
|
77 |
image = image.filter(ImageFilter.SMOOTH)
|
78 |
|
79 |
return image
|
@@ -85,18 +81,13 @@ class OCRModel:
|
|
85 |
try:
|
86 |
logger.info("Starting image processing")
|
87 |
|
88 |
-
# معالجة الصورة
|
89 |
processed_image = self.preprocess_image(image)
|
90 |
-
|
91 |
-
# حفظ الصورة مؤقتاً للتعامل مع النموذج
|
92 |
temp_image_path = "temp_ocr_image.jpg"
|
93 |
processed_image.save(temp_image_path)
|
94 |
|
95 |
-
# استخراج النص
|
96 |
result = self.model.chat(self.tokenizer, temp_image_path, ocr_type='format')
|
97 |
logger.info(f"Successfully extracted text: {result[:100]}...")
|
98 |
|
99 |
-
# حذف الملف المؤقت
|
100 |
if os.path.exists(temp_image_path):
|
101 |
os.remove(temp_image_path)
|
102 |
|
@@ -111,7 +102,6 @@ class OCRModel:
|
|
111 |
class AllergyAnalyzer:
|
112 |
def __init__(self, dataset_path):
|
113 |
self.dataset_path = dataset_path
|
114 |
-
# Ensure NLTK data is downloaded
|
115 |
try:
|
116 |
nltk.data.find('tokenizers/punkt')
|
117 |
except LookupError:
|
@@ -122,6 +112,7 @@ class AllergyAnalyzer:
|
|
122 |
nltk.download('punkt_tab')
|
123 |
|
124 |
self.allergy_dict = self.load_allergy_data()
|
|
|
125 |
|
126 |
def load_allergy_data(self):
|
127 |
"""تحميل بيانات الحساسيات من ملف Excel"""
|
@@ -131,8 +122,8 @@ class AllergyAnalyzer:
|
|
131 |
|
132 |
for index, row in df.iterrows():
|
133 |
allergy = row['Allergy']
|
134 |
-
ingredients = [ingredient for ingredient in row[1:] if pd.notna(ingredient)]
|
135 |
-
allergy_dict[allergy] = ingredients
|
136 |
|
137 |
return allergy_dict
|
138 |
except Exception as e:
|
@@ -152,28 +143,98 @@ class AllergyAnalyzer:
|
|
152 |
results.append(allergy)
|
153 |
return results
|
154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
def check_claude_allergens(self, token, allergy, api_key):
|
156 |
"""الاستعلام من Claude API عن الحساسيات"""
|
157 |
-
|
158 |
-
|
159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
Respond ONLY with 'Yes' or 'No'. No explanations.
|
161 |
"""
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
response = requests.post(url, json=data, headers=headers)
|
178 |
json_response = response.json()
|
179 |
|
@@ -183,38 +244,4 @@ Respond ONLY with 'Yes' or 'No'. No explanations.
|
|
183 |
|
184 |
except Exception as e:
|
185 |
logger.error(f"Error querying Claude API: {str(e)}")
|
186 |
-
return
|
187 |
-
|
188 |
-
def analyze_text(self, text, user_allergens, claude_api_key=None):
|
189 |
-
"""تحليل النص للكشف عن الحساسيات"""
|
190 |
-
detected_allergens = set()
|
191 |
-
database_matches = {}
|
192 |
-
claude_matches = {}
|
193 |
-
tokens = self.tokenize_text(text)
|
194 |
-
|
195 |
-
for token in tokens:
|
196 |
-
# التحقق من قاعدة البيانات أولاً
|
197 |
-
db_results = self.check_database_allergens(token, user_allergens)
|
198 |
-
|
199 |
-
if db_results:
|
200 |
-
for allergy in db_results:
|
201 |
-
detected_allergens.add(allergy)
|
202 |
-
database_matches[allergy] = database_matches.get(allergy, []) + [token]
|
203 |
-
else:
|
204 |
-
# إذا لم توجد في قاعدة البيانات، نستخدم Claude API
|
205 |
-
if claude_api_key:
|
206 |
-
for allergy in user_allergens:
|
207 |
-
if self.check_claude_allergens(token, allergy, claude_api_key):
|
208 |
-
detected_allergens.add(allergy)
|
209 |
-
claude_matches[allergy] = claude_matches.get(allergy, []) + [token]
|
210 |
-
|
211 |
-
return {
|
212 |
-
"detected_allergens": list(detected_allergens),
|
213 |
-
"database_matches": database_matches,
|
214 |
-
"claude_matches": claude_matches,
|
215 |
-
"analyzed_tokens": tokens
|
216 |
-
}
|
217 |
-
|
218 |
-
def get_allergen_list(self):
|
219 |
-
"""الحصول على قائمة الحساسيات المعروفة"""
|
220 |
-
return list(self.allergy_dict.keys())
|
|
|
1 |
+
# utils.py
|
2 |
import os
|
3 |
import pandas as pd
|
4 |
from transformers import AutoModel, AutoTokenizer
|
|
|
25 |
try:
|
26 |
logger.info("Initializing OCR model...")
|
27 |
|
|
|
28 |
try:
|
29 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
30 |
'stepfun-ai/GOT-OCR2_0',
|
|
|
61 |
if image.mode != 'RGB':
|
62 |
image = image.convert('RGB')
|
63 |
|
|
|
64 |
enhancer = ImageEnhance.Contrast(image)
|
65 |
image = enhancer.enhance(1.5)
|
66 |
|
|
|
67 |
enhancer = ImageEnhance.Sharpness(image)
|
68 |
image = enhancer.enhance(1.5)
|
69 |
|
|
|
70 |
enhancer = ImageEnhance.Brightness(image)
|
71 |
image = enhancer.enhance(1.2)
|
72 |
|
|
|
73 |
image = image.filter(ImageFilter.SMOOTH)
|
74 |
|
75 |
return image
|
|
|
81 |
try:
|
82 |
logger.info("Starting image processing")
|
83 |
|
|
|
84 |
processed_image = self.preprocess_image(image)
|
|
|
|
|
85 |
temp_image_path = "temp_ocr_image.jpg"
|
86 |
processed_image.save(temp_image_path)
|
87 |
|
|
|
88 |
result = self.model.chat(self.tokenizer, temp_image_path, ocr_type='format')
|
89 |
logger.info(f"Successfully extracted text: {result[:100]}...")
|
90 |
|
|
|
91 |
if os.path.exists(temp_image_path):
|
92 |
os.remove(temp_image_path)
|
93 |
|
|
|
102 |
class AllergyAnalyzer:
|
103 |
def __init__(self, dataset_path):
|
104 |
self.dataset_path = dataset_path
|
|
|
105 |
try:
|
106 |
nltk.data.find('tokenizers/punkt')
|
107 |
except LookupError:
|
|
|
112 |
nltk.download('punkt_tab')
|
113 |
|
114 |
self.allergy_dict = self.load_allergy_data()
|
115 |
+
self.ocr_model = OCRModel()
|
116 |
|
117 |
def load_allergy_data(self):
|
118 |
"""تحميل بيانات الحساسيات من ملف Excel"""
|
|
|
122 |
|
123 |
for index, row in df.iterrows():
|
124 |
allergy = row['Allergy']
|
125 |
+
ingredients = [str(ingredient).lower() for ingredient in row[1:] if pd.notna(ingredient)]
|
126 |
+
allergy_dict[allergy.lower()] = ingredients
|
127 |
|
128 |
return allergy_dict
|
129 |
except Exception as e:
|
|
|
143 |
results.append(allergy)
|
144 |
return results
|
145 |
|
146 |
+
def analyze_image(self, image, user_allergens):
|
147 |
+
"""تحليل الصورة مباشرة للكشف عن الحساسيات"""
|
148 |
+
try:
|
149 |
+
# استخراج النص من الصورة
|
150 |
+
extracted_text = self.ocr_model.process_image(image)
|
151 |
+
logger.info(f"Extracted text: {extracted_text}")
|
152 |
+
|
153 |
+
# تحويل النص إلى tokens
|
154 |
+
tokens = self.tokenize_text(extracted_text)
|
155 |
+
detected_allergens = set()
|
156 |
+
database_matches = {}
|
157 |
+
claude_matches = {}
|
158 |
+
|
159 |
+
# التحقق من كل token في قاعدة البيانات
|
160 |
+
for token in tokens:
|
161 |
+
db_results = self.check_database_allergens(token, user_allergens)
|
162 |
+
|
163 |
+
if db_results:
|
164 |
+
for allergy in db_results:
|
165 |
+
detected_allergens.add(allergy)
|
166 |
+
database_matches[allergy] = database_matches.get(allergy, []) + [token]
|
167 |
+
else:
|
168 |
+
# إذا لم توجد في قاعدة البيانات، نستخدم Claude API
|
169 |
+
claude_api_key = current_app.config.get('CLAUDE_API_KEY')
|
170 |
+
if claude_api_key:
|
171 |
+
for allergy in user_allergens:
|
172 |
+
if self.check_claude_allergens(token, allergy, claude_api_key):
|
173 |
+
detected_allergens.add(allergy)
|
174 |
+
claude_matches[allergy] = claude_matches.get(allergy, []) + [token]
|
175 |
+
|
176 |
+
return {
|
177 |
+
"detected_allergens": list(detected_allergens),
|
178 |
+
"database_matches": database_matches,
|
179 |
+
"claude_matches": claude_matches,
|
180 |
+
"analyzed_tokens": tokens
|
181 |
+
}
|
182 |
+
|
183 |
+
except Exception as e:
|
184 |
+
logger.error(f"Error analyzing image: {str(e)}", exc_info=True)
|
185 |
+
return {
|
186 |
+
"detected_allergens": [],
|
187 |
+
"database_matches": {},
|
188 |
+
"claude_matches": {},
|
189 |
+
"analyzed_tokens": [],
|
190 |
+
"error": str(e)
|
191 |
+
}
|
192 |
+
|
193 |
def check_claude_allergens(self, token, allergy, api_key):
|
194 |
"""الاستعلام من Claude API عن الحساسيات"""
|
195 |
+
try:
|
196 |
+
# تحضير الصورة للطلب
|
197 |
+
img_byte_arr = io.BytesIO()
|
198 |
+
image.save(img_byte_arr, format='JPEG')
|
199 |
+
img_byte_arr = img_byte_arr.getvalue()
|
200 |
+
|
201 |
+
prompt = f"""
|
202 |
+
Analyze if this product contains or is derived from {allergy}.
|
203 |
+
Focus on the ingredient: {token}.
|
204 |
Respond ONLY with 'Yes' or 'No'. No explanations.
|
205 |
"""
|
206 |
+
|
207 |
+
url = "https://api.anthropic.com/v1/messages"
|
208 |
+
headers = {
|
209 |
+
"x-api-key": api_key,
|
210 |
+
"content-type": "application/json",
|
211 |
+
"anthropic-version": "2023-06-01"
|
212 |
+
}
|
213 |
+
|
214 |
+
data = {
|
215 |
+
"model": "claude-3-opus-20240229",
|
216 |
+
"messages": [
|
217 |
+
{
|
218 |
+
"role": "user",
|
219 |
+
"content": [
|
220 |
+
{
|
221 |
+
"type": "image",
|
222 |
+
"source": {
|
223 |
+
"type": "base64",
|
224 |
+
"media_type": "image/jpeg",
|
225 |
+
"data": base64.b64encode(img_byte_arr).decode('utf-8')
|
226 |
+
}
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"type": "text",
|
230 |
+
"text": prompt
|
231 |
+
}
|
232 |
+
]
|
233 |
+
}
|
234 |
+
],
|
235 |
+
"max_tokens": 10
|
236 |
+
}
|
237 |
+
|
238 |
response = requests.post(url, json=data, headers=headers)
|
239 |
json_response = response.json()
|
240 |
|
|
|
244 |
|
245 |
except Exception as e:
|
246 |
logger.error(f"Error querying Claude API: {str(e)}")
|
247 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|