error handling nltk
llm/inference.py  (CHANGED: +64, -41)
@@ -19,47 +19,70 @@ def extract_product_info(text):
     # Initialize result dictionary
     result = {"brand": None, "model": None, "description": None, "price": None}
 
-    [previous extraction logic (old lines 22-62) removed; its content is not visible in this capture]
+    try:
+
+        # Improved regex to prioritize currency-related patterns
+        price_match = re.search(
+            r'(\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?|(?:\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s?(?:USD|usd|dollars|DOLLARS)))',
+            text
+        )
+        if price_match:
+            price = price_match.group().strip()
+            # Clean up the price format
+            if "$" in price or "USD" in price or "usd" in price:
+                result["price"] = re.sub(r'[^\d.]', '', price)  # Keep only digits and decimals
+            else:
+                result["price"] = price
+            # Remove the price part from the text to prevent it from being included in the brand/model extraction
+            text = text.replace(price_match.group(), "").strip()
+
+        try:
+            tokens = nltk.word_tokenize(text)
+            print(f"Tokens: {tokens}")
+        except Exception as e:
+            print(f"Error during tokenization: {e}")
+            # Fall back to a simple split if tokenization fails
+            tokens = text.split()
+            print(f"Fallback tokens: {tokens}")
+
+        try:
+            pos_tags = nltk.pos_tag(tokens)
+            print(f"POS Tags: {pos_tags}")
+        except Exception as e:
+            print(f"Error during POS tagging: {e}")
+            # If POS tagging fails, create dummy tags
+            pos_tags = [(word, "NN") for word in tokens]
+            print(f"Fallback POS Tags: {pos_tags}")
+
+        # Extract brand, model, and description
+        brand_parts = []
+        model_parts = []
+        description_parts = []
+
+        for word, tag in pos_tags:
+            if tag == 'NNP' or re.match(r'[A-Za-z0-9-]+', word):
+                if len(brand_parts) == 0:  # Assume the first proper noun is the brand
+                    brand_parts.append(word)
+                else:  # Model number tends to follow the brand
+                    model_parts.append(word)
+            else:
+                description_parts.append(word)
+
+        # Assign values to the result dictionary
+        if brand_parts:
+            result["brand"] = " ".join(brand_parts)
+        if model_parts:
+            result["model"] = " ".join(model_parts)
+        if description_parts:
+            result["description"] = " ".join(description_parts)
+
+        print(f"Extract function returned: {result}")
+
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        # Return a fallback result in case of a critical error
+        result["description"] = text
+        print(f"Fallback result: {result}")
 
     return result
 
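The new except branches are most often triggered by missing NLTK data rather than bad input. Below is a minimal sketch of exercising the updated function locally; the resource names "punkt" and "averaged_perceptron_tagger" are the usual data packages behind nltk.word_tokenize and nltk.pos_tag (newer NLTK releases may additionally ask for "punkt_tab"), and the import path and sample product string are assumptions for illustration, not part of the commit.

    import nltk
    from llm.inference import extract_product_info  # module path taken from the diff header; adjust to the Space's layout

    # Fetch the data nltk.word_tokenize and nltk.pos_tag rely on. If these downloads
    # are skipped, the except branches above fall back to text.split() and dummy "NN"
    # tags instead of crashing. (Resource names assumed from standard NLTK usage.)
    nltk.download("punkt")
    nltk.download("averaged_perceptron_tagger")

    # Hypothetical input string for illustration only.
    info = extract_product_info("Sony WH-1000XM4 wireless noise-cancelling headphones $348.00")
    print(info)
    # The price regex matches "$348.00" and the cleanup strips the "$", leaving "348.00";
    # brand, model, and description come from the POS-tag heuristic in the diff above.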