root-sajjan committed on
Commit
9cdce5c
·
verified ·
1 Parent(s): 0a12fa8

error handling nltk

Browse files
Files changed (1) hide show
  1. llm/inference.py +64 -41
llm/inference.py CHANGED
@@ -19,47 +19,70 @@ def extract_product_info(text):
19
  # Initialize result dictionary
20
  result = {"brand": None, "model": None, "description": None, "price": None}
21
 
22
- # Improved regex to prioritize currency-related patterns
23
- price_match = re.search(
24
- r'(\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?|(?:\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s?(?:USD|usd|dollars|DOLLARS)))',
25
- text
26
- )
27
- if price_match:
28
- price = price_match.group().strip()
29
- # Clean up the price format
30
- if "$" in price or "USD" in price or "usd" in price:
31
- result["price"] = re.sub(r'[^\d.]', '', price) # Keep only digits and decimals
32
- else:
33
- result["price"] = price
34
- # Remove the price part from the text to prevent it from being included in the brand/model extraction
35
- text = text.replace(price_match.group(), "").strip()
36
-
37
- # Tokenize the remaining text and tag parts of speech
38
- tokens = nltk.word_tokenize(text)
39
- pos_tags = nltk.pos_tag(tokens)
40
-
41
- # Extract brand and model (Proper Nouns + Alphanumeric patterns)
42
- brand_parts = []
43
- model_parts = []
44
- description_parts = []
45
-
46
- for word, tag in pos_tags:
47
- if tag == 'NNP' or re.match(r'[A-Za-z0-9-]+', word):
48
- if len(brand_parts) == 0: # Assume the first proper noun is the brand
49
- brand_parts.append(word)
50
- else: # Model number tends to follow the brand
51
- model_parts.append(word)
52
- else:
53
- description_parts.append(word)
54
-
55
- # Assign brand and model to result dictionary
56
- if brand_parts:
57
- result["brand"] = " ".join(brand_parts)
58
- if model_parts:
59
- result["model"] = " ".join(model_parts)
60
-
61
- # Combine the remaining parts as description
62
- result["description"] = " ".join(description_parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  return result
65
 
 
19
  # Initialize result dictionary
20
  result = {"brand": None, "model": None, "description": None, "price": None}
21
 
22
+ try:
23
+
24
+ # Improved regex to prioritize currency-related patterns
25
+ price_match = re.search(
26
+ r'(\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?|(?:\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s?(?:USD|usd|dollars|DOLLARS)))',
27
+ text
28
+ )
29
+ if price_match:
30
+ price = price_match.group().strip()
31
+ # Clean up the price format
32
+ if "$" in price or "USD" in price or "usd" in price:
33
+ result["price"] = re.sub(r'[^\d.]', '', price) # Keep only digits and decimals
34
+ else:
35
+ result["price"] = price
36
+ # Remove the price part from the text to prevent it from being included in the brand/model extraction
37
+ text = text.replace(price_match.group(), "").strip()
38
+
39
+ try:
40
+ tokens = nltk.word_tokenize(text)
41
+ print(f"Tokens: {tokens}")
42
+ except Exception as e:
43
+ print(f"Error during tokenization: {e}")
44
+ # Fall back to a simple split if tokenization fails
45
+ tokens = text.split()
46
+ print(f"Fallback tokens: {tokens}")
47
+
48
+ try:
49
+ pos_tags = nltk.pos_tag(tokens)
50
+ print(f"POS Tags: {pos_tags}")
51
+ except Exception as e:
52
+ print(f"Error during POS tagging: {e}")
53
+ # If POS tagging fails, create dummy tags
54
+ pos_tags = [(word, "NN") for word in tokens]
55
+ print(f"Fallback POS Tags: {pos_tags}")
56
+
57
+ # Extract brand, model, and description
58
+ brand_parts = []
59
+ model_parts = []
60
+ description_parts = []
61
+
62
+ for word, tag in pos_tags:
63
+ if tag == 'NNP' or re.match(r'[A-Za-z0-9-]+', word):
64
+ if len(brand_parts) == 0: # Assume the first proper noun is the brand
65
+ brand_parts.append(word)
66
+ else: # Model number tends to follow the brand
67
+ model_parts.append(word)
68
+ else:
69
+ description_parts.append(word)
70
+
71
+ # Assign values to the result dictionary
72
+ if brand_parts:
73
+ result["brand"] = " ".join(brand_parts)
74
+ if model_parts:
75
+ result["model"] = " ".join(model_parts)
76
+ if description_parts:
77
+ result["description"] = " ".join(description_parts)
78
+
79
+ print(f"Extract function returned: {result}")
80
+
81
+ except Exception as e:
82
+ print(f"Unexpected error: {e}")
83
+ # Return a fallback result in case of a critical error
84
+ result["description"] = text
85
+ print(f"Fallback result: {result}")
86
 
87
  return result
88