Spaces:

root-sajjan
/

whatisit

Sleeping

App Files Files Community

root-sajjan committed on Dec 2, 2024

Commit

9cdce5c

verified ·

1 Parent(s): 0a12fa8

error handling nltk

Browse files

Files changed (1) hide show

llm/inference.py +64 -41

llm/inference.py CHANGED Viewed

@@ -19,47 +19,70 @@ def extract_product_info(text):
     # Initialize result dictionary
     result = {"brand": None, "model": None, "description": None, "price": None}
-    # Improved regex to prioritize currency-related patterns
-    price_match = re.search(
-        r'(\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?|(?:\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s?(?:USD|usd|dollars|DOLLARS)))',
-        text
-    )
-    if price_match:
-        price = price_match.group().strip()
-        # Clean up the price format
-        if "$" in price or "USD" in price or "usd" in price:
-            result["price"] = re.sub(r'[^\d.]', '', price)  # Keep only digits and decimals
-        else:
-            result["price"] = price
-        # Remove the price part from the text to prevent it from being included in the brand/model extraction
-        text = text.replace(price_match.group(), "").strip()
-    # Tokenize the remaining text and tag parts of speech
-    tokens = nltk.word_tokenize(text)
-    pos_tags = nltk.pos_tag(tokens)
-    # Extract brand and model (Proper Nouns + Alphanumeric patterns)
-    brand_parts = []
-    model_parts = []
-    description_parts = []
-    for word, tag in pos_tags:
-        if tag == 'NNP' or re.match(r'[A-Za-z0-9-]+', word):
-            if len(brand_parts) == 0:  # Assume the first proper noun is the brand
-                brand_parts.append(word)
-            else:  # Model number tends to follow the brand
-                model_parts.append(word)
-        else:
-            description_parts.append(word)
-    # Assign brand and model to result dictionary
-    if brand_parts:
-        result["brand"] = " ".join(brand_parts)
-    if model_parts:
-        result["model"] = " ".join(model_parts)
-    # Combine the remaining parts as description
-    result["description"] = " ".join(description_parts)
     return result

     # Initialize result dictionary
     result = {"brand": None, "model": None, "description": None, "price": None}
+    try:
+        # Improved regex to prioritize currency-related patterns
+        price_match = re.search(
+            r'(\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?|(?:\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s?(?:USD|usd|dollars|DOLLARS)))',
+            text
+        )
+        if price_match:
+            price = price_match.group().strip()
+            # Clean up the price format
+            if "$" in price or "USD" in price or "usd" in price:
+                result["price"] = re.sub(r'[^\d.]', '', price)  # Keep only digits and decimals
+            else:
+                result["price"] = price
+            # Remove the price part from the text to prevent it from being included in the brand/model extraction
+            text = text.replace(price_match.group(), "").strip()
+        try:
+            tokens = nltk.word_tokenize(text)
+            print(f"Tokens: {tokens}")
+        except Exception as e:
+            print(f"Error during tokenization: {e}")
+            # Fall back to a simple split if tokenization fails
+            tokens = text.split()
+            print(f"Fallback tokens: {tokens}")
+        try:
+            pos_tags = nltk.pos_tag(tokens)
+            print(f"POS Tags: {pos_tags}")
+        except Exception as e:
+            print(f"Error during POS tagging: {e}")
+            # If POS tagging fails, create dummy tags
+            pos_tags = [(word, "NN") for word in tokens]
+            print(f"Fallback POS Tags: {pos_tags}")
+        # Extract brand, model, and description
+        brand_parts = []
+        model_parts = []
+        description_parts = []
+        for word, tag in pos_tags:
+            if tag == 'NNP' or re.match(r'[A-Za-z0-9-]+', word):
+                if len(brand_parts) == 0:  # Assume the first proper noun is the brand
+                    brand_parts.append(word)
+                else:  # Model number tends to follow the brand
+                    model_parts.append(word)
+            else:
+                description_parts.append(word)
+        # Assign values to the result dictionary
+        if brand_parts:
+            result["brand"] = " ".join(brand_parts)
+        if model_parts:
+            result["model"] = " ".join(model_parts)
+        if description_parts:
+            result["description"] = " ".join(description_parts)
+        print(f"Extract function returned: {result}")
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        # Return a fallback result in case of a critical error
+        result["description"] = text
+        print(f"Fallback result: {result}")
     return result