root-sajjan committed
Commit 0a12fa8 · verified · 1 Parent(s): 0e5c206

updated nltk
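
The change adds nltk.download('punkt') and nltk.download('averaged_perceptron_tagger_eng') alongside the existing punkt_tab and averaged_perceptron_tagger downloads: newer NLTK releases (3.9+) look up the tokenizer and tagger data under the newer resource names, while older releases use the older ones, so fetching all four keeps word_tokenize and pos_tag working either way. Below is a minimal sketch of an equivalent guarded download; it is not part of this commit, and ensure_nltk_resource plus the resource paths are assumptions built on the standard nltk.data.find / nltk.download API.

import nltk

# Download an NLTK resource only if it is not already available locally.
def ensure_nltk_resource(path, name):
    try:
        nltk.data.find(path)   # raises LookupError when the data is missing
    except LookupError:
        nltk.download(name)

ensure_nltk_resource("tokenizers/punkt", "punkt")
ensure_nltk_resource("tokenizers/punkt_tab", "punkt_tab")
ensure_nltk_resource("taggers/averaged_perceptron_tagger", "averaged_perceptron_tagger")
ensure_nltk_resource("taggers/averaged_perceptron_tagger_eng", "averaged_perceptron_tagger_eng")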

Files changed (1):
  llm/inference.py  +120 -118
llm/inference.py CHANGED
@@ -1,119 +1,121 @@
-from huggingface_hub import InferenceClient
-import nltk
-import re
-import requests
-import os
-
-api_key = os.getenv("HF_KEY")
-
-nltk.download('punkt_tab')
-nltk.download('averaged_perceptron_tagger')
-
-
-client = InferenceClient(api_key=api_key)
-
-
-def extract_product_info(text):
-    # Initialize result dictionary
-    result = {"brand": None, "model": None, "description": None, "price": None}
-
-    # Improved regex to prioritize currency-related patterns
-    price_match = re.search(
-        r'(\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?|(?:\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s?(?:USD|usd|dollars|DOLLARS)))',
-        text
-    )
-    if price_match:
-        price = price_match.group().strip()
-        # Clean up the price format
-        if "$" in price or "USD" in price or "usd" in price:
-            result["price"] = re.sub(r'[^\d.]', '', price)  # Keep only digits and decimals
-        else:
-            result["price"] = price
-        # Remove the price part from the text to prevent it from being included in the brand/model extraction
-        text = text.replace(price_match.group(), "").strip()
-
-    # Tokenize the remaining text and tag parts of speech
-    tokens = nltk.word_tokenize(text)
-    pos_tags = nltk.pos_tag(tokens)
-
-    # Extract brand and model (Proper Nouns + Alphanumeric patterns)
-    brand_parts = []
-    model_parts = []
-    description_parts = []
-
-    for word, tag in pos_tags:
-        if tag == 'NNP' or re.match(r'[A-Za-z0-9-]+', word):
-            if len(brand_parts) == 0:  # Assume the first proper noun is the brand
-                brand_parts.append(word)
-            else:  # Model number tends to follow the brand
-                model_parts.append(word)
-        else:
-            description_parts.append(word)
-
-    # Assign brand and model to result dictionary
-    if brand_parts:
-        result["brand"] = " ".join(brand_parts)
-    if model_parts:
-        result["model"] = " ".join(model_parts)
-
-    # Combine the remaining parts as description
-    result["description"] = " ".join(description_parts)
-
-    return result
-
-
-
-def extract_info(text):
-    API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-large"
-    headers = {"Authorization": f"Bearer {api_key}"}
-    payload = {"inputs": f"From the given text, extract brand name, model number, description about it, and its average price in today's market. Give me back a python dictionary with keys as brand_name, model_number, desc, price. The text is {text}.",}
-    response = requests.post(API_URL, headers=headers, json=payload)
-    print('GOOGLEE LLM OUTPUTTTTTTT\n\n',response )
-    output = response.json()
-    print(output)
-
-
-
-def get_name(url, object):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": f"Is this a {object}?. Can you guess what it is and give me the closest brand it resembles to? or a model number? And give me its average price in today's market in USD. In output, give me its normal name, model name, model number and price. separated by commas. No description is needed."
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": url
-                    }
-                }
-            ]
-        }
-    ]
-
-    completion = client.chat.completions.create(
-        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-        messages=messages,
-        max_tokens=500
-    )
-
-
-    print(f'\n\nNow output of LLM:\n')
-    llm_result = completion.choices[0].message['content']
-    print(llm_result)
-    print(f'\n\nThat is the output')
-
-    result = extract_product_info(llm_result)
-    print(f'\n\nResult brand and price:{result}')
-
-    # result2 = extract_info(llm_result)
-    # print(f'\n\nFrom Google llm:{result2}')
-
-    return result
-
-# url = "https://i.ibb.co/mNYvqDL/crop_39.jpg"
-# object="fridge"
-
+from huggingface_hub import InferenceClient
+import nltk
+import re
+import requests
+import os
+
+api_key = os.getenv("HF_KEY")
+
+nltk.download('punkt')
+nltk.download('punkt_tab')
+nltk.download('averaged_perceptron_tagger')
+nltk.download('averaged_perceptron_tagger_eng')
+
+
+client = InferenceClient(api_key=api_key)
+
+
+def extract_product_info(text):
+    # Initialize result dictionary
+    result = {"brand": None, "model": None, "description": None, "price": None}
+
+    # Improved regex to prioritize currency-related patterns
+    price_match = re.search(
+        r'(\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?|(?:\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s?(?:USD|usd|dollars|DOLLARS)))',
+        text
+    )
+    if price_match:
+        price = price_match.group().strip()
+        # Clean up the price format
+        if "$" in price or "USD" in price or "usd" in price:
+            result["price"] = re.sub(r'[^\d.]', '', price)  # Keep only digits and decimals
+        else:
+            result["price"] = price
+        # Remove the price part from the text to prevent it from being included in the brand/model extraction
+        text = text.replace(price_match.group(), "").strip()
+
+    # Tokenize the remaining text and tag parts of speech
+    tokens = nltk.word_tokenize(text)
+    pos_tags = nltk.pos_tag(tokens)
+
+    # Extract brand and model (Proper Nouns + Alphanumeric patterns)
+    brand_parts = []
+    model_parts = []
+    description_parts = []
+
+    for word, tag in pos_tags:
+        if tag == 'NNP' or re.match(r'[A-Za-z0-9-]+', word):
+            if len(brand_parts) == 0:  # Assume the first proper noun is the brand
+                brand_parts.append(word)
+            else:  # Model number tends to follow the brand
+                model_parts.append(word)
+        else:
+            description_parts.append(word)
+
+    # Assign brand and model to result dictionary
+    if brand_parts:
+        result["brand"] = " ".join(brand_parts)
+    if model_parts:
+        result["model"] = " ".join(model_parts)
+
+    # Combine the remaining parts as description
+    result["description"] = " ".join(description_parts)
+
+    return result
+
+
+
+def extract_info(text):
+    API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-large"
+    headers = {"Authorization": f"Bearer {api_key}"}
+    payload = {"inputs": f"From the given text, extract brand name, model number, description about it, and its average price in today's market. Give me back a python dictionary with keys as brand_name, model_number, desc, price. The text is {text}.",}
+    response = requests.post(API_URL, headers=headers, json=payload)
+    print('GOOGLEE LLM OUTPUTTTTTTT\n\n',response )
+    output = response.json()
+    print(output)
+
+
+
+def get_name(url, object):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": f"Is this a {object}?. Can you guess what it is and give me the closest brand it resembles to? or a model number? And give me its average price in today's market in USD. In output, give me its normal name, model name, model number and price. separated by commas. No description is needed."
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": url
+                    }
+                }
+            ]
+        }
+    ]
+
+    completion = client.chat.completions.create(
+        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+        messages=messages,
+        max_tokens=500
+    )
+
+
+    print(f'\n\nNow output of LLM:\n')
+    llm_result = completion.choices[0].message['content']
+    print(llm_result)
+    print(f'\n\nThat is the output')
+
+    result = extract_product_info(llm_result)
+    print(f'\n\nResult brand and price:{result}')
+
+    # result2 = extract_info(llm_result)
+    # print(f'\n\nFrom Google llm:{result2}')
+
+    return result
+
+# url = "https://i.ibb.co/mNYvqDL/crop_39.jpg"
+# object="fridge"
+
 # get_name(url, object)
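
For reference, a short usage sketch of the two entry points, mirroring the commented-out example at the bottom of the file. This is illustrative only and not part of the commit; the import path, the sample string, and the noted output are assumptions.

# Usage sketch: assumes the module is importable as llm.inference and that
# HF_KEY is set in the environment (importing the module triggers the NLTK
# downloads and creates the InferenceClient).
from llm.inference import extract_product_info, get_name

# A hypothetical vision-LLM style reply, just to show the returned dictionary shape.
sample_reply = "Samsung, RF28R7351SG, French Door Refrigerator, $1,899.99"
print(extract_product_info(sample_reply))
# The price field is normalised to '1899.99' by the regex; how the remaining
# tokens split into brand/model/description depends on the POS tagger's output.

# With a valid key and a reachable image URL:
# print(get_name("https://i.ibb.co/mNYvqDL/crop_39.jpg", "fridge"))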