File size: 4,039 Bytes
d1e081c
9aa4e56
 
 
 
d1e081c
 
 
 
 
9aa4e56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1e081c
 
 
 
 
 
 
 
9aa4e56
 
 
 
 
 
 
d1e081c
9aa4e56
 
 
 
 
 
 
aa6b072
 
d1e081c
 
 
 
 
9aa4e56
d1e081c
 
 
9aa4e56
 
d1e081c
 
 
9aa4e56
 
 
 
 
 
d1e081c
9aa4e56
 
 
 
 
 
 
 
 
 
d1e081c
9aa4e56
 
d1e081c
 
 
 
9aa4e56
d1e081c
9aa4e56
d1e081c
9aa4e56
d1e081c
 
9aa4e56
 
d1e081c
9aa4e56
d1e081c
 
 
 
9aa4e56
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import gradio as gr
import requests
from duckduckgo_search import DDGS
import itertools
import time
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.output_parsers import JsonOutputParser
from langdetect import detect

# Fetch proxy list from GitHub
def get_proxies():
    """Download a SOCKS4 proxy list and return it as a list of host:port strings.

    Returns:
        list[str]: one entry per non-empty line of the fetched list.

    Raises:
        requests.HTTPError: if the endpoint responds with an error status.
        requests.RequestException: on network failure or timeout.
    """
    url = "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"
    # Bound the request: this runs at module import time, so an unbounded
    # hang here would block the whole app from starting.
    response = requests.get(url, timeout=10)
    # Fail loudly rather than splitting an HTML error page into bogus "proxies".
    response.raise_for_status()
    # Drop blank lines that would otherwise become unusable proxy entries.
    return [line for line in response.text.splitlines() if line.strip()]

# Proxy cycle for rotation
# NOTE(review): get_proxies() performs a network request at import time — if the
# GitHub endpoint is unreachable, importing this module fails; consider lazy init.
proxy_list = get_proxies()
# Endless round-robin iterator over the fetched proxies, consumed by the
# retry logic in search_with_retries().
proxy_cycle = itertools.cycle(proxy_list)

# Proxy-enabled DDGS
class ProxiedDDGS(DDGS):
    """DuckDuckGo search client that routes its HTTP traffic through one proxy."""

    def __init__(self, proxy):
        # Keep the base class's default setup, then remember which proxy to use.
        super().__init__()
        self.proxy = proxy

    def _get(self, url, headers=None):
        # Send both plain-HTTP and HTTPS traffic through the configured proxy.
        proxy_map = {"http": self.proxy, "https": self.proxy}
        reply = requests.get(url, headers=headers, proxies=proxy_map)
        reply.raise_for_status()
        return reply

# Search function with retries
def search_with_retries(query, max_results=3, max_retries=5, backoff_factor=1):
    """Run a DuckDuckGo text search, rotating proxies until an attempt succeeds.

    Returns:
        tuple: (results, proxy) — the search results and the proxy that worked.

    Raises:
        RuntimeError: when every one of the max_retries attempts has failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            proxy = next(proxy_cycle)
            results = ProxiedDDGS(proxy).text(query, max_results=max_results)
            return results, proxy
        except Exception:
            # Dead proxy, network hiccup, parse error — back off (linearly
            # growing delay) and rotate to the next proxy.
            time.sleep(backoff_factor * attempt)
    raise RuntimeError(f"All retries failed for query: {query}")

# Initialize the LLM
# Served through the Hugging Face endpoint API — presumably requires valid HF
# credentials in the environment; verify deployment setup.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=128,  # answers are short JSON objects; cap output length
    temperature=0.7,
    # NOTE(review): do_sample=False requests greedy decoding, under which
    # temperature=0.7 has no effect — confirm which decoding mode is intended.
    do_sample=False,
)

# Prompt template for feature extraction.
# Doubled braces {{ }} render as literal JSON braces when formatted by
# LangChain's PromptTemplate; {TEXT} and {SEARCH_RESULTS} are the two
# input variables filled in by extract_features().
template_extract_features = '''
You are a product feature extractor bot. Your task is to determine features like Brand, Model, Type, RAM, Storage, etc., from the given product description and web search results.

Return features in JSON format with keys like Brand, Model, Type, RAM, Storage, and others.
Your response MUST only include a valid JSON object and nothing else.

Example:
{{
    "Brand": "Apple",
    "Model": "iPhone 14",
    "Type": "Smartphone",
    "RAM": "4GB",
    "Storage": "128GB"
}}
Answer with JSON for the following:
Given product description and web search results: {TEXT} {SEARCH_RESULTS}
'''

# Parses the model's raw text reply into a Python object (the prompt demands JSON).
json_output_parser = JsonOutputParser()

# Detect language, enrich with a web search, and extract product features via the LLM.
def extract_features(description):
    """Return (language, features, elapsed_seconds) for a product description.

    The description is language-detected, augmented with DuckDuckGo search
    snippets, then sent to the LLM, whose reply is parsed as JSON.

    Args:
        description: free-text product description.

    Returns:
        tuple: (ISO language code, parsed feature dict, wall-clock seconds).
    """
    start = time.time()

    # Language detection fails on empty/ambiguous text; fall back to English.
    # (Narrowed from a bare except, which would also swallow KeyboardInterrupt.)
    try:
        lang = detect(description)
    except Exception:
        lang = "en"

    # Web search is best-effort: on total failure the LLM still sees the
    # description, just without extra context.
    try:
        search_results, _ = search_with_retries(description, max_results=3)
        # duckduckgo_search result dicts expose the snippet under 'body' in
        # current versions ('snippet' in some older ones) — accept either.
        # TODO(review): confirm against the installed duckduckgo_search version.
        search_text = "\n".join(
            res.get('body') or res.get('snippet', '')
            for res in (search_results or [])
        )
    except RuntimeError:
        search_text = "No search results available."

    # Fill the prompt template with the description and whatever context we got.
    prompt_extract = PromptTemplate(
        template=template_extract_features,
        input_variables=["TEXT", "SEARCH_RESULTS"]
    )
    formatted_prompt = prompt_extract.format(TEXT=description, SEARCH_RESULTS=search_text)

    # Query the LLM and parse the JSON object it was instructed to return.
    response = llm.invoke(formatted_prompt)
    parsed_output = json_output_parser.parse(response)
    end = time.time()

    return lang, parsed_output, end - start

# Create the Gradio interface
def create_gradio_interface():
    """Build and launch the Gradio UI for product feature extraction."""
    with gr.Blocks() as iface:
        # One input box plus three result boxes and a trigger button.
        description_box = gr.Textbox(label="Item Description")
        language_box = gr.Textbox(label="Detected Language")
        features_box = gr.Textbox(label="Extracted Features (JSON)")
        duration_box = gr.Textbox(label="Time Taken (seconds)")
        extract_button = gr.Button("Extract Features")

        def handle_click(description):
            # Delegate to the extraction pipeline; format the timing for display.
            detected, extracted, seconds = extract_features(description)
            return detected, extracted, f"{seconds:.2f} seconds"

        extract_button.click(
            fn=handle_click,
            inputs=description_box,
            outputs=[language_box, features_box, duration_box],
        )

    iface.launch()

# Launch the app only when executed as a script (not on import).
if __name__ == "__main__":
    create_gradio_interface()