File size: 5,915 Bytes
a2335c5
 
 
8f4e927
a2335c5
753d9d8
a2335c5
664e897
a2335c5
 
 
 
 
a65ba38
a2335c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
753d9d8
a2335c5
34054e0
a2335c5
 
 
a65ba38
a2335c5
 
 
a65ba38
a2335c5
 
34054e0
a2335c5
 
34054e0
a2335c5
 
a65ba38
a2335c5
34054e0
 
a2335c5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import random
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import torch
import os

# Ensure sentencepiece is installed
try:
    import sentencepiece
except ImportError:
    raise ImportError("Please install the sentencepiece library using `pip install sentencepiece`.")

# Retrieve the Hugging Face token from secrets (replace 'HUGGINGFACE_TOKEN' with your secret key)
hf_token = os.getenv('HUGGINGFACE_TOKEN')

# Log in to Hugging Face
login(token=hf_token)

# List of user agents
_useragent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]

# Function to extract visible text from HTML content of a webpage
def extract_text_from_webpage(html):
    print("Extracting text from webpage...")
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style"]):
        script.extract()  # Remove scripts and styles
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    print(f"Extracted text length: {len(text)}")
    return text

# Function to perform a Google search and retrieve results
def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
    """Performs a Google search and returns the results."""
    print(f"Searching for term: {term}")
    escaped_term = requests.utils.quote(term)
    start = 0
    all_results = []
    max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit
    
    with requests.Session() as session:
        while start < num_results:
            print(f"Fetching search results starting from: {start}")
            try:
                # Choose a random user agent
                user_agent = random.choice(_useragent_list)
                headers = {
                    'User-Agent': user_agent
                }
                print(f"Using User-Agent: {headers['User-Agent']}")
                
                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": term,
                        "num": num_results - start,
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break
            
            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break
            for result in result_block:
                link = result.find("a", href=True)
                if link:
                    link = link["href"]
                    print(f"Found link: {link}")
                    try:
                        webpage = session.get(link, headers=headers, timeout=timeout)
                        webpage.raise_for_status()
                        visible_text = extract_text_from_webpage(webpage.text)
                        if len(visible_text) > max_chars_per_page:
                            visible_text = visible_text[:max_chars_per_page] + "..."
                        all_results.append({"link": link, "text": visible_text})
                    except requests.exceptions.RequestException as e:
                        print(f"Error fetching or processing {link}: {e}")
                        all_results.append({"link": link, "text": None})
                else:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})
            start += len(result_block)
    print(f"Total results fetched: {len(all_results)}")
    return all_results

# Load the Mixtral-8x7B-Instruct model and tokenizer
model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Check if a GPU is available and if not, fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check for GPU
model.to(device)  # Move model to the device

# Example usage
search_term = "How did Tesla perform in Q1 2024"
search_results = google_search(search_term, num_results=3)

# Combine text from search results to create a prompt
combined_text = "\n\n".join(result['text'] for result in search_results if result['text'])

# Tokenize the input text
inputs = tokenizer(combined_text, return_tensors="pt").to(device)  # Move inputs to the device

# Generate a response
outputs = model.generate(**inputs, max_length=150, temperature=0.7, top_p=0.9, top_k=50)

# Decode the generated tokens to a readable string
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the response
print(response)