SearchGPT

Running

File size: 5,915 Bytes

import random
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import torch
import os

# Ensure sentencepiece is installed
try:
    import sentencepiece
except ImportError:
    raise ImportError("Please install the sentencepiece library using `pip install sentencepiece`.")

# Retrieve the Hugging Face token from secrets (replace 'HUGGINGFACE_TOKEN' with your secret key)
hf_token = os.getenv('HUGGINGFACE_TOKEN')

# Log in to Hugging Face
login(token=hf_token)

# List of user agents
_useragent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]

# Function to extract visible text from HTML content of a webpage
def extract_text_from_webpage(html):
    print("Extracting text from webpage...")
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style"]):
        script.extract()  # Remove scripts and styles
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    print(f"Extracted text length: {len(text)}")
    return text

# Function to perform a Google search and retrieve results
def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
    """Performs a Google search and returns the results."""
    print(f"Searching for term: {term}")
    escaped_term = requests.utils.quote(term)
    start = 0
    all_results = []
    max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit
    
    with requests.Session() as session:
        while start < num_results:
            print(f"Fetching search results starting from: {start}")
            try:
                # Choose a random user agent
                user_agent = random.choice(_useragent_list)
                headers = {
                    'User-Agent': user_agent
                }
                print(f"Using User-Agent: {headers['User-Agent']}")
                
                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": term,
                        "num": num_results - start,
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break
            
            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break
            for result in result_block:
                link = result.find("a", href=True)
                if link:
                    link = link["href"]
                    print(f"Found link: {link}")
                    try:
                        webpage = session.get(link, headers=headers, timeout=timeout)
                        webpage.raise_for_status()
                        visible_text = extract_text_from_webpage(webpage.text)
                        if len(visible_text) > max_chars_per_page:
                            visible_text = visible_text[:max_chars_per_page] + "..."
                        all_results.append({"link": link, "text": visible_text})
                    except requests.exceptions.RequestException as e:
                        print(f"Error fetching or processing {link}: {e}")
                        all_results.append({"link": link, "text": None})
                else:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})
            start += len(result_block)
    print(f"Total results fetched: {len(all_results)}")
    return all_results

# Load the Mixtral-8x7B-Instruct model and tokenizer
model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Check if a GPU is available and if not, fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check for GPU
model.to(device)  # Move model to the device

# Example usage
search_term = "How did Tesla perform in Q1 2024"
search_results = google_search(search_term, num_results=3)

# Combine text from search results to create a prompt
combined_text = "\n\n".join(result['text'] for result in search_results if result['text'])

# Tokenize the input text
inputs = tokenizer(combined_text, return_tensors="pt").to(device)  # Move inputs to the device

# Generate a response
outputs = model.generate(**inputs, max_length=150, temperature=0.7, top_p=0.9, top_k=50)

# Decode the generated tokens to a readable string
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the response
print(response)