Spaces:
Running
Running
File size: 5,915 Bytes
a2335c5 8f4e927 a2335c5 753d9d8 a2335c5 664e897 a2335c5 a65ba38 a2335c5 753d9d8 a2335c5 34054e0 a2335c5 a65ba38 a2335c5 a65ba38 a2335c5 34054e0 a2335c5 34054e0 a2335c5 a65ba38 a2335c5 34054e0 a2335c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import random
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import torch
import os
# Ensure sentencepiece is installed. The module is imported only to verify that
# it is available (presumably it is needed by the tokenizer loaded further
# down — TODO confirm).
try:
    import sentencepiece
except ImportError as err:
    # Chain the original failure so the underlying import problem stays visible
    # in the traceback instead of being reported as an unrelated second error.
    raise ImportError("Please install the sentencepiece library using `pip install sentencepiece`.") from err
# Retrieve the Hugging Face token from the environment. 'HUGGINGFACE_TOKEN' is
# the NAME of the environment variable / secret that holds the token; never put
# the token value itself in this file.
hf_token = os.getenv('HUGGINGFACE_TOKEN')
if hf_token is None:
    # Make the missing-secret case diagnosable: without a token, login() tries
    # to obtain one interactively, which is confusing in a non-interactive run.
    print("Warning: HUGGINGFACE_TOKEN is not set; login() will try to obtain a token interactively.")
# Log in to Hugging Face
login(token=hf_token)
# User-Agent strings that HTTP requests pick from at random.
# Every entry has the shape
#   "Mozilla/5.0 (<platform>) AppleWebKit/537.36 (KHTML, like Gecko) <browser tokens>"
# and the list is ordered browser by browser, Windows variant before macOS.
_ua_platforms = (
    "Windows NT 10.0; Win64; x64",
    "Macintosh; Intel Mac OS X 10_15_7",
)
_ua_browser_tokens = (
    "Chrome/91.0.4472.124 Safari/537.36",
    "Edge/91.0.864.59 Safari/537.36",
    "Safari/537.36",
)
_useragent_list = [
    f"Mozilla/5.0 ({platform}) AppleWebKit/537.36 (KHTML, like Gecko) {tokens}"
    for tokens in _ua_browser_tokens
    for platform in _ua_platforms
]
# Function to extract visible text from HTML content of a webpage
def extract_text_from_webpage(html):
    """Return the visible text of an HTML document, one fragment per line.

    <script> and <style> elements are removed before the text is read. The
    remaining text is split into lines, each line is split on single spaces,
    and the non-empty, whitespace-stripped fragments are joined with newlines.

    Args:
        html: HTML markup to parse.

    Returns:
        The newline-joined fragments as a single string.
    """
    print("Extracting text from webpage...")
    document = BeautifulSoup(html, 'html.parser')
    # Scripts and styles carry no visible text; detach them from the tree.
    for hidden_tag in document(["script", "style"]):
        hidden_tag.extract()
    raw_text = document.get_text()
    fragments = []
    for raw_line in raw_text.splitlines():
        # NOTE(review): splitting on a single space places every word on its
        # own output line; confirm this is intended rather than a wider
        # separator lost in transcription.
        for piece in raw_line.strip().split(" "):
            piece = piece.strip()
            if piece:
                fragments.append(piece)
    text = '\n'.join(fragments)
    print(f"Extracted text length: {len(text)}")
    return text
def _fetch_page_text(session, link, headers, timeout, max_chars):
    """Download *link* and return its visible text, cut to *max_chars* characters.

    Text longer than *max_chars* is truncated and suffixed with "...".
    Raises requests.exceptions.RequestException on network or HTTP errors.
    """
    webpage = session.get(link, headers=headers, timeout=timeout)
    webpage.raise_for_status()
    visible_text = extract_text_from_webpage(webpage.text)
    if len(visible_text) > max_chars:
        visible_text = visible_text[:max_chars] + "..."
    return visible_text


# Function to perform a Google search and retrieve results
def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
    """Performs a Google search and returns the visible text of the result pages.

    Result pages are requested from Google until ``num_results`` entries have
    been collected, a request fails, or a page contains no result blocks. The
    first link of every result block is downloaded and reduced to its visible
    text.

    Args:
        term: Search query; sent as the ``q`` parameter (requests URL-encodes it).
        num_results: Maximum number of entries to return. Values <= 0 yield an
            empty list without any network access.
        lang: Sent as the ``hl`` parameter.
        timeout: Timeout in seconds applied to every HTTP request.
        safe: Sent as the ``safe`` parameter.
        ssl_verify: Passed as ``verify`` to the Google request only; ``None``
            leaves requests' default verification behaviour in effect.

    Returns:
        A list of at most ``num_results`` dicts with the keys ``"link"`` and
        ``"text"``. ``"text"`` is ``None`` when the page could not be fetched;
        both are ``None`` when a result block contained no link.
    """
    print(f"Searching for term: {term}")
    all_results = []
    if num_results <= 0:
        # Nothing was requested: skip opening a session altogether.
        print(f"Total results fetched: {len(all_results)}")
        return all_results
    max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit
    start = 0
    with requests.Session() as session:
        while start < num_results:
            print(f"Fetching search results starting from: {start}")
            # Choose a random user agent for this results page and its links
            headers = {'User-Agent': random.choice(_useragent_list)}
            print(f"Using User-Agent: {headers['User-Agent']}")
            try:
                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": term,
                        "num": num_results - start,
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break
            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break
            for result in result_block:
                # A page may hold more blocks than were asked for; stop at the
                # limit instead of downloading and returning surplus pages.
                if len(all_results) >= num_results:
                    break
                anchor = result.find("a", href=True)
                if not anchor:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})
                    continue
                link = anchor["href"]
                print(f"Found link: {link}")
                try:
                    visible_text = _fetch_page_text(
                        session, link, headers, timeout, max_chars_per_page
                    )
                except requests.exceptions.RequestException as e:
                    # Best effort: keep the link and record that its text is missing.
                    print(f"Error fetching or processing {link}: {e}")
                    visible_text = None
                all_results.append({"link": link, "text": visible_text})
            # Advance the Google offset by the number of blocks this page held.
            start += len(result_block)
    print(f"Total results fetched: {len(all_results)}")
    return all_results
# Load the Mistral-7B-Instruct model and tokenizer
model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Use the GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to the device
# Example usage
search_term = "How did Tesla perform in Q1 2024"
search_results = google_search(search_term, num_results=3)
# Combine text from search results to create a prompt; entries whose page could
# not be fetched have text None and are skipped.
combined_text = "\n\n".join(result['text'] for result in search_results if result['text'])
# NOTE(review): the prompt contains only the scraped page text; search_term is
# never shown to the model — confirm whether the question should be included.
# Tokenize the input text
inputs = tokenizer(combined_text, return_tensors="pt").to(device)  # Move inputs to the device
# Generate a response.
# max_new_tokens limits only the generated continuation; max_length also counts
# the prompt tokens, so a long scraped prompt would leave no room to generate.
# do_sample=True is needed for temperature/top_p/top_k to take effect.
outputs = model.generate(
    **inputs,
    max_new_tokens=150,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
)
# Decode the generated tokens to a readable string
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Print the response
print(response)
|