Shreyas094 committed on
Commit
34054e0
·
verified ·
1 Parent(s): 67f5e62

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -0
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from transformers import AutoTokenizer, AutoModelForCausalLM
5
+
6
# Pool of browser User-Agent strings. google_search() picks one at random for
# every search request it sends.
_useragent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]
15
+
16
# Function to extract visible text from HTML content of a webpage
def extract_text_from_webpage(html):
    """Return the visible text of an HTML document.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Every line is stripped, split into phrases wherever two
    consecutive spaces occur, and empty pieces are dropped; the remaining
    phrases are joined with newlines.

    Args:
        html: Markup to parse with BeautifulSoup's ``html.parser``.

    Returns:
        The cleaned text as one newline-separated string (possibly empty).
    """
    print("Extracting text from webpage...")
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.extract()  # Remove scripts and styles
    raw_text = soup.get_text()
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    # Split on a *double* space: this separates headline fragments that were
    # laid out side by side while keeping ordinary sentences on one line.
    # Splitting on a single space would put every word on its own line.
    phrases = (
        phrase.strip()
        for line in stripped_lines
        for phrase in line.split("  ")
    )
    text = '\n'.join(phrase for phrase in phrases if phrase)
    print(f"Extracted text length: {len(text)}")
    return text
28
+
29
# Function to perform a Google search and retrieve results
def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
    """Search Google for *term* and download the text of each result page.

    Result pages are requested from ``https://www.google.com/search`` until
    ``num_results`` entries have been collected, a request fails, or a page
    contains no ``div.g`` result containers. For every result the first
    anchor with an ``href`` is fetched and its visible text extracted with
    ``extract_text_from_webpage``.

    Args:
        term: Query string; sent as the ``q`` parameter (requests handles
            the URL encoding).
        num_results: Maximum number of entries to return.
        lang: Value of the ``hl`` parameter.
        timeout: Timeout in seconds applied to every HTTP request.
        safe: Value of the ``safe`` parameter.
        ssl_verify: Passed as ``verify`` to every request; ``None`` defers
            to the session default.

    Returns:
        A list of dicts with keys ``"link"`` and ``"text"``. ``"text"`` is
        ``None`` when the page could not be fetched; both are ``None`` when
        a result container held no link. Page text longer than 8000
        characters is cut and suffixed with ``"..."``.
    """
    print(f"Searching for term: {term}")
    start = 0
    all_results = []
    max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit

    with requests.Session() as session:
        while start < num_results:
            print(f"Fetching search results starting from: {start}")
            try:
                # Choose a random user agent
                user_agent = random.choice(_useragent_list)
                headers = {
                    'User-Agent': user_agent
                }
                print(f"Using User-Agent: {headers['User-Agent']}")

                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": term,
                        "num": num_results - start,
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break

            for result in result_block:
                # A results page can contain more containers than were
                # requested; never fetch or return more than num_results.
                if len(all_results) >= num_results:
                    break

                anchor = result.find("a", href=True)
                if not anchor:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})
                    continue

                link = anchor["href"]
                print(f"Found link: {link}")
                try:
                    # Use the same TLS verification setting as the search
                    # request itself.
                    webpage = session.get(
                        link,
                        headers=headers,
                        timeout=timeout,
                        verify=ssl_verify,
                    )
                    webpage.raise_for_status()
                    visible_text = extract_text_from_webpage(webpage.text)
                    if len(visible_text) > max_chars_per_page:
                        visible_text = visible_text[:max_chars_per_page] + "..."
                    all_results.append({"link": link, "text": visible_text})
                except requests.exceptions.RequestException as e:
                    print(f"Error fetching or processing {link}: {e}")
                    all_results.append({"link": link, "text": None})

            # Advance by the number of containers seen on this page so the
            # next request (if any) continues after them.
            start += len(result_block)

    print(f"Total results fetched: {len(all_results)}")
    return all_results
93
+
94
# Load the Mistral-7B-Instruct model and tokenizer
model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Example usage
search_term = "How did Tesla perform in Q1 2024"
search_results = google_search(search_term, num_results=3)

# Combine text from search results to create a prompt
# (results whose page could not be fetched have text None and are skipped)
combined_text = "\n\n".join(result['text'] for result in search_results if result['text'])

# Tokenize the input text
inputs = tokenizer(combined_text, return_tensors="pt")

# Generate a response.
# max_new_tokens limits only the generated continuation; max_length would
# also count the prompt tokens, and the scraped prompt alone is far longer
# than 150 tokens. do_sample=True is required for temperature/top_p/top_k to
# take effect.
outputs = model.generate(
    **inputs,
    max_new_tokens=150,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
)

# Decode the generated tokens to a readable string
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the response
print(response)