pvanand committed on
Commit
b1fa23d
1 Parent(s): b1c8f17

Update helper_functions_api.py

Browse files
Files changed (1) hide show
  1. helper_functions_api.py +277 -2
helper_functions_api.py CHANGED
@@ -1,10 +1,285 @@
 
1
  import mistune
2
  from mistune.plugins.table import table
 
3
  import re
4
-
5
 
6
  def md_to_html(md_text):
7
  renderer = mistune.HTMLRenderer()
8
  markdown_renderer = mistune.Markdown(renderer, plugins=[table])
9
  html_content = markdown_renderer(md_text)
10
- return html_content.replace('\n', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # !pip install mistune
2
  import mistune
3
  from mistune.plugins.table import table
4
+ from jinja2 import Template
5
  import re
6
+ import os
7
 
8
  def md_to_html(md_text):
9
  renderer = mistune.HTMLRenderer()
10
  markdown_renderer = mistune.Markdown(renderer, plugins=[table])
11
  html_content = markdown_renderer(md_text)
12
+ return html_content.replace('\n', '')
13
+
14
+ ####------------------------------ OPTIONAL--> User id and persistent data storage-------------------------------------####
15
+ from datetime import datetime
16
+ import psycopg2
17
+
18
+ from dotenv import load_dotenv, find_dotenv
19
+
20
# Read key/value pairs from the local "keys.env" file into the environment.
load_dotenv("keys.env")

# These four resolve to None when the variable is not set.
TOGETHER_API_KEY = os.environ.get('TOGETHER_API_KEY')
BRAVE_API_KEY = os.environ.get('BRAVE_API_KEY')
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
HELICON_API_KEY = os.environ.get("HELICON_API_KEY")

# Database credentials use a hard lookup: a missing variable raises KeyError
# while this module is being imported.
SUPABASE_USER = os.environ['SUPABASE_USER']
SUPABASE_PASSWORD = os.environ['SUPABASE_PASSWORD']
29
+
30
def insert_data(user_id, user_query, subtopic_query, response, html_report):
    """Insert one row into the ``research_pro_chat_v2`` table.

    A new PostgreSQL connection is opened for every call. The row is stamped
    with ``datetime.now()`` (naive local time) in ``created_at`` and committed
    before the connection is closed.

    Args:
        user_id: Value for the ``user_id`` column.
        user_query: Value for the ``user_query`` column.
        subtopic_query: Value for the ``subtopic_query`` column.
        response: Value for the ``response`` column.
        html_report: Value for the ``html_report`` column.

    Raises:
        Any exception raised by psycopg2 while connecting, executing, or
        committing is propagated; the cursor and connection are closed first.
    """
    insert_query = """
        INSERT INTO research_pro_chat_v2 (user_id, user_query, subtopic_query, response, html_report, created_at)
        VALUES (%s, %s, %s, %s, %s, %s);
    """
    conn = psycopg2.connect(
        dbname="postgres",
        user=SUPABASE_USER,
        password=SUPABASE_PASSWORD,
        host="aws-0-us-west-1.pooler.supabase.com",
        port="5432"
    )
    # try/finally guarantees the pooled connection is released even when the
    # insert or commit fails; previously a failure leaked both handles.
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                insert_query,
                (user_id, user_query, subtopic_query, response, html_report, datetime.now()),
            )
            conn.commit()
        finally:
            cur.close()
    finally:
        conn.close()
48
+
49
+ ####-----------------------------------------------------END----------------------------------------------------------####
50
+
51
+
52
+ import ast
53
+ from fpdf import FPDF
54
+ import re
55
+ import pandas as pd
56
+ import nltk
57
+ nltk.download('stopwords')
58
+ nltk.download('punkt')
59
+ import requests
60
+ import json
61
+ from retry import retry
62
+ from concurrent.futures import ThreadPoolExecutor, as_completed
63
+ from bs4 import BeautifulSoup
64
+ from nltk.corpus import stopwords
65
+ from nltk.tokenize import word_tokenize
66
+ from brave import Brave
67
+ from together import Together
68
+ from langchain_core.output_parsers import JsonOutputParser
69
+ from fuzzy_json import loads
70
+ from half_json.core import JSONFixer
71
+ from openai import OpenAI
72
+
73
# Default model identifiers passed as the ``model`` argument of chat requests.
llm_default_small = "llama3-8b-8192"
llm_default_medium = "llama3-70b-8192"

# System prompt asking for a JSON-only reply.
SysPromptJson = (
    "You are now in the role of an expert AI who can extract structured information from user request. "
    "Both key and value pairs must be in double quotes. "
    "You must respond ONLY with a valid JSON file. "
    "Do not add any additional comments."
)
# System prompt asking for a Python-list-only reply.
SysPromptList = (
    "You are now in the role of an expert AI who can extract structured information from user request. "
    "All elements must be in double quotes. "
    "You must respond ONLY with a valid python List. "
    "Do not add any additional comments."
)
# Generic system prompt used when no structured output is required.
SysPromptDefault = (
    "You are an expert AI, complete the given task. "
    "Do not add any additional comments."
)
79
+
80
+ import tiktoken # Used to limit tokens
81
# Tokenizer used only for length capping. The gpt-3.5-turbo encoding is a
# stand-in for Llama 3; replace it if a closer match becomes available.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def limit_tokens(input_string, token_limit=8000):
    """Truncate ``input_string`` to at most ``token_limit`` tokens.

    The text is tokenized with the module-level ``encoding``, the token list
    is cut to the first ``token_limit`` entries, and the result is decoded
    back to a string.
    """
    token_ids = encoding.encode(input_string)
    kept_ids = token_ids[:token_limit]
    return encoding.decode(kept_ids)
88
+
89
def together_response(message, model=llm_default_small, SysPrompt = SysPromptDefault,temperature=0.2):
    """Send one system + user message pair to the chat endpoint and return the reply text.

    A new ``OpenAI`` client is built on every call. It points at the Helicone
    gateway URL, with headers naming ``https://api.groq.com`` as the target.

    Args:
        message: Content of the user message.
        model: Model identifier forwarded to the completion request.
        SysPrompt: Content of the system message.
        temperature: Sampling temperature forwarded to the completion request.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    gateway_headers = {
        "Helicone-Auth": f"Bearer {HELICON_API_KEY}",
        "Helicone-Target-Url": "https://api.groq.com"
    }
    chat_client = OpenAI(
        api_key=GROQ_API_KEY,
        base_url="https://gateway.hconeai.com/openai/v1",
        default_headers=gateway_headers,
    )

    conversation = [
        {"role": "system", "content": SysPrompt},
        {"role": "user", "content": message},
    ]
    completion = chat_client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
108
+
109
+
110
def json_from_text(text):
    """Parse a JSON object out of free-form text.

    The span from the first ``{`` to the last ``}`` is used when present;
    otherwise the whole text is used. The candidate is parsed with the fuzzy
    JSON loader; if that raises, it is passed through ``JSONFixer`` and the
    repaired line is parsed instead.
    """
    found = re.search(r'\{[\s\S]*\}', text)
    candidate = found.group(0) if found else text
    try:
        # First attempt: tolerant loader on the raw candidate.
        return loads(candidate)
    except Exception:
        # Fallback: repair truncated or partial JSON, then parse again.
        # Remove this branch if a parse failure should propagate instead.
        repaired = JSONFixer().fix(candidate)
        return loads(repaired.line)
126
+
127
def remove_stopwords(text):
    """Return ``text`` with English stopwords dropped.

    The text is split with ``word_tokenize``; tokens whose lowercase form is
    in NLTK's English stopword list are discarded and the rest are re-joined
    with single spaces.
    """
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    kept = (token for token in tokens if token.lower() not in english_stopwords)
    return ' '.join(kept)
132
+
133
def rephrase_content(content, query):
    """Ask the LLM to reduce scraped text to markdown tables of quantitative data.

    Args:
        content: Scraped page text; truncated with ``limit_tokens`` before
            being placed in the prompt.
        query: The user query the extracted data should relate to.

    Returns:
        The model reply returned by ``together_response``.
    """
    # Implicit concatenation replaces the backslash continuations, which
    # copied the source file's indentation into the prompt text.
    # The triple braces emit literal "{" and "}" around each value.
    prompt = (
        "You are an information retriever,ignore everything you know, return only the "
        f"numerical or quantitative data regarding the query: {{{query}}} "
        "structured into markdown tables only"
        f", using the scraped context:{{{limit_tokens(content)}}}"
    )
    return together_response(prompt)
137
+
138
class Scraper:
    """Fetch page HTML through a shared ``requests.Session`` with a fixed User-Agent."""

    def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", timeout=2):
        """Create the session.

        Args:
            user_agent: Value sent in the ``User-Agent`` header.
            timeout: Per-request timeout in seconds (default 2, the value
                that was previously hard-coded).
        """
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.timeout = timeout

    @retry(requests.exceptions.RequestException, tries=3, delay=1)
    def _get(self, url):
        """Issue the GET request; request errors propagate so ``retry`` can re-attempt."""
        return self.session.get(url, timeout=self.timeout)

    def fetch_content(self, url):
        """Return the response body for ``url``, or ``None`` on failure.

        Request errors are retried (3 attempts, 1 second apart). The retry
        decorator used to wrap this method directly, but the exception was
        caught inside it, so no retry ever happened. A failing URL can
        therefore take longer to give up than before.

        Returns:
            The response text when the status code is 200; ``None`` for any
            other status or when every attempt raised a request error.
        """
        try:
            response = self._get(url)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page content for {url}: {e}")
            return None
        if response.status_code == 200:
            return response.text
        return None
152
+
153
def extract_main_content(html):
    """Return the text of every ``<p>`` element in ``html``, joined by spaces.

    A falsy ``html`` (empty string or ``None``) yields an empty string
    without parsing.
    """
    if not html:
        return ""
    parsed = BeautifulSoup(html, 'lxml')
    paragraph_texts = (node.get_text() for node in parsed.find_all('p'))
    return ' '.join(paragraph_texts)
160
+
161
def process_content(url, query):
    """Fetch ``url``, extract its paragraph text, and have the LLM tabulate it.

    Returns:
        A ``(rephrased_content, url)`` tuple. The first element is an empty
        string when the page could not be fetched or had no paragraph text.
    """
    page_html = Scraper().fetch_content(url)
    if not page_html:
        return "", url

    page_text = extract_main_content(page_html)
    if not page_text:
        return "", url

    # Stopwords are stripped first, then the text is capped at 4096*4 characters.
    condensed = remove_stopwords(page_text)[:4096*4]
    return rephrase_content(condensed, query), url
170
+
171
def fetch_and_extract_content(urls, query):
    """Run ``process_content`` for every URL concurrently.

    One worker thread is started per URL.

    Args:
        urls: Iterable of URLs to process.
        query: Query forwarded to ``process_content``.

    Returns:
        A list of ``(text, url)`` tuples in completion order (not input
        order). A URL whose processing raised contributes ``("", url)``.
        An empty ``urls`` yields an empty list.
    """
    urls = list(urls)
    if not urls:
        # ThreadPoolExecutor rejects max_workers=0 with ValueError.
        return []

    all_text_with_urls = []
    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
        future_to_url = {executor.submit(process_content, url, query): url for url in urls}
        for future in as_completed(future_to_url):
            try:
                all_text_with_urls.append(future.result())
            except Exception as e:
                # One bad page must not discard the results of the others.
                failed_url = future_to_url[future]
                print(f"Error processing content for {failed_url}: {e}")
                all_text_with_urls.append(("", failed_url))

    return all_text_with_urls
177
+
178
def search_brave(query, num_results=5):
    """Search Brave for ``query`` and return the result URLs as strings.

    Args:
        query: Search text.
        num_results: Value passed as the ``count`` argument of the search.
    """
    client = Brave(BRAVE_API_KEY)
    results = client.search(q=query, count=num_results)
    return [str(link) for link in results.urls]
185
+
186
def generate_report_with_reference(full_data):
    """Build the HTML report with references and save a PDF copy.

    For each entry in ``full_data`` the markdown report and its references
    are converted to HTML, rendered through the Jinja2 template read from
    ``report_with_references_template.html``, and appended as a new page to
    the PDF written to ``generated_pdf_report.pdf``.

    Args:
        full_data: Iterable of mappings with keys ``'md_report'`` (markdown
            text) and ``'text_with_urls'`` (string representation of a list
            of ``(markdown_text, url)`` tuples).

    Returns:
        The concatenated HTML of all rendered pages.
    """
    from html import escape  # stdlib; imported here so the block is self-contained

    with open("report_with_references_template.html") as f:  # src/research-pro/app_v1.5_online/
        # Compile the template once instead of once per subtopic.
        template = Template(f.read())

    pdf = FPDF()
    html_pages = []
    for idx, subtopic_data in enumerate(full_data, start=1):
        md_report = md_to_html(subtopic_data['md_report'])
        # Convert the string representation of a list of tuples back to a list of tuples
        references = ast.literal_eval(subtopic_data['text_with_urls'])

        collapsible_blocks = []
        for ref_number, reference in enumerate(references, start=1):
            ref_text = md_to_html(reference[0])
            # URLs come from scraped search results; escape them before
            # placing them in an attribute and in element text.
            safe_url = escape(str(reference[1]), quote=True)
            link_html = f'<a href="{safe_url}"> {safe_url}</a>'

            collapsible_blocks.append('''
            <details>
                <summary>Reference {}: {}</summary>
                <div>
                    <p>{}</p>
                    <ul>{}</ul>
                </div>
            </details>
            '''.format(ref_number, link_html, ref_text, link_html))

        references_html = '\n'.join(collapsible_blocks)
        html_pages.append(template.render(md_report=md_report, references=references_html))

        pdf.add_page()
        pdf_report = (
            f"<h1><strong>Report {idx}</strong></h1>"
            + md_report
            + f"<h1><strong>References for Report {idx}</strong></h1>"
            + references_html
        )
        # Non-ASCII characters are dropped before the HTML is handed to FPDF.
        pdf.write_html(pdf_report.encode('ascii', 'ignore').decode('ascii'))

    pdf.output("generated_pdf_report.pdf")
    return ''.join(html_pages)
235
+
236
def write_dataframes_to_excel(dataframes_list, filename):
    """Write groups of DataFrames to an Excel workbook, one sheet per group.

    Group ``i`` (1-based) goes to sheet ``Sheet{i}``. DataFrames within a
    group are stacked vertically and each starts ``len(df) + 2`` rows after
    the previous one.

    Args:
        dataframes_list: ``[df_list1, df_list2, ...]`` where each element is
            a list of DataFrames.
        filename: Path of the ``.xlsx`` file to write.

    Writing is best-effort. When there is no DataFrame at all, nothing is
    written. Any error while writing is printed rather than raised.
    """
    groups = [list(group) for group in dataframes_list]
    if not any(groups):
        # No tables found: skip the writer instead of relying on it to fail.
        return

    try:
        with pd.ExcelWriter(filename, engine="openpyxl") as writer:
            for sheet_number, dataframes in enumerate(groups, start=1):
                startrow = 0
                for df in dataframes:
                    df.to_excel(writer, sheet_name=f"Sheet{sheet_number}", startrow=startrow, index=False)
                    startrow += len(df) + 2
    except Exception as e:
        # Failures still do not propagate, but they are reported instead of
        # being silently discarded by a bare ``except``.
        print(f"Error writing Excel file {filename}: {e}")
251
+
252
def extract_tables_from_html(html_file):
    """Convert every ``<table>`` in an HTML document to a DataFrame.

    Args:
        html_file: HTML passed directly to ``BeautifulSoup`` with the
            ``html.parser`` backend.

    Returns:
        ``[df1, df2, ...]``, one DataFrame per table in document order.
        Column names are the text of the table's ``<th>`` cells; a table
        without ``<th>`` cells gets pandas' default column labels.

    Raises:
        ValueError: From pandas when a row has more cells than there are
            header cells.
    """
    soup = BeautifulSoup(html_file, 'html.parser')

    dataframes = []
    for table_tag in soup.find_all('table'):
        headers = [th.text for th in table_tag.find_all('th')]

        data = []
        for row in table_tag.find_all('tr'):
            row_data = [td.text for td in row.find_all('td')]
            # Header rows contain only <th> cells; keeping their empty list
            # would add a blank first row to every DataFrame.
            if row_data:
                data.append(row_data)

        # An empty header list must become None, otherwise pandas rejects
        # the data for having more columns than names.
        dataframes.append(pd.DataFrame(data, columns=headers or None))

    return dataframes