awacke1 committed
Commit 5ddf8c4 · verified · Parent: 1a0dd7a

Update app.py

Files changed (1)
  1. app.py +4 -289
app.py CHANGED
@@ -1,289 +1,4 @@
- import streamlit as st
- import anthropic, openai, base64, cv2, glob, json, math, os, pytz, random, re, requests, textract, time, zipfile
- import plotly.graph_objects as go
- import streamlit.components.v1 as components
- from datetime import datetime
- from audio_recorder_streamlit import audio_recorder
- from bs4 import BeautifulSoup
- from collections import defaultdict, deque, Counter
- from dotenv import load_dotenv
- from gradio_client import Client
- from huggingface_hub import InferenceClient
- from io import BytesIO
- from PIL import Image
- from PyPDF2 import PdfReader
- from urllib.parse import quote
- from xml.etree import ElementTree as ET
- from openai import OpenAI
- import extra_streamlit_components as stx
- from streamlit.runtime.scriptrunner import get_script_run_ctx
- import asyncio
- import edge_tts
- from streamlit_marquee import streamlit_marquee
-
- # 🎯 1. Core Configuration & Setup
- st.set_page_config(
-     page_title="🚲TalkingAIResearcher🏆",
-     page_icon="🚲🏆",
-     layout="wide",
-     initial_sidebar_state="auto",
-     menu_items={
-         'Get Help': 'https://huggingface.co/awacke1',
-         'Report a bug': 'https://huggingface.co/spaces/awacke1',
-         'About': "🚲TalkingAIResearcher🏆"
-     }
- )
- load_dotenv()
-
- # Add available English voices for Edge TTS
- EDGE_TTS_VOICES = [
-     "en-US-AriaNeural",
-     "en-US-GuyNeural",
-     "en-US-JennyNeural",
-     "en-GB-SoniaNeural",
-     "en-GB-RyanNeural",
-     "en-AU-NatashaNeural",
-     "en-AU-WilliamNeural",
-     "en-CA-ClaraNeural",
-     "en-CA-LiamNeural"
- ]
-
- def get_central_time():
-     """Get current time in US Central timezone"""
-     central = pytz.timezone('US/Central')
-     return datetime.now(central)
-
- def format_timestamp_prefix():
-     """Generate timestamp prefix in format MM_dd_yy_hh_mm_AM/PM"""
-     ct = get_central_time()
-     return ct.strftime("%m_%d_%y_%I_%M_%p")
-
- # Initialize session state variables
- if 'tts_voice' not in st.session_state:
-     st.session_state['tts_voice'] = EDGE_TTS_VOICES[0]
- if 'audio_format' not in st.session_state:
-     st.session_state['audio_format'] = 'mp3'
- if 'transcript_history' not in st.session_state:
-     st.session_state['transcript_history'] = []
- if 'chat_history' not in st.session_state:
-     st.session_state['chat_history'] = []
- if 'openai_model' not in st.session_state:
-     st.session_state['openai_model'] = "gpt-4o-2024-05-13"
- if 'messages' not in st.session_state:
-     st.session_state['messages'] = []
- if 'last_voice_input' not in st.session_state:
-     st.session_state['last_voice_input'] = ""
- if 'editing_file' not in st.session_state:
-     st.session_state['editing_file'] = None
- if 'edit_new_name' not in st.session_state:
-     st.session_state['edit_new_name'] = ""
- if 'edit_new_content' not in st.session_state:
-     st.session_state['edit_new_content'] = ""
- if 'viewing_prefix' not in st.session_state:
-     st.session_state['viewing_prefix'] = None
- if 'should_rerun' not in st.session_state:
-     st.session_state['should_rerun'] = False
- if 'old_val' not in st.session_state:
-     st.session_state['old_val'] = None
- if 'last_query' not in st.session_state:
-     st.session_state['last_query'] = ""
- if 'marquee_content' not in st.session_state:
-     st.session_state['marquee_content'] = "🚀 Welcome to TalkingAIResearcher | 🤖 Your Research Assistant"
-
- # 🔑 2. API Setup & Clients
- openai_api_key = os.getenv('OPENAI_API_KEY', "")
- anthropic_key = os.getenv('ANTHROPIC_API_KEY_3', "")
- xai_key = os.getenv('xai',"")
- if 'OPENAI_API_KEY' in st.secrets:
-     openai_api_key = st.secrets['OPENAI_API_KEY']
- if 'ANTHROPIC_API_KEY' in st.secrets:
-     anthropic_key = st.secrets["ANTHROPIC_API_KEY"]
-
- openai.api_key = openai_api_key
- claude_client = anthropic.Anthropic(api_key=anthropic_key)
- openai_client = OpenAI(api_key=openai.api_key, organization=os.getenv('OPENAI_ORG_ID'))
- HF_KEY = os.getenv('HF_KEY')
- API_URL = os.getenv('API_URL')
-
- # Constants
- FILE_EMOJIS = {
-     "md": "📝",
-     "mp3": "🎵",
-     "wav": "🔊"
- }
-
- # Functions
- def get_high_info_terms(text: str, top_n=10) -> list:
-     stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with'])
-     words = re.findall(r'\b\w+(?:-\w+)*\b', text.lower())
-     bi_grams = [' '.join(pair) for pair in zip(words, words[1:])]
-     combined = words + bi_grams
-     filtered = [term for term in combined if term not in stop_words and len(term.split()) <= 2]
-     counter = Counter(filtered)
-     return [term for term, freq in counter.most_common(top_n)]
-
- def clean_text_for_filename(text: str) -> str:
-     text = text.lower()
-     text = re.sub(r'[^\w\s-]', '', text)
-     words = text.split()
-     stop_short = set(['the', 'and', 'for', 'with', 'this', 'that'])
-     filtered = [w for w in words if len(w) > 3 and w not in stop_short]
-     return '_'.join(filtered)[:200]
-
- def generate_filename(prompt, response, file_type="md"):
-     prefix = format_timestamp_prefix() + "_"
-     combined = (prompt + " " + response).strip()
-     info_terms = get_high_info_terms(combined, top_n=10)
-     snippet = (prompt[:100] + " " + response[:100]).strip()
-     snippet_cleaned = clean_text_for_filename(snippet)
-     name_parts = info_terms + [snippet_cleaned]
-     full_name = '_'.join(name_parts)
-     if len(full_name) > 150:
-         full_name = full_name[:150]
-     return f"{prefix}{full_name}.{file_type}"
-
- def create_file(prompt, response, file_type="md"):
-     filename = generate_filename(prompt.strip(), response.strip(), file_type)
-     with open(filename, 'w', encoding='utf-8') as f:
-         f.write(prompt + "\n\n" + response)
-     return filename
-
- def get_download_link(file, file_type="zip"):
-     with open(file, "rb") as f:
-         b64 = base64.b64encode(f.read()).decode()
-     if file_type == "zip":
-         return f'<a href="data:application/zip;base64,{b64}" download="{os.path.basename(file)}">📂 Download {os.path.basename(file)}</a>'
-     elif file_type == "mp3":
-         return f'<a href="data:audio/mpeg;base64,{b64}" download="{os.path.basename(file)}">🎵 Download {os.path.basename(file)}</a>'
-     elif file_type == "wav":
-         return f'<a href="data:audio/wav;base64,{b64}" download="{os.path.basename(file)}">🔊 Download {os.path.basename(file)}</a>'
-     elif file_type == "md":
-         return f'<a href="data:text/markdown;base64,{b64}" download="{os.path.basename(file)}">📝 Download {os.path.basename(file)}</a>'
-     else:
-         return f'<a href="data:application/octet-stream;base64,{b64}" download="{os.path.basename(file)}">Download {os.path.basename(file)}</a>'
-
- def clean_for_speech(text: str) -> str:
-     text = text.replace("\n", " ")
-     text = text.replace("</s>", " ")
-     text = text.replace("#", "")
-     text = re.sub(r"\(https?:\/\/[^\)]+\)", "", text)
-     text = re.sub(r"\s+", " ", text).strip()
-     return text
-
- async def edge_tts_generate_audio(text, voice="en-US-AriaNeural", rate=0, pitch=0, file_format="mp3"):
-     text = clean_for_speech(text)
-     if not text.strip():
-         return None
-     rate_str = f"{rate:+d}%"
-     pitch_str = f"{pitch:+d}Hz"
-     communicate = edge_tts.Communicate(text, voice, rate=rate_str, pitch=pitch_str)
-     out_fn = generate_filename(text, text, file_type=file_format)
-     await communicate.save(out_fn)
-     return out_fn
-
- def speak_with_edge_tts(text, voice="en-US-AriaNeural", rate=0, pitch=0, file_format="mp3"):
-     return asyncio.run(edge_tts_generate_audio(text, voice, rate, pitch, file_format))
-
- def play_and_download_audio(file_path, file_type="mp3"):
-     if file_path and os.path.exists(file_path):
-         st.audio(file_path)
-         dl_link = get_download_link(file_path, file_type=file_type)
-         st.markdown(dl_link, unsafe_allow_html=True)
-
- def save_qa_with_audio(question, answer, voice=None):
-     """Save Q&A to markdown and generate audio"""
-     if not voice:
-         voice = st.session_state['tts_voice']
-
-     combined_text = f"# Question\n{question}\n\n# Answer\n{answer}"
-     md_file = create_file(question, answer, "md")
-
-     audio_text = f"Question: {question}\n\nAnswer: {answer}"
-     audio_file = speak_with_edge_tts(
-         audio_text,
-         voice=voice,
-         file_format=st.session_state['audio_format']
-     )
-
-     return md_file, audio_file
-
- def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
-                       titles_summary=True, full_audio=False, marquee_settings=None):
-     start = time.time()
-
-     client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
-     refs = client.predict(q, 20, "Semantic Search",
-                           "mistralai/Mixtral-8x7B-Instruct-v0.1",
-                           api_name="/update_with_rag_md")[0]
-     r2 = client.predict(q, "mistralai/Mixtral-8x7B-Instruct-v0.1",
-                         True, api_name="/ask_llm")
-
-     result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
-     st.markdown(result)
-
-     md_file, audio_file = save_qa_with_audio(q, result)
-
-     st.subheader("📝 Main Response Audio")
-     play_and_download_audio(audio_file, st.session_state['audio_format'])
-
-     papers = parse_arxiv_refs(refs)
-     if papers:
-         create_paper_audio_files(papers, input_question=q)
-         if marquee_settings:
-             display_papers(papers, marquee_settings)
-         else:
-             display_papers(papers, get_marquee_settings())
-     else:
-         st.warning("No papers found in the response.")
-
-     elapsed = time.time()-start
-     st.write(f"**Total Elapsed:** {elapsed:.2f} s")
-
-     return result
-
- def process_voice_input(text):
-     if not text:
-         return
-
-     st.subheader("🔍 Search Results")
-     result = perform_ai_lookup(
-         text,
-         vocal_summary=True,
-         extended_refs=False,
-         titles_summary=True,
-         full_audio=True,
-         marquee_settings=marquee_settings)
-
-     md_file, audio_file = save_qa_with_audio(text, result)
-
-     st.subheader("📝 Generated Files")
-     st.write(f"Markdown: {md_file}")
-     st.write(f"Audio: {audio_file}")
-     play_and_download_audio(audio_file, st.session_state['audio_format'])
-
- def load_files_for_sidebar():
-     md_files = glob.glob("*.md")
-     mp3_files = glob.glob("*.mp3")
-     wav_files = glob.glob("*.wav")
-
-     md_files = [f for f in md_files if os.path.basename(f).lower() != 'readme.md']
-     all_files = md_files + mp3_files + wav_files
-
-     groups = defaultdict(list)
-     prefix_length = len("MM_dd_yy_hh_mm_AP")
-
-     for f in all_files:
-         basename = os.path.basename(f)
-         if len(basename) >= prefix_length and '_' in basename:
-             group_name = basename[:prefix_length]
-             groups[group_name].append(f)
-         else:
-             groups['Other'].append(f)
-
-     sorted_groups = sorted(groups.items(),
-                            key=lambda x: x[0] if x[0] != 'Other' else '',
-                            reverse=True)
-     return sorted_groups
+ return sorted_groups

  def display_file_manager_sidebar(groups_sorted):
      st.sidebar.title("🎵 Audio & Docs Manager")
@@ -508,6 +223,7 @@ def display_papers(papers, marquee_settings):
          if file_ext in ['mp3', 'wav']:
              st.audio(paper['full_audio'])

+
  def main():
      marquee_settings = get_marquee_settings()

@@ -602,10 +318,9 @@ def main():
          user_text = user_text.strip().replace('\n', ' ')

          if st.button("📨 Send"):
-             process_voice_input(user_text)
+             process_voice_input(user_text, marquee_settings=marquee_settings)

-         st.subheader("📜 Chat History")
-         for c in st.session_state.chat_history:
+         st.subheader("📜 Chat History")for c in st.session_state.chat_history:
              st.write("**You:**", c["user"])
              st.write("**Response:**", c["claude"])

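For readers skimming the removed block above, the deleted helpers chain together in a fixed pipeline. The snippet below is a minimal usage sketch based only on the functions visible in this diff; it assumes the old file's module-level imports and `st.session_state` defaults, and it is not part of the commit itself.

```python
# Usage sketch of the helpers deleted above (old app.py).
# Assumes the module-level imports and session-state defaults shown in the removed block.

question = "What is retrieval augmented generation?"
answer = "It pairs a document retriever with a text generator."

# save_qa_with_audio() writes the Q&A to markdown via create_file()
# (create_file -> generate_filename -> format_timestamp_prefix /
#  get_high_info_terms / clean_text_for_filename), then narrates it with
# speak_with_edge_tts(), which runs edge_tts_generate_audio() under asyncio.run().
md_file, audio_file = save_qa_with_audio(question, answer, voice="en-US-AriaNeural")

# play_and_download_audio() embeds an st.audio player plus the base64
# download link built by get_download_link().
play_and_download_audio(audio_file, file_type=st.session_state['audio_format'])
```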