import streamlit as st import pandas as pd import numpy as np from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity import torch import json import os import glob from pathlib import Path from datetime import datetime import edge_tts import asyncio import base64 import requests from collections import defaultdict from audio_recorder_streamlit import audio_recorder import streamlit.components.v1 as components import openai from dotenv import load_dotenv # Load environment load_dotenv() openai.api_key = os.getenv('OPENAI_API_KEY') # Ensure edge_tts and other dependencies are installed # pip install edge-tts openai streamlit-audiorecorder # Initialize session state if 'search_history' not in st.session_state: st.session_state['search_history'] = [] if 'last_voice_input' not in st.session_state: st.session_state['last_voice_input'] = "" if 'transcript_history' not in st.session_state: st.session_state['transcript_history'] = [] if 'should_rerun' not in st.session_state: st.session_state['should_rerun'] = False if 'search_columns' not in st.session_state: st.session_state['search_columns'] = [] if 'initial_search_done' not in st.session_state: st.session_state['initial_search_done'] = False if 'tts_voice' not in st.session_state: st.session_state['tts_voice'] = "en-US-AriaNeural" if 'arxiv_last_query' not in st.session_state: st.session_state['arxiv_last_query'] = "" class VideoSearch: def __init__(self): self.text_model = SentenceTransformer('all-MiniLM-L6-v2') self.load_dataset() def fetch_dataset_rows(self): """Fetch dataset from HF API""" try: url = "https://datasets-server.huggingface.co/first-rows?dataset=omegalabsinc%2Fomega-multimodal&config=default&split=train" response = requests.get(url, timeout=30) if response.status_code == 200: data = response.json() if 'rows' in data: processed_rows = [] for row_data in data['rows']: row = row_data.get('row', row_data) for key in row: if any(term in key.lower() for term in ['embed', 'vector', 'encoding']): if isinstance(row[key], str): try: row[key] = [float(x.strip()) for x in row[key].strip('[]').split(',') if x.strip()] except: continue processed_rows.append(row) df = pd.DataFrame(processed_rows) # Update search columns st.session_state['search_columns'] = [col for col in df.columns if col not in ['video_embed', 'description_embed', 'audio_embed']] return df return self.load_example_data() except Exception: return self.load_example_data() def prepare_features(self): """Prepare embeddings with adaptive field detection""" try: embed_cols = [col for col in self.dataset.columns if any(term in col.lower() for term in ['embed', 'vector', 'encoding'])] embeddings = {} for col in embed_cols: try: data = [] for row in self.dataset[col]: if isinstance(row, str): values = [float(x.strip()) for x in row.strip('[]').split(',') if x.strip()] elif isinstance(row, list): values = row else: continue data.append(values) if data: embeddings[col] = np.array(data) except: continue # Set main embeddings for search if 'video_embed' in embeddings: self.video_embeds = embeddings['video_embed'] else: self.video_embeds = next(iter(embeddings.values())) if 'description_embed' in embeddings: self.text_embeds = embeddings['description_embed'] else: self.text_embeds = self.video_embeds except Exception: # Fallback to random embeddings num_rows = len(self.dataset) self.video_embeds = np.random.randn(num_rows, 384) self.text_embeds = np.random.randn(num_rows, 384) def load_example_data(self): example_data = [ { "video_id": "cd21da96-fcca-4c94-a60f-0b1e4e1e29fc", "youtube_id": "IO-vwtyicn4", "description": "This video shows a close-up of an ancient text carved into a surface.", "views": 45489, "start_time": 1452, "end_time": 1458, "video_embed": [0.014160037972033024, -0.003111184574663639, -0.016604168340563774], "description_embed": [-0.05835828185081482, 0.02589797042310238, 0.11952091753482819] } ] return pd.DataFrame(example_data) def load_dataset(self): self.dataset = self.fetch_dataset_rows() self.prepare_features() def search(self, query, column=None, top_k=20): query_embedding = self.text_model.encode([query])[0] video_sims = cosine_similarity([query_embedding], self.video_embeds)[0] text_sims = cosine_similarity([query_embedding], self.text_embeds)[0] combined_sims = 0.5 * video_sims + 0.5 * text_sims # Column filtering if column and column in self.dataset.columns and column != "All Fields": mask = self.dataset[column].astype(str).str.contains(query, case=False) combined_sims[~mask] *= 0.5 top_k = min(top_k, 100) top_indices = np.argsort(combined_sims)[-top_k:][::-1] results = [] for idx in top_indices: result = {'relevance_score': float(combined_sims[idx])} for col in self.dataset.columns: if col not in ['video_embed', 'description_embed', 'audio_embed']: result[col] = self.dataset.iloc[idx][col] results.append(result) return results # Use edge_tts for TTS @st.cache_resource def get_speech_model(): """Cache speech model initialization.""" return edge_tts.Communicate async def generate_speech(text, voice=None): if not text.strip(): return None if not voice: voice = st.session_state['tts_voice'] try: communicate = get_speech_model()(text, voice) audio_file = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3" await communicate.save(audio_file) return audio_file except Exception as e: st.error(f"Error generating speech: {e}") return None def transcribe_audio(audio_path): """Transcribe audio using Whisper.""" try: with open(audio_path, "rb") as f: transcription = openai.Audio.transcribe("whisper-1", f) return transcription["text"].strip() except Exception as e: st.error(f"Error transcribing audio: {e}") return "" def show_file_manager(): """Display file manager interface""" st.subheader("📂 File Manager") col1, col2 = st.columns(2) with col1: uploaded_file = st.file_uploader("Upload File", type=['txt', 'md', 'mp3']) if uploaded_file: with open(uploaded_file.name, "wb") as f: f.write(uploaded_file.getvalue()) st.success(f"Uploaded: {uploaded_file.name}") st.experimental_rerun() with col2: if st.button("🗑 Clear All Files"): for f in glob.glob("*.txt") + glob.glob("*.md") + glob.glob("*.mp3"): os.remove(f) st.success("All files cleared!") st.experimental_rerun() files = glob.glob("*.txt") + glob.glob("*.md") + glob.glob("*.mp3") if files: st.write("### Existing Files") for f in files: with st.expander(f"📄 {os.path.basename(f)}"): if f.endswith('.mp3'): st.audio(f) else: with open(f, 'r', encoding='utf-8') as file: st.text_area("Content", file.read(), height=100) if st.button(f"Delete {os.path.basename(f)}", key=f"del_{f}"): os.remove(f) st.experimental_rerun() ########################## # Arxiv Integration # ########################## # You need to implement or integrate perform_ai_lookup from your second app into this code. # This is a placeholder. Replace with your actual perform_ai_lookup function logic. # Ensure you have your Arxiv RAG model endpoint available. # Example placeholder implementation (replace with your actual second app code): def perform_ai_lookup(q, vocal_summary=True, extended_refs=False, titles_summary=True, full_audio=False): # Placeholder: In your real code, you'll call your Arxiv RAG endpoint and get results. # Here we just simulate a response. mock_answer = f"This is a mock Arxiv response for query: {q}.\nReferences:\n[Paper 1] Example Title" st.markdown(f"**Arxiv Search Results for '{q}':**\n\n{mock_answer}") if vocal_summary: audio_file = asyncio.run(generate_speech("This is a spoken summary of Arxiv results.")) if audio_file: st.audio(audio_file) # Add any other logic: extended_refs, titles_summary, etc. return mock_answer ############################ # Main App Layout & Logic # ############################ def main(): st.title("🎥 Video & Arxiv Search with Voice") # Initialize search class search = VideoSearch() # Create tabs tab1, tab2, tab3, tab4 = st.tabs(["🔍 Search", "🎙️ Voice Input", "📚 Arxiv", "📂 Files"]) # ---- Tab 1: Video Search ---- with tab1: st.subheader("Search Videos") col1, col2 = st.columns([3, 1]) with col1: query = st.text_input("Enter your search query:", value="ancient" if not st.session_state['initial_search_done'] else "") with col2: search_column = st.selectbox("Search in field:", ["All Fields"] + st.session_state['search_columns']) col3, col4 = st.columns(2) with col3: num_results = st.slider("Number of results:", 1, 100, 20) with col4: search_button = st.button("🔍 Search") if (search_button or not st.session_state['initial_search_done']) and query: st.session_state['initial_search_done'] = True selected_column = None if search_column == "All Fields" else search_column with st.spinner("Searching..."): results = search.search(query, selected_column, num_results) st.session_state['search_history'].append({ 'query': query, 'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'results': results[:5] }) for i, result in enumerate(results, 1): with st.expander(f"Result {i}: {result['description'][:100]}...", expanded=(i==1)): cols = st.columns([2, 1]) with cols[0]: st.markdown("**Description:**") st.write(result['description']) st.markdown(f"**Time Range:** {result['start_time']}s - {result['end_time']}s") st.markdown(f"**Views:** {result['views']:,}") with cols[1]: st.markdown(f"**Relevance Score:** {result['relevance_score']:.2%}") if result.get('youtube_id'): st.video(f"https://youtube.com/watch?v={result['youtube_id']}&t={result['start_time']}") if st.button(f"🔊 Audio Summary", key=f"audio_{i}"): summary = f"Video summary: {result['description'][:200]}" audio_file = asyncio.run(generate_speech(summary)) if audio_file: st.audio(audio_file) # Optionally delete after playing: # if os.path.exists(audio_file): # os.remove(audio_file) # ---- Tab 2: Voice Input ---- with tab2: st.subheader("Voice Input") st.write("🎙️ Record your voice and automatically transcribe to text:") audio_bytes = audio_recorder() if audio_bytes: # Save the recorded audio for transcription audio_path = f"temp_audio_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav" with open(audio_path, "wb") as f: f.write(audio_bytes) st.success("Audio recorded successfully!") # Transcribe using Whisper voice_query = transcribe_audio(audio_path) if voice_query: st.markdown("**Transcribed Text:**") st.write(voice_query) st.session_state['last_voice_input'] = voice_query if st.button("🔍 Search from Voice"): results = search.search(voice_query, None, 20) for i, result in enumerate(results, 1): with st.expander(f"Result {i}", expanded=(i==1)): st.write(result['description']) if result.get('youtube_id'): st.video(f"https://youtube.com/watch?v={result['youtube_id']}&t={result.get('start_time', 0)}") # Clean up if os.path.exists(audio_path): os.remove(audio_path) # ---- Tab 3: Arxiv Search ---- with tab3: st.subheader("Arxiv Search") q = st.text_input("Enter your Arxiv search query:", value=st.session_state['arxiv_last_query']) vocal_summary = st.checkbox("🎙 Short Audio Summary", value=True) extended_refs = st.checkbox("📜 Extended References", value=False) titles_summary = st.checkbox("🔖 Titles Only", value=True) full_audio = st.checkbox("📚 Full Audio Results", value=False) if st.button("🔍 Arxiv Search"): st.session_state['arxiv_last_query'] = q perform_ai_lookup(q, vocal_summary=vocal_summary, extended_refs=extended_refs, titles_summary=titles_summary, full_audio=full_audio) # ---- Tab 4: File Manager ---- with tab4: show_file_manager() # Sidebar with st.sidebar: st.subheader("⚙️ Settings & History") if st.button("🗑️ Clear History"): st.session_state['search_history'] = [] st.experimental_rerun() st.markdown("### Recent Searches") for entry in reversed(st.session_state['search_history'][-5:]): with st.expander(f"{entry['timestamp']}: {entry['query']}"): for i, result in enumerate(entry['results'], 1): st.write(f"{i}. {result['description'][:100]}...") st.markdown("### Voice Settings") st.selectbox("TTS Voice:", ["en-US-AriaNeural", "en-US-GuyNeural", "en-GB-SoniaNeural"], key="tts_voice") if __name__ == "__main__": main()