Spaces:

eusholli
/

ttv-transcript

Sleeping

App Files Files Community

ttv-transcript / app.py

eusholli

Initial push

4b3cb44 11 months ago

raw

history blame

7.1 kB

	import os
	import streamlit as st
	import re
	from datetime import datetime
	from groq import Groq
	from langchain_community.vectorstores import FAISS
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_huggingface import HuggingFaceEmbeddings
	import pandas as pd


	# Load environment variables
	# python-dotenv is treated as an optional dependency: load_dotenv() is called
	# only when the import succeeds, and a missing package is ignored on purpose.
	try:
	from dotenv import load_dotenv
	load_dotenv()
	except ImportError:
	pass # dotenv not installed, likely running on Hugging Face Spaces

	# Function to get the API key


	def get_api_key():
	    """Return the Groq API key, or halt the Streamlit script if none is set.

	    The ``GROQ_API_KEY`` environment variable is consulted first; Streamlit
	    secrets are only consulted when the environment variable is missing or
	    empty.  When neither source yields a value, an error is shown with
	    ``st.error`` and ``st.stop()`` is called.

	    Returns:
	        The API key string.
	    """
	    api_key = os.environ.get("GROQ_API_KEY")
	    if not api_key:
	        try:
	            api_key = st.secrets.get("GROQ_API_KEY")
	        except FileNotFoundError:
	            # Streamlit raises FileNotFoundError (or a subclass of it) when
	            # no secrets file exists at all.  Treat that as "no key" so the
	            # user sees the message below instead of a traceback.
	            api_key = None
	    if not api_key:
	        st.error(
	            "GROQ_API_KEY is not set. Please set it in your environment or Streamlit secrets.")
	        st.stop()
	    return api_key


	def parse_transcript(content):
	    """Split raw transcript text into speaker-attributed segments.

	    The text is divided into paragraphs on blank lines.  Each paragraph is
	    classified as one of:

	    * a speaker header, ``Name, Company (timestamp):``, optionally followed
	      by spoken text on the following lines;
	    * a timestamp-only header, ``(timestamp): text``, which keeps the
	      previous speaker and company;
	    * plain text, which inherits the previous speaker, company and timestamp.

	    Every paragraph that yields non-empty text is recorded through
	    ``add_segment``.

	    Args:
	        content: The whole transcript as a single string.

	    Returns:
	        A list of dicts with the keys ``speaker``, ``company``, ``timestamp``
	        and ``text``, in transcript order.
	    """
	    # The fields must be able to span many characters, hence the lazy
	    # ``.*?`` groups; a bare ``.?`` would only accept one-character names.
	    speaker_re = re.compile(r'(.*?),\s*(.*?)\((.*?)\):')
	    timestamp_re = re.compile(r'\((.*?)\):')

	    parsed_segments = []
	    current_speaker = ""
	    current_company = ""
	    current_timestamp = ""

	    # Split the content into paragraphs
	    for paragraph in re.split(r'\n\s*\n', content):
	        paragraph = paragraph.strip()
	        if not paragraph:
	            continue

	        speaker_match = speaker_re.match(paragraph)
	        # Only look for a bare timestamp when this is not a speaker header.
	        timestamp_match = None if speaker_match else timestamp_re.match(paragraph)

	        if speaker_match:
	            # Strip the captures so e.g. the space before "(" does not end up
	            # as trailing whitespace on the company name.
	            current_speaker, current_company, current_timestamp = (
	                field.strip() for field in speaker_match.groups()
	            )
	            # Spoken text, if any, starts on the line after the header.
	            _, _, text = paragraph.partition('\n')
	        elif timestamp_match:
	            current_timestamp = timestamp_match.group(1)
	            # Everything after the "(timestamp):" prefix is the text.
	            text = paragraph[timestamp_match.end():].strip()
	        else:
	            # Neither header form: a continuation of the current speaker.
	            text = paragraph

	        # Add the segment
	        if text:
	            add_segment(parsed_segments, current_speaker,
	                        current_company, current_timestamp, text)

	    return parsed_segments


	def add_segment(parsed_segments, speaker, company, timestamp, text):
	    """Append one segment record to ``parsed_segments`` and echo it.

	    The record is a dict with the keys ``speaker``, ``company``,
	    ``timestamp`` and ``text``; the list is mutated in place.  The same four
	    values are then handed to ``print_segment``.
	    """
	    parsed_segments.append(
	        dict(speaker=speaker, company=company, timestamp=timestamp, text=text)
	    )
	    print_segment(speaker, company, timestamp, text)


	def print_segment(speaker, company, timestamp, text):
	    """Print a short, human-readable summary of one segment to stdout.

	    Emits the speaker, company and timestamp on labelled lines, then the
	    first 100 characters of ``text`` followed by ``...``, then a rule of 50
	    dashes.
	    """
	    labelled_fields = (
	        ("Speaker", speaker),
	        ("Company", company),
	        ("Timestamp", timestamp),
	    )
	    for label, value in labelled_fields:
	        print(f"{label}: {value}")

	    # Only a preview of the text is shown; the ellipsis is always appended.
	    preview = text[:100]
	    print(f"Text: {preview}...")
	    print("-" * 50)


	def create_searchable_segments(parsed_segments):
	    """Flatten parsed segments into single strings for indexing.

	    Each segment dict becomes ``"speaker,company,timestamp:: text"``, so the
	    metadata travels with the text and can be recovered from a search hit.

	    Args:
	        parsed_segments: Iterable of dicts with the keys ``speaker``,
	            ``company``, ``timestamp`` and ``text``.

	    Returns:
	        A list of formatted strings, one per input segment, in input order.
	    """
	    # The f-string is kept on one physical line: a line break inside a
	    # replacement field is a syntax error before Python 3.12.
	    return [
	        f"{segment['speaker']},{segment['company']},{segment['timestamp']}:: {segment['text']}"
	        for segment in parsed_segments
	    ]


	# Load and parse the transcript
	def load_transcript(content):
	    """Parse ``content`` and (re)build the module-level ``vectorstore``.

	    The transcript is parsed into segments, flattened into searchable
	    strings, split into chunks of 1000 characters with 200 characters of
	    overlap, embedded with ``HuggingFaceEmbeddings`` and stored in a FAISS
	    index that is assigned to the global ``vectorstore``.
	    """
	    global vectorstore

	    # Parse, then flatten each segment into "speaker,company,timestamp:: text".
	    segment_strings = create_searchable_segments(parse_transcript(content))

	    # Chunk the flattened segments into documents for embedding.
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=1000, chunk_overlap=200)
	    chunked_docs = splitter.create_documents(segment_strings)

	    # Embed the chunks and publish the index for search_transcript to use.
	    embedder = HuggingFaceEmbeddings()
	    vectorstore = FAISS.from_documents(documents=chunked_docs, embedding=embedder)


	def search_transcript(query, k=30):
	    """Return the transcript chunks most similar to ``query``.

	    Runs a similarity search against the module-level ``vectorstore`` and
	    decodes the ``speaker,company,timestamp:: text`` layout of every hit.

	    Args:
	        query: The text to search for.
	        k: Number of hits to request from the vector store.

	    Returns:
	        A list of dicts with the keys ``speaker``, ``company``, ``timestamp``
	        and ``text``, all whitespace-stripped.  Hits whose content does not
	        start with the metadata header are left out.
	    """
	    # Lazy ``.*?`` groups let each metadata field span many characters;
	    # DOTALL lets the trailing text group run across line breaks.
	    header_re = re.compile(r'(.*?),(.*?),(.*?)::\s*(.*)', re.DOTALL)

	    # Perform similarity search
	    docs = vectorstore.similarity_search(query=query, k=k)

	    # Format results
	    results = []
	    for doc in docs:
	        match = header_re.match(doc.page_content)
	        if not match:
	            # No metadata header on this chunk, so it cannot be attributed.
	            continue
	        speaker, company, timestamp, text = (
	            field.strip() for field in match.groups()
	        )
	        results.append({
	            "speaker": speaker,
	            "company": company,
	            "timestamp": timestamp,
	            "text": text,
	        })

	    return results


	# Groq client setup
	# Built once at module level and used by generate_response(). The key comes
	# from get_api_key(), which reports an error via st.error and calls st.stop()
	# when no key is configured.
	client = Groq(api_key=get_api_key())


	def generate_response(query, search_results):
	    """Ask the Groq chat model to answer ``query`` from transcript excerpts.

	    ``search_results`` is interpolated into the prompt as-is, followed by the
	    question.  The prompt is sent as a single user message through the
	    module-level ``client``.

	    Returns:
	        The message content of the first choice in the completion.
	    """
	    # Prompt text is sent verbatim to the model; keep its wording and layout.
	    prompt = f"""You are a friendly assistant. Your job is to answer the user's question based on the transcript excerpts provided below:

	Transcript excerpts:
	{search_results}

	Question: {query}

	Please provide a concise and relevant answer based on the information in the transcript excerpts. If the information is not directly related to the question, say so and provide the most relevant information available."""

	    chat_messages = [{"role": "user", "content": prompt}]

	    reply = client.chat.completions.create(
	        model="llama3-8b-8192",
	        messages=chat_messages,
	        temperature=0.5,
	        max_tokens=3000,
	        top_p=1,
	        stream=False,
	        stop=None,
	    )

	    first_choice = reply.choices[0]
	    return first_choice.message.content


	# Streamlit app
	def main():
	    """Render the Streamlit page: load a transcript, then answer questions.

	    Uses the uploaded ``.txt`` file when there is one, otherwise the bundled
	    ``example-transcript.txt``.  The transcript is indexed with
	    ``load_transcript``.  When the user enters a question, the matching
	    excerpts are retrieved, passed to ``generate_response``, and both the
	    answer and a table of the excerpts are displayed.
	    """
	    st.title("Transcript Search and Q&A")

	    st.caption("This site takes a TelecomTV video transcript and allows a chat session with it. If no transcript is provided it defaults to this one: https://www.telecomtv.com/content/dsp-leaders-forum/enabling-the-autonomous-network-with-ai-50536/")

	    # File upload
	    uploaded_file = st.file_uploader("Upload a transcript file", type="txt")

	    if uploaded_file is None:
	        file_name = "Enabling the autonomous network with AI"
	        # Read with an explicit encoding, matching how uploads are decoded
	        # below, instead of relying on the platform default.
	        with open("example-transcript.txt", 'r', encoding="utf-8") as file:
	            content = file.read()
	    else:
	        content = uploaded_file.getvalue().decode("utf-8")
	        file_name = uploaded_file.name

	    # Read and process the uploaded file
	    load_transcript(content)

	    st.subheader(f"Chat with {file_name}")

	    # User input
	    user_query = st.text_input(
	        "Enter your question:", placeholder="e.g.What are people speaking about? or List all people speaking")

	    # Add a slider for selecting the number of results
	    num_results = st.slider("Number of relevant transcript excerpts to show:",
	                            min_value=1, max_value=50, value=30, step=1)

	    if not user_query:
	        return

	    search_results = search_transcript(user_query, k=num_results)

	    # Each f-string stays on one physical line: a line break inside a
	    # replacement field is a syntax error before Python 3.12.
	    formatted_results = "\n\n".join(
	        f"{result['speaker']} {result['company']} ({result['timestamp']}): {result['text']}"
	        for result in search_results
	    )

	    response = generate_response(user_query, formatted_results)

	    st.subheader("Assistant's response:")
	    st.write(response)

	    st.subheader("Relevant transcript excerpts:")

	    # Create a DataFrame from the search results.  Naming the columns up
	    # front keeps an empty result list from producing a zero-column frame,
	    # on which assigning four column labels would raise ValueError.
	    df = pd.DataFrame(
	        search_results, columns=['speaker', 'company', 'timestamp', 'text'])

	    # Rename columns for better readability
	    df = df.rename(columns={
	        'speaker': 'Speaker',
	        'company': 'Company',
	        'timestamp': 'Timestamp',
	        'text': 'Quote',
	    })

	    # Display the DataFrame as a table
	    st.table(df)


	# Entry point: main() is called only when __name__ is "__main__", i.e. when
	# this file is run as the main script rather than imported as a module.
	if __name__ == "__main__":
	main()