embedchain init code
- .gitignore +129 -0
- app.py +229 -0
- dsp-urls.txt +8 -0
- ec-app.py +106 -0
- ec_config.py +60 -0
- requirements.txt +6 -0
- ttv_web_scraper.py +264 -0
.gitignore
ADDED
@@ -0,0 +1,129 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
.coverage

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or tool, you might want to ignore these files since the code is intended to run in multiple environments;
# otherwise, check in the pyenv configuration files, especially if you are in an isolated environment.
.pyenv

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# not cross-compatible, pipenv may install dependencies that are not in line with the rest of the team.
Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Pycharm
.idea/

# VS Code
.vscode/

# Streamlit static files
.streamlit/

# Local environment variables
.env

# Deepface models
.deepface_weights/

# MacOS specific
.DS_Store

# Keep empty models dir
models/*
!models/.gitkeep

# All cached movie files
*.mp4.env

# cache dir
cache/
app.py
ADDED
@@ -0,0 +1,229 @@
import os
import streamlit as st
import re
from datetime import datetime
from groq import Groq
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import pandas as pd


# Load environment variables
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # dotenv not installed, likely running on Hugging Face Spaces

# Function to get the API key


def get_api_key():
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        api_key = st.secrets.get("GROQ_API_KEY")
    if not api_key:
        st.error(
            "GROQ_API_KEY is not set. Please set it in your environment or Streamlit secrets.")
        st.stop()
    return api_key


def parse_transcript(content):

    parsed_segments = []
    current_speaker = ""
    current_company = ""
    current_timestamp = ""

    # Split the content into paragraphs
    paragraphs = re.split(r'\n\s*\n', content)

    for paragraph in paragraphs:
        paragraph = paragraph.strip()
        if not paragraph:
            continue

        # Check if the paragraph contains speaker information
        speaker_match = re.match(r'(.*?),\s*(.*?)\((.*?)\):', paragraph)
        if speaker_match:
            current_speaker, current_company, current_timestamp = speaker_match.groups()
            text = paragraph.split('\n', 1)[1] if '\n' in paragraph else ''

        # Check if the paragraph contains only a timestamp
        elif re.match(r'\((.*?)\):', paragraph):
            current_timestamp = re.match(r'\((.*?)\):', paragraph).group(1)
            text = re.sub(r'^\(.*?\):\s*', '', paragraph).strip()

        # If it's not a speaker line or timestamp line, it's just text
        else:
            text = paragraph

        # Add the segment
        if text:
            add_segment(parsed_segments, current_speaker,
                        current_company, current_timestamp, text)

    return parsed_segments


def add_segment(parsed_segments, speaker, company, timestamp, text):
    segment = {
        "speaker": speaker,
        "company": company,
        "timestamp": timestamp,
        "text": text
    }
    parsed_segments.append(segment)
    print_segment(speaker, company, timestamp, text)


def print_segment(speaker, company, timestamp, text):
    print(f"Speaker: {speaker}")
    print(f"Company: {company}")
    print(f"Timestamp: {timestamp}")
    print(f"Text: {text[:100]}...")  # Print first 100 characters of text
    print("-" * 50)


def create_searchable_segments(parsed_segments):
    searchable_segments = []
    for segment in parsed_segments:
        searchable_text = (
            f"{segment['speaker']},{segment['company']},"
            f"{segment['timestamp']}:: {segment['text']}"
        )
        searchable_segments.append(searchable_text)
    return searchable_segments


# Load and parse the transcript
def load_transcript(content):
    global vectorstore

    # Parse the transcript
    parsed_transcript = parse_transcript(content)
    searchable_segments = create_searchable_segments(parsed_transcript)

    # Create text splitter and split the searchable segments
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200)

    splits = text_splitter.create_documents(searchable_segments)

    # Create vector store with HuggingFaceEmbeddings
    embeddings = HuggingFaceEmbeddings()
    vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)


def search_transcript(query, k=30):
    # Perform similarity search
    docs = vectorstore.similarity_search(query=query, k=k)

    # Format results
    results = []
    for doc in docs:
        content = doc.page_content
        match = re.match(r'(.*?),(.*?),(.*?)::\s*(.*)', content, re.DOTALL)
        if match:
            speaker, company, timestamp, text = match.groups()
            results.append({
                "speaker": speaker.strip(),
                "company": company.strip(),
                "timestamp": timestamp.strip(),
                "text": text.strip()
            })

    return results


# Groq client setup
client = Groq(api_key=get_api_key())


def generate_response(query, search_results):
    # Prepare the prompt with search results
    prompt = f"""You are a friendly assistant. Your job is to answer the user's question based on the transcript excerpts provided below:

Transcript excerpts:
{search_results}

Question: {query}

Please provide a concise and relevant answer based on the information in the transcript excerpts. If the information is not directly related to the question, say so and provide the most relevant information available."""

    completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0.5,
        max_tokens=3000,
        top_p=1,
        stream=False,
        stop=None,
    )

    return completion.choices[0].message.content


# Streamlit app
def main():
    st.title("Transcript Search and Q&A")

    st.caption("This site takes a TelecomTV video transcript and allows a chat session with it. If no transcript is provided it defaults to this one: https://www.telecomtv.com/content/dsp-leaders-forum/enabling-the-autonomous-network-with-ai-50536/")

    # File upload
    uploaded_file = st.file_uploader("Upload a transcript file", type="txt")

    if uploaded_file is None:
        file_name = "Enabling the autonomous network with AI"
        with open("example-transcript.txt", 'r') as file:
            content = file.read()
    else:
        content = uploaded_file.getvalue().decode("utf-8")
        file_name = uploaded_file.name

    # Read and process the uploaded file
    load_transcript(content)

    st.subheader(f"Chat with {file_name}")

    # User input
    user_query = st.text_input(
        "Enter your question:", placeholder="e.g. What are people speaking about? or List all people speaking")

    # Add a slider for selecting the number of results
    num_results = st.slider("Number of relevant transcript excerpts to show:",
                            min_value=1, max_value=50, value=30, step=1)

    if user_query:
        search_results = search_transcript(user_query, k=num_results)
        formatted_results = (
            "\n\n".join([f"{result['speaker']} {result['company']} ({result['timestamp']}): "
                         f"{result['text']}" for result in search_results])
        )

        response = generate_response(user_query, formatted_results)

        st.subheader("Assistant's response:")
        st.write(response)

        st.subheader("Relevant transcript excerpts:")

        # Create a DataFrame from the search results
        df = pd.DataFrame(search_results)

        # Rename columns for better readability
        df.columns = ['Speaker', 'Company', 'Timestamp', 'Quote']

        # Display the DataFrame as a table
        st.table(df)


if __name__ == "__main__":
    main()
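
As a quick illustration (not part of the commit), this is the paragraph layout that parse_transcript above expects; the speaker and company names below are invented, and the snippet only exercises the speaker-line regex:

import re

# Hypothetical transcript excerpt in the "Speaker, Company (timestamp):" layout
# that parse_transcript() in app.py splits on; the names below are made up.
sample = """Jane Doe, ExampleCo (00:12):
Welcome to the session.

(01:05):
Moving on to the next question."""

for paragraph in re.split(r'\n\s*\n', sample):
    m = re.match(r'(.*?),\s*(.*?)\((.*?)\):', paragraph)
    if m:
        # First paragraph matches: ('Jane Doe', 'ExampleCo ', '00:12')
        print("speaker line:", m.groups())
    else:
        # Timestamp-only paragraphs fall through to the elif branch in parse_transcript
        print("timestamp-only or plain text:", paragraph.splitlines()[0])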
dsp-urls.txt
ADDED
@@ -0,0 +1,8 @@
https://www.telecomtv.com/content/dsp-leaders-forum/directing-and-accelerating-network-innovation-50535/
https://www.telecomtv.com/content/dsp-leaders-forum/enabling-the-autonomous-network-with-ai-50536/
https://www.telecomtv.com/content/dsp-leaders-forum/creating-cloud-native-software-engineering-teams-50537/
https://www.telecomtv.com/content/dsp-leaders-forum/building-digital-infrastructure-from-core-to-edge-50538/
https://www.telecomtv.com/content/dsp-leaders-forum/unlocking-platform-opportunities-with-network-apis-50539/
https://www.telecomtv.com/content/dsp-leaders-forum/leveraging-telco-cloud-for-advanced-operations-50540/
https://www.telecomtv.com/content/dsp-leaders-forum/improving-network-optimisation-through-automation-50541/
https://www.telecomtv.com/content/dsp-leaders-forum/focusing-on-the-customer-50542/
ec-app.py
ADDED
@@ -0,0 +1,106 @@
from ec_config import create_app
from ttv_web_scraper import db_load_metadata_sets
import streamlit as st
import re


@st.cache_resource
def embedchain_bot():
    return create_app()  # Use the create_app function from ec_config.py


def timestamp_to_seconds(timestamp):
    """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
    parts = timestamp.split(':')
    if len(parts) == 3:
        h, m, s = map(int, parts)
        ts = h * 3600 + m * 60 + s
    elif len(parts) == 2:
        m, s = map(int, parts)
        ts = m * 60 + s
    else:
        raise ValueError(f"Invalid timestamp format: {timestamp}")

    return ts


def create_filter_panel(speakers, companies, sentiments, subjects):
    st.sidebar.header("Filter Options")

    selected_speaker = st.sidebar.selectbox(
        "Select Speaker", [""] + list(speakers))
    selected_company = st.sidebar.selectbox(
        "Select Company", [""] + list(companies))
    selected_sentiment = st.sidebar.selectbox(
        "Select Sentiment", [""] + list(sentiments))
    selected_subject = st.sidebar.selectbox(
        "Select Subject", [""] + list(subjects))

    where = {}
    if selected_speaker:
        where['speaker'] = selected_speaker
    if selected_company:
        where['company'] = selected_company
    if selected_sentiment:
        where['sentiment'] = selected_sentiment
    if selected_subject:
        where['subject'] = selected_subject

    return where


# Streamlit app


def main():
    st.title("DSP Leaders World Forum 2024 ChatBot")

    st.markdown(
        "Trained on data from [here](https://www.telecomtv.com/content/dsp-leaders-forum-videos/)")

    # Load metadata sets
    _, speakers, companies, sentiments, subjects = db_load_metadata_sets()

    # Create filter panel
    where = create_filter_panel(speakers, companies, sentiments, subjects)

    # User input
    user_query = st.text_input(
        "Enter your question:", placeholder="e.g. What are people speaking about? or List all people speaking")

    # Add a slider for selecting the number of results
    num_results = st.slider("Number of relevant transcript excerpts to show:",
                            min_value=1, max_value=50, value=30, step=1)

    if user_query:
        app = embedchain_bot()

        msg_placeholder = st.empty()
        msg_placeholder.markdown("Thinking...")

        # Use app.search() with the where parameter
        search_results = app.search(
            user_query, num_documents=num_results, where=where)

        # Process and display search results
        answer = "Here are the most relevant transcript excerpts:\n\n"
        for i, result in enumerate(search_results, 1):
            metadata = result['metadata']
            ts = timestamp_to_seconds(metadata['timestamp'])
            yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={ts}"

            speaker_info = (
                f"Speaker: {metadata.get('speaker', 'Unknown')}, "
                f"Company: {metadata.get('company', 'Unknown')}, "
                f"Timestamp: {metadata.get('timestamp', 'Unknown')}"
            )

            answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
            answer += f"{metadata.get('title', 'Unknown')} \n"
            answer += f"\"{result['context']}\"\n\n"

        msg_placeholder.markdown(answer)


if __name__ == "__main__":
    main()
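
For orientation (illustrative, not part of the commit): each hit returned by app.search() is consumed above via its 'context' and 'metadata' fields, and the timestamp is turned into a YouTube deep link. A sketch of that flow, assuming it runs inside ec-app.py where timestamp_to_seconds is defined; the metadata values are invented:

# Illustrative search hit; only the field names come from the code above,
# the values (speaker, youtube_id, ...) are made up.
example_hit = {
    "context": "...matched transcript text...",
    "metadata": {
        "title": "Enabling the autonomous network with AI",
        "youtube_id": "abc123XYZ",   # hypothetical video ID
        "speaker": "Jane Doe",       # hypothetical speaker
        "company": "ExampleCo",
        "timestamp": "01:23",
    },
}

ts = timestamp_to_seconds(example_hit["metadata"]["timestamp"])  # 1*60 + 23 = 83
print(f"https://youtu.be/{example_hit['metadata']['youtube_id']}?t={ts}")
# -> https://youtu.be/abc123XYZ?t=83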
ec_config.py
ADDED
@@ -0,0 +1,60 @@
import os
import streamlit as st
from embedchain import App

# Load environment variables
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # dotenv not installed, likely running on Hugging Face Spaces

# Function to get the API key


def get_api_key(name):
    api_key = os.environ.get(name)
    if not api_key:
        api_key = st.secrets.get(name)
    if not api_key:
        raise ValueError(
            f"{name} is not set. Please set it in your environment or Streamlit secrets.")
    return api_key


config_dict = {
    'app': {
        'config': {
            'name': 'ttv-ec'
        }
    },
    'llm': {
        'provider': 'huggingface',
        'config': {
            'model': 'mistralai/Mistral-7B-Instruct-v0.2',
            'top_p': 0.5,
            'stream': False,
            'prompt': """You are an AI assistant that answers questions based solely on the information provided in your knowledge base.

Question: $query
Context: $context

If the information to answer a question is not available in your knowledge base,
respond with 'I don't have enough information to answer that question.'
""",
            'api_key': get_api_key('HF_TOKEN')
        }
    },
    'embedder': {
        'provider': 'huggingface',
        'config': {
            'model': 'sentence-transformers/all-mpnet-base-v2',
            'api_key': get_api_key('HF_TOKEN')
        }
    }
}


def create_app():
    return App.from_config(config=config_dict)
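
A minimal usage sketch (not part of the commit): create_app() returns an embedchain App, which the other scripts drive through add() and search(); query() is used here only as a quick end-to-end check. It assumes HF_TOKEN is available, since config_dict resolves it at import time, and the indexed text is invented:

from ec_config import create_app

app = create_app()

# Index one invented transcript snippet, mirroring how ttv_web_scraper.py calls add().
app.add("Jane Doe, ExampleCo (00:12): Autonomous networks need closed-loop automation.",
        data_type='text',
        metadata={"speaker": "Jane Doe", "company": "ExampleCo", "timestamp": "00:12"})

# Ask a question against the configured Mistral model via Hugging Face.
print(app.query("What did Jane Doe say about autonomous networks?"))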
requirements.txt
ADDED
@@ -0,0 +1,6 @@
streamlit
embedchain
langchain_huggingface
watchdog
pyppeteer
beautifulsoup4
ttv_web_scraper.py
ADDED
@@ -0,0 +1,264 @@
import re
import asyncio
import json
import os
import traceback
from pyppeteer import launch
from bs4 import BeautifulSoup
import hashlib
from ec_config import create_app


CACHE_DIR = "cache/"
if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR)

DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")


def db_load_metadata_sets():

    content_hashes = set()
    speakers = set()
    companies = set()
    sentiments = set()
    subjects = set()

    if os.path.exists(DB_METADATA_FILE):
        with open(DB_METADATA_FILE, 'r') as f:
            metadata = json.load(f)

        content_hashes = set(metadata.get('content_hashes', []))
        speakers = set(metadata.get('speakers', []))
        companies = set(metadata.get('companies', []))
        sentiments = set(metadata.get('sentiments', []))
        subjects = set(metadata.get('subjects', []))

    return content_hashes, speakers, companies, sentiments, subjects


def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):

    metadata = {
        'content_hashes': list(content_hashes),
        'speakers': list(speakers),
        'companies': list(companies),
        'sentiments': list(sentiments),
        'subjects': list(subjects)
    }

    with open(DB_METADATA_FILE, 'w') as f:
        json.dump(metadata, f, indent=2)


async def get_client_rendered_content(url):
    browser = None
    try:
        browser = await launch()
        page = await browser.newPage()
        await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})
        await asyncio.sleep(5)
        content = await page.content()
        return content
    except Exception as e:
        raise Exception(f"Error fetching content: {str(e)}")
    finally:
        if browser:
            await browser.close()


def extract_info(html_content):
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.title.string.strip() if soup.title else None
        date_elem = soup.find('p', class_='content-date')
        date = date_elem.find(
            'span', class_='ng-binding').text.strip() if date_elem else None
        youtube_iframe = soup.find(
            'iframe', src=lambda x: x and 'youtube.com' in x)
        youtube_url = youtube_iframe['src'] if youtube_iframe else None
        youtube_id = None
        if youtube_url:
            match = re.search(r'youtube\.com/embed/([^?]+)', youtube_url)
            if match:
                youtube_id = match.group(1)
        transcript_elem = soup.find(id='transcript0')
        transcript = transcript_elem.get_text(
            strip=True) if transcript_elem else None
        return {
            'metadata': {
                'title': title,
                'date': date,
                'youtube_id': youtube_id,
            },
            'transcript': transcript
        }
    except Exception as e:
        raise Exception(f"Error extracting information: {str(e)}")


def read_html_from_file(filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                return f.read()
        return None
    except Exception as e:
        raise Exception(f"Error reading file {filename}: {str(e)}")


def read_json_from_file(filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                return json.load(f)
        return None
    except json.JSONDecodeError as e:
        raise Exception(f"Error decoding JSON in file {filename}: {str(e)}")
    except Exception as e:
        raise Exception(f"Error reading file {filename}: {str(e)}")


def extract_speaker_info(segment):
    try:
        pattern = r'(?P<speaker>(?:[A-Z][a-z]+ ){1,3}[A-Z][a-z]+), (?P<company>[A-Za-z\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
        match = re.match(pattern, segment)
        if match:
            return {key: value.strip() if value else None for key, value in match.groupdict().items()}
        else:
            timestamp_pattern = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
            timestamp_match = re.match(timestamp_pattern, segment)
            if timestamp_match:
                return {'speaker': None, 'company': None, 'timestamp': timestamp_match.group('timestamp')}
            return None
    except Exception as e:
        raise Exception(f"Error extracting speaker info: {str(e)}")


def parse_transcript(content):
    try:
        parsed_segments = []
        metadata = {}
        pattern = r'((?:[A-Z][a-z]+ ){1,3}[A-Z][a-z]+, [A-Za-z\s]+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
        segments = re.split(pattern, content)
        segments = [segment.strip() for segment in segments if segment.strip()]
        for segment in segments:
            speaker_info = extract_speaker_info(segment)
            if speaker_info:
                if speaker_info['speaker']:
                    metadata = speaker_info.copy()
                else:
                    metadata = metadata.copy()
                    metadata['timestamp'] = speaker_info['timestamp']
            else:
                parsed_segments.append({
                    'metadata': metadata,
                    'text': segment
                })
        return parsed_segments
    except Exception as e:
        raise Exception(f"Error parsing transcript: {str(e)}")


def get_cached_filename(url):
    return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}.html"


async def process_url(url):
    try:
        cached_filename = get_cached_filename(url)
        json_filename = f"{cached_filename}.json"
        info = read_json_from_file(json_filename)

        if info:
            return info

        content = read_html_from_file(cached_filename)

        if content is None:
            print(f"Fetching content from web for {url}...")
            content = await get_client_rendered_content(url)
            with open(cached_filename, 'w', encoding='utf-8') as f:
                f.write(content)
        else:
            print(f"Using cached content from file for {url}...")

        info = extract_info(content)
        transcript = info['transcript']
        if transcript:
            info['transcript'] = parse_transcript(transcript)
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(info, f, ensure_ascii=False, indent=4)
            print(f"Information extracted and saved to {json_filename}")
        else:
            print(f"No transcript found for {url}")
        return info

    except Exception as e:
        print(f"Error processing URL {url}:")
        print(traceback.format_exc())
        print(f"Detailed error: {str(e)}")
        return None

# This function can be used to process multiple URLs


async def process_urls(urls):
    tasks = [process_url(url) for url in urls]
    return await asyncio.gather(*tasks)


def main():

    app = create_app()

    url_file = "dsp-urls.txt"  # File containing list of URLs

    if not os.path.exists(url_file):
        print(f"Error: {url_file} not found.")
        return

    content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()

    with open(url_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    for url in urls:
        # Generate a hash of the url
        filename_hash = hashlib.md5(url.encode()).hexdigest()
        # Check if this content has already been added
        if filename_hash in content_hashes:
            print(f"{url} already added")
            continue

        info = asyncio.run(process_url(url))
        if info is None:
            continue

        metadata = info['metadata']
        transcript = info['transcript']

        if transcript is None:
            continue

        for entry in transcript:
            metadata.update(entry['metadata'])
            speakers.add(metadata['speaker'])
            companies.add(metadata['company'])

            text = entry['text']

            app.add(text, data_type='text', metadata=metadata)

        content_hashes.add(filename_hash)
        print(f"Added new url: {url}")

    # Save updated hashes
    save_metadata_sets(content_hashes, speakers,
                       companies, sentiments, subjects)

    print("Processing complete. Check individual URL outputs for any errors.")


if __name__ == "__main__":
    main()
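
One small illustration (not part of the commit) of the flat cache naming scheme used by get_cached_filename above, using one of the URLs from dsp-urls.txt; it is meant to run inside this module, since importing it elsewhere also pulls in ec_config, which needs HF_TOKEN at import time:

url = "https://www.telecomtv.com/content/dsp-leaders-forum/enabling-the-autonomous-network-with-ai-50536/"
print(get_cached_filename(url))
# cache/cached_https_www.telecomtv.com_content_dsp-leaders-forum_enabling-the-autonomous-network-with-ai-50536_.html
# The parsed JSON for the page is then stored alongside it with a ".json" suffix.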