eusholli committed on
Commit
26df9fd
·
1 Parent(s): 8197eac

embedchain init code

Files changed (7)
  1. .gitignore +129 -0
  2. app.py +229 -0
  3. dsp-urls.txt +8 -0
  4. ec-app.py +106 -0
  5. ec_config.py +60 -0
  6. requirements.txt +6 -0
  7. ttv_web_scraper.py +264 -0
.gitignore ADDED
@@ -0,0 +1,129 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+ .coverage
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or tool, you might want to ignore these files since the code is intended to run in multiple environments;
+ # otherwise, check in the pyenv configuration files, especially if you are in an isolated environment.
+ .pyenv
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # not cross-compatible, pipenv may install dependencies that are not in line with the rest of the team.
+ Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # Pycharm
+ .idea/
+
+ # VS Code
+ .vscode/
+
+ # Streamlit static files
+ .streamlit/
+
+ # Local environment variables
+ .env
+
+ # Deepface models
+ .deepface_weights/
+
+ # MacOS specific
+ .DS_Store
+
+ # Keep empty models dir
+ models/*
+ !models/.gitkeep
+
+ # All cached movie files
+ *.mp4.env
+
+ # cache dir
+ cache/
+
app.py ADDED
@@ -0,0 +1,229 @@
+ import os
+ import streamlit as st
+ import re
+ from datetime import datetime
+ from groq import Groq
+ from langchain_community.vectorstores import FAISS
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_huggingface import HuggingFaceEmbeddings
+ import pandas as pd
+
+
+ # Load environment variables
+ try:
+     from dotenv import load_dotenv
+     load_dotenv()
+ except ImportError:
+     pass  # dotenv not installed, likely running on Hugging Face Spaces
+
+ # Function to get the API key
+
+
+ def get_api_key():
+     api_key = os.environ.get("GROQ_API_KEY")
+     if not api_key:
+         api_key = st.secrets.get("GROQ_API_KEY")
+         if not api_key:
+             st.error(
+                 "GROQ_API_KEY is not set. Please set it in your environment or Streamlit secrets.")
+             st.stop()
+     return api_key
+
+
+ def parse_transcript(content):
+
+     parsed_segments = []
+     current_speaker = ""
+     current_company = ""
+     current_timestamp = ""
+
+     # Split the content into paragraphs
+     paragraphs = re.split(r'\n\s*\n', content)
+
+     for paragraph in paragraphs:
+         paragraph = paragraph.strip()
+         if not paragraph:
+             continue
+
+         # Check if the paragraph contains speaker information
+         speaker_match = re.match(r'(.*?),\s*(.*?)\((.*?)\):', paragraph)
+         if speaker_match:
+             current_speaker, current_company, current_timestamp = speaker_match.groups()
+             text = paragraph.split('\n', 1)[1] if '\n' in paragraph else ''
+
+         # Check if the paragraph contains only a timestamp
+         elif re.match(r'\((.*?)\):', paragraph):
+             current_timestamp = re.match(r'\((.*?)\):', paragraph).group(1)
+             text = re.sub(r'^\(.*?\):\s*', '', paragraph).strip()
+
+         # If it's not a speaker line or timestamp line, it's just text
+         else:
+             text = paragraph
+
+         # Add the segment
+         if text:
+             add_segment(parsed_segments, current_speaker,
+                         current_company, current_timestamp, text)
+
+     return parsed_segments
+
+
+ def add_segment(parsed_segments, speaker, company, timestamp, text):
+     segment = {
+         "speaker": speaker,
+         "company": company,
+         "timestamp": timestamp,
+         "text": text
+     }
+     parsed_segments.append(segment)
+     print_segment(speaker, company, timestamp, text)
+
+
+ def print_segment(speaker, company, timestamp, text):
+     print(f"Speaker: {speaker}")
+     print(f"Company: {company}")
+     print(f"Timestamp: {timestamp}")
+     print(f"Text: {text[:100]}...")  # Print first 100 characters of text
+     print("-" * 50)
+
+
+ def create_searchable_segments(parsed_segments):
+     searchable_segments = []
+     for segment in parsed_segments:
+         searchable_text = (
+             f"{segment['speaker']},{segment['company']},"
+             f"{segment['timestamp']}:: {segment['text']}"
+         )
+         searchable_segments.append(searchable_text)
+     return searchable_segments
+
+
+ # Load and parse the transcript
+ def load_transcript(content):
+     global vectorstore
+
+     # Parse the transcript
+     parsed_transcript = parse_transcript(content)
+     searchable_segments = create_searchable_segments(parsed_transcript)
+
+     # Create text splitter and split the searchable segments
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000, chunk_overlap=200)
+
+     splits = text_splitter.create_documents(searchable_segments)
+
+     # Create vector store with HuggingFaceEmbeddings
+     embeddings = HuggingFaceEmbeddings()
+     vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)
+
+
+ def search_transcript(query, k=30):
+     # Perform similarity search
+     docs = vectorstore.similarity_search(query=query, k=k)
+
+     # Format results
+     results = []
+     for doc in docs:
+         content = doc.page_content
+         match = re.match(r'(.*?),(.*?),(.*?)::\s*(.*)', content, re.DOTALL)
+         if match:
+             speaker, company, timestamp, text = match.groups()
+             results.append({
+                 "speaker": speaker.strip(),
+                 "company": company.strip(),
+                 "timestamp": timestamp.strip(),
+                 "text": text.strip()
+             })
+
+     return results
+
+
+ # Groq client setup
+ client = Groq(api_key=get_api_key())
+
+
+ def generate_response(query, search_results):
+     # Prepare the prompt with search results
+     prompt = f"""You are a friendly assistant. Your job is to answer the user's question based on the transcript excerpts provided below:
+
+ Transcript excerpts:
+ {search_results}
+
+ Question: {query}
+
+ Please provide a concise and relevant answer based on the information in the transcript excerpts. If the information is not directly related to the question, say so and provide the most relevant information available."""
+
+     completion = client.chat.completions.create(
+         model="llama3-8b-8192",
+         messages=[
+             {
+                 "role": "user",
+                 "content": prompt
+             }
+         ],
+         temperature=0.5,
+         max_tokens=3000,
+         top_p=1,
+         stream=False,
+         stop=None,
+     )
+
+     return completion.choices[0].message.content
+
+
+ # Streamlit app
+ def main():
+     st.title("Transcript Search and Q&A")
+
+     st.caption("This site takes a TelecomTV video transcript and allows a chat session with it. If no transcript is provided it defaults to this one: https://www.telecomtv.com/content/dsp-leaders-forum/enabling-the-autonomous-network-with-ai-50536/")
+
+     # File upload
+     uploaded_file = st.file_uploader("Upload a transcript file", type="txt")
+
+     if uploaded_file is None:
+         file_name = "Enabling the autonomous network with AI"
+         with open("example-transcript.txt", 'r') as file:
+             content = file.read()
+     else:
+         content = uploaded_file.getvalue().decode("utf-8")
+         file_name = uploaded_file.name
+
+     # Read and process the uploaded file
+     load_transcript(content)
+
+     st.subheader(f"Chat with {file_name}")
+
+     # User input
+     user_query = st.text_input(
+         "Enter your question:", placeholder="e.g. What are people speaking about? or List all people speaking")
+
+     # Add a slider for selecting the number of results
+     num_results = st.slider("Number of relevant transcript excerpts to show:",
+                             min_value=1, max_value=50, value=30, step=1)
+
+     if user_query:
+         search_results = search_transcript(user_query, k=num_results)
+         formatted_results = (
+             "\n\n".join([f"{result['speaker']} {result['company']} ({result['timestamp']}): "
+                          f"{result['text']}" for result in search_results])
+         )
+
+         response = generate_response(user_query, formatted_results)
+
+         st.subheader("Assistant's response:")
+         st.write(response)
+
+         st.subheader("Relevant transcript excerpts:")
+
+         # Create a DataFrame from the search results
+         df = pd.DataFrame(search_results)
+
+         # Rename columns for better readability
+         df.columns = ['Speaker', 'Company', 'Timestamp', 'Quote']
+
+         # Display the DataFrame as a table
+         st.table(df)
+
+
+ if __name__ == "__main__":
+     main()
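
For reference, a minimal sketch (not part of the commit; the speaker, company, and timestamps are invented) of the paragraph layout that `parse_transcript()` in `app.py` expects from `example-transcript.txt` or an uploaded file: a `Speaker, Company (timestamp):` header line followed by the spoken text, with paragraphs separated by blank lines.

```python
# Invented sample in the format app.py's parse_transcript() splits on.
# Feeding this string to parse_transcript() yields two segments, both
# attributed to "Jane Doe" / "ExampleCorp" (the second only updates the timestamp).
SAMPLE_TRANSCRIPT = """\
Jane Doe, ExampleCorp (00:01:15):
Welcome to the session on autonomous networks.

(00:02:40):
Let's start with the role of AI in closed-loop automation.
"""
```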
dsp-urls.txt ADDED
@@ -0,0 +1,8 @@
+ https://www.telecomtv.com/content/dsp-leaders-forum/directing-and-accelerating-network-innovation-50535/
+ https://www.telecomtv.com/content/dsp-leaders-forum/enabling-the-autonomous-network-with-ai-50536/
+ https://www.telecomtv.com/content/dsp-leaders-forum/creating-cloud-native-software-engineering-teams-50537/
+ https://www.telecomtv.com/content/dsp-leaders-forum/building-digital-infrastructure-from-core-to-edge-50538/
+ https://www.telecomtv.com/content/dsp-leaders-forum/unlocking-platform-opportunities-with-network-apis-50539/
+ https://www.telecomtv.com/content/dsp-leaders-forum/leveraging-telco-cloud-for-advanced-operations-50540/
+ https://www.telecomtv.com/content/dsp-leaders-forum/improving-network-optimisation-through-automation-50541/
+ https://www.telecomtv.com/content/dsp-leaders-forum/focusing-on-the-customer-50542/
ec-app.py ADDED
@@ -0,0 +1,106 @@
+ from ec_config import create_app
+ from ttv_web_scraper import db_load_metadata_sets
+ import streamlit as st
+ import re
+
+
+ @st.cache_resource
+ def embedchain_bot():
+     return create_app()  # Use the create_app function from config.py
+
+
+ def timestamp_to_seconds(timestamp):
+     """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
+     parts = timestamp.split(':')
+     if len(parts) == 3:
+         h, m, s = map(int, parts)
+         ts = h * 3600 + m * 60 + s
+     elif len(parts) == 2:
+         m, s = map(int, parts)
+         ts = m * 60 + s
+     else:
+         raise ValueError(f"Invalid timestamp format: {timestamp}")
+
+     return ts
+
+
+ def create_filter_panel(speakers, companies, sentiments, subjects):
+     st.sidebar.header("Filter Options")
+
+     selected_speaker = st.sidebar.selectbox(
+         "Select Speaker", [""] + list(speakers))
+     selected_company = st.sidebar.selectbox(
+         "Select Company", [""] + list(companies))
+     selected_sentiment = st.sidebar.selectbox(
+         "Select Sentiment", [""] + list(sentiments))
+     selected_subject = st.sidebar.selectbox(
+         "Select Subject", [""] + list(subjects))
+
+     where = {}
+     if selected_speaker:
+         where['speaker'] = selected_speaker
+     if selected_company:
+         where['company'] = selected_company  # metadata key stored by ttv_web_scraper.py is 'company'
+     if selected_sentiment:
+         where['sentiment'] = selected_sentiment
+     if selected_subject:
+         where['subject'] = selected_subject
+
+     return where
+
+
+ # Streamlit app
+
+
+ def main():
+     st.title("DSP Leaders World Forum 2024 ChatBot")
+
+     st.markdown(
+         "Trained on data from [here](https://www.telecomtv.com/content/dsp-leaders-forum-videos/)")
+
+     # Load metadata sets
+     _, speakers, companies, sentiments, subjects = db_load_metadata_sets()
+
+     # Create filter panel
+     where = create_filter_panel(speakers, companies, sentiments, subjects)
+
+     # User input
+     user_query = st.text_input(
+         "Enter your question:", placeholder="e.g. What are people speaking about? or List all people speaking")
+
+     # Add a slider for selecting the number of results
+     num_results = st.slider("Number of relevant transcript excerpts to show:",
+                             min_value=1, max_value=50, value=30, step=1)
+
+     if user_query:
+         app = embedchain_bot()
+
+         msg_placeholder = st.empty()
+         msg_placeholder.markdown("Thinking...")
+
+         # Use app.search() with the where parameter
+         search_results = app.search(
+             user_query, num_documents=num_results, where=where)
+
+         # Process and display search results
+         answer = "Here are the most relevant transcript excerpts:\n\n"
+         for i, result in enumerate(search_results, 1):
+             metadata = result['metadata']
+             ts = timestamp_to_seconds(metadata['timestamp'])
+             yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={ts}"
+
+             speaker_info = (
+                 f"Speaker: {metadata.get('speaker', 'Unknown')}, "
+                 f"Company: {metadata.get('company', 'Unknown')}, "
+                 f"Timestamp: {metadata.get('timestamp', 'Unknown')}"
+             )
+
+             answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
+             answer += f"{metadata.get('title', 'Unknown')} \n"
+             answer += f"\"{result['context']}\"\n\n"
+
+         msg_placeholder.markdown(answer)
+
+
+ if __name__ == "__main__":
+     main()
ec_config.py ADDED
@@ -0,0 +1,60 @@
+
+ import os
+ import streamlit as st
+ from embedchain import App
+
+ # Load environment variables
+ try:
+     from dotenv import load_dotenv
+     load_dotenv()
+ except ImportError:
+     pass  # dotenv not installed, likely running on Hugging Face Spaces
+
+ # Function to get the API key
+
+
+ def get_api_key(name):
+     api_key = os.environ.get(name)
+     if not api_key:
+         api_key = st.secrets.get(name)
+         if not api_key:
+             raise ValueError(
+                 f"{name} is not set. Please set it in your environment or Streamlit secrets.")
+     return api_key
+
+
+ config_dict = {
+     'app': {
+         'config': {
+             'name': 'ttv-ec'
+         }
+     },
+     'llm': {
+         'provider': 'huggingface',
+         'config': {
+             'model': 'mistralai/Mistral-7B-Instruct-v0.2',
+             'top_p': 0.5,
+             'stream': False,
+             'prompt': """You are an AI assistant that answers questions based solely on the information provided in your knowledge base.
+
+ Question: $query
+ Context: $context
+
+ If the information to answer a question is not available in your knowledge base,
+ respond with 'I don't have enough information to answer that question.'
+ """,
+             'api_key': get_api_key('HF_TOKEN')
+         }
+     },
+     'embedder': {
+         'provider': 'huggingface',
+         'config': {
+             'model': 'sentence-transformers/all-mpnet-base-v2',
+             'api_key': get_api_key('HF_TOKEN')
+         }
+     }
+ }
+
+
+ def create_app():
+     return App.from_config(config=config_dict)
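
For context, a minimal usage sketch (not part of the commit) of the embedchain `App` returned by `create_app()`, mirroring the `app.add(...)` call in `ttv_web_scraper.py` and the `app.search(...)` call in `ec-app.py`; it assumes `HF_TOKEN` is available and uses an invented speaker and quote.

```python
# Minimal sketch, assuming HF_TOKEN is set in the environment or Streamlit secrets.
# Speaker, company, and quote below are invented for illustration.
from ec_config import create_app

app = create_app()

# Index one transcript segment with filterable metadata (as ttv_web_scraper.main() does).
app.add(
    "We see AI-driven closed loops as the path to autonomous operations.",
    data_type="text",
    metadata={"speaker": "Jane Doe", "company": "ExampleCorp", "timestamp": "00:05:10"},
)

# Retrieve relevant chunks with a metadata filter (as ec-app.main() does).
results = app.search("What was said about autonomous networks?",
                     num_documents=3, where={"speaker": "Jane Doe"})
for r in results:
    print(r["metadata"], r["context"][:80])
```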
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ streamlit
+ embedchain
+ langchain_huggingface
+ watchdog
+ pyppeteer
+ beautifulsoup4
ttv_web_scraper.py ADDED
@@ -0,0 +1,264 @@
+ import re
+ import asyncio
+ import json
+ import os
+ import traceback
+ from pyppeteer import launch
+ from bs4 import BeautifulSoup
+ import hashlib
+ from ec_config import create_app
+
+
+ CACHE_DIR = "cache/"
+ if not os.path.exists(CACHE_DIR):
+     os.makedirs(CACHE_DIR)
+
+ DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
+
+
+ def db_load_metadata_sets():
+
+     content_hashes = set()
+     speakers = set()
+     companies = set()
+     sentiments = set()
+     subjects = set()
+
+     if os.path.exists(DB_METADATA_FILE):
+         with open(DB_METADATA_FILE, 'r') as f:
+             metadata = json.load(f)
+
+         content_hashes = set(metadata.get('content_hashes', []))
+         speakers = set(metadata.get('speakers', []))
+         companies = set(metadata.get('companies', []))
+         sentiments = set(metadata.get('sentiments', []))
+         subjects = set(metadata.get('subjects', []))
+
+     return content_hashes, speakers, companies, sentiments, subjects
+
+
+ def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
+
+     metadata = {
+         'content_hashes': list(content_hashes),
+         'speakers': list(speakers),
+         'companies': list(companies),
+         'sentiments': list(sentiments),
+         'subjects': list(subjects)
+     }
+
+     with open(DB_METADATA_FILE, 'w') as f:
+         json.dump(metadata, f, indent=2)
+
+
+ async def get_client_rendered_content(url):
+     browser = None
+     try:
+         browser = await launch()
+         page = await browser.newPage()
+         await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})
+         await asyncio.sleep(5)
+         content = await page.content()
+         return content
+     except Exception as e:
+         raise Exception(f"Error fetching content: {str(e)}")
+     finally:
+         if browser:
+             await browser.close()
+
+
+ def extract_info(html_content):
+     try:
+         soup = BeautifulSoup(html_content, 'html.parser')
+         title = soup.title.string.strip() if soup.title else None
+         date_elem = soup.find('p', class_='content-date')
+         date = date_elem.find(
+             'span', class_='ng-binding').text.strip() if date_elem else None
+         youtube_iframe = soup.find(
+             'iframe', src=lambda x: x and 'youtube.com' in x)
+         youtube_url = youtube_iframe['src'] if youtube_iframe else None
+         youtube_id = None
+         if youtube_url:
+             match = re.search(r'youtube\.com/embed/([^?]+)', youtube_url)
+             if match:
+                 youtube_id = match.group(1)
+         transcript_elem = soup.find(id='transcript0')
+         transcript = transcript_elem.get_text(
+             strip=True) if transcript_elem else None
+         return {
+             'metadata': {
+                 'title': title,
+                 'date': date,
+                 'youtube_id': youtube_id,
+             },
+             'transcript': transcript
+         }
+     except Exception as e:
+         raise Exception(f"Error extracting information: {str(e)}")
+
+
+ def read_html_from_file(filename):
+     try:
+         if os.path.exists(filename):
+             with open(filename, 'r', encoding='utf-8') as f:
+                 return f.read()
+         return None
+     except Exception as e:
+         raise Exception(f"Error reading file {filename}: {str(e)}")
+
+
+ def read_json_from_file(filename):
+     try:
+         if os.path.exists(filename):
+             with open(filename, 'r', encoding='utf-8') as f:
+                 return json.load(f)
+         return None
+     except json.JSONDecodeError as e:
+         raise Exception(f"Error decoding JSON in file {filename}: {str(e)}")
+     except Exception as e:
+         raise Exception(f"Error reading file {filename}: {str(e)}")
+
+
+ def extract_speaker_info(segment):
+     try:
+         pattern = r'(?P<speaker>(?:[A-Z][a-z]+ ){1,3}[A-Z][a-z]+), (?P<company>[A-Za-z\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
+         match = re.match(pattern, segment)
+         if match:
+             return {key: value.strip() if value else None for key, value in match.groupdict().items()}
+         else:
+             timestamp_pattern = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
+             timestamp_match = re.match(timestamp_pattern, segment)
+             if timestamp_match:
+                 return {'speaker': None, 'company': None, 'timestamp': timestamp_match.group('timestamp')}
+             return None
+     except Exception as e:
+         raise Exception(f"Error extracting speaker info: {str(e)}")
+
+
+ def parse_transcript(content):
+     try:
+         parsed_segments = []
+         metadata = {}
+         pattern = r'((?:[A-Z][a-z]+ ){1,3}[A-Z][a-z]+, [A-Za-z\s]+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
+         segments = re.split(pattern, content)
+         segments = [segment.strip() for segment in segments if segment.strip()]
+         for segment in segments:
+             speaker_info = extract_speaker_info(segment)
+             if speaker_info:
+                 if speaker_info['speaker']:
+                     metadata = speaker_info.copy()
+                 else:
+                     metadata = metadata.copy()
+                     metadata['timestamp'] = speaker_info['timestamp']
+             else:
+                 parsed_segments.append({
+                     'metadata': metadata,
+                     "text": segment
+                 })
+         return parsed_segments
+     except Exception as e:
+         raise Exception(f"Error parsing transcript: {str(e)}")
+
+
+ def get_cached_filename(url):
+     return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}.html"
+
+
+ async def process_url(url):
+     try:
+         cached_filename = get_cached_filename(url)
+         json_filename = f"{cached_filename}.json"
+         info = read_json_from_file(json_filename)
+
+         if info:
+             return info
+
+         content = read_html_from_file(cached_filename)
+
+         if content is None:
+             print(f"Fetching content from web for {url}...")
+             content = await get_client_rendered_content(url)
+             with open(cached_filename, 'w', encoding='utf-8') as f:
+                 f.write(content)
+         else:
+             print(f"Using cached content from file for {url}...")
+
+         info = extract_info(content)
+         transcript = info['transcript']
+         if transcript:
+             info['transcript'] = parse_transcript(transcript)
+             with open(json_filename, 'w', encoding='utf-8') as f:
+                 json.dump(info, f, ensure_ascii=False, indent=4)
+             print(f"Information extracted and saved to {json_filename}")
+         else:
+             print(f"No transcript found for {url}")
+         return info
+
+     except Exception as e:
+         print(f"Error processing URL {url}:")
+         print(traceback.format_exc())
+         print(f"Detailed error: {str(e)}")
+         return None
+
+ # This function can be used to process multiple URLs
+
+
+ async def process_urls(urls):
+     tasks = [process_url(url) for url in urls]
+     return await asyncio.gather(*tasks)
+
+
+ def main():
+
+     app = create_app()
+
+     url_file = "dsp-urls.txt"  # File containing list of URLs
+
+     if not os.path.exists(url_file):
+         print(f"Error: {url_file} not found.")
+         return
+
+     content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()
+
+     with open(url_file, 'r') as f:
+         urls = [line.strip() for line in f if line.strip()]
+
+     for url in urls:
+         # Generate a hash of the url
+         filename_hash = hashlib.md5(url.encode()).hexdigest()
+         # Check if this content has already been added
+         if filename_hash in content_hashes:
+             print(f"{url} already added")
+             continue
+
+         info = asyncio.run(process_url(url))
+         if info is None:
+             continue
+
+         metadata = info['metadata']
+         transcript = info['transcript']
+
+         if transcript is None:
+             continue
+
+         for entry in transcript:
+             metadata.update(entry['metadata'])
+             speakers.add(metadata['speaker'])
+             companies.add(metadata['company'])
+
+             text = entry['text']
+
+             app.add(text, data_type='text', metadata=metadata)
+
+         content_hashes.add(filename_hash)
+         print(f"Added new url: {url}")
+
+         # Save updated hashes
+         save_metadata_sets(content_hashes, speakers,
+                            companies, sentiments, subjects)
+
+     print("Processing complete. Check individual URL outputs for any errors.")
+
+
+ if __name__ == "__main__":
+     main()
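
As a quick check on the scraping logic above, a standalone sketch (invented sample headers, not part of the commit) of which segment headers the `extract_speaker_info()` pattern recognizes:

```python
# Standalone sketch: the pattern is copied from extract_speaker_info() above;
# the sample headers are invented.
import re

SPEAKER_PATTERN = (
    r'(?P<speaker>(?:[A-Z][a-z]+ ){1,3}[A-Z][a-z]+), '
    r'(?P<company>[A-Za-z\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
)

for header in ("Jane Doe, ExampleCorp (00:01:15):", "(02:30):", "Plain narration text"):
    m = re.match(SPEAKER_PATTERN, header)
    # Full "Speaker, Company (timestamp):" headers match; timestamp-only and
    # plain-text segments fall through to the other branches in extract_speaker_info().
    print(header, "->", m.groupdict() if m else "no speaker header")
```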