eusholli commited on
Commit
0540b53
·
1 Parent(s): 76ebe2e

faiss filter initial commit

Browse files
ai_config_ec.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from embedchain import App
4
+ from typing import Dict, Any, List
5
+
6
+
7
+ def timestamp_to_seconds(timestamp):
8
+ """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
9
+ parts = timestamp.split(':')
10
+ if len(parts) == 3:
11
+ h, m, s = map(int, parts)
12
+ ts = h * 3600 + m * 60 + s
13
+ elif len(parts) == 2:
14
+ m, s = map(int, parts)
15
+ ts = m * 60 + s
16
+ else:
17
+ raise ValueError(f"Invalid timestamp format: {timestamp}")
18
+
19
+ return ts
20
+
21
+
22
+ class AIAssistant:
23
+ def __init__(self):
24
+ self.app = self._create_app()
25
+
26
+ def _get_api_key(self, name: str) -> str:
27
+ api_key = os.environ.get(name)
28
+ if not api_key:
29
+ api_key = st.secrets.get(name)
30
+ if not api_key:
31
+ raise ValueError(
32
+ f"{name} is not set. Please set it in your environment or Streamlit secrets.")
33
+ return api_key
34
+
35
+ def _create_config(self) -> Dict[str, Any]:
36
+ return {
37
+ 'app': {
38
+ 'config': {
39
+ 'name': 'ttv-ec'
40
+ }
41
+ },
42
+ 'llm': {
43
+ 'provider': 'huggingface',
44
+ 'config': {
45
+ 'model': 'mistralai/Mistral-7B-Instruct-v0.2',
46
+ 'top_p': 0.5,
47
+ 'stream': False,
48
+ 'prompt': """You are an AI assistant that answers questions based solely on the information provided in your knowledge base.
49
+
50
+ Question: $query
51
+ Context: $context
52
+
53
+ If the information to answer a question is not available in your knowledge base,
54
+ respond with 'I don't have enough information to answer that question.
55
+ """,
56
+ 'api_key': self._get_api_key('HF_TOKEN')
57
+ }
58
+ },
59
+ 'embedder': {
60
+ 'provider': 'huggingface',
61
+ 'config': {
62
+ 'model': 'sentence-transformers/all-mpnet-base-v2',
63
+ 'api_key': self._get_api_key('HF_TOKEN')
64
+ }
65
+ }
66
+ }
67
+
68
+ def _create_app(self) -> App:
69
+ config = self._create_config()
70
+ return App.from_config(config=config)
71
+
72
+ def save(self) -> None:
73
+ # null function
74
+ return
75
+
76
+ def add_to_knowledge_base(self, data: str, data_type: str, metadata: Dict[str, Any] = None) -> None:
77
+ self.app.add(data, data_type=data_type, metadata=metadata)
78
+
79
+ def query(self, question: str, num_results: int = 30, filters: Dict[str, Any] = None) -> Dict[str, List[Dict[str, Any]]]:
80
+ search_results = self.app.search(
81
+ question, num_documents=num_results, where=filters)
82
+ # Process and display search results
83
+ answer = "Here are the most relevant transcript excerpts:\n\n"
84
+ for i, result in enumerate(search_results['results'], 1):
85
+ metadata = result['metadata']
86
+ ts = timestamp_to_seconds(metadata['timestamp'])
87
+ yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={ts}"
88
+
89
+ speaker_info = (
90
+ f"Speaker: {metadata.get('speaker', 'Unknown')}, "
91
+ f"Company: {metadata.get('company', 'Unknown')}, "
92
+ f"Timestamp: {metadata.get('timestamp', 'Unknown')}"
93
+ )
94
+
95
+ answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
96
+ answer += f"{metadata.get('title', 'Unknown')} \n"
97
+ answer += f"\"{result['context']}\"\n\n"
98
+
99
+ return {'results': search_results}
100
+
101
+ # Usage example
102
+
103
+
104
+ def get_ai_assistant() -> AIAssistant:
105
+ return AIAssistant()
ai_config_faiss.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, Any, List
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain_core.documents import Document
6
+
7
+
8
+ os.environ["LANGCHAIN_TRACING_V2"] = "true"
9
+
10
+ DB_DIR = "db/"
11
+ if not os.path.exists(DB_DIR):
12
+ os.makedirs(DB_DIR)
13
+
14
+
15
+ def timestamp_to_seconds(timestamp):
16
+ """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
17
+ parts = timestamp.split(':')
18
+ if len(parts) == 3:
19
+ h, m, s = map(int, parts)
20
+ ts = h * 3600 + m * 60 + s
21
+ elif len(parts) == 2:
22
+ m, s = map(int, parts)
23
+ ts = m * 60 + s
24
+ else:
25
+ raise ValueError(f"Invalid timestamp format: {timestamp}")
26
+
27
+ return ts
28
+
29
+
30
+ class FAISSAIAssistant:
31
+ def __init__(self, index_name: str = "faiss_index"):
32
+ self.index_name = f"{DB_DIR}{index_name}.faiss"
33
+ model_name = "sentence-transformers/all-mpnet-base-v2"
34
+ model_kwargs = {'device': 'cpu'}
35
+ encode_kwargs = {'normalize_embeddings': False}
36
+ self.embeddings = HuggingFaceEmbeddings(
37
+ model_name=model_name,
38
+ model_kwargs=model_kwargs,
39
+ encode_kwargs=encode_kwargs)
40
+ self.vector_store = self._create_app()
41
+
42
+ def _create_app(self):
43
+ if os.path.exists(self.index_name):
44
+ print("Loading existing FAISS index...")
45
+ return FAISS.load_local(self.index_name, self.embeddings,
46
+ allow_dangerous_deserialization=True)
47
+ else:
48
+ print("Creating new FAISS index...")
49
+ # Create an initial document with placeholder text
50
+ initial_texts = [
51
+ "This is an initial document to create the FAISS index."]
52
+ return FAISS.from_texts(initial_texts, self.embeddings)
53
+
54
+ def add_to_knowledge_base(self, data: str, data_type: str = None, metadata: Dict[str, Any] = None) -> None:
55
+ doc = Document(page_content=data, metadata=metadata or {})
56
+ self.vector_store.add_documents([doc])
57
+
58
+ def query(self, question: str, num_results: int = 30, filters: Dict[str, List[str]] = None) -> str:
59
+ all_docs = self.list_documents()
60
+
61
+ def match_any_filter(doc_metadata, filters):
62
+ if not filters:
63
+ return True
64
+ for key, values in filters.items():
65
+ if key in doc_metadata:
66
+ doc_value = doc_metadata[key]
67
+ if isinstance(doc_value, list):
68
+ # If doc_value is a list, check if any item in doc_value is in values
69
+ if any(item in values for item in doc_value):
70
+ return True
71
+ else:
72
+ # If doc_value is a single string, check if it's in values
73
+ if doc_value in values:
74
+ return True
75
+ return False
76
+
77
+ filtered_docs = [
78
+ doc for doc in all_docs
79
+ if match_any_filter(doc['metadata'], filters)
80
+ ]
81
+
82
+ # Limit the number of results to num_results
83
+ filtered_docs = filtered_docs[:num_results]
84
+
85
+ answer = f"Here are the top {
86
+ len(filtered_docs)} documents matching the filter:\n\n"
87
+ for i, doc in enumerate(filtered_docs, 1):
88
+ metadata = doc['metadata']
89
+ st_ts = timestamp_to_seconds(metadata['start_timestamp'])
90
+ yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={st_ts}"
91
+
92
+ speaker_info = (
93
+ f"Speaker: {metadata.get('speaker', 'Unknown')}, "
94
+ f"Company: {metadata.get('company', 'Unknown')}, "
95
+ f"Timestamp: {metadata.get('start_timestamp', 'Unknown')}"
96
+ f" - {metadata.get('end_timestamp', 'Unknown')}"
97
+ )
98
+
99
+ answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
100
+ answer += f"{metadata.get('title', 'Unknown')} \n"
101
+ answer += f"\"{doc['content']}\" \n\n"
102
+
103
+ return answer
104
+
105
+ def save(self):
106
+ self.vector_store.save_local(self.index_name)
107
+ print("FAISS index saved.")
108
+
109
+ def list_documents(self) -> List[Dict[str, Any]]:
110
+ """
111
+ List all documents in the FAISS vectorstore.
112
+
113
+ Returns:
114
+ List[Dict[str, Any]]: A list of dictionaries, each containing 'content' and 'metadata' of a document.
115
+ """
116
+ documents = []
117
+ for doc_id, doc in self.vector_store.docstore._dict.items():
118
+ documents.append({
119
+ 'id': doc_id,
120
+ 'content': doc.page_content,
121
+ 'metadata': doc.metadata
122
+ })
123
+ return documents
124
+
125
+ # Usage example
126
+
127
+
128
+ def get_ai_assistant(index_name: str = "faiss_index") -> FAISSAIAssistant:
129
+ return FAISSAIAssistant(index_name)
ai_config_faiss.py.llm-query ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, Any, List
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain_core.documents import Document
6
+
7
+ os.environ["LANGCHAIN_TRACING_V2"] = "true"
8
+
9
+ DB_DIR = "db/"
10
+ if not os.path.exists(DB_DIR):
11
+ os.makedirs(DB_DIR)
12
+
13
+
14
+ def timestamp_to_seconds(timestamp):
15
+ """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
16
+ parts = timestamp.split(':')
17
+ if len(parts) == 3:
18
+ h, m, s = map(int, parts)
19
+ ts = h * 3600 + m * 60 + s
20
+ elif len(parts) == 2:
21
+ m, s = map(int, parts)
22
+ ts = m * 60 + s
23
+ else:
24
+ raise ValueError(f"Invalid timestamp format: {timestamp}")
25
+
26
+ return ts
27
+
28
+
29
+ class FAISSAIAssistant:
30
+ def __init__(self, index_name: str = "faiss_index"):
31
+ self.index_name = f"{DB_DIR}{index_name}.faiss"
32
+ model_name = "sentence-transformers/all-mpnet-base-v2"
33
+ model_kwargs = {'device': 'cpu'}
34
+ encode_kwargs = {'normalize_embeddings': False}
35
+ self.embeddings = HuggingFaceEmbeddings(
36
+ model_name=model_name,
37
+ model_kwargs=model_kwargs,
38
+ encode_kwargs=encode_kwargs)
39
+ self.vector_store = self._create_app()
40
+
41
+ def _create_app(self):
42
+ if os.path.exists(self.index_name):
43
+ print("Loading existing FAISS index...")
44
+ return FAISS.load_local(self.index_name, self.embeddings,
45
+ allow_dangerous_deserialization=True)
46
+ else:
47
+ print("Creating new FAISS index...")
48
+ # Create an initial document with placeholder text
49
+ initial_texts = [
50
+ "This is an initial document to create the FAISS index."]
51
+ return FAISS.from_texts(initial_texts, self.embeddings)
52
+
53
+ def add_to_knowledge_base(self, data: str, data_type: str = None, metadata: Dict[str, Any] = None) -> None:
54
+ doc = Document(page_content=data, metadata=metadata or {})
55
+ self.vector_store.add_documents([doc])
56
+
57
+ def query(self, filters: Dict[str, List[str]] = None) -> str:
58
+ all_docs = self.list_documents()
59
+
60
+ def match_filter(doc_metadata, filter_key, filter_values):
61
+ return doc_metadata.get(filter_key) in filter_values
62
+
63
+ filtered_docs = [
64
+ doc for doc in all_docs
65
+ if all(match_filter(doc['metadata'], k, v) for k, v in filters.items())
66
+ ] if filters else all_docs
67
+
68
+ answer = "Here are the documents matching the filter:\n\n"
69
+ for i, doc in enumerate(filtered_docs, 1):
70
+ metadata = doc['metadata']
71
+ st_ts = timestamp_to_seconds(metadata['start_timestamp'])
72
+ yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={st_ts}"
73
+
74
+ speaker_info = (
75
+ f"Speaker: {metadata.get('speaker', 'Unknown')}, "
76
+ f"Company: {metadata.get('company', 'Unknown')}, "
77
+ f"Timestamp: {metadata.get('start_timestamp', 'Unknown')}"
78
+ f" - {metadata.get('end_timestamp', 'Unknown')}"
79
+ )
80
+
81
+ answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
82
+ answer += f"{metadata.get('title', 'Unknown')} \n"
83
+ answer += f"\"{doc['content']}\" \n\n"
84
+
85
+ return answer
86
+
87
+ def save(self):
88
+ self.vector_store.save_local(self.index_name)
89
+ print("FAISS index saved.")
90
+
91
+ def list_documents(self) -> List[Dict[str, Any]]:
92
+ """
93
+ List all documents in the FAISS vectorstore.
94
+
95
+ Returns:
96
+ List[Dict[str, Any]]: A list of dictionaries, each containing 'content' and 'metadata' of a document.
97
+ """
98
+ documents = []
99
+ for doc_id, doc in self.vector_store.docstore._dict.items():
100
+ documents.append({
101
+ 'id': doc_id,
102
+ 'content': doc.page_content,
103
+ 'metadata': doc.metadata
104
+ })
105
+ return documents
106
+
107
+
108
+ # Usage example
109
+
110
+
111
+ def get_ai_assistant(index_name: str = "faiss_index") -> FAISSAIAssistant:
112
+ return FAISSAIAssistant(index_name)
app.py CHANGED
@@ -1,105 +1,107 @@
1
- from ec_config import create_app
2
  from ttv_web_scraper import db_load_metadata_sets
3
  import streamlit as st
4
  import re
5
 
6
 
7
  @st.cache_resource
8
- def embedchain_bot():
9
- return create_app() # Use the create_app function from config.py
10
-
11
-
12
- def timestamp_to_seconds(timestamp):
13
- """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
14
- parts = timestamp.split(':')
15
- if len(parts) == 3:
16
- h, m, s = map(int, parts)
17
- ts = h * 3600 + m * 60 + s
18
- elif len(parts) == 2:
19
- m, s = map(int, parts)
20
- ts = m * 60 + s
21
- else:
22
- raise ValueError(f"Invalid timestamp format: {timestamp}")
23
-
24
- return ts
25
-
26
-
27
- def create_filter_panel(speakers, companies, sentiments, subjects):
28
- st.sidebar.header("Filter Options")
29
-
30
- selected_speaker = st.sidebar.selectbox(
31
- "Select Speaker", [""] + list(speakers))
32
- selected_company = st.sidebar.selectbox(
33
- "Select Company", [""] + list(companies))
34
- selected_sentiment = st.sidebar.selectbox(
35
- "Select Sentiment", [""] + list(sentiments))
36
- selected_subject = st.sidebar.selectbox(
37
- "Select Subject", [""] + list(subjects))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  where = {}
40
- if selected_speaker:
41
- where['speaker'] = selected_speaker
42
- if selected_company:
43
- where['company'] = selected_company
44
- if selected_sentiment:
45
- where['sentiment'] = selected_sentiment
46
- if selected_subject:
47
- where['subject'] = selected_subject
48
 
49
  return where
50
 
51
 
52
- # Streamlit app
53
-
54
-
55
  def main():
56
- st.title("DSP Leaders World Forum 2024 ChatBot")
57
 
58
  st.markdown(
59
  "Trained on data from [here](https://www.telecomtv.com/content/dsp-leaders-forum-videos/)")
60
 
61
  # Load metadata sets
62
- _, speakers, companies, sentiments, subjects = db_load_metadata_sets()
63
 
64
  # Create filter panel
65
- where = create_filter_panel(speakers, companies, sentiments, subjects)
66
-
67
- # User input
68
- user_query = st.text_input(
69
- "Enter your question:", placeholder="e.g. What are people speaking about? or List all people speaking")
70
 
71
  # Add a slider for selecting the number of results
72
  num_results = st.slider("Number of relevant transcript excerpts to show:",
73
  min_value=1, max_value=50, value=30, step=1)
74
 
75
- if user_query:
76
- app = embedchain_bot()
77
-
78
- msg_placeholder = st.empty()
79
- msg_placeholder.markdown("Thinking...")
80
-
81
- # Use app.search() with the where parameter
82
- search_results = app.search(
83
- user_query, num_documents=num_results, where=where)
84
 
85
- # Process and display search results
86
- answer = "Here are the most relevant transcript excerpts:\n\n"
87
- for i, result in enumerate(search_results, 1):
88
- metadata = result['metadata']
89
- ts = timestamp_to_seconds(metadata['timestamp'])
90
- yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={ts}"
91
 
92
- speaker_info = (
93
- f"Speaker: {metadata.get('speaker', 'Unknown')}, "
94
- f"Company: {metadata.get('company', 'Unknown')}, "
95
- f"Timestamp: {metadata.get('timestamp', 'Unknown')}"
96
- )
97
 
98
- answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
99
- answer += f"{metadata.get('title', 'Unknown')} \n"
100
- answer += f"\"{result['context']}\"\n\n"
101
 
102
- msg_placeholder.markdown(answer)
103
 
104
 
105
  if __name__ == "__main__":
 
1
+ from ai_config_faiss import get_ai_assistant
2
  from ttv_web_scraper import db_load_metadata_sets
3
  import streamlit as st
4
  import re
5
 
6
 
7
  @st.cache_resource
8
+ def get_assistant():
9
+ return get_ai_assistant()
10
+
11
+
12
+ def create_filter_panel(companies, sentiments, subjects):
13
+ st.header("Filter Options")
14
+
15
+ # Initialize session state for filters if not already present
16
+ if 'selected_companies' not in st.session_state:
17
+ st.session_state.selected_companies = []
18
+ if 'selected_speakers' not in st.session_state:
19
+ st.session_state.selected_speakers = []
20
+ if 'selected_subjects' not in st.session_state:
21
+ st.session_state.selected_subjects = []
22
+
23
+ # Add a checkbox to show/hide all filters
24
+ show_filters = st.checkbox("Show Filters", value=True)
25
+
26
+ if show_filters:
27
+ col1, col2, col3 = st.columns(3)
28
+
29
+ with col1:
30
+ st.subheader("Companies")
31
+ for company in companies.keys():
32
+ if st.checkbox(f"{company}", value=company in st.session_state.selected_companies):
33
+ if company not in st.session_state.selected_companies:
34
+ st.session_state.selected_companies.append(company)
35
+ elif company in st.session_state.selected_companies:
36
+ st.session_state.selected_companies.remove(company)
37
+
38
+ with col2:
39
+ st.subheader("Speakers")
40
+ all_speakers = set()
41
+ for speakers in companies.values():
42
+ all_speakers.update(speakers)
43
+
44
+ for speaker in sorted(all_speakers):
45
+ if st.checkbox(speaker, value=speaker in st.session_state.selected_speakers):
46
+ if speaker not in st.session_state.selected_speakers:
47
+ st.session_state.selected_speakers.append(speaker)
48
+ elif speaker in st.session_state.selected_speakers:
49
+ st.session_state.selected_speakers.remove(speaker)
50
+
51
+ with col3:
52
+ st.subheader("Subjects")
53
+ for subject in sorted(subjects):
54
+ if st.checkbox(subject, value=subject in st.session_state.selected_subjects):
55
+ if subject not in st.session_state.selected_subjects:
56
+ st.session_state.selected_subjects.append(subject)
57
+ elif subject in st.session_state.selected_subjects:
58
+ st.session_state.selected_subjects.remove(subject)
59
 
60
  where = {}
61
+ if st.session_state.selected_companies:
62
+ where['company'] = st.session_state.selected_companies
63
+ if st.session_state.selected_speakers:
64
+ where['speaker'] = st.session_state.selected_speakers
65
+ if st.session_state.selected_subjects:
66
+ where['subjects'] = st.session_state.selected_subjects
 
 
67
 
68
  return where
69
 
70
 
 
 
 
71
  def main():
72
+ st.title("Telecom TV Video Expert")
73
 
74
  st.markdown(
75
  "Trained on data from [here](https://www.telecomtv.com/content/dsp-leaders-forum-videos/)")
76
 
77
  # Load metadata sets
78
+ _, _, companies, sentiments, subjects = db_load_metadata_sets()
79
 
80
  # Create filter panel
81
+ where = create_filter_panel(companies, sentiments, subjects)
 
 
 
 
82
 
83
  # Add a slider for selecting the number of results
84
  num_results = st.slider("Number of relevant transcript excerpts to show:",
85
  min_value=1, max_value=50, value=30, step=1)
86
 
87
+ # Add a submit button
88
+ submit_button = st.button("Submit")
 
 
 
 
 
 
 
89
 
90
+ if submit_button:
91
+ if not where:
92
+ st.warning(
93
+ "Please select at least one filter before submitting.")
94
+ else:
95
+ assistant = get_assistant()
96
 
97
+ msg_placeholder = st.empty()
98
+ msg_placeholder.markdown("Thinking...")
 
 
 
99
 
100
+ # Use assistant.query() instead of app.search()
101
+ response = assistant.query(
102
+ "", num_results=num_results, filters=where)
103
 
104
+ msg_placeholder.markdown(response)
105
 
106
 
107
  if __name__ == "__main__":
cache/db_metadata.json CHANGED
@@ -1,102 +1,209 @@
1
  {
2
  "content_hashes": [
3
- "d81ba6e90c2c42d82c4003c4d158d3e3",
4
  "5754ba35c4f9f27e3e1d5b4d9bb972f2",
5
- "f8f43b2e1413f709038506c3a2dfd7b9",
6
  "9ae73679959943c591be3d1c81b7c26c",
7
  "6286818c51fc82ffc065ba12d3c48c19",
8
  "e068f68ad0aed4134d075210f871ea95",
9
- "4974b044047d2523c747b79d938915c9"
 
10
  ],
11
  "speakers": [
12
- "Atoosa Hatefi",
13
- "Robert Curran",
14
- "Amol Phadke",
15
- "Colin Bannon",
16
- "Abdu Mudesir",
17
- "Hasan Jafri",
18
- "Alex Foster",
19
- "Madhukiran Medithe",
20
- "Yago Tenorio",
21
- "Dennis Hoffman",
22
- "Mark Henry",
23
- "Alfredo Musitani",
24
- "Vivek Chadha",
25
- "Faiq Khan",
26
- "Susan James",
27
- "Mirko Voltolini",
28
- "Guy Daniels",
29
- "Michele Campriani",
30
  "Sadayuki Abeta",
31
- "Chivas Nambiar",
32
- "Philippe Ensarguet",
33
- "Alexandra Foster",
34
- "Tom Burton",
35
  "Juan Manuel Caro",
36
- "Andrew Coward",
37
- "Harkirit Singh",
38
- "Ray Le Maistre",
39
- "Enrique Blanco",
40
- "Mark Gilmour",
41
- "Luis Velarde Tazon",
42
- "Vishal Mathur",
43
  "Franz Seiser",
44
- "Jose Antonio Martin Martinez",
45
- "Chris Lewis",
 
46
  "Dean Dennis",
47
- "Sushil Rawat",
48
  "Sarwar Khan",
 
 
 
 
 
 
 
 
 
 
49
  "Ahmed Hafez",
50
- "Nik Willets",
 
 
51
  "Amith Maharaj",
 
 
 
 
 
52
  "Matthias Fridstrom",
53
- "Francesca Serravalle",
54
- "Francis Haysom",
55
  "Terje Jensen",
56
- "Akira Tada",
57
- "Laura Murphy",
 
 
 
 
 
58
  "Mojdeh Amani",
59
- "Manish Singh",
 
 
 
 
 
 
 
 
 
60
  "Komal Aggarwal",
61
- "Geoff Hollingworth",
62
- "Mallik Rao"
63
- ],
64
- "companies": [
65
- "Verizon Business",
66
- "Appledore Research",
67
- "Cambridge Management Consulting",
68
- "TM Forum",
69
- "Rakuten Mobile",
70
- "Rakuten Symphony",
71
- "AWS",
72
- "Arelion",
73
- "Vodafone",
74
- "Telecom Argentina",
75
- "Vodafone UK",
76
- "BT Business",
77
- "NTT DOCOMO",
78
- "Lewis Insight",
79
- "Deutsche Telekom Technik",
80
- "American Tower",
81
- "Deutsche Telekom",
82
- "SoftBank",
83
- "Telecom Infra Project",
84
- "Telefonica",
85
- "MTN",
86
- "IBM",
87
- "Colt Technology",
88
- "TelecomTV",
89
- "Telenor",
90
- "BT",
91
- "BT Group",
92
- "Ascend Digital Solutions",
93
- "Orange",
94
- "DSP Leaders Councillor",
95
- "Optiva",
96
- "TELUS",
97
- "Dell Technologies",
98
- "Connectivitree"
99
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  "sentiments": [],
101
- "subjects": []
 
 
 
 
 
 
 
 
 
 
 
 
102
  }
 
1
  {
2
  "content_hashes": [
3
+ "4974b044047d2523c747b79d938915c9",
4
  "5754ba35c4f9f27e3e1d5b4d9bb972f2",
 
5
  "9ae73679959943c591be3d1c81b7c26c",
6
  "6286818c51fc82ffc065ba12d3c48c19",
7
  "e068f68ad0aed4134d075210f871ea95",
8
+ "f8f43b2e1413f709038506c3a2dfd7b9",
9
+ "d81ba6e90c2c42d82c4003c4d158d3e3"
10
  ],
11
  "speakers": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  "Sadayuki Abeta",
 
 
 
 
13
  "Juan Manuel Caro",
 
 
 
 
 
 
 
14
  "Franz Seiser",
15
+ "Hasan Jafri",
16
+ "Enrique Blanco",
17
+ "Nik Willets",
18
  "Dean Dennis",
 
19
  "Sarwar Khan",
20
+ "Alfredo Musitani",
21
+ "Susan James",
22
+ "Gabriela Styf Sj\u00f6man",
23
+ "Alex Foster",
24
+ "Vishal Mathur",
25
+ "Sandeep Raithatha",
26
+ "Alexandra Foster",
27
+ "Harkirit Singh",
28
+ "Tom Burton",
29
+ "Laura Murphy",
30
  "Ahmed Hafez",
31
+ "Jose Antonio Martin Martinez",
32
+ "Francis Haysom",
33
+ "Atoosa Hatefi",
34
  "Amith Maharaj",
35
+ "Mallik Rao",
36
+ "Anita D\u00f6hler",
37
+ "Geoff Hollingworth",
38
+ "Abdu Mudesir",
39
+ "Akira Tada",
40
  "Matthias Fridstrom",
41
+ "Manish Singh",
42
+ "Guy Daniels",
43
  "Terje Jensen",
44
+ "Mark Henry",
45
+ "Luis Velarde Tazon",
46
+ "Colin Bannon",
47
+ "Dennis Hoffman",
48
+ "Michele Campriani",
49
+ "Andrew Coward",
50
+ "Ray Le Maistre",
51
  "Mojdeh Amani",
52
+ "Philippe Ensarguet",
53
+ "Amol Phadke",
54
+ "Chris Lewis",
55
+ "Sushil Rawat",
56
+ "Mark Gilmour",
57
+ "Francesca Serravalle",
58
+ "Robert Curran",
59
+ "Faiq Khan",
60
+ "Chivas Nambiar",
61
+ "Mirko Voltolini",
62
  "Komal Aggarwal",
63
+ "Madhukiran Medithe",
64
+ "Vivek Chadha",
65
+ "Yago Tenorio"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  ],
67
+ "companies": {
68
+ "TelecomTV": [
69
+ "Guy Daniels",
70
+ "Ray Le Maistre"
71
+ ],
72
+ "Connectivitree": [
73
+ "Mark Gilmour"
74
+ ],
75
+ "Telenor": [
76
+ "Amol Phadke",
77
+ "Terje Jensen"
78
+ ],
79
+ "Vodafone": [
80
+ "Komal Aggarwal",
81
+ "Yago Tenorio"
82
+ ],
83
+ "BT": [
84
+ "Sarwar Khan",
85
+ "Alex Foster",
86
+ "Gabriela Styf Sj\u00f6man",
87
+ "Mark Henry",
88
+ "Mojdeh Amani"
89
+ ],
90
+ "American Tower": [
91
+ "Susan James"
92
+ ],
93
+ "Dell Technologies": [
94
+ "Manish Singh",
95
+ "Dennis Hoffman"
96
+ ],
97
+ "AWS": [
98
+ "Chivas Nambiar"
99
+ ],
100
+ "VMO2 Business": [
101
+ "Sandeep Raithatha"
102
+ ],
103
+ "Deutsche Telekom": [
104
+ "Abdu Mudesir",
105
+ "Ahmed Hafez"
106
+ ],
107
+ "Telefonica": [
108
+ "Juan Manuel Caro",
109
+ "Enrique Blanco",
110
+ "Jose Antonio Martin Martinez",
111
+ "Luis Velarde Tazon",
112
+ "Mallik Rao"
113
+ ],
114
+ "SoftBank": [
115
+ "Akira Tada"
116
+ ],
117
+ "TM Forum": [
118
+ "Nik Willets"
119
+ ],
120
+ "Rakuten Mobile": [
121
+ "Madhukiran Medithe"
122
+ ],
123
+ "Appledore Research": [
124
+ "Francis Haysom",
125
+ "Robert Curran"
126
+ ],
127
+ "NGMN Alliance": [
128
+ "Anita D\u00f6hler"
129
+ ],
130
+ "Arelion": [
131
+ "Matthias Fridstrom"
132
+ ],
133
+ "Deutsche Telekom Technik": [
134
+ "Franz Seiser"
135
+ ],
136
+ "TELUS": [
137
+ "Hasan Jafri",
138
+ "Sushil Rawat"
139
+ ],
140
+ "Orange": [
141
+ "Atoosa Hatefi",
142
+ "Philippe Ensarguet"
143
+ ],
144
+ "BT Business": [
145
+ "Colin Bannon"
146
+ ],
147
+ "Telecom Argentina": [
148
+ "Alfredo Musitani"
149
+ ],
150
+ "Colt Technology": [
151
+ "Mirko Voltolini"
152
+ ],
153
+ "BT Group": [
154
+ "Laura Murphy"
155
+ ],
156
+ "MTN": [
157
+ "Amith Maharaj"
158
+ ],
159
+ "Vodafone UK": [
160
+ "Francesca Serravalle"
161
+ ],
162
+ "Verizon Business": [
163
+ "Dean Dennis"
164
+ ],
165
+ "Rakuten Symphony": [
166
+ "Faiq Khan",
167
+ "Geoff Hollingworth",
168
+ "Vivek Chadha"
169
+ ],
170
+ "Cambridge Management Consulting": [
171
+ "Tom Burton"
172
+ ],
173
+ "Ascend Digital Solutions": [
174
+ "Harkirit Singh"
175
+ ],
176
+ "IBM": [
177
+ "Andrew Coward"
178
+ ],
179
+ "Optiva": [
180
+ "Michele Campriani"
181
+ ],
182
+ "Telecom Infra Project": [
183
+ "Vishal Mathur"
184
+ ],
185
+ "NTT DOCOMO": [
186
+ "Sadayuki Abeta"
187
+ ],
188
+ "DSP Leaders Councillor": [
189
+ "Alexandra Foster"
190
+ ],
191
+ "Lewis Insight": [
192
+ "Chris Lewis"
193
+ ]
194
+ },
195
  "sentiments": [],
196
+ "subjects": [
197
+ "Connectivity",
198
+ "Infrastructure",
199
+ "5G",
200
+ "Enterprise",
201
+ "Network",
202
+ "Open RAN",
203
+ "TechCo",
204
+ "API",
205
+ "Innovation",
206
+ "B2B",
207
+ "AI"
208
+ ]
209
  }
clean_db.sh ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #!/bin/sh
2
+
3
+ rm cache/db_metadata.json
4
+ rm cache/cached_https_www.telecomtv.com_content_dsp-leaders-forum_*.json
5
+ rm -rf db
db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/header.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ffd87324d19f8f6366a4be4dccc22a83a50ca6837d1327fb660dc4b4e25d140
3
- size 100
 
 
 
 
db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/index_metadata.pickle DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:952bca23a4923000a81096d77fd1e39c4270e46f697bb4f2476c550ced3f2943
3
- size 99983
 
 
 
 
db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/length.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7063182d3e5741c59a41e3c9728f568cbcae4adbda7a9a560b3678335c630157
3
- size 4000
 
 
 
 
db/chroma.sqlite3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:df787a6d1a483f410e5a820ca3278c0d0f10382f4102caec34b0257587055ae9
3
- size 12341248
 
 
 
 
db/{37d02d4f-a72c-4faa-8e90-6cabbae354fe/data_level0.bin → faiss_index.faiss/index.faiss} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70eda9770ad41e004ab40bea6d88e4fe3f99e05307c811bc9e43573129c642c1
3
- size 3212000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9a89a762d762a400d92c6bd6c4cc85f0b0f1841110a44e7038689592e8e91e4
3
+ size 2408493
db/{37d02d4f-a72c-4faa-8e90-6cabbae354fe/link_lists.bin → faiss_index.faiss/index.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ba06f934bb788a6fcf347d85845dce300cedf631915df241ae9d5063c97d88d
3
- size 8148
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b336f2522b6c941539e3036a7cbf4f3d48ff14167508a220fa087a2c922ea982
3
+ size 574563
requirements.txt CHANGED
@@ -3,4 +3,6 @@ embedchain
3
  langchain_huggingface
4
  watchdog
5
  pyppeteer
6
- beautifulsoup4
 
 
 
3
  langchain_huggingface
4
  watchdog
5
  pyppeteer
6
+ beautifulsoup4
7
+ faiss-cpu
8
+ uuid
ttv_web_scraper.py CHANGED
@@ -2,53 +2,22 @@ import re
2
  import asyncio
3
  import json
4
  import os
 
5
  import traceback
6
  from pyppeteer import launch
7
  from bs4 import BeautifulSoup
8
  import hashlib
9
- from ec_config import create_app
10
-
11
 
12
  CACHE_DIR = "cache/"
13
  if not os.path.exists(CACHE_DIR):
14
  os.makedirs(CACHE_DIR)
15
 
16
  DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
17
-
18
-
19
- def db_load_metadata_sets():
20
-
21
- content_hashes = set()
22
- speakers = set()
23
- companies = set()
24
- sentiments = set()
25
- subjects = set()
26
-
27
- if os.path.exists(DB_METADATA_FILE):
28
- with open(DB_METADATA_FILE, 'r') as f:
29
- metadata = json.load(f)
30
-
31
- content_hashes = set(metadata.get('content_hashes', []))
32
- speakers = set(metadata.get('speakers', []))
33
- companies = set(metadata.get('companies', []))
34
- sentiments = set(metadata.get('sentiments', []))
35
- subjects = set(metadata.get('subjects', []))
36
-
37
- return content_hashes, speakers, companies, sentiments, subjects
38
-
39
-
40
- def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
41
-
42
- metadata = {
43
- 'content_hashes': list(content_hashes),
44
- 'speakers': list(speakers),
45
- 'companies': list(companies),
46
- 'sentiments': list(sentiments),
47
- 'subjects': list(subjects)
48
- }
49
-
50
- with open(DB_METADATA_FILE, 'w') as f:
51
- json.dump(metadata, f, indent=2)
52
 
53
 
54
  async def get_client_rendered_content(url):
@@ -121,7 +90,7 @@ def read_json_from_file(filename):
121
 
122
  def extract_speaker_info(segment):
123
  try:
124
- pattern = r'(?P<speaker>(?:[A-Z][a-z]+ ){1,3}[A-Z][a-z]+), (?P<company>[A-Za-z\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
125
  match = re.match(pattern, segment)
126
  if match:
127
  return {key: value.strip() if value else None for key, value in match.groupdict().items()}
@@ -135,26 +104,79 @@ def extract_speaker_info(segment):
135
  raise Exception(f"Error extracting speaker info: {str(e)}")
136
 
137
 
 
 
 
 
 
 
 
 
 
 
 
138
  def parse_transcript(content):
139
  try:
140
  parsed_segments = []
141
- metadata = {}
142
- pattern = r'((?:[A-Z][a-z]+ ){1,3}[A-Z][a-z]+, [A-Za-z\s]+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
143
  segments = re.split(pattern, content)
144
  segments = [segment.strip() for segment in segments if segment.strip()]
145
- for segment in segments:
 
146
  speaker_info = extract_speaker_info(segment)
147
- if (speaker_info):
148
  if speaker_info['speaker']:
149
- metadata = speaker_info.copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  else:
151
- metadata = metadata.copy()
152
- metadata['timestamp'] = speaker_info['timestamp']
153
- else:
154
- parsed_segments.append({
155
- 'metadata': metadata,
156
- "text": segment
157
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  return parsed_segments
159
  except Exception as e:
160
  raise Exception(f"Error parsing transcript: {str(e)}")
@@ -200,8 +222,6 @@ async def process_url(url):
200
  print(f"Detailed error: {str(e)}")
201
  return None
202
 
203
- # This function can be used to process multiple URLs
204
-
205
 
206
  async def process_urls(urls):
207
  tasks = [process_url(url) for url in urls]
@@ -209,8 +229,8 @@ async def process_urls(urls):
209
 
210
 
211
  def main():
212
-
213
- app = create_app()
214
 
215
  url_file = "dsp-urls.txt" # File containing list of URLs
216
 
@@ -220,6 +240,10 @@ def main():
220
 
221
  content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()
222
 
 
 
 
 
223
  with open(url_file, 'r') as f:
224
  urls = [line.strip() for line in f if line.strip()]
225
 
@@ -243,22 +267,68 @@ def main():
243
 
244
  for entry in transcript:
245
  metadata.update(entry['metadata'])
246
- speakers.add(metadata['speaker'])
247
- companies.add(metadata['company'])
 
 
 
 
 
248
 
249
  text = entry['text']
250
 
251
- app.add(text, data_type='text', metadata=metadata)
 
 
 
 
 
252
 
253
  content_hashes.add(filename_hash)
254
  print(f"Added new url: {url}")
255
 
256
- # Save updated hashes
257
  save_metadata_sets(content_hashes, speakers,
258
  companies, sentiments, subjects)
259
 
 
 
260
  print("Processing complete. Check individual URL outputs for any errors.")
261
 
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  if __name__ == "__main__":
264
  main()
 
2
  import asyncio
3
  import json
4
  import os
5
+ import gc
6
  import traceback
7
  from pyppeteer import launch
8
  from bs4 import BeautifulSoup
9
  import hashlib
10
+ from ai_config_faiss import get_ai_assistant
 
11
 
12
  CACHE_DIR = "cache/"
13
  if not os.path.exists(CACHE_DIR):
14
  os.makedirs(CACHE_DIR)
15
 
16
  DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
17
+ SUBJECTS = [
18
+ "5G", "AI", "Innovation", "Network", "Enterprise", "Open RAN",
19
+ "TechCo", "B2B", "API", "Infrastructure", "Connectivity"
20
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
 
23
  async def get_client_rendered_content(url):
 
90
 
91
  def extract_speaker_info(segment):
92
  try:
93
+ pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
94
  match = re.match(pattern, segment)
95
  if match:
96
  return {key: value.strip() if value else None for key, value in match.groupdict().items()}
 
104
  raise Exception(f"Error extracting speaker info: {str(e)}")
105
 
106
 
107
def extract_subject_info(text):
    """Return the subjects from SUBJECTS that are mentioned in *text*.

    Matching is case-insensitive and anchored at word boundaries, so short
    subjects such as "AI" or "API" no longer fire on substrings of unrelated
    words (plain lowercase substring search matched "AI" inside "said" or
    "email", and "API" inside "rapid").

    Args:
        text: Free-form transcript text to scan.

    Returns:
        List of matched subject names, in SUBJECTS order.
    """
    found_subjects = [
        subject for subject in SUBJECTS
        if re.search(r'\b' + re.escape(subject) + r'\b', text, re.IGNORECASE)
    ]

    return found_subjects
116
+
117
+
118
def parse_transcript(content):
    """Split a transcript into segments tagged with speaker metadata.

    The transcript is cut on speaker headers.  Two header forms exist:
    a full "First Last, Company (hh:mm:ss):" header, which starts a new
    speaker, and a standalone "(hh:mm:ss):" header, which continues the
    current speaker with a new start timestamp.  Each emitted segment
    carries the text spoken between two headers plus metadata: speaker,
    company, start/end timestamps, and the subjects detected in the text.

    Args:
        content: Full transcript text.

    Returns:
        List of dicts with 'metadata' and 'text' keys.

    Raises:
        Exception: wrapping any parsing error (including a standalone
            timestamp appearing before any full speaker header).
    """

    def _build_segment(info, end_timestamp, text):
        # One finished segment: opened by the header captured in `info`,
        # closed at `end_timestamp` (the next header's timestamp, or a
        # sentinel for the final segment).
        return {
            'metadata': {
                'speaker': info['speaker'],
                'company': info['company'],
                'start_timestamp': info['timestamp'],
                'end_timestamp': end_timestamp,
                'subjects': extract_subject_info(text)
            },
            'text': text
        }

    try:
        parsed_segments = []
        saved_info = None  # header of the segment currently being accumulated
        pattern = r'((?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+,\s+[A-Za-z0-9\s]+\s+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
        segments = re.split(pattern, content)
        segments = [segment.strip() for segment in segments if segment.strip()]

        for i, segment in enumerate(segments):
            speaker_info = extract_speaker_info(segment)
            if speaker_info:
                # Any header closes the previous open segment; its text is
                # the chunk immediately preceding this header.
                if saved_info:
                    text = segments[i - 1] if i > 0 else ""
                    parsed_segments.append(
                        _build_segment(saved_info, speaker_info['timestamp'], text))
                if speaker_info['speaker']:
                    # Full speaker/company/timestamp header: new speaker.
                    saved_info = speaker_info
                else:
                    # Standalone timestamp: same speaker continues; only the
                    # start timestamp advances.  (If no speaker header has
                    # been seen yet, saved_info is None and this raises,
                    # surfacing as the wrapped Exception below.)
                    saved_info['timestamp'] = speaker_info['timestamp']
            # Plain text segments are consumed when the next header arrives.

        # Close the final open segment; there is no following header, so use
        # a sentinel end timestamp.
        if saved_info:
            parsed_segments.append(
                _build_segment(saved_info, "00:00:00", segments[-1]))

        return parsed_segments
    except Exception as e:
        raise Exception(f"Error parsing transcript: {str(e)}")
 
222
  print(f"Detailed error: {str(e)}")
223
  return None
224
 
 
 
225
 
226
  async def process_urls(urls):
227
  tasks = [process_url(url) for url in urls]
 
229
 
230
 
231
  def main():
232
+ global assistant
233
+ assistant = get_ai_assistant()
234
 
235
  url_file = "dsp-urls.txt" # File containing list of URLs
236
 
 
240
 
241
  content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()
242
 
243
+ # Convert companies to a dictionary of speaker sets if it's not already
244
+ if not isinstance(companies, dict):
245
+ companies = {company: set() for company in companies}
246
+
247
  with open(url_file, 'r') as f:
248
  urls = [line.strip() for line in f if line.strip()]
249
 
 
267
 
268
  for entry in transcript:
269
  metadata.update(entry['metadata'])
270
+ company = metadata['company']
271
+ speaker = metadata['speaker']
272
+ entry_subjects = metadata['subjects']
273
+
274
+ speakers.add(speaker)
275
+ # Add new subjects to the master set
276
+ subjects.update(entry_subjects)
277
 
278
  text = entry['text']
279
 
280
+ assistant.add_to_knowledge_base(
281
+ text, data_type='text', metadata=metadata.copy())
282
+
283
+ if company not in companies:
284
+ companies[company] = set()
285
+ companies[company].add(speaker)
286
 
287
  content_hashes.add(filename_hash)
288
  print(f"Added new url: {url}")
289
 
290
+ # Save updated hashes and metadata
291
  save_metadata_sets(content_hashes, speakers,
292
  companies, sentiments, subjects)
293
 
294
+ assistant.save()
295
+
296
  print("Processing complete. Check individual URL outputs for any errors.")
297
 
298
 
299
def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
    """Persist the crawler's metadata collections to DB_METADATA_FILE.

    Sets are converted to lists, and the company -> speaker-set mapping is
    converted to a mapping of lists, so the whole structure is
    JSON-serializable.

    Args:
        content_hashes: Set of hashes of already-processed content.
        speakers: Set of all speaker names seen.
        companies: Dict mapping company name -> set of its speakers.
        sentiments: Set of sentiment labels.
        subjects: Set of subject tags.
    """
    serializable = {
        'content_hashes': list(content_hashes),
        'speakers': list(speakers),
        'companies': {name: list(members) for name, members in companies.items()},
        'sentiments': list(sentiments),
        'subjects': list(subjects)
    }

    with open(DB_METADATA_FILE, 'w') as f:
        json.dump(serializable, f, indent=2)
310
+
311
+
312
def db_load_metadata_sets():
    """Load persisted crawler metadata from DB_METADATA_FILE.

    Returns:
        Tuple (content_hashes, speakers, companies, sentiments, subjects)
        where companies maps company name -> set of speaker names.  When the
        metadata file does not exist, every collection is empty.  When it
        exists but has no 'subjects' entry, subjects falls back to the
        SUBJECTS defaults.
    """
    if not os.path.exists(DB_METADATA_FILE):
        # No previous run: start with empty collections.
        return set(), set(), {}, set(), set()

    with open(DB_METADATA_FILE, 'r') as f:
        stored = json.load(f)

    companies = {name: set(members)
                 for name, members in stored.get('companies', {}).items()}

    return (
        set(stored.get('content_hashes', [])),
        set(stored.get('speakers', [])),
        companies,
        set(stored.get('sentiments', [])),
        set(stored.get('subjects', SUBJECTS)),
    )
331
+
332
+
333
  if __name__ == "__main__":
334
  main()