faiss filter initial commit

Files changed:
- ai_config_ec.py  +105 -0
- ai_config_faiss.py  +129 -0
- ai_config_faiss.py.llm-query  +112 -0
- app.py  +75 -73
- cache/db_metadata.json  +187 -80
- clean_db.sh  +5 -0
- db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/header.bin  +0 -3
- db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/index_metadata.pickle  +0 -3
- db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/length.bin  +0 -3
- db/chroma.sqlite3  +0 -3
- db/{37d02d4f-a72c-4faa-8e90-6cabbae354fe/data_level0.bin → faiss_index.faiss/index.faiss}  +2 -2
- db/{37d02d4f-a72c-4faa-8e90-6cabbae354fe/link_lists.bin → faiss_index.faiss/index.pkl}  +2 -2
- requirements.txt  +3 -1
- ttv_web_scraper.py  +128 -58

ai_config_ec.py
ADDED
@@ -0,0 +1,105 @@
+import os
+import streamlit as st
+from embedchain import App
+from typing import Dict, Any, List
+
+
+def timestamp_to_seconds(timestamp):
+    """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
+    parts = timestamp.split(':')
+    if len(parts) == 3:
+        h, m, s = map(int, parts)
+        ts = h * 3600 + m * 60 + s
+    elif len(parts) == 2:
+        m, s = map(int, parts)
+        ts = m * 60 + s
+    else:
+        raise ValueError(f"Invalid timestamp format: {timestamp}")
+
+    return ts
+
+
+class AIAssistant:
+    def __init__(self):
+        self.app = self._create_app()
+
+    def _get_api_key(self, name: str) -> str:
+        api_key = os.environ.get(name)
+        if not api_key:
+            api_key = st.secrets.get(name)
+        if not api_key:
+            raise ValueError(
+                f"{name} is not set. Please set it in your environment or Streamlit secrets.")
+        return api_key
+
+    def _create_config(self) -> Dict[str, Any]:
+        return {
+            'app': {
+                'config': {
+                    'name': 'ttv-ec'
+                }
+            },
+            'llm': {
+                'provider': 'huggingface',
+                'config': {
+                    'model': 'mistralai/Mistral-7B-Instruct-v0.2',
+                    'top_p': 0.5,
+                    'stream': False,
+                    'prompt': """You are an AI assistant that answers questions based solely on the information provided in your knowledge base.
+
+Question: $query
+Context: $context
+
+If the information to answer a question is not available in your knowledge base,
+respond with 'I don't have enough information to answer that question.'
+""",
+                    'api_key': self._get_api_key('HF_TOKEN')
+                }
+            },
+            'embedder': {
+                'provider': 'huggingface',
+                'config': {
+                    'model': 'sentence-transformers/all-mpnet-base-v2',
+                    'api_key': self._get_api_key('HF_TOKEN')
+                }
+            }
+        }
+
+    def _create_app(self) -> App:
+        config = self._create_config()
+        return App.from_config(config=config)
+
+    def save(self) -> None:
+        # null function: embedchain persists its own store
+        return
+
+    def add_to_knowledge_base(self, data: str, data_type: str, metadata: Dict[str, Any] = None) -> None:
+        self.app.add(data, data_type=data_type, metadata=metadata)
+
+    def query(self, question: str, num_results: int = 30, filters: Dict[str, Any] = None) -> Dict[str, List[Dict[str, Any]]]:
+        search_results = self.app.search(
+            question, num_documents=num_results, where=filters)
+        # Process and display search results
+        answer = "Here are the most relevant transcript excerpts:\n\n"
+        for i, result in enumerate(search_results['results'], 1):
+            metadata = result['metadata']
+            ts = timestamp_to_seconds(metadata['timestamp'])
+            yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={ts}"
+
+            speaker_info = (
+                f"Speaker: {metadata.get('speaker', 'Unknown')}, "
+                f"Company: {metadata.get('company', 'Unknown')}, "
+                f"Timestamp: {metadata.get('timestamp', 'Unknown')}"
+            )
+
+            answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
+            answer += f"{metadata.get('title', 'Unknown')} \n"
+            answer += f"\"{result['context']}\"\n\n"
+
+        return {'results': search_results}
+
+
+# Usage example
+
+
+def get_ai_assistant() -> AIAssistant:
+    return AIAssistant()
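
For reference, a minimal sketch of how this embedchain-backed class is meant to be driven (hypothetical data; assumes HF_TOKEN is set in the environment or Streamlit secrets, and that excerpts carry the metadata fields query() reads):

    from ai_config_ec import get_ai_assistant

    assistant = get_ai_assistant()

    # Index one transcript excerpt; 'youtube_id' and 'timestamp' are needed
    # by query() to build the timestamped YouTube link.
    assistant.add_to_knowledge_base(
        "We see Open RAN as central to our 5G strategy.",
        data_type='text',
        metadata={'speaker': 'Jane Doe', 'company': 'ExampleCo',
                  'timestamp': '00:12:34', 'youtube_id': 'abc123xyz',
                  'title': 'Example panel'})

    # Semantic search, restricted by embedchain's metadata where-clause
    result = assistant.query("What is the Open RAN strategy?",
                             num_results=5, filters={'company': 'ExampleCo'})
    print(result['results'])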

ai_config_faiss.py
ADDED
@@ -0,0 +1,129 @@
+import os
+from typing import Dict, Any, List
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain_core.documents import Document
+
+
+os.environ["LANGCHAIN_TRACING_V2"] = "true"
+
+DB_DIR = "db/"
+if not os.path.exists(DB_DIR):
+    os.makedirs(DB_DIR)
+
+
+def timestamp_to_seconds(timestamp):
+    """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
+    parts = timestamp.split(':')
+    if len(parts) == 3:
+        h, m, s = map(int, parts)
+        ts = h * 3600 + m * 60 + s
+    elif len(parts) == 2:
+        m, s = map(int, parts)
+        ts = m * 60 + s
+    else:
+        raise ValueError(f"Invalid timestamp format: {timestamp}")
+
+    return ts
+
+
+class FAISSAIAssistant:
+    def __init__(self, index_name: str = "faiss_index"):
+        self.index_name = f"{DB_DIR}{index_name}.faiss"
+        model_name = "sentence-transformers/all-mpnet-base-v2"
+        model_kwargs = {'device': 'cpu'}
+        encode_kwargs = {'normalize_embeddings': False}
+        self.embeddings = HuggingFaceEmbeddings(
+            model_name=model_name,
+            model_kwargs=model_kwargs,
+            encode_kwargs=encode_kwargs)
+        self.vector_store = self._create_app()
+
+    def _create_app(self):
+        if os.path.exists(self.index_name):
+            print("Loading existing FAISS index...")
+            return FAISS.load_local(self.index_name, self.embeddings,
+                                    allow_dangerous_deserialization=True)
+        else:
+            print("Creating new FAISS index...")
+            # Create an initial document with placeholder text
+            initial_texts = [
+                "This is an initial document to create the FAISS index."]
+            return FAISS.from_texts(initial_texts, self.embeddings)
+
+    def add_to_knowledge_base(self, data: str, data_type: str = None, metadata: Dict[str, Any] = None) -> None:
+        doc = Document(page_content=data, metadata=metadata or {})
+        self.vector_store.add_documents([doc])
+
+    def query(self, question: str, num_results: int = 30, filters: Dict[str, List[str]] = None) -> str:
+        all_docs = self.list_documents()
+
+        def match_any_filter(doc_metadata, filters):
+            if not filters:
+                return True
+            for key, values in filters.items():
+                if key in doc_metadata:
+                    doc_value = doc_metadata[key]
+                    if isinstance(doc_value, list):
+                        # If doc_value is a list, check if any item in doc_value is in values
+                        if any(item in values for item in doc_value):
+                            return True
+                    else:
+                        # If doc_value is a single string, check if it's in values
+                        if doc_value in values:
+                            return True
+            return False
+
+        filtered_docs = [
+            doc for doc in all_docs
+            if match_any_filter(doc['metadata'], filters)
+        ]
+
+        # Limit the number of results to num_results
+        filtered_docs = filtered_docs[:num_results]
+
+        answer = f"Here are the top {len(filtered_docs)} documents matching the filter:\n\n"
+        for i, doc in enumerate(filtered_docs, 1):
+            metadata = doc['metadata']
+            st_ts = timestamp_to_seconds(metadata['start_timestamp'])
+            yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={st_ts}"
+
+            speaker_info = (
+                f"Speaker: {metadata.get('speaker', 'Unknown')}, "
+                f"Company: {metadata.get('company', 'Unknown')}, "
+                f"Timestamp: {metadata.get('start_timestamp', 'Unknown')}"
+                f" - {metadata.get('end_timestamp', 'Unknown')}"
+            )
+
+            answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
+            answer += f"{metadata.get('title', 'Unknown')} \n"
+            answer += f"\"{doc['content']}\" \n\n"
+
+        return answer
+
+    def save(self):
+        self.vector_store.save_local(self.index_name)
+        print("FAISS index saved.")
+
+    def list_documents(self) -> List[Dict[str, Any]]:
+        """
+        List all documents in the FAISS vectorstore.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries, each containing 'content' and 'metadata' of a document.
+        """
+        documents = []
+        for doc_id, doc in self.vector_store.docstore._dict.items():
+            documents.append({
+                'id': doc_id,
+                'content': doc.page_content,
+                'metadata': doc.metadata
+            })
+        return documents
+
+
+# Usage example
+
+
+def get_ai_assistant(index_name: str = "faiss_index") -> FAISSAIAssistant:
    return FAISSAIAssistant(index_name)
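
Note the filter semantics here: query() ignores the question text and does no vector search at all; it walks list_documents() and keeps a document as soon as any one filter key matches, with list-valued metadata matching if any element appears in the requested values. A minimal sketch (hypothetical data; the placeholder index is created on first use):

    from ai_config_faiss import get_ai_assistant

    assistant = get_ai_assistant()
    assistant.add_to_knowledge_base(
        "Network APIs open new B2B revenue streams.",
        metadata={'speaker': 'Jane Doe', 'company': 'ExampleCo',
                  'start_timestamp': '01:02', 'end_timestamp': '03:04',
                  'youtube_id': 'abc123xyz', 'title': 'Example panel',
                  'subjects': ['API', 'B2B']})

    # 'company' does not match, but 'subjects' does, and keys are OR-ed,
    # so the document is returned.
    print(assistant.query("", num_results=5,
                          filters={'company': ['OtherCo'], 'subjects': ['API']}))

    assistant.save()  # persists db/faiss_index.faiss for the next run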

ai_config_faiss.py.llm-query
ADDED
@@ -0,0 +1,112 @@
+import os
+from typing import Dict, Any, List
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain_core.documents import Document
+
+os.environ["LANGCHAIN_TRACING_V2"] = "true"
+
+DB_DIR = "db/"
+if not os.path.exists(DB_DIR):
+    os.makedirs(DB_DIR)
+
+
+def timestamp_to_seconds(timestamp):
+    """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
+    parts = timestamp.split(':')
+    if len(parts) == 3:
+        h, m, s = map(int, parts)
+        ts = h * 3600 + m * 60 + s
+    elif len(parts) == 2:
+        m, s = map(int, parts)
+        ts = m * 60 + s
+    else:
+        raise ValueError(f"Invalid timestamp format: {timestamp}")
+
+    return ts
+
+
+class FAISSAIAssistant:
+    def __init__(self, index_name: str = "faiss_index"):
+        self.index_name = f"{DB_DIR}{index_name}.faiss"
+        model_name = "sentence-transformers/all-mpnet-base-v2"
+        model_kwargs = {'device': 'cpu'}
+        encode_kwargs = {'normalize_embeddings': False}
+        self.embeddings = HuggingFaceEmbeddings(
+            model_name=model_name,
+            model_kwargs=model_kwargs,
+            encode_kwargs=encode_kwargs)
+        self.vector_store = self._create_app()
+
+    def _create_app(self):
+        if os.path.exists(self.index_name):
+            print("Loading existing FAISS index...")
+            return FAISS.load_local(self.index_name, self.embeddings,
+                                    allow_dangerous_deserialization=True)
+        else:
+            print("Creating new FAISS index...")
+            # Create an initial document with placeholder text
+            initial_texts = [
+                "This is an initial document to create the FAISS index."]
+            return FAISS.from_texts(initial_texts, self.embeddings)
+
+    def add_to_knowledge_base(self, data: str, data_type: str = None, metadata: Dict[str, Any] = None) -> None:
+        doc = Document(page_content=data, metadata=metadata or {})
+        self.vector_store.add_documents([doc])
+
+    def query(self, filters: Dict[str, List[str]] = None) -> str:
+        all_docs = self.list_documents()
+
+        def match_filter(doc_metadata, filter_key, filter_values):
+            return doc_metadata.get(filter_key) in filter_values
+
+        filtered_docs = [
+            doc for doc in all_docs
+            if all(match_filter(doc['metadata'], k, v) for k, v in filters.items())
+        ] if filters else all_docs
+
+        answer = "Here are the documents matching the filter:\n\n"
+        for i, doc in enumerate(filtered_docs, 1):
+            metadata = doc['metadata']
+            st_ts = timestamp_to_seconds(metadata['start_timestamp'])
+            yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={st_ts}"
+
+            speaker_info = (
+                f"Speaker: {metadata.get('speaker', 'Unknown')}, "
+                f"Company: {metadata.get('company', 'Unknown')}, "
+                f"Timestamp: {metadata.get('start_timestamp', 'Unknown')}"
+                f" - {metadata.get('end_timestamp', 'Unknown')}"
+            )
+
+            answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
+            answer += f"{metadata.get('title', 'Unknown')} \n"
+            answer += f"\"{doc['content']}\" \n\n"
+
+        return answer
+
+    def save(self):
+        self.vector_store.save_local(self.index_name)
+        print("FAISS index saved.")
+
+    def list_documents(self) -> List[Dict[str, Any]]:
+        """
+        List all documents in the FAISS vectorstore.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries, each containing 'content' and 'metadata' of a document.
+        """
+        documents = []
+        for doc_id, doc in self.vector_store.docstore._dict.items():
+            documents.append({
+                'id': doc_id,
+                'content': doc.page_content,
+                'metadata': doc.metadata
+            })
+        return documents
+
+
+# Usage example
+
+
+def get_ai_assistant(index_name: str = "faiss_index") -> FAISSAIAssistant:
+    return FAISSAIAssistant(index_name)
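
Unlike the shipped ai_config_faiss.py, this variant ANDs the filter keys (all() across filters.items()) and tests exact membership, so list-valued metadata such as 'subjects' never matches: doc_metadata.get('subjects') returns the whole list, and a list is not an element of the filter values. A quick illustration of the difference (hypothetical metadata):

    meta = {'company': 'ExampleCo', 'subjects': ['API', 'B2B']}

    # This file's match_filter: exact membership per key, AND across keys
    print(meta.get('company') in ['ExampleCo'])   # True
    print(meta.get('subjects') in ['API'])        # False: whole list compared

    # ai_config_faiss.py's match_any_filter instead ORs keys and unpacks
    # list values, so the same document would match on 'subjects'.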

app.py
CHANGED
@@ -1,105 +1,107 @@
+from ai_config_faiss import get_ai_assistant
 from ttv_web_scraper import db_load_metadata_sets
 import streamlit as st
 import re
 
 
 @st.cache_resource
+def get_assistant():
+    return get_ai_assistant()
+
+
+def create_filter_panel(companies, sentiments, subjects):
+    st.header("Filter Options")
+
+    # Initialize session state for filters if not already present
+    if 'selected_companies' not in st.session_state:
+        st.session_state.selected_companies = []
+    if 'selected_speakers' not in st.session_state:
+        st.session_state.selected_speakers = []
+    if 'selected_subjects' not in st.session_state:
+        st.session_state.selected_subjects = []
+
+    # Add a checkbox to show/hide all filters
+    show_filters = st.checkbox("Show Filters", value=True)
+
+    if show_filters:
+        col1, col2, col3 = st.columns(3)
+
+        with col1:
+            st.subheader("Companies")
+            for company in companies.keys():
+                if st.checkbox(f"{company}", value=company in st.session_state.selected_companies):
+                    if company not in st.session_state.selected_companies:
+                        st.session_state.selected_companies.append(company)
+                elif company in st.session_state.selected_companies:
+                    st.session_state.selected_companies.remove(company)
+
+        with col2:
+            st.subheader("Speakers")
+            all_speakers = set()
+            for speakers in companies.values():
+                all_speakers.update(speakers)
+
+            for speaker in sorted(all_speakers):
+                if st.checkbox(speaker, value=speaker in st.session_state.selected_speakers):
+                    if speaker not in st.session_state.selected_speakers:
+                        st.session_state.selected_speakers.append(speaker)
+                elif speaker in st.session_state.selected_speakers:
+                    st.session_state.selected_speakers.remove(speaker)
+
+        with col3:
+            st.subheader("Subjects")
+            for subject in sorted(subjects):
+                if st.checkbox(subject, value=subject in st.session_state.selected_subjects):
+                    if subject not in st.session_state.selected_subjects:
+                        st.session_state.selected_subjects.append(subject)
+                elif subject in st.session_state.selected_subjects:
+                    st.session_state.selected_subjects.remove(subject)
 
     where = {}
-    if selected_subject:
-        where['subject'] = selected_subject
+    if st.session_state.selected_companies:
+        where['company'] = st.session_state.selected_companies
+    if st.session_state.selected_speakers:
+        where['speaker'] = st.session_state.selected_speakers
+    if st.session_state.selected_subjects:
+        where['subjects'] = st.session_state.selected_subjects
 
     return where
 
 
-# Streamlit app
-
-
 def main():
+    st.title("Telecom TV Video Expert")
 
     st.markdown(
         "Trained on data from [here](https://www.telecomtv.com/content/dsp-leaders-forum-videos/)")
 
     # Load metadata sets
+    _, _, companies, sentiments, subjects = db_load_metadata_sets()
 
     # Create filter panel
+    where = create_filter_panel(companies, sentiments, subjects)
-
-    # User input
-    user_query = st.text_input(
-        "Enter your question:", placeholder="e.g. What are people speaking about? or List all people speaking")
 
     # Add a slider for selecting the number of results
    num_results = st.slider("Number of relevant transcript excerpts to show:",
                            min_value=1, max_value=50, value=30, step=1)
 
+    # Add a submit button
+    submit_button = st.button("Submit")
+
+    if submit_button:
+        if not where:
+            st.warning(
+                "Please select at least one filter before submitting.")
+        else:
+            assistant = get_assistant()
 
-    msg_placeholder = st.empty()
-    msg_placeholder.markdown("Thinking...")
+            msg_placeholder = st.empty()
+            msg_placeholder.markdown("Thinking...")
 
-    # Use app.search() with the where parameter
-    search_results = app.search(
-        user_query, num_documents=num_results, where=where)
+            # Use assistant.query() instead of app.search()
+            response = assistant.query(
+                "", num_results=num_results, filters=where)
 
+            msg_placeholder.markdown(response)
 
 if __name__ == "__main__":
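
Launched as usual with streamlit run app.py, the panel's checkbox selections reduce to a where dict mapping metadata keys to the lists of selected values, which is passed straight to FAISSAIAssistant.query() as its filters argument. A hypothetical selection would produce:

    where = {
        'company': ['BT', 'Telefonica'],   # selected companies
        'speaker': ['Enrique Blanco'],     # selected speakers
        'subjects': ['AI', 'Open RAN'],    # selected subjects
    }

Since query() returns a markdown string, the response can be rendered directly with msg_placeholder.markdown(response).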

cache/db_metadata.json
CHANGED
@@ -1,102 +1,209 @@
 {
   "content_hashes": [
+    "4974b044047d2523c747b79d938915c9",
     "5754ba35c4f9f27e3e1d5b4d9bb972f2",
     "9ae73679959943c591be3d1c81b7c26c",
     "6286818c51fc82ffc065ba12d3c48c19",
     "e068f68ad0aed4134d075210f871ea95",
+    "f8f43b2e1413f709038506c3a2dfd7b9",
+    "d81ba6e90c2c42d82c4003c4d158d3e3"
   ],
   "speakers": [
     "Sadayuki Abeta",
     "Juan Manuel Caro",
     "Franz Seiser",
+    "Hasan Jafri",
+    "Enrique Blanco",
+    "Nik Willets",
     "Dean Dennis",
     "Sarwar Khan",
+    "Alfredo Musitani",
+    "Susan James",
+    "Gabriela Styf Sj\u00f6man",
+    "Alex Foster",
+    "Vishal Mathur",
+    "Sandeep Raithatha",
+    "Alexandra Foster",
+    "Harkirit Singh",
+    "Tom Burton",
+    "Laura Murphy",
     "Ahmed Hafez",
+    "Jose Antonio Martin Martinez",
+    "Francis Haysom",
+    "Atoosa Hatefi",
     "Amith Maharaj",
+    "Mallik Rao",
+    "Anita D\u00f6hler",
+    "Geoff Hollingworth",
+    "Abdu Mudesir",
+    "Akira Tada",
     "Matthias Fridstrom",
+    "Manish Singh",
+    "Guy Daniels",
     "Terje Jensen",
+    "Mark Henry",
+    "Luis Velarde Tazon",
+    "Colin Bannon",
+    "Dennis Hoffman",
+    "Michele Campriani",
+    "Andrew Coward",
+    "Ray Le Maistre",
     "Mojdeh Amani",
+    "Philippe Ensarguet",
+    "Amol Phadke",
+    "Chris Lewis",
+    "Sushil Rawat",
+    "Mark Gilmour",
+    "Francesca Serravalle",
+    "Robert Curran",
+    "Faiq Khan",
+    "Chivas Nambiar",
+    "Mirko Voltolini",
     "Komal Aggarwal",
+    "Madhukiran Medithe",
+    "Vivek Chadha",
+    "Yago Tenorio"
   ],
+  "companies": {
+    "TelecomTV": [
+      "Guy Daniels",
+      "Ray Le Maistre"
+    ],
+    "Connectivitree": [
+      "Mark Gilmour"
+    ],
+    "Telenor": [
+      "Amol Phadke",
+      "Terje Jensen"
+    ],
+    "Vodafone": [
+      "Komal Aggarwal",
+      "Yago Tenorio"
+    ],
+    "BT": [
+      "Sarwar Khan",
+      "Alex Foster",
+      "Gabriela Styf Sj\u00f6man",
+      "Mark Henry",
+      "Mojdeh Amani"
+    ],
+    "American Tower": [
+      "Susan James"
+    ],
+    "Dell Technologies": [
+      "Manish Singh",
+      "Dennis Hoffman"
+    ],
+    "AWS": [
+      "Chivas Nambiar"
+    ],
+    "VMO2 Business": [
+      "Sandeep Raithatha"
+    ],
+    "Deutsche Telekom": [
+      "Abdu Mudesir",
+      "Ahmed Hafez"
+    ],
+    "Telefonica": [
+      "Juan Manuel Caro",
+      "Enrique Blanco",
+      "Jose Antonio Martin Martinez",
+      "Luis Velarde Tazon",
+      "Mallik Rao"
+    ],
+    "SoftBank": [
+      "Akira Tada"
+    ],
+    "TM Forum": [
+      "Nik Willets"
+    ],
+    "Rakuten Mobile": [
+      "Madhukiran Medithe"
+    ],
+    "Appledore Research": [
+      "Francis Haysom",
+      "Robert Curran"
+    ],
+    "NGMN Alliance": [
+      "Anita D\u00f6hler"
+    ],
+    "Arelion": [
+      "Matthias Fridstrom"
+    ],
+    "Deutsche Telekom Technik": [
+      "Franz Seiser"
+    ],
+    "TELUS": [
+      "Hasan Jafri",
+      "Sushil Rawat"
+    ],
+    "Orange": [
+      "Atoosa Hatefi",
+      "Philippe Ensarguet"
+    ],
+    "BT Business": [
+      "Colin Bannon"
+    ],
+    "Telecom Argentina": [
+      "Alfredo Musitani"
+    ],
+    "Colt Technology": [
+      "Mirko Voltolini"
+    ],
+    "BT Group": [
+      "Laura Murphy"
+    ],
+    "MTN": [
+      "Amith Maharaj"
+    ],
+    "Vodafone UK": [
+      "Francesca Serravalle"
+    ],
+    "Verizon Business": [
+      "Dean Dennis"
+    ],
+    "Rakuten Symphony": [
+      "Faiq Khan",
+      "Geoff Hollingworth",
+      "Vivek Chadha"
+    ],
+    "Cambridge Management Consulting": [
+      "Tom Burton"
+    ],
+    "Ascend Digital Solutions": [
+      "Harkirit Singh"
+    ],
+    "IBM": [
+      "Andrew Coward"
+    ],
+    "Optiva": [
+      "Michele Campriani"
+    ],
+    "Telecom Infra Project": [
+      "Vishal Mathur"
+    ],
+    "NTT DOCOMO": [
+      "Sadayuki Abeta"
+    ],
+    "DSP Leaders Councillor": [
+      "Alexandra Foster"
+    ],
+    "Lewis Insight": [
+      "Chris Lewis"
+    ]
+  },
   "sentiments": [],
+  "subjects": [
+    "Connectivity",
+    "Infrastructure",
+    "5G",
+    "Enterprise",
+    "Network",
+    "Open RAN",
+    "TechCo",
+    "API",
+    "Innovation",
+    "B2B",
+    "AI"
+  ]
 }

clean_db.sh
ADDED
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+rm cache/db_metadata.json
+rm cache/cached_https_www.telecomtv.com_content_dsp-leaders-forum_*.json
+rm -rf db

db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/header.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2ffd87324d19f8f6366a4be4dccc22a83a50ca6837d1327fb660dc4b4e25d140
-size 100

db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/index_metadata.pickle
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:952bca23a4923000a81096d77fd1e39c4270e46f697bb4f2476c550ced3f2943
-size 99983

db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/length.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7063182d3e5741c59a41e3c9728f568cbcae4adbda7a9a560b3678335c630157
-size 4000

db/chroma.sqlite3
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:df787a6d1a483f410e5a820ca3278c0d0f10382f4102caec34b0257587055ae9
-size 12341248

db/{37d02d4f-a72c-4faa-8e90-6cabbae354fe/data_level0.bin → faiss_index.faiss/index.faiss}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c9a89a762d762a400d92c6bd6c4cc85f0b0f1841110a44e7038689592e8e91e4
+size 2408493

db/{37d02d4f-a72c-4faa-8e90-6cabbae354fe/link_lists.bin → faiss_index.faiss/index.pkl}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b336f2522b6c941539e3036a7cbf4f3d48ff14167508a220fa087a2c922ea982
+size 574563

requirements.txt
CHANGED
@@ -3,4 +3,6 @@ embedchain
 langchain_huggingface
 watchdog
 pyppeteer
-beautifulsoup4
+beautifulsoup4
+faiss-cpu
+uuid

ttv_web_scraper.py
CHANGED
@@ -2,53 +2,22 @@ import re
 import asyncio
 import json
 import os
+import gc
 import traceback
 from pyppeteer import launch
 from bs4 import BeautifulSoup
 import hashlib
+from ai_config_faiss import get_ai_assistant
 
 CACHE_DIR = "cache/"
 if not os.path.exists(CACHE_DIR):
     os.makedirs(CACHE_DIR)
 
 DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
+SUBJECTS = [
+    "5G", "AI", "Innovation", "Network", "Enterprise", "Open RAN",
+    "TechCo", "B2B", "API", "Infrastructure", "Connectivity"
+]
-
-
-def db_load_metadata_sets():
-
-    content_hashes = set()
-    speakers = set()
-    companies = set()
-    sentiments = set()
-    subjects = set()
-
-    if os.path.exists(DB_METADATA_FILE):
-        with open(DB_METADATA_FILE, 'r') as f:
-            metadata = json.load(f)
-
-        content_hashes = set(metadata.get('content_hashes', []))
-        speakers = set(metadata.get('speakers', []))
-        companies = set(metadata.get('companies', []))
-        sentiments = set(metadata.get('sentiments', []))
-        subjects = set(metadata.get('subjects', []))
-
-    return content_hashes, speakers, companies, sentiments, subjects
-
-
-def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
-
-    metadata = {
-        'content_hashes': list(content_hashes),
-        'speakers': list(speakers),
-        'companies': list(companies),
-        'sentiments': list(sentiments),
-        'subjects': list(subjects)
-    }
-
-    with open(DB_METADATA_FILE, 'w') as f:
-        json.dump(metadata, f, indent=2)
 
 
 async def get_client_rendered_content(url):

@@ -121,7 +90,7 @@ def read_json_from_file(filename):
 
 def extract_speaker_info(segment):
     try:
-        pattern = r'(?P<speaker>(?:[A-Z][a-
+        pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
         match = re.match(pattern, segment)
         if match:
             return {key: value.strip() if value else None for key, value in match.groupdict().items()}

@@ -135,26 +104,79 @@ def extract_speaker_info(segment):
         raise Exception(f"Error extracting speaker info: {str(e)}")
 
 
+def extract_subject_info(text):
+    # Convert text to lowercase for case-insensitive matching
+    lower_text = text.lower()
+
+    # Find all subjects present in the text
+    found_subjects = [
+        subject for subject in SUBJECTS if subject.lower() in lower_text]
+
+    return found_subjects
+
+
 def parse_transcript(content):
     try:
         parsed_segments = []
+        saved_info = None
-        pattern = r'((?:[A-Z][a-
+        pattern = r'((?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+,\s+[A-Za-z0-9\s]+\s+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
         segments = re.split(pattern, content)
         segments = [segment.strip() for segment in segments if segment.strip()]
+
+        for i, segment in enumerate(segments):
             speaker_info = extract_speaker_info(segment)
+            if speaker_info:
                 if speaker_info['speaker']:
+                    # Full speaker, company, timestamp format
+                    if saved_info:
+                        text = segments[i-1] if i > 0 else ""
+                        subjects = extract_subject_info(text)
+                        parsed_segments.append({
+                            'metadata': {
+                                'speaker': saved_info['speaker'],
+                                'company': saved_info['company'],
+                                'start_timestamp': saved_info['timestamp'],
+                                'end_timestamp': speaker_info['timestamp'],
+                                'subjects': subjects
+                            },
+                            'text': text
+                        })
+                    saved_info = speaker_info
                 else:
+                    # Standalone timestamp format
+                    if saved_info:
+                        text = segments[i-1] if i > 0 else ""
+                        subjects = extract_subject_info(text)
+                        parsed_segments.append({
+                            'metadata': {
+                                'speaker': saved_info['speaker'],
+                                'company': saved_info['company'],
+                                'start_timestamp': saved_info['timestamp'],
+                                'end_timestamp': speaker_info['timestamp'],
+                                'subjects': subjects
+                            },
+                            'text': text
+                        })
+                    saved_info['timestamp'] = speaker_info['timestamp']
+            elif saved_info:
+                # Text segment
+                continue
+
+        # Add final entry
+        if saved_info:
+            text = segments[-1]
+            subjects = extract_subject_info(text)
+            parsed_segments.append({
+                'metadata': {
+                    'speaker': saved_info['speaker'],
+                    'company': saved_info['company'],
+                    'start_timestamp': saved_info['timestamp'],
+                    'end_timestamp': "00:00:00",
+                    'subjects': subjects
+                },
+                'text': text
+            })
+
         return parsed_segments
     except Exception as e:
         raise Exception(f"Error parsing transcript: {str(e)}")

@@ -200,8 +222,6 @@ async def process_url(url):
         print(f"Detailed error: {str(e)}")
         return None
 
-# This function can be used to process multiple URLs
-
 
 async def process_urls(urls):
     tasks = [process_url(url) for url in urls]

@@ -209,8 +229,8 @@ async def process_urls(urls):
 
 
 def main():
+    global assistant
+    assistant = get_ai_assistant()
 
     url_file = "dsp-urls.txt"  # File containing list of URLs

@@ -220,6 +240,10 @@ def main():
 
     content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()
 
+    # Convert companies to a dictionary of speaker sets if it's not already
+    if not isinstance(companies, dict):
+        companies = {company: set() for company in companies}
+
     with open(url_file, 'r') as f:
         urls = [line.strip() for line in f if line.strip()]

@@ -243,22 +267,68 @@ def main():
 
         for entry in transcript:
             metadata.update(entry['metadata'])
+            company = metadata['company']
+            speaker = metadata['speaker']
+            entry_subjects = metadata['subjects']
+
+            speakers.add(speaker)
+            # Add new subjects to the master set
+            subjects.update(entry_subjects)
 
             text = entry['text']
 
+            assistant.add_to_knowledge_base(
+                text, data_type='text', metadata=metadata.copy())
+
+            if company not in companies:
+                companies[company] = set()
+            companies[company].add(speaker)
 
         content_hashes.add(filename_hash)
         print(f"Added new url: {url}")
 
-    # Save updated hashes
+    # Save updated hashes and metadata
     save_metadata_sets(content_hashes, speakers,
                        companies, sentiments, subjects)
 
+    assistant.save()
+
     print("Processing complete. Check individual URL outputs for any errors.")
 
 
+def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
+    metadata = {
+        'content_hashes': list(content_hashes),
+        'speakers': list(speakers),
+        'companies': {company: list(speakers) for company, speakers in companies.items()},
+        'sentiments': list(sentiments),
+        'subjects': list(subjects)
+    }
+
+    with open(DB_METADATA_FILE, 'w') as f:
+        json.dump(metadata, f, indent=2)
+
+
+def db_load_metadata_sets():
+    content_hashes = set()
+    speakers = set()
+    companies = {}
+    sentiments = set()
+    subjects = set()
+
+    if os.path.exists(DB_METADATA_FILE):
+        with open(DB_METADATA_FILE, 'r') as f:
+            metadata = json.load(f)
+
+        content_hashes = set(metadata.get('content_hashes', []))
+        speakers = set(metadata.get('speakers', []))
+        companies = {company: set(speakers) for company, speakers in metadata.get(
+            'companies', {}).items()}
+        sentiments = set(metadata.get('sentiments', []))
+        subjects = set(metadata.get('subjects', SUBJECTS))
+
+    return content_hashes, speakers, companies, sentiments, subjects
+
+
 if __name__ == "__main__":
     main()
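
As a quick check of the speaker pattern, a hypothetical transcript segment in the format the scraper expects parses as follows:

    import re

    pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'

    segment = "Enrique Blanco, Telefonica (00:01:23): We are building an AI-native network."
    match = re.match(pattern, segment)
    print({k: v.strip() if v else None for k, v in match.groupdict().items()})
    # {'speaker': 'Enrique Blanco', 'company': 'Telefonica', 'timestamp': '00:01:23'}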