eusholli commited on
Commit
0540b53
·
1 Parent(s): 76ebe2e

faiss filter initial commit

Browse files
ai_config_ec.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from embedchain import App
4
+ from typing import Dict, Any, List
5
+
6
+
7
+ def timestamp_to_seconds(timestamp):
8
+ """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
9
+ parts = timestamp.split(':')
10
+ if len(parts) == 3:
11
+ h, m, s = map(int, parts)
12
+ ts = h * 3600 + m * 60 + s
13
+ elif len(parts) == 2:
14
+ m, s = map(int, parts)
15
+ ts = m * 60 + s
16
+ else:
17
+ raise ValueError(f"Invalid timestamp format: {timestamp}")
18
+
19
+ return ts
20
+
21
+
22
+ class AIAssistant:
23
+ def __init__(self):
24
+ self.app = self._create_app()
25
+
26
+ def _get_api_key(self, name: str) -> str:
27
+ api_key = os.environ.get(name)
28
+ if not api_key:
29
+ api_key = st.secrets.get(name)
30
+ if not api_key:
31
+ raise ValueError(
32
+ f"{name} is not set. Please set it in your environment or Streamlit secrets.")
33
+ return api_key
34
+
35
+ def _create_config(self) -> Dict[str, Any]:
36
+ return {
37
+ 'app': {
38
+ 'config': {
39
+ 'name': 'ttv-ec'
40
+ }
41
+ },
42
+ 'llm': {
43
+ 'provider': 'huggingface',
44
+ 'config': {
45
+ 'model': 'mistralai/Mistral-7B-Instruct-v0.2',
46
+ 'top_p': 0.5,
47
+ 'stream': False,
48
+ 'prompt': """You are an AI assistant that answers questions based solely on the information provided in your knowledge base.
49
+
50
+ Question: $query
51
+ Context: $context
52
+
53
+ If the information to answer a question is not available in your knowledge base,
54
+ respond with 'I don't have enough information to answer that question.
55
+ """,
56
+ 'api_key': self._get_api_key('HF_TOKEN')
57
+ }
58
+ },
59
+ 'embedder': {
60
+ 'provider': 'huggingface',
61
+ 'config': {
62
+ 'model': 'sentence-transformers/all-mpnet-base-v2',
63
+ 'api_key': self._get_api_key('HF_TOKEN')
64
+ }
65
+ }
66
+ }
67
+
68
+ def _create_app(self) -> App:
69
+ config = self._create_config()
70
+ return App.from_config(config=config)
71
+
72
+ def save(self) -> None:
73
+ # null function
74
+ return
75
+
76
+ def add_to_knowledge_base(self, data: str, data_type: str, metadata: Dict[str, Any] = None) -> None:
77
+ self.app.add(data, data_type=data_type, metadata=metadata)
78
+
79
+ def query(self, question: str, num_results: int = 30, filters: Dict[str, Any] = None) -> Dict[str, List[Dict[str, Any]]]:
80
+ search_results = self.app.search(
81
+ question, num_documents=num_results, where=filters)
82
+ # Process and display search results
83
+ answer = "Here are the most relevant transcript excerpts:\n\n"
84
+ for i, result in enumerate(search_results['results'], 1):
85
+ metadata = result['metadata']
86
+ ts = timestamp_to_seconds(metadata['timestamp'])
87
+ yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={ts}"
88
+
89
+ speaker_info = (
90
+ f"Speaker: {metadata.get('speaker', 'Unknown')}, "
91
+ f"Company: {metadata.get('company', 'Unknown')}, "
92
+ f"Timestamp: {metadata.get('timestamp', 'Unknown')}"
93
+ )
94
+
95
+ answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
96
+ answer += f"{metadata.get('title', 'Unknown')} \n"
97
+ answer += f"\"{result['context']}\"\n\n"
98
+
99
+ return {'results': search_results}
100
+
101
+ # Usage example
102
+
103
+
104
+ def get_ai_assistant() -> AIAssistant:
105
+ return AIAssistant()
ai_config_faiss.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, Any, List
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain_core.documents import Document
6
+
7
+
8
+ os.environ["LANGCHAIN_TRACING_V2"] = "true"
9
+
10
+ DB_DIR = "db/"
11
+ if not os.path.exists(DB_DIR):
12
+ os.makedirs(DB_DIR)
13
+
14
+
15
+ def timestamp_to_seconds(timestamp):
16
+ """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
17
+ parts = timestamp.split(':')
18
+ if len(parts) == 3:
19
+ h, m, s = map(int, parts)
20
+ ts = h * 3600 + m * 60 + s
21
+ elif len(parts) == 2:
22
+ m, s = map(int, parts)
23
+ ts = m * 60 + s
24
+ else:
25
+ raise ValueError(f"Invalid timestamp format: {timestamp}")
26
+
27
+ return ts
28
+
29
+
30
+ class FAISSAIAssistant:
31
+ def __init__(self, index_name: str = "faiss_index"):
32
+ self.index_name = f"{DB_DIR}{index_name}.faiss"
33
+ model_name = "sentence-transformers/all-mpnet-base-v2"
34
+ model_kwargs = {'device': 'cpu'}
35
+ encode_kwargs = {'normalize_embeddings': False}
36
+ self.embeddings = HuggingFaceEmbeddings(
37
+ model_name=model_name,
38
+ model_kwargs=model_kwargs,
39
+ encode_kwargs=encode_kwargs)
40
+ self.vector_store = self._create_app()
41
+
42
+ def _create_app(self):
43
+ if os.path.exists(self.index_name):
44
+ print("Loading existing FAISS index...")
45
+ return FAISS.load_local(self.index_name, self.embeddings,
46
+ allow_dangerous_deserialization=True)
47
+ else:
48
+ print("Creating new FAISS index...")
49
+ # Create an initial document with placeholder text
50
+ initial_texts = [
51
+ "This is an initial document to create the FAISS index."]
52
+ return FAISS.from_texts(initial_texts, self.embeddings)
53
+
54
+ def add_to_knowledge_base(self, data: str, data_type: str = None, metadata: Dict[str, Any] = None) -> None:
55
+ doc = Document(page_content=data, metadata=metadata or {})
56
+ self.vector_store.add_documents([doc])
57
+
58
+ def query(self, question: str, num_results: int = 30, filters: Dict[str, List[str]] = None) -> str:
59
+ all_docs = self.list_documents()
60
+
61
+ def match_any_filter(doc_metadata, filters):
62
+ if not filters:
63
+ return True
64
+ for key, values in filters.items():
65
+ if key in doc_metadata:
66
+ doc_value = doc_metadata[key]
67
+ if isinstance(doc_value, list):
68
+ # If doc_value is a list, check if any item in doc_value is in values
69
+ if any(item in values for item in doc_value):
70
+ return True
71
+ else:
72
+ # If doc_value is a single string, check if it's in values
73
+ if doc_value in values:
74
+ return True
75
+ return False
76
+
77
+ filtered_docs = [
78
+ doc for doc in all_docs
79
+ if match_any_filter(doc['metadata'], filters)
80
+ ]
81
+
82
+ # Limit the number of results to num_results
83
+ filtered_docs = filtered_docs[:num_results]
84
+
85
+ answer = f"Here are the top {
86
+ len(filtered_docs)} documents matching the filter:\n\n"
87
+ for i, doc in enumerate(filtered_docs, 1):
88
+ metadata = doc['metadata']
89
+ st_ts = timestamp_to_seconds(metadata['start_timestamp'])
90
+ yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={st_ts}"
91
+
92
+ speaker_info = (
93
+ f"Speaker: {metadata.get('speaker', 'Unknown')}, "
94
+ f"Company: {metadata.get('company', 'Unknown')}, "
95
+ f"Timestamp: {metadata.get('start_timestamp', 'Unknown')}"
96
+ f" - {metadata.get('end_timestamp', 'Unknown')}"
97
+ )
98
+
99
+ answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
100
+ answer += f"{metadata.get('title', 'Unknown')} \n"
101
+ answer += f"\"{doc['content']}\" \n\n"
102
+
103
+ return answer
104
+
105
+ def save(self):
106
+ self.vector_store.save_local(self.index_name)
107
+ print("FAISS index saved.")
108
+
109
+ def list_documents(self) -> List[Dict[str, Any]]:
110
+ """
111
+ List all documents in the FAISS vectorstore.
112
+
113
+ Returns:
114
+ List[Dict[str, Any]]: A list of dictionaries, each containing 'content' and 'metadata' of a document.
115
+ """
116
+ documents = []
117
+ for doc_id, doc in self.vector_store.docstore._dict.items():
118
+ documents.append({
119
+ 'id': doc_id,
120
+ 'content': doc.page_content,
121
+ 'metadata': doc.metadata
122
+ })
123
+ return documents
124
+
125
+ # Usage example
126
+
127
+
128
+ def get_ai_assistant(index_name: str = "faiss_index") -> FAISSAIAssistant:
129
+ return FAISSAIAssistant(index_name)
ai_config_faiss.py.llm-query ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, Any, List
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain_core.documents import Document
6
+
7
+ os.environ["LANGCHAIN_TRACING_V2"] = "true"
8
+
9
+ DB_DIR = "db/"
10
+ if not os.path.exists(DB_DIR):
11
+ os.makedirs(DB_DIR)
12
+
13
+
14
+ def timestamp_to_seconds(timestamp):
15
+ """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
16
+ parts = timestamp.split(':')
17
+ if len(parts) == 3:
18
+ h, m, s = map(int, parts)
19
+ ts = h * 3600 + m * 60 + s
20
+ elif len(parts) == 2:
21
+ m, s = map(int, parts)
22
+ ts = m * 60 + s
23
+ else:
24
+ raise ValueError(f"Invalid timestamp format: {timestamp}")
25
+
26
+ return ts
27
+
28
+
29
+ class FAISSAIAssistant:
30
+ def __init__(self, index_name: str = "faiss_index"):
31
+ self.index_name = f"{DB_DIR}{index_name}.faiss"
32
+ model_name = "sentence-transformers/all-mpnet-base-v2"
33
+ model_kwargs = {'device': 'cpu'}
34
+ encode_kwargs = {'normalize_embeddings': False}
35
+ self.embeddings = HuggingFaceEmbeddings(
36
+ model_name=model_name,
37
+ model_kwargs=model_kwargs,
38
+ encode_kwargs=encode_kwargs)
39
+ self.vector_store = self._create_app()
40
+
41
+ def _create_app(self):
42
+ if os.path.exists(self.index_name):
43
+ print("Loading existing FAISS index...")
44
+ return FAISS.load_local(self.index_name, self.embeddings,
45
+ allow_dangerous_deserialization=True)
46
+ else:
47
+ print("Creating new FAISS index...")
48
+ # Create an initial document with placeholder text
49
+ initial_texts = [
50
+ "This is an initial document to create the FAISS index."]
51
+ return FAISS.from_texts(initial_texts, self.embeddings)
52
+
53
+ def add_to_knowledge_base(self, data: str, data_type: str = None, metadata: Dict[str, Any] = None) -> None:
54
+ doc = Document(page_content=data, metadata=metadata or {})
55
+ self.vector_store.add_documents([doc])
56
+
57
+ def query(self, filters: Dict[str, List[str]] = None) -> str:
58
+ all_docs = self.list_documents()
59
+
60
+ def match_filter(doc_metadata, filter_key, filter_values):
61
+ return doc_metadata.get(filter_key) in filter_values
62
+
63
+ filtered_docs = [
64
+ doc for doc in all_docs
65
+ if all(match_filter(doc['metadata'], k, v) for k, v in filters.items())
66
+ ] if filters else all_docs
67
+
68
+ answer = "Here are the documents matching the filter:\n\n"
69
+ for i, doc in enumerate(filtered_docs, 1):
70
+ metadata = doc['metadata']
71
+ st_ts = timestamp_to_seconds(metadata['start_timestamp'])
72
+ yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={st_ts}"
73
+
74
+ speaker_info = (
75
+ f"Speaker: {metadata.get('speaker', 'Unknown')}, "
76
+ f"Company: {metadata.get('company', 'Unknown')}, "
77
+ f"Timestamp: {metadata.get('start_timestamp', 'Unknown')}"
78
+ f" - {metadata.get('end_timestamp', 'Unknown')}"
79
+ )
80
+
81
+ answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
82
+ answer += f"{metadata.get('title', 'Unknown')} \n"
83
+ answer += f"\"{doc['content']}\" \n\n"
84
+
85
+ return answer
86
+
87
+ def save(self):
88
+ self.vector_store.save_local(self.index_name)
89
+ print("FAISS index saved.")
90
+
91
+ def list_documents(self) -> List[Dict[str, Any]]:
92
+ """
93
+ List all documents in the FAISS vectorstore.
94
+
95
+ Returns:
96
+ List[Dict[str, Any]]: A list of dictionaries, each containing 'content' and 'metadata' of a document.
97
+ """
98
+ documents = []
99
+ for doc_id, doc in self.vector_store.docstore._dict.items():
100
+ documents.append({
101
+ 'id': doc_id,
102
+ 'content': doc.page_content,
103
+ 'metadata': doc.metadata
104
+ })
105
+ return documents
106
+
107
+
108
+ # Usage example
109
+
110
+
111
+ def get_ai_assistant(index_name: str = "faiss_index") -> FAISSAIAssistant:
112
+ return FAISSAIAssistant(index_name)
app.py CHANGED
@@ -1,105 +1,107 @@
1
- from ec_config import create_app
2
  from ttv_web_scraper import db_load_metadata_sets
3
  import streamlit as st
4
  import re
5
 
6
 
7
  @st.cache_resource
8
- def embedchain_bot():
9
- return create_app() # Use the create_app function from config.py
10
-
11
-
12
- def timestamp_to_seconds(timestamp):
13
- """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds."""
14
- parts = timestamp.split(':')
15
- if len(parts) == 3:
16
- h, m, s = map(int, parts)
17
- ts = h * 3600 + m * 60 + s
18
- elif len(parts) == 2:
19
- m, s = map(int, parts)
20
- ts = m * 60 + s
21
- else:
22
- raise ValueError(f"Invalid timestamp format: {timestamp}")
23
-
24
- return ts
25
-
26
-
27
- def create_filter_panel(speakers, companies, sentiments, subjects):
28
- st.sidebar.header("Filter Options")
29
-
30
- selected_speaker = st.sidebar.selectbox(
31
- "Select Speaker", [""] + list(speakers))
32
- selected_company = st.sidebar.selectbox(
33
- "Select Company", [""] + list(companies))
34
- selected_sentiment = st.sidebar.selectbox(
35
- "Select Sentiment", [""] + list(sentiments))
36
- selected_subject = st.sidebar.selectbox(
37
- "Select Subject", [""] + list(subjects))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  where = {}
40
- if selected_speaker:
41
- where['speaker'] = selected_speaker
42
- if selected_company:
43
- where['company'] = selected_company
44
- if selected_sentiment:
45
- where['sentiment'] = selected_sentiment
46
- if selected_subject:
47
- where['subject'] = selected_subject
48
 
49
  return where
50
 
51
 
52
- # Streamlit app
53
-
54
-
55
  def main():
56
- st.title("DSP Leaders World Forum 2024 ChatBot")
57
 
58
  st.markdown(
59
  "Trained on data from [here](https://www.telecomtv.com/content/dsp-leaders-forum-videos/)")
60
 
61
  # Load metadata sets
62
- _, speakers, companies, sentiments, subjects = db_load_metadata_sets()
63
 
64
  # Create filter panel
65
- where = create_filter_panel(speakers, companies, sentiments, subjects)
66
-
67
- # User input
68
- user_query = st.text_input(
69
- "Enter your question:", placeholder="e.g. What are people speaking about? or List all people speaking")
70
 
71
  # Add a slider for selecting the number of results
72
  num_results = st.slider("Number of relevant transcript excerpts to show:",
73
  min_value=1, max_value=50, value=30, step=1)
74
 
75
- if user_query:
76
- app = embedchain_bot()
77
-
78
- msg_placeholder = st.empty()
79
- msg_placeholder.markdown("Thinking...")
80
-
81
- # Use app.search() with the where parameter
82
- search_results = app.search(
83
- user_query, num_documents=num_results, where=where)
84
 
85
- # Process and display search results
86
- answer = "Here are the most relevant transcript excerpts:\n\n"
87
- for i, result in enumerate(search_results, 1):
88
- metadata = result['metadata']
89
- ts = timestamp_to_seconds(metadata['timestamp'])
90
- yt_url = f"https://youtu.be/{metadata['youtube_id']}?t={ts}"
91
 
92
- speaker_info = (
93
- f"Speaker: {metadata.get('speaker', 'Unknown')}, "
94
- f"Company: {metadata.get('company', 'Unknown')}, "
95
- f"Timestamp: {metadata.get('timestamp', 'Unknown')}"
96
- )
97
 
98
- answer += f"{i}. [Speaker Info: {speaker_info}]({yt_url}) \n"
99
- answer += f"{metadata.get('title', 'Unknown')} \n"
100
- answer += f"\"{result['context']}\"\n\n"
101
 
102
- msg_placeholder.markdown(answer)
103
 
104
 
105
  if __name__ == "__main__":
 
1
+ from ai_config_faiss import get_ai_assistant
2
  from ttv_web_scraper import db_load_metadata_sets
3
  import streamlit as st
4
  import re
5
 
6
 
7
  @st.cache_resource
8
+ def get_assistant():
9
+ return get_ai_assistant()
10
+
11
+
12
+ def create_filter_panel(companies, sentiments, subjects):
13
+ st.header("Filter Options")
14
+
15
+ # Initialize session state for filters if not already present
16
+ if 'selected_companies' not in st.session_state:
17
+ st.session_state.selected_companies = []
18
+ if 'selected_speakers' not in st.session_state:
19
+ st.session_state.selected_speakers = []
20
+ if 'selected_subjects' not in st.session_state:
21
+ st.session_state.selected_subjects = []
22
+
23
+ # Add a checkbox to show/hide all filters
24
+ show_filters = st.checkbox("Show Filters", value=True)
25
+
26
+ if show_filters:
27
+ col1, col2, col3 = st.columns(3)
28
+
29
+ with col1:
30
+ st.subheader("Companies")
31
+ for company in companies.keys():
32
+ if st.checkbox(f"{company}", value=company in st.session_state.selected_companies):
33
+ if company not in st.session_state.selected_companies:
34
+ st.session_state.selected_companies.append(company)
35
+ elif company in st.session_state.selected_companies:
36
+ st.session_state.selected_companies.remove(company)
37
+
38
+ with col2:
39
+ st.subheader("Speakers")
40
+ all_speakers = set()
41
+ for speakers in companies.values():
42
+ all_speakers.update(speakers)
43
+
44
+ for speaker in sorted(all_speakers):
45
+ if st.checkbox(speaker, value=speaker in st.session_state.selected_speakers):
46
+ if speaker not in st.session_state.selected_speakers:
47
+ st.session_state.selected_speakers.append(speaker)
48
+ elif speaker in st.session_state.selected_speakers:
49
+ st.session_state.selected_speakers.remove(speaker)
50
+
51
+ with col3:
52
+ st.subheader("Subjects")
53
+ for subject in sorted(subjects):
54
+ if st.checkbox(subject, value=subject in st.session_state.selected_subjects):
55
+ if subject not in st.session_state.selected_subjects:
56
+ st.session_state.selected_subjects.append(subject)
57
+ elif subject in st.session_state.selected_subjects:
58
+ st.session_state.selected_subjects.remove(subject)
59
 
60
  where = {}
61
+ if st.session_state.selected_companies:
62
+ where['company'] = st.session_state.selected_companies
63
+ if st.session_state.selected_speakers:
64
+ where['speaker'] = st.session_state.selected_speakers
65
+ if st.session_state.selected_subjects:
66
+ where['subjects'] = st.session_state.selected_subjects
 
 
67
 
68
  return where
69
 
70
 
 
 
 
71
  def main():
72
+ st.title("Telecom TV Video Expert")
73
 
74
  st.markdown(
75
  "Trained on data from [here](https://www.telecomtv.com/content/dsp-leaders-forum-videos/)")
76
 
77
  # Load metadata sets
78
+ _, _, companies, sentiments, subjects = db_load_metadata_sets()
79
 
80
  # Create filter panel
81
+ where = create_filter_panel(companies, sentiments, subjects)
 
 
 
 
82
 
83
  # Add a slider for selecting the number of results
84
  num_results = st.slider("Number of relevant transcript excerpts to show:",
85
  min_value=1, max_value=50, value=30, step=1)
86
 
87
+ # Add a submit button
88
+ submit_button = st.button("Submit")
 
 
 
 
 
 
 
89
 
90
+ if submit_button:
91
+ if not where:
92
+ st.warning(
93
+ "Please select at least one filter before submitting.")
94
+ else:
95
+ assistant = get_assistant()
96
 
97
+ msg_placeholder = st.empty()
98
+ msg_placeholder.markdown("Thinking...")
 
 
 
99
 
100
+ # Use assistant.query() instead of app.search()
101
+ response = assistant.query(
102
+ "", num_results=num_results, filters=where)
103
 
104
+ msg_placeholder.markdown(response)
105
 
106
 
107
  if __name__ == "__main__":
cache/db_metadata.json CHANGED
@@ -1,102 +1,209 @@
1
  {
2
  "content_hashes": [
3
- "d81ba6e90c2c42d82c4003c4d158d3e3",
4
  "5754ba35c4f9f27e3e1d5b4d9bb972f2",
5
- "f8f43b2e1413f709038506c3a2dfd7b9",
6
  "9ae73679959943c591be3d1c81b7c26c",
7
  "6286818c51fc82ffc065ba12d3c48c19",
8
  "e068f68ad0aed4134d075210f871ea95",
9
- "4974b044047d2523c747b79d938915c9"
 
10
  ],
11
  "speakers": [
12
- "Atoosa Hatefi",
13
- "Robert Curran",
14
- "Amol Phadke",
15
- "Colin Bannon",
16
- "Abdu Mudesir",
17
- "Hasan Jafri",
18
- "Alex Foster",
19
- "Madhukiran Medithe",
20
- "Yago Tenorio",
21
- "Dennis Hoffman",
22
- "Mark Henry",
23
- "Alfredo Musitani",
24
- "Vivek Chadha",
25
- "Faiq Khan",
26
- "Susan James",
27
- "Mirko Voltolini",
28
- "Guy Daniels",
29
- "Michele Campriani",
30
  "Sadayuki Abeta",
31
- "Chivas Nambiar",
32
- "Philippe Ensarguet",
33
- "Alexandra Foster",
34
- "Tom Burton",
35
  "Juan Manuel Caro",
36
- "Andrew Coward",
37
- "Harkirit Singh",
38
- "Ray Le Maistre",
39
- "Enrique Blanco",
40
- "Mark Gilmour",
41
- "Luis Velarde Tazon",
42
- "Vishal Mathur",
43
  "Franz Seiser",
44
- "Jose Antonio Martin Martinez",
45
- "Chris Lewis",
 
46
  "Dean Dennis",
47
- "Sushil Rawat",
48
  "Sarwar Khan",
 
 
 
 
 
 
 
 
 
 
49
  "Ahmed Hafez",
50
- "Nik Willets",
 
 
51
  "Amith Maharaj",
 
 
 
 
 
52
  "Matthias Fridstrom",
53
- "Francesca Serravalle",
54
- "Francis Haysom",
55
  "Terje Jensen",
56
- "Akira Tada",
57
- "Laura Murphy",
 
 
 
 
 
58
  "Mojdeh Amani",
59
- "Manish Singh",
 
 
 
 
 
 
 
 
 
60
  "Komal Aggarwal",
61
- "Geoff Hollingworth",
62
- "Mallik Rao"
63
- ],
64
- "companies": [
65
- "Verizon Business",
66
- "Appledore Research",
67
- "Cambridge Management Consulting",
68
- "TM Forum",
69
- "Rakuten Mobile",
70
- "Rakuten Symphony",
71
- "AWS",
72
- "Arelion",
73
- "Vodafone",
74
- "Telecom Argentina",
75
- "Vodafone UK",
76
- "BT Business",
77
- "NTT DOCOMO",
78
- "Lewis Insight",
79
- "Deutsche Telekom Technik",
80
- "American Tower",
81
- "Deutsche Telekom",
82
- "SoftBank",
83
- "Telecom Infra Project",
84
- "Telefonica",
85
- "MTN",
86
- "IBM",
87
- "Colt Technology",
88
- "TelecomTV",
89
- "Telenor",
90
- "BT",
91
- "BT Group",
92
- "Ascend Digital Solutions",
93
- "Orange",
94
- "DSP Leaders Councillor",
95
- "Optiva",
96
- "TELUS",
97
- "Dell Technologies",
98
- "Connectivitree"
99
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  "sentiments": [],
101
- "subjects": []
 
 
 
 
 
 
 
 
 
 
 
 
102
  }
 
1
  {
2
  "content_hashes": [
3
+ "4974b044047d2523c747b79d938915c9",
4
  "5754ba35c4f9f27e3e1d5b4d9bb972f2",
 
5
  "9ae73679959943c591be3d1c81b7c26c",
6
  "6286818c51fc82ffc065ba12d3c48c19",
7
  "e068f68ad0aed4134d075210f871ea95",
8
+ "f8f43b2e1413f709038506c3a2dfd7b9",
9
+ "d81ba6e90c2c42d82c4003c4d158d3e3"
10
  ],
11
  "speakers": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  "Sadayuki Abeta",
 
 
 
 
13
  "Juan Manuel Caro",
 
 
 
 
 
 
 
14
  "Franz Seiser",
15
+ "Hasan Jafri",
16
+ "Enrique Blanco",
17
+ "Nik Willets",
18
  "Dean Dennis",
 
19
  "Sarwar Khan",
20
+ "Alfredo Musitani",
21
+ "Susan James",
22
+ "Gabriela Styf Sj\u00f6man",
23
+ "Alex Foster",
24
+ "Vishal Mathur",
25
+ "Sandeep Raithatha",
26
+ "Alexandra Foster",
27
+ "Harkirit Singh",
28
+ "Tom Burton",
29
+ "Laura Murphy",
30
  "Ahmed Hafez",
31
+ "Jose Antonio Martin Martinez",
32
+ "Francis Haysom",
33
+ "Atoosa Hatefi",
34
  "Amith Maharaj",
35
+ "Mallik Rao",
36
+ "Anita D\u00f6hler",
37
+ "Geoff Hollingworth",
38
+ "Abdu Mudesir",
39
+ "Akira Tada",
40
  "Matthias Fridstrom",
41
+ "Manish Singh",
42
+ "Guy Daniels",
43
  "Terje Jensen",
44
+ "Mark Henry",
45
+ "Luis Velarde Tazon",
46
+ "Colin Bannon",
47
+ "Dennis Hoffman",
48
+ "Michele Campriani",
49
+ "Andrew Coward",
50
+ "Ray Le Maistre",
51
  "Mojdeh Amani",
52
+ "Philippe Ensarguet",
53
+ "Amol Phadke",
54
+ "Chris Lewis",
55
+ "Sushil Rawat",
56
+ "Mark Gilmour",
57
+ "Francesca Serravalle",
58
+ "Robert Curran",
59
+ "Faiq Khan",
60
+ "Chivas Nambiar",
61
+ "Mirko Voltolini",
62
  "Komal Aggarwal",
63
+ "Madhukiran Medithe",
64
+ "Vivek Chadha",
65
+ "Yago Tenorio"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  ],
67
+ "companies": {
68
+ "TelecomTV": [
69
+ "Guy Daniels",
70
+ "Ray Le Maistre"
71
+ ],
72
+ "Connectivitree": [
73
+ "Mark Gilmour"
74
+ ],
75
+ "Telenor": [
76
+ "Amol Phadke",
77
+ "Terje Jensen"
78
+ ],
79
+ "Vodafone": [
80
+ "Komal Aggarwal",
81
+ "Yago Tenorio"
82
+ ],
83
+ "BT": [
84
+ "Sarwar Khan",
85
+ "Alex Foster",
86
+ "Gabriela Styf Sj\u00f6man",
87
+ "Mark Henry",
88
+ "Mojdeh Amani"
89
+ ],
90
+ "American Tower": [
91
+ "Susan James"
92
+ ],
93
+ "Dell Technologies": [
94
+ "Manish Singh",
95
+ "Dennis Hoffman"
96
+ ],
97
+ "AWS": [
98
+ "Chivas Nambiar"
99
+ ],
100
+ "VMO2 Business": [
101
+ "Sandeep Raithatha"
102
+ ],
103
+ "Deutsche Telekom": [
104
+ "Abdu Mudesir",
105
+ "Ahmed Hafez"
106
+ ],
107
+ "Telefonica": [
108
+ "Juan Manuel Caro",
109
+ "Enrique Blanco",
110
+ "Jose Antonio Martin Martinez",
111
+ "Luis Velarde Tazon",
112
+ "Mallik Rao"
113
+ ],
114
+ "SoftBank": [
115
+ "Akira Tada"
116
+ ],
117
+ "TM Forum": [
118
+ "Nik Willets"
119
+ ],
120
+ "Rakuten Mobile": [
121
+ "Madhukiran Medithe"
122
+ ],
123
+ "Appledore Research": [
124
+ "Francis Haysom",
125
+ "Robert Curran"
126
+ ],
127
+ "NGMN Alliance": [
128
+ "Anita D\u00f6hler"
129
+ ],
130
+ "Arelion": [
131
+ "Matthias Fridstrom"
132
+ ],
133
+ "Deutsche Telekom Technik": [
134
+ "Franz Seiser"
135
+ ],
136
+ "TELUS": [
137
+ "Hasan Jafri",
138
+ "Sushil Rawat"
139
+ ],
140
+ "Orange": [
141
+ "Atoosa Hatefi",
142
+ "Philippe Ensarguet"
143
+ ],
144
+ "BT Business": [
145
+ "Colin Bannon"
146
+ ],
147
+ "Telecom Argentina": [
148
+ "Alfredo Musitani"
149
+ ],
150
+ "Colt Technology": [
151
+ "Mirko Voltolini"
152
+ ],
153
+ "BT Group": [
154
+ "Laura Murphy"
155
+ ],
156
+ "MTN": [
157
+ "Amith Maharaj"
158
+ ],
159
+ "Vodafone UK": [
160
+ "Francesca Serravalle"
161
+ ],
162
+ "Verizon Business": [
163
+ "Dean Dennis"
164
+ ],
165
+ "Rakuten Symphony": [
166
+ "Faiq Khan",
167
+ "Geoff Hollingworth",
168
+ "Vivek Chadha"
169
+ ],
170
+ "Cambridge Management Consulting": [
171
+ "Tom Burton"
172
+ ],
173
+ "Ascend Digital Solutions": [
174
+ "Harkirit Singh"
175
+ ],
176
+ "IBM": [
177
+ "Andrew Coward"
178
+ ],
179
+ "Optiva": [
180
+ "Michele Campriani"
181
+ ],
182
+ "Telecom Infra Project": [
183
+ "Vishal Mathur"
184
+ ],
185
+ "NTT DOCOMO": [
186
+ "Sadayuki Abeta"
187
+ ],
188
+ "DSP Leaders Councillor": [
189
+ "Alexandra Foster"
190
+ ],
191
+ "Lewis Insight": [
192
+ "Chris Lewis"
193
+ ]
194
+ },
195
  "sentiments": [],
196
+ "subjects": [
197
+ "Connectivity",
198
+ "Infrastructure",
199
+ "5G",
200
+ "Enterprise",
201
+ "Network",
202
+ "Open RAN",
203
+ "TechCo",
204
+ "API",
205
+ "Innovation",
206
+ "B2B",
207
+ "AI"
208
+ ]
209
  }
clean_db.sh ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #!/bin/sh
2
+
3
+ rm cache/db_metadata.json
4
+ rm cache/cached_https_www.telecomtv.com_content_dsp-leaders-forum_*.json
5
+ rm -rf db
db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/header.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ffd87324d19f8f6366a4be4dccc22a83a50ca6837d1327fb660dc4b4e25d140
3
- size 100
 
 
 
 
db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/index_metadata.pickle DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:952bca23a4923000a81096d77fd1e39c4270e46f697bb4f2476c550ced3f2943
3
- size 99983
 
 
 
 
db/37d02d4f-a72c-4faa-8e90-6cabbae354fe/length.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7063182d3e5741c59a41e3c9728f568cbcae4adbda7a9a560b3678335c630157
3
- size 4000
 
 
 
 
db/chroma.sqlite3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:df787a6d1a483f410e5a820ca3278c0d0f10382f4102caec34b0257587055ae9
3
- size 12341248
 
 
 
 
db/{37d02d4f-a72c-4faa-8e90-6cabbae354fe/data_level0.bin → faiss_index.faiss/index.faiss} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70eda9770ad41e004ab40bea6d88e4fe3f99e05307c811bc9e43573129c642c1
3
- size 3212000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9a89a762d762a400d92c6bd6c4cc85f0b0f1841110a44e7038689592e8e91e4
3
+ size 2408493
db/{37d02d4f-a72c-4faa-8e90-6cabbae354fe/link_lists.bin → faiss_index.faiss/index.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ba06f934bb788a6fcf347d85845dce300cedf631915df241ae9d5063c97d88d
3
- size 8148
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b336f2522b6c941539e3036a7cbf4f3d48ff14167508a220fa087a2c922ea982
3
+ size 574563
requirements.txt CHANGED
@@ -3,4 +3,6 @@ embedchain
3
  langchain_huggingface
4
  watchdog
5
  pyppeteer
6
- beautifulsoup4
 
 
 
3
  langchain_huggingface
4
  watchdog
5
  pyppeteer
6
+ beautifulsoup4
7
+ faiss-cpu
8
+ uuid
ttv_web_scraper.py CHANGED
@@ -2,53 +2,22 @@ import re
2
  import asyncio
3
  import json
4
  import os
 
5
  import traceback
6
  from pyppeteer import launch
7
  from bs4 import BeautifulSoup
8
  import hashlib
9
- from ec_config import create_app
10
-
11
 
12
  CACHE_DIR = "cache/"
13
  if not os.path.exists(CACHE_DIR):
14
  os.makedirs(CACHE_DIR)
15
 
16
  DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
17
-
18
-
19
- def db_load_metadata_sets():
20
-
21
- content_hashes = set()
22
- speakers = set()
23
- companies = set()
24
- sentiments = set()
25
- subjects = set()
26
-
27
- if os.path.exists(DB_METADATA_FILE):
28
- with open(DB_METADATA_FILE, 'r') as f:
29
- metadata = json.load(f)
30
-
31
- content_hashes = set(metadata.get('content_hashes', []))
32
- speakers = set(metadata.get('speakers', []))
33
- companies = set(metadata.get('companies', []))
34
- sentiments = set(metadata.get('sentiments', []))
35
- subjects = set(metadata.get('subjects', []))
36
-
37
- return content_hashes, speakers, companies, sentiments, subjects
38
-
39
-
40
- def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
41
-
42
- metadata = {
43
- 'content_hashes': list(content_hashes),
44
- 'speakers': list(speakers),
45
- 'companies': list(companies),
46
- 'sentiments': list(sentiments),
47
- 'subjects': list(subjects)
48
- }
49
-
50
- with open(DB_METADATA_FILE, 'w') as f:
51
- json.dump(metadata, f, indent=2)
52
 
53
 
54
  async def get_client_rendered_content(url):
@@ -121,7 +90,7 @@ def read_json_from_file(filename):
121
 
122
  def extract_speaker_info(segment):
123
  try:
124
- pattern = r'(?P<speaker>(?:[A-Z][a-z]+ ){1,3}[A-Z][a-z]+), (?P<company>[A-Za-z\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
125
  match = re.match(pattern, segment)
126
  if match:
127
  return {key: value.strip() if value else None for key, value in match.groupdict().items()}
@@ -135,26 +104,79 @@ def extract_speaker_info(segment):
135
  raise Exception(f"Error extracting speaker info: {str(e)}")
136
 
137
 
 
 
 
 
 
 
 
 
 
 
 
138
  def parse_transcript(content):
139
  try:
140
  parsed_segments = []
141
- metadata = {}
142
- pattern = r'((?:[A-Z][a-z]+ ){1,3}[A-Z][a-z]+, [A-Za-z\s]+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
143
  segments = re.split(pattern, content)
144
  segments = [segment.strip() for segment in segments if segment.strip()]
145
- for segment in segments:
 
146
  speaker_info = extract_speaker_info(segment)
147
- if (speaker_info):
148
  if speaker_info['speaker']:
149
- metadata = speaker_info.copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  else:
151
- metadata = metadata.copy()
152
- metadata['timestamp'] = speaker_info['timestamp']
153
- else:
154
- parsed_segments.append({
155
- 'metadata': metadata,
156
- "text": segment
157
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  return parsed_segments
159
  except Exception as e:
160
  raise Exception(f"Error parsing transcript: {str(e)}")
@@ -200,8 +222,6 @@ async def process_url(url):
200
  print(f"Detailed error: {str(e)}")
201
  return None
202
 
203
- # This function can be used to process multiple URLs
204
-
205
 
206
  async def process_urls(urls):
207
  tasks = [process_url(url) for url in urls]
@@ -209,8 +229,8 @@ async def process_urls(urls):
209
 
210
 
211
  def main():
212
-
213
- app = create_app()
214
 
215
  url_file = "dsp-urls.txt" # File containing list of URLs
216
 
@@ -220,6 +240,10 @@ def main():
220
 
221
  content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()
222
 
 
 
 
 
223
  with open(url_file, 'r') as f:
224
  urls = [line.strip() for line in f if line.strip()]
225
 
@@ -243,22 +267,68 @@ def main():
243
 
244
  for entry in transcript:
245
  metadata.update(entry['metadata'])
246
- speakers.add(metadata['speaker'])
247
- companies.add(metadata['company'])
 
 
 
 
 
248
 
249
  text = entry['text']
250
 
251
- app.add(text, data_type='text', metadata=metadata)
 
 
 
 
 
252
 
253
  content_hashes.add(filename_hash)
254
  print(f"Added new url: {url}")
255
 
256
- # Save updated hashes
257
  save_metadata_sets(content_hashes, speakers,
258
  companies, sentiments, subjects)
259
 
 
 
260
  print("Processing complete. Check individual URL outputs for any errors.")
261
 
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  if __name__ == "__main__":
264
  main()
 
2
  import asyncio
3
  import json
4
  import os
5
+ import gc
6
  import traceback
7
  from pyppeteer import launch
8
  from bs4 import BeautifulSoup
9
  import hashlib
10
+ from ai_config_faiss import get_ai_assistant
 
11
 
12
  CACHE_DIR = "cache/"
13
  if not os.path.exists(CACHE_DIR):
14
  os.makedirs(CACHE_DIR)
15
 
16
  DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
17
+ SUBJECTS = [
18
+ "5G", "AI", "Innovation", "Network", "Enterprise", "Open RAN",
19
+ "TechCo", "B2B", "API", "Infrastructure", "Connectivity"
20
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
 
23
  async def get_client_rendered_content(url):
 
90
 
91
  def extract_speaker_info(segment):
92
  try:
93
+ pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
94
  match = re.match(pattern, segment)
95
  if match:
96
  return {key: value.strip() if value else None for key, value in match.groupdict().items()}
 
104
  raise Exception(f"Error extracting speaker info: {str(e)}")
105
 
106
 
107
def extract_subject_info(text):
    """Return the subjects from SUBJECTS that are mentioned in *text*.

    Matching is case-insensitive and anchored at word boundaries, so short
    subjects such as "AI" or "API" no longer fire on substrings of unrelated
    words (plain lowercase substring search matched "AI" inside "said" or
    "email", and "API" inside "rapid").

    Args:
        text: Free-form transcript text to scan.

    Returns:
        List of matched subject names, in SUBJECTS order.
    """
    found_subjects = [
        subject for subject in SUBJECTS
        if re.search(r'\b' + re.escape(subject) + r'\b', text, re.IGNORECASE)
    ]

    return found_subjects
116
+
117
+
118
def parse_transcript(content):
    """Split a transcript into segments tagged with speaker metadata.

    The transcript is cut on speaker headers.  Two header forms exist:
    a full "First Last, Company (hh:mm:ss):" header, which starts a new
    speaker, and a standalone "(hh:mm:ss):" header, which continues the
    current speaker with a new start timestamp.  Each emitted segment
    carries the text spoken between two headers plus metadata: speaker,
    company, start/end timestamps, and the subjects detected in the text.

    Args:
        content: Full transcript text.

    Returns:
        List of dicts with 'metadata' and 'text' keys.

    Raises:
        Exception: wrapping any parsing error (including a standalone
            timestamp appearing before any full speaker header).
    """

    def _build_segment(info, end_timestamp, text):
        # One finished segment: opened by the header captured in `info`,
        # closed at `end_timestamp` (the next header's timestamp, or a
        # sentinel for the final segment).
        return {
            'metadata': {
                'speaker': info['speaker'],
                'company': info['company'],
                'start_timestamp': info['timestamp'],
                'end_timestamp': end_timestamp,
                'subjects': extract_subject_info(text)
            },
            'text': text
        }

    try:
        parsed_segments = []
        saved_info = None  # header of the segment currently being accumulated
        pattern = r'((?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+,\s+[A-Za-z0-9\s]+\s+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
        segments = re.split(pattern, content)
        segments = [segment.strip() for segment in segments if segment.strip()]

        for i, segment in enumerate(segments):
            speaker_info = extract_speaker_info(segment)
            if speaker_info:
                # Any header closes the previous open segment; its text is
                # the chunk immediately preceding this header.
                if saved_info:
                    text = segments[i - 1] if i > 0 else ""
                    parsed_segments.append(
                        _build_segment(saved_info, speaker_info['timestamp'], text))
                if speaker_info['speaker']:
                    # Full speaker/company/timestamp header: new speaker.
                    saved_info = speaker_info
                else:
                    # Standalone timestamp: same speaker continues; only the
                    # start timestamp advances.  (If no speaker header has
                    # been seen yet, saved_info is None and this raises,
                    # surfacing as the wrapped Exception below.)
                    saved_info['timestamp'] = speaker_info['timestamp']
            # Plain text segments are consumed when the next header arrives.

        # Close the final open segment; there is no following header, so use
        # a sentinel end timestamp.
        if saved_info:
            parsed_segments.append(
                _build_segment(saved_info, "00:00:00", segments[-1]))

        return parsed_segments
    except Exception as e:
        raise Exception(f"Error parsing transcript: {str(e)}")
 
222
  print(f"Detailed error: {str(e)}")
223
  return None
224
 
 
 
225
 
226
  async def process_urls(urls):
227
  tasks = [process_url(url) for url in urls]
 
229
 
230
 
231
  def main():
232
+ global assistant
233
+ assistant = get_ai_assistant()
234
 
235
  url_file = "dsp-urls.txt" # File containing list of URLs
236
 
 
240
 
241
  content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()
242
 
243
+ # Convert companies to a dictionary of speaker sets if it's not already
244
+ if not isinstance(companies, dict):
245
+ companies = {company: set() for company in companies}
246
+
247
  with open(url_file, 'r') as f:
248
  urls = [line.strip() for line in f if line.strip()]
249
 
 
267
 
268
  for entry in transcript:
269
  metadata.update(entry['metadata'])
270
+ company = metadata['company']
271
+ speaker = metadata['speaker']
272
+ entry_subjects = metadata['subjects']
273
+
274
+ speakers.add(speaker)
275
+ # Add new subjects to the master set
276
+ subjects.update(entry_subjects)
277
 
278
  text = entry['text']
279
 
280
+ assistant.add_to_knowledge_base(
281
+ text, data_type='text', metadata=metadata.copy())
282
+
283
+ if company not in companies:
284
+ companies[company] = set()
285
+ companies[company].add(speaker)
286
 
287
  content_hashes.add(filename_hash)
288
  print(f"Added new url: {url}")
289
 
290
+ # Save updated hashes and metadata
291
  save_metadata_sets(content_hashes, speakers,
292
  companies, sentiments, subjects)
293
 
294
+ assistant.save()
295
+
296
  print("Processing complete. Check individual URL outputs for any errors.")
297
 
298
 
299
def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
    """Persist the crawler's metadata collections to DB_METADATA_FILE.

    Sets are converted to lists, and the company -> speaker-set mapping is
    converted to a mapping of lists, so the whole structure is
    JSON-serializable.

    Args:
        content_hashes: Set of hashes of already-processed content.
        speakers: Set of all speaker names seen.
        companies: Dict mapping company name -> set of its speakers.
        sentiments: Set of sentiment labels.
        subjects: Set of subject tags.
    """
    serializable = {
        'content_hashes': list(content_hashes),
        'speakers': list(speakers),
        'companies': {name: list(members) for name, members in companies.items()},
        'sentiments': list(sentiments),
        'subjects': list(subjects)
    }

    with open(DB_METADATA_FILE, 'w') as f:
        json.dump(serializable, f, indent=2)
310
+
311
+
312
def db_load_metadata_sets():
    """Load persisted crawler metadata from DB_METADATA_FILE.

    Returns:
        Tuple (content_hashes, speakers, companies, sentiments, subjects)
        where companies maps company name -> set of speaker names.  When the
        metadata file does not exist, every collection is empty.  When it
        exists but has no 'subjects' entry, subjects falls back to the
        SUBJECTS defaults.
    """
    if not os.path.exists(DB_METADATA_FILE):
        # No previous run: start with empty collections.
        return set(), set(), {}, set(), set()

    with open(DB_METADATA_FILE, 'r') as f:
        stored = json.load(f)

    companies = {name: set(members)
                 for name, members in stored.get('companies', {}).items()}

    return (
        set(stored.get('content_hashes', [])),
        set(stored.get('speakers', [])),
        companies,
        set(stored.get('sentiments', [])),
        set(stored.get('subjects', SUBJECTS)),
    )
331
+
332
+
333
  if __name__ == "__main__":
334
  main()