import requests
import os
import streamlit as st
import pickle
import time
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_groq import ChatGroq
from langchain_community.vectorstores import FAISS

from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env (especially the GROQ_API_KEY)

st.title("RockyBot: News Research Tool πŸ“ˆ")
st.sidebar.title("News Article URLs")

urls = []
for i in range(3):
    url = st.sidebar.text_input(f"URL {i+1}")
    urls.append(url)

process_url_clicked = st.sidebar.button("Process URLs")
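# Path where the pickled FAISS vector store is saved and later reloaded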
file_path = "faiss_store_openai.pkl"

main_placeholder = st.empty()
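# Groq-hosted Llama 3.3 70B model used to answer questions over the indexed articles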
llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)

# Debugging: Check whether a URL is reachable before trying to load it
def check_url(url):
    try:
        response = requests.get(url, timeout=10)
        return response.status_code == 200
    except requests.RequestException:
        return False

if process_url_clicked:
    # Debugging: Verify URL accessibility
    valid_urls = []
    for url in urls:
        if check_url(url):
            valid_urls.append(url)
        else:
            main_placeholder.text(f"URL is not accessible: {url}")
    
    if not valid_urls:
        main_placeholder.text("None of the URLs are accessible.")
        st.stop()  # halt here; there is nothing to process

    # Load data from URLs
    loader = UnstructuredURLLoader(urls=valid_urls)
    main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
    try:
        data = loader.load()
    except Exception as e:
        main_placeholder.text(f"Error loading data: {e}")
        st.stop()  # halt here; `data` would be undefined below
    
    # Split data into chunks
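    # The recursive splitter falls back from paragraphs to lines to sentences to commas
    # so that each chunk stays under roughly 1,000 characters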
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
    docs = text_splitter.split_documents(data)
    
    # Debugging: Check if docs is empty
    if not docs:
        main_placeholder.text("No valid documents found! Please check the URLs.")
        st.stop()

    # Debugging: Check the content of docs (the placeholder shows one message at a
    # time, so display the first 200 characters of the first chunk)
    main_placeholder.text(f"Document content: {docs[0].page_content[:200]}")
    
    # Create embeddings using HuggingFaceEmbeddings
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")

    # Build the FAISS index; from_documents embeds every chunk and wires up the
    # docstore and id mapping that the LangChain FAISS wrapper needs
    vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)

    # Debugging: Check how many vectors made it into the index
    if vectorstore_huggingface.index.ntotal == 0:
        main_placeholder.text("No embeddings were generated! Check the embedding model or document content.")
    else:
        main_placeholder.text(f"Generated {vectorstore_huggingface.index.ntotal} embeddings.")

        # Save the FAISS vector store to a pickle file
        with open(file_path, "wb") as f:
            pickle.dump(vectorstore_huggingface, f)

        time.sleep(2)

query = main_placeholder.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        # Load the FAISS index from the pickle file
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)
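            # Build a retrieval QA chain that returns both an answer and the source URLs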
            chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
            result = chain({"question": query}, return_only_outputs=True)
            
            # Display the answer
            st.header("Answer")
            st.write(result["answer"])

            # Display sources, if available
            sources = result.get("sources", "")
            if sources:
                st.subheader("Sources:")
                sources_list = sources.split("\n")  # Split the sources by newline
                for source in sources_list:
                    st.write(source)