File size: 2,435 Bytes
cfe33c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
import streamlit as st
import pickle
import pinecone
import time
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import FAISS
from langchain.vectorstores import Pinecone

from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env (especially openai api key)

st.title("Research Tool πŸ“ˆ")
st.sidebar.title("Article URLs")


urls = []
for i in range(3):
    url = st.sidebar.text_input(f"URL {i+1}")
    urls.append(url)

main_placeholder = st.empty()


query = main_placeholder.text_input("Question: ")
if query:
    loader = UnstructuredURLLoader(urls=urls)
    main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
    data = loader.load()
    # split data
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
    docs = text_splitter.split_documents(data)
    # create embeddings and save it to FAISS index
    embeddings = OpenAIEmbeddings(api_key=os.getenv('OPENAI_API_KEY'))

    pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment="gcp-starter"
    )
    index_name = "langchainvector"
    index = Pinecone.from_documents(docs, embeddings, index_name=index_name)
    main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
    def retrieve_query(mquery, k=3):
        matching_results = index.similarity_search(mquery, k=k)
        return matching_results
    llm = OpenAI(temperature=0.5)
    chain = load_qa_chain(llm, chain_type="stuff")
    def retrieve_ans(mquery):
        doc_search = retrieve_query(mquery)
        print(doc_search)
        response = chain.run(input_documents = doc_search, question=query)
        return response   
    

    result = retrieve_ans(query)
    st.header("Answer")
    st.write(result)
    # Display sources, if available
    # sources = result.get("sources", "")
    # if sources:
    #     st.subheader("Sources:")
    #     sources_list = sources.split("\n")  # Split the sources by newline
    #     for source in sources_list:
    #     st.write(source)