# DocArrayInMemorySearch is a document index provided by DocArray that stores documents in memory.
# It is a great starting point for small datasets, where you may not want to launch a database server.
# import libraries
import os
import streamlit as st
import requests
from bs4 import BeautifulSoup
#from langchain.indexes import VectorstoreIndexCreator #Logic for creating indexes.
#from langchain.vectorstores import DocArrayInMemorySearch #document index provided by Docarray that stores documents in memory.
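# A sketch of the in-memory alternative (assumes the docarray package is installed;
# `docs` and `embeddings` would be created the same way as in create_langchain_index below):
# from langchain_community.vectorstores import DocArrayInMemorySearch
# db = DocArrayInMemorySearch.from_documents(docs, embeddings)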
from sentence_transformers import SentenceTransformer
from langchain_community.llms import HuggingFaceEndpoint
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings,)
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains import RetrievalQA
#import vertexai
#from langchain.llms import VertexAI
#from langchain.embeddings import VertexAIEmbeddings
#vertexai.init(project=PROJECT, location=LOCATION) #GCP PROJECT ID, LOCATION as region.
#The PaLM 2 for Text (text-bison, text-unicorn) foundation models are optimized for a variety of natural language
#tasks such as sentiment analysis, entity extraction, and content creation. The types of content that the PaLM 2 for
#Text models can create include document summaries, answers to questions, and labels that classify content.
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2", temperature=0.3)
#model = SentenceTransformer("all-MiniLM-L6-v2")
#llm = VertexAI(model_name="text-bison@001",max_output_tokens=256,temperature=0.1,top_p=0.8,top_k=40,verbose=True,)
#embeddings = VertexAIEmbeddings()
#embeddings = model.encode(sentences)
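# Quick sanity check for the endpoint (illustrative only; assumes a valid
# HUGGINGFACEHUB_API_TOKEN is set in the environment). Uncomment to try:
# print(llm.invoke("Reply with one word: hello"))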
# The code below scrapes all the text from the webpage link provided by the user
# and saves it to a text file for indexing.
def get_text(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Create a BeautifulSoup object with the HTML content
    soup = BeautifulSoup(response.content, "html.parser")
    # Find the specific element or elements containing the text to scrape.
    # Here, we find all <p> tags and extract their text.
    paragraphs = soup.find_all("p")
    # Write the text of each paragraph to the file, one paragraph per line.
    os.makedirs("text", exist_ok=True)
    with open(os.path.join("text", "temp.txt"), "w", encoding="utf-8") as file:
        for paragraph in paragraphs:
            file.write(paragraph.get_text() + "\n")
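# Example usage (illustrative; any public article URL works):
# get_text("https://en.wikipedia.org/wiki/Streamlit")  # writes text/temp.txt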
@st.cache_resource
def create_langchain_index(input_text):
    print("--indexing---")
    get_text(input_text)
    loader = TextLoader(os.path.join("text", "temp.txt"), encoding="utf-8")
    documents = loader.load()
    # Split the document into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)
    # Create the open-source embedding function
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # Load the chunks into Chroma, persisting the index to disk so it can be
    # reloaded later without re-embedding
    persist_directory = "chroma_db"
    db = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory=persist_directory)
    return db
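# The persisted index can be reopened in a later session without re-embedding
# (sketch, assuming the same embedding function and an existing "chroma_db" directory):
# db = Chroma(persist_directory="chroma_db", embedding_function=embeddings)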
# @st.cache_resource
# def get_basic_page_details(input_text, summary_query, tweet_query, ln_query):
#     index = create_langchain_index(input_text)
#     summary_response = index.query(summary_query)
#     tweet_response = index.query(tweet_query)
#     ln_response = index.query(ln_query)
#     return summary_response, tweet_response, ln_response
@st.cache_data
def get_response(input_text, query, _db):
    # The leading underscore on _db tells Streamlit's cache not to hash the
    # vector store; input_text and query form the cache key.
    print(f"--querying---{query}")
    retrieval_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=_db.as_retriever())
    response = retrieval_chain.run(query)
    # response = index.query(query, llm=llm)
    return response
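# To inspect what the retriever feeds the LLM for a given query (illustrative;
# k=4 is Chroma's default number of results):
# docs = db.similarity_search("What is this page about?", k=4)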
# The code below is a simple flow that accepts a webpage link and processes the
# queries using the get_response function created above. Thanks to the caches,
# repeat requests for the same page and query are served without re-indexing
# or re-querying the LLM.
st.title('Webpage Question and Answering')
input_text = st.text_input("Provide the link to the webpage...")
summary_response = ""
tweet_response = ""
ln_response = ""
# if st.button("Load"):
if input_text:
    db = create_langchain_index(input_text)
    summary_query = "Write a 100 words summary of the document"
    summary_response = get_response(input_text, summary_query, db)
    tweet_query = "Write a twitter tweet"
    tweet_response = get_response(input_text, tweet_query, db)
    ln_query = "Write a linkedin post for the document"
    ln_response = get_response(input_text, ln_query, db)
    with st.expander('Page Summary'):
        st.info(summary_response)
    with st.expander('Tweet'):
        st.info(tweet_response)
    with st.expander('LinkedIn Post'):
        st.info(ln_response)
question = st.text_input("Ask a question from the link you shared...")
if st.button("Ask"):
    if question:
        db = create_langchain_index(input_text)
        response = get_response(input_text, question, db)
        st.write(response)
    else:
        st.warning("Please enter a question.")