import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain import vectorstores as vs
from langchain import chains
import pinecone
from goose3 import Goose
import streamlit as st
import whisper
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import AI21
from pytube import YouTube
import moviepy.editor
import time
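
# Likely runtime dependencies, inferred from the imports above (not pinned here):
# python-dotenv, PyPDF2, langchain, pinecone-client, goose3, streamlit,
# openai-whisper, sentence-transformers (for HuggingFaceEmbeddings), ai21,
# pytube, and moviepy.

# Credentials are loaded from a local .env file via python-dotenv; the keys read
# below are PINECONE_API_KEY, PINECONE_ENVIRONMENT, and AI21_API_KEY.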
load_dotenv()
api_key = os.getenv('PINECONE_API_KEY')
env = os.getenv('PINECONE_ENVIRONMENT')
ai21_api_key = os.getenv('AI21_API_KEY')
pinecone.init(api_key=api_key, environment=env)
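
# txtread: decode an uploaded .txt file, split the text into chunks, embed the
# chunks with HuggingFaceEmbeddings, and upsert them into the "multigpt"
# Pinecone index under the "txt" namespace. Status updates go to the global
# `process` Streamlit placeholder created in main().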
def txtread(txt_content):
    texts = ""
    # Streamlit's file_uploader returns an UploadedFile, so read its bytes first.
    texts += txt_content.read().decode('utf-8')
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting upload of the vector data to the Pinecone VectorDB")
    db = vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="txt")
    process.success("Data is securely uploaded")
def pdfread(pdf):
    pdf_reader = PdfReader(pdf)
    texts = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for image-only pages.
        texts += page.extract_text() or ""
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=4000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting upload of the vector data to the Pinecone VectorDB")
    db = vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="pdf")
    process.success("Data is securely uploaded")
def urlread(url_path):
    g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
    texts = g.extract(url=url_path).cleaned_text
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=2000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting upload of the vector data to the Pinecone VectorDB")
    db = vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="url")
    process.success("Data is securely uploaded")
def scrape(vidlink):
    youtubeObject = YouTube(vidlink)
    youtubeObject = youtubeObject.streams.get_highest_resolution()
    youtubeObject.download(filename='video.mp4')
    process.success('Video downloaded')
    # Wait until the downloaded file is visible on disk before opening it.
    done = False
    while not done:
        time.sleep(10)
        done = os.path.exists("video.mp4")
    video = moviepy.editor.VideoFileClip("video.mp4")
    process.warning('Extracting audio')
    audio = video.audio
    audio.write_audiofile("audio.mp3")
    process.warning('Transcribing the audio')
    model = whisper.load_model('base')
    result = model.transcribe('audio.mp3')
    texts = result['text']
    process.success('Transcription is done')
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting upload of the vector data to the Pinecone VectorDB")
    db = vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="vid")
    process.success("Data is securely uploaded")
def chain(name):
    process.warning("Your chain is running")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    db = vs.pinecone.Pinecone.from_existing_index(index_name='multigpt', namespace=name, embedding=embeddings)
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    llm = AI21(ai21_api_key=ai21_api_key)
    qa = chains.ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
    return qa
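
# ai: run a single question through the retrieval chain. The chat history is
# reset on every call, so each question is answered independently.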
def ai(qa, prompt):
    chat_history = []
    result = qa({"question": prompt, "chat_history": chat_history})
    process.success("Search complete!")
    return result
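
# intro: render the landing page. The introduction text is revealed character
# by character through an st.empty() placeholder to create a typewriter effect.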
def intro():
    placeholder.title('____________ MINOR PROJECT ____________\n')
    data.subheader('Introducing the "KnowledgeHub" Web App!')
    process.write('___________________________________________')
    intro = ('''
Welcome to the future of knowledge interaction! With our web app, "KnowledgeHub," you can effortlessly infuse intelligence into the platform through various mediums.

How It Works:

File Magic: Upload your knowledge-packed text files or PDFs to seamlessly share insights and wisdom with the world!

URL Wizardry: Simply paste a website URL and watch as KnowledgeHub transforms online information into a dynamic source of intelligence!

YouTube Brilliance: Share video insights by dropping in YouTube links! Transforming video content into knowledge has never been easier!

Why use KnowledgeHub:

Instant Interaction: Say goodbye to static data! Engage with your knowledge instantly and turn information into actionable insights.

Universal Accessibility: Access your knowledge from anywhere, anytime, and empower your audience to dive into your insights effortlessly.

AI-Powered Conversations: Leverage AI for interactive conversations based on your knowledge repository. It's like having a virtual assistant at your fingertips!

Data-Driven Decisions: Turn raw data into actionable intelligence. Make informed decisions backed by the power of your knowledge repository.

Embrace the future of knowledge sharing with KnowledgeHub, where ideas come to life and intelligence knows no bounds!''')
    ph = st.empty()
    x = ''
    for i in intro:
        x += i
        time.sleep(0.005)
        ph.markdown(x)
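
# upload: the "Upload" page. A sidebar radio picks the source type, and the
# matching ingest function (txtread, pdfread, urlread, or scrape) is called on
# the user's input.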
def upload():
    placeholder.title("Let's create the Knowledge Base")
    process.error('Here you will be notified about the status of the upload')
    page = ['', 'TEXT', 'PDF', 'URL', 'VIDEO']
    choice = st.sidebar.radio("Choose your mode", page)
    if choice == '':
        data.subheader('Choose what type of data you want to upload')
    elif choice == 'TEXT':
        text = data.file_uploader("Upload your txt file", type="txt")
        if text:
            txtread(text)
    elif choice == 'PDF':
        pdf = data.file_uploader("Upload your PDF file", type="pdf")
        if pdf:
            pdfread(pdf)
    elif choice == 'URL':
        url_path = data.text_input('Enter the URL')
        if url_path:
            urlread(url_path)
    elif choice == 'VIDEO':
        link = data.text_input('Enter a link to the YouTube video')
        if link:
            scrape(link)
    # Pause so the last upload status stays visible before the final message.
    time.sleep(10)
    process.success('You can go to the chat section or upload more data')
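
# chat: the "Chat" page. A sidebar radio selects which namespace to query; the
# answer from the retrieval chain is streamed back character by character.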
def chat():
    placeholder.title("Let's go!!")
    process.error('Here you will be notified about the retrieval of your answers')
    page = ['', 'TEXT', 'PDF', 'URL', 'VIDEO']
    choice = st.sidebar.radio("Choose your mode", page)
    if choice == '':
        data.subheader('Choose which data source you want answers from')
    elif choice == 'TEXT':
        name = 'txt'
        query = st.text_input("Ask a question based on the txt file", value="")
        if query:
            qa = chain(name)
            result = ai(qa, query)
            ph = st.empty()
            x = ''
            for i in result["answer"]:
                x += i
                time.sleep(0.01)
                ph.markdown(x)
    elif choice == 'PDF':
        name = 'pdf'
        query = st.text_input("Ask a question based on the PDF", value="")
        if query:
            qa = chain(name)
            result = ai(qa, query)
            ph = st.empty()
            x = ''
            for i in result["answer"]:
                x += i
                time.sleep(0.01)
                ph.markdown(x)
    elif choice == 'URL':
        name = 'url'
        query = st.text_input("Ask a question based on the data from the URL", value="")
        if query:
            qa = chain(name)
            result = ai(qa, query)
            ph = st.empty()
            x = ''
            for i in result["answer"]:
                x += i
                time.sleep(0.01)
                ph.markdown(x)
    elif choice == 'VIDEO':
        name = 'vid'
        query = st.text_input("Ask a question based on the YouTube video", value="")
        if query:
            qa = chain(name)
            result = ai(qa, query)
            ph = st.empty()
            x = ''
            for i in result["answer"]:
                x += i
                time.sleep(0.01)
                ph.markdown(x)
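
# main: set up the shared Streamlit placeholders (title, content area, status
# bar) and route between the HOME, Upload, and Chat pages via the sidebar.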
def main():
    global placeholder, process, data
    placeholder = st.empty()
    data = st.empty()
    process = st.empty()
    page = ['HOME', 'Upload', 'Chat']
    choice = st.sidebar.radio("Choose a page", page)
    if choice == 'HOME':
        intro()
    elif choice == 'Upload':
        upload()
    elif choice == 'Chat':
        chat()


if __name__ == "__main__":
    main()