import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain import vectorstores as vs
from langchain import chains
import pinecone
from goose3 import Goose
import streamlit as st
import whisper
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import AI21
from pytube import YouTube
import moviepy.editor
import time
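
# Likely runtime dependencies, inferred from the imports above (not pinned here):
# python-dotenv, PyPDF2, langchain, pinecone-client, goose3, streamlit,
# openai-whisper, sentence-transformers (for HuggingFaceEmbeddings), ai21,
# pytube, and moviepy.

# Credentials are loaded from a local .env file via python-dotenv; the keys read
# below are PINECONE_API_KEY, PINECONE_ENVIRONMENT, and AI21_API_KEY.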
load_dotenv()
api_key = os.getenv('PINECONE_API_KEY')
env = os.getenv('PINECONE_ENVIRONMENT')
ai21_api_key = os.getenv('AI21_API_KEY')
pinecone.init(api_key=api_key, environment=env)
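
# txtread: decode an uploaded .txt file, split the text into chunks, embed the
# chunks with HuggingFaceEmbeddings, and upsert them into the "multigpt"
# Pinecone index under the "txt" namespace. Status updates go to the global
# `process` Streamlit placeholder created in main().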
def txtread(txt_content):
    texts = ""
    # Streamlit's file_uploader returns an UploadedFile, so read its bytes first.
    texts += txt_content.read().decode('utf-8')
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting upload of the vector data to the Pinecone VectorDB")
    db = vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="txt")
    process.success("Data is securely uploaded")
def pdfread(pdf):
    pdf_reader = PdfReader(pdf)
    texts = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for image-only pages.
        texts += page.extract_text() or ""
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=4000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting upload of the vector data to the Pinecone VectorDB")
    db = vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="pdf")
    process.success("Data is securely uploaded")
def urlread(url_path):
    g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
    texts = g.extract(url=url_path).cleaned_text
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=2000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting upload of the vector data to the Pinecone VectorDB")
    db = vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="url")
    process.success("Data is securely uploaded")
def scrape(vidlink):
    youtubeObject = YouTube(vidlink)
    youtubeObject = youtubeObject.streams.get_highest_resolution()
    youtubeObject.download(filename='video.mp4')
    process.success('Video downloaded')
    # Wait until the downloaded file is visible on disk before opening it.
    done = False
    while not done:
        time.sleep(10)
        done = os.path.exists("video.mp4")
    video = moviepy.editor.VideoFileClip("video.mp4")
    process.warning('Extracting audio')
    audio = video.audio
    audio.write_audiofile("audio.mp3")
    process.warning('Transcribing the audio')
    model = whisper.load_model('base')
    result = model.transcribe('audio.mp3')
    texts = result['text']
    process.success('Transcription is done')
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting upload of the vector data to the Pinecone VectorDB")
    db = vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="vid")
    process.success("Data is securely uploaded")
def chain(name):
    process.warning("Your chain is running")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    db = vs.pinecone.Pinecone.from_existing_index(index_name='multigpt', namespace=name, embedding=embeddings)
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    llm = AI21(ai21_api_key=ai21_api_key)
    qa = chains.ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
    return qa
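
# ai: run a single question through the retrieval chain. The chat history is
# reset on every call, so each question is answered independently.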
def ai(qa, prompt):
    chat_history = []
    result = qa({"question": prompt, "chat_history": chat_history})
    process.success("Search complete!")
    return result
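
# intro: render the landing page. The introduction text is revealed character
# by character through an st.empty() placeholder to create a typewriter effect.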
def intro():
    placeholder.title('____________ MINOR PROJECT ____________\n')
    data.subheader('Introducing the "KnowledgeHub" Web App!')
    process.write('___________________________________________')
    intro = ('''
Welcome to the future of knowledge interaction! With our web app, "KnowledgeHub," you can effortlessly infuse intelligence into the platform through various mediums.

How It Works:

File Magic: Upload your knowledge-packed text files or PDFs to seamlessly share insights and wisdom with the world!

URL Wizardry: Simply paste a website URL and watch as KnowledgeHub transforms online information into a dynamic source of intelligence!

YouTube Brilliance: Share video insights by dropping in YouTube links! Transforming video content into knowledge has never been easier!

Why use KnowledgeHub:

Instant Interaction: Say goodbye to static data! Engage with your knowledge instantly and turn information into actionable insights.

Universal Accessibility: Access your knowledge from anywhere, anytime, and empower your audience to dive into your insights effortlessly.

AI-Powered Conversations: Leverage AI for interactive conversations based on your knowledge repository. It's like having a virtual assistant at your fingertips!

Data-Driven Decisions: Turn raw data into actionable intelligence. Make informed decisions backed by the power of your knowledge repository.

Embrace the future of knowledge sharing with KnowledgeHub, where ideas come to life and intelligence knows no bounds!''')
    ph = st.empty()
    x = ''
    for i in intro:
        x += i
        time.sleep(0.005)
        ph.markdown(x)
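
# upload: the "Upload" page. A sidebar radio picks the source type, and the
# matching ingest function (txtread, pdfread, urlread, or scrape) is called on
# the user's input.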
def upload():
    placeholder.title("Let's create the Knowledge Base")
    process.error('Here you will be notified about the status of the upload')
    page = ['', 'TEXT', 'PDF', 'URL', 'VIDEO']
    choice = st.sidebar.radio("Choose your mode", page)
    if choice == '':
        data.subheader('Choose what type of data you want to upload')
    elif choice == 'TEXT':
        text = data.file_uploader("Upload your txt file", type="txt")
        if text:
            txtread(text)
    elif choice == 'PDF':
        pdf = data.file_uploader("Upload your PDF file", type="pdf")
        if pdf:
            pdfread(pdf)
    elif choice == 'URL':
        url_path = data.text_input('Enter the URL')
        if url_path:
            urlread(url_path)
    elif choice == 'VIDEO':
        link = data.text_input('Enter a link to the YouTube video')
        if link:
            scrape(link)
    # Pause so the last upload status stays visible before the final message.
    time.sleep(10)
    process.success('You can go to the chat section or upload more data')
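
# chat: the "Chat" page. A sidebar radio selects which namespace to query; the
# answer from the retrieval chain is streamed back character by character.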
def chat():
    placeholder.title("Let's go!!")
    process.error('Here you will be notified about the retrieval of your answers')
    page = ['', 'TEXT', 'PDF', 'URL', 'VIDEO']
    choice = st.sidebar.radio("Choose your mode", page)
    if choice == '':
        data.subheader('Choose which data source you want answers from')
    elif choice == 'TEXT':
        name = 'txt'
        query = st.text_input("Ask a question based on the txt file", value="")
        if query:
            qa = chain(name)
            result = ai(qa, query)
            ph = st.empty()
            x = ''
            for i in result["answer"]:
                x += i
                time.sleep(0.01)
                ph.markdown(x)
    elif choice == 'PDF':
        name = 'pdf'
        query = st.text_input("Ask a question based on the PDF", value="")
        if query:
            qa = chain(name)
            result = ai(qa, query)
            ph = st.empty()
            x = ''
            for i in result["answer"]:
                x += i
                time.sleep(0.01)
                ph.markdown(x)
    elif choice == 'URL':
        name = 'url'
        query = st.text_input("Ask a question based on the data from the URL", value="")
        if query:
            qa = chain(name)
            result = ai(qa, query)
            ph = st.empty()
            x = ''
            for i in result["answer"]:
                x += i
                time.sleep(0.01)
                ph.markdown(x)
    elif choice == 'VIDEO':
        name = 'vid'
        query = st.text_input("Ask a question based on the YouTube video", value="")
        if query:
            qa = chain(name)
            result = ai(qa, query)
            ph = st.empty()
            x = ''
            for i in result["answer"]:
                x += i
                time.sleep(0.01)
                ph.markdown(x)
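
# main: set up the shared Streamlit placeholders (title, content area, status
# bar) and route between the HOME, Upload, and Chat pages via the sidebar.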
def main():
    global placeholder, process, data
    placeholder = st.empty()
    data = st.empty()
    process = st.empty()
    page = ['HOME', 'Upload', 'Chat']
    choice = st.sidebar.radio("Choose a page", page)
    if choice == 'HOME':
        intro()
    elif choice == 'Upload':
        upload()
    elif choice == 'Chat':
        chat()


if __name__ == "__main__":
    main()