# NOTE: non-code residue from the hosting page (status lines, commit hashes,
# line-number gutter) removed so the module parses; code begins at the imports.
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain import vectorstores as vs
from langchain import chains
import pinecone
from goose3 import Goose
import streamlit as st
import whisper
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import AI21
from pytube import YouTube
import moviepy.editor
import time
# Load credentials from .env and initialise the Pinecone client once at import
# time. `pinecone.init` is idempotent; the helpers below call it again
# defensively before each upload/query.
load_dotenv()
api_key=os.getenv('PINECONE_API_KEY')
env=os.getenv('PINECONE_ENVIRONMENT')
ai21_api_key=os.getenv('AI21_API_KEY')
pinecone.init(api_key=api_key, environment=env)
def txtread(txt_content, chunk_size=1000):
    """Index an uploaded text file's contents into Pinecone.

    Decodes the raw bytes as UTF-8, splits the text on newlines into chunks,
    embeds them with a HuggingFace model, and uploads the vectors to the
    "multigpt" index under the "txt" namespace.

    Args:
        txt_content: raw bytes-like object supporting ``.decode`` (from the
            Streamlit uploader — TODO confirm the uploader object exposes
            ``decode`` rather than ``read``/``getvalue``).
        chunk_size: maximum characters per chunk (default 1000, the original
            hard-coded value).

    NOTE(review): relies on the module-level Streamlit placeholder ``process``
    being initialised by main() before this is called.
    """
    # Original built the string via "" + += — a single decode is equivalent.
    texts = txt_content.decode('utf-8')
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")
    embeddings = HuggingFaceEmbeddings()
    # Idempotent re-init so the helper works even if module-level init failed.
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    # Return value was never used; call for its upload side effect only.
    vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="txt")
    process.success("Data is securly Uploaded")
def pdfread(pdf, chunk_size=4000):
    """Index the text of an uploaded PDF into Pinecone.

    Extracts text from every page, splits it on newlines into chunks, embeds
    them with a HuggingFace model, and uploads the vectors to the "multigpt"
    index under the "pdf" namespace.

    Args:
        pdf: file path or binary file-like object accepted by PyPDF2.PdfReader.
        chunk_size: maximum characters per chunk (default 4000, the original
            hard-coded value).

    NOTE(review): relies on the module-level Streamlit placeholder ``process``
    being initialised by main() before this is called.
    """
    pdf_reader = PdfReader(pdf)
    # extract_text() returns None for pages with no extractable text
    # (e.g. scanned images); `or ""` prevents a TypeError on concatenation.
    texts = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")
    embeddings = HuggingFaceEmbeddings()
    # Idempotent re-init so the helper works even if module-level init failed.
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    # Return value was never used; call for its upload side effect only.
    vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="pdf")
    process.success("Data is securly Uploaded")
def urlread(url_path):
    """Scrape an article from a URL and index it into Pinecone.

    Uses Goose to fetch and clean the page text, splits it on newlines into
    2000-character chunks, embeds them with a HuggingFace model, and uploads
    the vectors to the "multigpt" index under the "url" namespace.

    NOTE(review): relies on the module-level Streamlit placeholder ``process``
    being initialised by main() before this is called.
    """
    extractor = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
    article_text = extractor.extract(url=url_path).cleaned_text
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=2000,
        chunk_overlap=0,
    )
    pieces = splitter.split_text(article_text)
    process.success("Chunking of the data is done")
    embedder = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    vs.pinecone.Pinecone.from_texts(pieces, embedder, index_name="multigpt", namespace="url")
    process.success("Data is securly Uploaded")
def scrape(vidlink, chunk_size=1000):
    """Download a YouTube video, transcribe it, and index the transcript.

    Pipeline: download the highest-resolution stream to video.mp4, extract
    the audio track to audio.mp3, transcribe it with Whisper ("base" model),
    then chunk/embed/upload the transcript to the "multigpt" Pinecone index
    under the "vid" namespace.

    Args:
        vidlink: URL of the YouTube video.
        chunk_size: maximum characters per transcript chunk (default 1000,
            the original hard-coded value).

    NOTE(review): relies on the module-level Streamlit placeholder ``process``
    being initialised by main() before this is called.
    """
    stream = YouTube(vidlink).streams.get_highest_resolution()
    stream.download(filename='video.mp4')
    process.success('Downloading Video')
    # download() is synchronous, so the file normally exists already; the
    # original polled with an unconditional 10s sleep BEFORE the first check.
    # Check first, sleep only if the file is genuinely not visible yet.
    while not os.path.exists("video.mp4"):
        time.sleep(10)
    video = moviepy.editor.VideoFileClip("video.mp4")
    process.warning('Extracting Audio')
    audio = video.audio
    audio.write_audiofile("audio.mp3")
    # Close the clips to release the underlying readers/file handles
    # (the original leaked them).
    audio.close()
    video.close()
    process.warning('Trancscribing the Audio')
    model = whisper.load_model('base')
    result = model.transcribe('audio.mp3')
    texts = result['text']
    process.success('Transcription is done')
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    # Return value was never used; call for its upload side effect only.
    vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="vid")
    process.success("Data is securly Uploaded")
def chain(name):
    """Build a ConversationalRetrievalChain over one Pinecone namespace.

    Args:
        name: Pinecone namespace to query ('txt', 'pdf', 'url' or 'vid').

    Returns:
        A langchain ConversationalRetrievalChain backed by an AI21 LLM and a
        top-10 similarity retriever over the "multigpt" index.

    NOTE(review): relies on the module-level Streamlit placeholder ``process``
    being initialised by main() before this is called.
    """
    process.warning("Your Chain is running")
    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    store = vs.pinecone.Pinecone.from_existing_index(
        index_name='multigpt',
        namespace=name,
        embedding=embeddings,
    )
    retriever = store.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    llm = AI21(ai21_api_key=ai21_api_key)
    return chains.ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
def ai(qa, prompt):
    """Run a single-turn query against a retrieval chain.

    A fresh empty chat history is passed on every call, so each question is
    answered independently of previous ones.

    Args:
        qa: a callable chain from chain().
        prompt: the user's question.

    Returns:
        The chain's result dict (contains the "answer" key).

    NOTE(review): relies on the module-level Streamlit placeholder ``process``
    being initialised by main() before this is called.
    """
    result = qa({"question": prompt, "chat_history": []})
    process.success("Search Complete!")
    return result
def intro():
    """Render the landing page with a typewriter-animated app description.

    NOTE(review): relies on the module-level placeholders ``placeholder``,
    ``data`` and ``process`` being initialised by main() before this runs.
    The emoji text below is reproduced verbatim from the original source.
    """
    placeholder.title('____________π¨π»βπ» MINOR PROJECT π¨π»βπ»____________\n')
    data.subheader('π Introducing "KnowledgeHub" Web App! ππ§ ')
    process.write('___________________________________________')
    # Renamed from `intro` — the original local shadowed the function name.
    blurb = ('''
Welcome to the future of knowledge interaction! π With our groundbreaking web app, "KnowledgeHub," you can effortlessly infuse intelligence into our platform through various mediums. ππ»
How It Works:
π File Magic: Upload your knowledge-packed text files or PDFs to seamlessly share insights and wisdom with the world! π
π URL Wizardry: Simply paste a website URL, and watch as the KnowledgeHub transforms online information into a dynamic source of intelligence! π€―
π₯ YouTube Brilliance: Share video insights by dropping those mind-blowing YouTube links! Transforming video content into knowledge gold has never been easier! π
Why use KnowledgeHub:
π Instant Interaction: Say goodbye to static data! Engage with your knowledge instantly and turn information into actionable insights. π
π Universal Accessibility: Access your knowledge from anywhere, anytime, and empower your audience to dive into your insights effortlessly. π
π€ AI-Powered Conversations: Leverage cutting-edge AI for interactive conversations based on your knowledge repository! It's like having a brilliant virtual assistant at your fingertips! π€π‘
π Data-Driven Decisions: Turn raw data into actionable intelligence. Make informed decisions backed by the power of your knowledge repository. π
Embrace the future of knowledge sharing with KnowledgeHub β Where ideas come to life, and intelligence knows no bounds! ππ₯π''')
    slot = st.empty()
    shown = ''
    # Typewriter effect: re-render the growing prefix after each character.
    for ch in blurb:
        shown += ch
        time.sleep(0.005)
        slot.markdown(shown)
def upload():
    """Render the upload page: pick a source type and build the knowledge base.

    Each mode hands the user's input to the matching ingestion helper
    (txtread / pdfread / urlread / scrape). The trailing sleep + message run
    on every rerun, as in the original.

    NOTE(review): relies on the module-level placeholders ``placeholder``,
    ``data`` and ``process`` being initialised by main() before this runs.
    """
    placeholder.title("Let's create the Knowledge Base")
    process.error('Here you will be notified regarding the status of the upload')
    modes = ['', 'TEXT', 'PDF', 'URL', 'VIDEO']
    choice = st.sidebar.radio("Choose your mode", modes)
    if choice == '':
        data.subheader('Choose what type of data you wanna upload')
    elif choice == 'TEXT':
        uploaded_txt = data.file_uploader("Upload your txt file", type="txt")
        if uploaded_txt:
            txtread(uploaded_txt)
    elif choice == 'PDF':
        uploaded_pdf = data.file_uploader("Upload your PDF file", type="pdf")
        if uploaded_pdf:
            pdfread(uploaded_pdf)
    elif choice == 'URL':
        entered_url = data.text_input('Enter the url')
        if entered_url:
            urlread(entered_url)
    elif choice == 'VIDEO':
        video_link = data.text_input('Enter link to the youtube video')
        if video_link:
            scrape(video_link)
    time.sleep(10)
    process.success('You can go to the chat section or upload more data')
def chat():
    """Render the Q&A page: pick a data source and ask questions about it.

    The original had four byte-identical branches differing only in the
    Pinecone namespace and the input label; they are table-driven here so
    the retrieval + typewriter path exists once. Behavior is unchanged,
    including all user-visible strings.

    NOTE(review): relies on the module-level placeholders ``placeholder``,
    ``data`` and ``process`` being initialised by main() before this runs.
    """
    placeholder.title("Let's go!!")
    process.error('Here you will be notified regarding the retrival of your answers')
    page = ['', 'TEXT', 'PDF', 'URL', 'VIDEO']
    choice = st.sidebar.radio("Choose your mode", page)
    # mode -> (namespace, prompt label); strings kept verbatim from the original.
    modes = {
        'TEXT': ('txt', "Ask a question based on the txt file"),
        'PDF': ('pdf', "Ask a question based on the PDF"),
        'URL': ('url', "Ask a question based on the data from the url"),
        'VIDEO': ('vid', "Ask a question from based on the YouTube video"),
    }
    if choice == '':
        data.subheader('Choose from which data you want answers from')
    else:
        name, label = modes[choice]
        query = st.text_input(label, value="")
        if query:
            qa = chain(name)
            result = ai(qa, query)
            _typewrite(result["answer"])


def _typewrite(answer):
    """Stream *answer* into a fresh placeholder one character at a time."""
    slot = st.empty()
    shown = ''
    for ch in answer:
        shown += ch
        time.sleep(0.01)
        slot.markdown(shown)
def main():
    """App entry point: create the shared Streamlit slots and route pages.

    Exposes ``placeholder``, ``data`` and ``process`` as module globals so the
    page/ingestion helpers can write status into fixed screen regions.
    """
    global placeholder, process, data
    placeholder = st.empty()
    data = st.empty()
    process = st.empty()
    # Dict dispatch replaces the original if/elif chain; insertion order
    # preserves the radio option order ['HOME', 'Upload', 'Chat'].
    pages = {'HOME': intro, 'Upload': upload, 'Chat': chat}
    choice = st.sidebar.radio("Choose upload or chat", list(pages))
    pages[choice]()
# Script entry point — Streamlit re-executes the whole module on each rerun.
if __name__ == "__main__":
    main()
# (trailing scraper residue removed)