# Source: Hugging Face Space "Nitish-py/KnowledgeHub" (status: Sleeping).
# File size: 9,468 Bytes

import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain import vectorstores as vs
from langchain import chains
import pinecone
from goose3 import Goose
import streamlit as st
import whisper
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import AI21
from pytube import YouTube
import moviepy.editor
import time


# Load environment variables via python-dotenv before reading any of them.
load_dotenv()

# Pinecone credentials/region and the AI21 key, shared by the functions below.
api_key = os.environ.get('PINECONE_API_KEY')
env = os.environ.get('PINECONE_ENVIRONMENT')
ai21_api_key = os.environ.get('AI21_API_KEY')

# Initialise the Pinecone client once at import time.
pinecone.init(environment=env, api_key=api_key)

def txtread(txt_content):
    """Chunk a plain-text upload and store its embeddings in Pinecone.

    Args:
        txt_content: The text to ingest. May be raw ``bytes``/``bytearray``,
            an already decoded ``str``, or a file-like object exposing
            ``getvalue()`` or ``read()`` (such as the value handed back by
            Streamlit's ``file_uploader``).

    Raises:
        UnicodeDecodeError: If byte content is not valid UTF-8.

    Side effects:
        Writes progress messages to the global ``process`` placeholder and
        uploads the chunks to the ``multigpt`` index, namespace ``txt``.
    """
    # The upload page passes the file_uploader result straight in. That object
    # is file-like, not bytes, so calling .decode() on it directly fails.
    # Normalise every accepted input to a str before splitting.
    if isinstance(txt_content, (str, bytes, bytearray)):
        raw = txt_content
    elif hasattr(txt_content, "getvalue"):
        # getvalue() returns the whole buffer regardless of the read position.
        raw = txt_content.getvalue()
    else:
        raw = txt_content.read()
    texts = raw if isinstance(raw, str) else bytes(raw).decode('utf-8')

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=0,
    )
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")

    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    vs.pinecone.Pinecone.from_texts(
        chunks, embeddings, index_name="multigpt", namespace="txt"
    )
    process.success("Data is securly Uploaded")

def pdfread(pdf):
    """Extract the text of a PDF, chunk it and push the embeddings to Pinecone.

    Args:
        pdf: The PDF source, passed unchanged to ``PdfReader``.

    Side effects:
        Writes progress messages to the global ``process`` placeholder and
        uploads the chunks to the ``multigpt`` index, namespace ``pdf``.
    """
    reader = PdfReader(pdf)
    # Concatenate the text of every page in document order.
    full_text = "".join(page.extract_text() for page in reader.pages)

    splitter = CharacterTextSplitter(separator="\n", chunk_size=4000, chunk_overlap=0)
    pieces = splitter.split_text(full_text)
    process.success("Chunking of the data is done")

    embedder = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    vs.pinecone.Pinecone.from_texts(
        pieces,
        embedder,
        index_name="multigpt",
        namespace="pdf",
    )
    process.success("Data is securly Uploaded")

def urlread(url_path):
    """Fetch a web page, chunk its cleaned text and store it in Pinecone.

    Args:
        url_path: URL of the page to extract.

    Side effects:
        Performs a network fetch through Goose, writes progress messages to
        the global ``process`` placeholder and uploads the chunks to the
        ``multigpt`` index, namespace ``url``.
    """
    # Use Goose as a context manager so its network session is released once
    # the article has been extracted, instead of lingering until garbage
    # collection.
    with Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'}) as g:
        texts = g.extract(url=url_path).cleaned_text

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=2000,
        chunk_overlap=0,
    )
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")

    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    vs.pinecone.Pinecone.from_texts(
        chunks, embeddings, index_name="multigpt", namespace="url"
    )
    process.success("Data is securly Uploaded")

def scrape(vidlink):
    """Download a YouTube video, transcribe its audio and index the transcript.

    Args:
        vidlink: URL of the YouTube video.

    Side effects:
        Writes ``video.mp4`` and ``audio.mp3`` to the working directory,
        writes progress messages to the global ``process`` placeholder and
        uploads the transcript chunks to the ``multigpt`` index, namespace
        ``vid``.
    """
    stream = YouTube(vidlink).streams.get_highest_resolution()
    stream.download(filename='video.mp4')
    process.success('Downloading Video')

    # Test for the file *before* sleeping: the previous sleep-then-check loop
    # always stalled for at least 10 seconds, even when the download had
    # already finished by the time download() returned.
    while not os.path.exists("video.mp4"):
        time.sleep(10)

    video = moviepy.editor.VideoFileClip("video.mp4")
    try:
        process.warning('Extracting Audio')
        audio = video.audio
        audio.write_audiofile("audio.mp3")
    finally:
        # Release the clip's reader handles even if audio extraction fails.
        video.close()

    process.warning('Trancscribing the Audio')
    model = whisper.load_model('base')
    result = model.transcribe('audio.mp3')
    texts = result['text']
    process.success('Transcription is done')

    # NOTE(review): the transcript is split on "\n"; if the transcription
    # contains no newlines this yields a single oversized chunk - confirm.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=0,
    )
    chunks = text_splitter.split_text(texts)
    process.success("Chunking of the data is done")

    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    vs.pinecone.Pinecone.from_texts(
        chunks, embeddings, index_name="multigpt", namespace="vid"
    )
    process.success("Data is securly Uploaded")

def chain(name):
    """Build a conversational retrieval chain over one namespace of the index.

    Args:
        name: Pinecone namespace to query inside the ``multigpt`` index.

    Returns:
        A ``ConversationalRetrievalChain`` backed by the AI21 LLM and a
        similarity retriever that fetches 10 results per query.
    """
    process.warning("Your Chain is running")

    embedder = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    store = vs.pinecone.Pinecone.from_existing_index(
        index_name='multigpt',
        namespace=name,
        embedding=embedder,
    )

    similarity_retriever = store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 10},
    )
    ai21_llm = AI21(ai21_api_key=ai21_api_key)
    return chains.ConversationalRetrievalChain.from_llm(
        llm=ai21_llm,
        retriever=similarity_retriever,
    )

def ai(qa,prompt):
    """Ask the chain a single question, starting from an empty chat history.

    Args:
        qa: Callable chain that accepts a dict with ``question`` and
            ``chat_history`` keys.
        prompt: The user's question.

    Returns:
        Whatever ``qa`` returns for the request.
    """
    request = {"question": prompt, "chat_history": []}
    response = qa(request)
    process.success("Search Complete!")
    return response

def intro():
    """Render the home page and type the welcome text out character by character.

    Uses the global ``placeholder``, ``data`` and ``process`` slots for the
    title, subheader and divider, then reveals the welcome text in a fresh
    ``st.empty()`` slot with a short delay per character.
    """
    placeholder.title('____________👨🏻‍💻 MINOR PROJECT 👨🏻‍💻____________\n')
    data.subheader('🚀 Introducing "KnowledgeHub" Web App! 🌐🧠')
    process.write('___________________________________________')
    welcome_text = '''

Welcome to the future of knowledge interaction! 🚀 With our groundbreaking web app, "KnowledgeHub," you can effortlessly infuse intelligence into our platform through various mediums. 📚💻

How It Works:

📁 File Magic: Upload your knowledge-packed text files or PDFs to seamlessly share insights and wisdom with the world! 🚀

🌐 URL Wizardry: Simply paste a website URL, and watch as the KnowledgeHub transforms online information into a dynamic source of intelligence! 🤯

🎥 YouTube Brilliance: Share video insights by dropping those mind-blowing YouTube links! Transforming video content into knowledge gold has never been easier! 🌟

Why use KnowledgeHub:

🚀 Instant Interaction: Say goodbye to static data! Engage with your knowledge instantly and turn information into actionable insights. 🚀

🌐 Universal Accessibility: Access your knowledge from anywhere, anytime, and empower your audience to dive into your insights effortlessly. 🌍

🤖 AI-Powered Conversations: Leverage cutting-edge AI for interactive conversations based on your knowledge repository! It's like having a brilliant virtual assistant at your fingertips! 🤖💡

📊 Data-Driven Decisions: Turn raw data into actionable intelligence. Make informed decisions backed by the power of your knowledge repository. 📈

Embrace the future of knowledge sharing with KnowledgeHub – Where ideas come to life, and intelligence knows no bounds! 🚀🔥🔍'''
    slot = st.empty()
    # Typewriter effect: re-render an ever longer prefix of the text.
    for shown in range(1, len(welcome_text) + 1):
        time.sleep(0.005)
        slot.markdown(welcome_text[:shown])

def upload():
    """Render the upload page and ingest whatever the user supplies.

    The sidebar radio selects the source type (text file, PDF, URL or YouTube
    video); the matching widget is drawn in the global ``data`` slot and, once
    it has a value, the corresponding reader function is called.

    The closing pause and "upload more data" message are shown only after an
    ingest actually ran. Previously they executed on every rerun, blocking the
    page for 10 seconds and overwriting the status hint even when nothing had
    been uploaded.
    """
    placeholder.title("Let's create the Knowledge Base")
    process.error('Here you will be notified regarding the status of the upload')
    page = ['','TEXT','PDF','URL','VIDEO']
    choice = st.sidebar.radio("Choose your mode",page)

    ingested = False

    if choice=='':
        data.subheader('Choose what type of data you wanna upload')

    elif choice == 'TEXT':
        text = data.file_uploader("Upload your txt file", type="txt")
        if text:
            txtread(text)
            ingested = True

    elif choice == 'PDF':
        pdf = data.file_uploader("Upload your PDF file", type="pdf")
        if pdf:
            pdfread(pdf)
            ingested = True

    elif choice == 'URL':
        url_path = data.text_input('Enter the url')
        if url_path:
            urlread(url_path)
            ingested = True

    elif choice == 'VIDEO':
        link = data.text_input('Enter link to the youtube video')
        if link:
            scrape(link)
            ingested = True

    if ingested:
        # Presumably the pause keeps the reader's final status message visible
        # before it is replaced in the shared ``process`` slot.
        time.sleep(10)
        process.success('You can go to the chat section or upload more data')

def chat():
    """Render the chat page and answer a question from the chosen data source.

    The sidebar radio selects which namespace to query. When the user enters
    a question, a retrieval chain is built for that namespace, the question
    is run through it, and the answer is revealed character by character.

    The four source types share one code path; they differ only in the
    Pinecone namespace and the prompt shown above the input box.
    """
    placeholder.title("Let's go!!")
    process.error('Here you will be notified regarding the retrival of your answers')
    page = ['','TEXT','PDF','URL','VIDEO']
    choice = st.sidebar.radio("Choose your mode",page)

    # Sidebar choice -> (Pinecone namespace, label of the question box).
    sources = {
        'TEXT': ('txt', "Ask a question based on the txt file"),
        'PDF': ('pdf', "Ask a question based on the PDF"),
        'URL': ('url', "Ask a question based on the data from the url"),
        'VIDEO': ('vid', "Ask a question from based on the YouTube video"),
    }

    if choice=='':
        data.subheader('Choose from which data you want answers from')
        return
    if choice not in sources:
        return

    name, label = sources[choice]
    query = st.text_input(label,value="")
    if not query:
        return

    qa=chain(name)
    result=ai(qa,query)

    # Typewriter effect: grow the rendered answer one character at a time.
    ph=st.empty()
    x=''
    for i in result["answer"]:
        x+=i
        time.sleep(0.01)
        ph.markdown(x)
    


def main():
    """Create the shared page slots and route to the page picked in the sidebar.

    Three ``st.empty()`` slots are created in order (title, data, status) and
    published as the globals ``placeholder``, ``data`` and ``process`` so the
    page functions can write into them.
    """
    global placeholder, process, data
    # Creation order fixes the on-page order: title, then data, then status.
    placeholder, data, process = st.empty(), st.empty(), st.empty()

    pages = {'HOME': intro, 'Upload': upload, 'Chat': chat}
    choice = st.sidebar.radio("Choose upload or chat", list(pages))

    render = pages.get(choice)
    if render is not None:
        render()

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()