Nitish-py committed on
Commit
559c3d3
·
1 Parent(s): b6983e3

Upload 4 files

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +4 -10
  3. app.py +264 -0
  4. requirements.txt +12 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
README.md CHANGED
@@ -1,12 +1,6 @@
1
- ---
2
  title: KnowledgeHub
3
- emoji: 🌖
4
- colorFrom: red
5
- colorTo: purple
6
  sdk: streamlit
7
- sdk_version: 1.28.2
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
  title: KnowledgeHub
 
 
 
2
  sdk: streamlit
3
+ emoji: 🚀
4
+ colorFrom: purple
5
+ colorTo: indigo
6
+ pinned: true
 
 
app.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain import vectorstores as vs
6
+ from langchain import chains
7
+ import pinecone
8
+ from goose3 import Goose
9
+ import streamlit as st
10
+ import whisper
11
+ from langchain.embeddings import HuggingFaceEmbeddings
12
+ from langchain.llms import AI21
13
+ from pytube import YouTube
14
+ import moviepy.editor
15
+ import time
16
+
17
+
18
# Pull settings from a local .env file (if any) into the process environment
# before they are looked up below.
load_dotenv()

# Credentials shared by every function in this module.
api_key = os.getenv('PINECONE_API_KEY')
env = os.getenv('PINECONE_ENVIRONMENT')
ai21_api_key = os.getenv('AI21_API_KEY')

# Configure the Pinecone client once at import time.
pinecone.init(
    api_key=api_key,
    environment=env,
)
23
+
24
def txtread(txt_content):
    """Chunk a plain-text upload, embed it and store it in the "txt" namespace.

    Args:
        txt_content: Raw ``bytes`` of the text file, or a binary file-like
            object exposing ``read()``. ``upload()`` hands over the object
            returned by the Streamlit file uploader, which is file-like and
            has no ``decode`` method of its own.

    Status messages are written to the module-level ``process`` placeholder.
    """
    # Accept both bytes and file-like objects; calling ``.decode`` directly on
    # an uploader object raises AttributeError.
    raw = txt_content.read() if hasattr(txt_content, "read") else txt_content
    # "utf-8-sig" drops a leading BOM; undecodable bytes are replaced rather
    # than aborting the whole upload.
    texts = raw.decode('utf-8-sig', errors='replace')

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    if not chunks:
        # Nothing to index: do not claim that data was uploaded.
        process.error("No text could be read from the uploaded file")
        return
    process.success("Chunking of the data is done")

    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="txt")
    process.success("Data is securly Uploaded")
38
+
39
def pdfread(pdf):
    """Extract the text of a PDF, embed it and store it in the "pdf" namespace.

    Args:
        pdf: Path or binary file-like object accepted by ``PdfReader``.

    Status messages are written to the module-level ``process`` placeholder.
    """
    pdf_reader = PdfReader(pdf)
    # Pages without extractable text (e.g. scanned images) may yield nothing;
    # ``or ""`` keeps the join from failing on a ``None`` result.
    texts = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=4000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    if not chunks:
        # Nothing to index: do not claim that data was uploaded.
        process.error("No text could be extracted from the PDF")
        return
    process.success("Chunking of the data is done")

    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="pdf")
    process.success("Data is securly Uploaded")
55
+
56
def urlread(url_path):
    """Fetch a web page, embed its main text and store it in the "url" namespace.

    Args:
        url_path: Address of the page to extract.

    Status messages are written to the module-level ``process`` placeholder.
    """
    # Use the extractor as a context manager so its resources are released
    # once the article text has been pulled out.
    with Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'}) as g:
        texts = g.extract(url=url_path).cleaned_text

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=2000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts or "")
    if not chunks:
        # Nothing to index: do not claim that data was uploaded.
        process.error("No text could be extracted from the url")
        return
    process.success("Chunking of the data is done")

    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="url")
    process.success("Data is securly Uploaded")
70
+
71
def scrape(vidlink):
    """Download a YouTube video, transcribe it and store it in the "vid" namespace.

    The video is saved as ``video.mp4``, its audio track is written to
    ``audio.mp3``, the audio is transcribed with the Whisper ``base`` model
    and the transcript is chunked, embedded and uploaded.

    Args:
        vidlink: URL of the YouTube video.

    Status messages are written to the module-level ``process`` placeholder.
    """
    process.warning('Downloading Video')
    stream = YouTube(vidlink).streams.get_highest_resolution()
    if stream is None:
        process.error('No downloadable stream was found for this video')
        return
    # The download call blocks until the file is written, so no polling for
    # the file is needed afterwards.
    stream.download(filename='video.mp4')

    video = moviepy.editor.VideoFileClip("video.mp4")
    try:
        process.warning('Extracting Audio')
        audio = video.audio
        if audio is None:
            process.error('The video has no audio track to transcribe')
            return
        audio.write_audiofile("audio.mp3")
    finally:
        # Release the reader held open by the clip.
        video.close()

    process.warning('Trancscribing the Audio')
    model = whisper.load_model('base')
    result = model.transcribe('audio.mp3')
    texts = result['text']
    process.success('Transcription is done')

    # The transcript is running text that normally contains no line breaks,
    # so splitting on "\n" would leave it as one oversized chunk; split on
    # spaces so the 1000-character chunk size is actually honoured.
    text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=1000,
        chunk_overlap=0)
    chunks = text_splitter.split_text(texts)
    if not chunks:
        # Nothing to index: do not claim that data was uploaded.
        process.error('The transcription produced no text')
        return
    process.success("Chunking of the data is done")

    embeddings = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    process.warning("Starting Upload of the vector data in the Pinecone VectoreDB")
    vs.pinecone.Pinecone.from_texts(chunks, embeddings, index_name="multigpt", namespace="vid")
    process.success("Data is securly Uploaded")
100
+
101
def chain(name):
    """Build a conversational retrieval chain over one namespace of the index.

    Args:
        name: Namespace inside the "multigpt" index to search.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``.
    """
    process.warning("Your Chain is running")

    embedder = HuggingFaceEmbeddings()
    pinecone.init(api_key=api_key, environment=env)
    store = vs.pinecone.Pinecone.from_existing_index(
        index_name='multigpt',
        namespace=name,
        embedding=embedder,
    )

    # Retrieve the ten most similar chunks for every question.
    return chains.ConversationalRetrievalChain.from_llm(
        llm=AI21(ai21_api_key=ai21_api_key),
        retriever=store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 10},
        ),
    )
110
+
111
def ai(qa,prompt):
    """Run one question through the chain and return its result.

    Args:
        qa: Callable chain, as returned by ``chain()``.
        prompt: The user's question.

    Returns:
        Whatever ``qa`` returns for the question; every call is made with an
        empty chat history.
    """
    answer = qa({"question": prompt, "chat_history": []})
    process.success("Search Complete!")
    return answer
116
+
117
def intro():
    """Render the home page and reveal the introduction text one character at a time."""
    placeholder.title('____________πŸ‘¨πŸ»β€πŸ’» MINOR PROJECT πŸ‘¨πŸ»β€πŸ’»____________\n')
    data.subheader('πŸš€ Introducing "KnowledgeHub" Web App! 🌐🧠')
    process.write('___________________________________________')
    welcome_text = '''

Welcome to the future of knowledge interaction! πŸš€ With our groundbreaking web app, "KnowledgeHub," you can effortlessly infuse intelligence into our platform through various mediums. πŸ“šπŸ’»

How It Works:

πŸ“ File Magic: Upload your knowledge-packed text files or PDFs to seamlessly share insights and wisdom with the world! πŸš€

🌐 URL Wizardry: Simply paste a website URL, and watch as the KnowledgeHub transforms online information into a dynamic source of intelligence! 🀯

πŸŽ₯ YouTube Brilliance: Share video insights by dropping those mind-blowing YouTube links! Transforming video content into knowledge gold has never been easier! 🌟

Why use KnowledgeHub:

πŸš€ Instant Interaction: Say goodbye to static data! Engage with your knowledge instantly and turn information into actionable insights. πŸš€

🌐 Universal Accessibility: Access your knowledge from anywhere, anytime, and empower your audience to dive into your insights effortlessly. 🌍

πŸ€– AI-Powered Conversations: Leverage cutting-edge AI for interactive conversations based on your knowledge repository! It's like having a brilliant virtual assistant at your fingertips! πŸ€–πŸ’‘

πŸ“Š Data-Driven Decisions: Turn raw data into actionable intelligence. Make informed decisions backed by the power of your knowledge repository. πŸ“ˆ

Embrace the future of knowledge sharing with KnowledgeHub – Where ideas come to life, and intelligence knows no bounds! πŸš€πŸ”₯πŸ”'''
    typing_area = st.empty()
    # Typewriter effect: redraw an ever longer prefix of the text.
    for shown in range(1, len(welcome_text) + 1):
        time.sleep(0.01)
        typing_area.markdown(welcome_text[:shown])
150
+
151
def upload():
    """Render the upload page and ingest whatever source the user provides.

    The sidebar selects the kind of source (text file, PDF, URL or YouTube
    video); the matching widget is shown in the ``data`` placeholder and its
    value is passed to the corresponding ingestion function. Status messages
    go to the ``process`` placeholder.
    """
    placeholder.title("Let's create the Knowledge Base")
    process.error('Here you will be notified regarding the status of the upload')
    page = ['','TEXT','PDF','URL','VIDEO']
    choice = st.sidebar.radio("Choose your mode",page)

    # Set once an ingestion function has actually been run in this pass.
    ingested = False

    if choice == '':
        data.subheader('Choose what type of data you wanna upload')

    elif choice == 'TEXT':
        text = data.file_uploader("Upload your txt file", type="txt")
        if text:
            # txtread decodes bytes, so hand over the file's content rather
            # than the uploader object itself.
            txtread(text.getvalue())
            ingested = True

    elif choice == 'PDF':
        pdf = data.file_uploader("Upload your PDF file", type="pdf")
        if pdf:
            pdfread(pdf)
            ingested = True

    elif choice == 'URL':
        url_path = data.text_input('Enter the url')
        if url_path:
            urlread(url_path)
            ingested = True

    elif choice == 'VIDEO':
        link = data.text_input('Enter link to the youtube video')
        if link:
            scrape(link)
            ingested = True

    if ingested:
        # Leave the last status message visible briefly before replacing it
        # with the closing hint.
        time.sleep(3)
        process.success('You can go to the chat section or upload more data')
182
+
183
def chat():
    """Render the chat page and answer a question from the chosen data source.

    The sidebar selects which namespace to query. Once the user enters a
    question, a retrieval chain is built for that namespace, the question is
    run through it and the answer is revealed one character at a time.
    """
    placeholder.title("Let's go!!")
    process.error('Here you will be notified regarding the retrival of your answers')
    page = ['','TEXT','PDF','URL','VIDEO']
    choice = st.sidebar.radio("Choose your mode",page)

    if choice == '':
        data.subheader('Choose from which data you want answers from')
        return

    # Sidebar choice -> (index namespace, label of the question box). The four
    # modes differ only in these two values.
    sources = {
        'TEXT': ('txt', "Ask a question based on the txt file"),
        'PDF': ('pdf', "Ask a question based on the PDF"),
        'URL': ('url', "Ask a question based on the data from the url"),
        'VIDEO': ('vid', "Ask a question from based on the YouTube video"),
    }
    name, label = sources[choice]

    query = st.text_input(label, value="")
    if not query:
        return

    qa = chain(name)
    result = ai(qa, query)

    # Typewriter effect: redraw the growing answer after every character.
    ph = st.empty()
    shown = ''
    for char in result["answer"]:
        shown += char
        time.sleep(0.01)
        ph.markdown(shown)
244
+
245
+
246
+
247
def main():
    """Create the shared page placeholders and show the page picked in the sidebar."""
    # The page functions write to these three module-level placeholders.
    global placeholder, process, data
    placeholder = st.empty()
    data = st.empty()
    process = st.empty()

    # Sidebar entry -> function that renders the page.
    pages = {
        'HOME': intro,
        'Upload': upload,
        'Chat': chat,
    }
    selected = st.sidebar.radio("Choose upload or chat", list(pages))
    render = pages.get(selected)
    if render is not None:
        render()


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ai21
2
+ goose3
3
+ pinecone-client
4
+ pydantic==1.10.12
5
+ langchain==0.0.278
6
+ PyPDF2
7
+ python-dotenv
8
+ streamlit
9
+ moviepy
10
+ pytube
11
+ git+https://github.com/openai/whisper.git
12
+ sentence_transformers