AamirAli123 committed on
Commit
4229477
·
verified ·
1 Parent(s): 0d7efba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -21
app.py CHANGED
@@ -12,9 +12,15 @@ from langchain.memory import ConversationBufferMemory
12
  from langchain.llms import HuggingFaceHub
13
  from pathlib import Path
14
  import chromadb
 
 
 
 
 
 
15
  load_dotenv()
16
  huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
17
-
18
  # default_persist_directory = './chroma_HF/'
19
  list_llm = ["mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.1", \
20
  "google/gemma-7b-it","google/gemma-2b-it", \
@@ -23,7 +29,16 @@ list_llm = ["mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instru
23
  "google/flan-t5-xxl"
24
  ]
25
  list_llm_simple = [os.path.basename(llm) for llm in list_llm]
26
-
 
 
 
 
 
 
 
 
 
27
  # Load PDF document and create doc splits
28
  def load_doc(list_file_path, chunk_size, chunk_overlap):
29
  # Processing for one document only
@@ -37,18 +52,6 @@ def load_doc(list_file_path, chunk_size, chunk_overlap):
37
  doc_splits = text_splitter.split_documents(pages)
38
  return doc_splits
39
 
40
- def load_doc_for_openai(list_file_path):
41
- # Processing for one document only
42
- loaders = [PyPDFLoader(x) for x in list_file_path]
43
- pages = []
44
- for loader in loaders:
45
- pages.extend(loader.load())
46
- text_splitter = RecursiveCharacterTextSplitter(
47
- chunk_size = 600,
48
- chunk_overlap = 40)
49
- doc_splits = text_splitter.split_documents(pages)
50
- return doc_splits
51
-
52
  # Create vector database
53
  def create_db(splits, collection_name):
54
  embedding = HuggingFaceEmbeddings()
@@ -117,9 +120,15 @@ def initialize_llmchain(llm_model, temperature, max_tokens, top_k, vector_db, pr
117
 
118
 
119
  # Initialize database
120
- def initialize_database(list_file_obj, chunk_size, chunk_overlap, vector_db, progress = gr.Progress()):
121
- # Create list of documents (when valid)
122
- list_file_path = [x.name for x in list_file_obj if x is not None]
 
 
 
 
 
 
123
  # Create collection_name for vector database
124
  progress(0.1, desc="Creating collection name...")
125
  collection_name = Path(list_file_path[0]).stem
@@ -142,7 +151,7 @@ def initialize_database(list_file_obj, chunk_size, chunk_overlap, vector_db, pro
142
  progress(0.7, desc="Generating vector database...")
143
  # global vector_db
144
  vector_db = create_db(doc_splits, collection_name)
145
- return vector_db, collection_name, "Complete!"
146
 
147
 
148
  def re_initialize_LLM(llm_option, llm_temperature, max_tokens, top_k, vector_db):
@@ -195,6 +204,15 @@ def demo():
195
  with gr.Row():
196
  with gr.Column():
197
  document = gr.Files(file_count="multiple", file_types=["pdf"], interactive=True, label="Upload your PDF documents (single or multiple)")
 
 
 
 
 
 
 
 
 
198
  with gr.Row():
199
  db_btn = gr.Radio(["ChromaDB"], label="Vector database type", value = "ChromaDB", type="index", info="Choose your vector database", visible = False)
200
  with gr.Accordion("Advanced options - Document text splitter", open=False, visible = False):
@@ -203,7 +221,7 @@ def demo():
203
  with gr.Row():
204
  slider_chunk_overlap = gr.Slider(minimum = 10, maximum = 200, value=40, step=10, label="Chunk overlap", info="Chunk overlap", interactive=True, visible = False)
205
  llm_btn = gr.Radio(list_llm_simple, label = "LLM models", type = "index", info = "Choose your LLM model")
206
- db_progress = gr.Textbox(label="Vector database initialization", value="None")
207
  with gr.Row():
208
  submit_file = gr.Button("Submit File")
209
  with gr.Row():
@@ -223,8 +241,8 @@ def demo():
223
  # Preprocessing events
224
  #upload_btn.upload(upload_file, inputs=[upload_btn], outputs=[document])
225
  submit_file.click(initialize_database, \
226
- inputs=[document, slider_chunk_size, slider_chunk_overlap, vector_db], \
227
- outputs = [vector_db, collection_name, db_progress])
228
  llm_btn.change(
229
  re_initialize_LLM, \
230
  inputs = [llm_btn, slider_temperature, slider_maxtokens, slider_topk, vector_db], \
 
12
  from langchain.llms import HuggingFaceHub
13
  from pathlib import Path
14
  import chromadb
15
+ # Later Packages
16
+ from getpass import getpass
17
+
18
+ import weasyprint
19
+ import matplotlib.pyplot as plt
20
+ from langchain.document_loaders import PyPDFDirectoryLoader
21
  load_dotenv()
22
  huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
23
+ openai_key = os.getenv("OPEN_API_KEY")
24
  # default_persist_directory = './chroma_HF/'
25
  list_llm = ["mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.1", \
26
  "google/gemma-7b-it","google/gemma-2b-it", \
 
29
  "google/flan-t5-xxl"
30
  ]
31
  list_llm_simple = [os.path.basename(llm) for llm in list_llm]
32
+ # Create PDF from URL
33
+ def create_pdf_from_url(url):
34
+ pdf = weasyprint.HTML(url).write_pdf()
35
+ output_dir = "pdfDir"
36
+ if not os.path.exists(output_dir):
37
+ os.makedirs(output_dir)
38
+ file_path = os.path.join(output_dir,'url_pdf.pdf')
39
+ with open(file_path,'wb') as f:
40
+ f.write(pdf)
41
+ return file_path
42
  # Load PDF document and create doc splits
43
  def load_doc(list_file_path, chunk_size, chunk_overlap):
44
  # Processing for one document only
 
52
  doc_splits = text_splitter.split_documents(pages)
53
  return doc_splits
54
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  # Create vector database
56
  def create_db(splits, collection_name):
57
  embedding = HuggingFaceEmbeddings()
 
120
 
121
 
122
  # Initialize database
123
+ def initialize_database(list_file_obj, chunk_size, chunk_overlap, vector_db, url, progress = gr.Progress()):
124
+ if url != "":
125
+ file_path = create_pdf_from_url(url)
126
+ list_file_obj = []
127
+ list_file_obj.append(file_path)
128
+ list_file_path = list_file_obj
129
+ else:
130
+ # Create list of documents (when valid)
131
+ list_file_path = [x.name for x in list_file_obj if x is not None]
132
  # Create collection_name for vector database
133
  progress(0.1, desc="Creating collection name...")
134
  collection_name = Path(list_file_path[0]).stem
 
151
  progress(0.7, desc="Generating vector database...")
152
  # global vector_db
153
  vector_db = create_db(doc_splits, collection_name)
154
+ return vector_db, collection_name, gr.update(value = ""), "Complete!"
155
 
156
 
157
  def re_initialize_LLM(llm_option, llm_temperature, max_tokens, top_k, vector_db):
 
204
  with gr.Row():
205
  with gr.Column():
206
  document = gr.Files(file_count="multiple", file_types=["pdf"], interactive=True, label="Upload your PDF documents (single or multiple)")
207
+ with gr.Row():
208
+ gr.Markdown(
209
+ '''
210
+ <div style="text-align:center;">
211
+ <span style="font-size:2em; font-weight:bold;">OR</span>
212
+ </div>
213
+ ''')
214
+ with gr.Row():
215
+ url = gr.Textbox(placeholder = "Enter your URL Here")
216
  with gr.Row():
217
  db_btn = gr.Radio(["ChromaDB"], label="Vector database type", value = "ChromaDB", type="index", info="Choose your vector database", visible = False)
218
  with gr.Accordion("Advanced options - Document text splitter", open=False, visible = False):
 
221
  with gr.Row():
222
  slider_chunk_overlap = gr.Slider(minimum = 10, maximum = 200, value=40, step=10, label="Chunk overlap", info="Chunk overlap", interactive=True, visible = False)
223
  llm_btn = gr.Radio(list_llm_simple, label = "LLM models", type = "index", info = "Choose your LLM model")
224
+ db_progres = gr.Textbox(label="Vector database initialization", value="None")
225
  with gr.Row():
226
  submit_file = gr.Button("Submit File")
227
  with gr.Row():
 
241
  # Preprocessing events
242
  #upload_btn.upload(upload_file, inputs=[upload_btn], outputs=[document])
243
  submit_file.click(initialize_database, \
244
+ inputs=[document, slider_chunk_size, slider_chunk_overlap, vector_db, url], \
245
+ outputs = [vector_db, collection_name, url, db_progres])
246
  llm_btn.change(
247
  re_initialize_LLM, \
248
  inputs = [llm_btn, slider_temperature, slider_maxtokens, slider_topk, vector_db], \