mgokg committed on
Commit
137a6a3
·
verified ·
1 Parent(s): c32efda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -7
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import chromadb
3
  from chromadb.utils import embedding_functions
 
4
  from PyPDF2 import PdfReader
5
  from gradio_client import Client
6
  import speech_recognition as sr
@@ -78,15 +79,23 @@ def process_pdf(file):
78
  for page in pdf_reader.pages:
79
  text += page.extract_text()
80
 
81
- embeddings = embedding_function([text])
82
-
83
- # Store the entire text in ChromaDB
84
- collection.add(
85
- documents=[text],
86
- metadatas=[{"filename": file.name}],
87
- ids=[file.name] # Use the filename as the unique ID
88
  )
 
 
 
 
89
 
 
 
 
 
 
 
 
90
  return f"PDF wurde erfolgreich in ChromaDB gespeichert."
91
 
92
  def search_similar_documents(prompt):
 
1
  import gradio as gr
2
  import chromadb
3
  from chromadb.utils import embedding_functions
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from PyPDF2 import PdfReader
6
  from gradio_client import Client
7
  import speech_recognition as sr
 
79
  for page in pdf_reader.pages:
80
  text += page.extract_text()
81
 
82
+ # Split the text into smaller chunks
83
+ text_splitter = RecursiveCharacterTextSplitter(
84
+ chunk_size=1000, # Adjust the chunk size as needed
85
+ chunk_overlap=100 # Adjust the overlap as needed
 
 
 
86
  )
87
+ chunks = text_splitter.split_text(text)
88
+
89
+ # Create embeddings for each chunk
90
+ embeddings = embedding_function(chunks)
91
 
92
+ # Store each chunk in ChromaDB
93
+ for i, chunk in enumerate(chunks):
94
+ collection.add(
95
+ documents=[chunk],
96
+ metadatas=[{"filename": file.name, "chunk_id": i}],
97
+ ids=[f"{file.name}_{i}"] # Use a unique ID for each chunk
98
+ )
99
  return f"PDF wurde erfolgreich in ChromaDB gespeichert."
100
 
101
  def search_similar_documents(prompt):