Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
import chromadb
|
3 |
from chromadb.utils import embedding_functions
|
|
|
4 |
from PyPDF2 import PdfReader
|
5 |
from gradio_client import Client
|
6 |
import speech_recognition as sr
|
@@ -78,15 +79,23 @@ def process_pdf(file):
|
|
78 |
for page in pdf_reader.pages:
|
79 |
text += page.extract_text()
|
80 |
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
documents=[text],
|
86 |
-
metadatas=[{"filename": file.name}],
|
87 |
-
ids=[file.name] # Use the filename as the unique ID
|
88 |
)
|
|
|
|
|
|
|
|
|
89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
return f"PDF wurde erfolgreich in ChromaDB gespeichert."
|
91 |
|
92 |
def search_similar_documents(prompt):
|
|
|
1 |
import gradio as gr
|
2 |
import chromadb
|
3 |
from chromadb.utils import embedding_functions
|
4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
from PyPDF2 import PdfReader
|
6 |
from gradio_client import Client
|
7 |
import speech_recognition as sr
|
|
|
79 |
for page in pdf_reader.pages:
|
80 |
text += page.extract_text()
|
81 |
|
82 |
+
# Split the text into smaller chunks
|
83 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
84 |
+
chunk_size=1000, # Adjust the chunk size as needed
|
85 |
+
chunk_overlap=100 # Adjust the overlap as needed
|
|
|
|
|
|
|
86 |
)
|
87 |
+
chunks = text_splitter.split_text(text)
|
88 |
+
|
89 |
+
# Create embeddings for each chunk
|
90 |
+
embeddings = embedding_function(chunks)
|
91 |
|
92 |
+
# Store each chunk in ChromaDB
|
93 |
+
for i, chunk in enumerate(chunks):
|
94 |
+
collection.add(
|
95 |
+
documents=[chunk],
|
96 |
+
metadatas=[{"filename": file.name, "chunk_id": i}],
|
97 |
+
ids=[f"{file.name}_{i}"] # Use a unique ID for each chunk
|
98 |
+
)
|
99 |
return f"PDF wurde erfolgreich in ChromaDB gespeichert."
|
100 |
|
101 |
def search_similar_documents(prompt):
|