RAG-Vereine

Sleeping

mgokg committed on Nov 17, 2024

Commit

67bfd1d

verified ·

1 Parent(s): 32813a7

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ from chromadb.utils import embedding_functions
 from PyPDF2 import PdfReader
 from gradio_client import Client
 from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT
 # Initialisiere ChromaDB
 client_chroma = chromadb.Client()
@@ -46,26 +46,34 @@ def ask_llm(llm_prompt_input):
     return result
 def process_pdf(file):
-    # Lese den PDF-Inhalt
     pdf_reader = PdfReader(file.name)
     text = ""
     for page in pdf_reader.pages:
         text += page.extract_text()
-    # Erstelle Embedding
-    embedding = embedding_function([text])[0]
-    # Speichere das PDF in ChromaDB
-    collection.add(
-        documents=[text],
-        metadatas=[{"filename": file.name}],
-        ids=[file.name]  # Verwende den Dateinamen als ID
     )
-    return f"PDF wurde erfolgreich in ChromaDB gespeichert."
 def search_similar_documents(prompt):
     # Erstelle Embedding für den Prompt
     query_embedding = embedding_function([prompt])[0]

 from PyPDF2 import PdfReader
 from gradio_client import Client
 from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 # Initialisiere ChromaDB
 client_chroma = chromadb.Client()
     return result
 def process_pdf(file):
+    # Read the PDF content
     pdf_reader = PdfReader(file.name)
     text = ""
     for page in pdf_reader.pages:
         text += page.extract_text()
+    # Split the text into smaller chunks
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,  # Adjust the chunk size as needed
+        chunk_overlap=100  # Adjust the overlap as needed
     )
+    chunks = text_splitter.split_text(text)
+    # Create embeddings for each chunk
+    embeddings = embedding_function(chunks)
+    # Store each chunk in ChromaDB
+    for i, chunk in enumerate(chunks):
+        collection.add(
+            documents=[chunk],
+            metadatas=[{"filename": file.name, "chunk_id": i}],
+            ids=[f"{file.name}_{i}"]  # Use a unique ID for each chunk
+        )
+# Example usage
+# process_pdf(your_file_object)
 def search_similar_documents(prompt):
     # Erstelle Embedding für den Prompt
     query_embedding = embedding_function([prompt])[0]