mgokg committed on
Commit
67bfd1d
·
verified ·
1 Parent(s): 32813a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -12
app.py CHANGED
@@ -4,7 +4,7 @@ from chromadb.utils import embedding_functions
4
  from PyPDF2 import PdfReader
5
  from gradio_client import Client
6
  from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT
7
-
8
 
9
  # Initialisiere ChromaDB
10
  client_chroma = chromadb.Client()
@@ -46,26 +46,34 @@ def ask_llm(llm_prompt_input):
46
 
47
 
48
  return result
49
-
50
  def process_pdf(file):
51
- # Lese den PDF-Inhalt
52
  pdf_reader = PdfReader(file.name)
53
  text = ""
54
  for page in pdf_reader.pages:
55
  text += page.extract_text()
56
 
57
- # Erstelle Embedding
58
- embedding = embedding_function([text])[0]
59
-
60
- # Speichere das PDF in ChromaDB
61
- collection.add(
62
- documents=[text],
63
- metadatas=[{"filename": file.name}],
64
- ids=[file.name] # Verwende den Dateinamen als ID
65
  )
 
 
 
 
66
 
67
- return f"PDF wurde erfolgreich in ChromaDB gespeichert."
 
 
 
 
 
 
68
 
 
 
69
  def search_similar_documents(prompt):
70
  # Erstelle Embedding für den Prompt
71
  query_embedding = embedding_function([prompt])[0]
 
4
  from PyPDF2 import PdfReader
5
  from gradio_client import Client
6
  from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
 
9
  # Initialisiere ChromaDB
10
  client_chroma = chromadb.Client()
 
46
 
47
 
48
  return result
49
+
50
def process_pdf(file):
    """Read a PDF, split its text into overlapping chunks, embed each chunk,
    and store the chunks in the ChromaDB collection.

    Args:
        file: An uploaded-file object exposing a ``.name`` path attribute
              (e.g. a Gradio file object).

    Returns:
        A status message (German, kept from the pre-chunking version) so the
        Gradio caller gets feedback instead of ``None``.
    """
    # Read the PDF content
    pdf_reader = PdfReader(file.name)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() may return None for pages with no extractable text
        # (e.g. scanned images) — guard against TypeError on concatenation.
        text += page.extract_text() or ""

    # Split the text into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,   # Adjust the chunk size as needed
        chunk_overlap=100  # Adjust the overlap as needed
    )
    chunks = text_splitter.split_text(text)

    # Create embeddings for each chunk
    embeddings = embedding_function(chunks)

    # Store each chunk in ChromaDB. Pass the precomputed embeddings so they
    # are actually used (previously they were computed and then discarded,
    # leaving Chroma to re-embed the documents itself).
    for i, chunk in enumerate(chunks):
        collection.add(
            documents=[chunk],
            embeddings=[embeddings[i]],
            metadatas=[{"filename": file.name, "chunk_id": i}],
            ids=[f"{file.name}_{i}"]  # Use a unique ID for each chunk
        )

    # Restore the success message the pre-chunking version returned; the
    # chunking rewrite had dropped the return entirely.
    return "PDF wurde erfolgreich in ChromaDB gespeichert."

# Example usage
# process_pdf(your_file_object)
77
  def search_similar_documents(prompt):
78
  # Erstelle Embedding für den Prompt
79
  query_embedding = embedding_function([prompt])[0]