RAG-Vereine

Sleeping

App Files Community

mgokg committed on Dec 6, 2024

Commit

fdeebc3

verified ·

1 Parent(s): a1a20d2

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -70

app.py CHANGED Viewed

@@ -3,8 +3,6 @@ import chromadb
 from chromadb.utils import embedding_functions
 from PyPDF2 import PdfReader
 from gradio_client import Client
-#from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT #is needed for persistent client
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 import speech_recognition as sr
 import groq
 import os
@@ -80,23 +78,15 @@ def process_pdf(file):
     for page in pdf_reader.pages:
         text += page.extract_text()
-    # Split the text into smaller chunks
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=300,  # Adjust the chunk size as needed
-        chunk_overlap=10  # Adjust the overlap as needed
-    )
-    chunks = text_splitter.split_text(text)
-    # Create embeddings for each chunk
-    #embeddings = embedding_function(chunks)
-    embeddings = embedding_function(text)
-    # Store each chunk in ChromaDB
-    for i, chunk in enumerate(text):
-        collection.add(
-            documents=[chunk],
-            metadatas=[{"filename": file.name, "chunk_id": i}],
-            ids=[f"{file.name}_{i}"]  # Use a unique ID for each chunk
-        )
     return f"PDF wurde erfolgreich in ChromaDB gespeichert."
 def search_similar_documents(prompt):
@@ -114,55 +104,6 @@ def search_similar_documents(prompt):
     for i, doc in enumerate(results["documents"][0]):
         metadata = results["metadatas"][0][i]
         filename = metadata["filename"]
-        formatted_results.append(f"{doc}\n")
-    ergebnis = f"{''.join(formatted_results)}"
-    ergebnis = gr.Markdown(ergebnis)
-    return ergebnis
-with gr.Blocks() as chat:
-    gr.Markdown("### Ask the RKI Files", elem_classes="tab-header")
-    with gr.Row():
-        llm_output = gr.Textbox(label="LLM Answer")
-    with gr.Row():
-        llm_prompt_input = gr.Textbox(label="Frage an das LLM", placeholder="Gib eine Frage ein")
-        llm_submit_button = gr.Button("send")
-    llm_submit_button.click(ask_llm, inputs=llm_prompt_input, outputs=llm_output)
-with gr.Blocks() as upload:
-    gr.Markdown("### File upload", elem_classes="tab-header")
-    with gr.Row():
-        file_input = gr.File(label="Wähle eine PDF-Datei aus", type="filepath")
-        upload_output = gr.Textbox(label="Upload Status")
-    with gr.Row():
-        submit_button = gr.Button("upload")
-    submit_button.click(process_pdf, inputs=file_input, outputs=upload_output)
-with gr.Blocks() as suche:
-    gr.Markdown("### Datenbank durchsuchen", elem_classes="tab-header")
-    with gr.Row():
-        prompt_input = gr.Textbox(label="Suche nach ähnlichen Dokumenten", placeholder="Gib einen Suchbegriff ein")
-    with gr.Row():
-        search_output = gr.Textbox(label="Ähnliche Dokumente")
-    with gr.Row():
-        search_button = gr.Button("Suchen")
-    search_button.click(search_similar_documents, inputs=prompt_input, outputs=search_output)
-#optional, Spracheingabe
-with gr.Blocks() as speech:
-    gr.Markdown("### Highspeed Voicebot", elem_classes="tab-header")
-    with gr.Row():
-        sr_outputs = gr.Textbox(label="Antwort")
-    with gr.Row():
-        sr_inputs = gr.Microphone(type="filepath")
-    sr_inputs.change(transcribe_audio, inputs=sr_inputs, outputs=sr_outputs)
-# Erstelle die Gradio-Schnittstelle
-with gr.Blocks() as demo:
-    gr.TabbedInterface(
-        [chat, upload, suche],
-        ["Chat", "Upload", "Suche"]
-    )
-# Starte die Gradio-Anwendung
-demo.launch()

 from chromadb.utils import embedding_functions
 from PyPDF2 import PdfReader
 from gradio_client import Client
 import speech_recognition as sr
 import groq
 import os
     for page in pdf_reader.pages:
         text += page.extract_text()
+    # Create embedding for the entire text
+    embedding = embedding_function([text])[0]
+    # Store the entire text in ChromaDB
+    collection.add(
+        documents=[text],
+        metadatas=[{"filename": file.name}],
+        ids=[file.name]  # Use the filename as the ID
+    )
     return f"PDF wurde erfolgreich in ChromaDB gespeichert."
 def search_similar_documents(prompt):
     for i, doc in enumerate(results["documents"][0]):
         metadata = results["metadatas"][0][i]
         filename = metadata["filename"]
+        formatted_results.append(f"### Dokument {i+1} (Dateiname: {filename})\n{doc}\n")
+    return gr.Markdown(''.join(formatted_results))