mgokg committed on
Commit
fdeebc3
·
verified ·
1 Parent(s): a1a20d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -70
app.py CHANGED
@@ -3,8 +3,6 @@ import chromadb
3
  from chromadb.utils import embedding_functions
4
  from PyPDF2 import PdfReader
5
  from gradio_client import Client
6
- #from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT #is needed for persistent client
7
- from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  import speech_recognition as sr
9
  import groq
10
  import os
@@ -80,23 +78,15 @@ def process_pdf(file):
80
  for page in pdf_reader.pages:
81
  text += page.extract_text()
82
 
83
- # Split the text into smaller chunks
84
- text_splitter = RecursiveCharacterTextSplitter(
85
- chunk_size=300, # Adjust the chunk size as needed
86
- chunk_overlap=10 # Adjust the overlap as needed
87
- )
88
- chunks = text_splitter.split_text(text)
89
 
90
- # Create embeddings for each chunk
91
- #embeddings = embedding_function(chunks)
92
- embeddings = embedding_function(text)
93
- # Store each chunk in ChromaDB
94
- for i, chunk in enumerate(text):
95
- collection.add(
96
- documents=[chunk],
97
- metadatas=[{"filename": file.name, "chunk_id": i}],
98
- ids=[f"{file.name}_{i}"] # Use a unique ID for each chunk
99
- )
100
  return f"PDF wurde erfolgreich in ChromaDB gespeichert."
101
 
102
  def search_similar_documents(prompt):
@@ -114,55 +104,6 @@ def search_similar_documents(prompt):
114
  for i, doc in enumerate(results["documents"][0]):
115
  metadata = results["metadatas"][0][i]
116
  filename = metadata["filename"]
117
- formatted_results.append(f"{doc}\n")
118
-
119
- ergebnis = f"{''.join(formatted_results)}"
120
- ergebnis = gr.Markdown(ergebnis)
121
- return ergebnis
122
-
123
- with gr.Blocks() as chat:
124
- gr.Markdown("### Ask the RKI Files", elem_classes="tab-header")
125
- with gr.Row():
126
- llm_output = gr.Textbox(label="LLM Answer")
127
- with gr.Row():
128
- llm_prompt_input = gr.Textbox(label="Frage an das LLM", placeholder="Gib eine Frage ein")
129
- llm_submit_button = gr.Button("send")
130
- llm_submit_button.click(ask_llm, inputs=llm_prompt_input, outputs=llm_output)
131
-
132
- with gr.Blocks() as upload:
133
- gr.Markdown("### File upload", elem_classes="tab-header")
134
- with gr.Row():
135
- file_input = gr.File(label="Wähle eine PDF-Datei aus", type="filepath")
136
- upload_output = gr.Textbox(label="Upload Status")
137
- with gr.Row():
138
- submit_button = gr.Button("upload")
139
- submit_button.click(process_pdf, inputs=file_input, outputs=upload_output)
140
-
141
- with gr.Blocks() as suche:
142
- gr.Markdown("### Datenbank durchsuchen", elem_classes="tab-header")
143
- with gr.Row():
144
- prompt_input = gr.Textbox(label="Suche nach ähnlichen Dokumenten", placeholder="Gib einen Suchbegriff ein")
145
- with gr.Row():
146
- search_output = gr.Textbox(label="Ähnliche Dokumente")
147
- with gr.Row():
148
- search_button = gr.Button("Suchen")
149
- search_button.click(search_similar_documents, inputs=prompt_input, outputs=search_output)
150
-
151
- #optional, Spracheingabe
152
- with gr.Blocks() as speech:
153
- gr.Markdown("### Highspeed Voicebot", elem_classes="tab-header")
154
- with gr.Row():
155
- sr_outputs = gr.Textbox(label="Antwort")
156
- with gr.Row():
157
- sr_inputs = gr.Microphone(type="filepath")
158
- sr_inputs.change(transcribe_audio, inputs=sr_inputs, outputs=sr_outputs)
159
-
160
- # Erstelle die Gradio-Schnittstelle
161
- with gr.Blocks() as demo:
162
- gr.TabbedInterface(
163
- [chat, upload, suche],
164
- ["Chat", "Upload", "Suche"]
165
- )
166
-
167
- # Starte die Gradio-Anwendung
168
- demo.launch()
 
3
  from chromadb.utils import embedding_functions
4
  from PyPDF2 import PdfReader
5
  from gradio_client import Client
 
 
6
  import speech_recognition as sr
7
  import groq
8
  import os
 
78
  for page in pdf_reader.pages:
79
  text += page.extract_text()
80
 
81
+ # Create embedding for the entire text
82
+ embedding = embedding_function([text])[0]
 
 
 
 
83
 
84
+ # Store the entire text in ChromaDB
85
+ collection.add(
86
+ documents=[text],
87
+ metadatas=[{"filename": file.name}],
88
+ ids=[file.name] # Use the filename as the ID
89
+ )
 
 
 
 
90
  return f"PDF wurde erfolgreich in ChromaDB gespeichert."
91
 
92
  def search_similar_documents(prompt):
 
104
  for i, doc in enumerate(results["documents"][0]):
105
  metadata = results["metadatas"][0][i]
106
  filename = metadata["filename"]
107
+ formatted_results.append(f"### Dokument {i+1} (Dateiname: {filename})\n{doc}\n")
108
+
109
+ return gr.Markdown(''.join(formatted_results))