# RAG-Vereine / app.py
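# A small RAG demo: uploaded PDFs are stored in a ChromaDB collection, questions
# are answered by a Groq-hosted chat model using the retrieved documents as
# context, and an optional voice tab transcribes German speech input.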
import gradio as gr
import chromadb
from chromadb.utils import embedding_functions
from PyPDF2 import PdfReader
import speech_recognition as sr
import groq
import os
# Get your API key at groq.com. It's free!
api_key = os.getenv('groq')
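# The key is read from an environment variable named "groq"
# (e.g. set as a secret in a Hugging Face Space).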
# Initialize ChromaDB
client_chroma = chromadb.Client()
collection_name = "pdf_collection"
collection = client_chroma.get_or_create_collection(name=collection_name)
# Use ChromaDB's built-in default embeddings
embedding_function = embedding_functions.DefaultEmbeddingFunction()
client = groq.Client(api_key=api_key)
# Use Mixtral 8x7B served by Groq for answering
def update(message):
    try:
        completion = client.chat.completions.create(
            model="Mixtral-8x7b-32768",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{message}. antworte immer auf deutsch"}
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"
# Function to transcribe audio data to text
def transcribe_audio(audio):
    if audio is None:  # the change event also fires when the recording is cleared
        return ""
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data, language="de-DE")
        return update(text)
    except sr.UnknownValueError:
        return "Speech recognition could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results from Google Speech Recognition service; {e}"
def ask_llm(llm_prompt_input):
    # Create an embedding for the prompt
    query_embedding = embedding_function([llm_prompt_input])[0]
    # Run the similarity search
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=3
    )
    # Format the results
    formatted_results = []
    for i, doc in enumerate(results["documents"][0]):
        metadata = results["metadatas"][0][i]
        filename = metadata["filename"]
        formatted_results.append(f"### Dokument {i+1} (Dateiname: {filename})\n{doc}\n")
    # Append the formatted results to the prompt
    enriched_prompt = f"{llm_prompt_input}\n\n### Verwandte Informationen:\n{''.join(formatted_results)}"
    return update(enriched_prompt)
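# PDF ingestion: extract the full text of the uploaded PDF and store it in
# ChromaDB as a single document (no chunking), keyed by its file path.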
def process_pdf(file):
    # gr.File(type="filepath") passes the uploaded file's path as a string;
    # some Gradio versions pass an object with a .name attribute, so handle both.
    file_path = file.name if hasattr(file, "name") else file
    # Read the PDF content
    pdf_reader = PdfReader(file_path)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text() or ""
    embeddings = embedding_function([text])
    # Store the entire text in ChromaDB
    collection.add(
        documents=[text],
        embeddings=embeddings,
        metadatas=[{"filename": os.path.basename(file_path)}],
        ids=[file_path]  # Use the file path as the unique ID
    )
    return "PDF wurde erfolgreich in ChromaDB gespeichert."
def search_similar_documents(prompt):
    # Create an embedding for the prompt
    query_embedding = embedding_function([prompt])[0]
    # Run the similarity search
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=1
    )
    # Format the results (only the document texts are returned)
    formatted_results = []
    for doc in results["documents"][0]:
        formatted_results.append(f"{doc}\n")
    return "".join(formatted_results)
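# Gradio UI: one Blocks layout per tab (chat, upload, search), plus an optional
# voice tab that is defined below but not added to the TabbedInterface.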
with gr.Blocks() as chat:
    gr.Markdown("### Ask the RKI Files", elem_classes="tab-header")
    with gr.Row():
        llm_output = gr.Textbox(label="LLM Answer")
    with gr.Row():
        llm_prompt_input = gr.Textbox(label="Frage an das LLM", placeholder="Gib eine Frage ein")
        llm_submit_button = gr.Button("send")
    llm_submit_button.click(ask_llm, inputs=llm_prompt_input, outputs=llm_output)
with gr.Blocks() as upload:
    gr.Markdown("### File upload", elem_classes="tab-header")
    with gr.Row():
        file_input = gr.File(label="Wähle eine PDF-Datei aus", type="filepath")
        upload_output = gr.Textbox(label="Upload Status")
    with gr.Row():
        submit_button = gr.Button("upload")
    submit_button.click(process_pdf, inputs=file_input, outputs=upload_output)
with gr.Blocks() as suche:
    gr.Markdown("### Datenbank durchsuchen", elem_classes="tab-header")
    with gr.Row():
        prompt_input = gr.Textbox(label="Suche nach ähnlichen Dokumenten", placeholder="Gib einen Suchbegriff ein")
    with gr.Row():
        search_output = gr.Textbox(label="Ähnliche Dokumente")
    with gr.Row():
        search_button = gr.Button("Suchen")
    search_button.click(search_similar_documents, inputs=prompt_input, outputs=search_output)
# Optional: voice input
with gr.Blocks() as speech:
    gr.Markdown("### Highspeed Voicebot", elem_classes="tab-header")
    with gr.Row():
        sr_outputs = gr.Textbox(label="Antwort")
    with gr.Row():
        sr_inputs = gr.Microphone(type="filepath")
    sr_inputs.change(transcribe_audio, inputs=sr_inputs, outputs=sr_outputs)
# Build the Gradio interface
with gr.Blocks() as demo:
    gr.TabbedInterface(
        [chat, upload, suche],
        ["Chat", "Upload", "Suche"]
    )
# Launch the Gradio application
demo.launch()
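# Dependencies implied by the imports above (versions are not pinned in this file):
# gradio, chromadb, PyPDF2, SpeechRecognition, groq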