RAG_test_1 / app.py
la04's picture
Update app.py
93850b9 verified
raw
history blame
2.97 kB
import logging
import re

import gradio as gr
from PyPDF2 import PdfReader
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
# Load the model and tokenizer once at import time and wrap them in a
# question-answering pipeline that the request handler reuses.
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
)
# Helpers for extracting and cleaning text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Pages are joined with a newline so that the last word of one page does
    not fuse with the first word of the next one.

    Args:
        pdf_path: Path (or file-like object) accepted by ``PdfReader``.

    Returns:
        The extracted text as a single string; empty if nothing could be
        extracted.
    """
    reader = PdfReader(pdf_path)
    # `or ""` is defensive: a page without extractable text must not break
    # the join if the extractor yields None for it.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
def clean_text(text):
    """Normalize whitespace and drop unwanted punctuation from *text*.

    Every run of whitespace (including newlines) becomes a single space,
    then every character that is not a word character, whitespace, ``.``,
    ``,`` or ``-`` is removed. Leading and trailing whitespace is stripped
    from the result.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    filtered = re.sub(r'[^\w\s.,-]', '', single_spaced)
    return filtered.strip()
def _split_long_paragraph(paragraph, max_length):
    """Break one paragraph into word-aligned pieces of at most *max_length* chars."""
    pieces = []
    current = ""
    for word in paragraph.split():
        # A single token longer than the limit cannot be kept whole, so it
        # is hard-sliced into limit-sized fragments.
        while len(word) > max_length:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_length])
            word = word[max_length:]
        if not word:
            continue
        candidate = current + " " + word if current else word
        if len(candidate) <= max_length:
            current = candidate
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def split_text_into_paragraphs(text, max_length=500):
    """Group the lines of *text* into chunks of at most *max_length* characters.

    The text is split on newlines; consecutive non-empty lines are merged
    (joined by a single space) for as long as the merged chunk stays within
    the limit. A line that alone exceeds the limit is broken on word
    boundaries, so the limit holds even for text without any newlines.

    Args:
        text: The text to chunk.
        max_length: Maximum length of a chunk in characters (must be >= 1).

    Returns:
        A list of non-empty chunks; an empty list if *text* contains no
        non-whitespace content.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    current = ""
    for paragraph in text.split("\n"):
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        if len(paragraph) > max_length:
            pieces = _split_long_paragraph(paragraph, max_length)
        else:
            pieces = [paragraph]
        for piece in pieces:
            # Measure the chunk including the joining space so the limit
            # is never exceeded.
            candidate = current + " " + piece if current else piece
            if len(candidate) <= max_length:
                current = candidate
            else:
                # `current` is non-empty here because every piece fits the
                # limit on its own, so no empty chunk is ever emitted.
                chunks.append(current)
                current = piece
    if current:
        chunks.append(current)
    return chunks
def find_relevant_parts(question, context_parts):
    """Select the context parts that mention at least one word of the question.

    The question is lower-cased and split on whitespace. Punctuation
    surrounding each word is stripped first, so that ``"Bluetooth?"`` still
    matches ``"bluetooth"`` in the context. Matching is a case-insensitive
    substring test.

    Args:
        question: The user's question.
        context_parts: List of text chunks to filter.

    Returns:
        The parts containing at least one keyword, in their original order.
        If no part matches (or the question has no usable keyword),
        *context_parts* is returned unchanged.
    """
    keywords = []
    for token in question.lower().split():
        keyword = re.sub(r'^\W+|\W+$', '', token)
        # A token made only of punctuation becomes empty and must be
        # dropped: the empty string is a substring of every part.
        if keyword:
            keywords.append(keyword)

    relevant_parts = []
    for part in context_parts:
        lowered = part.lower()  # lower-case each part once, not per keyword
        if any(keyword in lowered for keyword in keywords):
            relevant_parts.append(part)
    return relevant_parts or context_parts
def validate_and_refine_answer(answer):
    """Reject answers that are too short and tidy up the remaining ones.

    An answer with fewer than five whitespace-separated words is replaced
    by a fixed fallback message. Otherwise the blacklisted fragments are
    removed, whitespace is normalized and the first character is
    upper-cased.

    Args:
        answer: The raw answer text (may be empty or ``None``).

    Returns:
        The refined answer, or the fallback message if the answer is
        missing, too short, or empty after refinement.
    """
    fallback = "Die Antwort konnte nicht eindeutig aus dem Dokument ermittelt werden."
    if not answer or len(answer.split()) < 5:
        return fallback

    invalid_phrases = ["bluetooth", "hand", "ke", "eingelegt"]
    # Match whole words only: a plain substring replace would also cut the
    # fragments out of unrelated words ("Marke" -> "Mar").
    pattern = r'\b(?:' + '|'.join(re.escape(p) for p in invalid_phrases) + r')\b'
    without_phrases = re.sub(pattern, "", answer)
    # Collapse the gaps left by removed words and trim the ends before
    # capitalizing, so a leading space cannot defeat the capitalization.
    cleaned = " ".join(without_phrases.split())
    if not cleaned:
        return fallback
    # Upper-case only the first character; str.capitalize() would also
    # lower-case the rest (nouns, acronyms).
    return cleaned[0].upper() + cleaned[1:]
def chatbot_response(pdf_path, question):
    """Answer *question* from the content of the PDF at *pdf_path*.

    The PDF text is split into chunks, each chunk is cleaned, the chunks
    sharing a word with the question are passed to the question-answering
    pipeline, and the collected answers are joined and validated.

    Args:
        pdf_path: Path of the uploaded PDF; falsy if nothing was uploaded.
        question: The user's question.

    Returns:
        The refined answer, or a user-facing message if the input is
        incomplete or no usable answer was found.
    """
    if not pdf_path:
        return "Bitte lade zuerst eine PDF-Datei hoch."
    if not question or not question.strip():
        return "Bitte gib eine Frage ein."

    raw_text = extract_text_from_pdf(pdf_path)
    # Chunk BEFORE cleaning: clean_text collapses every newline into a
    # space, and split_text_into_paragraphs splits on newlines. Cleaning
    # first would leave nothing to split on.
    context_parts = [
        cleaned
        for cleaned in map(clean_text, split_text_into_paragraphs(raw_text))
        if cleaned
    ]
    relevant_parts = find_relevant_parts(question, context_parts)

    logger = logging.getLogger(__name__)
    answers = []
    for part in relevant_parts:
        try:
            result = qa_pipeline(question=question, context=part)
        except Exception:
            # Best effort: one failing chunk must not abort the request,
            # but the failure is logged instead of vanishing silently.
            logger.warning("Question answering failed for a chunk", exc_info=True)
            continue
        answers.append(result['answer'])

    return validate_and_refine_answer(" ".join(answers).strip())
# Gradio interface: a PDF upload and a question go in, the answer text comes out.
pdf_input = gr.File(
    label="PDF-Datei hochladen",
    type="filepath",
)
question_input = gr.Textbox(
    label="Frage eingeben",
    placeholder="Stelle eine Frage zu dem PDF-Dokument",
)
response_output = gr.Textbox(label="Antwort")

interface = gr.Interface(
    fn=chatbot_response,
    inputs=[pdf_input, question_input],
    outputs=response_output,
    title="Verbesserte PDF-Fragebeantwortung",
    description="Lade eine PDF-Datei hoch und stelle Fragen zu ihrem Inhalt. Antworten basieren nur auf den PDF-Inhalten.",
)

# Start the web UI only when the file is run as a script.
if __name__ == "__main__":
    interface.launch()