# NOTE: the original paste carried Hugging Face Spaces page residue here
# ("Spaces: / Sleeping / Sleeping") — kept only as this comment so the file parses.
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import gradio as gr
from PyPDF2 import PdfReader
import re

# Load model and tokenizer once at import time: an extractive QA model
# (RoBERTa fine-tuned on SQuAD 2.0), wrapped in a question-answering pipeline.
model_name = "deepset/roberta-base-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
# Helpers: extract and clean text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in the PDF at *pdf_path*.

    Fixes two defects of the original:
    - ``page.extract_text()`` may return ``None`` for pages without a text
      layer (scanned images); concatenating that raised ``TypeError``.
    - building the result with ``+=`` in a loop is quadratic; use ``join``.
    """
    reader = PdfReader(pdf_path)
    pages = []
    for page in reader.pages:
        # Coerce None (no extractable text on this page) to "".
        pages.append(page.extract_text() or "")
    return "".join(pages)
def clean_text(text):
    """Normalize raw PDF text: collapse whitespace runs to single spaces,
    drop every character that is not a word character, whitespace, or one
    of ``. , -``, and trim surrounding whitespace."""
    collapsed = re.sub(r'\s+', ' ', text)
    filtered = re.sub(r'[^\w\s.,-]', '', collapsed)
    return filtered.strip()
def split_text_into_paragraphs(text, max_length=500):
    """Split *text* on newlines and re-pack the pieces into chunks of at
    most roughly ``max_length`` characters.

    A single piece longer than ``max_length`` is kept whole (never cut).

    Fix over the original: when the running buffer was empty (or whitespace)
    and the next piece overflowed ``max_length``, the original appended an
    empty string ``""`` to the result — e.g. a first paragraph longer than
    ``max_length`` produced a leading ``""`` entry, and empty input returned
    ``[""]``. Empty chunks are now dropped.
    """
    chunks = []
    current = ""
    for piece in text.split("\n"):
        if len(current) + len(piece) <= max_length:
            current += " " + piece
        else:
            # Flush the buffer, but never emit an empty/whitespace-only chunk.
            flushed = current.strip()
            if flushed:
                chunks.append(flushed)
            current = piece
    tail = current.strip()
    if tail:
        chunks.append(tail)
    return chunks
def find_relevant_parts(question, context_parts):
    """Keep only the parts that contain at least one question word
    (case-insensitive substring match).  If nothing matches — including the
    degenerate case of an empty question — fall back to all parts."""
    terms = [word.lower() for word in question.split()]
    matches = []
    for part in context_parts:
        lowered = part.lower()
        if any(term in lowered for term in terms):
            matches.append(part)
    return matches or context_parts
def validate_and_refine_answer(answer):
    """Reject empty or very short (< 5 words) answers with a fixed German
    fallback message; otherwise scrub a hard-coded list of noise substrings
    and return the result with only the first character capitalized.

    NOTE(review): the scrub is a plain substring replace ("ke" would also
    hit inside longer words) — kept as-is to preserve behavior.
    """
    fallback = "Die Antwort konnte nicht eindeutig aus dem Dokument ermittelt werden."
    if not answer:
        return fallback
    if len(answer.split()) < 5:
        return fallback
    for blocked in ("bluetooth", "hand", "ke", "eingelegt"):
        answer = answer.replace(blocked, "")
    return answer.capitalize().strip()
def chatbot_response(pdf_path, question):
    """Answer *question* from the PDF at *pdf_path*.

    Pipeline: extract text -> clean -> chunk -> keep keyword-relevant chunks
    -> run the QA model per chunk -> join and refine the collected answers.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    paragraphs = split_text_into_paragraphs(clean_text(raw_text))
    collected = []
    for paragraph in find_relevant_parts(question, paragraphs):
        try:
            prediction = qa_pipeline(question=question, context=paragraph)
        except Exception:
            # Best-effort: skip chunks the model cannot handle.
            continue
        collected.append(prediction['answer'])
    return validate_and_refine_answer(" ".join(collected).strip())
# Gradio UI wiring: PDF upload + free-text question in, answer text out.
# (User-facing labels are intentionally in German.)
pdf_input = gr.File(label="PDF-Datei hochladen", type="filepath")
question_input = gr.Textbox(label="Frage eingeben", placeholder="Stelle eine Frage zu dem PDF-Dokument")
response_output = gr.Textbox(label="Antwort")
interface = gr.Interface(
    fn=chatbot_response,
    inputs=[pdf_input, question_input],
    outputs=response_output,
    title="Verbesserte PDF-Fragebeantwortung",
    description="Lade eine PDF-Datei hoch und stelle Fragen zu ihrem Inhalt. Antworten basieren nur auf den PDF-Inhalten."
)

if __name__ == "__main__":
    interface.launch()