from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import gradio as gr
from PyPDF2 import PdfReader
import re

# Load model and tokenizer
model_name = "deepset/roberta-base-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


# Extract the raw text from every page of the PDF
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages without a text layer
        text += page.extract_text() or ""
    return text


# Collapse whitespace and drop characters that tend to confuse the QA model
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,-]', '', text)
    return text.strip()


# Split the cleaned text into chunks of at most max_length characters.
# Note: clean_text() collapses all newlines, so splitting on "\n" would
# yield a single oversized chunk; split on sentence boundaries instead
# and pack sentences into chunks.
def split_text_into_paragraphs(text, max_length=500):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    temp = ""
    for sentence in sentences:
        if len(temp) + len(sentence) + 1 <= max_length:
            temp += " " + sentence
        else:
            if temp:
                chunks.append(temp.strip())
            temp = sentence
    if temp:
        chunks.append(temp.strip())
    return chunks


# Keep only the chunks that contain at least one keyword from the question;
# fall back to all chunks if nothing matches.
def find_relevant_parts(question, context_parts):
    keywords = question.split()
    relevant_parts = [
        part for part in context_parts
        if any(keyword.lower() in part.lower() for keyword in keywords)
    ]
    return relevant_parts if relevant_parts else context_parts


# Reject very short answers and strip known noise tokens
def validate_and_refine_answer(answer):
    if not answer or len(answer.split()) < 5:
        return "The answer could not be determined unambiguously from the document."
    invalid_phrases = ["bluetooth", "hand", "ke", "eingelegt"]
    for phrase in invalid_phrases:
        answer = answer.replace(phrase, "")
    return answer.strip().capitalize()


# Main handler: extract, clean, and chunk the PDF text, run the QA pipeline
# over the relevant chunks, and merge the individual answers.
def chatbot_response(pdf_path, question):
    context = extract_text_from_pdf(pdf_path)
    context = clean_text(context)
    context_parts = split_text_into_paragraphs(context)
    relevant_parts = find_relevant_parts(question, context_parts)

    answers = []
    for part in relevant_parts:
        try:
            result = qa_pipeline(question=question, context=part)
            answers.append(result['answer'])
        except Exception:
            continue

    final_answer = validate_and_refine_answer(" ".join(answers).strip())
    return final_answer


# Gradio interface
pdf_input = gr.File(label="Upload PDF file", type="filepath")
question_input = gr.Textbox(label="Enter a question", placeholder="Ask a question about the PDF document")
response_output = gr.Textbox(label="Answer")

interface = gr.Interface(
    fn=chatbot_response,
    inputs=[pdf_input, question_input],
    outputs=response_output,
    title="Improved PDF Question Answering",
    description="Upload a PDF file and ask questions about its content. Answers are based only on the PDF content."
)

if __name__ == "__main__":
    interface.launch()