Spaces:
Sleeping
Sleeping
File size: 2,969 Bytes
1b999bf 7f96312 d8d8be1 93850b9 d859c3e 93850b9 1b999bf 7bf65ec 93850b9 318ff7b d8d8be1 93850b9 7b74120 93850b9 7b74120 93850b9 7b74120 93850b9 7b74120 adbd41e d8d8be1 93850b9 7b74120 93850b9 7b74120 93850b9 7b74120 d8d8be1 93850b9 d93fe74 318ff7b ee9ba92 fbe3ac4 318ff7b 7b74120 93850b9 fbe3ac4 ee9ba92 318ff7b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import gradio as gr
from PyPDF2 import PdfReader
import re
# Load the QA model and tokenizer.
# deepset/roberta-base-squad2 is an extractive QA model fine-tuned on SQuAD 2.0;
# it selects an answer span from the provided context rather than generating text.
model_name = "deepset/roberta-base-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Shared pipeline used by chatbot_response() below.
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
# Funktion zum Extrahieren und Bereinigen von Text aus PDF
def extract_text_from_pdf(pdf_path):
    """Extract the raw text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A single string with all extracted page text (may be empty).
    """
    reader = PdfReader(pdf_path)
    # extract_text() returns None for pages without a text layer (e.g. scanned
    # images); `or ""` prevents a TypeError. join() avoids quadratic `+=` concat.
    return "".join(page.extract_text() or "" for page in reader.pages)
def clean_text(text):
    """Normalize extracted PDF text.

    Collapses every whitespace run to a single space, removes all characters
    except word characters, whitespace, '.', ',' and '-', and trims the ends.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    filtered = re.sub(r'[^\w\s.,-]', '', collapsed)
    return filtered.strip()
def split_text_into_paragraphs(text, max_length=500):
    """Split text on newlines and greedily merge pieces up to max_length chars.

    Args:
        text: Input text; paragraphs are delimited by '\\n'.
        max_length: Soft upper bound for each merged chunk (a single paragraph
            longer than this is still emitted whole).

    Returns:
        List of non-empty, stripped text chunks.
    """
    refined_paragraphs = []
    temp = ""
    for para in text.split("\n"):
        if len(temp) + len(para) <= max_length:
            temp += " " + para
        else:
            # Fix: the original appended temp.strip() unconditionally, emitting
            # an empty string when the very first paragraph exceeded max_length.
            chunk = temp.strip()
            if chunk:
                refined_paragraphs.append(chunk)
            temp = para
    chunk = temp.strip()
    if chunk:
        refined_paragraphs.append(chunk)
    return refined_paragraphs
def find_relevant_parts(question, context_parts):
    """Return the paragraphs that contain at least one question word
    (case-insensitive substring match); if none match, fall back to
    returning all paragraphs unchanged."""
    # Lowercase the query terms once instead of per paragraph.
    terms = [word.lower() for word in question.split()]
    hits = []
    for part in context_parts:
        haystack = part.lower()
        if any(term in haystack for term in terms):
            hits.append(part)
    return hits or context_parts
def validate_and_refine_answer(answer):
    """Post-process a raw pipeline answer.

    Rejects empty or very short answers (< 5 words) with a fixed German
    fallback message, strips a small blacklist of artefact substrings
    (case-sensitive), and returns the result capitalized and trimmed.
    """
    # Guard: nothing usable extracted.
    if not answer or len(answer.split()) < 5:
        return "Die Antwort konnte nicht eindeutig aus dem Dokument ermittelt werden."
    # NOTE: plain substring removal — also hits occurrences inside longer words.
    for artefact in ("bluetooth", "hand", "ke", "eingelegt"):
        answer = answer.replace(artefact, "")
    return answer.capitalize().strip()
def chatbot_response(pdf_path, question):
    """Answer a free-text question from the content of an uploaded PDF.

    Extracts and cleans the PDF text, keeps only keyword-relevant chunks,
    runs the QA pipeline on each chunk (skipping any that fail), and returns
    the joined, post-processed answer string.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    chunks = split_text_into_paragraphs(clean_text(raw_text))
    candidates = find_relevant_parts(question, chunks)
    answers = []
    for chunk in candidates:
        try:
            result = qa_pipeline(question=question, context=chunk)
        except Exception:
            # Best-effort: a chunk that crashes the pipeline is simply skipped.
            continue
        answers.append(result['answer'])
    return validate_and_refine_answer(" ".join(answers).strip())
# Gradio interface: wires chatbot_response() to a simple web UI.
# type="filepath" makes Gradio pass the uploaded file's path (a str),
# which is what extract_text_from_pdf() expects.
pdf_input = gr.File(label="PDF-Datei hochladen", type="filepath")
question_input = gr.Textbox(label="Frage eingeben", placeholder="Stelle eine Frage zu dem PDF-Dokument")
response_output = gr.Textbox(label="Antwort")
interface = gr.Interface(
    fn=chatbot_response,
    inputs=[pdf_input, question_input],
    outputs=response_output,
    title="Verbesserte PDF-Fragebeantwortung",
    description="Lade eine PDF-Datei hoch und stelle Fragen zu ihrem Inhalt. Antworten basieren nur auf den PDF-Inhalten."
)
# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    interface.launch()
|