from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import gradio as gr
from PyPDF2 import PdfReader
import re

# Load the model, tokenizer, and question-answering pipeline
model_name = "deepset/roberta-base-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
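# Note: roberta-base-squad2 is an extractive QA model fine-tuned on the English
# SQuAD 2.0 dataset, so it works best with English documents and questions; for
# German PDFs, a multilingual checkpoint (e.g. deepset/xlm-roberta-base-squad2)
# would presumably be a better fit.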

# Extract the raw text from all pages of a PDF
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        # extract_text() returns None for pages without a text layer
        text += page.extract_text() or ""
    return text
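# Note: the PyPDF2 project has since been merged back into pypdf; the same
# reader is available there as "from pypdf import PdfReader".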

def clean_text(text):
    # Collapse all whitespace (including newlines) into single spaces
    text = re.sub(r'\s+', ' ', text)
    # Keep only word characters, whitespace, and basic punctuation
    text = re.sub(r'[^\w\s.,-]', '', text)
    return text.strip()
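# Illustrative example of the two substitutions above (hypothetical input):
#   >>> clean_text("Hello\n\n  world! ©")
#   'Hello world'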

def split_text_into_paragraphs(text, max_length=500):
    # clean_text() collapses all newlines, so splitting on "\n" would yield a
    # single huge chunk; split on sentence boundaries instead and regroup the
    # sentences into chunks of roughly max_length characters.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    refined_paragraphs = []
    temp = ""
    for sentence in sentences:
        if len(temp) + len(sentence) <= max_length:
            temp += " " + sentence
        else:
            refined_paragraphs.append(temp.strip())
            temp = sentence
    if temp:
        refined_paragraphs.append(temp.strip())
    return refined_paragraphs
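# Illustrative example with a deliberately tiny max_length (hypothetical input):
#   >>> split_text_into_paragraphs("A. B. C.", max_length=4)
#   ['A.', 'B. C.']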

def find_relevant_parts(question, context_parts):
    # Naive keyword filter: keep every chunk that shares at least one word
    # (case-insensitive substring match) with the question.
    keywords = question.split()
    relevant_parts = [
        part for part in context_parts if any(keyword.lower() in part.lower() for keyword in keywords)
    ]
    # Fall back to all chunks if nothing matches
    return relevant_parts if relevant_parts else context_parts
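# Illustrative example (hypothetical chunks); note that common stop words in
# the question can match almost any chunk:
#   >>> find_relevant_parts("battery runtime?", ["The battery lasts 10 hours.", "Safety notes."])
#   ['The battery lasts 10 hours.']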

def validate_and_refine_answer(answer):
    if not answer or len(answer.split()) < 5:
        return "The answer could not be determined unambiguously from the document."
    # Strip known noise tokens as whole words only; a plain str.replace()
    # would also mangle longer words (e.g. remove "ke" from "keyboard").
    invalid_phrases = ["bluetooth", "hand", "ke", "eingelegt"]
    for phrase in invalid_phrases:
        answer = re.sub(r'\b' + re.escape(phrase) + r'\b', '', answer, flags=re.IGNORECASE)
    answer = re.sub(r'\s+', ' ', answer).strip()
    # Capitalize only the first letter; str.capitalize() would lowercase the rest
    return answer[:1].upper() + answer[1:]
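# Illustrative example: answers shorter than five words are rejected:
#   >>> validate_and_refine_answer("yes")
#   'The answer could not be determined unambiguously from the document.'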

def chatbot_response(pdf_path, question):
    if not pdf_path or not question:
        return "Please upload a PDF file and enter a question."
    context = extract_text_from_pdf(pdf_path)
    context = clean_text(context)
    context_parts = split_text_into_paragraphs(context)
    relevant_parts = find_relevant_parts(question, context_parts)

    # Run the QA pipeline on each relevant chunk and collect the answers;
    # chunks the model cannot process are skipped.
    answers = []
    for part in relevant_parts:
        try:
            result = qa_pipeline(question=question, context=part)
            answers.append(result['answer'])
        except Exception:
            continue

    final_answer = validate_and_refine_answer(" ".join(answers).strip())
    return final_answer
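# Illustrative call (assuming a local file "manual.pdf" exists):
#   answer = chatbot_response("manual.pdf", "How long does the battery last?")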

# Gradio interface
pdf_input = gr.File(label="Upload PDF file", type="filepath")
question_input = gr.Textbox(label="Enter a question", placeholder="Ask a question about the PDF document")
response_output = gr.Textbox(label="Answer")

interface = gr.Interface(
    fn=chatbot_response,
    inputs=[pdf_input, question_input],
    outputs=response_output,
    title="Verbesserte PDF-Fragebeantwortung",
    description="Lade eine PDF-Datei hoch und stelle Fragen zu ihrem Inhalt. Antworten basieren nur auf den PDF-Inhalten."
)

if __name__ == "__main__":
    interface.launch()
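    # Gradio's launch(share=True) would additionally create a temporary public URL.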