Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -18,8 +18,8 @@ def extract_text_from_pdf(pdf_path):
|
|
18 |
return text
|
19 |
|
20 |
def clean_text(text):
|
21 |
-
text = re.sub(r'\s+', ' ', text)
|
22 |
-
text = re.sub(r'[^\w\s.,-]', '', text)
|
23 |
return text.strip()
|
24 |
|
25 |
def split_text_into_paragraphs(text, max_length=500):
|
@@ -36,27 +36,46 @@ def split_text_into_paragraphs(text, max_length=500):
|
|
36 |
refined_paragraphs.append(temp.strip())
|
37 |
return refined_paragraphs
|
38 |
|
|
|
39 |
def find_relevant_parts(question, context_parts):
|
40 |
-
keywords = question.split()
|
41 |
relevant_parts = [
|
42 |
-
part for part in context_parts if any(keyword
|
43 |
]
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
return "Die Antwort konnte nicht eindeutig aus dem Dokument ermittelt werden."
|
49 |
-
invalid_phrases = ["bluetooth", "hand", "ke", "eingelegt"]
|
50 |
-
for phrase in invalid_phrases:
|
51 |
-
answer = answer.replace(phrase, "")
|
52 |
return answer.capitalize().strip()
|
53 |
|
|
|
54 |
def chatbot_response(pdf_path, question):
|
55 |
-
|
56 |
-
context = clean_text(
|
57 |
context_parts = split_text_into_paragraphs(context)
|
|
|
|
|
58 |
relevant_parts = find_relevant_parts(question, context_parts)
|
59 |
|
|
|
60 |
answers = []
|
61 |
for part in relevant_parts:
|
62 |
try:
|
@@ -64,11 +83,13 @@ def chatbot_response(pdf_path, question):
|
|
64 |
answers.append(result['answer'])
|
65 |
except Exception:
|
66 |
continue
|
67 |
-
|
68 |
-
|
|
|
|
|
69 |
return final_answer
|
70 |
|
71 |
-
# Gradio-Interface
|
72 |
pdf_input = gr.File(label="PDF-Datei hochladen", type="filepath")
|
73 |
question_input = gr.Textbox(label="Frage eingeben", placeholder="Stelle eine Frage zu dem PDF-Dokument")
|
74 |
response_output = gr.Textbox(label="Antwort")
|
|
|
18 |
return text
|
19 |
|
20 |
def clean_text(text):
    """Normalise text by dropping special characters and collapsing whitespace.

    Every character that is not a word character, whitespace, or one of
    ``.``, ``,`` and ``-`` is removed. Runs of whitespace (including line
    breaks) are then reduced to a single space.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    # Remove special characters first: doing it after the whitespace pass
    # would re-create double spaces (e.g. "a & b" -> "a  b").
    text = re.sub(r'[^\w\s.,-]', '', text)
    # Collapse multiple spaces and line breaks into one space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
|
24 |
|
25 |
def split_text_into_paragraphs(text, max_length=500):
|
|
|
36 |
refined_paragraphs.append(temp.strip())
|
37 |
return refined_paragraphs
|
38 |
|
39 |
+
# Funktion zur Relevanzbewertung von Abschnitten
|
40 |
def find_relevant_parts(question, context_parts):
    """Select the context sections that mention at least one question keyword.

    Keywords are the lower-cased, whitespace-separated words of ``question``
    with leading and trailing punctuation removed. A section is relevant when
    any keyword occurs in it as a case-insensitive substring.

    Args:
        question: The user's question as a string.
        context_parts: List of text sections to search.

    Returns:
        The matching sections in their original order. If no section matches,
        a list holding only the first section (or an empty list when
        ``context_parts`` is empty).
    """
    import re  # local import so the function does not rely on file-level imports

    # Strip punctuation such as a trailing "?" so that "Akku?" still matches
    # "akku" in the text; drop tokens that consist of punctuation only.
    stripped_tokens = (
        re.sub(r'^\W+|\W+$', '', token) for token in question.lower().split()
    )
    keywords = [token for token in stripped_tokens if token]

    relevant_parts = []
    for part in context_parts:
        lowered = part.lower()  # lower-case once per section, not once per keyword
        if any(keyword in lowered for keyword in keywords):
            relevant_parts.append(part)

    if relevant_parts:
        return relevant_parts

    # Fallback: no section contains any keyword, so every occurrence count
    # would be zero and a ranking cannot tell sections apart. Return the
    # first section, which is what the count-based ranking always yielded.
    return list(context_parts[:1])
|
54 |
|
55 |
+
# Funktion für Antwort-Postprocessing
|
56 |
+
def refine_answer(answer, question, context):
    """Post-process a generated answer, falling back to context sentences.

    An answer with at least three words is returned with its first character
    upper-cased. A missing or shorter answer is replaced by the sentences of
    ``context`` that contain at least one keyword of ``question``; if there
    are none, a fixed "could not be determined" message is returned.

    Args:
        answer: The candidate answer text (may be empty or ``None``).
        question: The user's question; its words serve as keywords.
        context: The text to extract fallback sentences from.

    Returns:
        The refined answer string.
    """
    import re  # local import so the function does not rely on file-level imports

    answer = (answer or "").strip()
    if len(answer.split()) >= 3:
        # Upper-case only the first character: str.capitalize() would also
        # lowercase the rest and mangle nouns and acronyms.
        return answer[0].upper() + answer[1:]

    # Try to extract the answer directly from the context. Punctuation is
    # stripped from the keywords so that "Akku?" still matches "akku".
    stripped_tokens = (
        re.sub(r'^\W+|\W+$', '', token) for token in question.lower().split()
    )
    keywords = [token for token in stripped_tokens if token]

    # Split after a period that is followed by whitespace, so sentences keep
    # their period and numbers like "3.5" are not torn apart.
    sentences = re.split(r'(?<=\.)\s+', (context or "").strip())
    relevant_sentences = [
        sentence for sentence in sentences
        if any(keyword in sentence.lower() for keyword in keywords)
    ]
    if relevant_sentences:
        return " ".join(relevant_sentences)
    return "Die Antwort konnte nicht eindeutig aus dem Dokument ermittelt werden."
|
68 |
|
69 |
+
# Hauptfunktion für den Chatbot
|
70 |
def chatbot_response(pdf_path, question):
|
71 |
+
# Text extrahieren und bereinigen
|
72 |
+
context = clean_text(extract_text_from_pdf(pdf_path))
|
73 |
context_parts = split_text_into_paragraphs(context)
|
74 |
+
|
75 |
+
# Relevante Abschnitte finden
|
76 |
relevant_parts = find_relevant_parts(question, context_parts)
|
77 |
|
78 |
+
# Antworten aus relevanten Abschnitten generieren
|
79 |
answers = []
|
80 |
for part in relevant_parts:
|
81 |
try:
|
|
|
83 |
answers.append(result['answer'])
|
84 |
except Exception:
|
85 |
continue
|
86 |
+
|
87 |
+
# Beste Antwort auswählen und verfeinern
|
88 |
+
combined_context = " ".join(relevant_parts)
|
89 |
+
final_answer = refine_answer(" ".join(answers).strip(), question, combined_context)
|
90 |
return final_answer
|
91 |
|
92 |
+
# Gradio-Interface erstellen
|
93 |
pdf_input = gr.File(label="PDF-Datei hochladen", type="filepath")
|
94 |
question_input = gr.Textbox(label="Frage eingeben", placeholder="Stelle eine Frage zu dem PDF-Dokument")
|
95 |
response_output = gr.Textbox(label="Antwort")
|