la04 committed on
Commit
b0a7bef
·
verified ·
1 Parent(s): 93850b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -15
app.py CHANGED
@@ -18,8 +18,8 @@ def extract_text_from_pdf(pdf_path):
18
  return text
19
 
20
  def clean_text(text):
21
- text = re.sub(r'\s+', ' ', text)
22
- text = re.sub(r'[^\w\s.,-]', '', text)
23
  return text.strip()
24
 
25
  def split_text_into_paragraphs(text, max_length=500):
@@ -36,27 +36,46 @@ def split_text_into_paragraphs(text, max_length=500):
36
  refined_paragraphs.append(temp.strip())
37
  return refined_paragraphs
38
 
 
39
  def find_relevant_parts(question, context_parts):
40
- keywords = question.split()
41
  relevant_parts = [
42
- part for part in context_parts if any(keyword.lower() in part.lower() for keyword in keywords)
43
  ]
44
- return relevant_parts if relevant_parts else context_parts
 
 
 
 
 
 
 
 
45
 
46
- def validate_and_refine_answer(answer):
47
- if not answer or len(answer.split()) < 5:
 
 
 
 
 
 
 
 
 
48
  return "Die Antwort konnte nicht eindeutig aus dem Dokument ermittelt werden."
49
- invalid_phrases = ["bluetooth", "hand", "ke", "eingelegt"]
50
- for phrase in invalid_phrases:
51
- answer = answer.replace(phrase, "")
52
  return answer.capitalize().strip()
53
 
 
54
  def chatbot_response(pdf_path, question):
55
- context = extract_text_from_pdf(pdf_path)
56
- context = clean_text(context)
57
  context_parts = split_text_into_paragraphs(context)
 
 
58
  relevant_parts = find_relevant_parts(question, context_parts)
59
 
 
60
  answers = []
61
  for part in relevant_parts:
62
  try:
@@ -64,11 +83,13 @@ def chatbot_response(pdf_path, question):
64
  answers.append(result['answer'])
65
  except Exception:
66
  continue
67
-
68
- final_answer = validate_and_refine_answer(" ".join(answers).strip())
 
 
69
  return final_answer
70
 
71
- # Gradio-Interface
72
  pdf_input = gr.File(label="PDF-Datei hochladen", type="filepath")
73
  question_input = gr.Textbox(label="Frage eingeben", placeholder="Stelle eine Frage zu dem PDF-Dokument")
74
  response_output = gr.Textbox(label="Antwort")
 
18
  return text
19
 
20
  def clean_text(text):
21
+ text = re.sub(r'\s+', ' ', text) # Mehrere Leerzeichen und Zeilenumbrüche reduzieren
22
+ text = re.sub(r'[^\w\s.,-]', '', text) # Entfernen von Sonderzeichen
23
  return text.strip()
24
 
25
  def split_text_into_paragraphs(text, max_length=500):
 
36
  refined_paragraphs.append(temp.strip())
37
  return refined_paragraphs
38
 
39
+ # Funktion zur Relevanzbewertung von Abschnitten
40
  def find_relevant_parts(question, context_parts):
41
+ keywords = question.lower().split()
42
  relevant_parts = [
43
+ part for part in context_parts if any(keyword in part.lower() for keyword in keywords)
44
  ]
45
+ if not relevant_parts:
46
+ # Fallback: Abschnitte mit den meisten Übereinstimmungen wählen
47
+ keyword_counts = [
48
+ (part, sum(part.lower().count(keyword) for keyword in keywords))
49
+ for part in context_parts
50
+ ]
51
+ keyword_counts.sort(key=lambda x: x[1], reverse=True)
52
+ relevant_parts = [keyword_counts[0][0]] if keyword_counts else context_parts
53
+ return relevant_parts
54
 
55
+ # Funktion für Antwort-Postprocessing
56
+ def refine_answer(answer, question, context):
57
+ if not answer or len(answer.split()) < 3:
58
+ # Versuche, die Antwort direkt aus dem Kontext zu extrahieren
59
+ keywords = question.lower().split()
60
+ relevant_sentences = [
61
+ sentence for sentence in context.split('.')
62
+ if any(keyword in sentence.lower() for keyword in keywords)
63
+ ]
64
+ if relevant_sentences:
65
+ return " ".join(relevant_sentences).strip()
66
  return "Die Antwort konnte nicht eindeutig aus dem Dokument ermittelt werden."
 
 
 
67
  return answer.capitalize().strip()
68
 
69
+ # Hauptfunktion für den Chatbot
70
  def chatbot_response(pdf_path, question):
71
+ # Text extrahieren und bereinigen
72
+ context = clean_text(extract_text_from_pdf(pdf_path))
73
  context_parts = split_text_into_paragraphs(context)
74
+
75
+ # Relevante Abschnitte finden
76
  relevant_parts = find_relevant_parts(question, context_parts)
77
 
78
+ # Antworten aus relevanten Abschnitten generieren
79
  answers = []
80
  for part in relevant_parts:
81
  try:
 
83
  answers.append(result['answer'])
84
  except Exception:
85
  continue
86
+
87
+ # Beste Antwort auswählen und verfeinern
88
+ combined_context = " ".join(relevant_parts)
89
+ final_answer = refine_answer(" ".join(answers).strip(), question, combined_context)
90
  return final_answer
91
 
92
+ # Gradio-Interface erstellen
93
  pdf_input = gr.File(label="PDF-Datei hochladen", type="filepath")
94
  question_input = gr.Textbox(label="Frage eingeben", placeholder="Stelle eine Frage zu dem PDF-Dokument")
95
  response_output = gr.Textbox(label="Antwort")