Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,106 +1,44 @@
|
|
1 |
-
import faiss
|
2 |
-
import numpy as np
|
3 |
import gradio as gr
|
4 |
-
from sentence_transformers import SentenceTransformer
|
5 |
-
import fitz # PyMuPDF für die Textextraktion aus PDFs
|
6 |
from transformers import pipeline
|
7 |
-
import
|
|
|
8 |
|
9 |
-
#
|
10 |
-
|
11 |
|
12 |
-
#
|
13 |
-
model = SentenceTransformer('all-MiniLM-L6-v2')
|
14 |
-
qa_model = pipeline("question-answering", model="deepset/bert-large-uncased-whole-word-masking-finetuned-squad")
|
15 |
-
|
16 |
-
# FAISS-Index erstellen
|
17 |
-
def create_faiss_index(documents):
    """Embed *documents* and build an exact L2 FAISS index over them.

    Returns the populated index together with the original document list so
    callers can map search hits back to their texts.
    """
    # Encode every document and coerce to the float32 matrix FAISS expects.
    embeddings = np.array(model.encode(documents)).astype('float32')
    faiss_index = faiss.IndexFlatL2(len(embeddings[0]))
    faiss_index.add(embeddings)
    return faiss_index, documents
|
24 |
-
|
25 |
-
# Text in größere Chunks aufteilen
|
26 |
-
def split_text_into_chunks(text, chunk_size=500):
    """Split *text* on whitespace into chunks of at most *chunk_size* words.

    Larger chunks give the QA model more surrounding context per passage.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
    return chunks
|
29 |
-
|
30 |
-
# Text aus PDF extrahieren
|
31 |
def extract_text_from_pdf(pdf_path):
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
text
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
text_chunks.extend(chunks)
|
41 |
-
return text_chunks
|
42 |
-
|
43 |
-
# Kontexte nach Relevanz bewerten
|
44 |
-
def rank_contexts_by_relevance(query, contexts):
    """Order *contexts* by dot-product similarity to *query*, best first,
    and return only the top five."""
    q_vec = model.encode([query])[0].astype('float32')
    ctx_vecs = model.encode(contexts)
    # One similarity score per context (dot product against each embedding).
    similarities = np.dot(q_vec, ctx_vecs.T)
    scored = sorted(zip(similarities, contexts), key=lambda pair: pair[0], reverse=True)
    top_five = [ctx for _, ctx in scored[:5]]
    return top_five
|
50 |
-
|
51 |
-
# Suche nach den besten Antworten
|
52 |
-
def search_and_rank_answers(query, index, documents, k=10):
    """Retrieve candidate contexts for *query* from the FAISS *index* and
    extract answers from them with the QA pipeline.

    The query is embedded, the *k* nearest documents are fetched, the list
    is narrowed by rank_contexts_by_relevance (top 5), and the QA model is
    run on each surviving context.  Answers are returned highest pipeline
    score first; contexts where the pipeline raises are skipped.
    """
    # Embed the query the same way the documents were embedded (float32 for FAISS).
    query_embedding = model.encode([query])[0].astype('float32')
    # D = distances, I = row indices of the k nearest document embeddings.
    D, I = index.search(np.array([query_embedding]), k=k)
    ranked_contexts = [documents[i] for i in I[0]]
    top_contexts = rank_contexts_by_relevance(query, ranked_contexts)

    ranked_answers = []
    for context in top_contexts:
        try:
            result = qa_model(question=query, context=context)
            ranked_answers.append((result['answer'], result['score']))
        except Exception as e:
            # NOTE(review): no `logging` import is visible in this view of
            # the file (the import line is truncated) — confirm it exists,
            # otherwise this handler raises NameError instead of warning.
            logging.warning(f"Fehler bei der Antwortgenerierung: {e}")

    # Sort by the pipeline's confidence score, best answer first.
    ranked_answers = sorted(ranked_answers, key=lambda x: x[1], reverse=True)
    return [answer for answer, _ in ranked_answers]
|
68 |
-
|
69 |
-
# Antworten kombinieren
|
70 |
-
def combine_answers(answers):
    """Merge the top candidate answers into a single string.

    Keeps at most the first three answers, removes duplicates while
    preserving their original (relevance) order, and joins them with
    spaces.  Fix: the previous ``set(answers[:3])`` had unspecified
    iteration order, so the combined answer varied between runs;
    ``dict.fromkeys`` deduplicates deterministically and keeps rank order.
    """
    return " ".join(dict.fromkeys(answers[:3]))
|
72 |
-
|
73 |
-
# Gesamtprozess
|
74 |
def chatbot_response(pdf_path, question):
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
answers = search_and_rank_answers(question, index, documents, k=10)
|
85 |
-
|
86 |
-
# Antworten kombinieren
|
87 |
-
detailed_answer = combine_answers(answers)
|
88 |
-
|
89 |
-
logging.info(f"Antwort: {detailed_answer}")
|
90 |
-
return detailed_answer
|
91 |
-
|
92 |
-
# Gradio-Interface
|
93 |
pdf_input = gr.File(label="PDF-Datei hochladen", type="filepath")
|
94 |
question_input = gr.Textbox(label="Frage eingeben", placeholder="Stelle eine Frage zu dem PDF-Dokument")
|
95 |
response_output = gr.Textbox(label="Antwort")
|
96 |
|
|
|
97 |
interface = gr.Interface(
|
98 |
fn=chatbot_response,
|
99 |
inputs=[pdf_input, question_input],
|
100 |
outputs=response_output,
|
101 |
-
title="PDF-Fragebeantwortung mit
|
102 |
-
description="Lade eine PDF-Datei hoch und stelle Fragen zu ihrem Inhalt. Das System
|
103 |
)
|
104 |
|
|
|
105 |
if __name__ == "__main__":
|
106 |
interface.launch()
|
|
|
|
|
|
|
1 |
import gradio as gr
|
|
|
|
|
2 |
from transformers import pipeline
|
3 |
+
from PyPDF2 import PdfReader
|
4 |
+
import os
|
5 |
|
6 |
+
# Question-answering pipeline used by chatbot_response.
# NOTE(review): "t5-small" is a plain seq2seq checkpoint, not an extractive
# QA model — the "question-answering" pipeline expects a span-prediction
# head (e.g. a SQuAD-finetuned BERT). Confirm this loads and produces
# sensible answers, or swap in a QA-finetuned checkpoint.
qa_model = pipeline("question-answering", model="t5-small")
|
8 |
|
9 |
+
# Funktion zur Extraktion von Text aus einer PDF-Datei
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of the PDF at *pdf_path*.

    Fix: PyPDF2's ``page.extract_text()`` returns ``None`` for pages it
    cannot extract (e.g. scanned/image-only pages); the previous
    ``text += page.extract_text()`` then raised ``TypeError``.  Such pages
    now contribute an empty string.  ``join`` also avoids quadratic string
    concatenation on large documents.
    """
    reader = PdfReader(pdf_path)
    return "".join(page.extract_text() or "" for page in reader.pages)
|
17 |
+
|
18 |
+
# Funktion zur Beantwortung der Frage basierend auf dem extrahierten PDF-Text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
def chatbot_response(pdf_path, question):
    """Answer *question* using the text of the PDF at *pdf_path*."""
    # Pull the full document text to serve as the QA context.
    document_text = extract_text_from_pdf(pdf_path)
    # Let the pipeline pick the best answer span from that context.
    return qa_model(question=question, context=document_text)['answer']
|
27 |
+
|
28 |
+
# Gradio-Interface erstellen
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
# Input widgets: a PDF file picker (passed through as a filesystem path)
# and a free-text question box; output is a plain text box for the answer.
pdf_input = gr.File(label="PDF-Datei hochladen", type="filepath")
question_input = gr.Textbox(label="Frage eingeben", placeholder="Stelle eine Frage zu dem PDF-Dokument")
response_output = gr.Textbox(label="Antwort")

# Gradio interface for the user-facing UI: wires the PDF upload and the
# question box into chatbot_response and displays its answer.
interface = gr.Interface(
    fn=chatbot_response,
    inputs=[pdf_input, question_input],
    outputs=response_output,
    title="PDF-Fragebeantwortung mit T5 und Transformers",
    description="Lade eine PDF-Datei hoch und stelle Fragen zu ihrem Inhalt. Das System verwendet T5, um die passende Antwort zu finden."
)
|
41 |
|
42 |
# Start the Gradio app only when this file is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    interface.launch()
|