Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -9,9 +9,9 @@ import logging
|
|
9 |
# Logging konfigurieren
|
10 |
logging.basicConfig(level=logging.INFO)
|
11 |
|
12 |
-
#
|
13 |
-
model = SentenceTransformer('all-mpnet-base-v2')
|
14 |
-
qa_model = pipeline("question-answering", model="
|
15 |
|
16 |
# FAISS-Index erstellen
|
17 |
def create_faiss_index(documents):
|
@@ -19,57 +19,57 @@ def create_faiss_index(documents):
|
|
19 |
dimension = len(document_embeddings[0])
|
20 |
index = faiss.IndexFlatL2(dimension)
|
21 |
document_embeddings = np.array(document_embeddings).astype('float32')
|
22 |
-
index.add(document_embeddings)
|
23 |
return index, documents
|
24 |
|
25 |
-
#
|
26 |
def extract_text_from_pdf(pdf_path):
|
27 |
doc = fitz.open(pdf_path)
|
28 |
text_chunks = []
|
29 |
for page_num in range(len(doc)):
|
30 |
page = doc.load_page(page_num)
|
31 |
text = page.get_text("text")
|
32 |
-
# Text in kleinere Abschnitte (z. B. Absätze) unterteilen
|
33 |
chunks = text.split('\n\n') # Unterteilen nach Absätzen
|
34 |
text_chunks.extend(chunks)
|
35 |
return text_chunks
|
36 |
|
37 |
-
#
|
38 |
def search_documents(query, index, documents, k=5):
|
39 |
query_embedding = model.encode([query])[0].astype('float32')
|
40 |
-
D, I = index.search(np.array([query_embedding]), k=k)
|
41 |
results = [documents[i] for i in I[0]]
|
42 |
-
return " ".join(results) # Kombiniere
|
43 |
|
44 |
-
#
|
45 |
def generate_answer(context, question):
|
46 |
-
|
|
|
|
|
47 |
return result['answer']
|
48 |
|
49 |
-
#
|
50 |
def chatbot_response(pdf_path, question):
|
51 |
logging.info(f"Frage: {question}")
|
52 |
|
53 |
-
#
|
54 |
text_chunks = extract_text_from_pdf(pdf_path)
|
55 |
|
56 |
# FAISS-Index erstellen
|
57 |
index, documents = create_faiss_index(text_chunks)
|
58 |
|
59 |
-
#
|
60 |
context = search_documents(question, index, documents, k=5)
|
61 |
-
logging.info(f"
|
62 |
|
63 |
# Antwort generieren
|
64 |
answer = generate_answer(context, question)
|
65 |
return answer
|
66 |
|
67 |
# Gradio-Interface
|
68 |
-
pdf_input = gr.File(label="PDF-Datei hochladen", type="filepath")
|
69 |
question_input = gr.Textbox(label="Frage eingeben", placeholder="Stelle eine Frage zu dem PDF-Dokument")
|
70 |
response_output = gr.Textbox(label="Antwort")
|
71 |
|
72 |
-
# Gradio-Interface erstellen
|
73 |
interface = gr.Interface(
|
74 |
fn=chatbot_response,
|
75 |
inputs=[pdf_input, question_input],
|
|
|
9 |
# Logging konfigurieren
|
10 |
logging.basicConfig(level=logging.INFO)
|
11 |
|
12 |
+
# Modelle laden
|
13 |
+
model = SentenceTransformer('all-mpnet-base-v2')
|
14 |
+
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
|
15 |
|
16 |
# FAISS-Index erstellen
|
17 |
def create_faiss_index(documents):
|
|
|
19 |
dimension = len(document_embeddings[0])
|
20 |
index = faiss.IndexFlatL2(dimension)
|
21 |
document_embeddings = np.array(document_embeddings).astype('float32')
|
22 |
+
index.add(document_embeddings)
|
23 |
return index, documents
|
24 |
|
25 |
+
# Text aus PDF extrahieren (kleinere Abschnitte)
|
26 |
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF as a flat list of paragraph-sized chunks.

    The plain text of every page is split on blank lines and the pieces of
    all pages are collected in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list[str]: Text chunks in document order. Chunks are neither stripped
        nor filtered, so empty or whitespace-only entries can occur.
    """
    text_chunks = []
    # Open the document in a context manager so it is closed again even if
    # reading a page fails; previously the file handle was never released.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")
            # Split the page text into paragraphs.
            text_chunks.extend(text.split('\n\n'))
    return text_chunks
|
35 |
|
36 |
+
# Suche nach mehreren passenden Abschnitten
|
37 |
def search_documents(query, index, documents, k=5):
    """Retrieve the passages closest to ``query`` and join them into one text.

    Args:
        query: Question or search string to embed.
        index: FAISS index whose result ids are used as positions in
            ``documents``.
        documents: Passages, indexable by the ids the index returns.
        k: Maximum number of passages to retrieve.

    Returns:
        str: The retrieved passages joined by single spaces, in the order the
        index returned them; an empty string if no valid hit was returned.
    """
    # Encode a one-element batch and take its single row.
    query_embedding = model.encode([query])[0].astype('float32')
    _distances, neighbour_ids = index.search(np.array([query_embedding]), k=k)
    # FAISS fills missing results with -1 when the index holds fewer than k
    # vectors. Indexing ``documents`` with -1 would silently return the last
    # passage as a bogus hit, so keep only ids that are real positions.
    results = [
        documents[i] for i in neighbour_ids[0] if 0 <= i < len(documents)
    ]
    return " ".join(results)  # combine several hits into one context
|
42 |
|
43 |
+
# QA-Modell für präzise Antworten nutzen
|
44 |
def generate_answer(context, question, max_context_length=512):
    """Answer ``question`` from ``context`` using the question-answering model.

    Args:
        context: Text in which the answer is searched.
        question: Question to answer.
        max_context_length: Maximum number of whitespace-separated words of
            ``context`` passed to the model; any further words are dropped.
            Defaults to 512, the limit that used to be hard-coded.

    Returns:
        str: The ``answer`` field of the model result, or an empty string
        when ``context`` contains no words.
    """
    # Note: the limit counts whitespace-separated words, not model tokens.
    truncated_context = " ".join(context.split()[:max_context_length])
    if not truncated_context:
        # Nothing to search in. Skip the model call, which is expected to
        # reject an empty context with an error.
        return ""
    result = qa_model(question=question, context=truncated_context)
    return result['answer']
|
49 |
|
50 |
+
# Gesamtprozess
|
51 |
def chatbot_response(pdf_path, question):
    """Answer a question about an uploaded PDF.

    Steps: extract text chunks from the PDF, build a FAISS index over them,
    retrieve up to five matching chunks as context and run the
    question-answering model on that context.

    Args:
        pdf_path: File path of the uploaded PDF; falsy when nothing was
            uploaded.
        question: Question entered by the user.

    Returns:
        str: The generated answer, or a short hint for the user when the
        upload is missing, the question is blank, or the PDF yields no text.
    """
    logging.info("Frage: %s", question)

    # Validate the inputs up front; without these checks a missing upload or
    # blank question only surfaces as an obscure error further down.
    if not pdf_path:
        return "Bitte zuerst eine PDF-Datei hochladen."
    if not question or not question.strip():
        return "Bitte eine Frage eingeben."

    # Extract the text and drop blank chunks: they cannot contain an answer
    # and would only occupy retrieval slots in the index.
    text_chunks = [
        chunk for chunk in extract_text_from_pdf(pdf_path) if chunk.strip()
    ]
    if not text_chunks:
        return "Im PDF wurde kein Text gefunden."

    # Build the FAISS index.
    index, documents = create_faiss_index(text_chunks)

    # Look up the context; only its first 500 characters are logged.
    context = search_documents(question, index, documents, k=5)
    logging.info("Verwendeter Kontext: %s", context[:500])

    # Generate the answer.
    return generate_answer(context, question)
|
67 |
|
68 |
# Gradio-Interface
|
69 |
+
pdf_input = gr.File(label="PDF-Datei hochladen", type="filepath")
|
70 |
question_input = gr.Textbox(label="Frage eingeben", placeholder="Stelle eine Frage zu dem PDF-Dokument")
|
71 |
response_output = gr.Textbox(label="Antwort")
|
72 |
|
|
|
73 |
interface = gr.Interface(
|
74 |
fn=chatbot_response,
|
75 |
inputs=[pdf_input, question_input],
|