Spaces:
Sleeping
Sleeping
File size: 3,955 Bytes
40a799a fbe3ac4 318ff7b d859c3e 7bf65ec d93fe74 05103a4 d93fe74 6378bf1 318ff7b d93fe74 318ff7b e8a59ae 0c5dd07 b12560a 318ff7b d859c3e 318ff7b d859c3e 05103a4 0c5dd07 d859c3e 42c84c3 cb92135 05103a4 cb92135 0c5dd07 40a799a 0c5dd07 cb92135 0c5dd07 cb92135 b12560a 0c5dd07 cb92135 b12560a 0c5dd07 cb92135 0c5dd07 05103a4 42c84c3 d93fe74 adbd41e d859c3e 0c5dd07 d93fe74 d859c3e 318ff7b d859c3e 318ff7b 0c5dd07 318ff7b cb92135 0c5dd07 318ff7b d93fe74 318ff7b ee9ba92 fbe3ac4 318ff7b 0c5dd07 fbe3ac4 ee9ba92 318ff7b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import faiss
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
import fitz # PyMuPDF für die Textextraktion aus PDFs
from transformers import pipeline
import logging
# Configure logging: emit INFO-level messages and above.
logging.basicConfig(level=logging.INFO)
# Load the models once at import time: a sentence-embedding model (used by the
# indexing and ranking functions below) and an extractive question-answering
# pipeline (used to pull an answer span out of each retrieved context).
model = SentenceTransformer('all-MiniLM-L6-v2')
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# FAISS-Index erstellen
def create_faiss_index(documents):
    """Embed ``documents`` and store the vectors in a flat L2 FAISS index.

    Args:
        documents: Sequence of text chunks to index.

    Returns:
        A tuple ``(index, documents)``: the ``faiss.IndexFlatL2`` holding one
        vector per document (row ``i`` belongs to ``documents[i]``) and the
        ``documents`` object that was passed in. For an empty ``documents``
        the index is empty but still has the embedding model's dimension.
    """
    if len(documents) == 0:
        # Nothing to embed: hand back an empty index of the right width
        # instead of failing while looking up the first embedding.
        dimension = model.get_sentence_embedding_dimension()
        return faiss.IndexFlatL2(dimension), documents

    # FAISS expects a C-contiguous float32 matrix of shape (n_docs, dim).
    document_embeddings = np.ascontiguousarray(
        model.encode(documents), dtype=np.float32
    )
    index = faiss.IndexFlatL2(document_embeddings.shape[1])
    index.add(document_embeddings)
    return index, documents
# Text in kleinere Chunks aufteilen
def split_text_into_chunks(text, chunk_size=300):
    """Split ``text`` into chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (including newlines) acts as a single separator and is replaced by one
    space inside the returned chunks.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in document order. The list is empty when
        ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step would make range() yield nothing and silently discard
    # the whole text; a zero step fails with an unrelated-looking message.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
# Text aus PDF extrahieren
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF and return it as a flat list of chunks.

    Each page is read with ``get_text("text")`` and split with
    ``split_text_into_chunks`` (using its default chunk size); the chunks of
    all pages are concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of text chunks. Pages without any text contribute nothing
        and are reported with a warning.
    """
    text_chunks = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text")
            if not text.strip():
                # Typically a scanned or image-only page: nothing to chunk.
                logging.warning(f"Leerer Text auf Seite {page_num}")
                continue
            text_chunks.extend(split_text_into_chunks(text))
    return text_chunks
# Kontexte nach Relevanz bewerten
def rank_contexts_by_relevance(query, contexts, top_k=5):
    """Return the ``top_k`` contexts most relevant to ``query``.

    Relevance is the dot product between the embedding of ``query`` and the
    embedding of each context, both produced by the module-level ``model``.

    Args:
        query: The question text.
        contexts: Sequence of candidate context strings.
        top_k: Maximum number of contexts to return (defaults to 5).

    Returns:
        A list of at most ``top_k`` contexts, highest score first. Contexts
        with equal scores keep their input order. Empty when ``contexts``
        is empty.
    """
    if len(contexts) == 0:
        # Nothing to rank; skip the embedding model entirely.
        return []

    query_embedding = model.encode([query])[0].astype('float32')
    context_embeddings = np.asarray(model.encode(contexts))
    # One score per context: (n_contexts, dim) @ (dim,) -> (n_contexts,)
    scores = context_embeddings @ query_embedding
    # Stable sort on the negated scores gives descending order while keeping
    # ties in their original order.
    best_first = np.argsort(-scores, kind="stable")[:top_k]
    return [contexts[i] for i in best_first]
# Suche und Bewertung
def search_and_rank_answers(query, index, documents, k=10):
    """Retrieve contexts for ``query`` and extract answers from the best ones.

    The query is embedded, its nearest neighbours are looked up in ``index``,
    the matching documents are re-ranked with ``rank_contexts_by_relevance``
    and the question-answering pipeline is run on each remaining context.

    Args:
        query: The question text.
        index: FAISS index whose row ``i`` corresponds to ``documents[i]``.
        documents: Sequence of text chunks that were indexed.
        k: Maximum number of nearest neighbours to retrieve.

    Returns:
        A list of answer strings sorted by descending pipeline score. The
        list is empty when there is nothing to search or every extraction
        attempt failed.
    """
    if len(documents) == 0:
        return []

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with the id -1, which would otherwise index the *last* document.
    k = min(k, index.ntotal, len(documents))
    if k < 1:
        return []

    query_embedding = model.encode([query])[0].astype('float32')
    _, neighbour_ids = index.search(np.array([query_embedding]), k)
    candidates = [
        documents[i] for i in neighbour_ids[0] if 0 <= i < len(documents)
    ]
    if not candidates:
        return []

    top_contexts = rank_contexts_by_relevance(query, candidates)

    scored_answers = []
    for context in top_contexts:
        try:
            result = qa_model(question=query, context=context)
            scored_answers.append((result['answer'], result['score']))
        except Exception as e:
            # Best effort: one failing context must not discard the others.
            logging.warning(f"Fehler bei der Antwortgenerierung: {e}")

    scored_answers.sort(key=lambda pair: pair[1], reverse=True)
    return [answer for answer, _ in scored_answers]
# Antworten kombinieren
def combine_answers(answers, max_answers=3):
    """Join the best distinct answers into a single string.

    Args:
        answers: Answer strings, best first.
        max_answers: Maximum number of answers to include (defaults to 3).

    Returns:
        The first ``max_answers`` distinct, non-empty answers joined by single
        spaces, or an empty string when there is nothing to combine.
    """
    # Several contexts often yield the same span; repeating it adds nothing.
    # dict.fromkeys removes duplicates while keeping the ranking order.
    distinct = list(
        dict.fromkeys(
            answer.strip() for answer in answers if answer and answer.strip()
        )
    )
    return " ".join(distinct[:max_answers])
# Gesamtprozess
def chatbot_response(pdf_path, question):
    """Answer ``question`` from the contents of the PDF at ``pdf_path``.

    Pipeline: extract text chunks from the PDF, index them, retrieve and
    rank answers for the question, then combine the best answers.

    Args:
        pdf_path: Path of the uploaded PDF; falsy when nothing was uploaded.
        question: The user's question.

    Returns:
        The combined answer, or a short user-facing message when the upload
        or question is missing, the PDF yields no text, or no answer could
        be extracted.
    """
    logging.info(f"Frage: {question}")

    # Validate the inputs up front so the user gets a clear message instead
    # of an error from deep inside the pipeline.
    if not pdf_path:
        return "Bitte lade zuerst eine PDF-Datei hoch."
    if not question or not question.strip():
        return "Bitte gib eine Frage ein."

    text_chunks = extract_text_from_pdf(pdf_path)
    if not text_chunks:
        # E.g. a scanned PDF without a text layer: nothing can be indexed.
        return "Im PDF wurde kein auslesbarer Text gefunden."

    index, documents = create_faiss_index(text_chunks)
    answers = search_and_rank_answers(question, index, documents, k=10)
    if not answers:
        return "Es konnte keine Antwort gefunden werden."

    detailed_answer = combine_answers(answers)
    logging.info(f"Antwort: {detailed_answer}")
    return detailed_answer
# Gradio interface: one PDF upload and one question box in, one answer box out.
pdf_input = gr.File(
    type="filepath",
    label="PDF-Datei hochladen",
)
question_input = gr.Textbox(
    placeholder="Stelle eine Frage zu dem PDF-Dokument",
    label="Frage eingeben",
)
response_output = gr.Textbox(label="Antwort")

interface = gr.Interface(
    title="PDF-Fragebeantwortung mit FAISS und Transformers",
    description="Lade eine PDF-Datei hoch und stelle Fragen zu ihrem Inhalt. Das System kombiniert mehrere Antworten, um präzisere Ergebnisse zu liefern.",
    fn=chatbot_response,
    inputs=[pdf_input, question_input],
    outputs=response_output,
)

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    interface.launch()
|