Update app.py
app.py
CHANGED
@@ -11,7 +11,7 @@ logging.basicConfig(level=logging.INFO)
 
 # Load models
 model = SentenceTransformer('all-MiniLM-L6-v2')
-qa_model = pipeline("question-answering", model="deepset/
+qa_model = pipeline("question-answering", model="deepset/bert-large-uncased-whole-word-masking-finetuned-squad")
 
 # Build the FAISS index
 def create_faiss_index(documents):
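For reference, the calls visible in this diff (SentenceTransformer, pipeline, the FAISS index, np.dot, PyMuPDF's load_page/get_text, and the logging setup in the hunk header) imply an import block near the top of app.py that the diff does not show. A plausible sketch; every import here is an assumption, not part of the commit:

import logging
import numpy as np
import faiss                 # vector index used by create_faiss_index / search
import fitz                  # PyMuPDF, used by extract_text_from_pdf
from sentence_transformers import SentenceTransformer
from transformers import pipeline

logging.basicConfig(level=logging.INFO)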
@@ -22,8 +22,8 @@ def create_faiss_index(documents):
     index.add(document_embeddings)
     return index, documents
 
-# Text in
-def split_text_into_chunks(text, chunk_size=
+# Split text into larger chunks
+def split_text_into_chunks(text, chunk_size=500):  # Larger chunks
     words = text.split()
     return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
 
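The new chunking default is easy to sanity-check in isolation: with chunk_size=500 most pages collapse into a single chunk, and a smaller value makes the word-based splitting visible. An illustrative check with hypothetical input:

text = "one two three four five six seven"
print(split_text_into_chunks(text, chunk_size=3))
# -> ['one two three', 'four five six', 'seven']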
@@ -34,7 +34,7 @@ def extract_text_from_pdf(pdf_path):
     for page_num in range(len(doc)):
         page = doc.load_page(page_num)
         text = page.get_text("text")
-        if not text.strip():
+        if not text.strip():
             logging.warning(f"Empty text on page {page_num}")
         chunks = split_text_into_chunks(text)
         text_chunks.extend(chunks)
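Only the middle of extract_text_from_pdf appears in this hunk. Judging from the PyMuPDF calls shown (doc.load_page, page.get_text), the full function plausibly opens the document and returns the collected chunks; the first and last lines below are assumptions, not part of the diff:

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)              # assumed: open the PDF with PyMuPDF
    text_chunks = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text("text")
        if not text.strip():
            logging.warning(f"Empty text on page {page_num}")
        chunks = split_text_into_chunks(text)
        text_chunks.extend(chunks)
    return text_chunks                     # assumed return value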
@@ -44,15 +44,14 @@ def extract_text_from_pdf(pdf_path):
 def rank_contexts_by_relevance(query, contexts):
     query_embedding = model.encode([query])[0].astype('float32')
     context_embeddings = model.encode(contexts)
-    scores = np.dot(query_embedding, context_embeddings.T)
+    scores = np.dot(query_embedding, context_embeddings.T)
     ranked_contexts = sorted(zip(scores, contexts), key=lambda x: x[0], reverse=True)
     return [context for _, context in ranked_contexts[:5]]  # Return only the top 5 contexts
 
-# Search
+# Search for the best answers
 def search_and_rank_answers(query, index, documents, k=10):
     query_embedding = model.encode([query])[0].astype('float32')
     D, I = index.search(np.array([query_embedding]), k=k)
-
     ranked_contexts = [documents[i] for i in I[0]]
     top_contexts = rank_contexts_by_relevance(query, ranked_contexts)
 
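A note on the scoring line this hunk keeps: np.dot on raw MiniLM embeddings ranks by unnormalized inner product, so embedding magnitude influences the ranking. If cosine similarity is the intent, SentenceTransformer's encode can normalize the vectors first. A sketch of that variant, assuming the same model object:

query_embedding = model.encode([query], normalize_embeddings=True)[0].astype('float32')
context_embeddings = model.encode(contexts, normalize_embeddings=True)
scores = np.dot(query_embedding, context_embeddings.T)  # equals cosine similarity on unit vectors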
@@ -69,8 +68,7 @@ def search_and_rank_answers(query, index, documents, k=10):
 
 # Combine answers
 def combine_answers(answers):
-
-    return " ".join(answers[:3])
+    return " ".join(set(answers[:3]))  # Removes duplicates and combines
 
 # Overall process
 def chatbot_response(pdf_path, question):
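One side effect of the new combine_answers: set() discards insertion order, so the top-ranked answer is not guaranteed to come first in the combined string. If ranking order matters, an order-preserving dedupe is a one-line change (a sketch, not part of this commit):

def combine_answers(answers):
    return " ".join(dict.fromkeys(answers[:3]))  # removes duplicates, keeps ranking order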