la04 committed on
Commit
b264d4d
·
verified ·
1 Parent(s): 05103a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -9
app.py CHANGED
@@ -11,7 +11,7 @@ logging.basicConfig(level=logging.INFO)
11
 
12
  # Modelle laden
13
  model = SentenceTransformer('all-MiniLM-L6-v2')
14
- qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
15
 
16
  # FAISS-Index erstellen
17
  def create_faiss_index(documents):
@@ -22,8 +22,8 @@ def create_faiss_index(documents):
22
  index.add(document_embeddings)
23
  return index, documents
24
 
25
- # Text in kleinere Chunks aufteilen
26
- def split_text_into_chunks(text, chunk_size=300):
27
  words = text.split()
28
  return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
29
 
@@ -34,7 +34,7 @@ def extract_text_from_pdf(pdf_path):
34
  for page_num in range(len(doc)):
35
  page = doc.load_page(page_num)
36
  text = page.get_text("text")
37
- if not text.strip(): # Überprüfen, ob der Text leer ist
38
  logging.warning(f"Leerer Text auf Seite {page_num}")
39
  chunks = split_text_into_chunks(text)
40
  text_chunks.extend(chunks)
@@ -44,15 +44,14 @@ def extract_text_from_pdf(pdf_path):
44
  def rank_contexts_by_relevance(query, contexts):
45
  query_embedding = model.encode([query])[0].astype('float32')
46
  context_embeddings = model.encode(contexts)
47
- scores = np.dot(query_embedding, context_embeddings.T) # Dot-Produkt zur Berechnung der Relevanz
48
  ranked_contexts = sorted(zip(scores, contexts), key=lambda x: x[0], reverse=True)
49
  return [context for _, context in ranked_contexts[:5]] # Nur die Top 5 Kontexte zurückgeben
50
 
51
- # Suche und Bewertung
52
  def search_and_rank_answers(query, index, documents, k=10):
53
  query_embedding = model.encode([query])[0].astype('float32')
54
  D, I = index.search(np.array([query_embedding]), k=k)
55
-
56
  ranked_contexts = [documents[i] for i in I[0]]
57
  top_contexts = rank_contexts_by_relevance(query, ranked_contexts)
58
 
@@ -69,8 +68,7 @@ def search_and_rank_answers(query, index, documents, k=10):
69
 
70
  # Antworten kombinieren
71
  def combine_answers(answers):
72
- # Kombiniert die Top 3 Antworten zu einer einzigen Antwort
73
- return " ".join(answers[:3])
74
 
75
  # Gesamtprozess
76
  def chatbot_response(pdf_path, question):
 
11
 
12
  # Modelle laden
13
  model = SentenceTransformer('all-MiniLM-L6-v2')
14
+ qa_model = pipeline("question-answering", model="deepset/bert-large-uncased-whole-word-masking-finetuned-squad")
15
 
16
  # FAISS-Index erstellen
17
  def create_faiss_index(documents):
 
22
  index.add(document_embeddings)
23
  return index, documents
24
 
25
+ # Text in größere Chunks aufteilen
26
+ def split_text_into_chunks(text, chunk_size=500): # Größere Chunks
27
  words = text.split()
28
  return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
29
 
 
34
  for page_num in range(len(doc)):
35
  page = doc.load_page(page_num)
36
  text = page.get_text("text")
37
+ if not text.strip():
38
  logging.warning(f"Leerer Text auf Seite {page_num}")
39
  chunks = split_text_into_chunks(text)
40
  text_chunks.extend(chunks)
 
44
  def rank_contexts_by_relevance(query, contexts):
45
  query_embedding = model.encode([query])[0].astype('float32')
46
  context_embeddings = model.encode(contexts)
47
+ scores = np.dot(query_embedding, context_embeddings.T)
48
  ranked_contexts = sorted(zip(scores, contexts), key=lambda x: x[0], reverse=True)
49
  return [context for _, context in ranked_contexts[:5]] # Nur die Top 5 Kontexte zurückgeben
50
 
51
+ # Suche nach den besten Antworten
52
  def search_and_rank_answers(query, index, documents, k=10):
53
  query_embedding = model.encode([query])[0].astype('float32')
54
  D, I = index.search(np.array([query_embedding]), k=k)
 
55
  ranked_contexts = [documents[i] for i in I[0]]
56
  top_contexts = rank_contexts_by_relevance(query, ranked_contexts)
57
 
 
68
 
69
  # Antworten kombinieren
70
  def combine_answers(answers):
71
+ return " ".join(set(answers[:3])) # Entfernt Duplikate und kombiniert
 
72
 
73
  # Gesamtprozess
74
  def chatbot_response(pdf_path, question):