Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -60,19 +60,59 @@ def generate_with_gpt(prompt, max_length=150):
|
|
60 |
)
|
61 |
return gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
62 |
|
63 |
-
def
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
66 |
|
67 |
-
|
|
|
|
|
68 |
|
69 |
-
|
|
|
70 |
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
-
|
74 |
-
""
|
75 |
-
return generate_with_gpt(prompt)
|
76 |
|
77 |
def extract_direct_definition(text, term):
|
78 |
"""Try to find a sentence that directly defines the term"""
|
|
|
60 |
)
|
61 |
return gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
62 |
|
63 |
+
def ask_question(file, question, history):
    """Answer a question about an uploaded document.

    Extracts text from *file*, first tries keyword-triggered direct
    definitions, then falls back to semantic search over text chunks
    plus an extractive QA model.

    Args:
        file: Uploaded file object/path accepted by ``extract_text``;
            falsy when no file was provided.
        question: The user's question as a string.
        history: Mutable list of (question, answer) chat pairs; the new
            pair is appended in place.

    Returns:
        A 2-tuple ``("", history)`` — the empty string clears the input
        box in the UI; failures are reported as answer text in *history*
        rather than raised.
    """
    if not file:
        return "Please upload a file.", history

    text = extract_text(file)
    if not text:
        return "Could not extract text from the file.", history

    chunks = chunk_text(text)
    if not chunks:
        return "No meaningful text chunks could be created.", history

    answer = None

    try:
        # Normalize for case-insensitive keyword matching.
        normalized_question = question.lower().strip(" ?")

        # Keyword-triggered direct definitions. Only the FIRST matching
        # term is tried (preserves the original elif-chain semantics:
        # later terms are not consulted even if the lookup yields None).
        for term in ("artificial system", "natural system", "component"):
            if term in normalized_question:
                answer = extract_direct_definition(text, term)
                break

        # If no direct definition found, fall back to semantic search.
        if not answer:
            emb_chunks = embedder.encode(chunks, convert_to_tensor=True)
            emb_question = embedder.encode(question, convert_to_tensor=True)
            scores = util.pytorch_cos_sim(emb_question, emb_chunks)[0]
            best_idx = scores.argmax().item()
            best_chunk = chunks[best_idx]

            # Low confidence in the single best chunk: widen the QA
            # context to the top-k chunks joined together.
            if scores[best_idx] < 0.3:
                top_k = min(3, len(chunks))
                best_indices = scores.topk(top_k).indices.tolist()
                best_chunk = " ".join(chunks[i] for i in best_indices)

            result = qa_pipeline(question=question, context=best_chunk)
            # Reject low-score or one-word answers as unreliable.
            if result["score"] > 0.1 and len(result["answer"].split()) >= 2:
                answer = result["answer"]

        # Final fallback if no answer found.
        if not answer:
            answer = "Sorry, I couldn't find a clear answer in the document."

    except Exception as e:
        # Surface failures in the chat instead of crashing the UI.
        answer = f"An error occurred: {e}"

    history.append((question, answer))
    return "", history
|
|
|
116 |
|
117 |
def extract_direct_definition(text, term):
|
118 |
"""Try to find a sentence that directly defines the term"""
|