Spaces:
Sleeping
Sleeping
index and searching done
Browse files
app.py
CHANGED
@@ -9,7 +9,7 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
9 |
import numpy as np
|
10 |
import nltk
|
11 |
from nltk.tokenize import sent_tokenize
|
12 |
-
|
13 |
|
14 |
def is_new_file_upload(uploaded_file):
|
15 |
if 'last_uploaded_file' in st.session_state:
|
@@ -26,6 +26,20 @@ def is_new_file_upload(uploaded_file):
|
|
26 |
# st.write("This is the first file upload detected.")
|
27 |
st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
|
28 |
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
big_text = """
|
30 |
<div style='text-align: center;'>
|
31 |
<h1 style='font-size: 30x;'>Knowledge Extraction 1</h1>
|
@@ -39,7 +53,7 @@ uploaded_json_file = st.file_uploader("Upload a pre-processed file",
|
|
39 |
st.markdown(
|
40 |
f'<a href="https://ikmtechnology.github.io/ikmtechnology/untethered_extracted_paragraphs.json" target="_blank">Sample 1 download and then upload to above</a>',
|
41 |
unsafe_allow_html=True)
|
42 |
-
|
43 |
if uploaded_json_file is not None:
|
44 |
if is_new_file_upload(uploaded_json_file):
|
45 |
print("is new file uploaded")
|
@@ -70,6 +84,8 @@ if 'is_initialized' not in st.session_state:
|
|
70 |
st.session_state['is_initialized'] = True
|
71 |
|
72 |
nltk.download('punkt')
|
|
|
|
|
73 |
st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
|
74 |
st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
|
75 |
|
@@ -83,7 +99,7 @@ if 'list_count' in st.session_state:
|
|
83 |
#print(paragraph)
|
84 |
|
85 |
progress_percentage = (index) / (st.session_state.list_count - 1)
|
86 |
-
print(progress_percentage)
|
87 |
read_progress_bar.progress(progress_percentage)
|
88 |
|
89 |
sentence_encodings = []
|
@@ -101,3 +117,53 @@ if 'list_count' in st.session_state:
|
|
101 |
sentence_encodings.append([sentence, sentence_encoding])
|
102 |
# sentence_encodings.append([sentence,bert_model(**sentence_tokens).last_hidden_state[:, 0, :].detach().numpy()])
|
103 |
st.session_state.paragraph_sentence_encodings.append([paragraph, sentence_encodings])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
import numpy as np
|
10 |
import nltk
|
11 |
from nltk.tokenize import sent_tokenize
|
12 |
+
from nltk.corpus import stopwords
|
13 |
|
14 |
def is_new_file_upload(uploaded_file):
|
15 |
if 'last_uploaded_file' in st.session_state:
|
|
|
26 |
# st.write("This is the first file upload detected.")
|
27 |
st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
|
28 |
return True
|
29 |
+
def combined_similarity(similarity, sentence, query):
    """Boost an embedding similarity score by query/sentence word overlap.

    Both texts are split on whitespace; each token has leading/trailing
    punctuation stripped and is lower-cased, and tokens found in
    ``st.session_state.stop_words`` are discarded.  The fraction of the
    query's remaining words that also occur in the sentence is added to
    ``similarity``.

    Args:
        similarity: Base similarity score for the (query, sentence) pair.
        sentence: Candidate sentence text.
        query: User query text.

    Returns:
        ``similarity`` plus an overlap bonus in ``[0, 1]``.  The result is
        therefore NOT bounded by 1: it can exceed ``similarity`` by up to 1.
    """
    import string  # local import so this block does not depend on file-level imports

    stop_words = st.session_state.stop_words

    def content_words(text):
        # Normalise before comparing: without this, "death?" in a query never
        # matches "death" in a sentence, and "Dream" never matches "dream".
        words = set()
        for raw_token in text.split():
            token = raw_token.strip(string.punctuation).lower()
            if token and token not in stop_words:
                words.add(token)
        return words

    sentence_words = content_words(sentence)
    query_words = content_words(query)

    # Number of distinct non-stop-words shared by the query and the sentence.
    common_words = len(sentence_words & query_words)

    # Normalise by the query's word count; max(..., 1) guards against a query
    # made up only of stop words / punctuation (empty set -> bonus of 0).
    return similarity + common_words / max(len(query_words), 1)
|
42 |
+
|
43 |
big_text = """
|
44 |
<div style='text-align: center;'>
|
45 |
<h1 style='font-size: 30x;'>Knowledge Extraction 1</h1>
|
|
|
53 |
st.markdown(
|
54 |
f'<a href="https://ikmtechnology.github.io/ikmtechnology/untethered_extracted_paragraphs.json" target="_blank">Sample 1 download and then upload to above</a>',
|
55 |
unsafe_allow_html=True)
|
56 |
+
st.markdown("sample queries for above file: <br/> What is death? What is a lucid dream? What is the seat of consciousness?",unsafe_allow_html=True)
|
57 |
if uploaded_json_file is not None:
|
58 |
if is_new_file_upload(uploaded_json_file):
|
59 |
print("is new file uploaded")
|
|
|
84 |
st.session_state['is_initialized'] = True
|
85 |
|
86 |
nltk.download('punkt')
|
87 |
+
nltk.download('stopwords')
|
88 |
+
st.session_state.stop_words = set(stopwords.words('english'))
|
89 |
st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
|
90 |
st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
|
91 |
|
|
|
99 |
#print(paragraph)
|
100 |
|
101 |
progress_percentage = (index) / (st.session_state.list_count - 1)
|
102 |
+
# print(progress_percentage)
|
103 |
read_progress_bar.progress(progress_percentage)
|
104 |
|
105 |
sentence_encodings = []
|
|
|
117 |
sentence_encodings.append([sentence, sentence_encoding])
|
118 |
# sentence_encodings.append([sentence,bert_model(**sentence_tokens).last_hidden_state[:, 0, :].detach().numpy()])
|
119 |
st.session_state.paragraph_sentence_encodings.append([paragraph, sentence_encodings])
|
120 |
+
st.rerun()
|
121 |
+
# Query UI: once paragraph/sentence encodings exist in the session, embed the
# user's query with the same BERT model and rank paragraphs against it.
if 'paragraph_sentence_encodings' in st.session_state:
    query = st.text_input("Enter your query")
    if query:
        bert_model = st.session_state.bert_model
        # Put the query tensors on whatever device the model lives on instead
        # of hard-coding 'cuda', so the two can never disagree.
        query_tokens = st.session_state.bert_tokenizer(
            query, return_tensors="pt", padding=True, truncation=True
        ).to(bert_model.device)
        with torch.no_grad():  # Disable gradient calculation for inference
            # Take the hidden state at sequence position 0 as the query
            # embedding, then move it to the CPU as a NumPy array.
            query_encoding = bert_model(**query_tokens).last_hidden_state[:, 0, :].cpu().numpy()

        paragraph_scores = []
        # Per-sentence (score, sentence) pairs; collected and sorted but not
        # displayed by the code below.
        sentence_scores = []
        for paragraph_sentence_encoding in st.session_state.paragraph_sentence_encodings:
            # Each entry is [paragraph, [[sentence, encoding], ...]].
            sentence_similarities = []
            for sentence_encoding in paragraph_sentence_encoding[1]:
                if sentence_encoding:
                    similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
                    combined_score = combined_similarity(similarity, sentence_encoding[0], query)
                    sentence_similarities.append(combined_score)
                    sentence_scores.append((combined_score, sentence_encoding[0]))
            sentence_similarities.sort(reverse=True)

            # Paragraph score = mean of its (up to) three best sentence
            # scores; a paragraph with no scored sentences gets 0.
            if sentence_similarities:
                top_three_avg_similarity = np.mean(sentence_similarities[:3])
            else:
                top_three_avg_similarity = 0
            paragraph_scores.append((top_three_avg_similarity, paragraph_sentence_encoding[0]))

        sentence_scores = sorted(sentence_scores, key=lambda x: x[0], reverse=True)

        # Sort the paragraphs by score, best first.
        paragraph_scores = sorted(paragraph_scores, key=lambda x: x[0], reverse=True)

        st.write("Top scored paragraphs and their scores:")
        for score, paragraph in paragraph_scores[:5]:  # Show the top 5
            st.write(f"Score: {score}, Paragraph: {paragraph['text']}")
|