zmbfeng committed
Commit 5aa1276 · Parent(s): 561a0db

index and searching done

Files changed (1): app.py (+69 −3)
app.py CHANGED
```diff
@@ -9,7 +9,7 @@ from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 import nltk
 from nltk.tokenize import sent_tokenize
-
+from nltk.corpus import stopwords
 
 def is_new_file_upload(uploaded_file):
     if 'last_uploaded_file' in st.session_state:
```
```diff
@@ -26,6 +26,20 @@ def is_new_file_upload(uploaded_file):
         # st.write("This is the first file upload detected.")
         st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
         return True
+def combined_similarity(similarity, sentence, query):
+    # Tokenize both the sentence and the query
+    # sentence_words = set(sentence.split())
+    # query_words = set(query.split())
+    sentence_words = set(word for word in sentence.split() if word.lower() not in st.session_state.stop_words)
+    query_words = set(word for word in query.split() if word.lower() not in st.session_state.stop_words)
+
+    # Calculate the number of common words
+    common_words = len(sentence_words.intersection(query_words))
+
+    # Adjust the similarity score with the common words count
+    combined_score = similarity + (common_words / max(len(query_words), 1))  # Normalize by the length of the query to keep the score between -1 and 1
+    return combined_score
+
 big_text = """
 <div style='text-align: center;'>
     <h1 style='font-size: 30x;'>Knowledge Extraction 1</h1>
```
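The new `combined_similarity` helper blends BERT cosine similarity with raw lexical overlap. One caveat on its in-line comment: the overlap bonus lies in [0, 1], so the combined score can reach 2.0 rather than staying within [-1, 1]. Below is a minimal standalone sketch of the same logic; the tiny stop-word set is inlined here as a stand-in for the NLTK list that app.py caches in `st.session_state`:

```python
# Standalone sketch of combined_similarity; the stop-word set is a toy
# stand-in for NLTK's English list cached in st.session_state by app.py.
stop_words = {"what", "is", "a", "the", "of"}

def combined_similarity(similarity, sentence, query):
    sentence_words = {w for w in sentence.split() if w.lower() not in stop_words}
    query_words = {w for w in query.split() if w.lower() not in stop_words}
    common_words = len(sentence_words & query_words)
    # Bonus is normalized by query length, so it contributes at most 1.0.
    return similarity + common_words / max(len(query_words), 1)

# Query tokens after stop-word removal: {"seat", "consciousness?"}.
# Only "seat" matches, so a 0.62 cosine score becomes 0.62 + 1/2 = 1.12.
print(combined_similarity(0.62, "The seat of consciousness is the brain",
                          "What is the seat of consciousness?"))
```

Because tokens keep their case and punctuation, `"consciousness?"` in the query never matches `"consciousness"` in the sentence; lower-casing and stripping punctuation before comparing would make the bonus less brittle.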
```diff
@@ -39,7 +53,7 @@ uploaded_json_file = st.file_uploader("Upload a pre-processed file",
 st.markdown(
     f'<a href="https://ikmtechnology.github.io/ikmtechnology/untethered_extracted_paragraphs.json" target="_blank">Sample 1 download and then upload to above</a>',
     unsafe_allow_html=True)
-
+st.markdown("sample queries for above file: <br/> What is death? What is a lucid dream? What is the seat of consciousness?", unsafe_allow_html=True)
 if uploaded_json_file is not None:
     if is_new_file_upload(uploaded_json_file):
         print("is new file uploaded")
```
```diff
@@ -70,6 +84,8 @@ if 'is_initialized' not in st.session_state:
     st.session_state['is_initialized'] = True
 
     nltk.download('punkt')
+    nltk.download('stopwords')
+    st.session_state.stop_words = set(stopwords.words('english'))
     st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
     st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
 
```
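The downloads and model loads above run only once because app.py guards them with an `is_initialized` flag; Streamlit re-executes the whole script on every interaction. A sketch of equivalent setup using `st.cache_resource` — a hypothetical refactor, not part of this commit, and it assumes a CUDA device just as the commit's `.to('cuda')` call does:

```python
import nltk
import streamlit as st
from nltk.corpus import stopwords
from transformers import BertModel, BertTokenizer

@st.cache_resource  # runs once per process instead of once per session
def load_resources():
    nltk.download('punkt')
    nltk.download('stopwords')
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased").to('cuda')  # assumes a GPU, like app.py
    return tokenizer, model, set(stopwords.words('english'))

bert_tokenizer, bert_model, stop_words = load_resources()
```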
```diff
@@ -83,7 +99,7 @@ if 'list_count' in st.session_state:
         #print(paragraph)
 
         progress_percentage = (index) / (st.session_state.list_count - 1)
-        print(progress_percentage)
+        # print(progress_percentage)
         read_progress_bar.progress(progress_percentage)
 
         sentence_encodings = []
```
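One edge case in the progress computation kept above: if the uploaded file contains a single paragraph, `list_count - 1` is zero and the division raises `ZeroDivisionError`. A guarded variant (hypothetical, not in the commit):

```python
# Hypothetical guard for the progress fraction; st.progress expects 0.0-1.0.
def progress_fraction(index, list_count):
    return index / (list_count - 1) if list_count > 1 else 1.0
```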
```diff
@@ -101,3 +117,53 @@ if 'list_count' in st.session_state:
             sentence_encodings.append([sentence, sentence_encoding])
             # sentence_encodings.append([sentence,bert_model(**sentence_tokens).last_hidden_state[:, 0, :].detach().numpy()])
         st.session_state.paragraph_sentence_encodings.append([paragraph, sentence_encodings])
+    st.rerun()
+if 'paragraph_sentence_encodings' in st.session_state:
+    query = st.text_input("Enter your query")
+    if query:
+        query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to('cuda')
+        with torch.no_grad():  # Disable gradient calculation for inference
+            # Perform the forward pass on the GPU
+            query_encoding = st.session_state.bert_model(**query_tokens).last_hidden_state[:, 0,
+                             :].cpu().numpy()  # Move the result to CPU and convert to NumPy
+        paragraph_scores = []
+        sentence_scores = []
+        sentence_encoding = []
+        for paragraph_sentence_encoding in st.session_state.paragraph_sentence_encodings:
+
+            best_similarity = -1
+            sentence_similarities = []
+            for sentence_encoding in paragraph_sentence_encoding[1]:
+                if sentence_encoding:
+                    similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
+                    # adjusted_similarity = similarity*len(sentence_encoding[0].split())**0.5
+                    combined_score = combined_similarity(similarity, sentence_encoding[0], query)
+
+                    # print("sentence="+sentence_encoding[0] + " len="+str())
+
+                    sentence_similarities.append(combined_score)
+                    sentence_scores.append((combined_score, sentence_encoding[0]))
+                    # best_similarity = max(best_similarity, similarity)
+            sentence_similarities.sort(reverse=True)
+
+            # Calculate the average of the top three sentence similarities
+            if len(sentence_similarities) >= 3:
+                top_three_avg_similarity = np.mean(sentence_similarities[:3])
+            elif sentence_similarities:
+                top_three_avg_similarity = np.mean(sentence_similarities)
+            else:
+                top_three_avg_similarity = 0
+            paragraph_scores.append((top_three_avg_similarity, paragraph_sentence_encoding[0]))
+        sentence_scores = sorted(sentence_scores, key=lambda x: x[0], reverse=True)
+        # Display the scores and sentences
+        # print("Top scored sentences and their scores:")
+        # for score, sentence in sentence_scores:  # Print top 10 for demonstration
+        #     print(f"Score: {score:.4f}, Sentence: {sentence}")
+        # Sort the paragraphs by their best similarity score
+        paragraph_scores = sorted(paragraph_scores, key=lambda x: x[0], reverse=True)
+
+        # Debug prints to understand the scores and paragraphs
+        st.write("Top scored paragraphs and their scores:")
+        for score, paragraph in paragraph_scores[:5]:  # Print top 5 for debugging
+
+            st.write(f"Score: {score}, Paragraph: {paragraph['text']}")
```
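The query path mirrors the indexing path: the query's CLS vector is compared against every cached sentence vector, each sentence score is boosted by `combined_similarity`, and a paragraph is ranked by the mean of its top three sentence scores. (Two small readings of the new code: `best_similarity` is set to -1 but never updated, since its `max()` line is commented out, and `sentence_encoding = []` is immediately shadowed by the inner loop variable.) A toy sketch with hypothetical scores of why the top-three average is used:

```python
import numpy as np

# Hypothetical combined scores for the sentences of two paragraphs.
paragraph_a = [1.35, 1.10, 0.80, 0.20]  # several moderately relevant sentences
paragraph_b = [1.60, 0.30, 0.25, 0.10]  # one strong hit, weak support

def top_three_avg(scores):
    ranked = sorted(scores, reverse=True)
    return float(np.mean(ranked[:3])) if ranked else 0.0

print(top_three_avg(paragraph_a))  # ~1.083: wins despite a lower single best score
print(top_three_avg(paragraph_b))  # ~0.717
```

Averaging the top three rewards paragraphs with sustained relevance over one-hit matches, which suits paragraph-level retrieval.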