DevBM committed on
Commit
27b0b20
·
verified ·
1 Parent(s): b48ea22

using spacy for option generation

Browse files
Files changed (1) hide show
  1. app.py +34 -25
app.py CHANGED
@@ -24,9 +24,6 @@ nlp = spacy.load("en_core_web_sm")
24
  user_agent = 'QGen/1.0 ([email protected])'
25
  wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
26
 
27
- # Load pre-trained word vectors (this may take a while)
28
- word_vectors = KeyedVectors.load_word2vec_format('vectors/GoogleNews-vectors-negative300.bin', binary=True)
29
-
30
  def load_model():
31
  model_name = "DevBM/t5-large-squad"
32
  model = T5ForConditionalGeneration.from_pretrained(model_name)
@@ -63,29 +60,23 @@ def extract_keywords(text):
63
 
64
  return list(combined_keywords)
65
 
66
- # Function to map keywords to sentences with customizable context window size
67
- def map_keywords_to_sentences(text, keywords, context_window_size):
68
- sentences = sent_tokenize(text)
69
- keyword_sentence_mapping = {}
70
- for keyword in keywords:
71
- for i, sentence in enumerate(sentences):
72
- if keyword in sentence:
73
- # Combine current sentence with surrounding sentences for context
74
- start = max(0, i - context_window_size)
75
- end = min(len(sentences), i + context_window_size + 1)
76
- context = ' '.join(sentences[start:end])
77
- if keyword not in keyword_sentence_mapping:
78
- keyword_sentence_mapping[keyword] = context
79
- else:
80
- keyword_sentence_mapping[keyword] += ' ' + context
81
- return keyword_sentence_mapping
82
 
83
  def get_similar_words(word, n=3):
84
- try:
85
- similar_words = word_vectors.most_similar(word, topn=n)
86
- return [word for word, _ in similar_words]
87
- except KeyError:
88
- return []
 
 
 
 
 
 
 
 
89
 
90
  def get_synonyms(word, n=3):
91
  synonyms = []
@@ -100,7 +91,7 @@ def get_synonyms(word, n=3):
100
  def generate_options(answer, context, n=3):
101
  options = [answer]
102
 
103
- # Try to get similar words based on word embeddings
104
  similar_words = get_similar_words(answer, n)
105
  options.extend(similar_words)
106
 
@@ -128,6 +119,24 @@ def generate_options(answer, context, n=3):
128
 
129
  return options
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  # Function to perform entity linking using Wikipedia API
132
  @lru_cache(maxsize=128)
133
  def entity_linking(keyword):
 
24
  user_agent = 'QGen/1.0 ([email protected])'
25
  wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
26
 
 
 
 
27
  def load_model():
28
  model_name = "DevBM/t5-large-squad"
29
  model = T5ForConditionalGeneration.from_pretrained(model_name)
 
60
 
61
  return list(combined_keywords)
62
 
63
+ # Load spaCy model (medium-sized model with word vectors)
64
+ nlp = spacy.load("en_core_web_md")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  def get_similar_words(word, n=3):
67
+ # Get the vector for the word
68
+ word_vector = nlp(word).vector
69
+
70
+ # Find similar words
71
+ similar_words = []
72
+ for w in nlp.vocab:
73
+ if w.has_vector and w.is_lower and w.is_alpha and w.text != word:
74
+ similarity = nlp(w.text).similarity(nlp(word))
75
+ similar_words.append((w.text, similarity))
76
+
77
+ # Sort by similarity and return top n
78
+ similar_words.sort(key=lambda x: x[1], reverse=True)
79
+ return [word for word, _ in similar_words[:n]]
80
 
81
  def get_synonyms(word, n=3):
82
  synonyms = []
 
91
  def generate_options(answer, context, n=3):
92
  options = [answer]
93
 
94
+ # Try to get similar words based on word vectors
95
  similar_words = get_similar_words(answer, n)
96
  options.extend(similar_words)
97
 
 
119
 
120
  return options
121
 
122
+ # Function to map keywords to sentences with customizable context window size
123
def map_keywords_to_sentences(text, keywords, context_window_size):
    """Map each keyword to the text surrounding the sentences that contain it.

    The text is split with ``sent_tokenize``. For every sentence in which a
    keyword occurs as a substring, a window of up to ``context_window_size``
    sentences before and after it is joined with single spaces. When a keyword
    matches several sentences, the windows are concatenated in order of
    appearance, separated by a space.

    Args:
        text: Source text to split into sentences.
        keywords: Iterable of keyword strings to look for.
        context_window_size: Number of neighbouring sentences to include on
            each side of a matching sentence.

    Returns:
        A dict from keyword to its combined context string. Keywords that
        never occur in any sentence are left out.
    """
    sentences = sent_tokenize(text)
    sentence_count = len(sentences)

    # Gather the context windows per keyword first and join them at the end.
    windows_by_keyword = {}
    for keyword in keywords:
        for position, sentence in enumerate(sentences):
            if keyword not in sentence:
                continue
            # Clamp the window to the bounds of the sentence list.
            first = max(0, position - context_window_size)
            last = min(sentence_count, position + context_window_size + 1)
            window = ' '.join(sentences[first:last])
            windows_by_keyword.setdefault(keyword, []).append(window)

    return {
        keyword: ' '.join(windows)
        for keyword, windows in windows_by_keyword.items()
    }
138
+
139
+
140
  # Function to perform entity linking using Wikipedia API
141
  @lru_cache(maxsize=128)
142
  def entity_linking(keyword):