awacke1 committed on
Commit 1b1e489 · 1 Parent(s): fec4005

Create app.py

Files changed (1)
app.py +232 -0
app.py ADDED
@@ -0,0 +1,232 @@
import streamlit as st
import spacy
import wikipediaapi
import wikipedia
from wikipedia.exceptions import DisambiguationError
from transformers import TFAutoModel, AutoTokenizer
import numpy as np
import pandas as pd
import faiss
import datetime
import time

# Load the spaCy English model, downloading it on first run if it is missing.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

wh_words = ['what', 'who', 'how', 'when', 'which']

def get_concepts(text):
    """Extract noun-chunk concepts from a question, skipping bare wh-words."""
    text = text.lower()
    doc = nlp(text)
    concepts = []
    for chunk in doc.noun_chunks:
        if chunk.text not in wh_words:
            concepts.append(chunk.text)
    return concepts
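
# Illustrative usage (not part of the original file); the exact chunks depend
# on the spaCy model, but a question like this roughly yields:
#   get_concepts("What is the capital of France?")  # -> ['the capital', 'france']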

def get_passages(text, k=100):
    """Greedily pack sentences into passages of at least k tokens each."""
    doc = nlp(text)
    passages = []
    passage = ""
    passage_len = 0
    for sen in doc.sents:
        passage_len += len(sen)
        if passage_len >= k:
            if passage.strip():
                passages.append(passage)
            passage = sen.text
            passage_len = len(sen)
        else:
            passage += " " + sen.text
    # Flush the trailing partial passage so the final sentences are not dropped.
    if passage.strip():
        passages.append(passage)
    return passages
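
# Illustrative usage (not part of the original file): with k=5, three short
# sentences of ~4 tokens each come back as three separate passages, since
# each sentence alone pushes the running token count past k.
#   get_passages("I like tea. Tea is warm. Cups hold tea.", k=5)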

def get_dicts_for_dpr(concepts, n_results=20, k=100):
    """Search Wikipedia for each concept and chunk each page into passage dicts."""
    dicts = []
    for concept in concepts:
        wikis = wikipedia.search(concept, results=n_results)
        st.write(f"{concept} No. of wikis: {len(wikis)}")
        for wiki in wikis:
            try:
                html_page = wikipedia.page(title=wiki, auto_suggest=False)
            except DisambiguationError:
                continue
            passages = get_passages(html_page.content, k=k)
            for passage in passages:
                dicts.append({'text': passage, 'title': wiki})
    return dicts

# Small DPR-style context/question encoders (TensorFlow) and their tokenizers.
passage_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2")
query_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2")
p_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2")
q_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2")

def get_title_text_combined(passage_dicts):
    """Pair each passage's title with its text, the input format the context tokenizer expects."""
    return [(p['title'], p['text']) for p in passage_dicts]

def extracted_passage_embeddings(processed_passages, max_length=156):
    """Encode (title, text) pairs into dense passage embeddings."""
    passage_inputs = p_tokenizer.batch_encode_plus(
        processed_passages,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_token_type_ids=True
    )
    passage_embeddings = passage_encoder.predict(
        [np.array(passage_inputs['input_ids']),
         np.array(passage_inputs['attention_mask']),
         np.array(passage_inputs['token_type_ids'])],
        batch_size=64,
        verbose=1
    )
    return passage_embeddings

def extracted_query_embeddings(queries, max_length=64):
    """Encode question strings into dense query embeddings."""
    query_inputs = q_tokenizer.batch_encode_plus(
        queries,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_token_type_ids=True
    )
    query_embeddings = query_encoder.predict(
        [np.array(query_inputs['input_ids']),
         np.array(query_inputs['attention_mask']),
         np.array(query_inputs['token_type_ids'])],
        batch_size=1,
        verbose=1
    )
    return query_embeddings
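
# Illustrative sketch (not part of the original file): faiss is imported but
# never used, and the DPR helpers above are never called from main(). Assuming
# the intent was dense passage retrieval, they could be wired together roughly
# like this. The name `dpr_search` is hypothetical, and `.pooler_output`
# follows the usual recipe for these encoders.
def dpr_search(question, passage_dicts, top_k=5):
    processed = get_title_text_combined(passage_dicts)
    passage_emb = extracted_passage_embeddings(processed)
    query_emb = extracted_query_embeddings([question])
    index = faiss.IndexFlatL2(128)  # H-128: the encoders' hidden size
    index.add(np.ascontiguousarray(passage_emb.pooler_output, dtype="float32"))
    _, ids = index.search(
        np.ascontiguousarray(query_emb.pooler_output, dtype="float32"), top_k)
    return [passage_dicts[i] for i in ids[0]]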

def get_pagetext(page):
    """Return the page text with tab characters removed."""
    return str(page).replace("\t", "")

def get_wiki_summary(search):
    """Fetch a Wikipedia page and return its summary."""
    wiki_wiki = wikipediaapi.Wikipedia('en')
    page = wiki_wiki.page(search)
    return page.summary

def get_wiki_summaryDF(search):
    """Look up a Wikipedia page and return its key fields as a DataFrame."""
    wiki_wiki = wikipediaapi.Wikipedia('en')
    page = wiki_wiki.page(search)

    if not page.exists():
        return pd.DataFrame({'Entity': ["Title"], 'Value': ["Not found"]})

    pageurl = page.fullurl
    pagetitle = page.title
    pagesummary = page.summary[0:60]  # first 60 characters of the summary
    pagetext = get_pagetext(page.text)

    # Join the page's backlinks, outgoing links, and categories into
    # comma-separated strings.
    linklist = " , ".join(page.backlinks.keys())
    linklist2 = " , ".join(page.links.keys())
    categorylist = " , ".join(page.categories.keys())

    ex_dic = {
        'Entity': ["URL", "Title", "Summary", "Text", "Backlinks", "Links", "Categories"],
        'Value': [pageurl, pagetitle, pagesummary, pagetext, linklist, linklist2, categorylist]
    }
    return pd.DataFrame(ex_dic)
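
# Illustrative usage (not part of the original file):
#   df = get_wiki_summaryDF("Python (programming language)")
#   st.dataframe(df)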

def save_message(name, message):
    """Append a timestamped line to the shared chat.txt log."""
    now = datetime.datetime.now()
    timestamp = now.strftime("%Y-%m-%d %H:%M:%S")
    with open("chat.txt", "a") as f:
        f.write(f"{timestamp} - {name}: {message}\n")

def press_release():
    st.markdown("""🎉🎊 Breaking News! 📢📣
Introducing StreamlitWikipediaChat - the ultimate way to chat with Wikipedia and the whole world at the same time! 🌎📚👋
Are you tired of reading boring articles on Wikipedia? Do you want to have some fun while learning new things? Then StreamlitWikipediaChat is just the thing for you! 😃💻
With StreamlitWikipediaChat, you can ask Wikipedia anything you want and get instant responses! Whether you want to know the capital of Madagascar or how to make a delicious chocolate cake, Wikipedia has got you covered. 🍰🌍
But that's not all! You can also chat with other people from around the world who are using StreamlitWikipediaChat at the same time. It's like a virtual classroom where you can learn from and teach others. 🌐👨‍🏫👩‍🏫
And the best part? StreamlitWikipediaChat is super easy to use! All you have to do is type in your question and hit send. That's it! 🤯🙌
So, what are you waiting for? Join the fun and start chatting with Wikipedia and the world today! 😎🎉
StreamlitWikipediaChat - where learning meets fun! 🤓🎈""")

def main():
    st.title("Streamlit Chat")

    name = st.text_input("Enter your name")
    message = st.text_input("Enter a topic to share from Wikipedia")
    if st.button("Submit"):
        # Look the topic up on Wikipedia, then log both the query and the result.
        df = get_wiki_summaryDF(message)
        save_message(name, message)
        save_message(name, df)
        st.text("Message sent!")

    # Open in "a+" mode so chat.txt is created on first run if it doesn't exist.
    st.text("Chat history:")
    with open("chat.txt", "a+") as f:
        f.seek(0)
        chat_history = f.read()
    st.markdown(chat_history)

    # Redisplay the chat history once a minute, with a visible countdown.
    countdown = st.empty()
    t = 60
    while t:
        mins, secs = divmod(t, 60)
        countdown.text(f"Time remaining: {mins:02d}:{secs:02d}")
        time.sleep(1)
        t -= 1
        if t == 0:
            countdown.text("Time's up!")
            with open("chat.txt", "a+") as f:
                f.seek(0)
                chat_history = f.read()
            st.markdown(chat_history)
            press_release()
            t = 60  # restart the countdown so the history keeps refreshing

if __name__ == "__main__":
    main()