mpinzon awacke1 committed on
Commit
1c86491
•
0 Parent(s):

Duplicate from awacke1/StreamlitWikipediaChat

Browse files

Co-authored-by: Aaron C Wacker <[email protected]>

Files changed (4) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +239 -0
  4. requirements.txt +10 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: 🌎📚👋Streamlit-Wikipedia-Chat
3
+ emoji: 🌍👨‍🏫👩‍🏫
4
+ colorFrom: red
5
+ colorTo: pink
6
+ sdk: streamlit
7
+ sdk_version: 1.17.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ duplicated_from: awacke1/StreamlitWikipediaChat
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import spacy
3
+ import wikipediaapi
4
+ import wikipedia
5
+ from wikipedia.exceptions import DisambiguationError
6
+ from transformers import TFAutoModel, AutoTokenizer
7
+ import numpy as np
8
+ import pandas as pd
9
+ import faiss
10
+ import datetime
11
+ import time
12
+
13
+
14
# Load the small English spaCy pipeline, downloading it on first run when it
# is not installed yet.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found; any
    # other failure is a real error and is allowed to propagate.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Question words that are never treated as search concepts by get_concepts.
wh_words = ['what', 'who', 'how', 'when', 'which']
21
+
22
def get_concepts(text):
    """Return the noun chunks of *text* that can be used as search concepts.

    The text is lower-cased before it is parsed with the module-level spaCy
    pipeline ``nlp``.  A chunk is left out when its text is exactly one of
    the question words listed in ``wh_words``.

    Args:
        text: The user's question or statement.

    Returns:
        A list of noun-chunk strings in document order.
    """
    parsed = nlp(text.lower())
    return [
        phrase.text
        for phrase in parsed.noun_chunks
        if phrase.text not in wh_words
    ]
30
+
31
def get_passages(text, k=100):
    """Split *text* into passages of fewer than *k* tokens where possible.

    Sentences are detected with the module-level spaCy pipeline ``nlp`` and
    appended to the current passage until adding the next sentence would
    bring the passage's token count to *k* or more; that sentence then
    starts a new passage.

    Args:
        text: Raw text to split.
        k: Token budget per passage.  A single sentence of *k* or more
            tokens still forms a passage of its own.

    Returns:
        A list of non-empty passage strings in document order.  Sentences
        within a passage are joined by single spaces.
    """
    passages = []
    current = []      # sentence texts of the passage being built
    current_len = 0   # token count of ``current``
    for sent in nlp(text).sents:
        sent_len = len(sent)  # len() of a spaCy span is its token count
        # Only flush when there is something to flush, so an over-long first
        # sentence does not produce an empty passage.
        if current and current_len + sent_len >= k:
            passages.append(" ".join(current))
            current, current_len = [], 0
        current.append(sent.text)
        current_len += sent_len
    # Flush whatever is left so the final sentence(s) are never dropped.
    if current:
        passages.append(" ".join(current))
    return passages
53
+
54
def get_dicts_for_dpr(concepts, n_results=20, k=100):
    """Collect Wikipedia passages for every concept.

    For each concept, Wikipedia is searched for up to *n_results* titles,
    the number of hits is written to the Streamlit page, and every article
    that can be loaded is split into passages with ``get_passages``.

    Args:
        concepts: Iterable of search strings.
        n_results: Maximum number of search hits to fetch per concept.
        k: Passage size forwarded to ``get_passages``.

    Returns:
        A list of dicts with the keys ``'text'`` (the passage) and
        ``'title'`` (the search-hit title the passage came from).
    """
    dicts = []
    for concept in concepts:
        wikis = wikipedia.search(concept, results=n_results)
        st.write(f"{concept} No of Wikis: {len(wikis)}")
        for wiki in wikis:
            try:
                html_page = wikipedia.page(title=wiki, auto_suggest=False)
            except (DisambiguationError, wikipedia.exceptions.PageError):
                # Ambiguous or missing titles cannot be resolved to a single
                # article; skip them instead of aborting the whole crawl.
                continue
            dicts.extend(
                {'text': passage, 'title': wiki}
                for passage in get_passages(html_page.content, k=k)
            )
    return dicts
72
+
73
# Hugging Face checkpoint names shared by each encoder and its tokenizer.
_CTX_CHECKPOINT = "nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2"
_QUESTION_CHECKPOINT = "nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2"

# Models first, tokenizers second — the same load order as before.
passage_encoder = TFAutoModel.from_pretrained(_CTX_CHECKPOINT)
query_encoder = TFAutoModel.from_pretrained(_QUESTION_CHECKPOINT)
p_tokenizer = AutoTokenizer.from_pretrained(_CTX_CHECKPOINT)
q_tokenizer = AutoTokenizer.from_pretrained(_QUESTION_CHECKPOINT)
77
+
78
def get_title_text_combined(passage_dicts):
    """Return ``(title, text)`` tuples for the given passage dicts.

    Args:
        passage_dicts: Iterable of mappings that each have a ``'title'`` and
            a ``'text'`` key; any other keys are ignored.

    Returns:
        A list of ``(title, text)`` tuples in input order.

    Raises:
        KeyError: If an entry lacks ``'title'`` or ``'text'``.
    """
    return [(p['title'], p['text']) for p in passage_dicts]
83
+
84
def extracted_passage_embeddings(processed_passages, max_length=156):
    """Run the passage encoder over *processed_passages*.

    Every entry is tokenized with ``p_tokenizer``, truncated or padded to
    exactly *max_length* tokens, and fed to ``passage_encoder.predict`` in
    batches of 64.

    Args:
        processed_passages: Batch accepted by the tokenizer's
            ``batch_encode_plus`` (presumably the ``(title, text)`` pairs
            from ``get_title_text_combined`` — verify against the caller).
        max_length: Fixed token length of every encoded passage.

    Returns:
        Whatever ``passage_encoder.predict`` returns for the batch.
    """
    encoded = p_tokenizer.batch_encode_plus(
        processed_passages,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_token_type_ids=True,
    )
    # The model takes ids, mask and segment ids, in that order.
    model_inputs = [
        np.array(encoded[field])
        for field in ('input_ids', 'attention_mask', 'token_type_ids')
    ]
    return passage_encoder.predict(model_inputs, batch_size=64, verbose=1)
98
+
99
def extracted_query_embeddings(queries, max_length=64):
    """Run the question encoder over *queries*.

    Every query is tokenized with ``q_tokenizer``, truncated or padded to
    exactly *max_length* tokens, and fed to ``query_encoder.predict`` one
    query per batch.

    Args:
        queries: Batch of query strings accepted by the tokenizer's
            ``batch_encode_plus``.
        max_length: Fixed token length of every encoded query.

    Returns:
        Whatever ``query_encoder.predict`` returns for the batch.
    """
    encoded = q_tokenizer.batch_encode_plus(
        queries,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_token_type_ids=True,
    )
    # The model takes ids, mask and segment ids, in that order.
    model_inputs = [
        np.array(encoded[field])
        for field in ('input_ids', 'attention_mask', 'token_type_ids')
    ]
    return query_encoder.predict(model_inputs, batch_size=1, verbose=1)
115
+
116
def get_pagetext(page):
    """Return *page* as a string with all tab characters removed.

    Args:
        page: Any object; it is converted with ``str()`` first.

    Returns:
        The string form of *page* without tab characters.
    """
    # "\t" is the tab escape.  The former "/t" pattern removed the literal
    # two-character sequence instead, which mangled URLs and ordinary words.
    return str(page).replace("\t", "")
119
+
120
def get_wiki_summary(search):
    """Create an English-Wikipedia page handle for *search*.

    NOTE(review): the handle is discarded and nothing is returned, so callers
    always receive ``None`` — confirm whether a summary was meant to be
    returned here.
    """
    wikipediaapi.Wikipedia('en').page(search)
123
+
124
+
125
def get_wiki_summaryDF(search):
    """Look up a page on English Wikipedia and tabulate its key facts.

    Args:
        search: Page title to look up.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Entity`` and ``Value``
        holding the page URL, title, the first 60 characters of the summary,
        the page text (via ``get_pagetext``), backlinks, links and
        categories.  Each list-like value is a single string in which every
        title is followed by ``" , "``.

        When the page does not exist, the tuple
        ``(False, "Not found", "Not found", "Not found", "Not found")`` is
        returned instead; this shape is kept for backward compatibility.
    """
    # NOTE(review): recent wikipedia-api releases are reported to require a
    # user-agent argument — confirm against the installed version.
    wiki_wiki = wikipediaapi.Wikipedia('en')
    page = wiki_wiki.page(search)

    isExist = page.exists()
    if not isExist:
        return isExist, "Not found", "Not found", "Not found", "Not found"

    def _joined(titles):
        # Every title keeps its trailing " , " so the output is unchanged.
        return "".join(f"{title} , " for title in titles)

    pageurl = page.fullurl
    pagetitle = page.title
    pagesummary = page.summary[0:60]
    pagetext = get_pagetext(page.text)

    # backlinks / categories / links are mappings keyed by title; iterating
    # them yields those titles.
    linklist = _joined(page.backlinks)
    categorylist = _joined(page.categories)
    linklist2 = _joined(page.links)

    ex_dic = {
        'Entity': ["URL", "Title", "Summary", "Text", "Backlinks", "Links", "Categories"],
        'Value': [pageurl, pagetitle, pagesummary, pagetext, linklist, linklist2, categorylist],
    }
    return pd.DataFrame(ex_dic)
169
+
170
+
171
def save_message(name, message):
    """Append one timestamped chat line to ``chat.txt``.

    The line has the form ``YYYY-MM-DD HH:MM:SS - <name>: <message>`` and is
    written to ``chat.txt`` in the current working directory, which is
    created if it does not exist.

    Args:
        name: Display name of the sender.
        message: Message body; any object is formatted with ``str()``.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Explicit UTF-8 so Wikipedia text and emoji never hit an encoding error
    # on platforms whose default encoding is not UTF-8.
    with open("chat.txt", "a", encoding="utf-8") as f:
        f.write(f"{timestamp} - {name}: {message}\n")
176
+
177
def press_release():
    """Render the StreamlitWikipediaChat announcement as Markdown.

    The text is written to the current Streamlit page with ``st.markdown``;
    nothing is returned.
    """
    # The emoji were stored as mis-decoded byte sequences and rendered as
    # garbage; they are restored to the intended characters here.
    st.markdown("""🎉🎊 Breaking News! 📢📣

Introducing StreamlitWikipediaChat - the ultimate way to chat with Wikipedia and the whole world at the same time! 🌎📚👋

Are you tired of reading boring articles on Wikipedia? Do you want to have some fun while learning new things? Then StreamlitWikipediaChat is just the thing for you! 😃💻

With StreamlitWikipediaChat, you can ask Wikipedia anything you want and get instant responses! Whether you want to know the capital of Madagascar or how to make a delicious chocolate cake, Wikipedia has got you covered. 🍰🌍

But that's not all! You can also chat with other people from around the world who are using StreamlitWikipediaChat at the same time. It's like a virtual classroom where you can learn from and teach others. 🌍👨‍🏫👩‍🏫

And the best part? StreamlitWikipediaChat is super easy to use! All you have to do is type in your question and hit send. That's it! 🤯🙌

So, what are you waiting for? Join the fun and start chatting with Wikipedia and the world today! 😎🎉

StreamlitWikipediaChat - where learning meets fun! 🤓🎈""")
193
+
194
+
195
def _read_chat_history():
    """Return the full contents of ``chat.txt`` (empty if it is new)."""
    # "a+" creates the file on first use; append mode starts at the end of
    # the file, so rewind before reading.
    with open("chat.txt", "a+", encoding="utf-8") as f:
        f.seek(0)
        return f.read()


def main():
    """Run the Streamlit chat page.

    The page asks for a name and a Wikipedia topic.  On "Submit" the topic
    and the lookup result from ``get_wiki_summaryDF`` are appended to the
    chat log.  The chat history is then shown, followed by a 60-second
    countdown; each time it expires the history and the press release are
    rendered again and the countdown restarts.
    """
    st.title("Streamlit Chat")

    name = st.text_input("Enter your name")
    message = st.text_input("Enter a topic to share from Wikipedia")
    if st.button("Submit"):
        # Look the topic up first, then log the topic and the lookup result.
        df = get_wiki_summaryDF(message)

        save_message(name, message)
        save_message(name, df)

        st.text("Message sent!")

    st.text("Chat history:")
    st.markdown(_read_chat_history())

    countdown = st.empty()
    t = 60
    while t:
        mins, secs = divmod(t, 60)
        countdown.text(f"Time remaining: {mins:02d}:{secs:02d}")
        time.sleep(1)
        t -= 1
        if t == 0:
            countdown.text("Time's up!")
            st.markdown(_read_chat_history())

            press_release()

            # NOTE(review): restarting here keeps the page refreshing every
            # minute for as long as the script runs — confirm this loop is
            # meant to be endless.
            t = 60


if __name__ == "__main__":
    main()
239
+
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ wikipedia
2
+ spacy
3
+ faiss-cpu
4
+ pandas
5
+ transformers
6
+ tensorflow
7
+ wikipedia-api
8
+ beautifulsoup4
9
+ streamlit
10
+ requests