trysem datasciencedojo committed on
Commit
db31b27
·
0 Parent(s):

Duplicate from datasciencedojo/Wikipedia-Article-Scrape

Browse files

Co-authored-by: Data Science Dojo <[email protected]>

Files changed (5) hide show
  1. .gitattributes +31 -0
  2. HelveticaWorld-Regular.ttf +0 -0
  3. README.md +13 -0
  4. app.py +89 -0
  5. requirements.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zst filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
HelveticaWorld-Regular.ttf ADDED
Binary file (657 kB). View file
 
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Wikipedia Article Scrape
3
+ emoji: 🦀
4
+ colorFrom: red
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 3.4.1
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: datasciencedojo/Wikipedia-Article-Scrape
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import wikipedia
3
+ import numpy as np
4
+ import pandas as pd
5
+ from os import path
6
+ from PIL import Image
7
+ from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
8
+ import matplotlib.pyplot as plt
9
+
10
def wikipediaScrap(article_name, wikipedia_language = "en - English"):
    """Fetch a Wikipedia article and render a word cloud of its content.

    Parameters
    ----------
    article_name : str
        Title of the Wikipedia article to look up.
    wikipedia_language : str
        Dropdown value of the form "<code> - <name>" (e.g. "en - English");
        only the language code before " - " is used.

    Returns
    -------
    tuple
        (title, content, page_url, linked_pages_text, plt) — the last element
        is the matplotlib pyplot module carrying the rendered word-cloud
        figure, which gr.Plot accepts.

    Raises
    ------
    wikipedia.exceptions.PageError / DisambiguationError
        Propagated unchanged when the title is missing or ambiguous.
    """
    # Keep only the ISO language code, e.g. "en" from "en - English".
    wikipedia_language = wikipedia_language.split(" - ")[0]

    if wikipedia_language:
        wikipedia.set_lang(wikipedia_language)

    et_page = wikipedia.page(article_name)
    title = et_page.title
    content = et_page.content
    page_url = et_page.url
    linked_pages = et_page.links

    # Create and generate a word cloud image from the article text.
    wordcloud = WordCloud(font_path="HelveticaWorld-Regular.ttf").generate(content)

    # Fix: start a fresh figure for every request. Drawing into pyplot's
    # implicit current figure made repeated calls accumulate open figures
    # (a leak in a long-running app) and overlay successive images.
    plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")

    return title, content, page_url, "\n".join(linked_pages), plt
33
+
34
+ css = """
35
+ footer {display:none !important}
36
+ .output-markdown{display:none !important}
37
+ footer {visibility: hidden}
38
+ #dsd_button {background: purple, color: white}
39
+
40
+ textarea[data-testid="textbox"] { height: 178px !important}
41
+
42
+ #mytext {height: 43px !important;}
43
+
44
+ .max-h-[30rem] {max-height: 18rem !important;}
45
+
46
+ .hover\:bg-orange-50:hover {
47
+ --tw-bg-opacity: 1 !important;
48
+ background-color: rgb(229,225,255) !important;
49
+ }
50
+ """
51
+
52
# Mapping of Wikipedia language codes to native language names,
# e.g. {"en": "English", "ur": "اردو", ...}.
ini_dict = wikipedia.languages()

# Build the key/value lists and the "code - name" dropdown choices with
# comprehensions instead of the original loop that abused a tuple
# expression ("keys.append(...), values.append(...)") for two statements.
# Module-level names (keys, values, language) are preserved.
keys = list(ini_dict)
values = list(ini_dict.values())
language = [f"{code} - {name}" for code, name in ini_dict.items()]
63
+
64
# Gradio UI: an input row (article name + language picker), a scrape button,
# and result panels for article metadata, word cloud, content and links.
with gr.Blocks(title="Wikipedia Article Scrape | Data Science Dojo", css = css) as demo:
    with gr.Row():
        article_box = gr.Textbox(placeholder="Enter the name of wikipedia article", label="Wikipedia article name")
        language_dd = gr.Dropdown(label=" Select Language", choices=language, value=language[105], interactive=True)

    scrape_btn = gr.Button("Start Scraping", elem_id="dsd_button")

    with gr.Row():
        with gr.Column():
            gr.Markdown("""## About""")
            title_box = gr.Textbox(label="Article title")
            url_box = gr.Textbox(label="Article URL")
        with gr.Column():
            gr.Markdown("""## Wordcloud""")
            cloud_plot = gr.Plot()

    gr.Markdown("""### Content""")
    with gr.Row():
        content_box = gr.Textbox(label="Content")

    gr.Markdown("""### Linked Articles""")
    with gr.Row():
        linked_box = gr.Textbox(label="Linked Articles")

    # The cached examples and the button share the same wiring.
    scrape_inputs = [article_box, language_dd]
    scrape_outputs = [title_box, content_box, url_box, linked_box, cloud_plot]

    with gr.Row():
        gr.Examples(
            examples = [["Eiffel Tower", "en - English"], ["Eiffel tower", 'ur - اردو']], fn=wikipediaScrap, inputs=scrape_inputs, outputs=scrape_outputs, cache_examples=True)
    scrape_btn.click(fn=wikipediaScrap, inputs=scrape_inputs, outputs=scrape_outputs)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ wikipedia
3
+ wordcloud