shamim237 committed on
Commit
f11b302
·
1 Parent(s): 00d9a12

initial_commit

Browse files
Files changed (8) hide show
  1. README.md +36 -12
  2. app.py +17 -0
  3. multiapp.py +19 -0
  4. packages.txt +1 -0
  5. paraphraser.py +28 -0
  6. requirements.txt +9 -0
  7. scrap.py +24 -0
  8. summary.py +24 -0
README.md CHANGED
@@ -1,12 +1,36 @@
1
- ---
2
- title: Python Dev Task Skyranko
3
- emoji: 👁
4
- colorFrom: purple
5
- colorTo: pink
6
- sdk: streamlit
7
- sdk_version: 1.17.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python-dev-task-summarization
2
+
3
+ The task has been done in two methods-
4
+ - **using traditional Python libraries (like NLTK, Sumy)**
5
+ - **using pre-trained transformers model**
6
+
7
+ # Method-1
8
+ ## using traditional Python libraries
9
+ #### Web Scraping Tools:
10
+ - Selenium
11
+ #### Paraphrasing Tools:
12
+ - used [nlpaug](https://github.com/makcedward/nlpaug) library
13
+ #### Summarization Tools:
14
+ - used [sumy](https://miso-belica.github.io/sumy/) library
15
+ #### System Requirements:
16
+ - you will find it in the _requirements.txt_ file
17
+
18
+ ## How to test or run this?
19
+ - just open this link and follow the instructions: _**https://shamim237-python-dev-task-app-3n18pu.streamlit.app/**_
20
+
21
+ # Method-2
22
+ ## Using pre-trained transformers model
23
+ #### Web Scraping Tools:
24
+ - ScraperAPI
25
+ - BeautifulSoup
26
+ #### Paraphrasing Tools:
27
+ - used **"ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"** pre-trained model from HuggingFace
28
+ #### Summarization Tools:
29
+ - used **"google/pegasus-cnn_dailymail"** pre-trained model from HuggingFace
30
+ #### System Requirements:
31
+ - you will find it in the _Python_Dev_Task.ipynb_ notebook or in the below link.
32
+
33
+ ## How to test or run this?
34
+ - Just open the **"Python_Dev_Task.ipynb"** file in Colab _or_ open this link: **_https://colab.research.google.com/drive/1wwaj0TobsnzQL5jMVsYrF5z6rc1944tE?usp=sharing_**
35
+ - Run all the cells
36
+ - The summarization output will show up in the last cell of the notebook.
app.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit entry point: wires the scraper, paraphraser and summarizer pages together."""
import streamlit as st
from multiapp import MultiApp
from apps import paraphraseApp, summarizerApp, scraperrApp

# Page header / intro copy.
st.title("Python Dev Task @SkyRanko")
st.write("==================_Completed by_ **Shamim Mahbub**==================")
st.markdown("This app provides three services - :red[Scraping], :orange[Paraphrasing] and :blue[Summarizing]")
st.caption("Note: _After scraping data from Amazon, the data has been paraphrased using a model and then Summarization has been performed on the paraphrased data._")

# Register each sub-app (order determines the selectbox order), then run.
multi_app = MultiApp()
for label, page in (
    ("Scraper", scraperrApp.app),
    ("Paraphraser", paraphraseApp.app),
    ("Summarizer", summarizerApp.app),
):
    multi_app.add_app(label, page)

multi_app.run()
multiapp.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
class MultiApp:
    """Tiny router that lets one Streamlit script host several sub-apps."""

    def __init__(self):
        # Each entry is {"title": <label>, "function": <zero-arg page renderer>}.
        self.apps = []

    def add_app(self, title, func):
        """Register a sub-app shown as *title*; *func* renders the page when chosen."""
        self.apps.append({"title": title, "function": func})

    def run(self):
        """Offer a selectbox of registered titles and render the chosen page."""
        chosen = st.selectbox(
            'Choose one',
            self.apps,
            format_func=lambda entry: entry['title'],
        )
        chosen['function']()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ firefox-esr
paraphraser.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import streamlit as st
3
+ import nlpaug.augmenter.word as naw
4
+
5
import os
# NOTE(review): presumably set to silence the HF tokenizers fork/parallelism
# warning under Streamlit reruns — must be set before tokenizers is loaded.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
7
+
8
@st.cache(allow_output_mutation=True, ttl=48*3600)
def load_model():
    """Build (and cache for 48 h) the BERT-based word-insertion augmenter."""
    return naw.ContextualWordEmbsAug(
        model_path='bert-base-uncased',
        action="insert",
    )

aug = load_model()
15
+
16
def parphrase(passage):
    """Clean the scraped lines in *passage* and paraphrase them with the augmenter.

    Lines containing exactly two words are dropped (they look like section
    labels rather than sentences); the rest have double quotes converted to
    single quotes and newlines stripped before being joined and augmented.
    """
    cleaned = []
    for line in passage:
        word_count = len(re.findall(r'\w+', line))
        if word_count != 2:
            cleaned.append(line.replace('"', "'").replace("\n", ""))
    return aug.augment(" ".join(cleaned))
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ nlpaug==1.1.11
2
+ nltk
3
+ selenium==4.8.0
4
+ sentencepiece==0.1.97
5
+ streamlit==1.17.0
6
+ sumy==0.11.0
7
+ torch==1.13.1
8
+ transformers==4.25.1
9
+ webdriver-manager
scrap.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from selenium import webdriver
3
+ from selenium.webdriver import Chrome
4
+ from selenium.webdriver.common.by import By
5
+ from selenium.webdriver.firefox.options import Options
6
+ from selenium.webdriver.firefox.service import Service
7
+ from webdriver_manager.firefox import GeckoDriverManager
8
+ from selenium.webdriver.common.by import By
9
+
10
def extract(link):
    """Scrape the text of the "aplus_feature_div" element from *link*.

    Launches a headless Firefox (geckodriver fetched via webdriver-manager),
    loads the page, and returns the element's text split into a list of lines.

    Raises selenium.common.exceptions.NoSuchElementException when the element
    is absent from the page.
    """
    firefox_options = Options()
    firefox_options.add_argument("--headless")
    service = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(
        options=firefox_options,
        service=service,
    )
    try:
        driver.get(link)
        # Grab the product-description section and split it into lines.
        lines = driver.find_element(By.ID, "aplus_feature_div").text.split("\n")
    finally:
        # Fix: the original never quit the driver, leaking a Firefox process
        # per call. Always release the browser, even when scraping fails.
        # (The original's trailing time.sleep(2) after capture served no
        # purpose and is removed.)
        driver.quit()
    return lines
summary.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import streamlit as st
3
+ from sumy.nlp.tokenizers import Tokenizer
4
+ from sumy.parsers.plaintext import PlaintextParser
5
+ from sumy.summarizers.lex_rank import LexRankSummarizer
6
+
7
@st.cache(allow_output_mutation=True, ttl=48*3600)
def dwnld_lib():
    # Fetch the NLTK 'punkt' sentence-tokenizer data required by sumy's
    # Tokenizer; st.cache keeps it from re-downloading on every Streamlit rerun.
    nltk.download('punkt')

dwnld_lib()
12
+
13
def text_summary(text):
    """Summarize *text* (a list of strings) with LexRank.

    The pieces are joined into one passage, parsed as plain English text,
    and reduced to at most 4 sentences; each selected sentence is
    capitalized and returned in a list.
    """
    passage = " ".join(text)
    parser = PlaintextParser.from_string(passage, Tokenizer("english"))
    lex_rank = LexRankSummarizer()
    return [str(sentence).capitalize() for sentence in lex_rank(parser.document, 4)]