Spaces:
Runtime error
Runtime error
initial_commit
Browse files- README.md +36 -12
- app.py +17 -0
- multiapp.py +19 -0
- packages.txt +1 -0
- paraphraser.py +28 -0
- requirements.txt +9 -0
- scrap.py +24 -0
- summary.py +24 -0
README.md
CHANGED
@@ -1,12 +1,36 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python-dev-task-summarization
|
2 |
+
|
3 |
+
The task has been done in two methods-
|
4 |
+
- **using traditional Python libraries (like NLTK, Sumy)**
|
5 |
+
- **using pre-trained transformers model**
|
6 |
+
|
7 |
+
# Method-1
|
8 |
+
## using traditional Python libraries
|
9 |
+
#### Web Scraping Tools:
|
10 |
+
- Selenium
|
11 |
+
#### Paraphrasing Tools:
|
12 |
+
- used [nlpaug](https://github.com/makcedward/nlpaug) library
|
13 |
+
#### Summarization Tools:
|
14 |
+
- used [sumy](https://miso-belica.github.io/sumy/) library
|
15 |
+
#### System Requirements:
|
16 |
+
- you will find it in the _requirements.txt_ file
|
17 |
+
|
18 |
+
## How to test or run this?
|
19 |
+
- just open this link and follow the instructions: _**https://shamim237-python-dev-task-app-3n18pu.streamlit.app/**_
|
20 |
+
|
21 |
+
# Method-2
|
22 |
+
## Using pre-trained transformers model
|
23 |
+
#### Web Scraping Tools:
|
24 |
+
- ScraperAPI
|
25 |
+
- BeautifulSoup
|
26 |
+
#### Paraphrasing Tools:
|
27 |
+
- used **"ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"** pre-trained model from HuggingFace
|
28 |
+
#### Summarization Tools:
|
29 |
+
- used **"google/pegasus-cnn_dailymail"** pre-trained model from HuggingFace
|
30 |
+
#### System Requirements:
|
31 |
+
- you will find it in the _Python_Dev_Task.ipynb_ notebook or in the below link.
|
32 |
+
|
33 |
+
## How to test or run this?
|
34 |
+
- Just open the **"Python_Dev_Task.ipynb"** file in Colab _or_ open this link: **_https://colab.research.google.com/drive/1wwaj0TobsnzQL5jMVsYrF5z6rc1944tE?usp=sharing_**
|
35 |
+
- Run all the cells
|
36 |
+
- The summarization output will show up in the last cell of the notebook.
|
app.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from multiapp import MultiApp
|
3 |
+
from apps import paraphraseApp, summarizerApp, scraperrApp
|
4 |
+
|
5 |
+
app = MultiApp()
|
6 |
+
|
7 |
+
st.title("Python Dev Task @SkyRanko")
|
8 |
+
st.write("==================_Completed by_ **Shamim Mahbub**==================")
|
9 |
+
st.markdown("This app provides three services - :red[Scraping], :orange[Paraphrasing] and :blue[Summarizing]")
|
10 |
+
st.caption("Note: _After scraping data from Amazon, the data has been paraphrased using a model and then Summarization has been performed on the paraphrased data._")
|
11 |
+
|
12 |
+
# Add all your application here
|
13 |
+
app.add_app("Scraper", scraperrApp.app)
|
14 |
+
app.add_app("Paraphraser", paraphraseApp.app)
|
15 |
+
app.add_app("Summarizer", summarizerApp.app)
|
16 |
+
# The main app
|
17 |
+
app.run()
|
multiapp.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
class MultiApp:
    """Minimal page router for a Streamlit app.

    Pages are registered as (title, callable) pairs via add_app();
    run() renders a selectbox of titles and invokes the chosen page's
    callable.
    """

    def __init__(self):
        # Registered pages, each stored as {"title": ..., "function": ...}.
        self.apps = []

    def add_app(self, title, func):
        """Register a page; *func* is called (no args) to draw the page."""
        entry = {"title": title, "function": func}
        self.apps.append(entry)

    def run(self):
        """Show the page picker and render the selected page."""
        # format_func shows each entry's title instead of the raw dict.
        chosen = st.selectbox(
            'Choose one',
            self.apps,
            format_func=lambda entry: entry['title'])

        chosen['function']()
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
firefox-esr
|
paraphraser.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import streamlit as st
|
3 |
+
import nlpaug.augmenter.word as naw
|
4 |
+
|
5 |
+
import os
|
6 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
7 |
+
|
8 |
+
@st.cache(allow_output_mutation=True, ttl=48*3600)
def load_model():
    """Build (and cache for 48 hours) the nlpaug BERT augmenter used for paraphrasing."""
    # action="insert" adds contextually plausible words into the text
    # (rather than substituting existing ones).
    aug = naw.ContextualWordEmbsAug(
        model_path='bert-base-uncased', action="insert")
    return aug

# Module-level singleton so the (expensive) model is loaded once per session.
aug = load_model()
|
15 |
+
|
16 |
+
def parphrase(passage):
    """Paraphrase *passage* (an iterable of text lines) with the cached augmenter.

    Lines containing exactly two words are dropped — presumably short
    headings (TODO confirm against the scraped data). The remaining
    lines have double quotes turned into single quotes and embedded
    newlines removed, are joined into one string, and are fed to the
    nlpaug augmenter.
    """
    kept = []
    for line in passage:
        word_count = len(re.findall(r'\w+', line))
        # Skip two-word lines; keep everything else, normalised.
        if word_count != 2:
            kept.append(line.replace('"', "'").replace("\n", ""))

    joined = " ".join(kept)
    return aug.augment(joined)
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
nlpaug==1.1.11
|
2 |
+
nltk
|
3 |
+
selenium==4.8.0
|
4 |
+
sentencepiece==0.1.97
|
5 |
+
streamlit==1.17.0
|
6 |
+
sumy==0.11.0
|
7 |
+
torch==1.13.1
|
8 |
+
transformers==4.25.1
|
9 |
+
webdriver-manager
|
scrap.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
from selenium import webdriver
|
3 |
+
from selenium.webdriver import Chrome
|
4 |
+
from selenium.webdriver.common.by import By
|
5 |
+
from selenium.webdriver.firefox.options import Options
|
6 |
+
from selenium.webdriver.firefox.service import Service
|
7 |
+
from webdriver_manager.firefox import GeckoDriverManager
|
8 |
+
from selenium.webdriver.common.by import By
|
9 |
+
|
10 |
+
def extract(link):
    """Scrape the "A+ content" product description from an Amazon page.

    Launches a headless Firefox via Selenium, reads the text of the
    element with id "aplus_feature_div", and returns it split into a
    list of lines.

    Raises selenium.common.exceptions.NoSuchElementException when the
    page has no "aplus_feature_div" element.
    """
    url = link
    firefoxOptions = Options()
    firefoxOptions.add_argument("--headless")  # run without a visible window
    # webdriver_manager downloads a geckodriver matching the local Firefox.
    service = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(
        options=firefoxOptions,
        service=service,
    )
    try:
        driver.get(url)
        # "aplus_feature_div" holds the enhanced (A+) product description.
        data = driver.find_element(By.ID, "aplus_feature_div")
        data = data.text
        data = data.split("\n")
        time.sleep(2)
        return data
    finally:
        # Fix: the original never quit the driver, leaking one Firefox
        # process per call. Always shut the browser down.
        driver.quit()
|
summary.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
+
import streamlit as st
|
3 |
+
from sumy.nlp.tokenizers import Tokenizer
|
4 |
+
from sumy.parsers.plaintext import PlaintextParser
|
5 |
+
from sumy.summarizers.lex_rank import LexRankSummarizer
|
6 |
+
|
7 |
+
@st.cache(allow_output_mutation=True, ttl=48*3600)
def dwnld_lib():
    """Download the NLTK 'punkt' sentence tokenizer data (cached for 48 hours)."""
    nltk.download('punkt')

# Ensure tokenizer data is present before any summarization runs.
dwnld_lib()
|
12 |
+
|
13 |
+
def text_summary(text):
    """Summarize *text* (an iterable of strings) into at most four sentences.

    The lines are joined into a single passage, parsed with sumy's
    plaintext parser, and ranked with the LexRank algorithm; each
    selected sentence is returned capitalized.
    """
    passage = " ".join(text)
    # Parse the passage into a sumy document using the English tokenizer.
    parser = PlaintextParser.from_string(passage, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    # Pick the top 4 sentences and capitalize each for presentation.
    return [str(sentence).capitalize()
            for sentence in summarizer(parser.document, 4)]
|