KevlarVK committed on
Commit
a4f4f24
·
1 Parent(s): f1e08af

simple code to summarize using bart-large-cnn

Browse files
Files changed (3) hide show
  1. Utils.py +35 -0
  2. app.py +11 -2
  3. summarize.py +50 -0
Utils.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import string
4
+
5
def fetch_article_text(url: str, max_chunk_words: int = 500, timeout: float = 10.0):
    """Download a web page and split its headline/paragraph text into chunks.

    The text of every ``<h1>`` and ``<p>`` element is joined with spaces, an
    ``<eos>`` marker is inserted after each ``.``, ``!`` and ``?``, and the
    resulting sentences are packed greedily into chunks of at most
    ``max_chunk_words`` whitespace-separated words. A single sentence longer
    than the limit becomes a chunk of its own.

    Args:
        url: Address of the page to fetch.
        max_chunk_words: Upper bound on the number of words per chunk.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(article, chunks)`` tuple. ``article`` is the joined page text
        with the ``<eos>`` markers still embedded; ``chunks`` is a list of
        strings, each a run of whole sentences.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests can block forever on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of chunking the body of an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    article = " ".join(node.text for node in soup.find_all(["h1", "p"]))
    for mark in (".", "!", "?"):
        article = article.replace(mark, f"{mark}<eos>")

    chunks = []
    current_words = []
    for sentence in article.split("<eos>"):
        # split() with no argument drops the empty strings that split(" ")
        # produced for leading or repeated spaces, so they no longer count
        # towards the word budget or leave double spaces in the output.
        words = sentence.split()
        if not words:
            continue
        if current_words and len(current_words) + len(words) > max_chunk_words:
            chunks.append(" ".join(current_words))
            current_words = []
        current_words.extend(words)

    if current_words:
        chunks.append(" ".join(current_words))

    return article, chunks
33
+
34
def count_tokens(text: str):
    """Return the number of whitespace-separated words in ``text``.

    This is a word count used as a cheap stand-in for a token count; it does
    not run a model tokenizer. Splitting on any whitespace (rather than on a
    single space) means an empty string counts as 0 and repeated spaces,
    tabs, or newlines do not inflate the result.
    """
    return len(text.split())
app.py CHANGED
@@ -1,4 +1,13 @@
1
  import streamlit as st
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from summarize import bart_summarize
3
 
4
# Text field holding the passage to summarize
text = st.text_input("Enter text here")

# Button that triggers summarization
button = st.button("Click here")

# On click, summarize the entered text and display the result
if button:
    if text.strip():
        summary = bart_summarize(text)
        st.write(summary)
    else:
        # Give feedback instead of silently rendering an empty summary.
        st.warning("Please enter some text to summarize.")
summarize.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from transformers import BartTokenizer, TFBartForConditionalGeneration, pipeline
3
+ from Utils import fetch_article_text, count_tokens
4
+ import re
5
+ from nltk.tokenize import sent_tokenize
6
+
7
# Checkpoint shared by the tokenizer and the model; named once so the two
# cannot drift apart.
MODEL_NAME = 'facebook/bart-large-cnn'

# Loaded once at import time; bart_summarize reuses these module-level objects.
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = TFBartForConditionalGeneration.from_pretrained(MODEL_NAME)
9
+
10
def bart_summarize(text: str):
    """Summarize ``text`` with the module-level BART model, chunk by chunk.

    The text is split into sentences with ``sent_tokenize``; blank sentences
    and sentences of four or fewer space-separated words are dropped. The
    remaining sentences are packed greedily into chunks that fit the model's
    position-embedding limit, each chunk is summarized independently with
    beam search, and the per-chunk summaries are joined with single spaces.

    Args:
        text: The passage to summarize.

    Returns:
        The concatenated summary string; empty if no sentence survives the
        filter.
    """
    max_length = model.config.max_position_embeddings
    # The chunk is encoded with special tokens below; assumes the tokenizer
    # adds two (start and end), so reserve room for them.
    token_budget = max_length - 2

    sentences = [
        sentence
        for sentence in sent_tokenize(text)
        if sentence.strip() and len(sentence.split(" ")) > 4
    ]

    input_chunks = []
    current_sentences = []
    current_tokens = 0
    for sentence in sentences:
        # Measure with the real tokenizer: a word count underestimates the
        # sub-word token count, which let chunks overflow the limit and be
        # silently truncated during encoding.
        sentence_tokens = len(tokenizer.encode(sentence, add_special_tokens=False))
        # Only flush a non-empty chunk, so an over-long first sentence does
        # not produce an empty chunk for the model to "summarize".
        if current_sentences and current_tokens + sentence_tokens > token_budget:
            input_chunks.append(" ".join(current_sentences))
            current_sentences = []
            current_tokens = 0
        current_sentences.append(sentence)
        current_tokens += sentence_tokens

    if current_sentences:
        # Joining with a space keeps words from fusing across sentence
        # boundaries ("end.Start").
        input_chunks.append(" ".join(current_sentences))

    # Summarize each input chunk separately.
    summaries = []
    for chunk in input_chunks:
        # truncation=True remains as a safety net for a single sentence that
        # is longer than the model limit on its own.
        encoded_input = tokenizer.encode(
            chunk,
            max_length=max_length,
            truncation=True,
            padding='longest',
            return_tensors='tf',
        )

        summary_ids = model.generate(
            encoded_input, max_length=300, num_beams=4, early_stopping=True
        )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # Combine the per-chunk summaries into the final summary.
    return " ".join(summaries)