|
""" |
|
#App: NLP App with Streamlit |
|
Description |
|
This is a Natural Language Processing (NLP) based application for
summarizing documents and text extracted from Bangla images and from English images/PDF files.
|
""" |
|
|
|
import os |
|
# Put TOKENIZERS_PARALLELISM="false" into the process environment before the
# NLP libraries further down are imported (presumably read by the Hugging Face
# tokenizers backend -- TODO confirm).
os.environ.update(TOKENIZERS_PARALLELISM="false")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st |
|
# Configure the page (browser title, wide layout, sidebar expanded) and then
# render the main heading of the app.
st.set_page_config(
    page_title="Summarization Tool",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.title(
    "Bangla/English Text Summarizer: Upload Images/Pdf or input texts to summarize!"
)
|
import torch |
|
import docx2txt |
|
from PIL import Image |
|
from PyPDF2 import PdfFileReader |
|
from pdf2image import convert_from_bytes |
|
import pdfplumber |
|
|
|
import pdf2image |
|
import requests |
|
import cv2 |
|
import numpy as np |
|
import pytesseract |
|
import line_cor |
|
import altair as alt |
|
from transformers import AutoTokenizer, AutoModelWithLMHead |
|
|
|
from PIL import Image |
|
|
|
# Hosted inference endpoint used by bansum() for Bangla summarization.
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"

# The API token comes from the "t5multilingual" environment variable. It is
# looked up with .get() so that a missing token no longer raises KeyError at
# import time and takes the whole app (including the local English
# summarizer) down with it. Without a token no Authorization header is sent;
# NOTE(review): the service may then reject or rate-limit requests -- confirm.
_hf_token = os.environ.get("t5multilingual")
headers0 = {"Authorization": "Bearer " + _hf_token} if _hf_token else {}
|
|
|
|
|
|
|
|
|
|
|
|
|
@st.cache
def read_pdf(file):
    """Return the text of every page of a PDF as a single string.

    Args:
        file: The object handed to ``PdfFileReader`` (main() passes the
            uploaded PDF here).

    Returns:
        A string that starts with one space and then contains each page's
        extracted text followed by one space, in page order.
    """
    reader = PdfFileReader(file)
    page_chunks = (
        reader.getPage(page_index).extractText() + " "
        for page_index in range(reader.numPages)
    )
    # The leading " " mirrors the initial value of the accumulated text.
    return " " + "".join(page_chunks)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@st.cache(allow_output_mutation=True)
def _load_english_model():
    """Load the 't5-base' tokenizer and model used for English summaries.

    Streamlit re-executes this script on every user interaction, so loading
    at module level repeated ``from_pretrained`` on each rerun. Caching the
    loader keeps one loaded pair alive across reruns.
    ``allow_output_mutation=True`` tells Streamlit not to hash the returned
    objects to detect mutation.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # NOTE(review): AutoModelWithLMHead is deprecated in recent transformers
    # releases; kept because it is the class this file imports.
    loaded_tokenizer = AutoTokenizer.from_pretrained('t5-base')
    loaded_model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
    return loaded_tokenizer, loaded_model


# Module-level names kept unchanged because engsum() reads them.
tokenizer, model = _load_english_model()
|
@st.cache
def _english_summary(text):
    """Return the T5 summary of *text* as a plain string.

    Only this pure computation is cached. The prompt is truncated to 512
    tokens and the summary is generated with beam search (2 beams), between
    80 and 150 tokens long.
    """
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )
    summary_ids = model.generate(
        inputs,
        max_length=150,
        min_length=80,
        length_penalty=5.,
        num_beams=2,
    )
    # Drop special tokens through the tokenizer instead of slicing fixed
    # character counts ([5:-4]); the slice cut real text whenever the
    # generated sequence did not end with the end-of-sequence token.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def engsum(text):
    """Summarize English *text* and show the result in a success box.

    The display call is deliberately outside the cached helper: a cached
    function body is skipped on a cache hit, so when ``st.success`` lived
    inside the cached function a repeated request rendered nothing.

    Args:
        text: English text to summarize.
    """
    st.success(_english_summary(text))
|
def bansum(text):
    """Summarize *text* through the hosted model at API_URL0 and display it.

    On success the summary is shown with ``st.success``. Network failures,
    non-JSON responses and error payloads are reported in the UI instead of
    being dropped silently or raising.

    Args:
        text: Text to summarize (main() passes OCR output or typed input).
    """
    # NOTE(review): "min_length" is sent as a top-level key, unchanged from
    # the original request; the service may expect generation options under
    # a "parameters" key -- confirm against the API documentation.
    payload = {"inputs": text, "min_length": 300}

    try:
        # A timeout keeps a stalled request from hanging the session forever.
        response = requests.post(API_URL0, headers=headers0, json=payload, timeout=60)
        out = response.json()
    except (requests.exceptions.RequestException, ValueError) as err:
        # ValueError covers a response body that is not valid JSON.
        st.error(f"Bangla summarization request failed: {err}")
        return

    # `out and ...` guards against an empty list before indexing out[0].
    first = out[0] if isinstance(out, list) and out else None
    if isinstance(first, dict) and first.get("summary_text"):
        st.success(first["summary_text"])
    elif isinstance(out, dict) and out.get("error"):
        st.error(f"Bangla summarization failed: {out['error']}")
    else:
        st.warning("The summarization service returned no summary.")
|
|
|
|
|
|
|
|
|
|
|
# Number of characters covered by one step of the PDF range slider. The
# slider range was already derived from 7 * 100 characters per step; the same
# scale is now used for both ends of the slice as well.
_CHARS_PER_SLIDER_STEP = 7 * 100


def _mark_input_ready():
    """Record in the session that an image/PDF input has been provided."""
    st.session_state["photo"] = "done"


def _set_camera(enabled):
    """Remember across reruns whether the camera widget should be shown."""
    st.session_state["camera_on"] = enabled


def _summarize_or_warn(summarizer, text):
    """Call ``summarizer(text)`` unless *text* is empty or only whitespace."""
    if text and text.strip():
        summarizer(text)
    else:
        st.warning("No text found to summarize.")


def _select_pdf_text(pdf_text):
    """Show the range slider for *pdf_text* and return the selected slice.

    The slider gets an explicit upper bound (Streamlit's default maximum of
    100 made long documents raise) and both ends of the slice use the same
    characters-per-step scale. The default selection is the whole document.
    """
    # Ceiling division so the trailing partial step is selectable; at least 1
    # so the slider never has identical minimum and maximum.
    total_steps = max(1, -(-len(pdf_text) // _CHARS_PER_SLIDER_STEP))
    first, last = st.slider(
        'Select a approximate number of lines to see and summarize',
        min_value=0,
        max_value=total_steps,
        value=(0, total_steps),
    )
    return pdf_text[first * _CHARS_PER_SLIDER_STEP:last * _CHARS_PER_SLIDER_STEP]


def _summarize_image(image_file, bangla_label, english_label):
    """OCR an image and summarize it in the language the user picks.

    The image is handed to pytesseract in memory. Writing it to a shared
    "img.png" and reading it back let concurrent sessions overwrite each
    other's file.

    Args:
        image_file: Uploaded file or camera capture readable by ``Image.open``.
        bangla_label: Label of the button that triggers Bangla OCR + summary.
        english_label: Label of the button that triggers English OCR + summary.
    """
    # RGB conversion drops any alpha channel, as the earlier 3-channel
    # re-read from disk did.
    image = Image.open(image_file).convert("RGB")

    st.subheader("Select the summarization type:")
    bangla_col, english_col = st.columns([1, 7])

    if bangla_col.button(bangla_label):
        text = pytesseract.image_to_string(image, lang="ben")
        st.success(text)
        st.subheader("সারাংশ/সারমর্ম")
        _summarize_or_warn(bansum, text)

    if english_col.button(english_label):
        text = pytesseract.image_to_string(image)
        st.success(text)
        st.subheader("Summarized Text")
        _summarize_or_warn(engsum, text)


def main():
    """Render the input widgets and summarize whichever input is provided.

    Inputs are checked in this order: uploaded PDF, uploaded image, camera
    photo, typed text. PDFs are summarized with engsum(); images and typed
    text offer a choice between bansum() and engsum().
    """
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"
    if "camera_on" not in st.session_state:
        st.session_state["camera_on"] = False

    camera_photo = None
    with st.container():
        c1, c2, c3 = st.columns([1.5, 1.5, 1.5])
        message = c1.text_input("Type your text here!")

        # A button is True only for the single run right after the click, so
        # the camera on/off state is stored in the session through callbacks.
        # Rendering the camera inside `if button:` made it vanish on the
        # rerun triggered by taking the photo.
        c3.button("Start Camera", on_click=_set_camera, args=(True,))
        if st.session_state["camera_on"]:
            camera_photo = c3.camera_input(
                "Capture a photo to summarize: ",
                on_change=_mark_input_ready,
            )
        c3.button("Stop Camera", on_click=_set_camera, args=(False,))

        uploaded_photo = c2.file_uploader(
            "Upload your Images/PDF",
            type=['jpg', 'png', 'jpeg', 'pdf'],
            on_change=_mark_input_ready,
        )

    if not (st.session_state["photo"] == "done" or message):
        return

    if uploaded_photo and uploaded_photo.type == 'application/pdf':
        text = _select_pdf_text(read_pdf(uploaded_photo))
        if st.button("English Pdf Summarize"):
            st.subheader("Selected text for summarize: ")
            st.success(text)
            st.subheader("Summarized Text: ")
            _summarize_or_warn(engsum, text)
    elif uploaded_photo:
        _summarize_image(uploaded_photo, "BENGALI", "ENGLISH")
    elif camera_photo:
        _summarize_image(camera_photo, "Bangla", "English")
    elif message:
        # Only offer text summarization when there is text to summarize.
        c8, c9 = st.columns([1, 7])
        if c8.button("Bangla"):
            bansum(message)
        if c9.button("English"):
            engsum(message)


if __name__ == "__main__":
    main()