File size: 8,739 Bytes
9c37e72
dba2773
9c37e72
68f40bc
d82d18a
9c37e72
 
6e58c44
4834995
bd18577
 
 
 
 
 
 
2f51bd6
c75cc74
ba45265
e6d3f33
9c37e72
09d4214
 
29e33a8
06dd768
09d4214
0842639
baf370a
9c37e72
 
 
 
1ecea99
fa73ddc
419e04c
9c37e72
b446f5c
a07988a
af0fae9
 
 
 
ba45265
29e33a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d5dc1c
29e33a8
 
 
1a9fa41
c7a7627
 
 
 
 
3f06691
c7a7627
37287e0
c7a7627
0975d28
 
 
c7a7627
 
 
 
37287e0
0975d28
 
 
47d9c5a
0a75d54
 
 
ec4347b
9d1426d
ebcff05
 
59ea779
ebcff05
59ea779
f1ebc19
d960c1e
 
4d7ccde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87566f8
 
 
 
 
ea779a7
87566f8
 
 
 
 
 
 
 
 
 
 
ea779a7
 
 
 
 
 
 
 
 
 
 
af619b7
ea779a7
 
 
d960c1e
 
 
 
2dffa73
d0a45f9
9d1426d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
"""
#App: NLP App with Streamlit
Description
This is a Natural Language Processing (NLP) based application that is useful for
document/text summarization from Bangla images and English images/PDF files.
"""
# Core Pkgs
import os
# Set in the process environment before the ML libraries below are imported.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# parallelism/fork warning - confirm against the tokenizers documentation.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
# Page-level configuration and the main heading. Both run at import time,
# i.e. on every Streamlit rerun of this script.
# NOTE(review): Streamlit documents that set_page_config must be the first
# Streamlit command on a page, so keep it above every other st.* call.
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
st.title("Bangla and English Summarizer: Upload Images/Pdf or input texts to summarize!")
import torch
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
# Hugging Face Inference API endpoints and their request headers.
# SECURITY NOTE(review): the bearer tokens below are hard-coded in source.
# Anyone who can read this file can use them; they should be rotated and
# loaded from the environment or Streamlit secrets instead.
# Endpoint used by bansum().
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = {"Authorization": "Bearer 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"}
# Endpoint used by engsum().
API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
headers1 = {"Authorization": "Bearer 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"}
# Not referenced anywhere else in the visible source.
API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
headers2 = {"Authorization": "Bearer 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"}
@st.cache
def read_pdf(file):
    """Extract the text of every page of a PDF into a single string.

    ``file`` is passed straight to ``PdfFileReader``. The returned string
    starts with one space, and the text of each page is followed by one
    space, in page order.
    """
    reader = PdfFileReader(file)
    page_texts = (
        reader.getPage(page_index).extractText() + " "
        for page_index in range(reader.numPages)
    )
    return " " + "".join(page_texts)
# def read_pdf_with_pdfplumber(file):
#     # Open the uploaded PDF file with pdfplumber
#     with pdfplumber.open(file) as pdf:
#         extracted_text = ''
#         for page in pdf.pages:
#             extracted_text += page.extract_text()

#     # Display the extracted text
#     #st.text(extracted_text)
#     return extracted_text

# Seconds to wait for the Hugging Face Inference API before giving up, so a
# stalled request cannot hang the Streamlit session indefinitely.
_API_TIMEOUT_SECONDS = 60


def _request_summary(api_url, auth_headers, text, result_key):
    """POST ``text`` to an inference endpoint and return the summary string.

    Args:
        api_url: Full URL of the inference endpoint.
        auth_headers: Headers (including authorization) sent with the request.
        text: The text to summarize.
        result_key: Key of the summary inside the first element of the JSON
            list the endpoint returns (e.g. ``"summary_text"``).

    Returns:
        The summary string, or ``None`` when nothing could be summarized. In
        the ``None`` case a warning/error has already been shown in the app.
    """
    if not text or not text.strip():
        st.warning("No text found to summarize.")
        return None

    # The payload is sent exactly as before.
    # NOTE(review): the Inference API documentation describes generation
    # options under a "parameters" object; confirm whether a top-level
    # "min_length" is actually honored by these endpoints.
    payload = {"inputs": text, "min_length": 300}
    try:
        response = requests.post(
            api_url,
            headers=auth_headers,
            json=payload,
            timeout=_API_TIMEOUT_SECONDS,
        )
        out = response.json()
    except (requests.RequestException, ValueError) as err:
        # Network failure, timeout, or a body that is not valid JSON.
        st.error(f"Summarization request failed: {err}")
        return None

    # Failures are reported by the API as a JSON object with an "error" key.
    if isinstance(out, dict) and out.get("error"):
        st.error(f"Summarization service error: {out['error']}")
        return None

    # A successful response is a non-empty list of result objects.
    if isinstance(out, list) and out and isinstance(out[0], dict):
        summary = out[0].get(result_key)
        if summary:
            return summary

    st.warning("The summarization service returned no summary.")
    return None


def engsum(output):
    """Summarize English text via ``API_URL1`` and show the result.

    The summary is rendered with ``st.success``; problems are reported in
    the app instead of being silently dropped. Returns ``None``.
    """
    summary = _request_summary(API_URL1, headers1, output, "generated_text")
    if summary:
        st.success(summary)


def bansum(text):
    """Summarize Bangla text via ``API_URL0`` and show the result.

    The summary is rendered with ``st.success``; problems are reported in
    the app instead of being silently dropped. Returns ``None``.
    """
    summary = _request_summary(API_URL0, headers0, text, "summary_text")
    if summary:
        st.success(summary)
    
@st.cache
def save(l):
    """Return ``l`` unchanged.

    The function body is an identity; any additional behavior comes solely
    from the ``st.cache`` decorator wrapped around it.
    NOTE(review): presumably meant to keep the uploaded file stable across
    Streamlit reruns - confirm that caching an uploaded file object here is
    actually intended.
    """
    return l
#@st.cache
# Approximate number of characters treated as one "line" by the PDF slider.
# The same unit is used for the slider range and for slicing the text.
_CHARS_PER_LINE = 7 * 100


def _mark_photo_done():
    """Widget callback: remember that a photo or file has been provided."""
    st.session_state["photo"] = "done"


def _clear_chat_history():
    """Widget callback: drop every stored chat message."""
    st.session_state.history = []


def _summarize_pdf(pdf_file):
    """Let the user pick a range of an uploaded PDF and summarize it."""
    pdf_text = read_pdf(pdf_file)
    # Ceiling division so the last partial "line" is selectable; at least 1
    # so the slider always has a non-empty range.
    total_lines = max(1, -(-len(pdf_text) // _CHARS_PER_LINE))
    first_line, last_line = st.slider(
        'Select a approximate number of lines to see and summarize',
        min_value=0,
        max_value=total_lines,
        value=[0, total_lines],
    )
    text = pdf_text[first_line * _CHARS_PER_LINE:last_line * _CHARS_PER_LINE]
    if st.button("English Pdf Summarize"):
        if not text.strip():
            st.warning("The selected range contains no text to summarize.")
            return
        st.subheader("Selected text for summarize: ")
        st.success(text)
        st.subheader("Summarized Text: ")
        engsum(text)


def _ocr_and_summarize(image_file, bangla_label, english_label):
    """Offer Bangla/English buttons that OCR an image and summarize it.

    Args:
        image_file: File-like object holding the image (upload or camera).
        bangla_label: Caption of the button that runs Bangla OCR.
        english_label: Caption of the button that runs English OCR.
    """
    # Decode in memory rather than through a shared "img.png" on disk, which
    # concurrent sessions would overwrite. RGB conversion gives the OCR a
    # uniform input whatever the source image mode is.
    img = Image.open(image_file).convert("RGB")
    st.text("Select the summarization type:")
    bangla_col, english_col = st.columns([1, 1])
    if bangla_col.button(bangla_label):
        text = pytesseract.image_to_string(img, lang="ben")
        st.subheader("সারাংশ/সারমর্ম")
        bansum(text)
    if english_col.button(english_label):
        text = pytesseract.image_to_string(img)
        st.subheader("Summarized Text")
        engsum(text)


def _summarize_typed_text(message):
    """Offer Bangla/English buttons that summarize the typed message."""
    bangla_col, english_col = st.columns([1, 1])
    bangla_clicked = bangla_col.button("Bangla")
    english_clicked = english_col.button("English")
    if not (bangla_clicked or english_clicked):
        return
    if not message or not message.strip():
        st.warning("Please type some text to summarize.")
        return
    if bangla_clicked:
        bansum(message)
    if english_clicked:
        engsum(message)


def _render_summarizer():
    """Draw the input widgets and dispatch to the matching summarizer."""
    c1, c2, c3 = st.columns([2, 2, 1])
    # Columns expose widget methods directly; ``c1.st.text_input`` is not a
    # valid call on a column object.
    message = c1.text_input("Type your text here!")
    camera_photo = c2.camera_input("Capture a photo to summarize: ", on_change=_mark_photo_done)
    uploaded_photo = save(c3.file_uploader("Upload your Images/PDF", type=['jpg', 'png', 'jpeg', 'pdf'], on_change=_mark_photo_done))

    if st.session_state["photo"] != "done" and not message:
        return

    # Priority: uploaded PDF, uploaded image, camera photo, then typed text.
    if uploaded_photo and uploaded_photo.type == 'application/pdf':
        _summarize_pdf(uploaded_photo)
    elif uploaded_photo:
        _ocr_and_summarize(uploaded_photo, "BENGALI", "ENGLISH")
    elif camera_photo:
        _ocr_and_summarize(camera_photo, "Bangla", "English")
    else:
        _summarize_typed_text(message)


def _render_chatbot():
    """Draw the Blenderbot chat section and its message history."""
    # Imported lazily so these heavy packages load only when the chat
    # section is rendered.
    from streamlit_chat import message as st_message
    from transformers import BlenderbotTokenizer
    from transformers import BlenderbotForConditionalGeneration

    st.title("Chatbot!!!")

    @st.experimental_singleton
    def get_models():
        # Load the tokenizer and model once and reuse them across reruns.
        model_name = "facebook/blenderbot-400M-distill"
        tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
        model = BlenderbotForConditionalGeneration.from_pretrained(model_name)
        return tokenizer, model

    if "history" not in st.session_state:
        st.session_state.history = []
    st.title("Hello bot: ")

    def generate_answer():
        # Callback of the chat input: append the user message and the reply.
        user_message = st.session_state.input_text
        if not user_message or not user_message.strip():
            return
        tokenizer, model = get_models()
        # Truncate so an over-long message cannot exceed the model's input
        # limit.
        inputs = tokenizer(user_message, return_tensors="pt", truncation=True)
        result = model.generate(**inputs)
        message_bot = tokenizer.decode(result[0], skip_special_tokens=True)
        st.session_state.history.append({"message": user_message, "is_user": True})
        st.session_state.history.append({"message": message_bot, "is_user": False})

    st.text_input("Talk to the bot", key="input_text", on_change=generate_answer)
    for index, chat in enumerate(st.session_state.history):
        # A unique key per entry keeps identical messages from colliding.
        st_message(**chat, key=f"chat_message_{index}")
    # Clearing in a callback runs before the rerun draws the history, so the
    # conversation disappears immediately instead of one interaction later.
    st.button("Refresh/New Chat", on_click=_clear_chat_history)


def main():
    """Render the app: the summarizer section, then the chatbot section."""
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"
    with st.container():
        _render_summarizer()
    with st.container():
        _render_chatbot()
            
            


# Build the page only when this file is executed as the main script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()