File size: 8,598 Bytes
9c37e72
dba2773
 
9c37e72
 
 
 
 
 
 
 
 
dba2773
9c37e72
 
 
 
 
 
6e58c44
bd18577
 
 
 
 
 
 
c75cc74
 
 
 
 
 
 
9c37e72
36603f5
9c37e72
9531d63
9c37e72
 
 
 
 
 
 
 
419e04c
9c37e72
54ee49c
 
f6a6e42
9c37e72
 
 
 
 
 
9531d63
 
 
 
 
9c37e72
f6a6e42
9c37e72
 
 
 
 
 
 
 
 
 
 
34f51a1
9c37e72
c75cc74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c37e72
9531d63
9c37e72
 
c75cc74
9c37e72
 
 
6647ca5
9c37e72
 
c75cc74
9c37e72
 
9531d63
6647ca5
9531d63
 
 
c75cc74
6647ca5
9531d63
0cb4c76
 
 
c75cc74
0cb4c76
adb2b76
 
9c37e72
 
 
9222805
 
 
9c37e72
 
 
 
 
 
 
 
991d44a
9c37e72
 
 
 
 
c42698d
9c37e72
 
 
 
 
 
dba2773
c95ac40
9c37e72
c95ac40
9c37e72
 
 
c95ac40
 
1272866
fd37fd8
c95ac40
9c37e72
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
"""
#App: NLP App with Streamlit
Credits: Streamlit Team, Marc Skov Madsen(For Awesome-streamlit gallery)
Description
This is a Natural Language Processing (NLP) based app useful for basic NLP tasks such as the following:

+ Tokenization & Lemmatization using Spacy

+ Named Entity Recognition(NER) using SpaCy

+ Sentiment Analysis using TextBlob

+ Document/Text Summarization using Gensim/T5 for both Bangla and English

This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.
Purpose
To perform basic and useful NLP tasks with Streamlit, spaCy, TextBlob and Gensim
"""
# Core Pkgs
import os
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
import streamlit as st
import websockets
import pyaudio
from configure import api_key
import json
import asyncio

import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel


# NLP Pkgs
from textblob import TextBlob 
import spacy
from gensim.summarization import summarize
import requests
import cv2
import numpy as np
import pytesseract
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
# Title: a top-level call, so it executes whenever this script runs,
# before main() is invoked at the bottom of the file.
st.title("Streamlit NLP APP")
@st.experimental_singleton
def text_analyzer(my_text):
	"""Return one '"Token":..,"Lemma":..' string per token of *my_text*.

	The text is parsed with the spaCy ``en_core_web_sm`` pipeline and every
	token contributes a single formatted string holding its surface text and
	its lemma.
	"""
	# NOTE(review): the pipeline is loaded inside the function body, so every
	# call that is not served from the Streamlit cache loads it again.
	pipeline = spacy.load('en_core_web_sm')
	template = '"Token":{},\n"Lemma":{}'
	return [template.format(tok.text, tok.lemma_) for tok in pipeline(my_text)]
@st.experimental_singleton
def load_models():
    """Load the ``gpt2-large`` tokenizer and language model.

    Returns:
        A ``(tokenizer, model)`` tuple, both created from the same
        ``gpt2-large`` checkpoint name.
    """
    checkpoint = 'gpt2-large'
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        GPT2LMHeadModel.from_pretrained(checkpoint),
    )
# Function For Extracting Entities
@st.experimental_singleton
def entity_analyzer(my_text):
	"""Return a one-element list describing the tokens and entities of *my_text*.

	The text is parsed with the spaCy ``en_core_web_sm`` pipeline. The single
	returned string embeds the list of token texts and the list of
	``(entity text, entity label)`` pairs.
	"""
	# NOTE(review): the pipeline is loaded inside the function body, so every
	# call that is not served from the Streamlit cache loads it again.
	parsed = spacy.load('en_core_web_sm')(my_text)
	words = [tok.text for tok in parsed]
	labelled = [(ent.text, ent.label_) for ent in parsed.ents]
	return ['"Token":{},\n"Entities":{}'.format(words, labelled)]
# AssemblyAI real-time websocket endpoint and microphone capture settings.
_ASSEMBLYAI_URL = "wss://api.assemblyai.com/v2/realtime/ws?sample_rate=16000"
_FRAMES_PER_BUFFER = 3200
_AUDIO_FORMAT = pyaudio.paInt16
_AUDIO_CHANNELS = 1
_AUDIO_RATE = 16000


def _transcribe_microphone():
    """Stream microphone audio to the websocket until a final transcript arrives.

    Audio is captured with PyAudio, base64-encoded and sent as JSON
    ``{"audio_data": ...}`` messages. When a message whose ``message_type`` is
    ``FinalTranscript`` is received, its text (with ``.`` and ``!`` removed)
    is stored in ``st.session_state["text"]`` and ``st.session_state["run"]``
    is set to ``False``, which ends both the send and the receive loop.

    Both loops also end, leaving ``run`` unchanged, when the connection is
    closed with code 4008. The audio stream and the PyAudio instance are
    released on every exit path.

    Raises:
        websockets.exceptions.ConnectionClosedError: if the connection closes
            with a code other than 4008.
    """
    # Imported locally because the file's import block does not provide it.
    import base64

    async def send_receive():
        print(f'Connecting websocket to url {_ASSEMBLYAI_URL}')
        async with websockets.connect(
            _ASSEMBLYAI_URL,
            extra_headers=(("Authorization", api_key),),
            ping_interval=5,
            ping_timeout=20,
        ) as _ws:
            await asyncio.sleep(0.1)
            print("Receiving Session begins ...")
            # The first message is read and discarded before streaming starts.
            await _ws.recv()

            async def send():
                while st.session_state["run"]:
                    try:
                        data = stream.read(_FRAMES_PER_BUFFER)
                        data = base64.b64encode(data).decode("utf-8")
                        await _ws.send(json.dumps({"audio_data": str(data)}))
                    except websockets.exceptions.ConnectionClosedError as e:
                        print(e)
                        # Only close code 4008 is an expected way to stop;
                        # anything else is surfaced to the caller.
                        if e.code != 4008:
                            raise
                        break
                    # Yield to the event loop so receive() can make progress.
                    await asyncio.sleep(0.01)

            async def receive():
                while st.session_state["run"]:
                    try:
                        result_str = await _ws.recv()
                    except websockets.exceptions.ConnectionClosedError as e:
                        print(e)
                        if e.code != 4008:
                            raise
                        break
                    payload = json.loads(result_str)
                    # 'text' is only read for final transcripts, so messages
                    # without that key no longer abort the loop.
                    if payload.get('message_type') == 'FinalTranscript':
                        result = payload['text'].replace('.', '').replace('!', '')
                        st.session_state['text'] = result
                        st.session_state['run'] = False

            await asyncio.gather(send(), receive())

    audio = pyaudio.PyAudio()
    try:
        # starts recording
        stream = audio.open(
            format=_AUDIO_FORMAT,
            channels=_AUDIO_CHANNELS,
            rate=_AUDIO_RATE,
            input=True,
            frames_per_buffer=_FRAMES_PER_BUFFER,
        )
        try:
            asyncio.run(send_receive())
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()


def _image_to_text(image_file, use_bangla):
    """Run Tesseract OCR on *image_file* and return the recognised text.

    The image is written to ``img.png`` and read back with OpenCV before
    being passed to pytesseract. When *use_bangla* is true the ``ben``
    language is requested; otherwise pytesseract's default is used.
    """
    Image.open(image_file).save("img.png")
    img = cv2.imread("img.png")
    if use_bangla:
        return pytesseract.image_to_string(img, lang="ben")
    return pytesseract.image_to_string(img)


@st.experimental_singleton
def _load_t5_summarizer():
    """Load the ``t5-base`` tokenizer and model, cached like ``load_models``."""
    tokenizer = AutoTokenizer.from_pretrained('t5-base')
    model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
    return tokenizer, model


def main():
    """Render the NLP app with Streamlit.

    The page consists of, in order: a description, a voice/text input whose
    value feeds the entity, sentiment, spell-correction and text-generation
    sections, an image/text input section whose content feeds the two
    summarization options, and a sidebar with author information.
    """
    # The Markdown source is passed without leading indentation; indented
    # lines can be rendered as a code block instead of a heading.
    st.markdown(
        "#### Description\n"
        "This is a Natural Language Processing(NLP) Based App useful for basic NLP task\n"
        "NER,Sentiment, Spell Corrections and Summarization"
    )

    if "text" not in st.session_state:
        st.session_state["text"] = ""
        st.session_state["run"] = False

    def start_listening():
        st.session_state["run"] = True

    st.button("Say something", on_click=start_listening)
    text = st.text_input("What should I create?", value=st.session_state["text"])

    # Only touch the microphone and the websocket while listening is requested.
    if st.session_state["run"]:
        _transcribe_microphone()
        if not st.session_state["run"]:
            # A final transcript was stored; rerun so the text box shows it.
            st.experimental_rerun()

    # Entity Extraction
    if st.checkbox("Show Named Entities"):
        st.subheader("Analyze Your Text")
        if st.button("Extract"):
            entity_result = entity_analyzer(text)
            st.json(entity_result)

    # Sentiment Analysis
    if st.checkbox("Show Sentiment Analysis"):
        st.subheader("Analyse Your Text")
        if st.button("Analyze"):
            blob = TextBlob(text)
            st.success(blob.sentiment)

    # Text Corrections
    if st.checkbox("Spell Corrections"):
        st.subheader("Correct Your Text")
        if st.button("Spell Corrections"):
            st.text("Using TextBlob ..")
            st.success(TextBlob(text).correct())

    if st.checkbox("Text Generation"):
        st.subheader("Generate Text")
        ok = st.button("Generate")
        # Loaded as soon as the section is opened, before the button is pressed.
        tokenizer, model = load_models()
        if ok:
            input_ids = tokenizer(text, return_tensors='pt').input_ids
            # NOTE(review): the message mentions contrastive search, but
            # generate() is only given max_length here - confirm the intent.
            st.text("Using Hugging Face Transformer, Contrastive Search ..")
            output = model.generate(input_ids, max_length=128)
            st.success(tokenizer.decode(output[0], skip_special_tokens=True))

    def change_photo_state():
        st.session_state["photo"] = "done"

    st.subheader("Summary section, feed your image!")
    camera_photo = st.camera_input(
        "Take a photo, Containing English or Bangla texts",
        on_change=change_photo_state,
    )
    uploaded_photo = st.file_uploader(
        "Upload Image, Containing English or Bangla texts",
        type=['jpg', 'png', 'jpeg'],
        on_change=change_photo_state,
    )
    message = st.text_input("Or, drop your text here, only English text!")
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    if st.session_state["photo"] == "done" or message:
        # With no image at all, the typed message is the text to summarize.
        source_text = message
        if uploaded_photo:
            source_text = _image_to_text(
                uploaded_photo,
                st.checkbox("Mark here to see in Bangla for Bangla Images only"),
            )
            st.success(source_text)
        if camera_photo:
            # A camera photo takes precedence over an uploaded one.
            source_text = _image_to_text(
                camera_photo,
                st.checkbox("Mark here to see Bangla"),
            )
            st.success(source_text)

        # Summarization
        if st.checkbox("Mark here, Text Summarization for English or Bangla!"):
            st.success(summarize(source_text))
        elif st.checkbox("Mark here, Better Text Summarization for English only!"):
            tokenizer, model = _load_t5_summarizer()
            inputs = tokenizer.encode(
                "summarize: " + source_text,
                return_tensors='pt',
                max_length=512,
                truncation=True,
            )
            summary_ids = model.generate(
                inputs,
                max_length=150,
                min_length=80,
                length_penalty=5.,
                num_beams=2,
            )
            # Special tokens are dropped, matching the text-generation output.
            st.success(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    st.sidebar.subheader("About App")
    st.sidebar.subheader("By")
    st.sidebar.text("Soumen Sarker")

# Build the page only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == '__main__':
	main()