File size: 8,598 Bytes
9c37e72 dba2773 9c37e72 dba2773 9c37e72 6e58c44 bd18577 c75cc74 9c37e72 36603f5 9c37e72 9531d63 9c37e72 419e04c 9c37e72 54ee49c f6a6e42 9c37e72 9531d63 9c37e72 f6a6e42 9c37e72 34f51a1 9c37e72 c75cc74 9c37e72 9531d63 9c37e72 c75cc74 9c37e72 6647ca5 9c37e72 c75cc74 9c37e72 9531d63 6647ca5 9531d63 c75cc74 6647ca5 9531d63 0cb4c76 c75cc74 0cb4c76 adb2b76 9c37e72 9222805 9c37e72 991d44a 9c37e72 c42698d 9c37e72 dba2773 c95ac40 9c37e72 c95ac40 9c37e72 c95ac40 1272866 fd37fd8 c95ac40 9c37e72 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 |
"""
#App: NLP App with Streamlit
Credits: Streamlit Team, Marc Skov Madsen(For Awesome-streamlit gallery)
Description
This is a Natural Language Processing (NLP) based app that demonstrates basic NLP concepts, such as the following:
+ Tokenization & Lemmatization using Spacy
+ Named Entity Recognition(NER) using SpaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using Gensim/T5 for both Bangla and English
This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.
Purpose
To perform basic and useful NLP tasks with Streamlit, spaCy, TextBlob and Gensim
"""
# Core Pkgs
import os
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')
#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
import streamlit as st
import websockets
import pyaudio
from configure import api_key
import json
import asyncio
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
# NLP Pkgs
from textblob import TextBlob
import spacy
from gensim.summarization import summarize
import requests
import cv2
import numpy as np
import pytesseract
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
# Title: rendered at module level, before main() draws the rest of the page.
st.title("Streamlit NLP APP")
@st.experimental_singleton
def text_analyzer(my_text):
    """Tokenize ``my_text`` with spaCy and pair every token with its lemma.

    Args:
        my_text: Raw text to run through the ``en_core_web_sm`` pipeline.

    Returns:
        A list with one formatted string per token, holding the token text
        and its lemma.
    """
    row_template = '"Token":{},\n"Lemma":{}'
    pipeline = spacy.load('en_core_web_sm')
    parsed = pipeline(my_text)
    rows = []
    for tok in parsed:
        rows.append(row_template.format(tok.text, tok.lemma_))
    return rows
@st.experimental_singleton
def load_models():
    """Load the tokenizer and language model used for text generation.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the ``gpt2-large``
        checkpoint.
    """
    checkpoint = 'gpt2-large'
    gpt2_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    gpt2_model = GPT2LMHeadModel.from_pretrained(checkpoint)
    return gpt2_tokenizer, gpt2_model
# Function For Extracting Entities
@st.experimental_singleton
def entity_analyzer(my_text):
    """Extract the tokens and named entities found in ``my_text``.

    Args:
        my_text: Raw text to run through the ``en_core_web_sm`` pipeline.

    Returns:
        A single-element list whose string contains the list of token texts
        and the list of ``(entity text, entity label)`` pairs.
    """
    parsed = spacy.load('en_core_web_sm')(my_text)
    words = []
    for tok in parsed:
        words.append(tok.text)
    labelled = []
    for ent in parsed.ents:
        labelled.append((ent.text, ent.label_))
    report = '"Token":{},\n"Entities":{}'.format(words, labelled)
    return [report]
def main():
    """Render the Streamlit NLP app.

    The page is built top to bottom in this order:

    1. A description, a "Say something" button and a free-text input.
    2. Microphone capture setup (PyAudio) and the definition of the
       real-time transcription coroutine.
    3. Optional sections toggled by checkboxes: named entities, sentiment,
       spell correction and GPT-2 text generation, all applied to the
       free-text input.
    4. A summary section whose text comes from an uploaded image, a camera
       photo (both read with Tesseract OCR) or a second text box, and is
       summarized with Gensim or T5.
    5. Sidebar credits.
    """
    # Function-scope import: send() needs base64, which the module-level
    # import block does not bring in.
    import base64

    st.markdown("""
#### Description
This is a Natural Language Processing(NLP) Based App useful for basic NLP task
NER,Sentiment, Spell Corrections and Summarization
""")

    # First run of a session: no transcript yet and not listening.
    if "text" not in st.session_state:
        st.session_state["text"] = ""
        st.session_state["run"] = False

    def start_listening():
        """Button callback: flag that transcription should run."""
        st.session_state["run"] = True

    st.button("Say something", on_click=start_listening)
    # Pre-filled with the last stored transcript, if any.
    text = st.text_input("What should I create?", value=st.session_state["text"])

    URL = "wss://api.assemblyai.com/v2/realtime/ws?sample_rate=16000"
    FRAMES_PER_BUFFER = 3200
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    p = pyaudio.PyAudio()
    # starts recording
    # NOTE(review): a new stream is opened every time main() runs and nothing
    # in this function closes it.
    stream = p.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=FRAMES_PER_BUFFER
    )

    async def send_receive():
        """Send microphone audio over the websocket and store the final transcript."""
        print(f'Connecting websocket to url {URL}')
        async with websockets.connect(
            URL,
            extra_headers=(("Authorization", api_key),),
            ping_interval=5,
            ping_timeout=20
        ) as _ws:
            await asyncio.sleep(0.1)
            print("Receiving Session begins ...")
            # The first message on the socket is read and discarded.
            await _ws.recv()

            async def send():
                """Forward base64-encoded audio frames while the run flag is set."""
                while st.session_state['run']:
                    try:
                        data = stream.read(FRAMES_PER_BUFFER)
                        data = base64.b64encode(data).decode("utf-8")
                        json_data = json.dumps({"audio_data": str(data)})
                        await _ws.send(json_data)
                    except websockets.exceptions.ConnectionClosedError as e:
                        print(e)
                        assert e.code == 4008
                        break
                    except Exception as e:
                        print(e)
                        assert False, "Not a websocket 4008 error"
                    await asyncio.sleep(0.01)

            async def receive():
                """Read transcripts; on a final one, store it and rerun the script."""
                while st.session_state['run']:
                    try:
                        # Parse each message once and reuse the decoded payload.
                        payload = json.loads(await _ws.recv())
                        result = payload['text']
                        if payload['message_type'] == 'FinalTranscript':
                            result = result.replace('.', '').replace('!', '')
                            st.session_state['text'] = result
                            st.session_state['run'] = False
                            # Rerun so the text input above shows the transcript.
                            st.experimental_rerun()
                    except websockets.exceptions.ConnectionClosedError as e:
                        print(e)
                        assert e.code == 4008
                        break
                    except Exception as e:
                        print(e)
                        assert False, "Not a websocket 4008 error"

            await asyncio.gather(send(), receive())

    # NOTE(review): send_receive() is defined above but never run inside
    # main(), so "Say something" currently only sets the run flag.

    # Entity Extraction
    if st.checkbox("Show Named Entities"):
        st.subheader("Analyze Your Text")
        if st.button("Extract"):
            entity_result = entity_analyzer(text)
            st.json(entity_result)

    # Sentiment Analysis
    if st.checkbox("Show Sentiment Analysis"):
        st.subheader("Analyse Your Text")
        if st.button("Analyze"):
            blob = TextBlob(text)
            result_sentiment = blob.sentiment
            st.success(result_sentiment)

    # Text Corrections
    if st.checkbox("Spell Corrections"):
        st.subheader("Correct Your Text")
        if st.button("Spell Corrections"):
            st.text("Using TextBlob ..")
            st.success(TextBlob(text).correct())

    # Text Generation
    if st.checkbox("Text Generation"):
        st.subheader("Generate Text")
        ok = st.button("Generate")
        tokenizer, model = load_models()
        if ok:
            input_ids = tokenizer(text, return_tensors='pt').input_ids
            st.text("Using Hugging Face Transformer, Contrastive Search ..")
            output = model.generate(input_ids, max_length=128)
            st.success(tokenizer.decode(output[0], skip_special_tokens=True))

    def change_photo_state():
        """Widget callback: remember that a photo was taken or uploaded."""
        st.session_state["photo"] = "done"

    def ocr_photo(photo, bangla_label):
        """Write ``photo`` to img.png, OCR it, display the text and return it.

        ``bangla_label`` is the label of the checkbox that switches Tesseract
        to the Bangla ("ben") language data.
        """
        Image.open(photo).save("img.png")
        img = cv2.imread("img.png")
        if st.checkbox(bangla_label):
            extracted = pytesseract.image_to_string(img, lang="ben")
        else:
            extracted = pytesseract.image_to_string(img)
        st.success(extracted)
        return extracted

    st.subheader("Summary section, feed your image!")
    camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
    uploaded_photo = st.file_uploader("Upload Image, Containing English or Bangla texts", type=['jpg', 'png', 'jpeg'], on_change=change_photo_state)
    message = st.text_input("Or, drop your text here, only English text!")

    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    if st.session_state["photo"] == "done" or message:
        # If both images are present, the camera photo's text wins because it
        # is processed last.
        if uploaded_photo:
            text = ocr_photo(uploaded_photo, "Mark here to see in Bangla for Bangla Images only")
        if camera_photo:
            text = ocr_photo(camera_photo, "Mark here to see Bangla")
        if uploaded_photo is None and camera_photo is None:
            text = message

        # Summarization
        if st.checkbox("Mark here, Text Summarization for English or Bangla!"):
            try:
                summary_result = summarize(text)
            except ValueError as err:
                # Gensim raises ValueError for input with only one sentence;
                # show the reason instead of crashing the page.
                st.warning(str(err))
            else:
                st.success(summary_result)
        elif st.checkbox("Mark here, Better Text Summarization for English only!"):
            # NOTE(review): T5 is loaded inline on each pass through this
            # branch, unlike GPT-2, which goes through load_models().
            tokenizer = AutoTokenizer.from_pretrained('t5-base')
            model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
            inputs = tokenizer.encode("summarize: " + text,
                                      return_tensors='pt',
                                      max_length=512,
                                      truncation=True)
            summary_ids = model.generate(inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2)
            summary = tokenizer.decode(summary_ids[0])
            st.success(summary)

    st.sidebar.subheader("About App")
    st.sidebar.subheader("By")
    st.sidebar.text("Soumen Sarker")
# Build the page only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|