Spaces:
Runtime error
Runtime error
Ambareesh T N
committed on
Commit
·
c7baaec
1
Parent(s):
c2e5ccb
Add application file
Browse files- app.py.py +161 -0
- requirements.txt +7 -0
app.py.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""Untitled3.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colaboratory.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/18DTgeDomshKNQMgYQ6y6mJbBom9mRw5l
|
8 |
+
"""
|
9 |
+
|
10 |
+
# Commented out IPython magic to ensure Python compatibility.
|
11 |
+
# %%writefile app.py
|
12 |
+
# %%writefile 'app.py'
|
13 |
+
import nltk
|
14 |
+
import math
|
15 |
+
import torch
|
16 |
+
# from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
17 |
+
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
18 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
|
19 |
+
from nltk.tokenize import word_tokenize, sent_tokenize
|
20 |
+
from nltk.corpus import stopwords
|
21 |
+
from collections import Counter
|
22 |
+
from flair.data import Sentence
|
23 |
+
from flair.models import SequenceTagger
|
24 |
+
# Fetch the NLTK data used below: the stop-word list read by
# keyword_identify and the 'punkt' tokenizer models.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

import streamlit as st

# The page layout is configured before any other Streamlit call in this script.
st.set_page_config(layout="wide")
|
29 |
+
|
30 |
+
|
31 |
+
|
32 |
+
def divide_sentence(sentence):
    """Split *sentence* into sub-sentences at a fixed set of boundary words.

    The text is tokenized with ``nltk.word_tokenize``. Every token whose
    lowercase form is a boundary word closes the current sub-sentence and is
    itself dropped; the remaining tokens of each sub-sentence are re-joined
    with single spaces.

    Args:
        sentence: Raw input text.

    Returns:
        A list of non-empty sub-sentence strings in input order. The list is
        empty when the input contains no tokens other than boundary words.
    """
    # NOTE(review): 'the' and 'i' are not conjunctions; they are kept as split
    # points to preserve the existing splitting behaviour - confirm intent.
    boundary_words = {
        "and", "but", "or", "however", "therefore",
        "furthermore", "nevertheless", "the", "i",
    }
    subsentences = []
    current_tokens = []
    for token in nltk.word_tokenize(sentence):
        if token.lower() in boundary_words:
            if current_tokens:
                subsentences.append(" ".join(current_tokens))
                current_tokens = []
        else:
            current_tokens.append(token)
    # Flush the trailing run only when it holds tokens. An unconditional
    # append would emit an empty string for empty input or for text that ends
    # on a boundary word, and that empty string would be passed to every model.
    if current_tokens:
        subsentences.append(" ".join(current_tokens))
    return subsentences
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
+
def topic_identify(subsentences):
    """Tag each sub-sentence with the topics predicted by a multi-label model.

    Loads the ``cardiffnlp/twitter-roberta-base-dec2021-tweet-topic-multi-all``
    tokenizer and classifier, scores every sub-sentence, and keeps each label
    whose sigmoid-activated logit exceeds 0.5.

    Args:
        subsentences: Iterable of text strings to classify.

    Returns:
        A list with one string per input: the selected label names joined by
        ``','`` (an empty string when no label passes the threshold).
    """
    model_name = "cardiffnlp/twitter-roberta-base-dec2021-tweet-topic-multi-all"

    def sigmoid(x):
        # Branch on the sign so math.exp never receives a large positive
        # argument, which would raise OverflowError for very negative logits.
        if x >= 0:
            return 1 / (1 + math.exp(-x))
        z = math.exp(x)
        return z / (1 + z)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, problem_type="multi_label_classification"
    )
    model.eval()
    class_mapping = model.config.id2label

    topics = []
    for text in subsentences:
        # Truncate over-long input instead of letting the forward pass fail.
        # NOTE(review): 512 is the usual RoBERTa-base sequence limit - confirm
        # against this checkpoint's configuration.
        tokens = tokenizer(
            text, return_tensors='pt', truncation=True, max_length=512
        )
        with torch.no_grad():
            output = model(**tokens)
        # output[0] holds the logits; [0] selects the single item in the batch.
        logits = output[0][0].detach().tolist()
        selected = [
            class_mapping[index]
            for index, logit in enumerate(logits)
            if sigmoid(logit) > 0.5
        ]
        topics.append(','.join(selected))
    return topics
|
70 |
+
|
71 |
+
|
72 |
+
def sentiment_score(subsentences):
    """Label each sub-sentence with its predicted sentiment and score.

    Builds a ``sentiment-analysis`` pipeline from the
    ``cardiffnlp/twitter-roberta-base-sentiment-latest`` checkpoint and runs
    it on every sub-sentence in turn.

    Args:
        subsentences: Iterable of text strings to classify.

    Returns:
        A list with one ``'<label> , <score>'`` string per input.
    """
    from transformers import pipeline

    checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    sentiment_task = pipeline(
        "sentiment-analysis", model=model, tokenizer=tokenizer
    )

    def describe(text):
        # The pipeline answers with a one-element list such as
        # [{'label': 'positive', 'score': 0.9484752416610718}].
        best = sentiment_task(text)[0]
        return f"{best['label']} , {best['score']}"

    return [describe(text) for text in subsentences]
|
84 |
+
|
85 |
+
|
86 |
+
|
87 |
+
def intent_identify(subsentences):
    """Label each sub-sentence with its predicted intent and score.

    Wraps the ``cartesinus/fedcsis-intent_baseline-xlm_r-en`` checkpoint in a
    ``TextClassificationPipeline`` and runs it on every sub-sentence in turn.

    Args:
        subsentences: Iterable of text strings to classify.

    Returns:
        A list with one ``'<label> , <score>'`` string per input.
    """
    checkpoint = 'cartesinus/fedcsis-intent_baseline-xlm_r-en'
    classifier = TextClassificationPipeline(
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )

    intents = []
    for text in subsentences:
        # Only the first entry of the classifier's result list is reported.
        best = classifier(text)[0]
        intents.append(f"{best['label']} , {best['score']}")
    return intents
|
98 |
+
|
99 |
+
|
100 |
+
|
101 |
+
def entity_identify(subsentences):
    """List the named entities found in each sub-sentence.

    Loads flair's ``'ner'`` sequence tagger once, then tags every
    sub-sentence and collects the text of each ``'ner'`` span.

    Args:
        subsentences: Iterable of text strings to analyse.

    Returns:
        A list with one string per input: the entity texts joined by ``','``
        (an empty string when no span is found).
    """
    ner_tagger = SequenceTagger.load('ner')

    def entities_in(text):
        flair_sentence = Sentence(text)
        # predict() annotates the Sentence object; spans are read back from it.
        ner_tagger.predict(flair_sentence)
        return ','.join(
            span.text for span in flair_sentence.get_spans('ner')
        )

    return [entities_in(text) for text in subsentences]
|
116 |
+
|
117 |
+
|
118 |
+
|
119 |
+
def keyword_identify(subsentences, top_n=2):
    """Pick the most frequent non-stop-word tokens of each sub-sentence.

    Each sub-sentence is sentence- and word-tokenized with NLTK. Tokens are
    lowercased, and tokens that are not purely alphabetic or that appear in
    NLTK's English stop-word list are discarded. The remaining words are
    ranked by frequency.

    Args:
        subsentences: Iterable of text strings to analyse.
        top_n: Maximum number of keywords kept per sub-sentence. Defaults to
            2, which is the number this function returned before the
            parameter existed.

    Returns:
        A list with one string per input: up to ``top_n`` keywords joined by
        ``','``, most frequent first. Words with equal counts keep the order
        in which they first occur.
    """
    stop_words = set(stopwords.words('english'))

    def extract_keywords(text):
        words = [
            word.lower()
            for chunk in sent_tokenize(text)
            for word in word_tokenize(chunk)
            if word.isalpha() and word.lower() not in stop_words
        ]
        return [word for word, _count in Counter(words).most_common(top_n)]

    return [','.join(extract_keywords(text)) for text in subsentences]
|
140 |
+
# Page banner.
st.markdown("<h1 style='text-align: center; color: white; background : grey'>Process Fest</h1>", unsafe_allow_html=True)
import pandas as pd
import numpy as np

sent = st.text_input(label = 'Enter the Text:')
button = st.button('submit')
# Example input:
# sent = "The stay at AAA was good The food was not that bad but the service was very bad and I prefer BBB than AAA I’ll raise a complaint against AAA"
if button:
    if not sent.strip():
        # Skip loading and running every model when there is no text.
        st.warning('Please enter some text before submitting.')
    else:
        subsentences = divide_sentence(sent)
        topic = topic_identify(subsentences)
        sentiment = sentiment_score(subsentences)
        intent = intent_identify(subsentences)
        entity = entity_identify(subsentences)
        keyword = keyword_identify(subsentences)
        # One row per sub-sentence. The topic column was previously computed
        # but never displayed.
        df = pd.DataFrame(
            {
                'subsentences': subsentences,
                'topic': topic,
                'sentiment and score': sentiment,
                'intent': intent,
                'entity' : entity,
                'keyword' : keyword
            })
        st.dataframe(data=df, width=None, height=None,use_container_width=False)
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
nltk == 3.7
|
2 |
+
torch == 1.13.1
|
3 |
+
transformers == 4.25.1
|
4 |
+
flair == 0.12.1
|
5 |
+
streamlit
|
6 |
+
pandas
|
7 |
+
numpy
|