|
import nltk
|
|
import pandas as pd
|
|
from nltk.corpus import stopwords
|
|
import re
|
|
import numpy as np
|
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
|
from scipy.special import softmax
|
|
|
|
class sentimentAnalysis():
    """Line-by-line sentiment analysis of a UTF-8 text file.

    English text is scored with CardiffNLP's Twitter RoBERTa model
    (negative/neutral/positive); Turkish text with savasy's Turkish BERT
    pipeline (positive/negative). All results are printed to stdout.
    """

    # Accepted spellings for each supported language selector
    # (kept byte-identical to the originally accepted values).
    _ENGLISH_NAMES = {"English", "İngilizce", "ingilizce", "english"}
    _TURKISH_NAMES = {"Turkish", "Türkçe", "türkçe", "turkish"}

    def __init__(self, lang, text2analysePath):
        """
        :param lang: language selector, e.g. "English" or "Turkish"
            (Turkish spellings also accepted).
        :param text2analysePath: path to a UTF-8 text file, one text per line.
        """
        self.lang = lang
        self.text2analysePath = text2analysePath
        # Label order matches the CardiffNLP model's output logit order.
        self.engLabels = ["negative", "neutral", "positive"]
        # Stopword corpora are needed by both language pipelines.
        nltk.download("stopwords")

    def downloadModels(self):
        """Load the model for the selected language and analyse the file.

        Dispatches to the English or Turkish pipeline. Prints an error
        message when the language selector is not recognised.

        BUGFIX: the input file is now opened with a context manager inside
        the matching branch — previously the handle was opened up front,
        never closed, and leaked even when the language was unknown.
        """
        if self.lang in self._ENGLISH_NAMES:
            # Local cache directory for the CardiffNLP sentiment model.
            model_dir = "sentimentModels/cardiffnlp/twitter-roberta-base-sentiment"
            self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_dir)
            # Persist locally so later runs do not re-download.
            self.model.save_pretrained(model_dir)
            self.tokenizer.save_pretrained(model_dir)
            with open(self.text2analysePath, 'r', encoding="utf-8") as txtt:
                self.engPrepareText(txtt)
        elif self.lang in self._TURKISH_NAMES:
            self.model = AutoModelForSequenceClassification.from_pretrained(
                "savasy/bert-base-turkish-sentiment-cased")
            self.tokenizer = AutoTokenizer.from_pretrained(
                "savasy/bert-base-turkish-sentiment-cased")
            self.sa = pipeline("sentiment-analysis",
                               tokenizer=self.tokenizer, model=self.model)
            with open(self.text2analysePath, 'r', encoding="utf-8") as txtt:
                self.trPrepareText(txtt)
        else:
            print("Dil bulunamadı!------The language has not been found!")

    def _prepareLines(self, txtt, stopword_lang):
        """Lowercase each line, strip punctuation, and drop stopwords.

        Shared by the English and Turkish paths (previously duplicated).

        :param txtt: open text-file handle, one text per line.
        :param stopword_lang: NLTK stopword corpus name ("english"/"turkish").
        :return: list of cleaned lines, one per input line.
        """
        # PERF: hoisted out of the loops — the original rebuilt this set
        # for every single word, making cleaning quadratic.
        stops = set(stopwords.words(stopword_lang))
        cleaned = []
        for line in txtt:
            line = line.lower()
            # Keep only alphanumerics (incl. Turkish letters); rest -> space.
            line = re.sub("[^a-zA-Z0-9ğüşöçıİĞÜŞÖÇ]", ' ', line)
            words = [w for w in line.split(' ') if w not in stops]
            cleaned.append(' '.join(words))
        return cleaned

    def engPrepareText(self, txtt):
        """Clean English lines from *txtt* and run the English analysis."""
        dFen = pd.DataFrame(self._prepareLines(txtt, "english"),
                            columns=["texts"])
        self.engAnalyse(dFen)

    def trPrepareText(self, txtt):
        """Clean Turkish lines from *txtt* and run the Turkish analysis."""
        dFtr = pd.DataFrame(self._prepareLines(txtt, "turkish"),
                            columns=["metinler"])
        self.trAnalyse(dFtr)

    def engAnalyse(self, dFen):
        """Print ranked 3-class sentiment scores for every English text.

        :param dFen: DataFrame with a "texts" column of cleaned lines.
        """
        for row in range(len(dFen)):
            text = dFen["texts"][row]
            encoded_input = self.tokenizer(text, return_tensors='pt')
            output = self.model(**encoded_input)
            # Logits -> probabilities.
            scores = softmax(output[0][0].detach().numpy())
            # Class indices from most to least probable.
            ranking = np.argsort(scores)[::-1]
            print(f"text: {text}")
            # FIX: inner index renamed — it used to shadow the outer `i`.
            for rank in range(scores.shape[0]):
                l = self.engLabels[ranking[rank]]
                s = scores[ranking[rank]]
                print(f"{rank + 1}) {l + ':'} {np.round(float(s), 4)}")

    def trAnalyse(self, dFtr):
        """Print positive/negative probabilities for every Turkish text.

        The pipeline returns a single label with its score; the other
        class probability is reported as 1 - score.

        :param dFtr: DataFrame with a "metinler" column of cleaned lines.
        """
        for row in range(len(dFtr)):
            text = dFtr["metinler"][row]
            p = self.sa(text)[0]
            # Map the single pipeline score onto both classes.
            if p["label"] == "positive":
                pos, neg = p["score"], 1 - p["score"]
            else:
                pos, neg = 1 - p["score"], p["score"]
            print(f"text: {text}")
            print(f"1-) positive: {np.round(float(pos), 4)}")
            print(f"2-) negative: {np.round(float(neg), 4)}")
|
|
|
|
|
|
|
|
# Script entry point: analyse a demo file in English.
# FIX: guarded with __main__ so importing this module no longer triggers
# model downloads; the useless `sA = ...` binding (downloadModels returns
# None) is dropped — the method is called purely for its printed output.
if __name__ == "__main__":
    lang = "ingilizce"
    path = "texts/denemeler/text.txt"
    sentimentAnalysis(lang, path).downloadModels()
|
|
|
|
|