File size: 3,872 Bytes
a1e75ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import nltk
import pandas as pd
from nltk.corpus import stopwords
import re
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from scipy.special import softmax

class sentimentAnalysis():
    """Line-by-line sentiment analysis of a UTF-8 text file.

    Supports English (cardiffnlp twitter-roberta, 3-class) and Turkish
    (savasy BERT, binary). Results are printed to stdout; nothing is
    returned.
    """

    def __init__(self, lang, text2analysePath):
        # lang: language name, accepted in English or Turkish spelling
        #       (e.g. "english", "ingilizce", "turkish", "türkçe").
        # text2analysePath: path to a UTF-8 text file, one sample per line.
        self.lang = lang
        self.text2analysePath = text2analysePath
        self.engLabels = ["negative", "neutral", "positive"]
        nltk.download("stopwords")  # required by the prepare-text steps

    def downloadModels(self):
        """Load the model matching ``self.lang`` and analyse the input file."""
        # Context manager guarantees the file handle is closed; the original
        # opened the file and leaked the handle.
        with open(self.text2analysePath, 'r', encoding="utf-8") as txtt:
            # Explicit membership tuples instead of long `or` chains; the
            # spellings are kept byte-identical to the original comparisons.
            if self.lang in ("English", "İngilizce", "ingilizce", "english"):
                MODEL = "sentimentModels/cardiffnlp/twitter-roberta-base-sentiment"
                self.tokenizer = AutoTokenizer.from_pretrained(MODEL)
                self.model = AutoModelForSequenceClassification.from_pretrained(MODEL)
                self.model.save_pretrained(MODEL)
                self.tokenizer.save_pretrained(MODEL)
                self.engPrepareText(txtt)

            elif self.lang in ("Turkish", "Türkçe", "türkçe", "turkish"):
                self.model = AutoModelForSequenceClassification.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
                self.tokenizer = AutoTokenizer.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
                self.sa = pipeline("sentiment-analysis", tokenizer=self.tokenizer, model=self.model)
                self.trPrepareText(txtt)

            else:
                print("Dil bulunamadı!------The language has not been found!")

    def _cleanLines(self, txtt, stopLang):
        """Lower-case each line, strip non-alphanumerics, drop stop words.

        Shared by both language paths (the original duplicated this loop).
        Returns a list with one cleaned string per input line.
        """
        # Hoisted: the original rebuilt set(stopwords.words(...)) for every
        # single word, making cleanup accidentally quadratic.
        stops = set(stopwords.words(stopLang))
        cleaned = []
        for line in txtt:
            line = re.sub("[^a-zA-Z0-9ğüşöçıİĞÜŞÖÇ]", ' ', line.lower())
            cleaned.append(' '.join(w for w in line.split(' ') if w not in stops))
        return cleaned

    def engPrepareText(self, txtt):
        """Clean English lines and hand them to :meth:`engAnalyse`."""
        # DataFrame built once after cleanup; the original rebuilt it every
        # iteration and raised NameError on an empty input file.
        dFen = pd.DataFrame(self._cleanLines(txtt, "english"), columns=["texts"])
        self.engAnalyse(dFen)

    def trPrepareText(self, txtt):
        """Clean Turkish lines and hand them to :meth:`trAnalyse`."""
        dFtr = pd.DataFrame(self._cleanLines(txtt, "turkish"), columns=["metinler"])
        self.trAnalyse(dFtr)

    def engAnalyse(self, dFen):
        """Print 3-class softmax scores, best first, for each row of dFen."""
        for text in dFen["texts"]:
            encoded_input = self.tokenizer(text, return_tensors='pt')
            output = self.model(**encoded_input)
            scores = softmax(output[0][0].detach().numpy())
            ranking = np.argsort(scores)[::-1]  # highest score first
            print(f"text: {text}")
            # Distinct names here: the original reused `i` for both the row
            # loop and this ranking loop, shadowing the outer index.
            for pos, idx in enumerate(ranking, start=1):
                l = self.engLabels[idx]
                s = scores[idx]
                print(f"{pos}) {l + ':'} {np.round(float(s), 4)}")

    def trAnalyse(self, dFtr):
        """Print positive/negative scores for each row of dFtr.

        The Turkish model is binary: the losing class is reported as
        ``1 - score`` of the winning class.
        """
        for text in dFtr["metinler"]:
            p = self.sa(text)[0]
            if p["label"] == "positive":
                posScore, negScore = p["score"], 1 - p["score"]
            else:
                posScore, negScore = 1 - p["score"], p["score"]
            print(f"text: {text}")
            print(f"1-) positive: {np.round(float(posScore), 4)}")
            print(f"2-) negative: {np.round(float(negScore), 4)}")



if __name__ == "__main__":
    # Guarded entry point so importing this module no longer triggers model
    # downloads and analysis as a side effect.
    lang = "ingilizce"
    path = "texts/denemeler/text.txt"

    # Keep the instance instead of the method's return value: the original
    # `sA = sentimentAnalysis(...).downloadModels()` bound None, because
    # downloadModels() returns nothing.
    sA = sentimentAnalysis(lang, path)
    sA.downloadModels()