# story_point_estimator / gerar_modelos /gerar_modelos_neosp.py
# giseldo's picture
# ultima mensagem
# 1549063
# raw
# history blame
# 1.84 kB
import os

import nltk
import pandas as pd
import textstat
from joblib import dump
from nltk.corpus import stopwords
from sklearn import svm
from textblob import TextBlob
# Train an SVR story-point estimator on the ALOY issue dataset and save it.
# Features are textstat readability metrics plus TextBlob sentiment scores,
# all computed over each issue's title and description.

# Readability metrics to extract. Each name is both the textstat function
# that is called and the DataFrame column that stores its result.
READABILITY_FEATURES = [
    "gunning_fog",
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "smog_index",
    "coleman_liau_index",
    "automated_readability_index",
    "dale_chall_readability_score",
    "difficult_words",
    "linsear_write_formula",
]
SENTIMENT_FEATURES = ["polarity", "subjectivity"]
MODEL_PATH = "model/model_tawos_aloy_neosp.pkl"

# Load the data
df = pd.read_csv("dataset/ALOY_deep-se.csv")

# Drop the first 5 rows. The explicit copy makes the column assignments
# below write to an independent frame instead of a slice of the loaded one.
df = df.iloc[5:].copy()

# Build the context column = title + description.
# Missing values are replaced by empty strings: otherwise the row's context
# would be NaN (a float) and the .split() call below would raise.
# The space separator keeps the last word of the title from being fused with
# the first word of the description, which would distort the text metrics.
# NOTE(review): code that feeds this model at prediction time should build
# the context the same way - confirm against the caller.
df["context"] = df["title"].fillna("") + " " + df["description"].fillna("")

# Pre-processing: remove English stop words (case-sensitive match on
# whitespace-separated tokens; the join also collapses runs of whitespace).
nltk.download('stopwords')
stop = stopwords.words('english')
stop_words = set(stop)  # set membership instead of scanning the list per word
df['context'] = df['context'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

# Feature extraction: readability metrics
for feature_name in READABILITY_FEATURES:
    df[feature_name] = df['context'].apply(getattr(textstat, feature_name))

# Feature extraction: sentiment. Each text is analysed once and both scores
# are read from the same result.
sentiments = [TextBlob(text).sentiment for text in df["context"]]
df["polarity"] = [sentiment.polarity for sentiment in sentiments]
df["subjectivity"] = [sentiment.subjectivity for sentiment in sentiments]

# Feature matrix (column order: readability metrics, then sentiment) and target
X = df[READABILITY_FEATURES + SENTIMENT_FEATURES]
y = df["storypoint"]

# SVR model
model = svm.SVR()
model.fit(X, y)

# Make sure the output directory exists so the trained model is not lost to
# a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
dump(model, MODEL_PATH)