# File: story_point_estimator/gerar_modelos/gerar_modelos_tfidfsvm.py
# Residue from the repository web view (not part of the script):
#   giseldo's picture
#   ultima mensagem
#   1549063
#   raw
#   history blame
#   856 Bytes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
import pandas as pd
import nltk
from nltk.corpus import stopwords
from joblib import dump
# Load the training data.
df = pd.read_csv("dataset/ALOY_deep-se.csv")
# Drop the first 5 rows. The explicit copy makes `df` an independent frame,
# so the column assignments below do not write into a slice of the original
# (which can trigger pandas' SettingWithCopyWarning).
df = df.iloc[5:].copy()
# Build the "context" column = title + description.
# - fillna(""): a missing title or description would otherwise make the whole
#   concatenation NaN, and the .split() call below would fail on a float.
# - " " separator: without it the last word of the title is fused with the
#   first word of the description into a single bogus token.
df["context"] = df["title"].fillna("") + " " + df["description"].fillna("")
# Pre-processing: strip English stop words from the context text.
nltk.download('stopwords')
stop = stopwords.words('english')
# Set built once so each membership test is O(1) instead of a list scan.
stop_set = set(stop)
df['context'] = df['context'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_set)
)
# Feature extraction: fit a TF-IDF vectorizer on the "context" column and
# transform that same column into the feature matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["context"])
y = df["storypoint"]
# Model: support-vector regressor trained on the TF-IDF features
# (fit() returns the fitted estimator itself).
model = svm.SVR().fit(X, y)
# Persist the fitted vectorizer and the trained model, in that order.
for artifact, target in (
    (vectorizer, "model/vectorizer_tfidf.pkl"),
    (model, "model/model_tawos_aloy_tfidfsvm.pkl"),
):
    dump(artifact, target)