# Hugging Face Spaces page header captured with this file ("Spaces: Running");
# it is not part of the script.
from pathlib import Path

import nltk
import pandas as pd
import textstat
from joblib import dump
from nltk.corpus import stopwords
from sklearn import svm
from textblob import TextBlob
# Train an SVR story-point estimator from readability and sentiment features
# computed over each issue's text, then persist the fitted model with joblib.

DATASET_PATH = "dataset/ALOY_deep-se.csv"
MODEL_PATH = "model/model_tawos_aloy_neosp.pkl"

# textstat functions applied to the text; each column is named after the
# function that produces it, so one list drives both extraction and selection.
READABILITY_FEATURES = [
    "gunning_fog",
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "smog_index",
    "coleman_liau_index",
    "automated_readability_index",
    "dale_chall_readability_score",
    "difficult_words",
    "linsear_write_formula",
]
SENTIMENT_FEATURES = ["polarity", "subjectivity"]

# Load the data.
df = pd.read_csv(DATASET_PATH)

# Drop the first 5 rows. The explicit copy makes the column assignments below
# operate on an independent frame rather than on a slice of the loaded one.
df = df.iloc[5:].copy()

# Build the "context" column = title + description. Missing values are
# replaced by empty strings first: a NaN in either field would otherwise make
# the whole context NaN and crash the str.split() call below.
# NOTE(review): the two fields are joined without a separator, as in the
# original script; any code that feeds this model must build its text the
# same way — confirm before changing.
df["context"] = df["title"].fillna("") + df["description"].fillna("")

# Pre-processing: remove English stop words (case-sensitive match).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
df["context"] = df["context"].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

# Feature extraction: readability metrics.
for feature in READABILITY_FEATURES:
    df[feature] = df["context"].apply(getattr(textstat, feature))

# Feature extraction: sentiment. Each text is analysed once and both values
# are read from the same result.
sentiments = [TextBlob(text).sentiment for text in df["context"]]
df["polarity"] = [sentiment.polarity for sentiment in sentiments]
df["subjectivity"] = [sentiment.subjectivity for sentiment in sentiments]

X = df[READABILITY_FEATURES + SENTIMENT_FEATURES]
y = df["storypoint"]

# SVR model.
model = svm.SVR()
model.fit(X, y)

# Create the output directory if needed so the save cannot fail after all the
# feature extraction and training work has already been done.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
dump(model, MODEL_PATH)