# Web-page status header captured together with the source ("Spaces: Sleeping, Sleeping"); not part of the program.
import os
import re
import string

import joblib
import nltk
from flask import Flask, request, render_template
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
# Fetch the NLTK stopword corpus only when it is not installed yet, so that a
# regular startup does not have to contact the download server.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# The Flask application object that the view function and entry point use.
app = Flask(__name__)
# app.config['APPLICATION_ROOT'] = '/klasifikasi-berita'
# Load pre-trained model and vectorizer
def load_model(model_path="logistic_regression_model.pkl",
               vectorizer_path="content_vectorizer.pkl"):
    """Load the classifier and the text vectorizer from disk.

    Args:
        model_path: File to pass to ``joblib.load`` for the classifier.
            Defaults to the file name the application has always used.
        vectorizer_path: File to pass to ``joblib.load`` for the vectorizer.
            Defaults to the file name the application has always used.

    Returns:
        A ``(model, vectorizer)`` tuple holding the two loaded objects.

    Note:
        Relative paths are resolved against the current working directory.
        ``joblib.load`` unpickles its input, so only load files that come
        from a trusted source.
    """
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)
    return model, vectorizer
model, vectorizer = load_model() # Loaded once at import time; home() passes these module-level objects to classify_news
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the default (Indonesian) stopword set.
_DEFAULT_STOP_WORDS = None


def _indonesian_stop_words():
    """Return the NLTK Indonesian stopwords as a set, reading the corpus once."""
    global _DEFAULT_STOP_WORDS
    if _DEFAULT_STOP_WORDS is None:
        _DEFAULT_STOP_WORDS = frozenset(stopwords.words('indonesian'))
    return _DEFAULT_STOP_WORDS


# Function to clean the input string
def clean_string(text, stop_words=None):
    """Normalise raw text before stemming and vectorisation.

    The steps run in a fixed order: lowercase, replace line breaks with
    spaces, delete ASCII punctuation, delete digit runs, collapse whitespace,
    replace non-ASCII runs with a space, and finally drop stopwords.

    Args:
        text: The raw input string.
        stop_words: Optional container of words to drop. When ``None`` the
            NLTK Indonesian stopword list is used (loaded once and cached).

    Returns:
        The remaining words joined by single spaces; may be an empty string.
    """
    if stop_words is None:
        stop_words = _indonesian_stop_words()

    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # May leave doubled spaces behind; the split() below absorbs them.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return ' '.join(word for word in text.split() if word not in stop_words)
# Lazily created Sastrawi stemmer, reused across calls so that the factory is
# not rebuilt for every request.
_DEFAULT_STEMMER = None


def _get_default_stemmer():
    """Return the shared Sastrawi stemmer, creating it on first use."""
    global _DEFAULT_STEMMER
    if _DEFAULT_STEMMER is None:
        _DEFAULT_STEMMER = StemmerFactory().create_stemmer()
    return _DEFAULT_STEMMER


# Function to stem the input string using Sastrawi
def sastrawi_stemmer(text, stemmer=None):
    """Stem every whitespace-separated word of ``text``.

    Args:
        text: The string whose words should be stemmed.
        stemmer: Optional object with a ``stem(word)`` method. When ``None``
            a Sastrawi stemmer is created once and reused.

    Returns:
        The stemmed words joined by single spaces.
    """
    if stemmer is None:
        stemmer = _get_default_stemmer()
    # The earlier `if word in text` filter was always true (each word is a
    # substring of the text it was split from), and the tqdm wrapper only
    # wrote a progress bar to stderr on every call; both are dropped.
    return ' '.join(stemmer.stem(word) for word in text.split())
# Function to classify news article
def classify_news(text, model, vectorizer):
    """Run one article through preprocessing and the classifier.

    Args:
        text: Raw article text.
        model: Object providing ``predict`` and ``predict_proba``.
        vectorizer: Object providing ``transform`` for a list of strings.

    Returns:
        A tuple ``(label, probabilities)``: the first element of the
        ``predict`` output and the first element of the ``predict_proba``
        output for the single input document.
    """
    # Preprocess: clean first, then stem the cleaned text.
    prepared = sastrawi_stemmer(clean_string(text))
    # The vectorizer is given a one-element batch.
    features = vectorizer.transform([prepared])
    labels = model.predict(features)
    scores = model.predict_proba(features)
    return labels[0], scores[0]
# Display names for the label values the view recognises; any other label
# leaves the category name as None.
_CATEGORY_NAMES = {0: "Ekonomi", 1: "Politik"}


# Flask route for the main page
@app.route("/", methods=["GET", "POST"])
def home():
    """Render the main page and, on POST, classify the submitted text.

    On GET the template is rendered with no result. On POST the form field
    ``news_text`` is read; if it contains anything other than whitespace it
    is classified and the category name and probabilities are passed to the
    template.

    Returns:
        The rendered ``index.html`` response.
    """
    category_name = None
    probabilities = None
    user_input = ""

    if request.method == "POST":
        # .get() avoids an automatic 400 response when the field is absent.
        user_input = request.form.get("news_text", "")
        if user_input.strip():
            category, probabilities = classify_news(user_input, model, vectorizer)
            category_name = _CATEGORY_NAMES.get(category)

    return render_template(
        "index.html",
        category_name=category_name,
        probabilities=probabilities,
        user_input=user_input,
    )
# Run the Flask app
if __name__ == "__main__":
    # The interactive debugger allows code execution from the browser, so it
    # is opt-in: set FLASK_DEBUG=1 (or true/yes/on) to enable it locally.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)