Spaces:

wchynto
/

klasifikasi-berita

Sleeping

App Files Files Community

klasifikasi-berita / app.py

wchynto

init

e2d5ce2 22 days ago

raw

history blame contribute delete

2.92 kB

	from flask import Flask, request, render_template
	import joblib
	import re
	import string
	from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
	from sklearn.feature_extraction.text import TfidfVectorizer
	from tqdm import tqdm
	from nltk.corpus import stopwords
	import nltk

	# Runs at import time: asks NLTK to fetch the 'stopwords' corpus, which
	# clean_string() reads via stopwords.words('indonesian').
	nltk.download('stopwords')

	# Flask application object; the "/" route further down is registered on it.
	app = Flask(__name__)
	# app.config['APPLICATION_ROOT'] = '/klasifikasi-berita'

	# Load pre-trained model and vectorizer
	def load_model():
	    """Read the persisted classifier and vectorizer from disk.

	    Both files are opened with ``joblib.load`` using relative paths, so they
	    are resolved against the current working directory.

	    Returns:
	        A ``(model, vectorizer)`` tuple, in that order.
	    """
	    # Order matters: the classifier file is read first, then the vectorizer.
	    artifact_paths = ("logistic_regression_model.pkl", "content_vectorizer.pkl")
	    classifier, text_vectorizer = (joblib.load(path) for path in artifact_paths)
	    return classifier, text_vectorizer

	# Module-level objects: home() passes these to classify_news() on each
	# request instead of reloading the files.
	model, vectorizer = load_model() # Load model and vectorizer once on startup

	# Function to clean the input string
	def clean_string(text):
	    """Normalize raw text into space-separated tokens with stopwords removed.

	    The steps run in this fixed order:
	      1. lowercase the text
	      2. replace newlines with spaces
	      3. delete every character in ``string.punctuation``
	      4. delete digits
	      5. collapse whitespace runs to one space and trim both ends
	      6. replace each run of non-ASCII characters with a space
	      7. drop tokens found in NLTK's 'indonesian' stopword list

	    Args:
	        text: The string to clean.

	    Returns:
	        The surviving tokens joined by single spaces.
	    """
	    lowered = text.lower()
	    single_line = re.sub(r'\n', ' ', lowered)
	    # Deleting punctuation joins the neighbours: "a,b" becomes "ab".
	    without_punct = single_line.translate(str.maketrans('', '', string.punctuation))
	    without_digits = re.sub(r'\d+', '', without_punct)
	    squeezed = re.sub(r'\s+', ' ', without_digits).strip()
	    # This can reintroduce repeated spaces; split() below absorbs them.
	    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', squeezed)

	    indonesian_stopwords = set(stopwords.words('indonesian'))
	    return ' '.join(
	        token for token in ascii_only.split() if token not in indonesian_stopwords
	    )

	# Function to stem the input string using Sastrawi
	def sastrawi_stemmer(text):
	    """Stem each whitespace-separated word of *text* with Sastrawi.

	    A new stemmer is built from ``StemmerFactory`` on every call.

	    Args:
	        text: The text to stem; it is split on whitespace.

	    Returns:
	        The stemmed words joined by single spaces (an empty string when
	        *text* contains no words).
	    """
	    stemmer = StemmerFactory().create_stemmer()
	    # Every item yielded by text.split() is kept; no per-word filtering is
	    # applied before stemming.
	    # NOTE(review): tqdm wraps the word list, so each call also drives a
	    # progress bar -- confirm that is wanted when called from a request.
	    return ' '.join(stemmer.stem(word) for word in tqdm(text.split()))

	# Function to classify news article
	def classify_news(text, model, vectorizer):
	    """Run one piece of text through preprocessing, vectorization and the model.

	    Args:
	        text: Raw input text.
	        model: Object providing ``predict`` and ``predict_proba``.
	        vectorizer: Object providing ``transform``; it is given a one-item
	            list holding the preprocessed text.

	    Returns:
	        A ``(label, probabilities)`` tuple: the first element of
	        ``model.predict(...)`` and the first element of
	        ``model.predict_proba(...)``.
	    """
	    # Preprocess: clean first, then stem the cleaned text.
	    prepared = sastrawi_stemmer(clean_string(text))

	    # A single document is still passed as a list.
	    features = vectorizer.transform([prepared])

	    labels = model.predict(features)
	    label_probabilities = model.predict_proba(features)

	    # Only one document went in, so take row 0 of each result.
	    return labels[0], label_probabilities[0]

	# Flask route for the main page
	@app.route("/", methods=["GET", "POST"])
	def home():
	    """Render the main page and, on a non-blank POST, classify the submitted text.

	    On POST the ``news_text`` form field is read. If it contains anything
	    other than whitespace it is classified with the module-level ``model``
	    and ``vectorizer``. Label 0 is shown as "Ekonomi" and label 1 as
	    "Politik"; any other label leaves the category name as ``None``.

	    Returns:
	        The rendered ``index.html`` template, given ``category_name``,
	        ``probabilities`` and ``user_input``.
	    """
	    category_name = probabilities = None
	    user_input = ""

	    submitted = request.method == "POST"
	    if submitted:
	        user_input = request.form["news_text"]

	    # Blank or whitespace-only submissions are echoed back unclassified.
	    if submitted and user_input.strip():
	        category, probabilities = classify_news(user_input, model, vectorizer)
	        # Numeric label -> display name; unknown labels map to None.
	        category_name = {0: "Ekonomi", 1: "Politik"}.get(category)

	    return render_template(
	        "index.html",
	        category_name=category_name,
	        probabilities=probabilities,
	        user_input=user_input,
	    )

	# Run the Flask app
	if __name__ == "__main__":
	    import os

	    # Debug mode stays on by default, as before, but can now be switched off
	    # without editing the code by setting FLASK_DEBUG to 0/false/no/off.
	    # NOTE(review): Flask's debug mode is meant for local development only;
	    # confirm this entry point is never what serves public traffic.
	    debug_setting = os.environ.get("FLASK_DEBUG", "1").strip().lower()
	    app.run(debug=debug_setting not in {"0", "false", "no", "off"})