# Flask web app: classifies CNBC Indonesia articles as "News" vs "Research".
from flask import Flask, render_template, request
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import pickle
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
import nltk
# --- Module-level application setup ---

# Initialize the Flask application.
app = Flask(__name__)

# Load the trained classifier and its TF-IDF vectorizer.
# Fixed: the files were previously opened without being closed (resource leak).
# NOTE(review): pickle.load assumes these artifacts are trusted local files.
with open('model/lr_modelNormal.pkl', 'rb') as model_file:
    lr_model = pickle.load(model_file)
with open('model/tfidf_modelLatest.pkl', 'rb') as vectorizer_file:
    tfidf_model = pickle.load(vectorizer_file)

# Download the NLTK data required for stopword removal / tokenization.
nltk.download('stopwords')
nltk.download('punkt')

# Mapping from model output label to human-readable category name.
labels_encode = {1: "Research", 0: "News"}

# Indonesian stopword list used by remove_stopwords().
stop_words = stopwords.words('indonesian')

# Persist the stopword list to disk for inspection/debugging.
with open('stopwords.txt', 'w') as f:
    for item in stop_words:
        f.write("%s\n" % item)
def scrape_news(url):
    """Fetch a CNBC Indonesia article and return it as a one-row DataFrame.

    Args:
        url: Article URL to download.

    Returns:
        pd.DataFrame with columns "judul" (title) and "isi" (body text).
        The frame is empty when the request fails or the expected HTML
        elements are not found (e.g. after a site layout change).
    """
    isi = []
    judul = []
    # Fixed: a timeout so a stalled server cannot hang the request forever.
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        article_full = BeautifulSoup(response.content, "html.parser")
        title_el = article_full.find("h1", class_="mb-4 text-32 font-extrabold")
        body_el = article_full.find("div", class_="detail-text")
        # Fixed: guard against missing elements instead of raising
        # AttributeError when the page layout does not match.
        if title_el is not None and body_el is not None:
            paragraphs = [p.get_text(strip=True) for p in body_el.find_all("p")]
            isi.append("\n".join(paragraphs))
            judul.append(title_el.text.strip())
    return pd.DataFrame({"judul": judul, "isi": isi})
def cleansing(text):
    """Normalize raw article text: collapse whitespace, strip non-ASCII
    characters, digits and punctuation, and lowercase the result.

    The transformations run in a fixed order; each step feeds the next.
    """
    pipeline = (
        lambda s: re.sub(r'[\s]+', ' ', s),                # whitespace runs -> single space
        lambda s: s.encode('ascii', 'ignore').decode('utf-8'),  # drop non-ASCII
        lambda s: re.sub(r'[^\x00-\x7f]', r'', s),         # belt-and-braces ASCII filter
        lambda s: re.sub(r'\d+', '', s),                   # remove digits
        str.lower,                                         # lowercase
        lambda s: re.sub(r'\b-\b', ' ', s),                # word-joining hyphens -> space
        lambda s: re.sub(r'[^\w\s]+', ' ', s),             # remaining punctuation -> space
        lambda s: s.replace('\n', ''),                     # strip leftover newlines
    )
    for step in pipeline:
        text = step(text)
    return text
def remove_stopwords(text):
    """Remove Indonesian stopwords (module-level ``stop_words``) from *text*,
    returning the surviving tokens joined by single spaces."""
    kept = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept)
def stemming(text):
    """Stem Indonesian text to word roots using Sastrawi.

    Fixed: the original rebuilt ``StemmerFactory().create_stemmer()`` on every
    call, which is expensive. The stemmer is now created once on first use and
    cached as a function attribute; Sastrawi's stemmer is stateless, so reuse
    is safe.
    """
    stemmer = getattr(stemming, "_stemmer", None)
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
        stemming._stemmer = stemmer
    return stemmer.stem(text)
def preprocess_text(text):
    """Full preprocessing pipeline: cleansing -> stopword removal -> stemming."""
    return stemming(remove_stopwords(cleansing(text)))
def classify_news(text):
    """Classify *text* with the default logistic-regression model.

    Returns:
        (label, probabilities): the predicted class label and the per-class
        probability vector for the single input document.
    """
    features = tfidf_model.transform([preprocess_text(text)])
    label = lr_model.predict(features)[0]
    proba = lr_model.predict_proba(features)[0]
    return label, proba
def classify_news_with_model(text, model):
    """Classify *text* with an arbitrary scikit-learn style *model*,
    vectorizing with the shared module-level TF-IDF model.

    Returns:
        (label, probabilities): predicted class label and the per-class
        probability vector for the single input document.
    """
    vector = tfidf_model.transform([preprocess_text(text)])
    return model.predict(vector)[0], model.predict_proba(vector)[0]
# Route for the main page: GET shows the form, POST classifies a submitted link.
# Fixed: the original had no @app.route decorator, so this view was never
# registered (and without methods=['POST'] the form submit would 405).
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the landing page; on POST, scrape and classify an article.

    Form fields:
        link_news: URL of the CNBC Indonesia article to classify.
        model:     "logistic_regression", "lr_modelNcompo5" or
                   "lr_modelNcompo10" (anything else falls back to the
                   default logistic-regression model).
    """
    if request.method == 'POST':
        link_news = request.form.get("link_news")
        selected_model = request.form.get("model")

        # Input validation.
        if not link_news:
            return render_template('index.html', error="Link tidak boleh kosong.")
        if "cnbcindonesia" not in link_news:
            return render_template('index.html', error="Link tidak valid. Pastikan link berita dari CNBC Indonesia.")

        # Scrape the article; bail out cleanly if nothing could be extracted
        # (the original would raise on the empty DataFrame).
        news = scrape_news(link_news)
        if news.empty:
            return render_template('index.html', error="Konten berita tidak dapat diambil dari link tersebut.")
        news['cleaned_text'] = news["isi"].apply(preprocess_text)

        # Vectorize the already-preprocessed text once; all model branches
        # share this. (Fixed: the old logistic_regression branch called
        # classify_news, which re-ran preprocess_text on text that had
        # already been preprocessed above.)
        X_new_tfidf = tfidf_model.transform([news['cleaned_text'][0]])

        if selected_model == "lr_modelNcompo5":
            # Pipeline trained on 5 components.
            with open('model_pipeline_5.pkl', 'rb') as f:
                model = pickle.load(f)
        elif selected_model == "lr_modelNcompo10":
            # Pipeline trained on 10 components.
            with open('model_pipeline_10.pkl', 'rb') as f:
                model = pickle.load(f)
        else:
            # "logistic_regression" and any unknown value use the model loaded
            # at startup. (Fixed: an unknown value previously fell through and
            # raised UnboundLocalError on category_name.)
            model = lr_model

        prediction = model.predict(X_new_tfidf)
        probabilities = model.predict_proba(X_new_tfidf)

        # Convert to a category name and per-class percentages.
        category_name = labels_encode[prediction[0]]
        prob_news_percent = round(probabilities[0][0] * 100, 3)
        prob_research_percent = round(probabilities[0][1] * 100, 3)

        return render_template('index.html', result=category_name,
                               prob_news=prob_news_percent,
                               prob_research=prob_research_percent)
    return render_template('index.html')
if __name__ == '__main__':
    # Start Flask's built-in development server (debug mode, port 5001)
    # only when the file is executed directly, not when imported.
    app.run(debug=True, port=5001)