# huggingface-ppw / app.py
# Author: rrayhka
# Commit: "first commit" (c7ec819)
import pickle
import re
from functools import lru_cache
from urllib.parse import urlparse

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template, request
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Initialise the Flask application.
app = Flask(__name__)


def _load_pickle(path):
    """Load a pickled object from *path*, closing the file afterwards.

    NOTE(review): unpickling can execute arbitrary code, so this must only
    be pointed at model files that ship with the application.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the classifier and the object used to vectorise text for it
# (presumably a fitted TF-IDF vectoriser, going by the file name).
lr_model = _load_pickle('model/lr_modelNormal.pkl')
tfidf_model = _load_pickle('model/tfidf_modelLatest.pkl')

# Download the NLTK data the application needs.
nltk.download('stopwords')
nltk.download('punkt')

# Maps the classifier's integer label to its category name.
labels_encode = {1: "Research", 0: "News"}

# Indonesian stopword list from NLTK, kept as a list so the order written
# to disk below stays the same as NLTK's.
stop_words = stopwords.words('indonesian')

# Save the stopwords to disk, one word per line.
with open('stopwords.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{item}\n" for item in stop_words)
# Fetch the content of a news article from its URL.
def scrape_news(url, timeout=10):
    """Download an article page and extract its title and body text.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        A DataFrame with the columns ``judul`` (title) and ``isi`` (the
        text of the body paragraphs joined by newlines). It holds one row
        when the response status is 200 and both expected elements are
        present on the page, and no rows otherwise.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    titles = []
    bodies = []
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        page = BeautifulSoup(response.content, "html.parser")
        title_tag = page.find("h1", class_="mb-4 text-32 font-extrabold")
        body_tag = page.find("div", class_="detail-text")
        # find() returns None when an element is missing; treat such a page
        # like a failed download instead of raising AttributeError.
        if title_tag is not None and body_tag is not None:
            paragraphs = [p.get_text(strip=True) for p in body_tag.find_all("p")]
            titles.append(title_tag.text.strip())
            bodies.append("\n".join(paragraphs))
    return pd.DataFrame({"judul": titles, "isi": bodies})
# Text-cleaning function.
def cleansing(text):
    """Normalise raw text to lowercase ASCII letters, underscores and spaces.

    The steps run in this order:

    1. Collapse every run of whitespace (including newlines) to one space.
    2. Drop all non-ASCII characters.
    3. Remove digits.
    4. Lowercase the text.
    5. Replace a hyphen that sits between two word characters with a space.
    6. Replace each remaining run of punctuation with a single space.

    Spaces are not collapsed again after steps 3-6, so the result may
    contain consecutive or trailing spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'[\s]+', ' ', text)
    # After this round-trip the text is pure ASCII, so no further
    # non-ASCII filtering is needed.
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = re.sub(r'\b-\b', ' ', text)
    # Newlines were already turned into spaces by the first step, so there
    # is nothing left to strip after this substitution.
    return re.sub(r'[^\w\s]+', ' ', text)
# Stopword-removal function.
def remove_stopwords(text, stopword_list=None):
    """Return *text* with every stopword removed.

    The text is split on whitespace, so the result always has its
    remaining words separated by single spaces.

    Args:
        text: The string to filter.
        stopword_list: Optional iterable of words to remove. When omitted,
            the module-level ``stop_words`` list is used.

    Returns:
        The remaining words joined by single spaces.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # A set gives O(1) membership tests; the source list would be scanned
    # in full for every word of the text.
    blocked = set(stopword_list)
    return ' '.join(word for word in text.split() if word not in blocked)
@lru_cache(maxsize=1)
def _get_stemmer():
    """Build the Sastrawi stemmer once and hand back the same instance."""
    return StemmerFactory().create_stemmer()


# Stemming function.
def stemming(text):
    """Return the result of running the Sastrawi stemmer over *text*.

    The stemmer is created on first use and then reused, instead of
    constructing a new factory and stemmer on every call.

    Args:
        text: The string to stem.

    Returns:
        The string produced by the stemmer's ``stem`` method.
    """
    return _get_stemmer().stem(text)
# Main text-preprocessing function.
def preprocess_text(text):
    """Run the full preprocessing chain on *text*.

    The text is passed through ``cleansing``, then ``remove_stopwords``,
    then ``stemming``, in that order.

    Args:
        text: The raw string to preprocess.

    Returns:
        The cleaned, stopword-free, stemmed string.
    """
    without_stopwords = remove_stopwords(cleansing(text))
    return stemming(without_stopwords)
# Classify a text with the default model.
def classify_news(text):
    """Classify *text* with the module-level logistic-regression model.

    This delegates to ``classify_news_with_model`` so the preprocessing,
    vectorising and prediction steps live in one place.

    Args:
        text: The string to classify; it is preprocessed before prediction.

    Returns:
        A ``(label, probabilities)`` tuple: the predicted label for the
        text and the model's ``predict_proba`` row for it.
    """
    return classify_news_with_model(text, lr_model)
# Classify a text with a caller-supplied model.
def classify_news_with_model(text, model):
    """Classify *text* using *model*.

    The text is preprocessed, vectorised with the module-level
    ``tfidf_model`` and then passed to the model.

    Args:
        text: The string to classify.
        model: Any object exposing ``predict`` and ``predict_proba``.

    Returns:
        A ``(label, probabilities)`` tuple: the first element of the
        model's prediction and the first row of its ``predict_proba``
        output. Callers in this file read index 0 of that row as the News
        probability and index 1 as the Research probability.
    """
    features = tfidf_model.transform([preprocess_text(text)])
    label = model.predict(features)[0]
    class_probabilities = model.predict_proba(features)[0]
    return label, class_probabilities
# Maps the form's model choice to the pickled pipeline file it uses.
_PIPELINE_PATHS = {
    "lr_modelNcompo5": 'model_pipeline_5.pkl',
    "lr_modelNcompo10": 'model_pipeline_10.pkl',
}


def _is_cnbc_url(url):
    """Return True when *url* is an http(s) link on cnbcindonesia.com."""
    try:
        parsed = urlparse(url)
    except ValueError:
        return False
    host = parsed.hostname or ""
    # Check the actual host rather than searching the whole string, so a
    # link to another server cannot pass by mentioning the name elsewhere.
    on_cnbc = host == "cnbcindonesia.com" or host.endswith(".cnbcindonesia.com")
    return parsed.scheme in ("http", "https") and on_cnbc


def _predict_with_pipeline(path, cleaned_text):
    """Load the pipeline at *path* and classify already-preprocessed text."""
    # NOTE(review): unpickling can execute arbitrary code; these files must
    # only ever be ones shipped with the application.
    with open(path, 'rb') as f:
        pipeline = pickle.load(f)
    features = tfidf_model.transform([cleaned_text])
    prediction = pipeline.predict(features)
    probabilities = pipeline.predict_proba(features)
    return prediction[0], probabilities[0]


# Route for the main page.
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the main page and, on POST, classify the submitted article.

    The form fields ``link_news`` (article URL) and ``model`` (classifier
    choice) are read from the request. Invalid input or a failed download
    re-renders the page with an ``error`` message; otherwise the page is
    rendered with ``result``, ``prob_news`` and ``prob_research``.

    Returns:
        The rendered ``index.html`` template.
    """
    if request.method != 'POST':
        return render_template('index.html')

    link_news = (request.form.get("link_news") or "").strip()
    selected_model = request.form.get("model")

    # Input validation.
    if not link_news:
        return render_template('index.html', error="Link tidak boleh kosong.")
    if not _is_cnbc_url(link_news):
        return render_template('index.html', error="Link tidak valid. Pastikan link berita dari CNBC Indonesia.")
    # Reject an unknown model up front; otherwise no branch below would set
    # the result and the request would fail with a server error.
    if selected_model != "logistic_regression" and selected_model not in _PIPELINE_PATHS:
        return render_template('index.html', error="Model tidak valid.")

    # Fetch the article from the given URL. A network failure, a page
    # without the expected article markup, or an empty result all mean
    # there is nothing to classify.
    try:
        news = scrape_news(link_news)
    except (requests.RequestException, AttributeError):
        news = None
    if news is None or news.empty:
        return render_template('index.html', error="Gagal mengambil konten berita dari link tersebut.")

    news['cleaned_text'] = news["isi"].apply(preprocess_text)
    cleaned_text = news['cleaned_text'][0]

    # Classify with the selected model.
    if selected_model == "logistic_regression":
        # NOTE(review): classify_news preprocesses its input again, so this
        # branch preprocesses the text twice while the pipeline branch does
        # it once. Kept as-is to preserve existing predictions; confirm
        # which matches how the models were trained.
        prediction, probabilities = classify_news(cleaned_text)
    else:
        prediction, probabilities = _predict_with_pipeline(
            _PIPELINE_PATHS[selected_model], cleaned_text
        )

    # Round the probabilities and convert them to percentages.
    return render_template(
        'index.html',
        result=labels_encode[prediction],
        prob_news=round(probabilities[0] * 100, 3),
        prob_research=round(probabilities[1] * 100, 3),
    )
if __name__ == '__main__':
    # Start Flask's built-in server on port 5001 when run as a script.
    # NOTE(review): debug=True turns on the interactive debugger, which the
    # Flask documentation says must not be exposed in production; confirm
    # this entry point is only used for local development.
    app.run(port=5001, debug=True)