from flask import Flask, render_template, request
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import pickle
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
import nltk
# Initialize the Flask application
app = Flask(__name__)
# Load the classification models
lr_model = pickle.load(open('model/lr_modelNormal.pkl', 'rb'))
tfidf_model = pickle.load(open('model/tfidf_modelLatest.pkl', 'rb'))
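
# Assumption: the pickles hold a fitted scikit-learn LogisticRegression and
# TfidfVectorizer; loading them once at import time avoids re-reading the
# files on every request.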
# Download the required NLTK data
nltk.download('stopwords')
nltk.download('punkt')
# Label encoding for the categories
labels_encode = {1: "Research", 0: "News"}
# Use a set for fast stopword membership checks
stop_words = set(stopwords.words('indonesian'))
# Save the stopword list to a file for inspection
with open('stopwords.txt', 'w') as f:
    for item in sorted(stop_words):
        f.write("%s\n" % item)
# Scrape the article title and body from a CNBC Indonesia URL
def scrape_news(url):
    isi = []
    judul = []
    response = requests.get(url)
    if response.status_code == 200:
        article_full = BeautifulSoup(response.content, "html.parser")
        judul_artikel = article_full.find("h1", class_="mb-4 text-32 font-extrabold").text.strip()
        artikel_element = article_full.find("div", class_="detail-text")
        artikel_teks = [p.get_text(strip=True) for p in artikel_element.find_all("p")]
        artikel_content = "\n".join(artikel_teks)
        isi.append(artikel_content)
        judul.append(judul_artikel)
    return pd.DataFrame({"judul": judul, "isi": isi})
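
# Usage sketch (hypothetical URL): a successful scrape yields a one-row
# DataFrame, while a non-200 response yields an empty one.
#   df = scrape_news("https://www.cnbcindonesia.com/news/...")  # hypothetical
#   title, body = df["judul"][0], df["isi"][0]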
# Basic text cleaning: collapse whitespace, then strip non-ASCII characters,
# digits, and punctuation
def cleansing(text):
    text = re.sub(r'\s+', ' ', text)                        # collapse runs of whitespace (incl. newlines)
    text = text.encode('ascii', 'ignore').decode('utf-8')   # drop non-ASCII characters
    text = re.sub(r'\d+', '', text)                         # remove digits
    text = text.lower()
    text = re.sub(r'\b-\b', ' ', text)                      # split hyphenated words
    text = re.sub(r'[^\w\s]+', ' ', text)                   # replace punctuation runs with spaces
    return text
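
# Worked example (assumed input): cleansing("Harga BBM naik 10%!") lowercases
# the text and strips the digits and punctuation, yielding roughly
# "harga bbm naik" plus residual spaces.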
# Remove Indonesian stopwords
def remove_stopwords(text):
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)
# Build the Sastrawi stemmer once at import time; constructing it on every
# call is expensive
stemmer = StemmerFactory().create_stemmer()

# Stem Indonesian text to word roots
def stemming(text):
    return stemmer.stem(text)
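
# Sastrawi maps inflected Indonesian words to their roots, e.g.
# stemming("pertumbuhan") should return "tumbuh".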
# Full preprocessing pipeline: clean, remove stopwords, then stem
def preprocess_text(text):
    clean_text = cleansing(text)
    stopword_text = remove_stopwords(clean_text)
    return stemming(stopword_text)
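
# Example (assumed input): preprocess_text("Penelitian menunjukkan pertumbuhan
# ekonomi 5%") comes out roughly as "teliti tunjuk tumbuh ekonomi", depending
# on which words the NLTK Indonesian stopword list removes.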
# Classify raw article text with the default logistic regression model
def classify_news(text):
    processed_text = preprocess_text(text)
    text_vectorized = tfidf_model.transform([processed_text])
    prediction = lr_model.predict(text_vectorized)
    prediction_proba = lr_model.predict_proba(text_vectorized)
    return prediction[0], prediction_proba[0]
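
# Usage sketch: the returned label indexes labels_encode and the probability
# vector aligns with it (index 0 = News, index 1 = Research).
#   label, proba = classify_news(article_text)  # article_text: raw scraped text
#   labels_encode[label], proba[0], proba[1]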
# Classify raw article text with an arbitrary classifier
def classify_news_with_model(text, model):
    processed_text = preprocess_text(text)
    text_vectorized = tfidf_model.transform([processed_text])
    prediction = model.predict(text_vectorized)
    prediction_proba = model.predict_proba(text_vectorized)
    # Return the category plus the per-class probabilities
    return prediction[0], prediction_proba[0]
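
# Note: classify_news_with_model generalizes classify_news by taking the
# classifier as an argument; the route below loads its extra pipelines
# directly instead of going through this helper.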
# Route for the main page
@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        link_news = request.form.get("link_news")
        selected_model = request.form.get("model")
        # Input validation
        if not link_news:
            return render_template('index.html', error="The link must not be empty.")
        if "cnbcindonesia" not in link_news:
            return render_template('index.html', error="Invalid link. Make sure the article is from CNBC Indonesia.")
        # Scrape the article from the given URL
        news = scrape_news(link_news)
        if news.empty:
            return render_template('index.html', error="Failed to fetch the article.")
        news['cleaned_text'] = news["isi"].apply(preprocess_text)
        # Classify with the selected model
        if selected_model == "logistic_regression":
            # classify_news preprocesses internally, so pass the raw article text
            prediction, probabilities = classify_news(news["isi"][0])
            category_name = labels_encode[prediction]
            prob_news_percent = round(probabilities[0] * 100, 3)
            prob_research_percent = round(probabilities[1] * 100, 3)
        elif selected_model == "lr_modelNcompo5":
            # Load the 5-component pipeline
            with open('model_pipeline_5.pkl', 'rb') as f:
                pipeline_5 = pickle.load(f)
            # Vectorize with the loaded TF-IDF model
            X_new_tfidf = tfidf_model.transform([news['cleaned_text'][0]])
            prediction = pipeline_5.predict(X_new_tfidf)
            probabilities = pipeline_5.predict_proba(X_new_tfidf)
            category_name = labels_encode[prediction[0]]
            prob_news_percent = round(probabilities[0][0] * 100, 3)      # probability of the News class
            prob_research_percent = round(probabilities[0][1] * 100, 3)  # probability of the Research class
        elif selected_model == "lr_modelNcompo10":
            # Load the 10-component pipeline
            with open('model_pipeline_10.pkl', 'rb') as f:
                pipeline_10 = pickle.load(f)
            # Vectorize with the loaded TF-IDF model
            X_new_tfidf = tfidf_model.transform([news['cleaned_text'][0]])
            prediction = pipeline_10.predict(X_new_tfidf)
            probabilities = pipeline_10.predict_proba(X_new_tfidf)
            category_name = labels_encode[prediction[0]]
            prob_news_percent = round(probabilities[0][0] * 100, 3)      # probability of the News class
            prob_research_percent = round(probabilities[0][1] * 100, 3)  # probability of the Research class
        else:
            # Unknown model choice: report an error instead of raising NameError below
            return render_template('index.html', error="Unknown model selection.")
        # Render the category and the class probabilities as percentages
        return render_template('index.html', result=category_name, prob_news=prob_news_percent, prob_research=prob_research_percent)
    return render_template('index.html')
if __name__ == '__main__':
    app.run(debug=True, port=5001)
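# To run locally: `python app.py`, then open http://127.0.0.1:5001/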