init
- Dockerfile +10 -0
- __pycache__/preprocessing.cpython-39.pyc +0 -0
- app.py +72 -0
- preprocessing.py +50 -0
- requirements.txt +10 -0
- templates/index.html +17 -0
- tfidf_preprocessing.pkl +0 -0
- tfidf_vectorizer.pkl +0 -0
Dockerfile
ADDED
@@ -0,0 +1,10 @@
+FROM python:3.9
+
+WORKDIR /python-docker
+
+COPY requirements.txt requirements.txt
+RUN pip install -r requirements.txt
+RUN mkdir -p /nltk_data && chmod -R 777 /nltk_data
+COPY . .
+
+CMD [ "python", "-m" , "flask", "run", "--host=0.0.0.0"]
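Note: the image pre-creates a world-writable /nltk_data directory, but NLTK only uses it if told to, typically via the NLTK_DATA environment variable or an explicit download_dir. A minimal sketch of that wiring (not part of this commit, resource names taken from app.py):

import os
import nltk

# Point NLTK at the directory created in the Dockerfile (assumed convention).
NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/nltk_data")
nltk.data.path.append(NLTK_DATA_DIR)

# Download the resources app.py needs into that directory.
nltk.download("stopwords", download_dir=NLTK_DATA_DIR)
nltk.download("punkt_tab", download_dir=NLTK_DATA_DIR)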
__pycache__/preprocessing.cpython-39.pyc
ADDED
Binary file (1.51 kB)
app.py
ADDED
@@ -0,0 +1,72 @@
+from flask import Flask, request, render_template
+import pickle5 as pkl
+from preprocessing import clean_text, clean_stopword, preprocess_text
+
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+import re
+import string
+
+import pandas as pd
+import networkx as nx
+import matplotlib.pyplot as plt
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+nltk.download('stopwords')
+nltk.download('punkt_tab')
+
+# Initialize Flask app
+app = Flask(__name__)
+
+def process_input(text):
+    prep_result = preprocess_text(text)
+    kalimat = nltk.sent_tokenize(prep_result)
+    tfidf_vectorizer = pkl.load(open('./tfidf_vectorizer.pkl', 'rb'))
+    tfidf_preprocessing = pkl.load(open('./tfidf_preprocessing.pkl', 'rb'))
+    terms = tfidf_vectorizer.get_feature_names_out()
+    tfidf = pd.DataFrame(data=tfidf_preprocessing.toarray(), columns=terms)
+
+    cossim = cosine_similarity(tfidf, tfidf)
+    similarity_matrix = pd.DataFrame(cossim,
+                                     index=range(len(kalimat)),
+                                     columns=range(len(kalimat)))
+
+    G = nx.DiGraph()
+    for i in range(len(cossim)):
+        G.add_node(i)
+
+    for i in range(len(cossim)):
+        for j in range(len(cossim)):
+            similarity = cossim[i][j]
+            if similarity > 0.1 and i != j:
+                G.add_edge(i, j)
+
+    closeness = nx.closeness_centrality(G)
+
+    sorted_closeness = sorted(closeness.items(), key=lambda x: x[1], reverse=True)
+
+    print(sorted_closeness)
+
+    ringkasan_closeness = ""
+    for node, closeness_preprocessing in sorted_closeness[:3]:
+        top_sentence = kalimat[node]
+        ringkasan_closeness += top_sentence + " "
+    print(ringkasan_closeness)
+
+    return ringkasan_closeness
+
+
+@app.route('/', methods=['GET', 'POST'])
+def summarize():
+    result = None
+    if request.method == 'POST':
+        text = request.form['text']
+        if text:
+            result = process_input(text)
+    return render_template('index.html', result=result)
+
+
+if __name__ == '__main__':
+    app.run(debug=True)
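For context, process_input implements graph-based extractive summarization: sentences become TF-IDF vectors, pairwise cosine similarity above 0.1 defines directed edges, and the sentences with the highest closeness centrality form the summary. A self-contained sketch of the same idea, fitting the vectorizer on the input sentences directly instead of loading the pickled artifacts (the sentence list and the 0.1 threshold are illustrative):

import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sentences = [
    "automatic text summarization selects the most informative sentences",
    "each sentence is represented as a tf-idf vector",
    "a graph is built from pairwise cosine similarity between sentences",
    "closeness centrality ranks the sentences for the final summary",
]

# Vectorize the sentences and compute pairwise cosine similarity.
tfidf = TfidfVectorizer().fit_transform(sentences)
cossim = cosine_similarity(tfidf)

# Add an edge wherever similarity exceeds the threshold (excluding self-loops).
G = nx.DiGraph()
G.add_nodes_from(range(len(sentences)))
for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j and cossim[i][j] > 0.1:
            G.add_edge(i, j)

# Rank sentences by closeness centrality and keep the top three.
closeness = nx.closeness_centrality(G)
top = sorted(closeness.items(), key=lambda x: x[1], reverse=True)[:3]
summary = " ".join(sentences[node] for node, _ in top)
print(summary)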
preprocessing.py
ADDED
@@ -0,0 +1,50 @@
+import pickle as pkl
+
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+import re
+import string
+
+import pickle as pkl
+
+import pandas as pd
+import networkx as nx
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+def clean_text(text):
+    # make text lowercase
+    text = text.lower()
+
+    # remove line breaks
+    text = re.sub(r'\n', ' ', text)
+
+    # remove punctuation
+    translator = str.maketrans('', '', string.punctuation)
+    text = text.translate(translator)
+
+    # remove numbers
+    text = re.sub(r'\d+', '', text)
+
+    # remove extra spaces
+    text = re.sub(r'\s+', ' ', text)
+
+    # remove non-ascii characters
+    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
+
+    return text
+
+def clean_stopword(tokens):
+    listStopword = set(stopwords.words('indonesian'))
+    filtered_words = [word for word in tokens if word.lower() not in listStopword]
+    return filtered_words
+
+def preprocess_text(content):
+    cleaned_text = clean_text(content)
+    tokens = word_tokenize(cleaned_text)
+    cleaned_stopword = clean_stopword(tokens)
+    return ' '.join(cleaned_stopword)
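A quick way to exercise the preprocessing functions above (the sample text is hypothetical; the NLTK stopwords and punkt_tab resources must be downloaded first, as app.py does):

import nltk
from preprocessing import clean_text, preprocess_text

# Fetch the resources the functions rely on (Indonesian stopwords, tokenizer data).
nltk.download('stopwords')
nltk.download('punkt_tab')

sample = "Ini adalah contoh teks, dengan beberapa angka 123 dan tanda baca!"
print(clean_text(sample))       # lowercased, punctuation and numbers stripped
print(preprocess_text(sample))  # additionally tokenized and stopword-filtered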
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+flask
+pickle5
+Sastrawi
+scikit-learn
+tqdm
+nltk
+pandas
+networkx
+matplotlib
+numpy
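app.py imports pickle5 while preprocessing.py uses the standard-library pickle; since Python 3.8 the stdlib module already supports protocol 5, so on the python:3.9 base image a common compatibility shim (a sketch, not part of this commit) is:

# Prefer the pickle5 backport when installed, otherwise fall back to the stdlib module.
try:
    import pickle5 as pkl
except ImportError:
    import pickle as pkl

with open('./tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pkl.load(f)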
templates/index.html
ADDED
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Ringkasan Otomatis</title>
+</head>
+<body>
+    <h1>Ringkasan Otomatis</h1>
+    <form method="POST">
+        <textarea name="text" rows="10" cols="80" placeholder="Masukkan teks yang ingin diringkas"></textarea><br>
+        <button type="submit">Ringkas</button>
+    </form>
+    {% if result %}
+    <h2>Hasil Ringkasan</h2>
+    <p>{{ result }}</p>
+    {% endif %}
+</body>
+</html>
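The form posts back to the / route, which also makes the app easy to exercise without a browser. A minimal client sketch, assuming the container is reachable on Flask's default port 5000 and that the requests package (not in requirements.txt) is installed:

import requests

# POST the form field defined in the template (name="text").
resp = requests.post(
    "http://localhost:5000/",
    data={"text": "Teks panjang yang ingin diringkas."},
)
print(resp.status_code)
print(resp.text)  # rendered HTML; the summary appears under "Hasil Ringkasan"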
tfidf_preprocessing.pkl
ADDED
Binary file (1.59 kB)
tfidf_vectorizer.pkl
ADDED
Binary file (2.83 kB)