init
- Dockerfile +10 -0
- __pycache__/preprocessing.cpython-39.pyc +0 -0
- app.py +72 -0
- preprocessing.py +50 -0
- requirements.txt +10 -0
- templates/index.html +17 -0
- tfidf_preprocessing.pkl +0 -0
- tfidf_vectorizer.pkl +0 -0
Dockerfile
ADDED
@@ -0,0 +1,10 @@
+FROM python:3.9
+
+WORKDIR /python-docker
+
+COPY requirements.txt requirements.txt
+RUN pip install -r requirements.txt
+RUN mkdir -p /nltk_data && chmod -R 777 /nltk_data
+COPY . .
+
+CMD [ "python", "-m" , "flask", "run", "--host=0.0.0.0"]
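Note: the image pre-creates a world-writable /nltk_data directory, but NLTK only uses it if told to, typically via the NLTK_DATA environment variable or an explicit download_dir. A minimal sketch of that wiring (not part of this commit, resource names taken from app.py):

import os
import nltk

# Point NLTK at the directory created in the Dockerfile (assumed convention).
NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/nltk_data")
nltk.data.path.append(NLTK_DATA_DIR)

# Download the resources app.py needs into that directory.
nltk.download("stopwords", download_dir=NLTK_DATA_DIR)
nltk.download("punkt_tab", download_dir=NLTK_DATA_DIR)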
__pycache__/preprocessing.cpython-39.pyc
ADDED
Binary file (1.51 kB)
app.py
ADDED
@@ -0,0 +1,72 @@
+from flask import Flask, request, render_template
+import pickle5 as pkl
+from preprocessing import clean_text, clean_stopword, preprocess_text
+
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+import re
+import string
+
+import pandas as pd
+import networkx as nx
+import matplotlib.pyplot as plt
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+nltk.download('stopwords')
+nltk.download('punkt_tab')
+
+# Initialize Flask app
+app = Flask(__name__)
+
+def process_input(text):
+    prep_result = preprocess_text(text)
+    kalimat = nltk.sent_tokenize(prep_result)
+    tfidf_vectorizer = pkl.load(open('./tfidf_vectorizer.pkl', 'rb'))
+    tfidf_preprocessing = pkl.load(open('./tfidf_preprocessing.pkl', 'rb'))
+    terms = tfidf_vectorizer.get_feature_names_out()
+    tfidf = pd.DataFrame(data=tfidf_preprocessing.toarray(), columns=terms)
+
+    cossim = cosine_similarity(tfidf, tfidf)
+    similarity_matrix = pd.DataFrame(cossim,
+                                     index=range(len(kalimat)),
+                                     columns=range(len(kalimat)))
+
+    G = nx.DiGraph()
+    for i in range(len(cossim)):
+        G.add_node(i)
+
+    for i in range(len(cossim)):
+        for j in range(len(cossim)):
+            similarity = cossim[i][j]
+            if similarity > 0.1 and i != j:
+                G.add_edge(i, j)
+
+    closeness = nx.closeness_centrality(G)
+
+    sorted_closeness = sorted(closeness.items(), key=lambda x: x[1], reverse=True)
+
+    print(sorted_closeness)
+
+    ringkasan_closeness = ""
+    for node, closeness_preprocessing in sorted_closeness[:3]:
+        top_sentence = kalimat[node]
+        ringkasan_closeness += top_sentence + " "
+    print(ringkasan_closeness)
+
+    return ringkasan_closeness
+
+
+@app.route('/', methods=['GET', 'POST'])
+def summarize():
+    result = None
+    if request.method == 'POST':
+        text = request.form['text']
+        if text:
+            result = process_input(text)
+    return render_template('index.html', result=result)
+
+
+if __name__ == '__main__':
+    app.run(debug=True)
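For context, process_input implements graph-based extractive summarization: sentences become TF-IDF vectors, pairwise cosine similarity above 0.1 defines directed edges, and the sentences with the highest closeness centrality form the summary. A self-contained sketch of the same idea, fitting the vectorizer on the input sentences directly instead of loading the pickled artifacts (the sentence list and the 0.1 threshold are illustrative):

import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sentences = [
    "automatic text summarization selects the most informative sentences",
    "each sentence is represented as a tf-idf vector",
    "a graph is built from pairwise cosine similarity between sentences",
    "closeness centrality ranks the sentences for the final summary",
]

# Vectorize the sentences and compute pairwise cosine similarity.
tfidf = TfidfVectorizer().fit_transform(sentences)
cossim = cosine_similarity(tfidf)

# Add an edge wherever similarity exceeds the threshold (excluding self-loops).
G = nx.DiGraph()
G.add_nodes_from(range(len(sentences)))
for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j and cossim[i][j] > 0.1:
            G.add_edge(i, j)

# Rank sentences by closeness centrality and keep the top three.
closeness = nx.closeness_centrality(G)
top = sorted(closeness.items(), key=lambda x: x[1], reverse=True)[:3]
summary = " ".join(sentences[node] for node, _ in top)
print(summary)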
preprocessing.py
ADDED
@@ -0,0 +1,50 @@
+import pickle as pkl
+
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+import re
+import string
+
+import pickle as pkl
+
+import pandas as pd
+import networkx as nx
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+def clean_text(text):
+    # make text lowercase
+    text = text.lower()
+
+    # remove line breaks
+    text = re.sub(r'\n', ' ', text)
+
+    # remove punctuation
+    translator = str.maketrans('', '', string.punctuation)
+    text = text.translate(translator)
+
+    # remove numbers
+    text = re.sub(r'\d+', '', text)
+
+    # remove extra spaces
+    text = re.sub(r'\s+', ' ', text)
+
+    # remove non-ascii characters
+    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
+
+    return text
+
+def clean_stopword(tokens):
+    listStopword = set(stopwords.words('indonesian'))
+    filtered_words = [word for word in tokens if word.lower() not in listStopword]
+    return filtered_words
+
+def preprocess_text(content):
+    cleaned_text = clean_text(content)
+    tokens = word_tokenize(cleaned_text)
+    cleaned_stopword = clean_stopword(tokens)
+    return ' '.join(cleaned_stopword)
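A quick way to exercise the preprocessing functions above (the sample text is hypothetical; the NLTK stopwords and punkt_tab resources must be downloaded first, as app.py does):

import nltk
from preprocessing import clean_text, preprocess_text

# Fetch the resources the functions rely on (Indonesian stopwords, tokenizer data).
nltk.download('stopwords')
nltk.download('punkt_tab')

sample = "Ini adalah contoh teks, dengan beberapa angka 123 dan tanda baca!"
print(clean_text(sample))       # lowercased, punctuation and numbers stripped
print(preprocess_text(sample))  # additionally tokenized and stopword-filtered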
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+flask
+pickle5
+Sastrawi
+scikit-learn
+tqdm
+nltk
+pandas
+networkx
+matplotlib
+numpy
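app.py imports pickle5 while preprocessing.py uses the standard-library pickle; since Python 3.8 the stdlib module already supports protocol 5, so on the python:3.9 base image a common compatibility shim (a sketch, not part of this commit) is:

# Prefer the pickle5 backport when installed, otherwise fall back to the stdlib module.
try:
    import pickle5 as pkl
except ImportError:
    import pickle as pkl

with open('./tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pkl.load(f)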
templates/index.html
ADDED
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Ringkasan Otomatis</title>
+</head>
+<body>
+    <h1>Ringkasan Otomatis</h1>
+    <form method="POST">
+        <textarea name="text" rows="10" cols="80" placeholder="Masukkan teks yang ingin diringkas"></textarea><br>
+        <button type="submit">Ringkas</button>
+    </form>
+    {% if result %}
+    <h2>Hasil Ringkasan</h2>
+    <p>{{ result }}</p>
+    {% endif %}
+</body>
+</html>
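The form posts back to the / route, which also makes the app easy to exercise without a browser. A minimal client sketch, assuming the container is reachable on Flask's default port 5000 and that the requests package (not in requirements.txt) is installed:

import requests

# POST the form field defined in the template (name="text").
resp = requests.post(
    "http://localhost:5000/",
    data={"text": "Teks panjang yang ingin diringkas."},
)
print(resp.status_code)
print(resp.text)  # rendered HTML; the summary appears under "Hasil Ringkasan"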
tfidf_preprocessing.pkl
ADDED
Binary file (1.59 kB)
tfidf_vectorizer.pkl
ADDED
Binary file (2.83 kB)