wchynto committed
Commit 9afb8cd · 1 Parent(s): ad26689
Dockerfile ADDED
@@ -0,0 +1,15 @@
+ FROM python:3.9
+
+ WORKDIR /python-docker
+
+ COPY requirements.txt requirements.txt
+ RUN pip install -r requirements.txt
+
+ # nltk.download() at app startup needs a writable data directory;
+ # NLTK_DATA points NLTK at the world-writable folder created here.
+ ENV NLTK_DATA=/nltk_data
+ RUN mkdir -p /nltk_data && chmod -R 777 /nltk_data
+
+ COPY . .
+
+ CMD [ "python", "-m", "flask", "run", "--host=0.0.0.0" ]
__pycache__/preprocessing.cpython-39.pyc ADDED
Binary file (1.51 kB).
 
app.py ADDED
@@ -0,0 +1,64 @@
+ from flask import Flask, request, render_template
+ import pickle as pkl  # stdlib pickle; Python 3.9 already supports protocol 5, so the pickle5 backport is unnecessary
+ from preprocessing import preprocess_text
+
+ import nltk
+ import networkx as nx
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ nltk.download('stopwords')
+ nltk.download('punkt_tab')
+
+ # Initialize Flask app
+ app = Flask(__name__)
+
+ def process_input(text):
+     # Split the raw text into sentences first: clean_text() strips the
+     # punctuation sent_tokenize relies on, so preprocessing has to happen
+     # per sentence, after sentence splitting.
+     kalimat = nltk.sent_tokenize(text)
+     prep_result = [preprocess_text(k) for k in kalimat]
+
+     # Vectorize the input sentences with the fitted vectorizer so the
+     # similarity matrix rows line up with kalimat; the precomputed matrix
+     # in tfidf_preprocessing.pkl covers the training corpus and would not
+     # match an arbitrary input text.
+     with open('./tfidf_vectorizer.pkl', 'rb') as f:
+         tfidf_vectorizer = pkl.load(f)
+     tfidf = tfidf_vectorizer.transform(prep_result)
+
+     cossim = cosine_similarity(tfidf, tfidf)
+
+     # Build a directed sentence graph with an edge i -> j for every pair
+     # of distinct sentences whose cosine similarity exceeds 0.1.
+     G = nx.DiGraph()
+     G.add_nodes_from(range(len(cossim)))
+     for i in range(len(cossim)):
+         for j in range(len(cossim)):
+             if i != j and cossim[i][j] > 0.1:
+                 G.add_edge(i, j)
+
+     # Rank sentences by closeness centrality and keep the top three.
+     closeness = nx.closeness_centrality(G)
+     sorted_closeness = sorted(closeness.items(), key=lambda x: x[1], reverse=True)
+
+     ringkasan_closeness = ""
+     for node, _ in sorted_closeness[:3]:
+         ringkasan_closeness += kalimat[node] + " "
+
+     # Return the accumulated summary, not just the last sentence visited.
+     return ringkasan_closeness.strip()
+
+
+ @app.route('/', methods=['GET', 'POST'])
+ def summarize():
+     result = None
+     if request.method == 'POST':
+         text = request.form['text']
+         if text:
+             result = process_input(text)
+     return render_template('index.html', result=result)
+
+
+ if __name__ == '__main__':
+     app.run(debug=True)
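The graph-ranking step is easiest to see on a toy similarity matrix. A small self-contained sketch: the 0.1 threshold and edge rule mirror process_input, but the matrix values below are made up purely for illustration:

    # Toy illustration of the closeness-centrality ranking in process_input.
    import numpy as np
    import networkx as nx

    cossim = np.array([
        [1.0, 0.4, 0.0,  0.3],
        [0.4, 1.0, 0.2,  0.0],
        [0.0, 0.2, 1.0,  0.05],
        [0.3, 0.0, 0.05, 1.0],
    ])

    G = nx.DiGraph()
    G.add_nodes_from(range(len(cossim)))
    for i in range(len(cossim)):
        for j in range(len(cossim)):
            if i != j and cossim[i][j] > 0.1:
                G.add_edge(i, j)

    closeness = nx.closeness_centrality(G)
    # Sentences 0 and 1 are reachable from everywhere in few hops,
    # so they rank first (0.75 each vs 0.5 for sentences 2 and 3).
    print(sorted(closeness.items(), key=lambda x: x[1], reverse=True))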
preprocessing.py ADDED
@@ -0,0 +1,39 @@
+ import re
+ import string
+
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+
+ def clean_text(text):
+     # make text lowercase
+     text = text.lower()
+
+     # remove line breaks
+     text = re.sub(r'\n', ' ', text)
+
+     # remove punctuation
+     translator = str.maketrans('', '', string.punctuation)
+     text = text.translate(translator)
+
+     # remove numbers
+     text = re.sub(r'\d+', '', text)
+
+     # collapse extra spaces
+     text = re.sub(r'\s+', ' ', text)
+
+     # remove non-ASCII characters
+     text = re.sub(r'[^\x00-\x7F]+', ' ', text)
+
+     return text
+
+ def clean_stopword(tokens):
+     # drop Indonesian stopwords, matching case-insensitively
+     listStopword = set(stopwords.words('indonesian'))
+     filtered_words = [word for word in tokens if word.lower() not in listStopword]
+     return filtered_words
+
+ def preprocess_text(content):
+     cleaned_text = clean_text(content)
+     tokens = word_tokenize(cleaned_text)
+     cleaned_stopword = clean_stopword(tokens)
+     return ' '.join(cleaned_stopword)
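preprocess_text composes the three helpers: clean, tokenize, drop stopwords. A quick interactive check, assuming nltk.download('stopwords') and nltk.download('punkt_tab') have already run; the exact output depends on NLTK's Indonesian stopword list:

    from preprocessing import clean_text, preprocess_text

    print(clean_text("Ini Contoh, Teks 123!\n"))
    # -> "ini contoh teks " (lowercased; punctuation, digits, extra spaces gone)

    print(preprocess_text("Saya sedang membaca buku di perpustakaan."))
    # common stopwords such as "saya", "sedang", "di" are dropped,
    # leaving roughly: "membaca buku perpustakaan"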
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ flask
+ Sastrawi
+ scikit-learn
+ tqdm
+ nltk
+ pandas
+ networkx
+ matplotlib
+ numpy
templates/index.html ADDED
@@ -0,0 +1,17 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>Ringkasan Otomatis</title>
+ </head>
+ <body>
+     <h1>Ringkasan Otomatis</h1>
+     <form method="POST">
+         <textarea name="text" rows="10" cols="80" placeholder="Masukkan teks yang ingin diringkas"></textarea><br>
+         <button type="submit">Ringkas</button>
+     </form>
+     {% if result %}
+     <h2>Hasil Ringkasan</h2>
+     <p>{{ result }}</p>
+     {% endif %}
+ </body>
+ </html>
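The template and the route close the loop: the form posts back to '/', and the Jinja block renders only once result is set. A minimal round-trip sketch with Flask's built-in test client, assuming tfidf_vectorizer.pkl and the NLTK data are available as they are in the Docker image (the sample Indonesian text is made up):

    # Round-trip check: GET shows the bare form, POST returns a summary.
    from app import app

    client = app.test_client()
    assert client.get('/').status_code == 200

    resp = client.post('/', data={'text': 'Kalimat pertama. Kalimat kedua yang mirip dengan kalimat pertama.'})
    assert resp.status_code == 200
    assert b'Hasil Ringkasan' in resp.data   # the result block rendered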
tfidf_preprocessing.pkl ADDED
Binary file (1.59 kB).
 
tfidf_vectorizer.pkl ADDED
Binary file (2.83 kB).