import pickle as pkl
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
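
# NLTK data needed by word_tokenize and stopwords (no-ops if already present;
# newer NLTK releases may additionally require the 'punkt_tab' package).
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)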

def clean_text(text):
	# make text lowercase
	text = text.lower() 

	# remove line breaks
	text = re.sub(r'\n', ' ', text)

	# remove punctuation
	translator = str.maketrans('', '', string.punctuation)
	text = text.translate(translator)

	# remove numbers
	text = re.sub(r'\d+', '', text)

	# replace non-ascii characters with spaces
	text = re.sub(r'[^\x00-\x7F]+', ' ', text)

	# collapse repeated whitespace and trim the ends
	text = re.sub(r'\s+', ' ', text).strip()

	return text

def clean_stopword(tokens):
	# drop Indonesian stopwords using NLTK's built-in word list
	listStopword = set(stopwords.words('indonesian'))
	filtered_words = [word for word in tokens if word.lower() not in listStopword]
	return filtered_words

def preprocess_text(content):
	cleaned_text = clean_text(content)
	tokens = word_tokenize(cleaned_text)
	cleaned_stopword = clean_stopword(tokens)
	return ' '.join(cleaned_stopword)
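
# Illustrative usage sketch (not part of the original file): preprocess a few
# documents, then compute pairwise cosine similarity over their TF-IDF vectors,
# matching the sklearn imports above. The sample sentences are invented.
if __name__ == '__main__':
	docs = [
		'Contoh kalimat pertama, dengan angka 123 dan tanda baca!',
		'Kalimat kedua juga merupakan contoh yang sederhana.',
	]
	cleaned = [preprocess_text(d) for d in docs]
	tfidf = TfidfVectorizer().fit_transform(cleaned)
	print(cosine_similarity(tfidf))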