Upload fake_news_model.py
fake_news_model.py
ADDED
@@ -0,0 +1,64 @@
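
# Fake-news classifier: cleans news titles from data/train.csv, encodes them as
# fixed-length integer sequences, and trains an Embedding + LSTM binary model.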
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Activation, SpatialDropout1D
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

train_dir = "data/train.csv"
df = pd.read_csv(train_dir)
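
# Drop rows with missing values and rebuild a clean 0..n-1 index. drop=True
# keeps the old index from being inserted as a column; without it, the second
# reset_index below collides with that column and raises a ValueError.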
df = df.dropna()
df = df.reset_index(drop=True)
X = df.drop(labels=['label', 'id'], axis=1)
y = df['label']

xdata = X.copy()
xdata.reset_index(drop=True, inplace=True)
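
# The stop-word list and lemmatizer need their NLTK corpora; fetch them once.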
nltk.download("stopwords")
nltk.download("wordnet")

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # build the set once, not per word

# Clean each title: keep letters only, lowercase, drop English stop words,
# and lemmatize what remains.
xtitle = []
for i in range(len(xdata)):
    sent = re.sub('[^a-zA-Z]', ' ', xdata['title'][i])
    sent = sent.lower().split()
    sent = [lemmatizer.lemmatize(word) for word in sent if word not in stop_words]
    sent = " ".join(sent)
    xtitle.append(sent)

vocab_size = 5000           # hashing space for one_hot word indices
embedding_feature_len = 30  # dimensionality of the learned word embeddings
max_sent_len = 20           # every title is padded/truncated to this length
batch_size = 32
epochs = 10

# Hash each word to an integer index in [1, vocab_size) and pad/truncate
# every title to exactly max_sent_len tokens.
one_hot_representation = [one_hot(words, vocab_size) for words in xtitle]
padded_sequences = pad_sequences(one_hot_representation, truncating="post",
                                 padding="post", maxlen=max_sent_len)

X = np.array(padded_sequences)
y = np.array(y)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Embedding -> SpatialDropout1D -> LSTM -> sigmoid output: a standard
# binary sequence classifier.
model = Sequential()
model.add(Embedding(vocab_size, embedding_feature_len, input_length=max_sent_len))
model.add(SpatialDropout1D(rate=0.2))
model.add(LSTM(units=128))
model.add(Dense(units=1))
model.add(Activation("sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()  # summary() prints itself; wrapping it in print() just adds "None"

hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                 validation_data=(x_test, y_test))
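
# matplotlib is imported above but never used; a minimal sketch of one way to
# plot the training curves (assumes the standard Keras History keys produced
# by metrics=["accuracy"]):
plt.plot(hist.history["accuracy"], label="train accuracy")
plt.plot(hist.history["val_accuracy"], label="validation accuracy")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.show()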

# Sequential.predict_classes() was removed in TensorFlow 2.6; threshold the
# sigmoid probabilities instead.
y_pred = (model.predict(x_test) > 0.5).astype("int32").ravel()
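
# A quick check of held-out performance; using sklearn.metrics is an
# assumption here, any accuracy computation would do.
from sklearn.metrics import accuracy_score, confusion_matrix
print("Test accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))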