## Simple sentiment analysis with Keras and GloVe embeddings

In [1]:
import gensim
# Load the pre-trained GloVe vectors (already converted to word2vec text
# format) into a gensim KeyedVectors object used for lookups below.
g = gensim.models.KeyedVectors.load_word2vec_format(
    "glove.6B.200d.w2v.txt",
    binary=False,
)

In [2]:
# Sanity check of the loaded vectors: the ten nearest neighbours of "hello".
g.most_similar(positive=["hello"], topn=10)

[('goodbye', 0.6602526903152466),
 ('!', 0.6129173040390015),
 ('hey', 0.5987921357154846),
 ('muddah', 0.5824116468429565),
 ('dolly', 0.5414455533027649),
 ('yeah', 0.5124996900558472),
 ('`', 0.49999740719795227),
 ('wow', 0.49062684178352356),
 ('dear', 0.4800094962120056),
 ('daddy', 0.47699397802352905)]

In [14]:
import nltk
# English stop words from NLTK, kept in a set for fast membership tests.
stopwords = set()
stopwords.update(nltk.corpus.stopwords.words('english'))

In [26]:
import numpy as np

In [19]:
# Read one review per line and reduce it to a list of informative tokens.
# Tokens are lower-cased BEFORE filtering, so the stop-word test, the length
# test and the embedding-vocabulary lookup all see exactly the form that is
# stored. (Previously the raw token was filtered and only then lower-cased,
# so e.g. a capitalised stop word could slip through, and a capitalised word
# was looked up in `g` in its raw case.)
with open("reviews.txt") as f:
    reviews = [
        [
            token
            for token in line.lower().split()
            if token not in stopwords and len(token) > 2 and token in g
        ]
        for line in f
    ]

In [27]:
# One label per line: 'positive' becomes 1, every other value becomes 0.
with open("labels.txt") as f:
    labels = np.array([int(row.strip() == 'positive') for row in f])

In [54]:
# max_words: length (in tokens) every review is padded/truncated to.
# embedding_dim: width of each word vector (the 200d GloVe file is used).
max_words, embedding_dim = 100, 200

In [76]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Re-join the cleaned token lists into strings, fit a Keras tokenizer on
# them, and convert every review into a row of word indices of length
# `max_words`.
texts = list(map(" ".join, reviews))
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(texts)
vocab = tokenizer.word_index  # word -> integer index mapping
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=max_words)

In [78]:
# Embedding matrix: row i holds the GloVe vector of the word whose tokenizer
# index is i. Rows for words missing from `g` are left as zeros; the extra
# "+ 1" row makes index len(vocab) addressable.
glove_emb = np.zeros((len(vocab) + 1, embedding_dim))
for token, row in vocab.items():
    if token not in g:
        continue
    glove_emb[row] = g[token]

In [49]:
from tensorflow import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

In [79]:
# Start the model with an embedding layer initialised from the GloVe matrix
# and frozen (trainable=False) so the pre-trained vectors are not updated.
model = Sequential([
    Embedding(
        len(vocab) + 1,
        embedding_dim,
        weights=[glove_emb],
        input_length=max_words,
        trainable=False,
    ),
])

In [80]:
# Recurrent encoder followed by a single sigmoid unit for the binary label.
model.add(
    LSTM(
        units=128,
        dropout=0.3,
        recurrent_dropout=0.3,
    )
)
model.add(
    Dense(
        units=1,
        activation='sigmoid',
    )
)

In [85]:
# Binary classification setup: cross-entropy loss, RMSprop, accuracy metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [82]:
# The first 20000 examples are used for training; everything after that is
# held out for validation.
X_train, X_val = data[:20000], data[20000:]
y_train, y_val = labels[:20000], labels[20000:]

In [86]:
# Train for two epochs, reporting metrics on the held-out split each epoch.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=2,
    validation_data=(X_val, y_val),
)



Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa8e2fdc828>

In [92]:
# Score a single hand-written review. The cell previously only assigned `s`,
# which cannot produce the prediction array recorded as its output; the
# scoring expression is restored so a fresh run reproduces it.
# The string is wrapped in a one-element list because the tokenizer works on
# a list of texts.
s = "cool movie !"
model.predict(pad_sequences(tokenizer.texts_to_sequences([s]), maxlen=max_words))

array([[0.6096567]], dtype=float32)

In [95]:
def test_on_texts(s):
    """Print the model's predicted score for each text in `s`.

    Args:
        s: A list of raw text strings, or a single string (treated as one
            text).

    The texts are converted to index sequences with the fitted `tokenizer`,
    padded to `max_words`, and passed to `model.predict`. The resulting array
    (one row per text) is printed; nothing is returned. Training labels use
    1 for 'positive', so higher scores indicate a more positive prediction.
    """
    # A bare string would otherwise be iterated character by character,
    # yielding one (meaningless) prediction per character.
    if isinstance(s, str):
        s = [s]
    encoded = tokenizer.texts_to_sequences(s)
    padded = pad_sequences(encoded, maxlen=max_words)
    print(model.predict(padded))

In [97]:
# A handful of hand-written reviews covering positive, negative and neutral
# wording, scored with the trained model.
sample_reviews = [
    'this was a great movie',
    'i really liked it',
    'terrible film, hated it',
    'the movie was so-so',
    'oh yeah, great movie, a new godfather',
    'i would watch it again and again and again',
]
test_on_texts(sample_reviews)

[[0.7248762 ]
 [0.65714085]
 [0.27267292]
 [0.49111417]
 [0.74458504]
 [0.5209482 ]]
