## Simple sentiment analysis with Keras and FastText embeddings

In [14]:
import gensim
from gensim.models.wrappers import FastText

In [15]:
# Load the FastText model stored in Facebook's binary format.
# The file is expected in the notebook's working directory.
fasttext_path = 'wiki.en.bin'
g = FastText.load_fasttext_format(fasttext_path)

In [83]:
# Sanity check of the embedding space: nearest neighbours of "final_fantasy".
# The query goes through `g.wv` (the keyed vectors) because calling
# `most_similar` on the model object itself is deprecated in gensim 3.x and
# emits a DeprecationWarning on every call.
g.wv.most_similar(positive=["final_fantasy"], topn=200)

  """Entry point for launching an IPython kernel.


[('final_fantasy_x', 0.9591085910797119),
 ('final_fantasy_xi', 0.9552675485610962),
 ('final_first', 0.7867688536643982),
 ('final_', 0.7800595164299011),
 ('final_third', 0.7536799907684326),
 ('warhammer_fantasy_stubs', 0.7517883777618408),
 ('_royal_society_prize_for_junior_science_book', 0.7474319934844971),
 ('/gaymers_on_trial_the_ecas_hal_', 0.7471776008605957),
 ('national_book_award_for_fiction', 0.7467536330223083),
 ('final_score', 0.7466115951538086),
 ('coupe_de_france_final_', 0.7452429533004761),
 ('fictional_firearms', 0.7438316941261292),
 ('international_criminal_tribunal_for_rwanda_judges', 0.7430262565612793),
 ('man_booker_prize_for_fiction', 0.7427597641944885),
 ('finalfantasy', 0.7422560453414917),
 ('_wine_investment_beats_recession', 0.7418316602706909),
 ('variable_of_final_song', 0.7414098978042603),
 ('real_happy_tree_friends', 0.7412666082382202),
 ('/crue_sued_manager_fires_back_in_motley_feud', 0.7410304546356201),
 ('/harry_potter_and_the_halfblood_pri

In [17]:
import nltk
# English stop words from NLTK, held in a set for fast membership tests.
stopwords = {*nltk.corpus.stopwords.words('english')}

In [18]:
import numpy as np

In [19]:
# Tokenise the reviews (one review per line) into lists of cleaned tokens.
#
# Each token is lower-cased BEFORE it is filtered. Filtering the raw token and
# lower-casing afterwards lets capitalised stop words ("The", "And") slip
# through, because NLTK's stop-word list is lower-case, and it checks the
# vocabulary with a different string than the one that is finally kept.
#
# A token is kept when it is not a stop word, is longer than two characters
# and is known to the FastText vectors (`g.wv`; membership tests on the model
# object itself are deprecated in gensim 3.x).
with open("reviews.txt") as f:
    reviews = [
        [
            token
            for token in (raw.lower() for raw in line.split())
            if token not in stopwords and len(token) > 2 and token in g.wv
        ]
        for line in f
    ]

  


In [20]:
# Binary targets, one per line of labels.txt:
# 1 when the line reads 'positive', 0 for anything else.
with open("labels.txt") as f:
    labels = np.array([int(row.strip() == 'positive') for row in f])

In [25]:
# max_words: length every encoded review is padded/truncated to.
# embedding_dim: width of each row of the embedding matrix.
max_words, embedding_dim = 100, 300

In [23]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Re-join the cleaned tokens so the Keras tokenizer can build its own
# word -> integer index over them.
texts = [" ".join(tokens) for tokens in reviews]

tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(texts)
vocab = tokenizer.word_index

# Encode every review as a list of word indices, then pad/truncate each one
# to exactly `max_words` entries.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=max_words)

In [26]:
# Embedding matrix aligned with the tokenizer's indices: row i holds the
# FastText vector of the word whose index is i. The matrix has one more row
# than there are words; rows that are never assigned stay all-zero.
#
# NOTE: despite its name this matrix holds FastText vectors, not GloVe ones;
# the name is kept because the model-building cell refers to it.
#
# Lookups go through `g.wv`: `word in g` and `g[word]` on the model object are
# deprecated in gensim 3.x and emit a DeprecationWarning.
glove_emb = np.zeros((len(vocab) + 1, embedding_dim))
for word, i in vocab.items():
    if word in g.wv:
        glove_emb[i] = g.wv[word]

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [27]:
from tensorflow import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

In [28]:
# Start the network with an embedding layer initialised from the
# pre-computed matrix; trainable=False keeps those vectors fixed during fit.
model = Sequential()
embedding_layer = Embedding(
    len(vocab) + 1,
    embedding_dim,
    weights=[glove_emb],
    input_length=max_words,
    trainable=False,
)
model.add(embedding_layer)

In [29]:
# Recurrent encoder over the embedded sequence, followed by a single
# sigmoid unit that produces the binary sentiment score.
model.add(
    LSTM(units=128, dropout=0.3, recurrent_dropout=0.3)
)
model.add(
    Dense(units=1, activation='sigmoid')
)

In [30]:
# Binary classification set-up: cross-entropy loss, RMSprop optimiser,
# accuracy reported under the key 'acc'.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [31]:
# Positional hold-out split: the first 20000 rows are used for training,
# everything after them for validation.
X_train, X_val = data[:20000], data[20000:]
y_train, y_val = labels[:20000], labels[20000:]

In [32]:
# Train for 5 epochs in mini-batches of 128, evaluating on the held-out
# validation rows after each epoch.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=5,
    validation_data=(X_val, y_val),
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb81a509eb8>

In [92]:
# Score a single ad-hoc sentence.
# NOTE(review): the recorded output of this cell is a (1, 1) prediction array,
# but the cell only assigned the string; the prediction call below is
# reconstructed so that a fresh run produces an output of that shape.
# The string is wrapped in a list because `texts_to_sequences` expects a
# collection of texts (a bare string would be iterated character by character).
s = "cool movie !"
model.predict(pad_sequences(tokenizer.texts_to_sequences([s]), maxlen=max_words))

array([[0.6096567]], dtype=float32)

In [34]:
def test_on_texts(s):
    print(model.predict(pad_sequences(tokenizer.texts_to_sequences(s), maxlen=max_words)))

In [35]:
# A handful of hand-written reviews to eyeball the trained model's scores.
sample_reviews = [
    'this was a great movie',
    'i really liked it',
    'terrible film, hated it',
    'the movie was so-so',
    'oh yeah, great movie, a new godfather',
    'i would watch it again and again and again',
]
test_on_texts(sample_reviews)

[[0.84083194]
 [0.7083693 ]
 [0.18270014]
 [0.52240455]
 [0.8464341 ]
 [0.7111294 ]]
