## Text categorization with numpy helper functions

In [None]:
import string
import collections
import glob
import codecs
import numpy as np
import scipy as sp
from scipy.sparse import lil_matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Translation table that deletes every ASCII punctuation character; the
# tokenizer applies it to strip punctuation before splitting text into words.
translator = str.maketrans(dict.fromkeys(string.punctuation))

In [None]:
import nltk

In [None]:
# Shared NLP resources used by the tokenizer: the English Snowball stemmer and
# NLTK's English stop-word list, held in a set for fast membership tests.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand — confirm for fresh environments.
stemmer = nltk.stem.SnowballStemmer("english")
stopwords = set(nltk.corpus.stopwords.words("english"))

In [None]:
# Quick interactive check of the stemmer on a sample string.
# NOTE(review): the argument contains two words; presumably the stemmer treats
# the whole string as a single token — confirm this is the intended check.
stemmer.stem("helicopters flying")

In [None]:
def my_tokenizer(s):
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    The text is lower-cased, stripped of punctuation via ``translator``,
    split with NLTK's word tokenizer, filtered against ``stopwords`` and
    finally stemmed with ``stemmer``.

    Args:
        s: The document text to tokenize.

    Returns:
        A list with one stemmed token per surviving word, in document order.
    """
    # NOTE(review): punctuation (including apostrophes) is removed before the
    # stop-word check, so stop-word entries that contain an apostrophe can
    # presumably never match here — confirm whether that matters.
    cleaned = s.lower().translate(translator)
    tokens = []
    for word in nltk.tokenize.word_tokenize(cleaned):
        if word in stopwords:
            continue
        tokens.append(stemmer.stem(word))
    return tokens

In [None]:
# Plain bag-of-words baseline: every training file (one sub-directory per
# newsgroup) is read by the vectorizer itself; terms seen in fewer than 10
# documents are dropped.
cv = CountVectorizer(
    input='filename',
    encoding='latin1',
    min_df=10,
)
X = cv.fit_transform([
    path
    for group_dir in glob.glob('20news-bydate-train/*')
    for path in glob.glob(group_dir + '/*')
])

In [None]:
# TF-IDF features over unigrams and bigrams produced by my_tokenizer.
# min_df / max_df are integers here, i.e. absolute document counts: a term is
# kept only if it occurs in at least 10 and at most 200 training documents.
# This replaces the bag-of-words matrix previously stored in X.
tfv = TfidfVectorizer(
    input='filename',
    encoding='latin1',
    min_df=10,
    max_df=200,
    ngram_range=(1, 2),
    tokenizer=my_tokenizer,
)
X = tfv.fit_transform([
    path
    for group_dir in glob.glob('20news-bydate-train/*')
    for path in glob.glob(group_dir + '/*')
])

In [None]:
# Class names are the sub-directory names of the training folder; sorting
# them gives every class a reproducible integer id.
labels_array = sorted({
    group_dir.split('/')[-1]
    for group_dir in glob.glob('20news-bydate-train/*')
})
labels_dict = dict(zip(labels_array, range(len(labels_array))))
# One label per training file, produced with the same nested glob traversal
# that builds the feature matrix so rows and labels line up.
y = np.array([
    labels_dict[group_dir.split('/')[-1]]
    for group_dir in glob.glob('20news-bydate-train/*')
    for _ in glob.glob(group_dir + '/*')
])

In [None]:
# Vectorize the held-out files with the already-fitted TF-IDF vectorizer
# (transform only, so the test set does not influence the vocabulary).
X_test = tfv.transform([
    path
    for group_dir in glob.glob('20news-bydate-test/*')
    for path in glob.glob(group_dir + '/*')
])
# One label per test file. The class name is the LAST path component of the
# newsgroup directory, matching how the training labels are derived; a fixed
# index of 1 would break as soon as the dataset path gained a prefix.
y_test = np.array([
    labels_dict[group_dir.split('/')[-1]]
    for group_dir in glob.glob('20news-bydate-test/*')
    for _ in glob.glob(group_dir + '/*')
])

In [None]:
# Sanity check: X / y and X_test / y_test should agree on their number of
# rows, and X and X_test should have the same number of feature columns.
X.shape, y.shape, X_test.shape, y_test.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
# Fit a multinomial naive Bayes classifier on the TF-IDF features.
model = MultinomialNB()
model.fit(X, y)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back so older installations keep working.
if hasattr(tfv, 'get_feature_names_out'):
    vocab_array = tfv.get_feature_names_out()
else:
    vocab_array = tfv.get_feature_names()
# MultinomialNB.coef_ was deprecated and later removed from scikit-learn.
# feature_log_prob_ holds the per-class feature log-probabilities and is
# available in every release.
# NOTE(review): assumes more than two classes, where coef_ mirrored
# feature_log_prob_ row for row — confirm if this is reused on binary data.
model.feature_log_prob_.shape
# The 20 terms of class 0 whose log-probability is closest to zero, i.e. the
# most probable terms for that class, as (term, log-probability) pairs.
[
    (vocab_array[idx], weight)
    for idx, weight in sorted(
        enumerate(model.feature_log_prob_[0]),
        key=lambda pair: np.abs(pair[1]),
    )[:20]
]

In [None]:
def test_model(model, name=None):
    """Fit ``model`` on the training data and print train/test scores.

    Training uses the module-level ``X`` and ``y``; evaluation additionally
    uses ``X_test`` and ``y_test``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        name: Optional label printed on its own line before the scores.

    Returns:
        None. The scores are only printed.
    """
    if name is not None:
        print(name)
    model.fit(X, y)
    train_score = model.score(X, y)
    test_score = model.score(X_test, y_test)
    print("Train set: %.5f\tTest set: %.5f" % (train_score, test_score))

# Compare the two naive Bayes variants under the same train/test protocol.
test_model(MultinomialNB(), name='MultinomialNB')
test_model(BernoulliNB(), name='BernoulliNB')

In [None]:
from sklearn.svm import LinearSVC, SVC
# Linear support vector classifier, evaluated like the models above.
test_model(LinearSVC(), name='LinearSVC')
# The RBF-kernel variant is left disabled:
# test_model(SVC(kernel='rbf'), 'SVC RBF')

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, evaluated like the models above.
test_model(LogisticRegression(), name='LogRegr')