import pandas as pd
import numpy as np
# Read the three CSV inputs from the working directory.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Look at the first rows of each frame.
train.head()
test.head()
sample_submission.head()

# Count missing values per column in the train and test frames.
train.isna().sum()
test.isna().sum()

# Count how many rows carry each value of the `target` column.
train.target.value_counts()
from sklearn.model_selection import train_test_split
# Raw text is the model input, the `target` column is the label.
X = train['text'].values
y = train['target'].values

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions equal in both parts; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=0,
)
print(X_train.shape, X_valid.shape, sep='\n')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# TF-IDF features over word 1- to 3-grams, ignoring English stop words and
# terms that occur in fewer than 3 documents.
# The on/off options are passed as real booleans: recent scikit-learn
# releases validate parameter types and no longer accept ints such as 1.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# Learn the vocabulary and IDF weights from the training split only.
# Fitting on the validation text as well would leak held-out statistics
# into the features and make the validation score optimistic.
X_train_tfv = tfv.fit_transform(X_train)
X_valid_tfv = tfv.transform(X_valid)
# Baseline: logistic regression trained on the TF-IDF features.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

clf = LogisticRegression(C=1.0).fit(X_train_tfv, y_train)
predictions = clf.predict(X_valid_tfv)

# F1 of the predictions on the validation split.
f1_score(y_true=y_valid, y_pred=predictions)
# Raw term counts over word 1- to 3-grams, ignoring English stop words.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# Build the vocabulary from the training split only. Including validation
# text would add columns the models never see during training and leak
# held-out information into the validation score.
X_train_ctv = ctv.fit_transform(X_train)
X_valid_ctv = ctv.transform(X_valid)
# Logistic regression again, this time on the raw count features.
clf = LogisticRegression(C=1.0).fit(X_train_ctv, y_train)
predictions = clf.predict(X_valid_ctv)

# F1 of the predictions on the validation split.
f1_score(y_true=y_valid, y_pred=predictions)
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with default settings on the TF-IDF features.
clf = MultinomialNB().fit(X_train_tfv, y_train)
predictions = clf.predict(X_valid_tfv)
f1_score(y_true=y_valid, y_pred=predictions)
# Multinomial naive Bayes with default settings on the count features.
clf = MultinomialNB().fit(X_train_ctv, y_train)
predictions = clf.predict(X_valid_ctv)
f1_score(y_true=y_valid, y_pred=predictions)
from xgboost import XGBClassifier
# Gradient-boosted trees on the TF-IDF features. The sparse matrices are
# converted to CSC format before being handed to XGBoost.
clf = XGBClassifier(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=10,
)
train_csc = X_train_tfv.tocsc()
clf.fit(train_csc, y_train)
predictions = clf.predict(X_valid_tfv.tocsc())
f1_score(y_true=y_valid, y_pred=predictions)
# Refit logistic regression on the TF-IDF features; `clf` from this step is
# the model used for the submission below.
clf = LogisticRegression(C=1.0).fit(X_train_tfv, y_train)
predictions = clf.predict(X_valid_tfv)
f1_score(y_true=y_valid, y_pred=predictions)
# Re-inspect the submission template and the test frame.
sample_submission.head()
test.head()

# Vectorize the test text with the already-fitted TF-IDF vectorizer and
# store the current classifier's predictions in the `target` column.
X_test = test['text'].values
X_test_tfv = tfv.transform(X_test)
test_predictions = clf.predict(X_test_tfv)
sample_submission['target'] = test_predictions
sample_submission.head()

# Write the result without the DataFrame index column.
sample_submission.to_csv("submission.csv", index=False)