Commit e7eccb44 authored by sashika sewwandi's avatar sashika sewwandi

Upload New File

parent b90f08ef
import pandas as pd
import numpy as np
import seaborn as sb
import sklearn as sk
import warnings
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
# Silence library warnings globally (mostly sklearn convergence chatter).
warnings.filterwarnings('ignore')
# Read the two labelled data sets (valid vs. invalid tweets).
dataset_01 = pd.read_csv("valid_avengers_dataset.csv")
dataset_02 = pd.read_csv('invalid_thor_dataset.csv')
# Combine both data sets into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# a single pd.concat call replaces the empty-frame + two-append sequence.
df = pd.concat([dataset_01, dataset_02], ignore_index=True)
# clean up step
# Assign every tweet id group one shared synthetic view count; the lambda
# ignores its argument, so transform broadcasts a single random scalar
# back onto all rows of each group.
df['views'] = df.groupby('id')['id'].transform(
    lambda _: np.random.randint(5000, 10000000))
# Drop the metadata columns that are not relevant to the analysis
# (they are not numeric).
cols_to_drop = [
    'id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
    'user_id', 'username', 'name', 'place', 'tweet', 'mentions', 'urls',
    'photos', 'hashtags', 'cashtags', 'link', 'quote_url', 'thumbnail',
    'near', 'geo', 'source', 'user_rt_id', 'user_rt', 'retweet_id',
    'reply_to', 'retweet_date', 'translate', 'trans_src', 'trans_dest',
    'replies_count', 'retweets_count',
]
df = df.drop(columns=cols_to_drop)
# Fill missing retweet flags with True, then drop any row that still has
# missing data and remove exact duplicate rows.
df['retweet'] = df['retweet'].fillna(True)
df = df.dropna()
df.drop_duplicates(inplace=True)
# One-hot encode the categorical feature columns.
categorical_cols = ['architecture', 'retweet', 'sex', 'language']
# Build one indicator frame per column, preserving the original order.
dummy_frames = [pd.get_dummies(df[col]) for col in categorical_cols]
# Append the indicator columns to the table...
df = pd.concat([df] + dummy_frames, axis=1)
# ...and drop the now-redundant source columns.
df = df.drop(categorical_cols, axis=1)
# model training step
# Separate the dataframe into feature matrix X and target vector y.
y = df['validation'].values
# Select features by dropping the target column by NAME. The original code
# did np.delete(X, 1, axis=1), which removes whatever column happens to sit
# at position 1 — if 'validation' is not column 1 the target silently stays
# in X (label leakage) while a real feature is thrown away.
X = df.drop('validation', axis=1).values
# Split the dataset into 70% Training and 30% Test (fixed seed for
# reproducible splits across runs).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
# Decision Tree Classifier algorithm (depth-limited to curb overfitting).
dt_clf = tree.DecisionTreeClassifier(max_depth=5)
dt_clf.fit(X_train, y_train)
print("Decision Tree Classifier score: ", dt_clf.score(X_test, y_test))
y_pred = dt_clf.predict(X_test)
# The confusion matrix describes the decision tree's test predictions; the
# original computed it and discarded the return value — print it instead.
print(confusion_matrix(y_test, y_pred))
# Random Forest Classifier algorithm (default depth, 100 trees).
rf_clf = ensemble.RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train)
print("Random Forest Classifier algorrithm score", rf_clf.score(X_test, y_test))
# Random Forest Classifier algorithm after tune up (more trees, capped depth,
# fixed seed for reproducibility).
RF = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
RF.fit(X_train, y_train)
print("Random Forest Classifier algorrithm after tune up")
# Evaluate on the held-out test set only. The original scored on the full
# X/y, which includes the training rows and therefore overstates accuracy.
print("Random Forest Classifier algorrithm round up score", round(RF.score(X_test, y_test), 4))
print("Random Forest Classifier algorrithm score", RF.score(X_test, y_test))
# Gradient Boosting Classifier with default hyper-parameters.
gb_clf = ensemble.GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)
print("Gradient Boosting Classifier algorrithm score", gb_clf.score(X_test, y_test))
# Gradient Boosting Classifier after tune up (fewer boosting stages).
gb_clf = ensemble.GradientBoostingClassifier(n_estimators=50).fit(X_train, y_train)
print("Gradient Boosting Classifier algorrithm after tune up")
print("Gradient Boosting Classifier algorrithm score", gb_clf.score(X_test, y_test))
# Instantiate the Logistic Regression model (lbfgs solver, fixed seed).
logreg = LogisticRegression(solver='lbfgs', random_state=0)
# Fit the model on the training split only.
logreg.fit(X_train, y_train)
# Evaluate on the held-out test set. The original rounded score used the full
# X/y (training rows included), which inflates the reported accuracy.
print("Logistic Regression Classifier algorrithm round up score: ", round(logreg.score(X_test, y_test), 4))
print("Logistic Regression Classifier algorrithm score: ", logreg.score(X_test, y_test))
# Tuned variant: explicit multinomial loss instead of one-vs-rest.
LR = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)
print("Logistic Regression Classifier algorrithm after tune up")
print("Logistic Regression Classifier algorrithm round up score: ", round(LR.score(X_test, y_test), 4))
print("Logistic Regression Classifier algorrithm score: ", LR.score(X_test, y_test))
# Linear support vector machine baseline.
SVM = svm.LinearSVC()
SVM.fit(X_train, y_train)
# Score on the held-out test set. The original rounded score used the full
# X/y (training rows included), overstating accuracy.
print("Suport vector machine classifier algorrithm round up score: ", round(SVM.score(X_test, y_test), 4))
print("Suport vector machine Classifier algorrithm score: ", SVM.score(X_test, y_test))
# Tuned variant: kernel SVC with one-vs-one multi-class decision function.
SVM = svm.SVC(decision_function_shape="ovo").fit(X_train, y_train)
print("Suport vector machine Classifier algorrithm after tune up")
print("Suport vector machine classifier algorrithm round up score: ", round(SVM.score(X_test, y_test), 4))
print("Suport vector machine classifier algorrithm score: ", SVM.score(X_test, y_test))
# Multi-layer perceptron baseline (small two-layer network, fixed seed).
NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
NN.fit(X_train, y_train)
# Evaluate on the held-out test set. The original scored both the baseline
# and the tuned network on the full X/y (training rows included), so the
# tuned model's reported score never touched unseen data at all.
print("Multi layer perceptron Classifier algorrithm round up score: ", round(NN.score(X_test, y_test), 4))
print("Multi layer perceptron classifier algorrithm score: ", NN.score(X_test, y_test))
# Tuned variant: wider hidden layers.
NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(150, 10), random_state=1).fit(X_train, y_train)
print("Multi layer perceptron Classifier algorrithm after tune up")
print("Multi layer perceptron Classifier algorrithm round up score: ", round(NN.score(X_test, y_test), 4))
print("Multi layer perceptron classifier algorrithm score: ", NN.score(X_test, y_test))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment