# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib
# Loading Dataset
data = pd.read_csv('malicious_phish.csv')
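# Note: the steps below assume the CSV provides a 'url' column with raw URL
# strings and a 'type' column with the class label, as referenced throughout.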
# Preprocessing and Feature Engineering
def extract_features(url):
    url_length = len(url)
    special_chars = len(re.findall(r'[!@#$%^&*(),.?":{}|<>]', url))
    return [url_length, special_chars]
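# Illustrative check of the feature vector (hypothetical URL, values worked
# out against the regex above):
#   extract_features("http://example.com/a?b=1") -> [24, 3]
#   (length 24; the special characters ':', '.', '?' match the pattern)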
# Extract domain and path features
def extract_domain_features(url):
    # Extract domain from URL
    domain = re.search(r'://(.*?)/', url)
    if domain:
        domain = domain.group(1)
    else:
        domain = ""
    return domain
def extract_path_features(url):
    # Extract path from URL
    path = re.search(r'://.*?(/.*)$', url)
    if path:
        path = path.group(1)
        # Additional path features will be extracted here
    else:
        path = ""
    return path
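# Illustrative examples (hypothetical URL). Note that the domain regex
# requires a '/' after the host, so a URL with no path such as
# "http://example.com" yields an empty domain string:
#   extract_domain_features("http://example.com/login") -> "example.com"
#   extract_path_features("http://example.com/login")   -> "/login"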
# Split the dataset into text data and engineered features
text_data = data['url']
engineered_data = np.array(data['url'].apply(extract_features).tolist())
# Scaling is handled by the StandardScaler step inside the engineered
# pipeline below; fitting a scaler on the full dataset before the
# train/test split would leak test-set statistics into training.
# Split the data into training and testing sets
X_text_train, X_text_test, X_engineered_train, X_engineered_test, y_train, y_test = train_test_split(
    text_data, engineered_data, data['type'], test_size=0.2, random_state=42
)
# Create separate pipelines for the raw URL text and the engineered numeric features
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=100, stop_words='english')),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])
engineered_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])
# Define hyperparameters to search
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10]
}
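# 3 x 3 x 3 = 27 parameter combinations; with cv=5, each grid search below
# therefore fits 27 * 5 = 135 models, which is why n_jobs=-1 is used.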
# Create grid search pipelines for text data and engineered features
grid_search_text = GridSearchCV(text_pipeline, param_grid, cv=5, n_jobs=-1)
grid_search_engineered = GridSearchCV(engineered_pipeline, param_grid, cv=5, n_jobs=-1)
# Fit the grid search pipelines to the training data
grid_search_text.fit(X_text_train, y_train)
grid_search_engineered.fit(X_engineered_train, y_train)
# Get the best estimators and their parameters
best_text_pipeline = grid_search_text.best_estimator_
best_text_params = grid_search_text.best_params_
best_engineered_pipeline = grid_search_engineered.best_estimator_
best_engineered_params = grid_search_engineered.best_params_
# Use the best pipelines for predictions
y_pred_text = best_text_pipeline.predict(X_text_test)
y_pred_engineered = best_engineered_pipeline.predict(X_engineered_test)
# Evaluate the models
accuracy_text = accuracy_score(y_test, y_pred_text)
accuracy_engineered = accuracy_score(y_test, y_pred_engineered)
print("Accuracy:", accuracy_text)
print("Accuracy:", accuracy_engineered)
# Save both pipelines to joblib files
joblib.dump(best_text_pipeline, 'best_text_classifier.joblib')
joblib.dump(best_engineered_pipeline, 'best_engineered_classifier.joblib')
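# Minimal reload check (sketch): the text pipeline accepts raw URL strings
# directly, while the engineered pipeline would instead need extract_features
# applied to the URL first. The sample URL below is hypothetical.
loaded_text_model = joblib.load('best_text_classifier.joblib')
print("Reloaded prediction:", loaded_text_model.predict(['http://example.com/login']))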