23-153 / AAGGY · Commits · 356a0da0

Commit 356a0da0, authored Nov 03, 2023 by Sajana_it20194130 (parent 86b0672c).
Showing 1 changed file with 108 additions and 0 deletions.

untitled3.py  +108 −0  (new file, 0 → 100644)
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib

# Loading Dataset
data = pd.read_csv('malicious_phish.csv')
# Preprocessing and Feature Engineering
def extract_features(url):
    url_length = len(url)
    special_chars = len(re.findall(r'[!@#$%^&*(),.?":{}|<>]', url))
    return [url_length, special_chars]
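# Illustrative check (not in the original commit): extract_features returns
# [url_length, special_char_count]. For example:
#   extract_features('http://example.com/login?user=1')
# gives [31, 3] -- 31 characters, and three characters (':', '.', '?')
# that fall inside the regex character class above.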
# Extract domain and path features
def extract_domain_features(url):
    # Extract domain from URL
    domain = re.search(r'://(.*?)/', url)
    if domain:
        domain = domain.group(1)
    else:
        domain = ""
    return domain

def extract_path_features(url):
    # Extract path from URL
    path = re.search(r'://.*?(/.*)$', url)
    if path:
        path = path.group(1)
        # Additional path features will be extracted here
    else:
        path = ""
    return path
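# Note: extract_domain_features and extract_path_features are defined but never
# called later in this script; only extract_features feeds the models. For
# reference, what they return if wired in:
#   extract_domain_features('http://example.com/login')  # -> 'example.com'
#   extract_path_features('http://example.com/login')    # -> '/login'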
# Split the dataset into text data and engineered features
text_data = data['url']
engineered_data = np.array(data['url'].apply(extract_features).tolist())

# Standardize the engineered features
scaler = StandardScaler()
engineered_data = scaler.fit_transform(engineered_data)
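# Caveat (reviewer note, not in the original commit): fitting the scaler on the
# full dataset before the train/test split lets test-set statistics leak into
# training, and engineered_pipeline below applies a second StandardScaler on
# top of this one. A leak-free variant would skip this pre-scaling and let the
# pipeline's own scaler fit on the training folds only.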
# Split the data into training and testing sets
X_text_train, X_text_test, X_engineered_train, X_engineered_test, y_train, y_test = train_test_split(
    text_data, engineered_data, data['type'], test_size=0.2, random_state=42
)
# Create separate pipelines for the text data and the engineered features
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=100, stop_words='english')),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])

engineered_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])
# Define hyperparameters to search
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10]
}
# Create grid search pipelines for text data and engineered features
grid_search_text = GridSearchCV(text_pipeline, param_grid, cv=5, n_jobs=-1)
grid_search_engineered = GridSearchCV(engineered_pipeline, param_grid, cv=5, n_jobs=-1)
# Fit the grid search pipelines to the training data
grid_search_text.fit(X_text_train, y_train)
grid_search_engineered.fit(X_engineered_train, y_train)
# Get the best estimators and their parameters
best_text_pipeline = grid_search_text.best_estimator_
best_text_params = grid_search_text.best_params_
best_engineered_pipeline = grid_search_engineered.best_estimator_
best_engineered_params = grid_search_engineered.best_params_
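# Optional inspection (assumed helpful, not in the original commit): report the
# tuned hyperparameters each grid search settled on.
print("Best text-pipeline params:", best_text_params)
print("Best engineered-pipeline params:", best_engineered_params)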
# Use the best pipelines for predictions
y_pred_text = best_text_pipeline.predict(X_text_test)
y_pred_engineered = best_engineered_pipeline.predict(X_engineered_test)
# Evaluate the models
accuracy_text = accuracy_score(y_test, y_pred_text)
accuracy_engineered = accuracy_score(y_test, y_pred_engineered)
print("Text pipeline accuracy:", accuracy_text)
print("Engineered-features pipeline accuracy:", accuracy_engineered)
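# classification_report is imported above but never used; a minimal sketch of
# per-class precision/recall/F1 for both models (using the same y_test):
print(classification_report(y_test, y_pred_text))
print(classification_report(y_test, y_pred_engineered))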
# Save both pipelines to joblib files
joblib.dump(best_text_pipeline, 'best_text_classifier.joblib')
joblib.dump(best_engineered_pipeline, 'best_engineered_classifier.joblib')
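# Minimal usage sketch (assumptions: the joblib files above exist and new URLs
# are preprocessed exactly as during training):
#
#   loaded_text_clf = joblib.load('best_text_classifier.joblib')
#   print(loaded_text_clf.predict(['http://example.com/login']))
#
#   loaded_eng_clf = joblib.load('best_engineered_classifier.joblib')
#   feats = scaler.transform(np.array([extract_features('http://example.com/login')]))
#   print(loaded_eng_clf.predict(feats))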