Project: 23-153 / AAGGY
Commit: 356a0da0 (parent 86b0672c), authored Nov 03, 2023 by Sajana_it20194130
Changes: 1 changed file, 108 additions, 0 deletions

untitled3.py (new file, mode 100644, +108/-0)
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib

# Loading Dataset
data = pd.read_csv('malicious_phish.csv')

# Preprocessing and Feature Engineering
def extract_features(url):
    url_length = len(url)
    special_chars = len(re.findall(r'[!@#$%^&*(),.?":{}|<>]', url))
    return [url_length, special_chars]
# Extract domain and path features
def extract_domain_features(url):
    # Extract domain from URL
    domain = re.search(r'://(.*?)/', url)
    if domain:
        domain = domain.group(1)
    else:
        domain = ""
    return domain

def extract_path_features(url):
    # Extract path from URL
    path = re.search(r'://.*?(/.*)$', url)
    if path:
        path = path.group(1)
        # Additional path features will be extracted here
    else:
        path = ""
    return path
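# Note (an observation, not part of the original flow): the two extractors above
# are never called downstream. A minimal sketch of how their outputs could feed
# extra numeric features; hypothetical wiring, kept commented out so the script's
# behavior is unchanged:
#
#   domain_lengths = data['url'].apply(extract_domain_features).str.len()
#   path_lengths = data['url'].apply(extract_path_features).str.len()
#   extra_features = np.column_stack([domain_lengths, path_lengths])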
# Split the dataset into text data and engineered features
text_data = data['url']
engineered_data = np.array(data['url'].apply(extract_features).tolist())

# Standardize the engineered features
scaler = StandardScaler()
engineered_data = scaler.fit_transform(engineered_data)
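# Caveat (an observation, not in the original): the scaler above is fit on all
# rows before the train/test split below, which leaks test-set statistics into
# training, and engineered_pipeline later re-standardizes the same columns with
# its own StandardScaler step. A leakage-free sketch would scale after the split
# instead, using the split variables defined below; kept commented out:
#
#   scaler = StandardScaler().fit(X_engineered_train)
#   X_engineered_train = scaler.transform(X_engineered_train)
#   X_engineered_test = scaler.transform(X_engineered_test)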
# Split the data into training and testing sets
X_text_train, X_text_test, X_engineered_train, X_engineered_test, y_train, y_test = train_test_split(
    text_data, engineered_data, data['type'], test_size=0.2, random_state=42
)
# Create separate pipelines for the text data and the engineered features
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=100, stop_words='english')),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])
engineered_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])
# Define hyperparameters to search
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10]
}
# Create grid search pipelines for text data and engineered features
grid_search_text = GridSearchCV(text_pipeline, param_grid, cv=5, n_jobs=-1)
grid_search_engineered = GridSearchCV(engineered_pipeline, param_grid, cv=5, n_jobs=-1)
# Fit the grid search pipelines to the training data
grid_search_text.fit(X_text_train, y_train)
grid_search_engineered.fit(X_engineered_train, y_train)
# Get the best estimators and their parameters
best_text_pipeline = grid_search_text.best_estimator_
best_text_params = grid_search_text.best_params_
best_engineered_pipeline = grid_search_engineered.best_estimator_
best_engineered_params = grid_search_engineered.best_params_
# Use the best pipelines for predictions
y_pred_text = best_text_pipeline.predict(X_text_test)
y_pred_engineered = best_engineered_pipeline.predict(X_engineered_test)
# Evaluate the models
accuracy_text = accuracy_score(y_test, y_pred_text)
accuracy_engineered = accuracy_score(y_test, y_pred_engineered)
print("Text pipeline accuracy:", accuracy_text)
print("Engineered-features pipeline accuracy:", accuracy_engineered)
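# classification_report is imported above but never called; a natural follow-up
# (a sketch, not in the original) printing per-class precision/recall for both models:
print(classification_report(y_test, y_pred_text))
print(classification_report(y_test, y_pred_engineered))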
# Save both pipelines to joblib files
joblib.dump(best_text_pipeline, 'best_text_classifier.joblib')
joblib.dump(best_engineered_pipeline, 'best_engineered_classifier.joblib')
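# Usage sketch (an assumption, not in the original; 'http://example.com/login' is
# a made-up example URL): reload a saved pipeline and classify a new URL. The text
# pipeline accepts raw URL strings because TfidfVectorizer is its first step.
loaded_text_pipeline = joblib.load('best_text_classifier.joblib')
print(loaded_text_pipeline.predict(['http://example.com/login']))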