diff options
author | Luca Deri <deri@ntop.org> | 2024-10-26 21:15:36 +0200 |
---|---|---|
committer | Luca Deri <deri@ntop.org> | 2024-10-26 21:15:36 +0200 |
commit | f5d903caadb00b3e2f68c74cf9da7a19cf4545f7 (patch) | |
tree | aac33900efd9dc38b3cdb15e563055428b3765b0 /dga/scikit-learn_tests | |
parent | 0fb30c857d3f54546e8de61cd5234c2860474369 (diff) |
Moved new DGA code
Diffstat (limited to 'dga/scikit-learn_tests')
-rw-r--r-- | dga/scikit-learn_tests/test_script.py | 23 | ||||
-rw-r--r-- | dga/scikit-learn_tests/train_script.py | 50 |
2 files changed, 73 insertions, 0 deletions
diff --git a/dga/scikit-learn_tests/test_script.py b/dga/scikit-learn_tests/test_script.py
new file mode 100644
index 000000000..4ded249f8
--- /dev/null
+++ b/dga/scikit-learn_tests/test_script.py
@@ -0,0 +1,23 @@
+import joblib
+from sklearn.neural_network import MLPClassifier
+from sklearn.metrics import classification_report, accuracy_score
+import time
+
+mlp = joblib.load('mlp_model.joblib')  # the four artifacts loaded here are the ones train_script.py dumps
+X_test = joblib.load('X_test.joblib')  # saved after vectorization + SVD, so no re-transformation happens here
+y_test = joblib.load('y_test.joblib')
+label_encoder = joblib.load('label_encoder.joblib')
+
+# Perform prediction (timed with wall-clock time)
+start = time.time()
+y_pred = mlp.predict(X_test)
+print(f"Prediction time: {time.time()-start:.2f} seconds")
+
+# Evaluate the model
+accuracy = accuracy_score(y_test, y_pred)
+report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
+
+# Print the results
+print(f"Accuracy: {accuracy:.4f}")
+print("\nClassification Report:")
+print(report)
diff --git a/dga/scikit-learn_tests/train_script.py b/dga/scikit-learn_tests/train_script.py
new file mode 100644
index 000000000..040c3966a
--- /dev/null
+++ b/dga/scikit-learn_tests/train_script.py
@@ -0,0 +1,50 @@
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.preprocessing import LabelEncoder
+from sklearn.neural_network import MLPClassifier
+from sklearn.metrics import classification_report, accuracy_score
+from sklearn.decomposition import TruncatedSVD
+import pandas as pd
+import time
+import joblib
+
+df = pd.read_csv("../dga_domains_full.csv", header=None, names=["label", "family", "domain"])
+df = df[["label", "domain"]]  # the "family" column is dropped and not used below
+
+# Label Encoding and Domain Vectorization Representation
+label_encoder = LabelEncoder()
+df["label_encoded"] = label_encoder.fit_transform(df["label"])
+vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 4)) # Use 2 to 4 character n-grams
+X = vectorizer.fit_transform(df["domain"])  # NOTE(review): fitted on the full dataset, before the train/test split below
+joblib.dump(label_encoder, "label_encoder.joblib")  # NOTE(review): vectorizer and svd are never dumped by this script
+
+# Dimensionality Reduction
+svd = TruncatedSVD(n_components=100) # Set the number of components as needed; NOTE(review): no random_state passed here, unlike the split and the MLP
+X_reduced = svd.fit_transform(X)
+
+# Split the dataset into training and test sets
+X_train, X_test, y_train, y_test = train_test_split(X_reduced, df["label_encoded"], test_size=0.1, shuffle=True, random_state=27)
+joblib.dump(X_test, "X_test.joblib")
+joblib.dump(y_test, "y_test.joblib")
+
+# Initialization and training
+mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=27)
+
+start = time.time()
+mlp.fit(X_train, y_train)
+print(f"Tempo di addestramento: {time.time()-start:.2f} secondi")
+
+# Make predictions on the test set
+start = time.time()
+y_pred = mlp.predict(X_test)
+print(f"Tempo di previsione: {time.time()-start:.2f} secondi")
+
+# Evaluate the model's performance
+accuracy = accuracy_score(y_test, y_pred)
+report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
+
+# Print the results
+print(f"Accuratezza: {accuracy:.4f}")
+print("\nClassification Report:")
+print(report)
+joblib.dump(mlp, 'mlp_model.joblib')
\ No newline at end of file |