about | summary | refs | log | tree | commit | diff
path: root/dga/scikit-learn_tests
diff options
context:
space:
mode:
author Luca Deri <deri@ntop.org> 2024-10-26 21:15:36 +0200
committer Luca Deri <deri@ntop.org> 2024-10-26 21:15:36 +0200
commit f5d903caadb00b3e2f68c74cf9da7a19cf4545f7 (patch)
tree aac33900efd9dc38b3cdb15e563055428b3765b0 /dga/scikit-learn_tests
parent 0fb30c857d3f54546e8de61cd5234c2860474369 (diff)
Moved new DGA code
Diffstat (limited to 'dga/scikit-learn_tests')
-rw-r--r-- dga/scikit-learn_tests/test_script.py 23
-rw-r--r-- dga/scikit-learn_tests/train_script.py 50
2 files changed, 73 insertions, 0 deletions
diff --git a/dga/scikit-learn_tests/test_script.py b/dga/scikit-learn_tests/test_script.py
new file mode 100644
index 000000000..4ded249f8
--- /dev/null
+++ b/dga/scikit-learn_tests/test_script.py
@@ -0,0 +1,23 @@
+import joblib
+from sklearn.neural_network import MLPClassifier
+from sklearn.metrics import classification_report, accuracy_score
+import time
+
+mlp = joblib.load('mlp_model.joblib')  # trained classifier written by train_script.py; NOTE: joblib.load unpickles, so only load files you produced yourself
+X_test = joblib.load('X_test.joblib')  # held-out feature matrix dumped by train_script.py (already SVD-reduced there)
+y_test = joblib.load('y_test.joblib')  # integer-encoded labels for the same held-out rows
+label_encoder = joblib.load('label_encoder.joblib')  # fitted LabelEncoder; its classes_ name the rows of the report
+
+# Run prediction on the held-out set and report the wall-clock time it took
+start = time.time()
+y_pred = mlp.predict(X_test)
+print(f"Prediction time: {time.time()-start:.2f} seconds")
+
+# Evaluate the model: overall accuracy plus a per-class report
+accuracy = accuracy_score(y_test, y_pred)
+report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
+
+# Print the results
+print(f"Accuracy: {accuracy:.4f}")
+print("\nClassification Report:")
+print(report)
diff --git a/dga/scikit-learn_tests/train_script.py b/dga/scikit-learn_tests/train_script.py
new file mode 100644
index 000000000..040c3966a
--- /dev/null
+++ b/dga/scikit-learn_tests/train_script.py
@@ -0,0 +1,50 @@
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.preprocessing import LabelEncoder
+from sklearn.neural_network import MLPClassifier
+from sklearn.metrics import classification_report, accuracy_score
+from sklearn.decomposition import TruncatedSVD
+import pandas as pd
+import time
+import joblib
+
+df = pd.read_csv("../dga_domains_full.csv", header=None, names=["label", "family", "domain"])  # headerless CSV; column names are assigned here
+df = df[["label", "domain"]]  # the "family" column is dropped and not used below
+
+# Encode the string labels as integers and turn each domain into character n-gram counts
+label_encoder = LabelEncoder()
+df["label_encoded"] = label_encoder.fit_transform(df["label"])
+vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 4)) # Character n-grams of length 2 to 4 ('char_wb' analyzer)
+X = vectorizer.fit_transform(df["domain"])
+joblib.dump(label_encoder, "label_encoder.joblib")  # read back by test_script.py for the report's class names
+
+# Dimensionality reduction. NOTE(review): vectorizer and svd are fitted on the full dataset before the split below, and neither is dumped by this script - confirm both are intended
+svd = TruncatedSVD(n_components=100) # Set the number of components as needed; no random_state is passed here, unlike the split and the MLP below
+X_reduced = svd.fit_transform(X)
+
+# Split the dataset into training and test sets (10% held out, shuffled with a fixed seed)
+X_train, X_test, y_train, y_test = train_test_split(X_reduced, df["label_encoded"], test_size=0.1, shuffle=True, random_state=27)
+joblib.dump(X_test, "X_test.joblib")  # held-out features, read back by test_script.py
+joblib.dump(y_test, "y_test.joblib")  # held-out labels, read back by test_script.py
+
+# Initialize and train the classifier (one hidden layer of 100 units, at most 300 iterations)
+mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=27)
+
+start = time.time()
+mlp.fit(X_train, y_train)
+print(f"Tempo di addestramento: {time.time()-start:.2f} secondi")
+
+# Predict on the test set and time the prediction
+start = time.time()
+y_pred = mlp.predict(X_test)
+print(f"Tempo di previsione: {time.time()-start:.2f} secondi")
+
+# Evaluate model performance: overall accuracy plus a per-class report
+accuracy = accuracy_score(y_test, y_pred)
+report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
+
+# Print the results
+print(f"Accuratezza: {accuracy:.4f}")
+print("\nClassification Report:")
+print(report)
+joblib.dump(mlp, 'mlp_model.joblib')
\ No newline at end of file