From ecd3c734d00671a4fe5ac1713422dae55f2bad2f Mon Sep 17 00:00:00 2001 From: Luca Deri Date: Mon, 28 Oct 2024 12:55:18 +0100 Subject: Rename --- dga/tensoflow_tests/test_script.py | 22 ----------- dga/tensoflow_tests/train_script.py | 72 ------------------------------------ dga/tensorflow_tests/test_script.py | 22 +++++++++++ dga/tensorflow_tests/train_script.py | 72 ++++++++++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 94 deletions(-) delete mode 100644 dga/tensoflow_tests/test_script.py delete mode 100644 dga/tensoflow_tests/train_script.py create mode 100644 dga/tensorflow_tests/test_script.py create mode 100644 dga/tensorflow_tests/train_script.py (limited to 'dga') diff --git a/dga/tensoflow_tests/test_script.py b/dga/tensoflow_tests/test_script.py deleted file mode 100644 index 5c946c8cf..000000000 --- a/dga/tensoflow_tests/test_script.py +++ /dev/null @@ -1,22 +0,0 @@ -import tensorflow as tf -import joblib -import numpy as np -from sklearn.metrics import classification_report, accuracy_score - -# Load the model -model = tf.keras.models.load_model("dga_model.keras") -X_test, y_test = joblib.load("test_data.pkl") -label_encoder = joblib.load("label_encoder.pkl") -tokenizer = joblib.load("tokenizer.pkl") - -# Make predictions on the test set -y_pred = (model.predict(X_test) > 0.5).astype("int32").flatten() - -# Calculate accuracy -accuracy = accuracy_score(y_test, y_pred) -print(f"Accuracy: {accuracy:.4f}") - -# Generate the classification report -report = classification_report(y_test, y_pred, target_names=label_encoder.classes_) -print("\nClassification Report:") -print(report) diff --git a/dga/tensoflow_tests/train_script.py b/dga/tensoflow_tests/train_script.py deleted file mode 100644 index e6962221b..000000000 --- a/dga/tensoflow_tests/train_script.py +++ /dev/null @@ -1,72 +0,0 @@ -import tensorflow as tf -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder -from sklearn.metrics import classification_report, accuracy_score -from tensorflow.keras.preprocessing.text import Tokenizer -from tensorflow.keras.preprocessing.sequence import pad_sequences -from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Embedding, LSTM, Dense -import pandas as pd -import joblib - -# Check if GPU is available -gpus = tf.config.list_physical_devices('GPU') -if gpus: - print(f"Num GPUs Available: {len(gpus)}") - for gpu in gpus: - print(f"GPU: {gpu}") -else: - print("No GPUs available. Using CPU.") - -# Read the file -df = pd.read_csv("../dga_domains_full.csv", header=None, names=["label", "family", "domain"]) -df = df[["label", "domain"]] - -# Transform labels (legit/dga) into numbers -label_encoder = LabelEncoder() -df["label_encoded"] = label_encoder.fit_transform(df["label"]) - -# Pre-process domains -tokenizer = Tokenizer(char_level=True) # Character-level tokenization -tokenizer.fit_on_texts(df["domain"]) -sequences = tokenizer.texts_to_sequences(df["domain"]) -X = pad_sequences(sequences, maxlen=100) # Padding for maximum length -y = df["label_encoded"].values - -# Split the dataset into train, validation, and test sets -X_temp, X_test, y_temp, y_test = train_test_split(X, df["label_encoded"], test_size=0.1, random_state=27) -X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1, random_state=27) # 10% of 90% is 9%, resulting in 81% train, 9% validation, 10% test - -# Model with embedding -model = Sequential() -model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=50, input_length=100)) # Embedding -model.add(LSTM(64)) # Recurrent layer -model.add(Dense(1, activation='sigmoid')) - -# Compile the model -model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) - -import time - -start = time.time() -with tf.device('/GPU:0' if gpus else '/CPU:0'): - model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val)) -print(f"Training time: {time.time() - start}") - -# Make predictions on the test set -y_pred = (model.predict(X_test) > 0.5).astype("int32").flatten() - -# Calculate accuracy -accuracy = accuracy_score(y_test, y_pred) -print(f"Accuracy: {accuracy:.4f}") - -# Generate the classification report -report = classification_report(y_test, y_pred, target_names=label_encoder.classes_) -print("\nClassification Report:") -print(report) - -# Save model, test dataset and tensorflow utilities for future test -model.save("dga_model.keras") -joblib.dump((X_test, y_test), "test_data.pkl") -joblib.dump(label_encoder, "label_encoder.pkl") -joblib.dump(tokenizer, "tokenizer.pkl") diff --git a/dga/tensorflow_tests/test_script.py b/dga/tensorflow_tests/test_script.py new file mode 100644 index 000000000..5c946c8cf --- /dev/null +++ b/dga/tensorflow_tests/test_script.py @@ -0,0 +1,22 @@ +import tensorflow as tf +import joblib +import numpy as np +from sklearn.metrics import classification_report, accuracy_score + +# Load the model +model = tf.keras.models.load_model("dga_model.keras") +X_test, y_test = joblib.load("test_data.pkl") +label_encoder = joblib.load("label_encoder.pkl") +tokenizer = joblib.load("tokenizer.pkl") + +# Make predictions on the test set +y_pred = (model.predict(X_test) > 0.5).astype("int32").flatten() + +# Calculate accuracy +accuracy = accuracy_score(y_test, y_pred) +print(f"Accuracy: {accuracy:.4f}") + +# Generate the classification report +report = classification_report(y_test, y_pred, target_names=label_encoder.classes_) +print("\nClassification Report:") +print(report) diff --git a/dga/tensorflow_tests/train_script.py b/dga/tensorflow_tests/train_script.py new file mode 100644 index 000000000..e6962221b --- /dev/null +++ b/dga/tensorflow_tests/train_script.py @@ -0,0 +1,72 @@ +import tensorflow as tf +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder +from sklearn.metrics import classification_report, accuracy_score +from tensorflow.keras.preprocessing.text import Tokenizer +from tensorflow.keras.preprocessing.sequence import pad_sequences +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Embedding, LSTM, Dense +import pandas as pd +import joblib + +# Check if GPU is available +gpus = tf.config.list_physical_devices('GPU') +if gpus: + print(f"Num GPUs Available: {len(gpus)}") + for gpu in gpus: + print(f"GPU: {gpu}") +else: + print("No GPUs available. Using CPU.") + +# Read the file +df = pd.read_csv("../dga_domains_full.csv", header=None, names=["label", "family", "domain"]) +df = df[["label", "domain"]] + +# Transform labels (legit/dga) into numbers +label_encoder = LabelEncoder() +df["label_encoded"] = label_encoder.fit_transform(df["label"]) + +# Pre-process domains +tokenizer = Tokenizer(char_level=True) # Character-level tokenization +tokenizer.fit_on_texts(df["domain"]) +sequences = tokenizer.texts_to_sequences(df["domain"]) +X = pad_sequences(sequences, maxlen=100) # Padding for maximum length +y = df["label_encoded"].values + +# Split the dataset into train, validation, and test sets +X_temp, X_test, y_temp, y_test = train_test_split(X, df["label_encoded"], test_size=0.1, random_state=27) +X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1, random_state=27) # 10% of 90% is 9%, resulting in 81% train, 9% validation, 10% test + +# Model with embedding +model = Sequential() +model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=50, input_length=100)) # Embedding +model.add(LSTM(64)) # Recurrent layer +model.add(Dense(1, activation='sigmoid')) + +# Compile the model +model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) + +import time + +start = time.time() +with tf.device('/GPU:0' if gpus else '/CPU:0'): + model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val)) +print(f"Training time: {time.time() - start}") + +# Make predictions on the test set +y_pred = (model.predict(X_test) > 0.5).astype("int32").flatten() + +# Calculate accuracy +accuracy = accuracy_score(y_test, y_pred) +print(f"Accuracy: {accuracy:.4f}") + +# Generate the classification report +report = classification_report(y_test, y_pred, target_names=label_encoder.classes_) +print("\nClassification Report:") +print(report) + +# Save model, test dataset and tensorflow utilities for future test +model.save("dga_model.keras") +joblib.dump((X_test, y_test), "test_data.pkl") +joblib.dump(label_encoder, "label_encoder.pkl") +joblib.dump(tokenizer, "tokenizer.pkl") -- cgit v1.2.3