Diffstat (limited to 'dga/tensorflow_tests/train_script.py')
-rw-r--r--  dga/tensorflow_tests/train_script.py  95
1 file changed, 95 insertions, 0 deletions
diff --git a/dga/tensorflow_tests/train_script.py b/dga/tensorflow_tests/train_script.py
new file mode 100644
index 000000000..e6962221b
--- /dev/null
+++ b/dga/tensorflow_tests/train_script.py
@@ -0,0 +1,95 @@
+import time
+
+import tensorflow as tf
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import classification_report, accuracy_score
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Embedding, LSTM, Dense
+import pandas as pd
+import joblib
+
+# Check if GPU is available
+gpus = tf.config.list_physical_devices('GPU')
+if gpus:
+ print(f"Num GPUs Available: {len(gpus)}")
+ for gpu in gpus:
+ print(f"GPU: {gpu}")
+else:
+ print("No GPUs available. Using CPU.")
+
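+# Optional: let TensorFlow allocate GPU memory on demand instead of grabbing
+# it all up front (a common tweak, not required for this script):
+# for gpu in gpus:
+#     tf.config.experimental.set_memory_growth(gpu, True)
+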
+# Read the file
+df = pd.read_csv("../dga_domains_full.csv", header=None, names=["label", "family", "domain"])
+df = df[["label", "domain"]]
+
+# Transform labels (legit/dga) into numbers
+label_encoder = LabelEncoder()
+df["label_encoded"] = label_encoder.fit_transform(df["label"])
+
+# Pre-process domains
+tokenizer = Tokenizer(char_level=True) # Character-level tokenization
+tokenizer.fit_on_texts(df["domain"])
+sequences = tokenizer.texts_to_sequences(df["domain"])
+X = pad_sequences(sequences, maxlen=100)  # Pad/truncate every sequence to length 100
+y = df["label_encoded"].values
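+# Sanity check: character vocabulary size and final input shape
+print(f"Vocabulary size: {len(tokenizer.word_index)}, X shape: {X.shape}")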
+
+# Split the dataset into train, validation, and test sets
+X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.1, random_state=27)
+X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1, random_state=27)  # 10% of the remaining 90% is 9%, giving 81% train / 9% validation / 10% test
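+# Note: with imbalanced labels, stratified splits (stratify=y for the first
+# call, stratify=y_temp for the second) keep class ratios consistent across
+# the three sets; omitted here to preserve the original behaviour.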
+
+# Model with embedding
+model = Sequential()
+model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=100))  # +1 reserves index 0 for padding
+model.add(LSTM(64)) # Recurrent layer
+model.add(Dense(1, activation='sigmoid'))
+
+# Compile the model
+model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
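+# Optional: an EarlyStopping callback on val_loss is a common addition here;
+# omitted to keep the fixed 5-epoch run below.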
+
+start = time.time()
+with tf.device('/GPU:0' if gpus else '/CPU:0'):
+ model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))
+print(f"Training time: {time.time() - start}")
+
+# Make predictions on the test set
+y_pred = (model.predict(X_test) > 0.5).astype("int32").flatten()
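+# 0.5 is the conventional threshold for a sigmoid output; adjust it for a
+# different precision/recall trade-off.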
+
+# Calculate accuracy
+accuracy = accuracy_score(y_test, y_pred)
+print(f"Accuracy: {accuracy:.4f}")
+
+# Generate the classification report
+report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
+print("\nClassification Report:")
+print(report)
+
+# Save model, test dataset and tensorflow utilities for future test
+model.save("dga_model.keras")
+joblib.dump((X_test, y_test), "test_data.pkl")
+joblib.dump(label_encoder, "label_encoder.pkl")
+joblib.dump(tokenizer, "tokenizer.pkl")
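+# For a later evaluation run, the artifacts can be reloaded like this
+# (a sketch, assuming the same working directory):
+# model = tf.keras.models.load_model("dga_model.keras")
+# X_test, y_test = joblib.load("test_data.pkl")
+# label_encoder = joblib.load("label_encoder.pkl")
+# tokenizer = joblib.load("tokenizer.pkl")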