# malconv/src/utils.py
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
# Import that was missing from utils.py
import tensorflow as tf  # needed by configure_gpu_memory()
def read_binary_file(file_path, max_length=2_000_000):
"""
λ°”μ΄λ„ˆλ¦¬ νŒŒμΌμ„ 읽어 μ •μˆ˜ λ°°μ—΄λ‘œ λ³€ν™˜
λ…Όλ¬Έ 사양: 2MBκΉŒμ§€ 처리
"""
try:
with open(file_path, 'rb') as f:
raw_bytes = f.read()
# λ°”μ΄νŠΈλ₯Ό 0-255 μ •μˆ˜λ‘œ λ³€ν™˜
byte_array = np.frombuffer(raw_bytes, dtype=np.uint8)
if len(byte_array) > max_length:
            # Long file: keep only the first 2 MB (the paper's truncation scheme)
return byte_array[:max_length]
else:
            # Short file: zero-pad up to max_length
padded = np.zeros(max_length, dtype=np.uint8)
padded[:len(byte_array)] = byte_array
return padded
except Exception as e:
print(f"파일 읽기 였λ₯˜ {file_path}: {e}")
return np.zeros(max_length, dtype=np.uint8)
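
# Usage sketch for read_binary_file (hypothetical path; the result is always
# exactly max_length bytes, truncated or zero-padded):
#   arr = read_binary_file("samples/malware/0a1b2c.bin")
#   assert arr.shape == (2_000_000,) and arr.dtype == np.uint8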
def load_dataset_from_directory(malware_dir, benign_dir, max_length=2_000_000, max_samples_per_class=None):
"""
λ””λ ‰ν† λ¦¬μ—μ„œ 직접 λ°”μ΄λ„ˆλ¦¬ νŒŒμΌλ“€μ„ λ‘œλ“œ
Args:
malware_dir: μ•…μ„±μ½”λ“œ νŒŒμΌλ“€μ΄ μžˆλŠ” 디렉토리
benign_dir: 정상 νŒŒμΌλ“€μ΄ μžˆλŠ” 디렉토리
max_length: μ΅œλŒ€ λ°”μ΄νŠΈ 길이
max_samples_per_class: ν΄λž˜μŠ€λ‹Ή μ΅œλŒ€ μƒ˜ν”Œ 수
"""
X, y = [], []
    # Load malware files
if os.path.exists(malware_dir):
malware_files = [f for f in os.listdir(malware_dir) if os.path.isfile(os.path.join(malware_dir, f))]
if max_samples_per_class:
malware_files = malware_files[:max_samples_per_class]
print(f"μ•…μ„±μ½”λ“œ 파일 λ‘œλ”© 쀑... ({len(malware_files)}개)")
for i, filename in enumerate(malware_files):
file_path = os.path.join(malware_dir, filename)
byte_array = read_binary_file(file_path, max_length)
X.append(byte_array)
            y.append(0)  # malware = 0
if (i + 1) % 100 == 0:
print(f" {i + 1}/{len(malware_files)} 처리 μ™„λ£Œ")
    # Load benign files
if os.path.exists(benign_dir):
benign_files = [f for f in os.listdir(benign_dir) if os.path.isfile(os.path.join(benign_dir, f))]
if max_samples_per_class:
benign_files = benign_files[:max_samples_per_class]
print(f"정상 파일 λ‘œλ”© 쀑... ({len(benign_files)}개)")
for i, filename in enumerate(benign_files):
file_path = os.path.join(benign_dir, filename)
byte_array = read_binary_file(file_path, max_length)
X.append(byte_array)
            y.append(1)  # benign = 1
if (i + 1) % 100 == 0:
print(f" {i + 1}/{len(benign_files)} 처리 μ™„λ£Œ")
X = np.array(X)
y = np.array(y)
print(f"\n데이터셋 λ‘œλ”© μ™„λ£Œ:")
print(f" 총 μƒ˜ν”Œ: {len(X)}")
print(f" μ•…μ„±μ½”λ“œ: {np.sum(y == 0)}")
print(f" μ •μƒνŒŒμΌ: {np.sum(y == 1)}")
return X, y
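
# Usage sketch (hypothetical directories; note the memory cost -- 500 samples
# per class at 2 MB each is already ~2 GB of uint8 data):
#   X, y = load_dataset_from_directory("data/malware", "data/benign",
#                                      max_samples_per_class=500)
#   # X.shape == (len(y), 2_000_000), X.dtype == np.uint8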
def load_dataset_from_csv(csv_path, max_length=2_000_000):
"""CSV νŒŒμΌμ—μ„œ 데이터셋 λ‘œλ“œ"""
df = pd.read_csv(csv_path)
X, y = [], []
print("CSVμ—μ„œ 파일 λ‘œλ”© 쀑...")
for idx, row in df.iterrows():
file_path = row['filepath']
label = row['label']
if os.path.exists(file_path):
byte_array = read_binary_file(file_path, max_length)
X.append(byte_array)
y.append(label)
else:
print(f"νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {file_path}")
if (idx + 1) % 1000 == 0:
print(f" {idx + 1} 파일 처리 μ™„λ£Œ")
return np.array(X), np.array(y)
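
# Usage sketch for load_dataset_from_csv. The CSV needs 'filepath' and 'label'
# columns (paths hypothetical; labels follow this file's 0=malware, 1=benign
# convention):
#
#   filepath,label
#   data/malware/0a1b2c.bin,0
#   data/benign/notepad.exe,1
#
#   X, y = load_dataset_from_csv("dataset.csv")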
def configure_gpu_memory():
    """Enable GPU memory growth so TensorFlow does not reserve all VRAM up front."""
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print(f"GPU setup complete: using {len(gpus)} GPU(s)")
            return True
        except RuntimeError as e:
            # Memory growth must be set before the GPUs are initialized
            print(f"GPU setup error: {e}")
    return False
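
# Call this once at startup, before any model is built; memory growth cannot
# be changed after TensorFlow has initialized the GPUs:
#   configure_gpu_memory()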
def plot_training_history(history):
"""ν›ˆλ ¨ νžˆμŠ€ν† λ¦¬ μ‹œκ°ν™”"""
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Loss
axes[0, 0].plot(history.history['loss'], label='Training Loss')
if 'val_loss' in history.history:
axes[0, 0].plot(history.history['val_loss'], label='Validation Loss')
axes[0, 0].set_title('Model Loss')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].legend()
axes[0, 0].grid(True)
# Accuracy
axes[0, 1].plot(history.history['accuracy'], label='Training Accuracy')
if 'val_accuracy' in history.history:
axes[0, 1].plot(history.history['val_accuracy'], label='Validation Accuracy')
axes[0, 1].set_title('Model Accuracy')
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Accuracy')
axes[0, 1].legend()
axes[0, 1].grid(True)
# AUC
if 'auc' in history.history:
axes[1, 0].plot(history.history['auc'], label='Training AUC')
if 'val_auc' in history.history:
axes[1, 0].plot(history.history['val_auc'], label='Validation AUC')
axes[1, 0].set_title('Model AUC')
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('AUC')
axes[1, 0].legend()
axes[1, 0].grid(True)
    # Learning Rate ('lr' in older Keras, 'learning_rate' in Keras 3)
    lr_key = 'lr' if 'lr' in history.history else 'learning_rate'
    if lr_key in history.history:
        axes[1, 1].plot(history.history[lr_key], label='Learning Rate', color='red')
axes[1, 1].set_title('Learning Rate Schedule')
axes[1, 1].set_xlabel('Epoch')
axes[1, 1].set_ylabel('Learning Rate')
axes[1, 1].set_yscale('log')
axes[1, 1].legend()
axes[1, 1].grid(True)
plt.tight_layout()
plt.show()
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
"""ν˜Όλ™ ν–‰λ ¬ μ‹œκ°ν™”"""
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=['Malware', 'Benign'],
yticklabels=['Malware', 'Benign'])
plt.title(title)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
def evaluate_model(model, X_test, y_test, batch_size=16):
"""λͺ¨λΈ μ„±λŠ₯ 평가"""
print("λͺ¨λΈ 평가 쀑...")
# 예츑
y_pred_prob = model.predict(X_test, batch_size=batch_size, verbose=1)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()
# λ©”νŠΈλ¦­ 계산
accuracy = np.mean(y_pred == y_test)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_prob)
print(f"\n=== 평가 κ²°κ³Ό ===")
print(f"Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {balanced_acc:.4f}")
print(f"AUC Score: {auc_score:.4f}")
print(f"\nλΆ„λ₯˜ 리포트:")
print(classification_report(y_test, y_pred, target_names=['Malware', 'Benign']))
# ν˜Όλ™ ν–‰λ ¬ μ‹œκ°ν™”
plot_confusion_matrix(y_test, y_pred, "MalConv Performance")
return {
'accuracy': accuracy,
'balanced_accuracy': balanced_acc,
'auc': auc_score,
'predictions': y_pred_prob
}
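
# Usage sketch ('predictions' carries the raw sigmoid scores, useful for
# threshold tuning beyond the fixed 0.5 cut used above):
#   results = evaluate_model(model, X_test, y_test)
#   print(results['accuracy'], results['auc'])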
def get_file_paths_and_labels(malware_dir, benign_dir, max_samples_per_class=None):
"""
λ””λ ‰ν† λ¦¬μ—μ„œ 파일 κ²½λ‘œμ™€ λ ˆμ΄λΈ” λͺ©λ‘μ„ κ°€μ Έμ˜΅λ‹ˆλ‹€. (λ©”λͺ¨λ¦¬μ— 파일 λ‘œλ“œ μ•ˆν•¨)
"""
filepaths = []
labels = []
    # Malware file paths
if os.path.exists(malware_dir):
malware_files = [os.path.join(malware_dir, f) for f in os.listdir(malware_dir) if os.path.isfile(os.path.join(malware_dir, f))]
if max_samples_per_class:
malware_files = malware_files[:max_samples_per_class]
filepaths.extend(malware_files)
        labels.extend([0] * len(malware_files))  # malware = 0
        print(f"Loaded malware file paths: {len(malware_files)}")
    # Benign file paths
if os.path.exists(benign_dir):
benign_files = [os.path.join(benign_dir, f) for f in os.listdir(benign_dir) if os.path.isfile(os.path.join(benign_dir, f))]
if max_samples_per_class:
benign_files = benign_files[:max_samples_per_class]
filepaths.extend(benign_files)
        labels.extend([1] * len(benign_files))  # benign = 1
        print(f"Loaded benign file paths: {len(benign_files)}")
print(f"\n총 파일 경둜: {len(filepaths)}")
print(f" μ•…μ„±μ½”λ“œ: {labels.count(0)}")
print(f" μ •μƒνŒŒμΌ: {labels.count(1)}")
    # Shuffle the data order
indices = np.arange(len(filepaths))
np.random.shuffle(indices)
filepaths = np.array(filepaths)[indices].tolist()
labels = np.array(labels)[indices]
return filepaths, labels
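
# Usage sketch pairing this with the train_test_split imported above
# (directories hypothetical; stratify preserves the class ratio in each split):
#   paths, labels = get_file_paths_and_labels("data/malware", "data/benign")
#   train_p, val_p, train_y, val_y = train_test_split(
#       paths, labels, test_size=0.2, stratify=labels, random_state=42)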
def data_generator(filepaths, labels, batch_size, max_length=2_000_000, shuffle=True):
"""
데이터λ₯Ό 배치 λ‹¨μœ„λ‘œ μƒμ„±ν•˜λŠ” μ œλ„ˆλ ˆμ΄ν„°
"""
num_samples = len(filepaths)
if num_samples == 0:
return
while True:
indices = np.arange(num_samples)
if shuffle:
np.random.shuffle(indices)
for i in range(0, num_samples, batch_size):
batch_indices = indices[i:i+batch_size]
X_batch = []
y_batch_list = []
for j in batch_indices:
try:
X_batch.append(read_binary_file(filepaths[j], max_length))
y_batch_list.append(labels[j])
except Exception as e:
print(f"Warning: Skipping file {filepaths[j]} due to error: {e}")
continue
if not X_batch:
continue
yield np.array(X_batch), np.array(y_batch_list)
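
# Training sketch: data_generator loops forever, so Keras needs explicit step
# counts. build_malconv() is a hypothetical stand-in for this repo's model
# builder; train_p/train_y and val_p/val_y are splits as in the sketch above:
#   train_gen = data_generator(train_p, train_y, batch_size=16)
#   val_gen = data_generator(val_p, val_y, batch_size=16, shuffle=False)
#   model = build_malconv()  # hypothetical
#   history = model.fit(train_gen,
#                       steps_per_epoch=max(1, len(train_p) // 16),
#                       validation_data=val_gen,
#                       validation_steps=max(1, len(val_p) // 16),
#                       epochs=10)
#   plot_training_history(history)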