# malconv/src/utils.py
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
# Import that was missing from utils.py
import tensorflow as tf  # needed by configure_gpu_memory()
def read_binary_file(file_path, max_length=2_000_000):
"""
λ°”μ΄λ„ˆλ¦¬ νŒŒμΌμ„ 읽어 μ •μˆ˜ λ°°μ—΄λ‘œ λ³€ν™˜
λ…Όλ¬Έ 사양: 2MBκΉŒμ§€ 처리
"""
try:
with open(file_path, 'rb') as f:
raw_bytes = f.read()
# λ°”μ΄νŠΈλ₯Ό 0-255 μ •μˆ˜λ‘œ λ³€ν™˜
byte_array = np.frombuffer(raw_bytes, dtype=np.uint8)
if len(byte_array) > max_length:
            # Long file: keep only the first 2 MB (the paper's truncation scheme)
return byte_array[:max_length]
else:
            # Short file: zero-pad up to max_length
padded = np.zeros(max_length, dtype=np.uint8)
padded[:len(byte_array)] = byte_array
return padded
except Exception as e:
print(f"파일 읽기 였λ₯˜ {file_path}: {e}")
return np.zeros(max_length, dtype=np.uint8)
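
# Usage sketch for read_binary_file (hypothetical path; the result is always
# exactly max_length bytes, truncated or zero-padded):
#   arr = read_binary_file("samples/malware/0a1b2c.bin")
#   assert arr.shape == (2_000_000,) and arr.dtype == np.uint8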
def load_dataset_from_directory(malware_dir, benign_dir, max_length=2_000_000, max_samples_per_class=None):
"""
λ””λ ‰ν† λ¦¬μ—μ„œ 직접 λ°”μ΄λ„ˆλ¦¬ νŒŒμΌλ“€μ„ λ‘œλ“œ
Args:
malware_dir: μ•…μ„±μ½”λ“œ νŒŒμΌλ“€μ΄ μžˆλŠ” 디렉토리
benign_dir: 정상 νŒŒμΌλ“€μ΄ μžˆλŠ” 디렉토리
max_length: μ΅œλŒ€ λ°”μ΄νŠΈ 길이
max_samples_per_class: ν΄λž˜μŠ€λ‹Ή μ΅œλŒ€ μƒ˜ν”Œ 수
"""
X, y = [], []
    # Load malware files
if os.path.exists(malware_dir):
malware_files = [f for f in os.listdir(malware_dir) if os.path.isfile(os.path.join(malware_dir, f))]
if max_samples_per_class:
malware_files = malware_files[:max_samples_per_class]
print(f"μ•…μ„±μ½”λ“œ 파일 λ‘œλ”© 쀑... ({len(malware_files)}개)")
for i, filename in enumerate(malware_files):
file_path = os.path.join(malware_dir, filename)
byte_array = read_binary_file(file_path, max_length)
X.append(byte_array)
            y.append(0)  # malware = 0
if (i + 1) % 100 == 0:
print(f" {i + 1}/{len(malware_files)} 처리 μ™„λ£Œ")
    # Load benign files
if os.path.exists(benign_dir):
benign_files = [f for f in os.listdir(benign_dir) if os.path.isfile(os.path.join(benign_dir, f))]
if max_samples_per_class:
benign_files = benign_files[:max_samples_per_class]
print(f"정상 파일 λ‘œλ”© 쀑... ({len(benign_files)}개)")
for i, filename in enumerate(benign_files):
file_path = os.path.join(benign_dir, filename)
byte_array = read_binary_file(file_path, max_length)
X.append(byte_array)
            y.append(1)  # benign = 1
if (i + 1) % 100 == 0:
print(f" {i + 1}/{len(benign_files)} 처리 μ™„λ£Œ")
X = np.array(X)
y = np.array(y)
print(f"\n데이터셋 λ‘œλ”© μ™„λ£Œ:")
print(f" 총 μƒ˜ν”Œ: {len(X)}")
print(f" μ•…μ„±μ½”λ“œ: {np.sum(y == 0)}")
print(f" μ •μƒνŒŒμΌ: {np.sum(y == 1)}")
return X, y
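
# Usage sketch (hypothetical directories; note the memory cost -- 500 samples
# per class at 2 MB each is already ~2 GB of uint8 data):
#   X, y = load_dataset_from_directory("data/malware", "data/benign",
#                                      max_samples_per_class=500)
#   # X.shape == (len(y), 2_000_000), X.dtype == np.uint8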
def load_dataset_from_csv(csv_path, max_length=2_000_000):
"""CSV νŒŒμΌμ—μ„œ 데이터셋 λ‘œλ“œ"""
df = pd.read_csv(csv_path)
X, y = [], []
print("CSVμ—μ„œ 파일 λ‘œλ”© 쀑...")
for idx, row in df.iterrows():
file_path = row['filepath']
label = row['label']
if os.path.exists(file_path):
byte_array = read_binary_file(file_path, max_length)
X.append(byte_array)
y.append(label)
else:
print(f"νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {file_path}")
if (idx + 1) % 1000 == 0:
print(f" {idx + 1} 파일 처리 μ™„λ£Œ")
return np.array(X), np.array(y)
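
# Usage sketch for load_dataset_from_csv. The CSV needs 'filepath' and 'label'
# columns (paths hypothetical; labels follow this file's 0=malware, 1=benign
# convention):
#
#   filepath,label
#   data/malware/0a1b2c.bin,0
#   data/benign/notepad.exe,1
#
#   X, y = load_dataset_from_csv("dataset.csv")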
def configure_gpu_memory():
    """Enable GPU memory growth so TensorFlow does not reserve all VRAM up front."""
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print(f"GPU setup complete: using {len(gpus)} GPU(s)")
            return True
        except RuntimeError as e:
            # Memory growth must be set before the GPUs are initialized
            print(f"GPU setup error: {e}")
    return False
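
# Call this once at startup, before any model is built; memory growth cannot
# be changed after TensorFlow has initialized the GPUs:
#   configure_gpu_memory()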
def plot_training_history(history):
"""ν›ˆλ ¨ νžˆμŠ€ν† λ¦¬ μ‹œκ°ν™”"""
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Loss
axes[0, 0].plot(history.history['loss'], label='Training Loss')
if 'val_loss' in history.history:
axes[0, 0].plot(history.history['val_loss'], label='Validation Loss')
axes[0, 0].set_title('Model Loss')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].legend()
axes[0, 0].grid(True)
# Accuracy
axes[0, 1].plot(history.history['accuracy'], label='Training Accuracy')
if 'val_accuracy' in history.history:
axes[0, 1].plot(history.history['val_accuracy'], label='Validation Accuracy')
axes[0, 1].set_title('Model Accuracy')
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Accuracy')
axes[0, 1].legend()
axes[0, 1].grid(True)
# AUC
if 'auc' in history.history:
axes[1, 0].plot(history.history['auc'], label='Training AUC')
if 'val_auc' in history.history:
axes[1, 0].plot(history.history['val_auc'], label='Validation AUC')
axes[1, 0].set_title('Model AUC')
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('AUC')
axes[1, 0].legend()
axes[1, 0].grid(True)
    # Learning Rate ('lr' in older Keras, 'learning_rate' in Keras 3)
    lr_key = 'lr' if 'lr' in history.history else 'learning_rate'
    if lr_key in history.history:
        axes[1, 1].plot(history.history[lr_key], label='Learning Rate', color='red')
axes[1, 1].set_title('Learning Rate Schedule')
axes[1, 1].set_xlabel('Epoch')
axes[1, 1].set_ylabel('Learning Rate')
axes[1, 1].set_yscale('log')
axes[1, 1].legend()
axes[1, 1].grid(True)
plt.tight_layout()
plt.show()
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
"""ν˜Όλ™ ν–‰λ ¬ μ‹œκ°ν™”"""
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=['Malware', 'Benign'],
yticklabels=['Malware', 'Benign'])
plt.title(title)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
def evaluate_model(model, X_test, y_test, batch_size=16):
"""λͺ¨λΈ μ„±λŠ₯ 평가"""
print("λͺ¨λΈ 평가 쀑...")
# 예츑
y_pred_prob = model.predict(X_test, batch_size=batch_size, verbose=1)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()
# λ©”νŠΈλ¦­ 계산
accuracy = np.mean(y_pred == y_test)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_prob)
print(f"\n=== 평가 κ²°κ³Ό ===")
print(f"Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {balanced_acc:.4f}")
print(f"AUC Score: {auc_score:.4f}")
print(f"\nλΆ„λ₯˜ 리포트:")
print(classification_report(y_test, y_pred, target_names=['Malware', 'Benign']))
# ν˜Όλ™ ν–‰λ ¬ μ‹œκ°ν™”
plot_confusion_matrix(y_test, y_pred, "MalConv Performance")
return {
'accuracy': accuracy,
'balanced_accuracy': balanced_acc,
'auc': auc_score,
'predictions': y_pred_prob
}
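
# Usage sketch ('predictions' carries the raw sigmoid scores, useful for
# threshold tuning beyond the fixed 0.5 cut used above):
#   results = evaluate_model(model, X_test, y_test)
#   print(results['accuracy'], results['auc'])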
def get_file_paths_and_labels(malware_dir, benign_dir, max_samples_per_class=None):
"""
λ””λ ‰ν† λ¦¬μ—μ„œ 파일 κ²½λ‘œμ™€ λ ˆμ΄λΈ” λͺ©λ‘μ„ κ°€μ Έμ˜΅λ‹ˆλ‹€. (λ©”λͺ¨λ¦¬μ— 파일 λ‘œλ“œ μ•ˆν•¨)
"""
filepaths = []
labels = []
    # Malware file paths
if os.path.exists(malware_dir):
malware_files = [os.path.join(malware_dir, f) for f in os.listdir(malware_dir) if os.path.isfile(os.path.join(malware_dir, f))]
if max_samples_per_class:
malware_files = malware_files[:max_samples_per_class]
filepaths.extend(malware_files)
        labels.extend([0] * len(malware_files))  # malware = 0
        print(f"Loaded malware file paths: {len(malware_files)}")
    # Benign file paths
if os.path.exists(benign_dir):
benign_files = [os.path.join(benign_dir, f) for f in os.listdir(benign_dir) if os.path.isfile(os.path.join(benign_dir, f))]
if max_samples_per_class:
benign_files = benign_files[:max_samples_per_class]
filepaths.extend(benign_files)
        labels.extend([1] * len(benign_files))  # benign = 1
        print(f"Loaded benign file paths: {len(benign_files)}")
print(f"\n총 파일 경둜: {len(filepaths)}")
print(f" μ•…μ„±μ½”λ“œ: {labels.count(0)}")
print(f" μ •μƒνŒŒμΌ: {labels.count(1)}")
    # Shuffle the data order
indices = np.arange(len(filepaths))
np.random.shuffle(indices)
filepaths = np.array(filepaths)[indices].tolist()
labels = np.array(labels)[indices]
return filepaths, labels
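
# Usage sketch pairing this with the train_test_split imported above
# (directories hypothetical; stratify preserves the class ratio in each split):
#   paths, labels = get_file_paths_and_labels("data/malware", "data/benign")
#   train_p, val_p, train_y, val_y = train_test_split(
#       paths, labels, test_size=0.2, stratify=labels, random_state=42)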
def data_generator(filepaths, labels, batch_size, max_length=2_000_000, shuffle=True):
"""
데이터λ₯Ό 배치 λ‹¨μœ„λ‘œ μƒμ„±ν•˜λŠ” μ œλ„ˆλ ˆμ΄ν„°
"""
num_samples = len(filepaths)
if num_samples == 0:
return
while True:
indices = np.arange(num_samples)
if shuffle:
np.random.shuffle(indices)
for i in range(0, num_samples, batch_size):
batch_indices = indices[i:i+batch_size]
X_batch = []
y_batch_list = []
for j in batch_indices:
try:
X_batch.append(read_binary_file(filepaths[j], max_length))
y_batch_list.append(labels[j])
except Exception as e:
print(f"Warning: Skipping file {filepaths[j]} due to error: {e}")
continue
if not X_batch:
continue
yield np.array(X_batch), np.array(y_batch_list)
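
# Training sketch: data_generator loops forever, so Keras needs explicit step
# counts. build_malconv() is a hypothetical stand-in for this repo's model
# builder; train_p/train_y and val_p/val_y are splits as in the sketch above:
#   train_gen = data_generator(train_p, train_y, batch_size=16)
#   val_gen = data_generator(val_p, val_y, batch_size=16, shuffle=False)
#   model = build_malconv()  # hypothetical
#   history = model.fit(train_gen,
#                       steps_per_epoch=max(1, len(train_p) // 16),
#                       validation_data=val_gen,
#                       validation_steps=max(1, len(val_p) // 16),
#                       epochs=10)
#   plot_training_history(history)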