DGA Domain Detection
langu_xyz

0x01 Domain Generation Algorithm

Domain generation algorithms (DGAs) are algorithms seen in various families of malware that are used to periodically generate a large number of domain names which can serve as rendezvous points with their command-and-control servers.

Example
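As a rough, hypothetical sketch (not modeled on any real malware family), a DGA can be as simple as hashing the current date plus a counter: the malware and its operator can then independently compute the same list of rendezvous domains for any given day. The toy_dga helper below is purely illustrative.

import hashlib
from datetime import date

def toy_dga(seed_date, count=5, tld='.info'):
    # hash the date and a counter, keep part of the hex digest as the domain label
    domains = []
    for i in range(count):
        h = hashlib.md5(f'{seed_date:%Y-%m-%d}-{i}'.encode()).hexdigest()
        domains.append(h[:16] + tld)
    return domains

print(toy_dga(date(2019, 8, 24)))
# Both sides can precompute the same list for a date, which is why blocklisting
# individual domains is ineffective and detecting the generated names matters.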

0x02 Random Forest

random forest = bagging + decision trees: an ensemble of decision trees, each fit on a bootstrap sample of the training data, with predictions combined by majority vote.
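A minimal sketch of that equation with scikit-learn: bagging plain decision trees already gives most of the behavior of a random forest, which additionally samples a random subset of features at every split. The dataset (X_demo, y_demo) and parameters here are illustrative only.

from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=1000, n_features=10, random_state=0)

# "bagging + decision trees": bootstrap-sample the rows, fit one tree per sample, vote
bagged_trees = BaggingClassifier(DecisionTreeClassifier(), n_estimators=20, random_state=0)
# a random forest adds per-split feature subsampling on top of bagging
forest = RandomForestClassifier(n_estimators=20, random_state=0)

print(cross_val_score(bagged_trees, X_demo, y_demo).mean())
print(cross_val_score(forest, X_demo, y_demo).mean())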

0x03 Code

  • Random Forest

  • MultinomialNB

  • MLP

  • LSTM

import os, sys
import traceback
import json
import optparse
import pickle
import collections
import sklearn
import sklearn.feature_extraction
import sklearn.ensemble
import sklearn.metrics
import pandas as pd
import numpy as np
import tldextract
import math
import operator
from sklearn.model_selection import train_test_split
from matplotlib import pylab
from pylab import *

Collect the Data

alexa_dataframe = pd.read_csv('data/alexa_100k.csv', names=['rank','uri'], header=None, encoding='utf-8')
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
rank    100000 non-null int64
uri     100000 non-null object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB

rank uri
0 1 facebook.com
1 2 google.com
2 3 youtube.com
3 4 yahoo.com
4 5 baidu.com
dga_dataframe = pd.read_csv('data/dga_domains.txt', names=['raw_domain'], header=None, encoding='utf-8')
dga_dataframe.info()
dga_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2669 entries, 0 to 2668
Data columns (total 1 columns):
raw_domain    2669 non-null object
dtypes: object(1)
memory usage: 20.9+ KB

raw_domain
0 04055051be412eea5a61b7da8438be3d.info
1 1cb8a5f36f.info
2 30acd347397c34fc273e996b22951002.org
3 336c986a284e2b3bc0f69f949cb437cb.info
4 336c986a284e2b3bc0f69f949cb437cb.org
word_dataframe = pd.read_csv('data/words.txt', names=['word'], header=None, dtype={'word': np.str}, encoding='utf-8')
word_dataframe.info()
word_dataframe.head(10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479623 entries, 0 to 479622
Data columns (total 1 columns):
word    479619 non-null object
dtypes: object(1)
memory usage: 3.7+ MB

word
0 1080
1 10-point
2 10th
3 11-point
4 12-point
5 16-point
6 18-point
7 1st
8 2
9 20-point

Prepare the Data

def domain_extract(uri):
    ext = tldextract.extract(uri)
    if not ext.suffix:
        return None
    else:
        return ext.domain

alexa_dataframe['domain'] = [domain_extract(uri) for uri in alexa_dataframe['uri']]
del alexa_dataframe['rank']
del alexa_dataframe['uri']
alexa_dataframe = alexa_dataframe.dropna()
alexa_dataframe = alexa_dataframe.drop_duplicates()
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 91377 entries, 0 to 99999
Data columns (total 1 columns):
domain    91377 non-null object
dtypes: object(1)
memory usage: 1.4+ MB

domain
0 facebook
1 google
2 youtube
3 yahoo
4 baidu
alexa_dataframe['class'] = 'legit'
# label the benign (Alexa) domains as legit
alexa_dataframe.head()

domain class
0 facebook legit
1 google legit
2 youtube legit
3 yahoo legit
4 baidu legit
# Shuffle the data (important for training/testing)
alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
# shuffle the row order and re-index
# np.random.permutation randomly permutes a sequence, or returns a permuted range
alexa_total = alexa_dataframe.shape[0]
print('Total Alexa domains %d' % alexa_total)
Total Alexa domains 91377
dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
#This method applies a function that accepts and returns a scalar to every element of a DataFrame.
del dga_dataframe['raw_domain']
dga_dataframe = dga_dataframe.dropna()
dga_dataframe = dga_dataframe.drop_duplicates()
dga_total = dga_dataframe.shape[0]
print('Total DGA domains %d' % dga_total)
Total DGA domains 2664
dga_dataframe['class'] = 'dga'
dga_dataframe.head()

domain class
0 04055051be412eea5a61b7da8438be3d dga
1 1cb8a5f36f dga
2 30acd347397c34fc273e996b22951002 dga
3 336c986a284e2b3bc0f69f949cb437cb dga
5 40a43e61e56a5c218cf6c22aca27f7ee dga
def entropy(s):
    '''
    Shannon entropy (bits per character) of a string
    '''
    p, lns = collections.Counter(s), float(len(s))
    return -sum(count/lns * math.log(count/lns, 2) for count in p.values())
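Two quick checks of the helper for intuition; the first value matches the facebook row in the table further down:

entropy('facebook')   # 2.75 bits/char: 8 characters, 'o' appears twice, the rest once
entropy('aaaaaaaa')   # 0.0: a single repeated character carries no information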
all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)
# concatenate the two frames along the row axis
# ignore_index=True because the original indexes carry no meaning
all_domains['length'] = [len(x) for x in all_domains['domain']]
all_domains = all_domains[all_domains['length'] > 6]
# drop short domains to reduce noise
all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]
all_domains.head(10)

domain class length entropy
0 facebook legit 8 2.750000
2 youtube legit 7 2.521641
5 wikipedia legit 9 2.641604
10 blogspot legit 8 2.750000
11 twitter legit 7 2.128085
12 linkedin legit 8 2.500000
19 wordpress legit 9 2.725481
23 microsoft legit 9 2.947703
27 xvideos legit 7 2.807355
28 googleusercontent legit 17 3.175123

Analyze the Data

# box plots of length and entropy by class
all_domains.boxplot('length','class')
pylab.ylabel('Domain Length')
all_domains.boxplot('entropy','class')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')

[Figure: box plots of domain length and domain entropy by class]

cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['length'], alexa['entropy'], s=140, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['length'], dga['entropy'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
# place the legend
pylab.xlabel('Domain Length')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')

[Figure: scatter plot of domain length vs. entropy, Alexa vs. DGA]

all_domains.tail(10)

domain class length entropy
94031 xcfwwghb dga 8 2.750000
94032 xcgqdfyrkgihlrmfmfib dga 20 3.684184
94033 xclqwzcfcx dga 10 2.646439
94034 xcpfxzuf dga 8 2.500000
94035 xcvxhxze dga 8 2.405639
94036 xdbrbsbm dga 8 2.405639
94037 xdfjryydcfwvkvui dga 16 3.500000
94038 xdjlvcgw dga 8 3.000000
94039 xdrmjeu dga 7 2.807355
94040 xflrjyyjswoatsoq dga 16 3.500000
legit = all_domains[(all_domains['class']=='legit')]
# alexa_grams / word_grams are the ngram match scores computed further below
max_grams = np.maximum(legit['alexa_grams'],legit['word_grams'])
ax = max_grams.hist(bins=80)
ax.figure.suptitle('Histogram of the Max NGram Score for Domains')
pylab.xlabel('Number of Domains')
pylab.ylabel('Maximum NGram Score')
Text(0,0.5,'Maximum NGram Score')

[Figure: histogram of the max ngram score for legit domains]

word_dataframe = word_dataframe[word_dataframe['word'].map(lambda x: str(x).isalpha())]
word_dataframe = word_dataframe.applymap(lambda x: str(x).strip().lower())
word_dataframe = word_dataframe.dropna()
word_dataframe = word_dataframe.drop_duplicates()
word_dataframe.head(10)

word
37 a
48 aa
51 aaa
53 aaaa
54 aaaaaa
55 aaal
56 aaas
57 aaberg
58 aachen
59 aae
alexa_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-4, max_df=1.0)
# bag-of-ngrams model over characters
# ngram_range: extract character ngrams of length 3 to 5
# ngrams with document frequency below min_df or above max_df are dropped
counts_matrix = alexa_vc.fit_transform(alexa_dataframe['domain'])
# fit_transform builds the vocabulary and counts each ngram's occurrences
alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())
# log10-scale the summed counts to damp the influence of very common ngrams
print(alexa_counts[:10])
ngrams_list = alexa_vc.get_feature_names()
# the ngram vocabulary learned from the Alexa domains
print(ngrams_list[:10])

_sorted_ngrams = sorted(zip(ngrams_list, alexa_counts), key=operator.itemgetter(1), reverse=True)
# zip() pairs each ngram with its score; sorted() returns a new list ordered by that score
print('Alexa NGrams: %d' % len(_sorted_ngrams))
for ngram, count in _sorted_ngrams[:10]:
    print(ngram, count)
[1.         1.         1.17609126 1.64345268 1.11394335 1.14612804
 1.         1.17609126 1.07918125 1.54406804]
['-20', '-a-', '-ac', '-ad', '-ads', '-af', '-ag', '-ai', '-air', '-al']
Alexa NGrams: 23613
ing 3.443888546777372
lin 3.4271614029259654
ine 3.399673721481038
tor 3.26528962586083
ter 3.2631624649622166
ion 3.2467447097238415
ent 3.228913405994688
por 3.2013971243204513
the 3.2005769267548483
ree 3.16345955176999
# build the same character-ngram features over the dictionary words
dict_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-5, max_df=1.0)
counts_matrix = dict_vc.fit_transform(word_dataframe['word'])
dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())
ngrams_list = dict_vc.get_feature_names()
print(ngrams_list[:10])
['aaa', 'aab', 'aac', 'aad', 'aaf', 'aag', 'aah', 'aai', 'aak', 'aal']
_sorted_ngrams = sorted(zip(ngrams_list, dict_counts), key=operator.itemgetter(1), reverse=True)
print('Word NGrams: %d' % len(_sorted_ngrams))
for ngram, count in _sorted_ngrams[:10]:
    print(ngram, count)
Word NGrams: 123061
ing 4.387300822448285
ess 4.204879333760662
ati 4.1933472563864616
ion 4.165036479994566
ter 4.162415036106447
nes 4.112504458767161
tio 4.076822423342773
ate 4.0723602039634885
ent 4.069631102620343
tion 4.0496056125949735
def ngram_count(domain):
    '''
    NGram match score of a domain: the dot product of the domain's ngram counts
    with the log10 ngram frequencies learned above
    '''
    alexa_match = alexa_counts * alexa_vc.transform([domain]).T
    dict_match = dict_counts * dict_vc.transform([domain]).T
    print('%s Alexa match:%d Dict match: %d' % (domain, alexa_match, dict_match))
ngram_count('google')
ngram_count('facebook')
ngram_count('1cb8a5f36f')
ngram_count('pterodactylfarts')
google Alexa match:17 Dict match: 14
facebook Alexa match:31 Dict match: 27
1cb8a5f36f Alexa match:0 Dict match: 0
pterodactylfarts Alexa match:35 Dict match: 76
#Compute NGram matches for all the domains and add to our dataframe
all_domains['alexa_grams']= alexa_counts * alexa_vc.transform(all_domains['domain']).T
all_domains['word_grams']= dict_counts * dict_vc.transform(all_domains['domain']).T
all_domains.head(10)

domain class length entropy alexa_grams word_grams
0 facebook legit 8 2.750000 31.302278 27.872426
2 youtube legit 7 2.521641 25.855170 18.287142
5 wikipedia legit 9 2.641604 24.571024 29.175635
10 blogspot legit 8 2.750000 24.435141 19.274501
11 twitter legit 7 2.128085 23.244500 31.130820
12 linkedin legit 8 2.500000 24.774916 32.904408
19 wordpress legit 9 2.725481 38.369509 33.806635
23 microsoft legit 9 2.947703 32.133033 39.530125
27 xvideos legit 7 2.807355 28.906360 18.846834
28 googleusercontent legit 17 3.175123 67.315750 86.104683
#Use the vectorized operations of the dataframe to investigate differences
all_domains['diff'] = all_domains['alexa_grams'] - all_domains['word_grams']
all_domains.sort_values(['diff'], ascending=True).head(10)

domain class length entropy alexa_grams word_grams diff
79366 bipolardisorderdepressionanxiety legit 32 3.616729 117.312465 190.833856 -73.521391
72512 channel4embarrassingillnesses legit 29 3.440070 95.786979 169.119440 -73.332460
10961 stirringtroubleinternationally legit 30 3.481728 134.049367 207.204729 -73.155362
85031 americansforresponsiblesolutions legit 32 3.667838 148.143049 218.363956 -70.220908
20459 pragmatismopolitico legit 19 3.326360 61.244630 121.536223 -60.291593
13702 egaliteetreconciliation legit 23 3.186393 91.938518 152.125325 -60.186808
4706 interoperabilitybridges legit 23 3.588354 95.037285 153.626312 -58.589028
85161 foreclosurephilippines legit 22 3.447402 74.506548 132.514638 -58.008090
45636 annamalicesissyselfhypnosis legit 27 3.429908 68.680068 126.667692 -57.987623
70351 corazonindomablecapitulos legit 25 3.813661 75.535473 133.160690 -57.625217
all_domains.sort_values(['diff'], ascending=False).head(10)

domain class length entropy alexa_grams word_grams diff
54228 gay-sex-pics-porn-pictures-gay-sex-porn-gay-se... legit 56 3.661056 159.642301 85.124184 74.518116
85091 article-directory-free-submission-free-content legit 46 3.786816 235.233896 188.230453 47.003443
16893 stream-free-movies-online legit 25 3.509275 120.250616 74.496915 45.753701
63380 watch-free-movie-online legit 23 3.708132 103.029245 58.943451 44.085794
44253 best-online-shopping-site legit 25 3.452879 123.377240 79.596640 43.780601
22524 social-bookmarking-sites-list legit 29 3.702472 145.755266 102.261826 43.493440
66335 free-online-directory legit 21 3.403989 123.379738 80.735030 42.644708
46553 free-links-articles-directory legit 29 3.702472 153.239055 110.955361 42.283694
59873 online-web-directory legit 20 3.584184 116.310717 74.082948 42.227769
58016 web-directory-online legit 20 3.584184 114.402671 74.082948 40.319723
# legit domains with suspiciously low ngram match scores
weird_cond = (all_domains['class']=='legit') & (all_domains['word_grams']<3) & (all_domains['alexa_grams']<2)
weird = all_domains[weird_cond]
print(weird.shape[0])
weird.head(10)
91

domain class length entropy alexa_grams word_grams diff
1246 twcczhu legit 7 2.521641 1.748188 0.0 1.748188
2009 ggmm777 legit 7 1.556657 1.518514 0.0 1.518514
2760 qq66699 legit 7 1.556657 1.342423 0.0 1.342423
17347 crx7601 legit 7 2.807355 0.000000 0.0 0.000000
18682 hzsxzhyy legit 8 2.250000 0.000000 0.0 0.000000
19418 02022222222 legit 11 0.684038 1.041393 0.0 1.041393
19887 3181302 legit 7 2.235926 0.000000 0.0 0.000000
21172 hljdns4 legit 7 2.807355 1.755875 0.0 1.755875
26441 05tz2e9 legit 7 2.807355 0.000000 0.0 0.000000
26557 fzysqmy legit 7 2.521641 1.176091 0.0 1.176091
# re-label these legit but low ngram-score domains as 'weird'
all_domains.loc[weird_cond, 'class'] = 'weird'
all_domains['class'].value_counts()
legit    67221
dga       2664
weird       91
Name: class, dtype: int64
all_domains[all_domains['class'] == 'weird'].head()

domain class length entropy alexa_grams word_grams diff
1246 twcczhu weird 7 2.521641 1.748188 0.0 1.748188
2009 ggmm777 weird 7 1.556657 1.518514 0.0 1.518514
2760 qq66699 weird 7 1.556657 1.342423 0.0 1.342423
17347 crx7601 weird 7 2.807355 0.000000 0.0 0.000000
18682 hzsxzhyy weird 8 2.250000 0.000000 0.0 0.000000
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['word_grams'], alexa['entropy'], s=140, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['word_grams'], dga['entropy'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
# place the legend
pylab.xlabel('Domain word_grams')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')

[Figure: scatter plot of word_grams vs. entropy, Alexa vs. DGA]

Train the Algorithm

not_weird = all_domains[all_domains['class'] != 'weird']
X = not_weird.as_matrix(['length', 'entropy', 'alexa_grams', 'word_grams'])
# convert the selected feature columns to a NumPy array
y = np.array(not_weird['class'].tolist())
# convert the class column to an array of labels
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)
# a random forest classifier with 20 trees
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# random train/test split, holding out 20% of the samples for testing
clf.fit(X_train, y_train)
# fit the classifier on the training data
y_pred = clf.predict(X_test)
# predict on the held-out test data with the trained classifier
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  
def show_cm(cm, labels):
    # convert counts to per-class percentages
    percent = (cm*100.0)/np.array(np.matrix(cm.sum(axis=1)).T)
    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum()))
labels = ['legit', 'dga']
cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels)
# the confusion matrix summarizes per-class accuracy on the test set
show_cm(cm, labels)
Confusion Matrix Stats
legit/legit: 99.57% (13369/13427)
legit/dga: 0.43% (58/13427)
dga/legit: 15.45% (85/550)
dga/dga: 84.55% (465/550)
importances = zip(['length', 'entropy', 'alexa_grams', 'word_grams'], clf.feature_importances_)
# inspect the relative importance of each feature
list(importances)
[('length', 0.16033779891739047),
 ('entropy', 0.12175502861193326),
 ('alexa_grams', 0.5087685303664589),
 ('word_grams', 0.20913864210421748)]
clf.fit(X, y)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Test the Algorithm

def test_it(domain):
    _alexa_match = alexa_counts * alexa_vc.transform([domain]).T
    _dict_match = dict_counts * dict_vc.transform([domain]).T
    _X = [[len(domain), entropy(domain), _alexa_match, _dict_match]]
    print('%s : %s' % (domain, clf.predict(_X)[0]))
test_it('google')
test_it('google8sdflkajssjgjksdh')
test_it('faceboosadfadfafdk')
test_it('1cb8a5f36f')
test_it('pterodactyladfasdfasdffarts')
test_it('ptes9dro-dwacty2lfa5rrts')
test_it('beyonce')
test_it('bey666on4ce')
test_it('supersexy')
test_it('yourmomissohotinthesummertime')
google : legit
google8sdflkajssjgjksdh : dga
faceboosadfadfafdk : legit
1cb8a5f36f : dga
pterodactyladfasdfasdffarts : legit
ptes9dro-dwacty2lfa5rrts : dga
beyonce : legit
bey666on4ce : dga
supersexy : legit
yourmomissohotinthesummertime : legit

Use the Algorithm

def save_model_to_disk(name, model, model_dir='models'):
    serialized_model = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL)
    model_path = os.path.join(model_dir, name+'.model')
    print('Storing Serialized Model to Disk (%s:%.2fMeg)' % (name, len(serialized_model)/1024.0/1024.0))
    open(model_path,'wb').write(serialized_model)
save_model_to_disk('dga_model_random_forest', clf)
save_model_to_disk('dga_model_alexa_vectorizor', alexa_vc)
save_model_to_disk('dga_model_alexa_counts', alexa_counts)
save_model_to_disk('dga_model_dict_vectorizor', dict_vc)
save_model_to_disk('dga_model_dict_counts', dict_counts)
Storing Serialized Model to Disk (dga_model_random_forest:1.80Meg)
Storing Serialized Model to Disk (dga_model_alexa_vectorizor:2.93Meg)
Storing Serialized Model to Disk (dga_model_alexa_counts:0.18Meg)
Storing Serialized Model to Disk (dga_model_dict_vectorizor:5.39Meg)
Storing Serialized Model to Disk (dga_model_dict_counts:0.94Meg)
def load_model_from_disk(name, model_dir='models'):
    model_path = os.path.join(model_dir, name+'.model')
    try:
        model = pickle.loads(open(model_path,'rb').read())
        print('success')
    except:
        print('Could not load model: %s from directory %s!' % (name, model_path))
        return None
    return model
clf = load_model_from_disk('dga_model_random_forest')
alexa_vc = load_model_from_disk('dga_model_alexa_vectorizor')
alexa_counts = load_model_from_disk('dga_model_alexa_counts')
dict_vc = load_model_from_disk('dga_model_dict_vectorizor')
dict_counts = load_model_from_disk('dga_model_dict_counts')
model = {'clf':clf, 'alexa_vc':alexa_vc, 'alexa_counts':alexa_counts,
'dict_vc':dict_vc, 'dict_counts':dict_counts}
success
success
success
success
success
def evaluate_url(model, url):
    domain = domain_extract(url)
    alexa_match = model['alexa_counts'] * model['alexa_vc'].transform([url]).T
    dict_match = model['dict_counts'] * model['dict_vc'].transform([url]).T

    X = [[len(domain), entropy(domain), alexa_match, dict_match]]
    y_pred = model['clf'].predict(X)[0]

    print('%s : %s' % (domain, y_pred))
evaluate_url(model, 'adfhalksfhjashfk.com')
adfhalksfhjashfk : dga

from sklearn.naive_bayes import MultinomialNB
mtnb = MultinomialNB()
mtnb.fit(X_train,y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
from sklearn.metrics import classification_report
nb_y_pred=mtnb.predict(X_test)
print(classification_report(y_test, nb_y_pred))
cm = sklearn.metrics.confusion_matrix(y_test, nb_y_pred)
# note: without labels=, confusion_matrix orders the classes alphabetically ('dga', 'legit'),
# so the rows printed as legit/* below actually correspond to the dga class and vice versa
show_cm(cm, labels)
             precision    recall  f1-score   support

        dga       0.71      0.87      0.78       550
      legit       0.99      0.99      0.99     13427

avg / total       0.98      0.98      0.98     13977

Confusion Matrix Stats
legit/legit: 86.73% (477/550)
legit/dga: 13.27% (73/550)
dga/legit: 1.44% (194/13427)
dga/dga: 98.56% (13233/13427)
import os
import random
import tldextract
import sklearn
import pandas as pd
import numpy as np

from keras.models import Sequential, load_model
from keras.preprocessing import sequence
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn import feature_extraction
from sklearn.model_selection import train_test_split
from datetime import datetime
from zipfile import ZipFile
alexa_dataframe = pd.read_csv('data/top-1m.csv', names=['rank','uri'], header=None, encoding='utf-8')
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
rank    1000000 non-null int64
uri     1000000 non-null object
dtypes: int64(1), object(1)
memory usage: 15.3+ MB

rank uri
0 1 google.com
1 2 youtube.com
2 3 facebook.com
3 4 baidu.com
4 5 wikipedia.org
def load_data_set(filename):
    # keep the second tab-separated field (the domain) of each line
    fw = open('data/dga_domain.txt', 'w+')
    with open(filename, "r") as f:
        for line in f.readlines():
            lineArr = line.strip().split('\t')
            fw.write(lineArr[1] + '\n')
    fw.close()
load_data_set('data/dga.txt')
dga_dataframe = pd.read_csv('data/dga_domain.txt', names=['raw_domain'], header=None, encoding='utf-8')
dga_dataframe.info()
dga_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1158695 entries, 0 to 1158694
Data columns (total 1 columns):
raw_domain    1158695 non-null object
dtypes: object(1)
memory usage: 8.8+ MB

raw_domain
0 ogxbnjopz.biz
1 zyejwiist.net
2 buuqogz.com
3 vpjmomduqll.org
4 uakwifutnpn.biz
def domain_extract(uri):
    ext = tldextract.extract(uri)
    if not ext.suffix:
        return None
    else:
        return ext.domain

alexa_dataframe['domain'] = [domain_extract(uri) for uri in alexa_dataframe['uri']]
del alexa_dataframe['rank']
del alexa_dataframe['uri']
alexa_dataframe = alexa_dataframe.dropna()
alexa_dataframe = alexa_dataframe.drop_duplicates()
alexa_dataframe['length'] = [len(x) for x in alexa_dataframe['domain']]
alexa_dataframe = alexa_dataframe[alexa_dataframe['length'] > 6]
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 718018 entries, 1 to 999999
Data columns (total 2 columns):
domain    718018 non-null object
length    718018 non-null int64
dtypes: int64(1), object(1)
memory usage: 16.4+ MB

domain length
1 youtube 7
2 facebook 8
4 wikipedia 9
11 instagram 9
13 twitter 7
alexa_dataframe['class'] = 'legit'
# label the benign (Alexa) domains as legit
alexa_dataframe.head()

domain length class
1 youtube 7 legit
2 facebook 8 legit
4 wikipedia 9 legit
11 instagram 9 legit
13 twitter 7 legit
# Shuffle the data (important for training/testing)
alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
# shuffle the row order and re-index
# np.random.permutation randomly permutes a sequence, or returns a permuted range
alexa_total = alexa_dataframe.shape[0]
print('Total Alexa domains %d' % alexa_total)
Total Alexa domains 718018
dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
#This method applies a function that accepts and returns a scalar to every element of a DataFrame.
del dga_dataframe['raw_domain']
dga_dataframe = dga_dataframe.dropna()
dga_dataframe = dga_dataframe.drop_duplicates()
dga_dataframe['length'] = [len(x) for x in dga_dataframe['domain']]
dga_dataframe = dga_dataframe[dga_dataframe['length'] > 6]
dga_total = dga_dataframe.shape[0]
print('Total DGA domains %d' % dga_total)
Total DGA domains 1082010
dga_dataframe['class'] = 'dga'
dga_dataframe.head()

domain length class
0 ogxbnjopz 9 dga
1 zyejwiist 9 dga
2 buuqogz 7 dga
3 vpjmomduqll 11 dga
4 uakwifutnpn 11 dga
all_domains = pd.concat([alexa_dataframe[:5000], dga_dataframe[:5000]], ignore_index=True)
# take the first 5000 domains from each class to build a balanced sample
all_domains.head(10)

domain length class
0 youtube 7 legit
1 facebook 8 legit
2 wikipedia 9 legit
3 instagram 9 legit
4 twitter 7 legit
5 blogspot 8 legit
6 netflix 7 legit
7 pornhub 7 legit
8 xvideos 7 legit
9 livejasmin 10 legit
all_domains.tail(10)

domain length class
9990 mxepwpxki 9 dga
9991 xnvqgaddhivrqowtbs 18 dga
9992 btgjyoydcwoeigdldngr 20 dga
9993 mnnridfyhxkyk 13 dga
9994 jmcctiodbdemfejo 16 dga
9995 mepoiwtmeffy 12 dga
9996 iwpikrmppfqeere 15 dga
9997 gcibdmrs 8 dga
9998 tusdspujigdyntbxusuah 21 dga
9999 wvsiuqhblxfijnoefjnao 21 dga
X = all_domains['domain']
labels = all_domains['class']
ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
count_vec = ngram_vectorizer.fit_transform(X)
max_features = count_vec.shape[1]
y = [0 if x == 'legit' else 1 for x in labels]
final_data = []
Multilayer Perceptron (MLP)
def build_model(max_features):
    model = Sequential()
    model.add(Dense(1, input_dim=max_features, activation='sigmoid'))
    # a single fully connected unit with sigmoid activation (input dimension max_features, output dimension 1)
    model.compile(loss='binary_crossentropy',optimizer='adam')
    # compile the model with binary cross-entropy loss and the adam optimizer
    return model
max_epoch = 50
nfolds = 10
# 10 cross-validation folds
batch_size = 128
for fold in range(nfolds):
    print("fold %u/%u" % (fold+1, nfolds))
    X_train, X_test, y_train, y_test, _, label_test = train_test_split(count_vec, y, labels, test_size=0.2)

    print('Build model...')
    model = build_model(max_features)

    print("Train...")
    X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
    best_iter = -1
    best_auc = 0.0
    out_data = {}

    for ep in range(max_epoch):
        model.fit(X_train.todense(), y_train, batch_size=batch_size, nb_epoch=1)
        t_probs = model.predict_proba(X_holdout.todense())
        t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)
        # AUC on the holdout set decides whether to keep training

        print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))
        if t_auc > best_auc:
            best_auc = t_auc
            best_iter = ep

            probs = model.predict_proba(X_test.todense())
            out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                        'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}
            print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
        else:
            if (ep-best_iter) > 5:
                break

    final_data.append(out_data)
    model.save('model.h5')
fold 1/10
Build model...
Train...


/usr/lib/python3/dist-packages/ipykernel_launcher.py:15: UserWarning: The `nb_epoch` argument in `fit` has been renamed `epochs`.
  from ipykernel import kernelapp as app


Epoch 1/1
7600/7600 [==============================] - 1s 86us/step - loss: 0.6297
Epoch 0: auc = 0.950239 (best=0.000000)
[[915  86]
 [108 891]]
Epoch 1/1
7600/7600 [==============================] - 0s 26us/step - loss: 0.5243
Epoch 1: auc = 0.980196 (best=0.950239)
[[952  49]
 [ 83 916]]
Epoch 1/1
7600/7600 [==============================] - 0s 31us/step - loss: 0.4502
Epoch 2: auc = 0.984872 (best=0.980196)
[[965  36]
 [ 78 921]]
Epoch 1/1
7600/7600 

Epoch 32: auc = 0.994192 (best=0.994192)
model = load_model('model.h5')
print(final_data)
[{'y': [0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 
1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1], 'labels': 2403    legit
2789    legit
450     legit
4521    legit
2841    legit
8645      dga
6999      dga
7831      dga
6291      dga
3746    legit
6226      dga
4111    legit
8487      dga
678     legit
90      legit
6151      dga
8300      dga
4004    legit
2489    legit
4836    legit
8291      dga
8198      dga
8911      dga
7585      dga
260     legit
5905      dga
5646      dga
970     legit
8718      dga
275     legit
        ...  
8589      dga
6620      dga
7470      dga
5230      dga
4827    legit
5677      dga
3417    legit
8539      dga
7147      dga
3699    legit
4751    legit
3043    legit
5475      dga
3736    legit
3887    legit
6349      dga
4996    legit
7379      dga
3530    legit
1942    legit
7914      dga
9752      dga
6717      dga
5363      dga
7622      dga
961     legit
1641    legit
4607    legit
8649      dga
6087      dga
Name: class, Length: 2000, dtype: object, 'probs': array([[0.14488636],
       [0.00496732],
       [0.00896166],
       ...,
       [0.00593334],
       [0.95598286],
       [0.9867235 ]], dtype=float32), 'epochs': 43, 'confusion_matrix': array([[972,  29],
       [ 62, 937]])}
z_test = np.array([[0, 0, 0, 0, 0,  0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 
0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1]])
model.predict(z_test)
array([[1.]], dtype=float32)
print(sklearn.metrics.classification_report(final_data[0]['y'], final_data[0]['probs'] > .5))
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       970
           1       0.97      0.95      0.96      1030

   micro avg       0.96      0.96      0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000
LSTM
def build_model_lstm(max_features, maxlen):
    """Build LSTM model"""
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=maxlen))
    # embedding layer: maps character indexes to dense fixed-size (128-dimensional) vectors
    model.add(LSTM(128))
    # LSTM layer learns sequential features from the character sequence; this is the core layer
    model.add(Dropout(0.5))
    # dropout layer to reduce overfitting
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    # compile the model with binary cross-entropy loss and the rmsprop optimizer

    return model
X = all_domains['domain']
labels = all_domains['class']

valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X)))}
max_features = len(valid_chars) + 1
# number of distinct characters (plus one for the padding index)
maxlen = np.max([len(x) for x in X])
# length of the longest domain
X = [[valid_chars[y] for y in x] for x in X]
# convert each domain to a sequence of character indexes
X = sequence.pad_sequences(X, maxlen=maxlen)
# zero-pad every sequence to the same length
y = [0 if x == 'legit' else 1 for x in labels]
final_data = []
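To make the encoding concrete, here is a small illustrative sketch for a single domain. demo_chars is a stand-in for the valid_chars mapping built above, so the real indexes in the pipeline will differ:

demo_chars = {c: i + 1 for i, c in enumerate('abcdefghijklmnopqrstuvwxyz0123456789-')}
demo = [demo_chars[c] for c in 'google']
print(demo)                                        # [7, 15, 15, 7, 12, 5]
print(sequence.pad_sequences([demo], maxlen=20))   # left-padded with zeros to a fixed length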
for fold in range(nfolds):
    print("fold %u/%u" % (fold+1, nfolds))
    X_train, X_test, y_train, y_test, _, label_test = train_test_split(X, y, labels, test_size=0.2)

    print('Build model...')
    model = build_model_lstm(max_features, maxlen)

    print("Train...")
    X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
    best_iter = -1
    best_auc = 0.0
    out_data = {}

    for ep in range(max_epoch):
        model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1)

        t_probs = model.predict_proba(X_holdout)
        t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

        print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

        if t_auc > best_auc:
            best_auc = t_auc
            best_iter = ep

            probs = model.predict_proba(X_test)

            out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep, 'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

            print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
        else:
            if (ep-best_iter) > 2:
                break

    final_data.append(out_data)
fold 1/10
Build model...
Train...

Epoch 1/1
7600/7600 [==============================] - 24s 3ms/step - loss: 0.3562
Epoch 0: auc = 0.979725 (best=0.000000)
[[893 113]
 [ 42 952]]
Epoch 1/1
7600/7600 [==============================] - 23s 3ms/step - loss: 0.1643
Epoch 7: auc = 0.980221 (best=0.981659)
Epoch 1/1
7600/7600 [==============================] - 21s 3ms/step - loss: 0.1603
Epoch 8: auc = 0.979843 (best=0.981659)
print(sklearn.metrics.classification_report(final_data[0]['y'], final_data[0]['probs'] > .5))
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      1006
           1       0.96      0.95      0.95       994

   micro avg       0.96      0.96      0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000
  • Post title: DGA Domain Detection
  • Post author: langu_xyz
  • Create time: 2019-08-24 21:00:00
  • Post link: https://blog.langu.xyz/DGA Domain Detection/
  • Copyright Notice: All articles in this blog are licensed under BY-NC-SA unless stated otherwise.