0x01 Domain Generation Algorithm

Domain generation algorithms (DGA) are algorithms seen in various families of malware that are used to periodically generate a large number of domain names that can serve as rendezvous points with their command-and-control servers.
Example
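A minimal sketch of what such a generator can look like. This is a hypothetical date-seeded hash scheme written for illustration only, not the algorithm of any specific malware family:

import hashlib
from datetime import date

def toy_dga(seed, day, count=10, tld='.info'):
    """Hypothetical DGA: hash a shared seed plus the current date to derive domains."""
    domains = []
    for i in range(count):
        data = '%s-%s-%d' % (seed, day.isoformat(), i)
        digest = hashlib.md5(data.encode('utf-8')).hexdigest()
        domains.append(digest[:16] + tld)
    return domains

print(toy_dga('botnet-key', date(2019, 1, 1)))

Because both the malware and its operator can run the same computation, the operator only has to register a few of the generated names while defenders face a moving target of pseudo-random domains.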
0x02 Random Forest

random forest = bagging + decision trees
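That is, a random forest trains many decision trees on bootstrap samples of the data (plus random feature subsets) and aggregates their votes. A minimal illustrative sketch of the same idea using scikit-learn's BaggingClassifier around DecisionTreeClassifier; the toy dataset here is made up purely for demonstration:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=4, random_state=0)

# bagging + decision trees: each tree sees a bootstrap sample of the training data
bagged_trees = BaggingClassifier(DecisionTreeClassifier(max_features='sqrt'),
                                 n_estimators=20, bootstrap=True, random_state=0)
bagged_trees.fit(X_demo, y_demo)
print(bagged_trees.predict(X_demo[:5]))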
0x03 code
Random Forest
MultinomialNB
import os, sys
import traceback
import json
import optparse
import pickle
import collections
import sklearn
import sklearn.feature_extraction
import sklearn.ensemble
import sklearn.metrics
import pandas as pd
import numpy as np
import tldextract
import math
import operator
from sklearn.model_selection import train_test_split
from matplotlib import pylab
from pylab import *
Collect Data
alexa_dataframe = pd.read_csv('data/alexa_100k.csv', names=['rank','uri'], header=None, encoding='utf-8')
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
rank 100000 non-null int64
uri 100000 non-null object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB
   rank           uri
0     1  facebook.com
1     2    google.com
2     3   youtube.com
3     4     yahoo.com
4     5     baidu.com
dga_dataframe = pd.read_csv('data/dga_domains.txt', names=['raw_domain'], header=None, encoding='utf-8')
dga_dataframe.info()
dga_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2669 entries, 0 to 2668
Data columns (total 1 columns):
raw_domain 2669 non-null object
dtypes: object(1)
memory usage: 20.9+ KB
                              raw_domain
0  04055051be412eea5a61b7da8438be3d.info
1                        1cb8a5f36f.info
2   30acd347397c34fc273e996b22951002.org
3  336c986a284e2b3bc0f69f949cb437cb.info
4   336c986a284e2b3bc0f69f949cb437cb.org
word_dataframe = pd.read_csv('data/words.txt', names=['word'], header=None, dtype={'word': np.str}, encoding='utf-8')
word_dataframe.info()
word_dataframe.head(10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479623 entries, 0 to 479622
Data columns (total 1 columns):
word 479619 non-null object
dtypes: object(1)
memory usage: 3.7+ MB
       word
0      1080
1  10-point
2      10th
3  11-point
4  12-point
5  16-point
6  18-point
7       1st
8         2
9  20-point
Prepare Data
def domain_extract(uri):
    ext = tldextract.extract(uri)
    if (not ext.suffix):
        return None
    else:
        return ext.domain

alexa_dataframe['domain'] = [ domain_extract(uri) for uri in alexa_dataframe['uri']]
del alexa_dataframe['rank']
del alexa_dataframe['uri']
alexa_dataframe = alexa_dataframe.dropna()
alexa_dataframe = alexa_dataframe.drop_duplicates()
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 91377 entries, 0 to 99999
Data columns (total 1 columns):
domain 91377 non-null object
dtypes: object(1)
memory usage: 1.4+ MB
     domain
0  facebook
1    google
2   youtube
3     yahoo
4     baidu
alexa_dataframe['class'] = 'legit'
alexa_dataframe.head()
     domain  class
0  facebook  legit
1    google  legit
2   youtube  legit
3     yahoo  legit
4     baidu  legit
alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
alexa_total = alexa_dataframe.shape[0]
print('Total Alexa domains %d' % alexa_total)
Total Alexa domains 91377
dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
del dga_dataframe['raw_domain']
dga_dataframe = dga_dataframe.dropna()
dga_dataframe = dga_dataframe.drop_duplicates()
dga_total = dga_dataframe.shape[0]
print('Total DGA domains %d' % dga_total)
Total DGA domains 2664
dga_dataframe['class'] = 'dga'
dga_dataframe.head()
                             domain class
0  04055051be412eea5a61b7da8438be3d   dga
1                        1cb8a5f36f   dga
2  30acd347397c34fc273e996b22951002   dga
3  336c986a284e2b3bc0f69f949cb437cb   dga
5  40a43e61e56a5c218cf6c22aca27f7ee   dga
def entropy(s):
    ''' Shannon entropy of a string '''
    p, lns = collections.Counter(s), float(len(s))
    return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
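This is the Shannon entropy H(s) = -Σ p(c)·log2 p(c) over the character frequencies of the string; random-looking DGA domains tend to score higher than dictionary-like names. A quick sanity check (the facebook value also appears in the feature table below):

print(entropy('facebook'))   # 2.75 -- six characters appear once, 'o' twice, length 8
print(entropy('aaaaaaaa'))   # 0.0  -- a single repeated character carries no information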
all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)

all_domains['length'] = [len(x) for x in all_domains['domain']]
all_domains = all_domains[all_domains['length'] > 6]
all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]
all_domains.head(10)
               domain  class  length   entropy
0            facebook  legit       8  2.750000
2             youtube  legit       7  2.521641
5           wikipedia  legit       9  2.641604
10           blogspot  legit       8  2.750000
11            twitter  legit       7  2.128085
12           linkedin  legit       8  2.500000
19          wordpress  legit       9  2.725481
23          microsoft  legit       9  2.947703
27            xvideos  legit       7  2.807355
28  googleusercontent  legit      17  3.175123
Analyze Data
all_domains.boxplot('length','class')
pylab.ylabel('Domain Length')

all_domains.boxplot('entropy','class')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['length'], alexa['entropy'], s=140, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['length'], dga['entropy'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
pylab.xlabel('Domain Length')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')
                     domain class  length   entropy
94031              xcfwwghb   dga       8  2.750000
94032  xcgqdfyrkgihlrmfmfib   dga      20  3.684184
94033            xclqwzcfcx   dga      10  2.646439
94034              xcpfxzuf   dga       8  2.500000
94035              xcvxhxze   dga       8  2.405639
94036              xdbrbsbm   dga       8  2.405639
94037      xdfjryydcfwvkvui   dga      16  3.500000
94038              xdjlvcgw   dga       8  3.000000
94039               xdrmjeu   dga       7  2.807355
94040      xflrjyyjswoatsoq   dga      16  3.500000
legit = all_domains[(all_domains['class']=='legit')]
max_grams = np.maximum(legit['alexa_grams'], legit['word_grams'])
ax = max_grams.hist(bins=80)
ax.figure.suptitle('Histogram of the Max NGram Score for Domains')
pylab.xlabel('Number of Domains')
pylab.ylabel('Maximum NGram Score')
Text(0,0.5,'Maximum NGram Score')
word_dataframe = word_dataframe[word_dataframe['word'].map(lambda x: str(x).isalpha())]
word_dataframe = word_dataframe.applymap(lambda x: str(x).strip().lower())
word_dataframe = word_dataframe.dropna()
word_dataframe = word_dataframe.drop_duplicates()
word_dataframe.head(10)
      word
37       a
48      aa
51     aaa
53    aaaa
54  aaaaaa
55    aaal
56    aaas
57  aaberg
58  aachen
59     aae
alexa_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-4, max_df=1.0)

counts_matrix = alexa_vc.fit_transform(alexa_dataframe['domain'])
alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())
print(alexa_counts[:10])

ngrams_list = alexa_vc.get_feature_names()
print(ngrams_list[:10])

_sorted_ngrams = sorted(zip(ngrams_list, alexa_counts), key=operator.itemgetter(1), reverse=True)
print('Alexa NGrams: %d' % len(_sorted_ngrams))
for ngram, count in _sorted_ngrams[:10]:
    print(ngram, count)
[1. 1. 1.17609126 1.64345268 1.11394335 1.14612804
1. 1.17609126 1.07918125 1.54406804]
['-20', '-a-', '-ac', '-ad', '-ads', '-af', '-ag', '-ai', '-air', '-al']
Alexa NGrams: 23613
ing 3.443888546777372
lin 3.4271614029259654
ine 3.399673721481038
tor 3.26528962586083
ter 3.2631624649622166
ion 3.2467447097238415
ent 3.228913405994688
por 3.2013971243204513
the 3.2005769267548483
ree 3.16345955176999
dict_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-5, max_df=1.0)
counts_matrix = dict_vc.fit_transform(word_dataframe['word'])
dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())
ngrams_list = dict_vc.get_feature_names()
print(ngrams_list[:10])
['aaa', 'aab', 'aac', 'aad', 'aaf', 'aag', 'aah', 'aai', 'aak', 'aal']
_sorted_ngrams = sorted(zip(ngrams_list, dict_counts), key=operator.itemgetter(1), reverse=True)
print('Word NGrams: %d' % len(_sorted_ngrams))
for ngram, count in _sorted_ngrams[:10]:
    print(ngram, count)
Word NGrams: 123061
ing 4.387300822448285
ess 4.204879333760662
ati 4.1933472563864616
ion 4.165036479994566
ter 4.162415036106447
nes 4.112504458767161
tio 4.076822423342773
ate 4.0723602039634885
ent 4.069631102620343
tion 4.0496056125949735
def ngram_count(domain):
    ''' n-gram match scores of a domain against the Alexa and dictionary n-grams '''
    alexa_match = alexa_counts * alexa_vc.transform([domain]).T
    dict_match = dict_counts * dict_vc.transform([domain]).T
    print('%s Alexa match:%d Dict match: %d' % (domain, alexa_match, dict_match))
ngram_count('google')
ngram_count('facebook')
ngram_count('1cb8a5f36f')
ngram_count('pterodactylfarts')
google Alexa match:17 Dict match: 14
facebook Alexa match:31 Dict match: 27
1cb8a5f36f Alexa match:0 Dict match: 0
pterodactylfarts Alexa match:35 Dict match: 76
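The score is simply a dot product: `alexa_vc.transform([domain])` yields the domain's n-gram counts, and multiplying by `alexa_counts` (the log10 corpus frequencies) sums the log-frequency of every n-gram hit. A tiny hand-worked illustration with made-up numbers:

# Hypothetical illustration: a vocabulary of 3 n-grams with log10 corpus counts
corpus_log_counts = np.array([3.2, 2.1, 1.0])
# Suppose the domain contains the first n-gram twice and the third once
domain_ngram_counts = np.array([2, 0, 1])
print(np.dot(corpus_log_counts, domain_ngram_counts))   # 2*3.2 + 0*2.1 + 1*1.0 = 7.4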
all_domains['alexa_grams'] = alexa_counts * alexa_vc.transform(all_domains['domain']).T
all_domains['word_grams'] = dict_counts * dict_vc.transform(all_domains['domain']).T
all_domains.head(10)
               domain  class  length   entropy  alexa_grams  word_grams
0            facebook  legit       8  2.750000    31.302278   27.872426
2             youtube  legit       7  2.521641    25.855170   18.287142
5           wikipedia  legit       9  2.641604    24.571024   29.175635
10           blogspot  legit       8  2.750000    24.435141   19.274501
11            twitter  legit       7  2.128085    23.244500   31.130820
12           linkedin  legit       8  2.500000    24.774916   32.904408
19          wordpress  legit       9  2.725481    38.369509   33.806635
23          microsoft  legit       9  2.947703    32.133033   39.530125
27            xvideos  legit       7  2.807355    28.906360   18.846834
28  googleusercontent  legit      17  3.175123    67.315750   86.104683
all_domains['diff'] = all_domains['alexa_grams'] - all_domains['word_grams']
all_domains.sort_values(['diff'], ascending=True).head(10)
                                 domain  class  length   entropy  alexa_grams  word_grams       diff
79366  bipolardisorderdepressionanxiety  legit      32  3.616729   117.312465  190.833856 -73.521391
72512     channel4embarrassingillnesses  legit      29  3.440070    95.786979  169.119440 -73.332460
10961    stirringtroubleinternationally  legit      30  3.481728   134.049367  207.204729 -73.155362
85031  americansforresponsiblesolutions  legit      32  3.667838   148.143049  218.363956 -70.220908
20459               pragmatismopolitico  legit      19  3.326360    61.244630  121.536223 -60.291593
13702           egaliteetreconciliation  legit      23  3.186393    91.938518  152.125325 -60.186808
4706            interoperabilitybridges  legit      23  3.588354    95.037285  153.626312 -58.589028
85161            foreclosurephilippines  legit      22  3.447402    74.506548  132.514638 -58.008090
45636       annamalicesissyselfhypnosis  legit      27  3.429908    68.680068  126.667692 -57.987623
70351         corazonindomablecapitulos  legit      25  3.813661    75.535473  133.160690 -57.625217
all_domains.sort_values(['diff'], ascending=False).head(10)
                                                  domain  class  length   entropy  alexa_grams  word_grams       diff
54228  gay-sex-pics-porn-pictures-gay-sex-porn-gay-se...  legit      56  3.661056   159.642301   85.124184  74.518116
85091    article-directory-free-submission-free-content  legit      46  3.786816   235.233896  188.230453  47.003443
16893                          stream-free-movies-online  legit      25  3.509275   120.250616   74.496915  45.753701
63380                            watch-free-movie-online  legit      23  3.708132   103.029245   58.943451  44.085794
44253                          best-online-shopping-site  legit      25  3.452879   123.377240   79.596640  43.780601
22524                      social-bookmarking-sites-list  legit      29  3.702472   145.755266  102.261826  43.493440
66335                              free-online-directory  legit      21  3.403989   123.379738   80.735030  42.644708
46553                      free-links-articles-directory  legit      29  3.702472   153.239055  110.955361  42.283694
59873                               online-web-directory  legit      20  3.584184   116.310717   74.082948  42.227769
58016                               web-directory-online  legit      20  3.584184   114.402671   74.082948  40.319723
weird_cond = (all_domains['class']=='legit') & (all_domains['word_grams']<3) & (all_domains['alexa_grams']<2)
weird = all_domains[weird_cond]
print(weird.shape[0])
weird.head(10)
91
            domain  class  length   entropy  alexa_grams  word_grams      diff
1246       twcczhu  legit       7  2.521641     1.748188         0.0  1.748188
2009       ggmm777  legit       7  1.556657     1.518514         0.0  1.518514
2760       qq66699  legit       7  1.556657     1.342423         0.0  1.342423
17347      crx7601  legit       7  2.807355     0.000000         0.0  0.000000
18682     hzsxzhyy  legit       8  2.250000     0.000000         0.0  0.000000
19418  02022222222  legit      11  0.684038     1.041393         0.0  1.041393
19887      3181302  legit       7  2.235926     0.000000         0.0  0.000000
21172      hljdns4  legit       7  2.807355     1.755875         0.0  1.755875
26441      05tz2e9  legit       7  2.807355     0.000000         0.0  0.000000
26557      fzysqmy  legit       7  2.521641     1.176091         0.0  1.176091
all_domains.loc[weird_cond, 'class'] = 'weird'
all_domains['class'].value_counts()
legit 67221
dga 2664
weird 91
Name: class, dtype: int64
all_domains[all_domains['class'] == 'weird'].head()
         domain  class  length   entropy  alexa_grams  word_grams      diff
1246    twcczhu  weird       7  2.521641     1.748188         0.0  1.748188
2009    ggmm777  weird       7  1.556657     1.518514         0.0  1.518514
2760    qq66699  weird       7  1.556657     1.342423         0.0  1.342423
17347   crx7601  weird       7  2.807355     0.000000         0.0  0.000000
18682  hzsxzhyy  weird       8  2.250000     0.000000         0.0  0.000000
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['word_grams'], alexa['entropy'], s=140, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['word_grams'], dga['entropy'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
pylab.xlabel('Domain word_grams')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')
Train the Algorithm
not_weird = all_domains[all_domains['class'] != 'weird']
X = not_weird[['length', 'entropy', 'alexa_grams', 'word_grams']].values
y = np.array(not_weird['class'].tolist())

clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
def show_cm(cm, labels):
    percent = (cm*100.0) / np.array(np.matrix(cm.sum(axis=1)).T)
    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum()))
labels = ['legit', 'dga']
cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=labels)
show_cm(cm, labels)
Confusion Matrix Stats
legit/legit: 99.57% (13369/13427)
legit/dga: 0.43% (58/13427)
dga/legit: 15.45% (85/550)
dga/dga: 84.55% (465/550)
importances = zip(['length', 'entropy', 'alexa_grams', 'word_grams'], clf.feature_importances_)
list(importances)
[('length', 0.16033779891739047),
('entropy', 0.12175502861193326),
('alexa_grams', 0.5087685303664589),
('word_grams', 0.20913864210421748)]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
Test the Algorithm
def test_it(domain):
    _alexa_match = alexa_counts * alexa_vc.transform([domain]).T
    _dict_match = dict_counts * dict_vc.transform([domain]).T
    _X = [[len(domain), entropy(domain), _alexa_match, _dict_match]]
    print('%s : %s' % (domain, clf.predict(_X)[0]))
test_it('google')
test_it('google8sdflkajssjgjksdh')
test_it('faceboosadfadfafdk')
test_it('1cb8a5f36f')
test_it('pterodactyladfasdfasdffarts')
test_it('ptes9dro-dwacty2lfa5rrts')
test_it('beyonce')
test_it('bey666on4ce')
test_it('supersexy')
test_it('yourmomissohotinthesummertime')
google : legit
google8sdflkajssjgjksdh : dga
faceboosadfadfafdk : legit
1cb8a5f36f : dga
pterodactyladfasdfasdffarts : legit
ptes9dro-dwacty2lfa5rrts : dga
beyonce : legit
bey666on4ce : dga
supersexy : legit
yourmomissohotinthesummertime : legit
Use the Algorithm
def save_model_to_disk(name, model, model_dir='models'):
    serialized_model = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL)
    model_path = os.path.join(model_dir, name+'.model')
    print('Storing Serialized Model to Disk (%s:%.2fMeg)' % (name, len(serialized_model)/1024.0/1024.0))
    open(model_path,'wb').write(serialized_model)
save_model_to_disk('dga_model_random_forest', clf)
save_model_to_disk('dga_model_alexa_vectorizor', alexa_vc)
save_model_to_disk('dga_model_alexa_counts', alexa_counts)
save_model_to_disk('dga_model_dict_vectorizor', dict_vc)
save_model_to_disk('dga_model_dict_counts', dict_counts)
Storing Serialized Model to Disk (dga_model_random_forest:1.80Meg)
Storing Serialized Model to Disk (dga_model_alexa_vectorizor:2.93Meg)
Storing Serialized Model to Disk (dga_model_alexa_counts:0.18Meg)
Storing Serialized Model to Disk (dga_model_dict_vectorizor:5.39Meg)
Storing Serialized Model to Disk (dga_model_dict_counts:0.94Meg)
def load_model_from_disk(name, model_dir='models'):
    model_path = os.path.join(model_dir, name+'.model')
    try:
        model = pickle.loads(open(model_path,'rb').read())
        print('success')
    except:
        print('Could not load model: %s from directory %s!' % (name, model_path))
        return None
    return model
clf = load_model_from_disk('dga_model_random_forest')
alexa_vc = load_model_from_disk('dga_model_alexa_vectorizor')
alexa_counts = load_model_from_disk('dga_model_alexa_counts')
dict_vc = load_model_from_disk('dga_model_dict_vectorizor')
dict_counts = load_model_from_disk('dga_model_dict_counts')

model = {'clf':clf, 'alexa_vc':alexa_vc, 'alexa_counts':alexa_counts, 'dict_vc':dict_vc, 'dict_counts':dict_counts}
success
success
success
success
success
def evaluate_url(model, url):
    domain = domain_extract(url)
    # score the extracted domain so the features match what the model was trained on
    alexa_match = model['alexa_counts'] * model['alexa_vc'].transform([domain]).T
    dict_match = model['dict_counts'] * model['dict_vc'].transform([domain]).T

    X = [[len(domain), entropy(domain), alexa_match, dict_match]]
    y_pred = model['clf'].predict(X)[0]
    print('%s : %s' % (domain, y_pred))
evaluate_url(model, 'adfhalksfhjashfk.com')
adfhalksfhjashfk : dga
from sklearn.naive_bayes import MultinomialNB

mtnb = MultinomialNB()
mtnb.fit(X_train, y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
from sklearn.metrics import classification_report

nb_y_pred = mtnb.predict(X_test)
print(classification_report(y_test, nb_y_pred))
cm = sklearn.metrics.confusion_matrix(y_test, nb_y_pred, labels=labels)
show_cm(cm, labels)
precision recall f1-score support
dga 0.71 0.87 0.78 550
legit 0.99 0.99 0.99 13427
avg / total 0.98 0.98 0.98 13977
Confusion Matrix Stats
legit/legit: 98.56% (13233/13427)
legit/dga: 1.44% (194/13427)
dga/legit: 13.27% (73/550)
dga/dga: 86.73% (477/550)
import os
import random
import tldextract
import sklearn
import pandas as pd
import numpy as np

from keras.models import Sequential, load_model
from keras.preprocessing import sequence
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn import feature_extraction
from sklearn.model_selection import train_test_split
from datetime import datetime
from zipfile import ZipFile
alexa_dataframe = pd.read_csv('data/top-1m.csv', names=['rank','uri'], header=None, encoding='utf-8')
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
rank 1000000 non-null int64
uri 1000000 non-null object
dtypes: int64(1), object(1)
memory usage: 15.3+ MB
   rank            uri
0     1     google.com
1     2    youtube.com
2     3   facebook.com
3     4      baidu.com
4     5  wikipedia.org
def load_data_set(filename):
    fw = open('data/dga_domain.txt', 'w+')
    with open(filename, "r") as f:
        for line in f.readlines():
            lineArr = line.strip().split('\t')
            fw.write(lineArr[1] + '\n')
    fw.close()

load_data_set('data/dga.txt')
dga_dataframe = pd.read_csv('data/dga_domain.txt', names=['raw_domain'], header=None, encoding='utf-8')
dga_dataframe.info()
dga_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1158695 entries, 0 to 1158694
Data columns (total 1 columns):
raw_domain 1158695 non-null object
dtypes: object(1)
memory usage: 8.8+ MB
        raw_domain
0    ogxbnjopz.biz
1    zyejwiist.net
2      buuqogz.com
3  vpjmomduqll.org
4  uakwifutnpn.biz
def domain_extract(uri):
    ext = tldextract.extract(uri)
    if (not ext.suffix):
        return None
    else:
        return ext.domain

alexa_dataframe['domain'] = [ domain_extract(uri) for uri in alexa_dataframe['uri']]
del alexa_dataframe['rank']
del alexa_dataframe['uri']
alexa_dataframe = alexa_dataframe.dropna()
alexa_dataframe = alexa_dataframe.drop_duplicates()
alexa_dataframe['length'] = [len(x) for x in alexa_dataframe['domain']]
alexa_dataframe = alexa_dataframe[alexa_dataframe['length'] > 6]
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 718018 entries, 1 to 999999
Data columns (total 2 columns):
domain 718018 non-null object
length 718018 non-null int64
dtypes: int64(1), object(1)
memory usage: 16.4+ MB
       domain  length
1     youtube       7
2    facebook       8
4   wikipedia       9
11  instagram       9
13    twitter       7
alexa_dataframe['class'] = 'legit'
alexa_dataframe.head()
       domain  length  class
1     youtube       7  legit
2    facebook       8  legit
4   wikipedia       9  legit
11  instagram       9  legit
13    twitter       7  legit
alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
alexa_total = alexa_dataframe.shape[0]
print('Total Alexa domains %d' % alexa_total)
Total Alexa domains 718018
dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
del dga_dataframe['raw_domain']
dga_dataframe = dga_dataframe.dropna()
dga_dataframe = dga_dataframe.drop_duplicates()
dga_dataframe['length'] = [len(x) for x in dga_dataframe['domain']]
dga_dataframe = dga_dataframe[dga_dataframe['length'] > 6]
dga_total = dga_dataframe.shape[0]
print('Total DGA domains %d' % dga_total)
Total DGA domains 1082010
dga_dataframe['class'] = 'dga'
dga_dataframe.head()
        domain  length class
0    ogxbnjopz       9   dga
1    zyejwiist       9   dga
2      buuqogz       7   dga
3  vpjmomduqll      11   dga
4  uakwifutnpn      11   dga
all_domains = pd.concat([alexa_dataframe[:5000], dga_dataframe[:5000]], ignore_index=True)
all_domains.head(10)
       domain  length  class
0     youtube       7  legit
1    facebook       8  legit
2   wikipedia       9  legit
3   instagram       9  legit
4     twitter       7  legit
5    blogspot       8  legit
6     netflix       7  legit
7     pornhub       7  legit
8     xvideos       7  legit
9  livejasmin      10  legit
                     domain  length class
9990              mxepwpxki       9   dga
9991     xnvqgaddhivrqowtbs      18   dga
9992   btgjyoydcwoeigdldngr      20   dga
9993          mnnridfyhxkyk      13   dga
9994       jmcctiodbdemfejo      16   dga
9995           mepoiwtmeffy      12   dga
9996        iwpikrmppfqeere      15   dga
9997               gcibdmrs       8   dga
9998  tusdspujigdyntbxusuah      21   dga
9999  wvsiuqhblxfijnoefjnao      21   dga
X = all_domains['domain']
labels = all_domains['class']
ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
count_vec = ngram_vectorizer.fit_transform(X)
max_features = count_vec.shape[1]
y = [0 if x == 'legit' else 1 for x in labels]
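For intuition, a small stand-alone demo of what the character-bigram features look like; the real vocabulary and column order come from fitting on all_domains, so the output below is only illustrative:

demo_vec = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
demo_counts = demo_vec.fit_transform(['google'])
print(demo_vec.get_feature_names())   # ['gl', 'go', 'le', 'og', 'oo']
print(demo_counts.toarray())          # [[1 1 1 1 1]]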
Multilayer Perceptron (MLP)

def build_model(max_features):
    model = Sequential()
    model.add(Dense(1, input_dim=max_features, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model
max_epoch = 50
nfolds = 10
batch_size = 128

final_data = []
for fold in range(nfolds):
    print("fold %u/%u" % (fold+1, nfolds))
    X_train, X_test, y_train, y_test, _, label_test = train_test_split(count_vec, y, labels, test_size=0.2)

    print('Build model...')
    model = build_model(max_features)

    print("Train...")
    X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
    best_iter = -1
    best_auc = 0.0
    out_data = {}

    for ep in range(max_epoch):
        model.fit(X_train.todense(), y_train, batch_size=batch_size, epochs=1)
        t_probs = model.predict_proba(X_holdout.todense())
        t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)
        print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

        if t_auc > best_auc:
            best_auc = t_auc
            best_iter = ep
            probs = model.predict_proba(X_test.todense())
            out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                        'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}
            print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
        else:
            if (ep-best_iter) > 5:
                break

    final_data.append(out_data)
    model.save('model.h5')
fold 1/10
Build model...
Train...
Epoch 1/1
7600/7600 [==============================] - 1s 86us/step - loss: 0.6297
Epoch 0: auc = 0.950239 (best=0.000000)
[[915 86]
[108 891]]
Epoch 1/1
7600/7600 [==============================] - 0s 26us/step - loss: 0.5243
Epoch 1: auc = 0.980196 (best=0.950239)
[[952 49]
[ 83 916]]
Epoch 1/1
7600/7600 [==============================] - 0s 31us/step - loss: 0.4502
Epoch 2: auc = 0.984872 (best=0.980196)
[[965 36]
[ 78 921]]
Epoch 1/1
7600/7600
Epoch 32: auc = 0.994192 (best=0.994192)
model = load_model('model.h5')
[{'y': [0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, ...],
  'labels': 2403    legit
            2789    legit
            450     legit
            4521    legit
            2841    legit
            8645      dga
            ...
            Name: class, Length: 2000, dtype: object,
  'probs': array([[0.14488636],
                  [0.00496732],
                  [0.00896166],
                  ...,
                  [0.00593334],
                  [0.95598286],
                  [0.9867235 ]], dtype=float32),
  'epochs': 43,
  'confusion_matrix': array([[972,  29],
                             [ 62, 937]])}]
z_test = np.array([[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, ...]])  # one bigram feature vector, truncated here
model.predict(z_test)
array([[1.]], dtype=float32)
print(sklearn.metrics.classification_report(final_data[0]['y'], final_data[0]['probs'] > .5))
precision recall f1-score support
0 0.95 0.97 0.96 970
1 0.97 0.95 0.96 1030
micro avg 0.96 0.96 0.96 2000
macro avg 0.96 0.96 0.96 2000
weighted avg 0.96 0.96 0.96 2000
LSTM

def build_model_lstm(max_features, maxlen):
    """Build LSTM model"""
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=maxlen))
    model.add(LSTM(128))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    return model
X = all_domains['domain']
labels = all_domains['class']

valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X)))}
max_features = len(valid_chars) + 1
maxlen = np.max([len(x) for x in X])

X = [[valid_chars[y] for y in x] for x in X]
X = sequence.pad_sequences(X, maxlen=maxlen)

y = [0 if x == 'legit' else 1 for x in labels]

final_data = []
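A small illustration of this char-to-int encoding plus padding that feeds the Embedding layer. The mapping below is hypothetical; the actual indices depend on the fitted valid_chars dict:

demo_map = {'e': 1, 'g': 2, 'l': 3, 'o': 4}
demo = [[demo_map[c] for c in 'google']]          # [[2, 4, 4, 2, 3, 1]]
print(sequence.pad_sequences(demo, maxlen=10))    # [[0 0 0 0 2 4 4 2 3 1]]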
for fold in range(nfolds):
    print("fold %u/%u" % (fold+1, nfolds))
    X_train, X_test, y_train, y_test, _, label_test = train_test_split(X, y, labels, test_size=0.2)

    print('Build model...')
    model = build_model_lstm(max_features, maxlen)

    print("Train...")
    X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
    best_iter = -1
    best_auc = 0.0
    out_data = {}

    for ep in range(max_epoch):
        model.fit(X_train, y_train, batch_size=batch_size, epochs=1)
        t_probs = model.predict_proba(X_holdout)
        t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)
        print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

        if t_auc > best_auc:
            best_auc = t_auc
            best_iter = ep
            probs = model.predict_proba(X_test)
            out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                        'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}
            print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
        else:
            if (ep-best_iter) > 2:
                break

    final_data.append(out_data)
fold 1/10
Build model...
Train...
Epoch 1/1
7600/7600 [==============================] - 24s 3ms/step - loss: 0.3562
Epoch 0: auc = 0.979725 (best=0.000000)
[[893 113]
[ 42 952]]
Epoch 1/1
7600/7600 [==============================] - 23s 3ms/step - loss: 0.1643
Epoch 7: auc = 0.980221 (best=0.981659)
Epoch 1/1
7600/7600 [==============================] - 21s 3ms/step - loss: 0.1603
Epoch 8: auc = 0.979843 (best=0.981659)
print(sklearn.metrics.classification_report(final_data[0]['y'], final_data[0]['probs'] > .5))
precision recall f1-score support
0 0.95 0.96 0.96 1006
1 0.96 0.95 0.95 994
micro avg 0.96 0.96 0.96 2000
macro avg 0.96 0.96 0.96 2000
weighted avg 0.96 0.96 0.96 2000