0x01 Domain Generation Algorithm

Domain generation algorithms (DGA) are algorithms seen in various families of malware that are used to periodically generate a large number of domain names that can serve as rendezvous points with their command-and-control servers.
Example
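A minimal sketch of what such a generator can look like. This is a hypothetical date-seeded hash scheme written for illustration only, not the algorithm of any specific malware family:

import hashlib
from datetime import date

def toy_dga(seed, day, count=10, tld='.info'):
    """Hypothetical DGA: hash a shared seed plus the current date to derive domains."""
    domains = []
    for i in range(count):
        data = '%s-%s-%d' % (seed, day.isoformat(), i)
        digest = hashlib.md5(data.encode('utf-8')).hexdigest()
        domains.append(digest[:16] + tld)
    return domains

print(toy_dga('botnet-key', date(2019, 1, 1)))

Because both the malware and its operator can run the same computation, the operator only has to register a few of the generated names while defenders face a moving target of pseudo-random domains.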
0x02 Random Forest

random forest = bagging + decision trees
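That is, a random forest trains many decision trees on bootstrap samples of the data (plus random feature subsets) and aggregates their votes. A minimal illustrative sketch of the same idea using scikit-learn's BaggingClassifier around DecisionTreeClassifier; the toy dataset here is made up purely for demonstration:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=4, random_state=0)

# bagging + decision trees: each tree sees a bootstrap sample of the training data
bagged_trees = BaggingClassifier(DecisionTreeClassifier(max_features='sqrt'),
                                 n_estimators=20, bootstrap=True, random_state=0)
bagged_trees.fit(X_demo, y_demo)
print(bagged_trees.predict(X_demo[:5]))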
0x03 code
Random Forest
MultinomialNB
import os, sys
import traceback
import json
import optparse
import pickle
import collections
import sklearn
import sklearn.feature_extraction
import sklearn.ensemble
import sklearn.metrics
import pandas as pd
import numpy as np
import tldextract
import math
import operator
from sklearn.model_selection import train_test_split
from matplotlib import pylab
from pylab import *
Collect Data
alexa_dataframe = pd.read_csv('data/alexa_100k.csv', names=['rank','uri'], header=None, encoding='utf-8')
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
rank 100000 non-null int64
uri 100000 non-null object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB
   rank           uri
0     1  facebook.com
1     2    google.com
2     3   youtube.com
3     4     yahoo.com
4     5     baidu.com
dga_dataframe = pd.read_csv('data/dga_domains.txt', names=['raw_domain'], header=None, encoding='utf-8')
dga_dataframe.info()
dga_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2669 entries, 0 to 2668
Data columns (total 1 columns):
raw_domain 2669 non-null object
dtypes: object(1)
memory usage: 20.9+ KB
                              raw_domain
0  04055051be412eea5a61b7da8438be3d.info
1                        1cb8a5f36f.info
2   30acd347397c34fc273e996b22951002.org
3  336c986a284e2b3bc0f69f949cb437cb.info
4   336c986a284e2b3bc0f69f949cb437cb.org
word_dataframe = pd.read_csv('data/words.txt', names=['word'], header=None, dtype={'word': np.str}, encoding='utf-8')
word_dataframe.info()
word_dataframe.head(10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479623 entries, 0 to 479622
Data columns (total 1 columns):
word 479619 non-null object
dtypes: object(1)
memory usage: 3.7+ MB
       word
0      1080
1  10-point
2      10th
3  11-point
4  12-point
5  16-point
6  18-point
7       1st
8         2
9  20-point
Prepare Data
def domain_extract(uri):
    ext = tldextract.extract(uri)
    if (not ext.suffix):
        return None
    else:
        return ext.domain

alexa_dataframe['domain'] = [ domain_extract(uri) for uri in alexa_dataframe['uri']]
del alexa_dataframe['rank']
del alexa_dataframe['uri']
alexa_dataframe = alexa_dataframe.dropna()
alexa_dataframe = alexa_dataframe.drop_duplicates()
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 91377 entries, 0 to 99999
Data columns (total 1 columns):
domain 91377 non-null object
dtypes: object(1)
memory usage: 1.4+ MB
     domain
0  facebook
1    google
2   youtube
3     yahoo
4     baidu
alexa_dataframe['class'] = 'legit'
alexa_dataframe.head()
     domain  class
0  facebook  legit
1    google  legit
2   youtube  legit
3     yahoo  legit
4     baidu  legit
alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
alexa_total = alexa_dataframe.shape[0]
print('Total Alexa domains %d' % alexa_total)
Total Alexa domains 91377
dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
del dga_dataframe['raw_domain']
dga_dataframe = dga_dataframe.dropna()
dga_dataframe = dga_dataframe.drop_duplicates()
dga_total = dga_dataframe.shape[0]
print('Total DGA domains %d' % dga_total)
Total DGA domains 2664
dga_dataframe['class'] = 'dga'
dga_dataframe.head()
                             domain class
0  04055051be412eea5a61b7da8438be3d   dga
1                        1cb8a5f36f   dga
2  30acd347397c34fc273e996b22951002   dga
3  336c986a284e2b3bc0f69f949cb437cb   dga
5  40a43e61e56a5c218cf6c22aca27f7ee   dga
def entropy(s):
    ''' Shannon entropy of a string '''
    p, lns = collections.Counter(s), float(len(s))
    return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
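This is the Shannon entropy H(s) = -Σ p(c)·log2 p(c) over the character frequencies of the string; random-looking DGA domains tend to score higher than dictionary-like names. A quick sanity check (the facebook value also appears in the feature table below):

print(entropy('facebook'))   # 2.75 -- six characters appear once, 'o' twice, length 8
print(entropy('aaaaaaaa'))   # 0.0  -- a single repeated character carries no information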
all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)

all_domains['length'] = [len(x) for x in all_domains['domain']]
all_domains = all_domains[all_domains['length'] > 6]
all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]
all_domains.head(10)
               domain  class  length   entropy
0            facebook  legit       8  2.750000
2             youtube  legit       7  2.521641
5           wikipedia  legit       9  2.641604
10           blogspot  legit       8  2.750000
11            twitter  legit       7  2.128085
12           linkedin  legit       8  2.500000
19          wordpress  legit       9  2.725481
23          microsoft  legit       9  2.947703
27            xvideos  legit       7  2.807355
28  googleusercontent  legit      17  3.175123
Analyze Data
all_domains.boxplot('length','class')
pylab.ylabel('Domain Length')

all_domains.boxplot('entropy','class')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['length'], alexa['entropy'], s=140, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['length'], dga['entropy'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
pylab.xlabel('Domain Length')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')
                     domain class  length   entropy
94031              xcfwwghb   dga       8  2.750000
94032  xcgqdfyrkgihlrmfmfib   dga      20  3.684184
94033            xclqwzcfcx   dga      10  2.646439
94034              xcpfxzuf   dga       8  2.500000
94035              xcvxhxze   dga       8  2.405639
94036              xdbrbsbm   dga       8  2.405639
94037      xdfjryydcfwvkvui   dga      16  3.500000
94038              xdjlvcgw   dga       8  3.000000
94039               xdrmjeu   dga       7  2.807355
94040      xflrjyyjswoatsoq   dga      16  3.500000
legit = all_domains[(all_domains['class']=='legit')]
max_grams = np.maximum(legit['alexa_grams'], legit['word_grams'])
ax = max_grams.hist(bins=80)
ax.figure.suptitle('Histogram of the Max NGram Score for Domains')
pylab.xlabel('Number of Domains')
pylab.ylabel('Maximum NGram Score')
Text(0,0.5,'Maximum NGram Score')
word_dataframe = word_dataframe[word_dataframe['word'].map(lambda x: str(x).isalpha())]
word_dataframe = word_dataframe.applymap(lambda x: str(x).strip().lower())
word_dataframe = word_dataframe.dropna()
word_dataframe = word_dataframe.drop_duplicates()
word_dataframe.head(10)
      word
37       a
48      aa
51     aaa
53    aaaa
54  aaaaaa
55    aaal
56    aaas
57  aaberg
58  aachen
59     aae
alexa_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-4, max_df=1.0)

counts_matrix = alexa_vc.fit_transform(alexa_dataframe['domain'])
alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())
print(alexa_counts[:10])

ngrams_list = alexa_vc.get_feature_names()
print(ngrams_list[:10])

_sorted_ngrams = sorted(zip(ngrams_list, alexa_counts), key=operator.itemgetter(1), reverse=True)
print('Alexa NGrams: %d' % len(_sorted_ngrams))
for ngram, count in _sorted_ngrams[:10]:
    print(ngram, count)
[1. 1. 1.17609126 1.64345268 1.11394335 1.14612804
1. 1.17609126 1.07918125 1.54406804]
['-20', '-a-', '-ac', '-ad', '-ads', '-af', '-ag', '-ai', '-air', '-al']
Alexa NGrams: 23613
ing 3.443888546777372
lin 3.4271614029259654
ine 3.399673721481038
tor 3.26528962586083
ter 3.2631624649622166
ion 3.2467447097238415
ent 3.228913405994688
por 3.2013971243204513
the 3.2005769267548483
ree 3.16345955176999
dict_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-5, max_df=1.0)
counts_matrix = dict_vc.fit_transform(word_dataframe['word'])
dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())
ngrams_list = dict_vc.get_feature_names()
print(ngrams_list[:10])
['aaa', 'aab', 'aac', 'aad', 'aaf', 'aag', 'aah', 'aai', 'aak', 'aal']
_sorted_ngrams = sorted(zip(ngrams_list, dict_counts), key=operator.itemgetter(1), reverse=True)
print('Word NGrams: %d' % len(_sorted_ngrams))
for ngram, count in _sorted_ngrams[:10]:
    print(ngram, count)
Word NGrams: 123061
ing 4.387300822448285
ess 4.204879333760662
ati 4.1933472563864616
ion 4.165036479994566
ter 4.162415036106447
nes 4.112504458767161
tio 4.076822423342773
ate 4.0723602039634885
ent 4.069631102620343
tion 4.0496056125949735
def ngram_count(domain):
    ''' n-gram match scores of a domain against the Alexa and dictionary n-grams '''
    alexa_match = alexa_counts * alexa_vc.transform([domain]).T
    dict_match = dict_counts * dict_vc.transform([domain]).T
    print('%s Alexa match:%d Dict match: %d' % (domain, alexa_match, dict_match))
ngram_count('google')
ngram_count('facebook')
ngram_count('1cb8a5f36f')
ngram_count('pterodactylfarts')
google Alexa match:17 Dict match: 14
facebook Alexa match:31 Dict match: 27
1cb8a5f36f Alexa match:0 Dict match: 0
pterodactylfarts Alexa match:35 Dict match: 76
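The score is simply a dot product: `alexa_vc.transform([domain])` yields the domain's n-gram counts, and multiplying by `alexa_counts` (the log10 corpus frequencies) sums the log-frequency of every n-gram hit. A tiny hand-worked illustration with made-up numbers:

# Hypothetical illustration: a vocabulary of 3 n-grams with log10 corpus counts
corpus_log_counts = np.array([3.2, 2.1, 1.0])
# Suppose the domain contains the first n-gram twice and the third once
domain_ngram_counts = np.array([2, 0, 1])
print(np.dot(corpus_log_counts, domain_ngram_counts))   # 2*3.2 + 0*2.1 + 1*1.0 = 7.4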
all_domains['alexa_grams'] = alexa_counts * alexa_vc.transform(all_domains['domain']).T
all_domains['word_grams'] = dict_counts * dict_vc.transform(all_domains['domain']).T
all_domains.head(10)
               domain  class  length   entropy  alexa_grams  word_grams
0            facebook  legit       8  2.750000    31.302278   27.872426
2             youtube  legit       7  2.521641    25.855170   18.287142
5           wikipedia  legit       9  2.641604    24.571024   29.175635
10           blogspot  legit       8  2.750000    24.435141   19.274501
11            twitter  legit       7  2.128085    23.244500   31.130820
12           linkedin  legit       8  2.500000    24.774916   32.904408
19          wordpress  legit       9  2.725481    38.369509   33.806635
23          microsoft  legit       9  2.947703    32.133033   39.530125
27            xvideos  legit       7  2.807355    28.906360   18.846834
28  googleusercontent  legit      17  3.175123    67.315750   86.104683
all_domains['diff'] = all_domains['alexa_grams'] - all_domains['word_grams']
all_domains.sort_values(['diff'], ascending=True).head(10)
                                 domain  class  length   entropy  alexa_grams  word_grams       diff
79366  bipolardisorderdepressionanxiety  legit      32  3.616729   117.312465  190.833856 -73.521391
72512     channel4embarrassingillnesses  legit      29  3.440070    95.786979  169.119440 -73.332460
10961    stirringtroubleinternationally  legit      30  3.481728   134.049367  207.204729 -73.155362
85031  americansforresponsiblesolutions  legit      32  3.667838   148.143049  218.363956 -70.220908
20459               pragmatismopolitico  legit      19  3.326360    61.244630  121.536223 -60.291593
13702           egaliteetreconciliation  legit      23  3.186393    91.938518  152.125325 -60.186808
4706            interoperabilitybridges  legit      23  3.588354    95.037285  153.626312 -58.589028
85161            foreclosurephilippines  legit      22  3.447402    74.506548  132.514638 -58.008090
45636       annamalicesissyselfhypnosis  legit      27  3.429908    68.680068  126.667692 -57.987623
70351         corazonindomablecapitulos  legit      25  3.813661    75.535473  133.160690 -57.625217
all_domains.sort_values(['diff'], ascending=False).head(10)
                                                  domain  class  length   entropy  alexa_grams  word_grams       diff
54228  gay-sex-pics-porn-pictures-gay-sex-porn-gay-se...  legit      56  3.661056   159.642301   85.124184  74.518116
85091    article-directory-free-submission-free-content  legit      46  3.786816   235.233896  188.230453  47.003443
16893                          stream-free-movies-online  legit      25  3.509275   120.250616   74.496915  45.753701
63380                            watch-free-movie-online  legit      23  3.708132   103.029245   58.943451  44.085794
44253                          best-online-shopping-site  legit      25  3.452879   123.377240   79.596640  43.780601
22524                      social-bookmarking-sites-list  legit      29  3.702472   145.755266  102.261826  43.493440
66335                              free-online-directory  legit      21  3.403989   123.379738   80.735030  42.644708
46553                      free-links-articles-directory  legit      29  3.702472   153.239055  110.955361  42.283694
59873                               online-web-directory  legit      20  3.584184   116.310717   74.082948  42.227769
58016                               web-directory-online  legit      20  3.584184   114.402671   74.082948  40.319723
weird_cond = (all_domains['class']=='legit') & (all_domains['word_grams']<3) & (all_domains['alexa_grams']<2)
weird = all_domains[weird_cond]
print(weird.shape[0])
weird.head(10)
91
            domain  class  length   entropy  alexa_grams  word_grams      diff
1246       twcczhu  legit       7  2.521641     1.748188         0.0  1.748188
2009       ggmm777  legit       7  1.556657     1.518514         0.0  1.518514
2760       qq66699  legit       7  1.556657     1.342423         0.0  1.342423
17347      crx7601  legit       7  2.807355     0.000000         0.0  0.000000
18682     hzsxzhyy  legit       8  2.250000     0.000000         0.0  0.000000
19418  02022222222  legit      11  0.684038     1.041393         0.0  1.041393
19887      3181302  legit       7  2.235926     0.000000         0.0  0.000000
21172      hljdns4  legit       7  2.807355     1.755875         0.0  1.755875
26441      05tz2e9  legit       7  2.807355     0.000000         0.0  0.000000
26557      fzysqmy  legit       7  2.521641     1.176091         0.0  1.176091
all_domains.loc[weird_cond, 'class'] = 'weird'
all_domains['class'].value_counts()
legit 67221
dga 2664
weird 91
Name: class, dtype: int64
all_domains[all_domains['class'] == 'weird'].head()
         domain  class  length   entropy  alexa_grams  word_grams      diff
1246    twcczhu  weird       7  2.521641     1.748188         0.0  1.748188
2009    ggmm777  weird       7  1.556657     1.518514         0.0  1.518514
2760    qq66699  weird       7  1.556657     1.342423         0.0  1.342423
17347   crx7601  weird       7  2.807355     0.000000         0.0  0.000000
18682  hzsxzhyy  weird       8  2.250000     0.000000         0.0  0.000000
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['word_grams'], alexa['entropy'], s=140, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['word_grams'], dga['entropy'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
pylab.xlabel('Domain word_grams')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')
Train the Algorithm
not_weird = all_domains[all_domains['class'] != 'weird']
X = not_weird[['length', 'entropy', 'alexa_grams', 'word_grams']].values
y = np.array(not_weird['class'].tolist())

clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
def show_cm(cm, labels):
    percent = (cm*100.0) / np.array(np.matrix(cm.sum(axis=1)).T)
    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum()))
labels = ['legit', 'dga']
cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=labels)
show_cm(cm, labels)
Confusion Matrix Stats
legit/legit: 99.57% (13369/13427)
legit/dga: 0.43% (58/13427)
dga/legit: 15.45% (85/550)
dga/dga: 84.55% (465/550)
importances = zip(['length', 'entropy', 'alexa_grams', 'word_grams'], clf.feature_importances_)
list(importances)
[('length', 0.16033779891739047),
('entropy', 0.12175502861193326),
('alexa_grams', 0.5087685303664589),
('word_grams', 0.20913864210421748)]
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
Test the Algorithm
def test_it(domain):
    _alexa_match = alexa_counts * alexa_vc.transform([domain]).T
    _dict_match = dict_counts * dict_vc.transform([domain]).T
    _X = [[len(domain), entropy(domain), _alexa_match, _dict_match]]
    print('%s : %s' % (domain, clf.predict(_X)[0]))
test_it('google')
test_it('google8sdflkajssjgjksdh')
test_it('faceboosadfadfafdk')
test_it('1cb8a5f36f')
test_it('pterodactyladfasdfasdffarts')
test_it('ptes9dro-dwacty2lfa5rrts')
test_it('beyonce')
test_it('bey666on4ce')
test_it('supersexy')
test_it('yourmomissohotinthesummertime')
google : legit
google8sdflkajssjgjksdh : dga
faceboosadfadfafdk : legit
1cb8a5f36f : dga
pterodactyladfasdfasdffarts : legit
ptes9dro-dwacty2lfa5rrts : dga
beyonce : legit
bey666on4ce : dga
supersexy : legit
yourmomissohotinthesummertime : legit
Use the Algorithm
def save_model_to_disk(name, model, model_dir='models'):
    serialized_model = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL)
    model_path = os.path.join(model_dir, name+'.model')
    print('Storing Serialized Model to Disk (%s:%.2fMeg)' % (name, len(serialized_model)/1024.0/1024.0))
    open(model_path,'wb').write(serialized_model)
save_model_to_disk('dga_model_random_forest', clf)
save_model_to_disk('dga_model_alexa_vectorizor', alexa_vc)
save_model_to_disk('dga_model_alexa_counts', alexa_counts)
save_model_to_disk('dga_model_dict_vectorizor', dict_vc)
save_model_to_disk('dga_model_dict_counts', dict_counts)
Storing Serialized Model to Disk (dga_model_random_forest:1.80Meg)
Storing Serialized Model to Disk (dga_model_alexa_vectorizor:2.93Meg)
Storing Serialized Model to Disk (dga_model_alexa_counts:0.18Meg)
Storing Serialized Model to Disk (dga_model_dict_vectorizor:5.39Meg)
Storing Serialized Model to Disk (dga_model_dict_counts:0.94Meg)
def load_model_from_disk(name, model_dir='models'):
    model_path = os.path.join(model_dir, name+'.model')
    try:
        model = pickle.loads(open(model_path,'rb').read())
        print('success')
    except:
        print('Could not load model: %s from directory %s!' % (name, model_path))
        return None
    return model
clf = load_model_from_disk('dga_model_random_forest')
alexa_vc = load_model_from_disk('dga_model_alexa_vectorizor')
alexa_counts = load_model_from_disk('dga_model_alexa_counts')
dict_vc = load_model_from_disk('dga_model_dict_vectorizor')
dict_counts = load_model_from_disk('dga_model_dict_counts')

model = {'clf':clf, 'alexa_vc':alexa_vc, 'alexa_counts':alexa_counts, 'dict_vc':dict_vc, 'dict_counts':dict_counts}
success
success
success
success
success
def evaluate_url(model, url):
    domain = domain_extract(url)
    # score the extracted domain so the features match what the model was trained on
    alexa_match = model['alexa_counts'] * model['alexa_vc'].transform([domain]).T
    dict_match = model['dict_counts'] * model['dict_vc'].transform([domain]).T

    X = [[len(domain), entropy(domain), alexa_match, dict_match]]
    y_pred = model['clf'].predict(X)[0]
    print('%s : %s' % (domain, y_pred))
evaluate_url(model, 'adfhalksfhjashfk.com')
adfhalksfhjashfk : dga
from sklearn.naive_bayes import MultinomialNB

mtnb = MultinomialNB()
mtnb.fit(X_train, y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
from sklearn.metrics import classification_report

nb_y_pred = mtnb.predict(X_test)
print(classification_report(y_test, nb_y_pred))
cm = sklearn.metrics.confusion_matrix(y_test, nb_y_pred, labels=labels)
show_cm(cm, labels)
precision recall f1-score support
dga 0.71 0.87 0.78 550
legit 0.99 0.99 0.99 13427
avg / total 0.98 0.98 0.98 13977
Confusion Matrix Stats
legit/legit: 98.56% (13233/13427)
legit/dga: 1.44% (194/13427)
dga/legit: 13.27% (73/550)
dga/dga: 86.73% (477/550)
import os
import random
import tldextract
import sklearn
import pandas as pd
import numpy as np

from keras.models import Sequential, load_model
from keras.preprocessing import sequence
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn import feature_extraction
from sklearn.model_selection import train_test_split
from datetime import datetime
from zipfile import ZipFile
alexa_dataframe = pd.read_csv('data/top-1m.csv', names=['rank','uri'], header=None, encoding='utf-8')
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
rank 1000000 non-null int64
uri 1000000 non-null object
dtypes: int64(1), object(1)
memory usage: 15.3+ MB
   rank            uri
0     1     google.com
1     2    youtube.com
2     3   facebook.com
3     4      baidu.com
4     5  wikipedia.org
def load_data_set(filename):
    fw = open('data/dga_domain.txt', 'w+')
    with open(filename, "r") as f:
        for line in f.readlines():
            lineArr = line.strip().split('\t')
            fw.write(lineArr[1] + '\n')
    fw.close()

load_data_set('data/dga.txt')
dga_dataframe = pd.read_csv('data/dga_domain.txt', names=['raw_domain'], header=None, encoding='utf-8')
dga_dataframe.info()
dga_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1158695 entries, 0 to 1158694
Data columns (total 1 columns):
raw_domain 1158695 non-null object
dtypes: object(1)
memory usage: 8.8+ MB
        raw_domain
0    ogxbnjopz.biz
1    zyejwiist.net
2      buuqogz.com
3  vpjmomduqll.org
4  uakwifutnpn.biz
def domain_extract(uri):
    ext = tldextract.extract(uri)
    if (not ext.suffix):
        return None
    else:
        return ext.domain

alexa_dataframe['domain'] = [ domain_extract(uri) for uri in alexa_dataframe['uri']]
del alexa_dataframe['rank']
del alexa_dataframe['uri']
alexa_dataframe = alexa_dataframe.dropna()
alexa_dataframe = alexa_dataframe.drop_duplicates()
alexa_dataframe['length'] = [len(x) for x in alexa_dataframe['domain']]
alexa_dataframe = alexa_dataframe[alexa_dataframe['length'] > 6]
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 718018 entries, 1 to 999999
Data columns (total 2 columns):
domain 718018 non-null object
length 718018 non-null int64
dtypes: int64(1), object(1)
memory usage: 16.4+ MB
       domain  length
1     youtube       7
2    facebook       8
4   wikipedia       9
11  instagram       9
13    twitter       7
alexa_dataframe['class'] = 'legit'
alexa_dataframe.head()
       domain  length  class
1     youtube       7  legit
2    facebook       8  legit
4   wikipedia       9  legit
11  instagram       9  legit
13    twitter       7  legit
alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
alexa_total = alexa_dataframe.shape[0]
print('Total Alexa domains %d' % alexa_total)
Total Alexa domains 718018
dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
del dga_dataframe['raw_domain']
dga_dataframe = dga_dataframe.dropna()
dga_dataframe = dga_dataframe.drop_duplicates()
dga_dataframe['length'] = [len(x) for x in dga_dataframe['domain']]
dga_dataframe = dga_dataframe[dga_dataframe['length'] > 6]
dga_total = dga_dataframe.shape[0]
print('Total DGA domains %d' % dga_total)
Total DGA domains 1082010
dga_dataframe['class'] = 'dga'
dga_dataframe.head()
        domain  length class
0    ogxbnjopz       9   dga
1    zyejwiist       9   dga
2      buuqogz       7   dga
3  vpjmomduqll      11   dga
4  uakwifutnpn      11   dga
all_domains = pd.concat([alexa_dataframe[:5000], dga_dataframe[:5000]], ignore_index=True)
all_domains.head(10)
       domain  length  class
0     youtube       7  legit
1    facebook       8  legit
2   wikipedia       9  legit
3   instagram       9  legit
4     twitter       7  legit
5    blogspot       8  legit
6     netflix       7  legit
7     pornhub       7  legit
8     xvideos       7  legit
9  livejasmin      10  legit
                     domain  length class
9990              mxepwpxki       9   dga
9991     xnvqgaddhivrqowtbs      18   dga
9992   btgjyoydcwoeigdldngr      20   dga
9993          mnnridfyhxkyk      13   dga
9994       jmcctiodbdemfejo      16   dga
9995           mepoiwtmeffy      12   dga
9996        iwpikrmppfqeere      15   dga
9997               gcibdmrs       8   dga
9998  tusdspujigdyntbxusuah      21   dga
9999  wvsiuqhblxfijnoefjnao      21   dga
X = all_domains['domain']
labels = all_domains['class']
ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
count_vec = ngram_vectorizer.fit_transform(X)
max_features = count_vec.shape[1]
y = [0 if x == 'legit' else 1 for x in labels]
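For intuition, a small stand-alone demo of what the character-bigram features look like; the real vocabulary and column order come from fitting on all_domains, so the output below is only illustrative:

demo_vec = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
demo_counts = demo_vec.fit_transform(['google'])
print(demo_vec.get_feature_names())   # ['gl', 'go', 'le', 'og', 'oo']
print(demo_counts.toarray())          # [[1 1 1 1 1]]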
Multilayer Perceptron (MLP)

def build_model(max_features):
    model = Sequential()
    model.add(Dense(1, input_dim=max_features, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model
max_epoch = 50
nfolds = 10
batch_size = 128

final_data = []
for fold in range(nfolds):
    print("fold %u/%u" % (fold+1, nfolds))
    X_train, X_test, y_train, y_test, _, label_test = train_test_split(count_vec, y, labels, test_size=0.2)

    print('Build model...')
    model = build_model(max_features)

    print("Train...")
    X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
    best_iter = -1
    best_auc = 0.0
    out_data = {}

    for ep in range(max_epoch):
        model.fit(X_train.todense(), y_train, batch_size=batch_size, epochs=1)
        t_probs = model.predict_proba(X_holdout.todense())
        t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)
        print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

        if t_auc > best_auc:
            best_auc = t_auc
            best_iter = ep
            probs = model.predict_proba(X_test.todense())
            out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                        'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}
            print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
        else:
            if (ep-best_iter) > 5:
                break

    final_data.append(out_data)
    model.save('model.h5')
fold 1/10
Build model...
Train...
Epoch 1/1
7600/7600 [==============================] - 1s 86us/step - loss: 0.6297
Epoch 0: auc = 0.950239 (best=0.000000)
[[915 86]
[108 891]]
Epoch 1/1
7600/7600 [==============================] - 0s 26us/step - loss: 0.5243
Epoch 1: auc = 0.980196 (best=0.950239)
[[952 49]
[ 83 916]]
Epoch 1/1
7600/7600 [==============================] - 0s 31us/step - loss: 0.4502
Epoch 2: auc = 0.984872 (best=0.980196)
[[965 36]
[ 78 921]]
Epoch 1/1
7600/7600
Epoch 32: auc = 0.994192 (best=0.994192)
model = load_model('model.h5')
[{'y': [0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, ...],
  'labels': 2403    legit
            2789    legit
            450     legit
            4521    legit
            2841    legit
            8645      dga
            ...
            Name: class, Length: 2000, dtype: object,
  'probs': array([[0.14488636],
                  [0.00496732],
                  [0.00896166],
                  ...,
                  [0.00593334],
                  [0.95598286],
                  [0.9867235 ]], dtype=float32),
  'epochs': 43,
  'confusion_matrix': array([[972,  29],
                             [ 62, 937]])}]
z_test = np.array([[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, ...]])  # one bigram feature vector, truncated here
model.predict(z_test)
array([[1.]], dtype=float32)
print(sklearn.metrics.classification_report(final_data[0]['y'], final_data[0]['probs'] > .5))
precision recall f1-score support
0 0.95 0.97 0.96 970
1 0.97 0.95 0.96 1030
micro avg 0.96 0.96 0.96 2000
macro avg 0.96 0.96 0.96 2000
weighted avg 0.96 0.96 0.96 2000
LSTM

def build_model_lstm(max_features, maxlen):
    """Build LSTM model"""
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=maxlen))
    model.add(LSTM(128))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    return model
X = all_domains['domain']
labels = all_domains['class']

valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X)))}
max_features = len(valid_chars) + 1
maxlen = np.max([len(x) for x in X])

X = [[valid_chars[y] for y in x] for x in X]
X = sequence.pad_sequences(X, maxlen=maxlen)

y = [0 if x == 'legit' else 1 for x in labels]

final_data = []
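A small illustration of this char-to-int encoding plus padding that feeds the Embedding layer. The mapping below is hypothetical; the actual indices depend on the fitted valid_chars dict:

demo_map = {'e': 1, 'g': 2, 'l': 3, 'o': 4}
demo = [[demo_map[c] for c in 'google']]          # [[2, 4, 4, 2, 3, 1]]
print(sequence.pad_sequences(demo, maxlen=10))    # [[0 0 0 0 2 4 4 2 3 1]]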
for fold in range(nfolds):
    print("fold %u/%u" % (fold+1, nfolds))
    X_train, X_test, y_train, y_test, _, label_test = train_test_split(X, y, labels, test_size=0.2)

    print('Build model...')
    model = build_model_lstm(max_features, maxlen)

    print("Train...")
    X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
    best_iter = -1
    best_auc = 0.0
    out_data = {}

    for ep in range(max_epoch):
        model.fit(X_train, y_train, batch_size=batch_size, epochs=1)
        t_probs = model.predict_proba(X_holdout)
        t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)
        print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

        if t_auc > best_auc:
            best_auc = t_auc
            best_iter = ep
            probs = model.predict_proba(X_test)
            out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                        'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}
            print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
        else:
            if (ep-best_iter) > 2:
                break

    final_data.append(out_data)
fold 1/10
Build model...
Train...
Epoch 1/1
7600/7600 [==============================] - 24s 3ms/step - loss: 0.3562
Epoch 0: auc = 0.979725 (best=0.000000)
[[893 113]
[ 42 952]]
Epoch 1/1
7600/7600 [==============================] - 23s 3ms/step - loss: 0.1643
Epoch 7: auc = 0.980221 (best=0.981659)
Epoch 1/1
7600/7600 [==============================] - 21s 3ms/step - loss: 0.1603
Epoch 8: auc = 0.979843 (best=0.981659)
print(sklearn.metrics.classification_report(final_data[0]['y'], final_data[0]['probs'] > .5))
precision recall f1-score support
0 0.95 0.96 0.96 1006
1 0.96 0.95 0.95 994
micro avg 0.96 0.96 0.96 2000
macro avg 0.96 0.96 0.96 2000
weighted avg 0.96 0.96 0.96 2000