Domain generation algorithms (DGAs) are algorithms, found in various families of malware, that periodically generate large numbers of domain names which can serve as rendezvous points with the malware's command-and-control servers.
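To make the idea concrete, here is a minimal, hypothetical DGA sketch (the seed, hash choice, and TLD are illustrative, not taken from any real malware family): it derives a batch of pseudo-random domains from a shared seed and the current date, so infected hosts and the operator can compute the same list independently.

import hashlib
from datetime import date

def toy_dga(seed, day, count=10):
    # Derive `count` pseudo-random .com domains from the seed and the given day
    domains = []
    for i in range(count):
        data = '%s-%s-%d' % (seed, day.isoformat(), i)
        digest = hashlib.md5(data.encode('utf-8')).hexdigest()
        domains.append(digest[:12] + '.com')   # first 12 hex chars of the digest plus a TLD
    return domains

print(toy_dga('example-seed', date.today()))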
# Shuffle the data (important for training/testing)
alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))  # shuffle the rows and re-index (np.random.permutation randomly permutes a sequence)
alexa_total = alexa_dataframe.shape[0]
print('Total Alexa domains %d' % alexa_total)
Total Alexa domains 91377
dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
# applymap applies a function that accepts and returns a scalar to every element of a DataFrame
del dga_dataframe['raw_domain']
def entropy(s):
    '''Shannon entropy of a string'''
    p, lns = collections.Counter(s), float(len(s))
    return -sum(count/lns * math.log(count/lns, 2) for count in p.values())
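As a quick sanity check of this score (the inputs below are just illustrative), dictionary-like names score lower than random-looking ones:

for d in ['google', 'facebook', 'kqxmzrvplwtjd']:
    print(d, round(entropy(d), 3))
# e.g. entropy('facebook') is 2.75; the random-looking string uses more distinct
# characters more evenly, so its entropy is noticeably higher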
all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)
# concat merges the two frames along the index; ignore_index=True because the original indices carry no meaning
all_domains['length'] = [len(x) for x in all_domains['domain']]
all_domains = all_domains[all_domains['length'] > 6]   # drop short domains to reduce noise
all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]
all_domains.head(10)
legit = all_domains[(all_domains['class']=='legit')]
max_grams = np.maximum(legit['alexa_grams'], legit['word_grams'])
ax = max_grams.hist(bins=80)
ax.figure.suptitle('Histogram of the Max NGram Score for Domains')
pylab.xlabel('Maximum NGram Score')
pylab.ylabel('Number of Domains')
[1. 1. 1.17609126 1.64345268 1.11394335 1.14612804
1. 1.17609126 1.07918125 1.54406804]
['-20', '-a-', '-ac', '-ad', '-ads', '-af', '-ag', '-ai', '-air', '-al']
Alexa NGrams: 23613
ing 3.443888546777372
lin 3.4271614029259654
ine 3.399673721481038
tor 3.26528962586083
ter 3.2631624649622166
ion 3.2467447097238415
ent 3.228913405994688
por 3.2013971243204513
the 3.2005769267548483
ree 3.16345955176999
_sorted_ngrams = sorted(zip(ngrams_list, dict_counts), key=operator.itemgetter(1), reverse=True)
print('Word NGrams: %d' % len(_sorted_ngrams))
for ngram, count in _sorted_ngrams[:10]:
    print(ngram, count)
Word NGrams: 123061
ing 4.387300822448285
ess 4.204879333760662
ati 4.1933472563864616
ion 4.165036479994566
ter 4.162415036106447
nes 4.112504458767161
tio 4.076822423342773
ate 4.0723602039634885
ent 4.069631102620343
tion 4.0496056125949735
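The vectorizers and log-count vectors used above and in the next cell (alexa_vc, alexa_counts, dict_vc, dict_counts, ngrams_list) are built earlier in the pipeline and are not shown in this excerpt. A plausible construction, consistent with the log10-scaled counts printed above, looks like this (the min_df/max_df parameters and the word_dataframe word list are assumptions):

from sklearn.feature_extraction.text import CountVectorizer

# Character n-grams (length 3-5) over the Alexa domains; counts are log10-scaled
# so that extremely common n-grams do not dominate the score.
alexa_vc = CountVectorizer(analyzer='char', ngram_range=(3, 5), min_df=1e-4, max_df=1.0)
counts_matrix = alexa_vc.fit_transform(alexa_dataframe['domain'])
alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())

# Same idea over an English word list (word_dataframe['word'] is assumed to exist).
dict_vc = CountVectorizer(analyzer='char', ngram_range=(3, 5), min_df=1e-5, max_df=1.0)
counts_matrix = dict_vc.fit_transform(word_dataframe['word'])
dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())

# ngrams_list holds the feature names of whichever vectorizer is being inspected,
# e.g. ngrams_list = dict_vc.get_feature_names() before the word-ngram printout above.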
# Compute NGram matches for all the domains and add to our dataframe
all_domains['alexa_grams'] = alexa_counts * alexa_vc.transform(all_domains['domain']).T
all_domains['word_grams'] = dict_counts * dict_vc.transform(all_domains['domain']).T
all_domains.head(10)
               domain  class  length   entropy  alexa_grams  word_grams
0            facebook  legit       8  2.750000    31.302278   27.872426
2             youtube  legit       7  2.521641    25.855170   18.287142
5           wikipedia  legit       9  2.641604    24.571024   29.175635
10           blogspot  legit       8  2.750000    24.435141   19.274501
11            twitter  legit       7  2.128085    23.244500   31.130820
12           linkedin  legit       8  2.500000    24.774916   32.904408
19          wordpress  legit       9  2.725481    38.369509   33.806635
23          microsoft  legit       9  2.947703    32.133033   39.530125
27            xvideos  legit       7  2.807355    28.906360   18.846834
28  googleusercontent  legit      17  3.175123    67.315750   86.104683
# Use the vectorized operations of the dataframe to investigate differences
all_domains['diff'] = all_domains['alexa_grams'] - all_domains['word_grams']
all_domains.sort_values(['diff'], ascending=True).head(10)
not_weird = all_domains[all_domains['class'] != 'weird']
X = not_weird[['length', 'entropy', 'alexa_grams', 'word_grams']].values   # feature matrix as a NumPy array
y = np.array(not_weird['class'].tolist())                                  # labels as a NumPy array
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)             # random forest with 20 trees
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)   # hold out 20% of the samples for testing
clf.fit(X_train, y_train)        # fit the classifier on the training data
y_pred = clf.predict(X_test)     # predict on the held-out test data
def show_cm(cm, labels):
    # Express each row of the confusion matrix as percentages
    percent = (cm*100.0)/np.array(np.matrix(cm.sum(axis=1)).T)
    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum()))
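Hooking this up to the predictions from the random forest above would look roughly like the following (the class names are an assumption; they must match the values in y):

from sklearn.metrics import confusion_matrix

class_names = ['legit', 'dga']    # assumed label names, in the order you want them reported
cm = confusion_matrix(y_test, y_pred, labels=class_names)
show_cm(cm, class_names)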
Storing Serialized Model to Disk (dga_model_random_forest:1.80Meg)
Storing Serialized Model to Disk (dga_model_alexa_vectorizor:2.93Meg)
Storing Serialized Model to Disk (dga_model_alexa_counts:0.18Meg)
Storing Serialized Model to Disk (dga_model_dict_vectorizor:5.39Meg)
Storing Serialized Model to Disk (dga_model_dict_counts:0.94Meg)
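The helper that produced the messages above is not included in this excerpt; a minimal sketch of what it likely does (pickle each object into a models/ directory and report the file size) is:

import os
import pickle

def save_model_to_disk(name, model, model_dir='models'):
    # Serialize the object with pickle and report the resulting file size in megabytes
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_path = os.path.join(model_dir, name + '.model')
    with open(model_path, 'wb') as f:
        f.write(pickle.dumps(model))
    size_meg = os.path.getsize(model_path) / 1024.0 / 1024.0
    print('Storing Serialized Model to Disk (%s:%.2fMeg)' % (name, size_meg))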
def load_model_from_disk(name, model_dir='models'):
    model_path = os.path.join(model_dir, name + '.model')
    try:
        model = pickle.loads(open(model_path, 'rb').read())
        print('success')
    except Exception:
        print('Could not load model: %s from directory %s!' % (name, model_path))
        return None
    return model
import os
import random
import tldextract
import sklearn
import pandas as pd
import numpy as np
from keras.models import Sequential, load_model
from keras.preprocessing import sequence
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn import feature_extraction
from sklearn.model_selection import train_test_split
from datetime import datetime
from zipfile import ZipFile
def domain_extract(uri):
    ext = tldextract.extract(uri)
    if not ext.suffix:
        return None
    else:
        return ext.domain

alexa_dataframe['domain'] = [domain_extract(uri) for uri in alexa_dataframe['uri']]
del alexa_dataframe['rank']
del alexa_dataframe['uri']
alexa_dataframe = alexa_dataframe.dropna()
alexa_dataframe = alexa_dataframe.drop_duplicates()
alexa_dataframe['length'] = [len(x) for x in alexa_dataframe['domain']]
alexa_dataframe = alexa_dataframe[alexa_dataframe['length'] > 6]
alexa_dataframe.info()
alexa_dataframe.head()
# Shuffle the data (important for training/testing)
alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))  # shuffle the rows and re-index
alexa_total = alexa_dataframe.shape[0]
print('Total Alexa domains %d' % alexa_total)
Total Alexa domains 718018
dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
# applymap applies a function that accepts and returns a scalar to every element of a DataFrame
del dga_dataframe['raw_domain']
dga_dataframe = dga_dataframe.dropna()
dga_dataframe = dga_dataframe.drop_duplicates()
dga_dataframe['length'] = [len(x) for x in dga_dataframe['domain']]
dga_dataframe = dga_dataframe[dga_dataframe['length'] > 6]
dga_total = dga_dataframe.shape[0]
print('Total DGA domains %d' % dga_total)
X = all_domains['domain']
labels = all_domains['class']
valid_chars = {x: idx+1 for idx, x in enumerate(set(''.join(X)))}   # map each character to an integer index (0 is reserved for padding)
max_features = len(valid_chars) + 1           # size of the character vocabulary
maxlen = np.max([len(x) for x in X])          # length of the longest domain
X = [[valid_chars[y] for y in x] for x in X]  # convert each domain to a sequence of character indices
X = sequence.pad_sequences(X, maxlen=maxlen)  # pad every sequence to the same length
y = [0 if x == 'legit' else 1 for x in labels]
final_data = []
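The network definition and training loop are not shown in this excerpt. Given the imports above (Sequential, Embedding, LSTM, Dense, Dropout, Activation), a minimal character-level LSTM classifier consistent with this preprocessing might look like the following sketch (layer sizes, dropout rate, batch size, and epoch count are assumptions):

model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))  # learn a 128-dim embedding per character
model.add(LSTM(128))                                          # summarize the character sequence
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))                              # binary output: legit (0) vs. DGA (1)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

X_train, X_test, y_train, y_test = train_test_split(X, np.array(y), test_size=0.2)
model.fit(X_train, y_train, batch_size=128, epochs=1)
probs = model.predict(X_test)                                 # probability that each domain is DGA-generated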
from os import listdir
from numpy import zeros

def img2vector(filename):
    # Flatten a 32x32 text image of 0/1 characters into a 1x1024 vector
    returnVect = zeros((1, 1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32*i+j] = int(lineStr[j])
    return returnVect
def handwritingClassTest():
    hwLabels = []
    trainingFileList = listdir('digits/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        # Parse the true digit from the file name and store it as the label
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('digits/trainingDigits/%s' % fileNameStr)
    testFileList = listdir('digits/testDigits')
    trueCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('digits/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        if (classifierResult == classNumStr):
            trueCount += 1.0
    print("\nthe total true rate is: %f" % (trueCount/float(mTest)))
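classify0, the kNN classifier called above, is not defined in this excerpt. The conventional Euclidean-distance implementation it appears to rely on (included here as an assumption) is:

import operator
from numpy import tile

def classify0(inX, dataSet, labels, k):
    # Distance from inX to every training vector
    dataSetSize = dataSet.shape[0]
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    distances = ((diffMat**2).sum(axis=1))**0.5
    sortedDistIndicies = distances.argsort()
    # Vote among the k nearest neighbours
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]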
def chooseBestFeatureToSplit(dataSet):
    '''
    :param dataSet: data set, one list per sample with the label in the last column
    :return: index of the feature with the highest information gain
    '''
    numFeatures = len(dataSet[0]) - 1            # number of feature columns (the last column is the label)
    baseEntropy = calcShannonEnt(dataSet)        # Shannon entropy of the whole data set
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]   # all values of the i-th feature
        uniqueVals = set(featList)                       # distinct values of that feature
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value) # split the data set on each value of this feature
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)  # weighted entropy after the split
        infoGain = baseEntropy - newEntropy              # information gain of splitting on this feature
        if (infoGain > bestInfoGain):                    # keep the feature with the largest gain
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature                                   # index of the best feature
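calcShannonEnt and splitDataSet are used above but not defined in this excerpt; the conventional definitions they appear to rely on (included here as an assumption) are:

from math import log

def calcShannonEnt(dataSet):
    # Shannon entropy of the class labels (last column of each row)
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

def splitDataSet(dataSet, axis, value):
    # Rows whose axis-th feature equals value, with that feature column removed
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet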
Building the decision tree recursively
1. Obtain the data set 2. Split on the best feature 3. Recurse on each subset
When all features have been consumed but the class labels are still not unique, majority voting decides the classification of the child node.
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
Building the tree with recursion
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]        # all class labels in the data set
    if classList.count(classList[0]) == len(classList):
        return classList[0]                                 # all labels identical: return that label
    if len(dataSet[0]) == 1:                                # all features consumed but labels still mixed: majority vote
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]                        # best feature for the current data set
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])                                   # remove the feature we just used
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)  # recurse on each split
    return myTree
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
def colicTest():
    frTrain = open('horseColicTraining.txt')
    frTest = open('horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)   # regression weights, 1000 iterations of stochastic gradient ascent
    errorCount = 0
    numTestVec = 0.0
    for line in frTest.readlines():          # run the test set through the classifier and count errors
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount)/numTestVec)
    print("the error rate of this test is: %f" % errorRate)
    return errorRate
def multiTest():
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):                 # run colicTest 10 times and average the error rate
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum/float(numTests)))
class optStruct:
    def __init__(self, dataMatIn, classLabels, C, toler, kTup):
        # Initialize the structure with the parameters
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.tol = toler
        self.m = shape(dataMatIn)[0]
        self.alphas = mat(zeros((self.m, 1)))
        self.b = 0
        self.eCache = mat(zeros((self.m, 2)))   # first column is a valid flag
        self.K = mat(zeros((self.m, self.m)))
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)

def calcEk(oS, k):
    '''Compute and return the error E for sample k'''
    fXk = float(multiply(oS.alphas, oS.labelMat).T*oS.K[:, k] + oS.b)
    Ek = fXk - float(oS.labelMat[k])
    return Ek

def selectJ(i, oS, Ei):
    '''Choose the second alpha: the one with the largest |Ei - Ej|'''
    maxK = -1
    maxDeltaE = 0
    Ej = 0
    oS.eCache[i] = [1, Ei]
    validEcacheList = nonzero(oS.eCache[:, 0].A)[0]
    if (len(validEcacheList)) > 1:
        for k in validEcacheList:
            if k == i:
                continue
            Ek = calcEk(oS, k)
            deltaE = abs(Ei - Ek)
            if (deltaE > maxDeltaE):
                maxK = k
                maxDeltaE = deltaE
                Ej = Ek
        return maxK, Ej
    else:
        j = selectJrand(i, oS.m)
        Ej = calcEk(oS, j)
        return j, Ej
def updateEk(oS, k):
    '''Compute the error for sample k and store it in the cache'''
    Ek = calcEk(oS, k)
    oS.eCache[k] = [1, Ek]

def innerL(i, oS):
    '''Inner optimization loop: try to optimize the alpha pair (i, j)'''
    Ei = calcEk(oS, i)
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j, Ej = selectJ(i, oS, Ei)
        alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy()
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L == H:
            print("L==H")
            return 0
        eta = 2.0 * oS.K[i, j] - oS.K[i, i] - oS.K[j, j]
        if eta >= 0:
            print("eta>=0")
            return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        updateEk(oS, j)
        if (abs(oS.alphas[j] - alphaJold) < 0.00001):
            print("j not moving enough")
            return 0
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])
        updateEk(oS, i)
        b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i, i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i, j]
        b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i, j] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j, j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]):
            oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]):
            oS.b = b2
        else:
            oS.b = (b1 + b2)/2.0
        return 1
    else:
        return 0
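selectJrand and clipAlpha are referenced in selectJ and innerL but are not defined in this excerpt; the standard helpers they correspond to (included here as an assumption) are:

import random

def selectJrand(i, m):
    # Pick a random index j in [0, m) different from i
    j = i
    while j == i:
        j = int(random.uniform(0, m))
    return j

def clipAlpha(aj, H, L):
    # Clip alpha_j into the box constraint [L, H]
    if aj > H:
        aj = H
    if aj < L:
        aj = L
    return aj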
def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)):
    '''Full Platt SMO outer loop'''
    oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler, kTup)
    iter = 0
    entireSet = True
    alphaPairsChanged = 0
    while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
        alphaPairsChanged = 0
        if entireSet:
            for i in range(oS.m):
                alphaPairsChanged += innerL(i, oS)
                print("fullSet, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
            iter += 1
        else:
            nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
            for i in nonBoundIs:
                alphaPairsChanged += innerL(i, oS)
                print("non-bound, iter: %d i:%d, pairs changed %d" % (iter, i, alphaPairsChanged))
            iter += 1
        if entireSet:
            entireSet = False
        elif (alphaPairsChanged == 0):
            entireSet = True
        print("iteration number: %d" % iter)
    return oS.b, oS.alphas
def calcWs(alphas, dataArr, classLabels):
    '''Compute the weight vector w from the solved alphas, for classifying new points'''
    X = mat(dataArr)
    labelMat = mat(classLabels).transpose()
    m, n = shape(X)
    w = zeros((n, 1))
    for i in range(m):
        w += multiply(alphas[i]*labelMat[i], X[i, :].T)
    return w
for i in range(100):
    if alphas[i] > 0.0:
        print(dateArr[i], labelArr[i])
ws = calcWs(alphas, dateArr, labelArr)
print(ws)
datmat = mat(dateArr)
print(datmat[0]*mat(ws) + b)
def kernelTrans(X, A, kTup):
    '''
    Kernel transformation
    :param X: full data matrix
    :param A: a single row vector
    :param kTup: tuple describing the kernel ('lin' or 'rbf', plus parameters)
    :return: the column of kernel values K(X, A)
    '''
    m, n = shape(X)
    K = mat(zeros((m, 1)))
    if kTup[0] == 'lin':
        K = X * A.T
    elif kTup[0] == 'rbf':
        for j in range(m):
            deltaRow = X[j, :] - A
            K[j] = deltaRow*deltaRow.T
        K = exp(K/(-1*kTup[1]**2))
    else:
        raise NameError('Houston We Have a Problem -- That Kernel is not recognized')
    return K
import csv

words = []
datas = []
with open("data/xssed.csv", "r", encoding="utf-8") as f:
    reader = csv.DictReader(f, fieldnames=["payload"])
    for row in reader:
        payload = row["payload"]
        word = GeneSeg(payload)
        datas.append(word)
        words += word
# Build the data set: keep the vocabulary_size-1 most common words, map everything else to "UNK"
def build_dataset(datas, words):
    count = [["UNK", -1]]
    counter = Counter(words)
    count.extend(counter.most_common(vocabulary_size-1))
    #print(count)
    vocabulary = [c[0] for c in count]
    #print(vocabulary)
    data_set = []
    for data in datas:
        d_set = []
        for word in data:
            if word in vocabulary:
                d_set.append(word)
            else:
                d_set.append("UNK")
                count[0][1] += 1
        data_set.append(d_set)
    print(data_set)