[Machine Learning Notes] DGA Domain Detection 1

0x01 Domain Generation Algorithm

Domain generation algorithms (DGA) are algorithms seen in various families of malware that are used to periodically generate a large number of domain names that can be used as rendezvous points with their command and control servers.

Example
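A minimal sketch of the idea (a made-up scheme for illustration only; real families differ in hash, seed handling, and TLD choice, and the name toy_dga and the seed string are hypothetical): hash a hard-coded seed plus the current date into pseudo-random hex labels, the same shape as the samples in dga_domains.txt below. Malware and operator can both regenerate the list, while defenders face a flood of candidate domains.

import hashlib
from datetime import date

def toy_dga(seed, day, n=3, tlds=('.info', '.org')):
    '''Hash a seed string plus the date into hex domain labels (illustration only).'''
    domains = []
    for i in range(n):
        data = ('%s|%s|%d' % (seed, day.isoformat(), i)).encode('utf-8')
        label = hashlib.md5(data).hexdigest()          # 32 hex chars, like the samples below
        domains.append(label + tlds[i % len(tlds)])
    return domains

print(toy_dga('examplebotnet', date(2018, 1, 1)))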

0x02 Random Forest

Random forest = bagging + decision trees: train many decision trees, each on a bootstrap sample of the training data and with a random subset of features considered at each split, then aggregate their predictions by majority vote, as the short sketch below illustrates.
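A hand-rolled sketch of that recipe, assuming X and y are NumPy arrays (the helper names toy_forest_fit/toy_forest_predict are hypothetical; the actual model later in this post uses sklearn.ensemble.RandomForestClassifier):

from collections import Counter
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def toy_forest_fit(X, y, n_trees=20):
    '''Bagging: each tree sees a bootstrap sample of the rows.'''
    trees = []
    n = len(X)
    for _ in range(n_trees):
        idx = np.random.randint(0, n, n)                     # sample n rows with replacement
        tree = DecisionTreeClassifier(max_features='sqrt')   # random feature subset at each split
        trees.append(tree.fit(X[idx], y[idx]))
    return trees

def toy_forest_predict(trees, X):
    votes = np.array([t.predict(X) for t in trees])          # shape (n_trees, n_samples)
    return [Counter(col).most_common(1)[0][0] for col in votes.T]   # majority vote per sample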

0x03 Code

  • Random Forest

  • MultinomialNB

import os, sys
import traceback
import json
import optparse
import pickle
import collections
import sklearn
import sklearn.feature_extraction
import sklearn.ensemble
import sklearn.metrics
import pandas as pd
import numpy as np
import tldextract
import math
import operator
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB      # used in the Naive Bayes comparison below
from sklearn.metrics import classification_report  # used in the Naive Bayes comparison below
import matplotlib.pyplot as plt
from matplotlib import pylab

Collect Data

alexa_dataframe = pd.read_csv('data/alexa_100k.csv', names=['rank','uri'], header=None, encoding='utf-8')
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
rank    100000 non-null int64
uri     100000 non-null object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB
rank uri
0 1 facebook.com
1 2 google.com
2 3 youtube.com
3 4 yahoo.com
4 5 baidu.com
dga_dataframe = pd.read_csv('data/dga_domains.txt', names=['raw_domain'], header=None, encoding='utf-8')
dga_dataframe.info()
dga_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2669 entries, 0 to 2668
Data columns (total 1 columns):
raw_domain    2669 non-null object
dtypes: object(1)
memory usage: 20.9+ KB
raw_domain
0 04055051be412eea5a61b7da8438be3d.info
1 1cb8a5f36f.info
2 30acd347397c34fc273e996b22951002.org
3 336c986a284e2b3bc0f69f949cb437cb.info
4 336c986a284e2b3bc0f69f949cb437cb.org
word_dataframe = pd.read_csv('data/words.txt', names=['word'], header=None, dtype={'word': str}, encoding='utf-8')
word_dataframe.info()
word_dataframe.head(10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479623 entries, 0 to 479622
Data columns (total 1 columns):
word    479619 non-null object
dtypes: object(1)
memory usage: 3.7+ MB
word
0 1080
1 10-point
2 10th
3 11-point
4 12-point
5 16-point
6 18-point
7 1st
8 2
9 20-point

Prepare Data

def domain_extract(uri):
    '''Extract the registered domain, dropping subdomain and TLD/suffix.'''
    ext = tldextract.extract(uri)
    if not ext.suffix:
        return None
    else:
        return ext.domain
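tldextract consults the public suffix list, so multi-part suffixes like co.uk are split correctly. For example:

ext = tldextract.extract('http://forums.bbc.co.uk')
# ext.subdomain == 'forums', ext.domain == 'bbc', ext.suffix == 'co.uk'
domain_extract('forums.bbc.co.uk')   # -> 'bbc'
domain_extract('localhost')          # -> None (no recognized suffix)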
    
alexa_dataframe['domain'] = [ domain_extract(uri) for uri in alexa_dataframe['uri']]
del alexa_dataframe['rank']
del alexa_dataframe['uri']
alexa_dataframe = alexa_dataframe.dropna()
alexa_dataframe = alexa_dataframe.drop_duplicates()
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 91377 entries, 0 to 99999
Data columns (total 1 columns):
domain    91377 non-null object
dtypes: object(1)
memory usage: 1.4+ MB
domain
0 facebook
1 google
2 youtube
3 yahoo
4 baidu
alexa_dataframe['class'] = 'legit'
# Label the benign Alexa domains as 'legit'
alexa_dataframe.head()
domain class
0 facebook legit
1 google legit
2 youtube legit
3 yahoo legit
4 baidu legit
# Shuffle the data (important for training/testing)
alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
# Reindex with a random permutation of the index
alexa_total = alexa_dataframe.shape[0]
print('Total Alexa domains %d' % alexa_total)
Total Alexa domains 91377
dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
# applymap applies the function element-wise: keep the label before the first '.', lowercased
del dga_dataframe['raw_domain']
dga_dataframe = dga_dataframe.dropna()
dga_dataframe = dga_dataframe.drop_duplicates()
dga_total = dga_dataframe.shape[0]
print('Total DGA domains %d' % dga_total)
Total DGA domains 2664
dga_dataframe['class'] = 'dga'
dga_dataframe.head()
domain class
0 04055051be412eea5a61b7da8438be3d dga
1 1cb8a5f36f dga
2 30acd347397c34fc273e996b22951002 dga
3 336c986a284e2b3bc0f69f949cb437cb dga
5 40a43e61e56a5c218cf6c22aca27f7ee dga
def entropy(s):
    '''
    Shannon entropy (bits per character) of a string
    '''
    p, lns = collections.Counter(s), float(len(s))
    return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
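Two quick sanity checks of the entropy function (the facebook value matches the entropy column in the tables below):

print(entropy('abab'))      # 1.0  -- two symbols, equally likely: one bit per character
print(entropy('facebook'))  # 2.75 -- matches the entropy column below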
all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)
# Concatenate the Alexa and DGA frames row-wise
# ignore_index=True since neither frame's index carries meaning
all_domains['length'] = [len(x) for x in all_domains['domain']]
all_domains = all_domains[all_domains['length'] > 6]
# Drop domains of 6 characters or fewer; very short names carry too little signal
all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]
all_domains.head(10)
domain class length entropy
0 facebook legit 8 2.750000
2 youtube legit 7 2.521641
5 wikipedia legit 9 2.641604
10 blogspot legit 8 2.750000
11 twitter legit 7 2.128085
12 linkedin legit 8 2.500000
19 wordpress legit 9 2.725481
23 microsoft legit 9 2.947703
27 xvideos legit 7 2.807355
28 googleusercontent legit 17 3.175123

Analyze Data

# Box plots of length and entropy, grouped by class
all_domains.boxplot('length','class')
pylab.ylabel('Domain Length')
all_domains.boxplot('entropy','class')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')

[Figures: box plots of domain length and domain entropy by class]

cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['length'], alexa['entropy'], s=140, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['length'], dga['entropy'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
# Place the legend
pylab.xlabel('Domain Length')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')

[Figure: domain length vs. entropy, Alexa (blue) vs. DGA (red)]

all_domains.tail(10)
domain class length entropy
94031 xcfwwghb dga 8 2.750000
94032 xcgqdfyrkgihlrmfmfib dga 20 3.684184
94033 xclqwzcfcx dga 10 2.646439
94034 xcpfxzuf dga 8 2.500000
94035 xcvxhxze dga 8 2.405639
94036 xdbrbsbm dga 8 2.405639
94037 xdfjryydcfwvkvui dga 16 3.500000
94038 xdjlvcgw dga 8 3.000000
94039 xdrmjeu dga 7 2.807355
94040 xflrjyyjswoatsoq dga 16 3.500000

word_dataframe = word_dataframe[word_dataframe['word'].map(lambda x: str(x).isalpha())]
word_dataframe = word_dataframe.applymap(lambda x: str(x).strip().lower())
word_dataframe = word_dataframe.dropna()
word_dataframe = word_dataframe.drop_duplicates()
word_dataframe.head(10)
word
37 a
48 aa
51 aaa
53 aaaa
54 aaaaaa
55 aaal
56 aaas
57 aaberg
58 aachen
59 aae
alexa_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-4, max_df=1.0)
# Bag-of-words over character n-grams of length 3 to 5
# n-grams with document frequency below min_df (or above max_df) are discarded
counts_matrix = alexa_vc.fit_transform(alexa_dataframe['domain'])
# fit_transform learns the n-gram vocabulary and builds the count matrix
alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())
# Log-scale the total count of each n-gram so huge counts are compressed
print(alexa_counts[:10])
ngrams_list = alexa_vc.get_feature_names()
# The learned n-gram vocabulary (get_feature_names_out() in newer scikit-learn)
print(ngrams_list[:10])

_sorted_ngrams = sorted(zip(ngrams_list, alexa_counts), key=operator.itemgetter(1), reverse=True)
# Pair each n-gram with its log count and sort descending by count
print('Alexa NGrams: %d' % len(_sorted_ngrams))
for ngram, count in _sorted_ngrams[:10]:
    print(ngram, count)
[1.         1.         1.17609126 1.64345268 1.11394335 1.14612804
 1.         1.17609126 1.07918125 1.54406804]
['-20', '-a-', '-ac', '-ad', '-ads', '-af', '-ag', '-ai', '-air', '-al']
Alexa NGrams: 23613
ing 3.443888546777372
lin 3.4271614029259654
ine 3.399673721481038
tor 3.26528962586083
ter 3.2631624649622166
ion 3.2467447097238415
ent 3.228913405994688
por 3.2013971243204513
the 3.2005769267548483
ree 3.16345955176999
# Extract the same character n-gram features from the dictionary words
dict_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-5, max_df=1.0)
counts_matrix = dict_vc.fit_transform(word_dataframe['word'])
dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())
ngrams_list = dict_vc.get_feature_names()
print(ngrams_list[:10])
['aaa', 'aab', 'aac', 'aad', 'aaf', 'aag', 'aah', 'aai', 'aak', 'aal']
_sorted_ngrams = sorted(zip(ngrams_list, dict_counts), key=operator.itemgetter(1), reverse=True)
print('Word NGrams: %d' % len(_sorted_ngrams))
for ngram, count in _sorted_ngrams[:10]:
    print(ngram, count)
Word NGrams: 123061
ing 4.387300822448285
ess 4.204879333760662
ati 4.1933472563864616
ion 4.165036479994566
ter 4.162415036106447
nes 4.112504458767161
tio 4.076822423342773
ate 4.0723602039634885
ent 4.069631102620343
tion 4.0496056125949735
def ngram_count(domain):
    '''
    Log-weighted count of known n-grams appearing in the domain
    '''
    alexa_match = alexa_counts * alexa_vc.transform([domain]).T   # sparse dot product
    dict_match = dict_counts * dict_vc.transform([domain]).T
    print('%s Alexa match:%d Dict match: %d' % (domain, alexa_match, dict_match))
ngram_count('google')
ngram_count('facebook')
ngram_count('1cb8a5f36f')
ngram_count('pterodactylfarts')
google Alexa match:17 Dict match: 14
facebook Alexa match:31 Dict match: 27
1cb8a5f36f Alexa match:0 Dict match: 0
pterodactylfarts Alexa match:35 Dict match: 76
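The one-liner above is a dot product: the sparse vector of the domain's n-gram counts times the corpus log10 frequencies. The same computation written out explicitly, as a sketch using the fitted alexa_vc/alexa_counts from above (ngram_score is a hypothetical helper, not part of the pipeline):

def ngram_score(domain, vectorizer, log_counts):
    counts = vectorizer.transform([domain]).toarray()[0]   # count of each vocabulary n-gram in the domain
    return float(np.dot(log_counts, counts))               # weight each hit by its corpus log10 frequency

ngram_score('facebook', alexa_vc, alexa_counts)   # ~31.3, the alexa_grams value in the table below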
#Compute NGram matches for all the domains and add to our dataframe
all_domains['alexa_grams']= alexa_counts * alexa_vc.transform(all_domains['domain']).T
all_domains['word_grams']= dict_counts * dict_vc.transform(all_domains['domain']).T
all_domains.head(10)
domain class length entropy alexa_grams word_grams
0 facebook legit 8 2.750000 31.302278 27.872426
2 youtube legit 7 2.521641 25.855170 18.287142
5 wikipedia legit 9 2.641604 24.571024 29.175635
10 blogspot legit 8 2.750000 24.435141 19.274501
11 twitter legit 7 2.128085 23.244500 31.130820
12 linkedin legit 8 2.500000 24.774916 32.904408
19 wordpress legit 9 2.725481 38.369509 33.806635
23 microsoft legit 9 2.947703 32.133033 39.530125
27 xvideos legit 7 2.807355 28.906360 18.846834
28 googleusercontent legit 17 3.175123 67.315750 86.104683
# Histogram of the max n-gram score across the two corpora (legit domains only)
legit = all_domains[(all_domains['class']=='legit')]
max_grams = np.maximum(legit['alexa_grams'],legit['word_grams'])
ax = max_grams.hist(bins=80)
ax.figure.suptitle('Histogram of the Max NGram Score for Domains')
pylab.xlabel('Maximum NGram Score')
pylab.ylabel('Number of Domains')
Text(0,0.5,'Number of Domains')

[Figure: histogram of the max n-gram score for legit domains]
#Use the vectorized operations of the dataframe to investigate differences
all_domains['diff'] = all_domains['alexa_grams'] - all_domains['word_grams']
all_domains.sort_values(['diff'], ascending=True).head(10)
domain class length entropy alexa_grams word_grams diff
79366 bipolardisorderdepressionanxiety legit 32 3.616729 117.312465 190.833856 -73.521391
72512 channel4embarrassingillnesses legit 29 3.440070 95.786979 169.119440 -73.332460
10961 stirringtroubleinternationally legit 30 3.481728 134.049367 207.204729 -73.155362
85031 americansforresponsiblesolutions legit 32 3.667838 148.143049 218.363956 -70.220908
20459 pragmatismopolitico legit 19 3.326360 61.244630 121.536223 -60.291593
13702 egaliteetreconciliation legit 23 3.186393 91.938518 152.125325 -60.186808
4706 interoperabilitybridges legit 23 3.588354 95.037285 153.626312 -58.589028
85161 foreclosurephilippines legit 22 3.447402 74.506548 132.514638 -58.008090
45636 annamalicesissyselfhypnosis legit 27 3.429908 68.680068 126.667692 -57.987623
70351 corazonindomablecapitulos legit 25 3.813661 75.535473 133.160690 -57.625217
all_domains.sort_values(['diff'], ascending=False).head(10)
domain class length entropy alexa_grams word_grams diff
54228 gay-sex-pics-porn-pictures-gay-sex-porn-gay-se... legit 56 3.661056 159.642301 85.124184 74.518116
85091 article-directory-free-submission-free-content legit 46 3.786816 235.233896 188.230453 47.003443
16893 stream-free-movies-online legit 25 3.509275 120.250616 74.496915 45.753701
63380 watch-free-movie-online legit 23 3.708132 103.029245 58.943451 44.085794
44253 best-online-shopping-site legit 25 3.452879 123.377240 79.596640 43.780601
22524 social-bookmarking-sites-list legit 29 3.702472 145.755266 102.261826 43.493440
66335 free-online-directory legit 21 3.403989 123.379738 80.735030 42.644708
46553 free-links-articles-directory legit 29 3.702472 153.239055 110.955361 42.283694
59873 online-web-directory legit 20 3.584184 116.310717 74.082948 42.227769
58016 web-directory-online legit 20 3.584184 114.402671 74.082948 40.319723
# Legit domains with suspiciously low n-gram scores
weird_cond = (all_domains['class']=='legit') & (all_domains['word_grams']<3) & (all_domains['alexa_grams']<2)
weird = all_domains[weird_cond]
print(weird.shape[0])
weird.head(10)
91
domain class length entropy alexa_grams word_grams diff
1246 twcczhu legit 7 2.521641 1.748188 0.0 1.748188
2009 ggmm777 legit 7 1.556657 1.518514 0.0 1.518514
2760 qq66699 legit 7 1.556657 1.342423 0.0 1.342423
17347 crx7601 legit 7 2.807355 0.000000 0.0 0.000000
18682 hzsxzhyy legit 8 2.250000 0.000000 0.0 0.000000
19418 02022222222 legit 11 0.684038 1.041393 0.0 1.041393
19887 3181302 legit 7 2.235926 0.000000 0.0 0.000000
21172 hljdns4 legit 7 2.807355 1.755875 0.0 1.755875
26441 05tz2e9 legit 7 2.807355 0.000000 0.0 0.000000
26557 fzysqmy legit 7 2.521641 1.176091 0.0 1.176091
# Re-label these legit-but-low-scoring domains as 'weird'
all_domains.loc[weird_cond, 'class'] = 'weird'
all_domains['class'].value_counts()
legit    67221
dga       2664
weird       91
Name: class, dtype: int64
all_domains[all_domains['class'] == 'weird'].head()
domain class length entropy alexa_grams word_grams diff
1246 twcczhu weird 7 2.521641 1.748188 0.0 1.748188
2009 ggmm777 weird 7 1.556657 1.518514 0.0 1.518514
2760 qq66699 weird 7 1.556657 1.342423 0.0 1.342423
17347 crx7601 weird 7 2.807355 0.000000 0.0 0.000000
18682 hzsxzhyy weird 8 2.250000 0.000000 0.0 0.000000
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['word_grams'], alexa['entropy'], s=140, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['word_grams'], dga['entropy'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
# Place the legend
pylab.xlabel('Domain word_grams')
pylab.ylabel('Domain Entropy')
Text(0,0.5,'Domain Entropy')

[Figure: word_grams vs. entropy, Alexa (blue) vs. DGA (red)]

Train the Algorithm

not_weird = all_domains[all_domains['class'] != 'weird']
X = not_weird[['length', 'entropy', 'alexa_grams', 'word_grams']].values
# Convert the feature columns to a NumPy array (.as_matrix is deprecated in favor of .values)
y = np.array(not_weird['class'].tolist())
# Class labels as a NumPy array
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)
# A random forest classifier with 20 trees
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Random train/test split, holding out 20% of the samples for testing
clf.fit(X_train, y_train)
# Fit the classifier on the training data
y_pred = clf.predict(X_test)
# Predict labels for the held-out test data
def show_cm(cm, labels):
    # Row-wise percentages: each row (true class) is normalized by its support
    percent = (cm*100.0)/np.array(np.matrix(cm.sum(axis=1)).T)
    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum()))
labels = ['legit', 'dga']
cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=labels)
# Confusion matrix: rows are true classes, columns are predicted classes
show_cm(cm, labels)
Confusion Matrix Stats
legit/legit: 99.57% (13369/13427)
legit/dga: 0.43% (58/13427)
dga/legit: 15.45% (85/550)
dga/dga: 84.55% (465/550)
importances = zip(['length', 'entropy', 'alexa_grams', 'word_grams'], clf.feature_importances_)
# Inspect the relative importance of each feature
list(importances)
[('length', 0.16033779891739047),
 ('entropy', 0.12175502861193326),
 ('alexa_grams', 0.5087685303664589),
 ('word_grams', 0.20913864210421748)]
clf.fit(X, y)
# Refit on the full dataset before serializing the model
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Test the Algorithm

def test_it(domain):
    '''Featurize a single domain and print the model's prediction.'''
    _alexa_match = alexa_counts * alexa_vc.transform([domain]).T
    _dict_match = dict_counts * dict_vc.transform([domain]).T
    _X = [[len(domain), entropy(domain), _alexa_match, _dict_match]]
    print('%s : %s' % (domain, clf.predict(_X)[0]))
test_it('google')
test_it('google8sdflkajssjgjksdh')
test_it('faceboosadfadfafdk')
test_it('1cb8a5f36f')
test_it('pterodactyladfasdfasdffarts')
test_it('ptes9dro-dwacty2lfa5rrts')
test_it('beyonce')
test_it('bey666on4ce')
test_it('supersexy')
test_it('yourmomissohotinthesummertime')
google : legit
google8sdflkajssjgjksdh : dga
faceboosadfadfafdk : legit
1cb8a5f36f : dga
pterodactyladfasdfasdffarts : legit
ptes9dro-dwacty2lfa5rrts : dga
beyonce : legit
bey666on4ce : dga
supersexy : legit
yourmomissohotinthesummertime : legit

Use the Algorithm

def save_model_to_disk(name, model, model_dir='models'):
    serialized_model = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL)
    model_path = os.path.join(model_dir, name+'.model')
    print('Storing Serialized Model to Disk (%s:%.2fMeg)' % (name, len(serialized_model)/1024.0/1024.0))
    with open(model_path, 'wb') as f:
        f.write(serialized_model)
save_model_to_disk('dga_model_random_forest', clf)
save_model_to_disk('dga_model_alexa_vectorizor', alexa_vc)
save_model_to_disk('dga_model_alexa_counts', alexa_counts)
save_model_to_disk('dga_model_dict_vectorizor', dict_vc)
save_model_to_disk('dga_model_dict_counts', dict_counts)
Storing Serialized Model to Disk (dga_model_random_forest:1.80Meg)
Storing Serialized Model to Disk (dga_model_alexa_vectorizor:2.93Meg)
Storing Serialized Model to Disk (dga_model_alexa_counts:0.18Meg)
Storing Serialized Model to Disk (dga_model_dict_vectorizor:5.39Meg)
Storing Serialized Model to Disk (dga_model_dict_counts:0.94Meg)
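Pickle works fine here; as an alternative, the scikit-learn docs recommend joblib for persisting models that wrap large NumPy arrays. A sketch (the .joblib filename is hypothetical):

from joblib import dump, load
dump(clf, 'models/dga_model_random_forest.joblib')   # hypothetical path
clf = load('models/dga_model_random_forest.joblib')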
def load_model_from_disk(name, model_dir='models'):
    model_path = os.path.join(model_dir, name+'.model')
    try:
        with open(model_path, 'rb') as f:
            model = pickle.loads(f.read())
        print('success')
    except Exception:
        print('Could not load model: %s from directory %s!' % (name, model_path))
        return None
    return model
clf = load_model_from_disk('dga_model_random_forest')
alexa_vc = load_model_from_disk('dga_model_alexa_vectorizor')
alexa_counts = load_model_from_disk('dga_model_alexa_counts')
dict_vc = load_model_from_disk('dga_model_dict_vectorizor')
dict_counts = load_model_from_disk('dga_model_dict_counts')
model = {'clf':clf, 'alexa_vc':alexa_vc, 'alexa_counts':alexa_counts,
                 'dict_vc':dict_vc, 'dict_counts':dict_counts}
success
success
success
success
success
def evaluate_url(model, url):
    domain = domain_extract(url)
    # Vectorize the extracted domain (not the raw URL), consistent with test_it above
    alexa_match = model['alexa_counts'] * model['alexa_vc'].transform([domain]).T
    dict_match = model['dict_counts'] * model['dict_vc'].transform([domain]).T

    X = [[len(domain), entropy(domain), alexa_match, dict_match]]
    y_pred = model['clf'].predict(X)[0]

    print('%s : %s' % (domain, y_pred))
evaluate_url(model, 'adfhalksfhjashfk.com')
adfhalksfhjashfk : dga
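For comparison, fit a multinomial Naive Bayes classifier on the same train/test split; this is workable here because all four features are non-negative counts or log counts.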

mtnb = MultinomialNB()
mtnb.fit(X_train,y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
nb_y_pred=mtnb.predict(X_test)
print(classification_report(y_test, nb_y_pred))
cm = sklearn.metrics.confusion_matrix(y_test, nb_y_pred, labels=labels)
# Pass labels= so the matrix rows line up with show_cm's label order
show_cm(cm, labels)
             precision    recall  f1-score   support

        dga       0.71      0.87      0.78       550
      legit       0.99      0.99      0.99     13427

avg / total       0.98      0.98      0.98     13977

Confusion Matrix Stats
legit/legit: 98.56% (13233/13427)
legit/dga: 1.44% (194/13427)
dga/legit: 13.27% (73/550)
dga/dga: 86.73% (477/550)
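Compared with the random forest, the Naive Bayes model catches slightly more DGA domains (86.7% vs. 84.6% recall on the dga class) but flags more than three times the fraction of legit domains as false positives (1.44% vs. 0.43%), so the random forest offers the better trade-off on this feature set.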