0x01 Domain Generation Algorithm

Domain generation algorithms (DGAs) are algorithms, seen in various families of malware, that periodically generate a large number of domain names to serve as rendezvous points with their command-and-control (C2) servers. Because the candidate domains change constantly, blocking or sinkholing any single one does little to disrupt the channel.

Example
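A minimal sketch of a date-seeded DGA (hypothetical; real families differ in seed source, hash function, and alphabet):

import hashlib
from datetime import date

def generate_domains(seed_date, count=10, tlds=('.com', '.net', '.info')):
    # Derive pseudo-random domain names from the current date, as a DGA might
    domains = []
    for i in range(count):
        seed = '%s-%d' % (seed_date.isoformat(), i)
        digest = hashlib.md5(seed.encode()).hexdigest()
        domains.append(digest[:12] + tlds[i % len(tlds)])
    return domains

print(generate_domains(date(2018, 1, 1)))

Both the malware and its operator can run the same algorithm for the same date, so they meet at a freshly generated domain while defenders are still chasing yesterday's list.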

0x02 Random Forest

In short: random forest = bagging + decision trees. Each tree is trained on a bootstrap sample of the data, and the forest predicts by majority vote, which reduces the variance of any single tree.
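The identity can be checked directly in scikit-learn: bagging a DecisionTreeClassifier behaves much like a RandomForestClassifier (the forest additionally considers a random feature subset at each split). A minimal sketch on synthetic data:

from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

X_demo, y_demo = make_classification(n_samples=1000, random_state=0)
bagged_trees = BaggingClassifier(DecisionTreeClassifier(), n_estimators=20)
forest = RandomForestClassifier(n_estimators=20)
print(cross_val_score(bagged_trees, X_demo, y_demo, cv=5).mean())
print(cross_val_score(forest, X_demo, y_demo, cv=5).mean())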

0x03 Code

  • Random Forest

  • MultinomialNB

import os
import pickle
import collections
import math
import operator
import sklearn
import sklearn.feature_extraction
import sklearn.ensemble
import sklearn.metrics
import pandas as pd
import numpy as np
import tldextract
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from matplotlib import pylab

Collecting the Data

alexa_dataframe = pd.read_csv('data/alexa_100k.csv', names=['rank','uri'], header=None, encoding='utf-8')
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
rank    100000 non-null int64
uri     100000 non-null object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB
rank uri
0 1 facebook.com
1 2 google.com
2 3 youtube.com
3 4 yahoo.com
4 5 baidu.com
dga_dataframe = pd.read_csv('data/dga_domains.txt', names=['raw_domain'], header=None, encoding='utf-8')
dga_dataframe.info()
dga_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2669 entries, 0 to 2668
Data columns (total 1 columns):
raw_domain    2669 non-null object
dtypes: object(1)
memory usage: 20.9+ KB
raw_domain
0 04055051be412eea5a61b7da8438be3d.info
1 1cb8a5f36f.info
2 30acd347397c34fc273e996b22951002.org
3 336c986a284e2b3bc0f69f949cb437cb.info
4 336c986a284e2b3bc0f69f949cb437cb.org
word_dataframe = pd.read_csv('data/words.txt', names=['word'], header=None, dtype={'word': str}, encoding='utf-8')
word_dataframe.info()
word_dataframe.head(10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479623 entries, 0 to 479622
Data columns (total 1 columns):
word    479619 non-null object
dtypes: object(1)
memory usage: 3.7+ MB
word
0 1080
1 10-point
2 10th
3 11-point
4 12-point
5 16-point
6 18-point
7 1st
8 2
9 20-point

Preparing the Data

def domain_extract(uri):
    ext = tldextract.extract(uri)
    if (not ext.suffix):
        return None
    else:
        return ext.domain
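
A quick check of its behavior (expected values assume tldextract's bundled public-suffix list):

print(domain_extract('www.google.co.uk'))  # 'google'
print(domain_extract('localhost'))         # None: no recognized suffix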
    
alexa_dataframe['domain'] = [ domain_extract(uri) for uri in alexa_dataframe['uri']]
del alexa_dataframe['rank']
del alexa_dataframe['uri']
alexa_dataframe = alexa_dataframe.dropna()
alexa_dataframe = alexa_dataframe.drop_duplicates()
alexa_dataframe.info()
alexa_dataframe.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 91377 entries, 0 to 99999
Data columns (total 1 columns):
domain    91377 non-null object
dtypes: object(1)
memory usage: 1.4+ MB
domain
0 facebook
1 google
2 youtube
3 yahoo
4 baidu
alexa_dataframe['class'] = 'legit'
# Label the benign Alexa domains as 'legit'
alexa_dataframe.head()
domain class
0 facebook legit
1 google legit
2 youtube legit
3 yahoo legit
4 baidu legit
# Shuffle the data (important for training/testing)
alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
# Shuffle the rows by reindexing with a random permutation of the index
alexa_total = alexa_dataframe.shape[0]
print('Total Alexa domains %d' % alexa_total)
Total Alexa domains 91377
dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
#This method applies a function that accepts and returns a scalar to every element of a DataFrame.
del dga_dataframe['raw_domain']
dga_dataframe = dga_dataframe.dropna()
dga_dataframe = dga_dataframe.drop_duplicates()
dga_total = dga_dataframe.shape[0]
print('Total DGA domains %d' % dga_total)
Total DGA domains 2664
dga_dataframe['class'] = 'dga'
dga_dataframe.head()
domain class
0 04055051be412eea5a61b7da8438be3d dga
1 1cb8a5f36f dga
2 30acd347397c34fc273e996b22951002 dga
3 336c986a284e2b3bc0f69f949cb437cb dga
5 40a43e61e56a5c218cf6c22aca27f7ee dga
def entropy(s):
    '''
    Shannon entropy (in bits) of a string
    '''
    p, lns = collections.Counter(s), float(len(s))
    return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
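A quick sanity check of the function (values rounded):

print(entropy('google'))      # ~1.92 bits; repeated letters lower the score
print(entropy('1cb8a5f36f'))  # ~3.12 bits; near-uniform characters raise it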
all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)
# Concatenate the two dataframes row-wise; ignore_index=True because the original indices carry no meaning
all_domains['length'] = [len(x) for x in all_domains['domain']]
all_domains = all_domains[all_domains['length'] > 6]
# Drop domains of 6 characters or fewer; very short names carry too little signal
all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]
all_domains.head(10)
domain class length entropy
0 facebook legit 8 2.750000
2 youtube legit 7 2.521641
5 wikipedia legit 9 2.641604
10 blogspot legit 8 2.750000
11 twitter legit 7 2.128085
12 linkedin legit 8 2.500000
19 wordpress legit 9 2.725481
23 microsoft legit 9 2.947703
27 xvideos legit 7 2.807355
28 googleusercontent legit 17 3.175123

Analyzing the Data

# Box plots of length and entropy, grouped by class
all_domains.boxplot('length','class')
pylab.ylabel('Domain Length')
all_domains.boxplot('entropy','class')
pylab.ylabel('Domain Entropy')
[Figure: box plots of domain length and domain entropy by class]

cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['length'], alexa['entropy'], s=140, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['length'], dga['entropy'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
# Place the legend
pylab.xlabel('Domain Length')
pylab.ylabel('Domain Entropy')
[Figure: scatter of domain length vs. entropy, Alexa in blue, DGA in red]

all_domains.tail(10)
domain class length entropy
94031 xcfwwghb dga 8 2.750000
94032 xcgqdfyrkgihlrmfmfib dga 20 3.684184
94033 xclqwzcfcx dga 10 2.646439
94034 xcpfxzuf dga 8 2.500000
94035 xcvxhxze dga 8 2.405639
94036 xdbrbsbm dga 8 2.405639
94037 xdfjryydcfwvkvui dga 16 3.500000
94038 xdjlvcgw dga 8 3.000000
94039 xdrmjeu dga 7 2.807355
94040 xflrjyyjswoatsoq dga 16 3.500000
# Note: the alexa_grams and word_grams columns used here are computed a few cells below
legit = all_domains[(all_domains['class']=='legit')]
max_grams = np.maximum(legit['alexa_grams'], legit['word_grams'])
ax = max_grams.hist(bins=80)
ax.figure.suptitle('Histogram of the Max NGram Score for Domains')
pylab.xlabel('Maximum NGram Score')
pylab.ylabel('Number of Domains')
[Figure: histogram of the max n-gram score for legit domains]

word_dataframe = word_dataframe[word_dataframe['word'].map(lambda x: str(x).isalpha())]
word_dataframe = word_dataframe.applymap(lambda x: str(x).strip().lower())
word_dataframe = word_dataframe.dropna()
word_dataframe = word_dataframe.drop_duplicates()
word_dataframe.head(10)
word
37 a
48 aa
51 aaa
53 aaaa
54 aaaaaa
55 aaal
56 aaas
57 aaberg
58 aachen
59 aae
alexa_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-4, max_df=1.0)
# Bag-of-n-grams model over characters
# ngram_range: extract character n-grams of length 3 to 5
# n-grams with document frequency below min_df (or above max_df) are discarded
counts_matrix = alexa_vc.fit_transform(alexa_dataframe['domain'])
# fit_transform learns the n-gram vocabulary and counts each n-gram's occurrences
alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())
# log10-scale the summed counts so a handful of very common n-grams don't dominate
print(alexa_counts[:10])
ngrams_list = alexa_vc.get_feature_names()
# The n-gram strings corresponding to each column of the count matrix
print(ngrams_list[:10])

_sorted_ngrams = sorted(zip(ngrams_list, alexa_counts), key=operator.itemgetter(1), reverse=True)
# Pair each n-gram with its log count and sort by count, descending
print('Alexa NGrams: %d' % len(_sorted_ngrams))
for ngram, count in _sorted_ngrams[:10]:
    print(ngram, count)
[1.         1.         1.17609126 1.64345268 1.11394335 1.14612804
 1.         1.17609126 1.07918125 1.54406804]
['-20', '-a-', '-ac', '-ad', '-ads', '-af', '-ag', '-ai', '-air', '-al']
Alexa NGrams: 23613
ing 3.443888546777372
lin 3.4271614029259654
ine 3.399673721481038
tor 3.26528962586083
ter 3.2631624649622166
ion 3.2467447097238415
ent 3.228913405994688
por 3.2013971243204513
the 3.2005769267548483
ree 3.16345955176999
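To see exactly which character n-grams the vectorizer extracts, its analyzer can be called directly (a quick sketch against the fitted alexa_vc):

analyzer = alexa_vc.build_analyzer()
print(analyzer('google'))
# ['goo', 'oog', 'ogl', 'gle', 'goog', 'oogl', 'ogle', 'googl', 'oogle']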
# Repeat the n-gram counting over the dictionary words
dict_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-5, max_df=1.0)
counts_matrix = dict_vc.fit_transform(word_dataframe['word'])
dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())
ngrams_list = dict_vc.get_feature_names()
print(ngrams_list[:10])
['aaa', 'aab', 'aac', 'aad', 'aaf', 'aag', 'aah', 'aai', 'aak', 'aal']
_sorted_ngrams = sorted(zip(ngrams_list, dict_counts), key=operator.itemgetter(1), reverse=True)
print('Word NGrams: %d' % len(_sorted_ngrams))
for ngram, count in _sorted_ngrams[:10]:
    print(ngram, count)
Word NGrams: 123061
ing 4.387300822448285
ess 4.204879333760662
ati 4.1933472563864616
ion 4.165036479994566
ter 4.162415036106447
nes 4.112504458767161
tio 4.076822423342773
ate 4.0723602039634885
ent 4.069631102620343
tion 4.0496056125949735
def ngram_count(domain):
    '''
    Print how strongly a domain matches the Alexa and dictionary n-gram corpora
    '''
    alexa_match = alexa_counts * alexa_vc.transform([domain]).T  
    dict_match = dict_counts * dict_vc.transform([domain]).T
    print('%s Alexa match:%d Dict match: %d' % (domain, alexa_match, dict_match))
ngram_count('google')
ngram_count('facebook')
ngram_count('1cb8a5f36f')
ngram_count('pterodactylfarts')
google Alexa match:17 Dict match: 14
facebook Alexa match:31 Dict match: 27
1cb8a5f36f Alexa match:0 Dict match: 0
pterodactylfarts Alexa match:35 Dict match: 76
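Under the hood the match score is a dot product: the domain's n-gram count vector weighted by the log-scaled corpus counts. A sketch reproducing the Alexa score for one domain:

vec = alexa_vc.transform(['facebook']).toarray()[0]  # n-gram counts for this domain
print(np.dot(alexa_counts, vec))                     # ~31.30, matching the 'facebook' row below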
#Compute NGram matches for all the domains and add to our dataframe
all_domains['alexa_grams']= alexa_counts * alexa_vc.transform(all_domains['domain']).T
all_domains['word_grams']= dict_counts * dict_vc.transform(all_domains['domain']).T
all_domains.head(10)
domain class length entropy alexa_grams word_grams
0 facebook legit 8 2.750000 31.302278 27.872426
2 youtube legit 7 2.521641 25.855170 18.287142
5 wikipedia legit 9 2.641604 24.571024 29.175635
10 blogspot legit 8 2.750000 24.435141 19.274501
11 twitter legit 7 2.128085 23.244500 31.130820
12 linkedin legit 8 2.500000 24.774916 32.904408
19 wordpress legit 9 2.725481 38.369509 33.806635
23 microsoft legit 9 2.947703 32.133033 39.530125
27 xvideos legit 7 2.807355 28.906360 18.846834
28 googleusercontent legit 17 3.175123 67.315750 86.104683
#Use the vectorized operations of the dataframe to investigate differences
all_domains['diff'] = all_domains['alexa_grams'] - all_domains['word_grams']
all_domains.sort_values(['diff'], ascending=True).head(10)
domain class length entropy alexa_grams word_grams diff
79366 bipolardisorderdepressionanxiety legit 32 3.616729 117.312465 190.833856 -73.521391
72512 channel4embarrassingillnesses legit 29 3.440070 95.786979 169.119440 -73.332460
10961 stirringtroubleinternationally legit 30 3.481728 134.049367 207.204729 -73.155362
85031 americansforresponsiblesolutions legit 32 3.667838 148.143049 218.363956 -70.220908
20459 pragmatismopolitico legit 19 3.326360 61.244630 121.536223 -60.291593
13702 egaliteetreconciliation legit 23 3.186393 91.938518 152.125325 -60.186808
4706 interoperabilitybridges legit 23 3.588354 95.037285 153.626312 -58.589028
85161 foreclosurephilippines legit 22 3.447402 74.506548 132.514638 -58.008090
45636 annamalicesissyselfhypnosis legit 27 3.429908 68.680068 126.667692 -57.987623
70351 corazonindomablecapitulos legit 25 3.813661 75.535473 133.160690 -57.625217
all_domains.sort_values(['diff'], ascending=False).head(10)
domain class length entropy alexa_grams word_grams diff
54228 gay-sex-pics-porn-pictures-gay-sex-porn-gay-se... legit 56 3.661056 159.642301 85.124184 74.518116
85091 article-directory-free-submission-free-content legit 46 3.786816 235.233896 188.230453 47.003443
16893 stream-free-movies-online legit 25 3.509275 120.250616 74.496915 45.753701
63380 watch-free-movie-online legit 23 3.708132 103.029245 58.943451 44.085794
44253 best-online-shopping-site legit 25 3.452879 123.377240 79.596640 43.780601
22524 social-bookmarking-sites-list legit 29 3.702472 145.755266 102.261826 43.493440
66335 free-online-directory legit 21 3.403989 123.379738 80.735030 42.644708
46553 free-links-articles-directory legit 29 3.702472 153.239055 110.955361 42.283694
59873 online-web-directory legit 20 3.584184 116.310717 74.082948 42.227769
58016 web-directory-online legit 20 3.584184 114.402671 74.082948 40.319723
# Legit domains with suspiciously low n-gram match scores
weird_cond = (all_domains['class']=='legit') & (all_domains['word_grams']<3) & (all_domains['alexa_grams']<2)
weird = all_domains[weird_cond]
print(weird.shape[0])
weird.head(10)
91
domain class length entropy alexa_grams word_grams diff
1246 twcczhu legit 7 2.521641 1.748188 0.0 1.748188
2009 ggmm777 legit 7 1.556657 1.518514 0.0 1.518514
2760 qq66699 legit 7 1.556657 1.342423 0.0 1.342423
17347 crx7601 legit 7 2.807355 0.000000 0.0 0.000000
18682 hzsxzhyy legit 8 2.250000 0.000000 0.0 0.000000
19418 02022222222 legit 11 0.684038 1.041393 0.0 1.041393
19887 3181302 legit 7 2.235926 0.000000 0.0 0.000000
21172 hljdns4 legit 7 2.807355 1.755875 0.0 1.755875
26441 05tz2e9 legit 7 2.807355 0.000000 0.0 0.000000
26557 fzysqmy legit 7 2.521641 1.176091 0.0 1.176091
# Re-label these low-signal 'legit' domains as 'weird' so they don't pollute training
all_domains.loc[weird_cond, 'class'] = 'weird'
all_domains['class'].value_counts()
legit    67221
dga       2664
weird       91
Name: class, dtype: int64
all_domains[all_domains['class'] == 'weird'].head()
domain class length entropy alexa_grams word_grams diff
1246 twcczhu weird 7 2.521641 1.748188 0.0 1.748188
2009 ggmm777 weird 7 1.556657 1.518514 0.0 1.518514
2760 qq66699 weird 7 1.556657 1.342423 0.0 1.342423
17347 crx7601 weird 7 2.807355 0.000000 0.0 0.000000
18682 hzsxzhyy weird 8 2.250000 0.000000 0.0 0.000000
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['word_grams'], alexa['entropy'], s=140, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['word_grams'], dga['entropy'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
# Place the legend
pylab.xlabel('Domain word_grams')
pylab.ylabel('Domain Entropy')
[Figure: scatter of word_grams vs. entropy, Alexa in blue, DGA in red]

Training the Algorithm

not_weird = all_domains[all_domains['class'] != 'weird']
X = not_weird[['length', 'entropy', 'alexa_grams', 'word_grams']].values
# Extract the feature columns as a NumPy array (.as_matrix is deprecated in favor of .values)
y = np.array(not_weird['class'].tolist())
# Class labels as a NumPy array
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)
# A random forest classifier with 20 trees
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Random train/test split, holding out 20% of the samples for testing
clf.fit(X_train, y_train)
# Fit the classifier on the training data
y_pred = clf.predict(X_test)
# Predict on the held-out test set
def show_cm(cm, labels):
    # Convert raw counts to row percentages (each row sums to that class's support)
    percent = (cm * 100.0) / cm.sum(axis=1, keepdims=True)
    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, percent[i][j], cm[i][j], cm[i].sum()))
labels = ['legit', 'dga']
cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=labels)
# The confusion matrix tabulates predictions against true labels, per class
show_cm(cm, labels)
Confusion Matrix Stats
legit/legit: 99.57% (13369/13427)
legit/dga: 0.43% (58/13427)
dga/legit: 15.45% (85/550)
dga/dga: 84.55% (465/550)
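A single held-out split can be noisy; a cross-validated estimate (a sketch using cross_val_score, not part of the original run) gives a steadier accuracy number:

from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X, y, cv=5)
print('%.4f +/- %.4f' % (scores.mean(), scores.std()))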
importances = zip(['length', 'entropy', 'alexa_grams', 'word_grams'], clf.feature_importances_)
# Inspect the relative importance of each feature
list(importances)
[('length', 0.16033779891739047),
 ('entropy', 0.12175502861193326),
 ('alexa_grams', 0.5087685303664589),
 ('word_grams', 0.20913864210421748)]
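A quick bar chart makes the ranking easier to read (a sketch using the matplotlib objects already imported):

names = ['length', 'entropy', 'alexa_grams', 'word_grams']
pos = np.arange(len(names))
plt.bar(pos, clf.feature_importances_)
plt.xticks(pos, names)
pylab.ylabel('Feature Importance')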
# Re-fit on the full dataset before serializing the model
clf.fit(X, y)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Testing the Algorithm

def test_it(domain):
    _alexa_match = alexa_counts * alexa_vc.transform([domain]).T  
    _dict_match = dict_counts * dict_vc.transform([domain]).T
    _X = [[len(domain), entropy(domain), _alexa_match, _dict_match]]
    print('%s : %s' % (domain, clf.predict(_X)[0]))
test_it('google')
test_it('google8sdflkajssjgjksdh')
test_it('faceboosadfadfafdk')
test_it('1cb8a5f36f')
test_it('pterodactyladfasdfasdffarts')
test_it('ptes9dro-dwacty2lfa5rrts')
test_it('beyonce')
test_it('bey666on4ce')
test_it('supersexy')
test_it('yourmomissohotinthesummertime')
google : legit
google8sdflkajssjgjksdh : dga
faceboosadfadfafdk : legit
1cb8a5f36f : dga
pterodactyladfasdfasdffarts : legit
ptes9dro-dwacty2lfa5rrts : dga
beyonce : legit
bey666on4ce : dga
supersexy : legit
yourmomissohotinthesummertime : legit

Using the Algorithm

def save_model_to_disk(name, model, model_dir='models'):
    serialized_model = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL)
    model_path = os.path.join(model_dir, name+'.model')
    print('Storing Serialized Model to Disk (%s:%.2fMeg)' % (name, len(serialized_model)/1024.0/1024.0))
    with open(model_path, 'wb') as f:
        f.write(serialized_model)
save_model_to_disk('dga_model_random_forest', clf)
save_model_to_disk('dga_model_alexa_vectorizor', alexa_vc)
save_model_to_disk('dga_model_alexa_counts', alexa_counts)
save_model_to_disk('dga_model_dict_vectorizor', dict_vc)
save_model_to_disk('dga_model_dict_counts', dict_counts)
Storing Serialized Model to Disk (dga_model_random_forest:1.80Meg)
Storing Serialized Model to Disk (dga_model_alexa_vectorizor:2.93Meg)
Storing Serialized Model to Disk (dga_model_alexa_counts:0.18Meg)
Storing Serialized Model to Disk (dga_model_dict_vectorizor:5.39Meg)
Storing Serialized Model to Disk (dga_model_dict_counts:0.94Meg)
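For models that wrap large NumPy arrays, the scikit-learn docs recommend joblib over raw pickle; an equivalent sketch (assuming the joblib package is installed):

import joblib
joblib.dump(clf, os.path.join('models', 'dga_model_random_forest.joblib'))
clf = joblib.load(os.path.join('models', 'dga_model_random_forest.joblib'))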
def load_model_from_disk(name, model_dir='models'):
    model_path = os.path.join(model_dir, name+'.model')
    try:
        with open(model_path, 'rb') as f:
            model = pickle.loads(f.read())
        print('success')
    except Exception:
        print('Could not load model: %s from directory %s!' % (name, model_path))
        return None
    return model
clf = load_model_from_disk('dga_model_random_forest')
alexa_vc = load_model_from_disk('dga_model_alexa_vectorizor')
alexa_counts = load_model_from_disk('dga_model_alexa_counts')
dict_vc = load_model_from_disk('dga_model_dict_vectorizor')
dict_counts = load_model_from_disk('dga_model_dict_counts')
model = {'clf':clf, 'alexa_vc':alexa_vc, 'alexa_counts':alexa_counts,
                 'dict_vc':dict_vc, 'dict_counts':dict_counts}
success
success
success
success
success
def evaluate_url(model, url):
    domain = domain_extract(url)
    # Score the extracted domain (not the full URL) to match how the model was trained
    alexa_match = model['alexa_counts'] * model['alexa_vc'].transform([domain]).T
    dict_match = model['dict_counts'] * model['dict_vc'].transform([domain]).T

    X = [[len(domain), entropy(domain), alexa_match, dict_match]]
    y_pred = model['clf'].predict(X)[0]

    print('%s : %s' % (domain, y_pred))
evaluate_url(model, 'adfhalksfhjashfk.com')
adfhalksfhjashfk : dga
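Scoring a batch of URLs is then a one-liner (the list contents here are arbitrary examples):

for url in ['google.com', 'stream-free-movies-online.com', '1cb8a5f36f.info']:
    evaluate_url(model, url)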

For comparison, train a Multinomial Naive Bayes classifier on the same features and split:

mtnb = MultinomialNB()
mtnb.fit(X_train, y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
nb_y_pred = mtnb.predict(X_test)
print(classification_report(y_test, nb_y_pred))
cm = sklearn.metrics.confusion_matrix(y_test, nb_y_pred, labels=labels)
show_cm(cm, labels)
             precision    recall  f1-score   support

        dga       0.71      0.87      0.78       550
      legit       0.99      0.99      0.99     13427

avg / total       0.98      0.98      0.98     13977

Confusion Matrix Stats
legit/legit: 98.56% (13233/13427)
legit/dga: 1.44% (194/13427)
dga/legit: 13.27% (73/550)
dga/dga: 86.73% (477/550)
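On this split, Naive Bayes catches slightly more DGA domains (86.73% vs. 84.55% for the random forest) but misclassifies more than three times as many legit domains (1.44% vs. 0.43% false positives), so the random forest remains the stronger default for this feature set.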