從GoogleClusterData統計每一個用戶的使用率、平均每次出價

    之前已將 Google cluster data 導入了 Azure 上的 MySQL 數據庫,下一步就是對這些數據進行分析,

挖掘用戶的使用規律了。

首先,爲了加快查詢速度,對 user、time 等列加入索引。

然後就可以使用如下代碼進行統計了。

import os
import MySQLdb
import time
import thread

def _completed_runs(events):
    """Yield (seconds, cpu, memory, disk) for each finished run in *events*.

    *events* is a sequence of rows shaped like
    (task_index, event_type, cpu_request, memory_request,
    disk_space_request, time), sorted by task_index and then time.

    A run is an event of type 1 immediately followed by an event of type 4
    or 5 for the same task_index.
    NOTE(review): in the Google cluster trace schema 1/4/5 are presumably
    SCHEDULE/FINISH/KILL -- confirm against the trace documentation.
    """
    for current, following in zip(events, events[1:]):
        if current[1] != 1:
            continue
        if following[0] == current[0] and following[1] in (4, 5):
            # Timestamps are divided by 10**6, i.e. treated as microseconds.
            seconds = (following[5] - current[5]) / (10.0 ** 6)
            yield seconds, current[2], current[3], current[4]


def use4ADay(day, users):
    """Append every user's resource usage for one day to that day's file.

    For each user, the jobs with a job event inside the day window are
    looked up, and for every task run of those jobs that both starts and
    ends inside the window, ``duration * request`` is accumulated for CPU,
    memory and disk. Runs that cross the day boundary are not counted,
    because both queries are restricted to the window.

    Args:
        day: 1-based day number; the window is [(day-1)*D, day*D) where D is
            one day expressed in the time unit of the ``time`` column.
        users: rows as returned by ``cursor.fetchall()``; the user name is
            the first element of each row.

    Side effects:
        * Appends one ``user<TAB>cpu<TAB>mem<TAB>disk`` line per user to
          ``C:\\userUsageEachDay\\day<day>.txt``.
        * Sets ``use4ADay.user`` to the user being processed so that the
          caller can report where a failure happened.
    """
    # One day in the unit of the `time` column (24h * 10**6 per second).
    units_per_day = 24 * 60 * 60 * 1000000
    start_time = (day - 1) * units_per_day
    end_time = day * units_per_day

    # `distinct`: job_events can hold several event rows for the same job
    # within one day; without it the job's tasks would be counted once per
    # event row.
    job_query = ("select distinct job_id from job_events "
                 "where time >= %s and time < %s and user = %s")
    # Ordering by time inside each task_index is required because the pairing
    # in _completed_runs relies on adjacent rows being consecutive events.
    task_query = ("select task_index, event_type, cpu_request, memory_request, "
                  "disk_space_request, time from task_events "
                  "where time >= %s and time < %s and job_id = %s "
                  "order by task_index, time")

    conn = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                           db="googleclusterdata", charset="utf8")
    try:
        cursor = conn.cursor()

        for user_row in users:
            user = user_row[0]
            print(user)
            use4ADay.user = user

            print('day %s' % day)
            day_cpu = 0
            day_mem = 0
            day_disk = 0

            # Values are passed as parameters so that the driver quotes them;
            # a user name containing a quote no longer breaks the statement.
            job_params = (start_time, end_time, user)
            print('%s  params=%r' % (job_query, job_params))
            cursor.execute(job_query, job_params)

            for job_row in cursor.fetchall():
                job_id = job_row[0]
                print('day %s' % day)
                task_params = (start_time, end_time, job_id)
                print('%s  params=%r' % (task_query, task_params))
                cursor.execute(task_query, task_params)
                tasks = cursor.fetchall()
                print('tasks get')

                for seconds, cpu, mem, disk in _completed_runs(tasks):
                    day_cpu += seconds * cpu
                    day_mem += seconds * mem
                    day_disk += seconds * disk

            with open('C:\\userUsageEachDay\\day%d.txt' % day, 'a') as fOut:
                fOut.write('%s\t%f\t%f\t%f\n' % (user, day_cpu, day_mem, day_disk))

        print('day %d finish' % day)
    finally:
        # Release the connection even when a query or the file write fails.
        conn.close()

    
# Fetch the distinct user names once; the connection is closed even if the
# query fails.
conn = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                       db="googleclusterdata", charset="utf8")
try:
    cursor = conn.cursor()
    #get all user_name
    order = "select distinct user from job_events"
    print(order)
    cursor.execute(order)
    users = cursor.fetchall()
finally:
    conn.close()

# Process days 1..29 one after another. A failing day is recorded in
# C:\failed.txt and the loop continues with the next day.
for day in range(1, 30):
    try:
        use4ADay(day, users)
    except Exception as err:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still
        # stop the run instead of being logged as a failed day.
        print('day %s failed!!' % day)
        print(repr(err))
        # use4ADay.user is only set once the first user has been reached; a
        # failure before that (e.g. while connecting) must not raise again here.
        failed_user = getattr(use4ADay, 'user', '')
        with open('C:\\failed.txt', 'a') as fOut:
            fOut.write('%s\t%d\t\n' % (failed_user, day))
    #print 'starting thread for day %d' %day
    #thread.start_new_thread(use4ADay, (day, users, ) )#use4ADay(2, users)

下一步,是統計每一個用戶整個月的消費頻率,以及每次消費的平均消費量。

# Summarise the per-day usage files into one line per user:
#   <fraction of the 29 days with non-zero cost> TAB <average cost on those days>
# The result is written to C:\UseTraceOfAllUsers.txt.

# The list of users is taken from the first day's file (first column).
users = []
with open('C:\\Usage\\day1.txt') as fDay1:
    for l in fDay1:
        if not l.strip():
            continue  # ignore blank lines
        users.append(l.strip().split('\t')[0])

# Read every day file exactly once and keep, per day, a user -> cost mapping.
# Matching is done on the exact user field; a substring test would let one
# user's name match a line that belongs to a different, longer name.
costByDay = {}
for day in range(1, 30):
    costs = {}
    with open('C:\\Usage\\day%d.txt' % day) as f:
        for l in f:
            fields = l.strip().split('\t')
            if len(fields) < 4:
                continue  # blank or truncated line
            name = fields[0]
            if name in costs:
                continue  # the first record of a user in a file wins
            cpu = float(fields[1])
            mem = float(fields[2])
            disk = float(fields[3])
            # Weighted sum of the three usage figures.
            # NOTE(review): 1.92 / 15.6 / 1.2 look like unit prices for
            # CPU / memory / disk -- their origin is not shown here.
            money = 1.92*cpu + 15.6*mem + 1.2*disk
            assert(money >= 0)
            costs[name] = money
    costByDay[day] = costs

with open('C:\\UseTraceOfAllUsers.txt', 'w') as fOut:
    for user in users:
        useDays = 0
        allPrice = 0
        for day in range(1, 30):
            money = costByDay[day].get(user)
            # Only days on which the user actually spent something count.
            if money is not None and money != 0:
                useDays += 1
                allPrice += money
        if useDays != 0:
            fOut.write('%s\t%s\n' % (str(useDays/29.0), str(allPrice/useDays)))

最後就可以使用 MATLAB 進行畫圖啦。

% Scatter plot of the two-column table in C:\UseTraceOfAllUsers.txt:
% column 1 on the x axis, column 2 on the y axis, one marker per row.
% The trailing semicolon keeps MATLAB from echoing the whole matrix.
x = load('C:\UseTraceOfAllUsers.txt');
plot(x(:,1), x(:,2), 'o');

結果如下:

對平均使用量取對數的話:

% Same scatter plot, but with the natural logarithm of column 2 on the
% y axis. The trailing semicolon keeps MATLAB from echoing the whole matrix.
x = load('C:\UseTraceOfAllUsers.txt');
plot(x(:,1), log(x(:,2)), 'o');

相關文章
相關標籤/搜索