一次服務器掛掉的排查與解決過程

最近一個月服務器天天都掛掉。

top -c 一下,發現 CPU 佔用 390%。

懵了。

top -Hp <pid> 發現有四個線程一直在跑。

jstack 了一下,發現死鎖了。

排查了好久,暫時沒找到原因,只能決定寫個 Python 腳本監控一下了。

當服務調用不通時,先 jstack 一下,然後重新拉起服務。

這裏用到 psutil 模塊,需要先安裝一下(pip install psutil)。

# coding:utf-8
import urllib
import re
import os
import time
import socket
import logging
import psutil
import re
import traceback




# Log record format.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
# Timestamp format used in log records.
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
# Write everything from DEBUG upwards to my.log in the working directory.
logging.basicConfig(filename='my.log', level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)

# Default timeout, in seconds, applied to sockets created after this call;
# it serves as the timeout of the health-check request.
socket.setdefaulttimeout(30)

hospitalId = '10300'

accountId = '19071'

ipStr = '127.0.0.1'

portStr = '8081'

# Health-check URL.
# NOTE(review): the path segment was redacted in the published script and the
# original line was left with a broken string literal. 'xxxxxxxx' is a
# placeholder -- substitute the real endpoint path before running.
urlStr = 'http://' + ipStr + ':' + portStr + '/xxxxxxxx/' + accountId + '/' + hospitalId

# Form-encoded parameters sent along with the health-check request.
# NOTE(review): the 'xxx' key and value are redaction placeholders (the
# original referenced an undefined name `xxx`); replace with the real ones.
para = urllib.urlencode({'xxx': 'xxx', 'accountId': accountId})

# Tomcat installation directory: used both to recognise the server process by
# its command line and to locate bin/startup.sh.
serverName = "/usr/local/servers/tomcat-mobile"

# Parent directory under which a per-PID diagnostics folder is created.
javaDumpPath = '/home'

# Grace period, in minutes, after a restart during which the service is not
# probed (gives Tomcat time to come up).
startingTime = 1
# Intended polling interval, in seconds.
testSleeptime = 5
def get_html():
    """Fetch the health-check URL and return the response body.

    The request is sent to ``urlStr`` with ``para`` as its data. Any failure
    (connection refused, timeout, read error, ...) is logged together with
    its traceback and reported to the caller as the sentinel string
    ``'error'`` instead of being raised.

    Returns:
        The response body on success, or the string ``'error'`` on failure.
    """
    try:
        page = urllib.urlopen(urlStr, para)
        try:
            html = page.read()
        finally:
            # Always release the connection, even when read() fails.
            page.close()
    except Exception as e:
        # str(e) instead of the deprecated e.message, which is missing or
        # empty for many exception types.
        logging.error(str(e))
        logging.error(traceback.format_exc())
        return 'error'
    return html

def test_tomcat():
    """Poll the service forever, dumping and restarting it when a probe fails.

    Each iteration writes a heartbeat (a blank log record and a '-' on
    stdout). The service is probed with ``get_html()`` only when more than
    ``startingTime`` minutes have passed since the last restart; a failed
    probe calls ``start_dump()`` and resets that timer. The loop then sleeps
    ``testSleeptime`` seconds. This function never returns normally.
    """
    # A timestamp far in the past, so the very first iteration probes at once.
    lastStartDate = 1
    while True:
        logging.info(' ')
        print('-')
        # Skip the probe while the freshly restarted server is still starting.
        if (lastStartDate + startingTime * 60) < time.time():
            if get_html() == 'error':
                logging.error('reboot ')
                start_dump()
                lastStartDate = time.time()
            else:
                logging.info('Web test success')
        time.sleep(testSleeptime)

def get_threadId():
    """Return the PID of the first process whose command line contains serverName.

    Despite its name, this returns a process id, not a thread id.

    Processes that disappear or cannot be inspected while scanning are
    skipped rather than aborting the whole lookup.

    NOTE(review): any process that mentions the path on its command line
    matches (for example a ``tail`` on the server's log file), and the caller
    kills whatever is returned -- consider a stricter match.

    Returns:
        The matching PID, or -1 when no process matches.
    """
    for pid in psutil.pids():
        try:
            cmdline = psutil.Process(pid).cmdline()
        except psutil.Error:
            # The process exited after pids() was taken, or access is denied.
            continue
        # serverName is a filesystem path, so match it literally instead of
        # interpreting it as a regular expression.
        if serverName in ','.join(str(arg) for arg in cmdline):
            return pid
    return -1


def start_dump():
    """Capture JVM diagnostics for the running server, kill it, and start it again.

    When ``get_threadId()`` finds the server process, a directory named after
    its PID is created under ``javaDumpPath`` and filled with a thread dump
    (jstack), a heap summary (jmap -heap) and a binary heap dump (jmap -dump);
    the process is then killed with SIGKILL. Whether or not a process was
    found, ``startup.sh`` is launched in the background afterwards.
    """
    pid = get_threadId()
    if pid > -1:
        pid_text = str(pid)
        dump_dir = javaDumpPath + '/' + pid_text
        logging.info("pid:" + pid_text + "Error")
        os.system('mkdir -p ' + dump_dir)
        # (log message, shell command) pairs, executed in order.
        steps = (
            ("start stack",
             'jstack ' + pid_text + ' > ' + dump_dir + '/jstack.txt'),
            ("start dump heap",
             'jmap -heap ' + pid_text + ' > ' + dump_dir + '/jmapheap.txt'),
            ("start dump Mem",
             ' jmap -dump:format=b,file=' + dump_dir + '/jmapdump.txt ' + pid_text),
            ("Kill Thread Id :" + pid_text,
             'kill -9 ' + pid_text),
        )
        for message, command in steps:
            logging.info(message)
            os.system(command)
    logging.info("restart tomcat")
    os.system(serverName + '/bin/startup.sh &')
# Start the watchdog. test_tomcat() only returns by raising, so the outer loop
# exists to log the failure and start monitoring again.
while True:
    try:
        test_tomcat()
    except Exception as e:
        # str(e) instead of the deprecated e.message.
        logging.error(str(e))
        logging.error(traceback.format_exc())
        # Back off before retrying so a persistent failure cannot spin the
        # loop at full speed and flood the log.
        time.sleep(testSleeptime)

最後的最後,把出問題的代碼片段給幹掉了,貌似現在服務沒再掛過。先記錄一波。

我的守護進程爲毛會被操作系統殺?

相關文章
相關標籤/搜索