最近一個月服務器天天都掛掉,
top -c 一下,發現 CPU 使用率高達 390%。
當場懵了,
top -Hp <pid> 發現有四個線程一直在跑,
jstack 了一下,發現死鎖了。
排查了好久,暫時沒找到原因,只能決定寫個 Python 腳本監控一下了:
當服務調用不通時,先 jstack 一下,然後重新拉起服務。
這裏用到 psutil 模塊,需要先安裝一下(pip install psutil)。
# coding:utf-8
"""Watchdog for a local Tomcat instance.

Periodically POSTs to a health-check URL.  When the request fails, the
script captures JVM diagnostics (jstack / jmap) for the Tomcat process,
kills it and starts it again through ``bin/startup.sh``.

Requires the third-party ``psutil`` package.  The code runs unchanged on
Python 2 (the original target) and Python 3.
"""
import contextlib
import logging
import os
import re
import socket
import time
import traceback

try:  # Python 2
    from urllib import urlencode, urlopen
except ImportError:  # Python 3
    from urllib.parse import urlencode
    from urllib.request import urlopen

import psutil

# Log record format
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
# Timestamp format
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
# Log output file
logging.basicConfig(filename='my.log', level=logging.DEBUG,
                    format=LOG_FORMAT, datefmt=DATE_FORMAT)

# Timeout for the health-check request, in seconds
socket.setdefaulttimeout(30)

hospitalId = '10300'
accountId = '19071'
ipStr = '127.0.0.1'
portStr = '8081'

# NOTE(review): the request path and the first POST field were masked
# ("xxx...") in the original listing, which left the script syntactically
# invalid.  The placeholders below only restore valid syntax -- replace
# them with the real endpoint and parameter before deploying.
xxx = 'xxx'
urlStr = ('http://' + ipStr + ':' + portStr
          + '/xxxxxxxx/' + accountId + '/' + hospitalId)
# Encoded to bytes so the same value is accepted as POST data on Python 3
# (on Python 2 this is a no-op for ASCII text).
para = urlencode({'xxx': xxx, 'accountId': accountId}).encode('ascii')

serverName = "/usr/local/servers/tomcat-mobile"
javaDumpPath = '/home'

# Grace period after a restart during which no probe is sent, in minutes
startingTime = 1
# Pause between two iterations of the monitoring loop, in seconds
testSleeptime = 5


def get_html():
    """Send one health-check request.

    Returns:
        The response body on success, or the sentinel string ``'error'``
        when the request raises for any reason (refused connection,
        timeout, ...).  The exception is logged, not propagated.
    """
    try:
        # closing() releases the connection even if read() fails.
        with contextlib.closing(urlopen(urlStr, para)) as page:
            # NOTE: the HTTP status code is not inspected here.  Python 3's
            # urlopen raises on 4xx/5xx responses by itself; Python 2's
            # urllib.urlopen does not.
            html = page.read()
    except Exception as e:
        logging.error('%s', e)
        logging.error(traceback.format_exc())
        return 'error'
    return html


def test_tomcat():
    """Run the monitoring loop forever.

    Every ``testSleeptime`` seconds the service is probed, unless a
    restart happened less than ``startingTime`` minutes ago.  A failed
    probe triggers ``start_dump()``.
    """
    lastStartDate = 1  # far in the past, so the first probe runs at once
    while True:
        logging.info(' ')
        print('-')
        if lastStartDate + startingTime * 60 < time.time():
            html = get_html()
            if html == 'error':
                logging.error('reboot ')
                start_dump()
                lastStartDate = time.time()
            else:
                logging.info('Web test success')
        time.sleep(testSleeptime)


def get_threadId():
    """Find the process whose command line mentions ``serverName``.

    Returns:
        The PID of the first matching process, or ``-1`` if none matches.
        (Despite the name, this is a process id, not a thread id.)
    """
    # Escape the path so it is matched literally rather than as a regex.
    pattern = re.compile(re.escape(serverName))
    for pid in psutil.pids():
        try:
            cmdline = psutil.Process(pid).cmdline()
        except psutil.Error:
            # The process exited during the scan or cannot be inspected;
            # skip it instead of aborting the whole restart.
            continue
        if pattern.search(','.join(str(arg) for arg in cmdline)):
            return pid
    return -1


def start_dump():
    """Capture JVM diagnostics for Tomcat, then kill and restart it.

    Output is written below ``javaDumpPath/<pid>/``.  Nothing happens
    when no matching process is found.
    """
    pid = get_threadId()
    if pid <= -1:
        return

    pid_str = str(pid)
    dump_dir = javaDumpPath + '/' + pid_str

    logging.info("pid:" + pid_str + "Error")
    os.system('mkdir -p ' + dump_dir)

    logging.info("start stack")
    os.system('jstack ' + pid_str + ' > ' + dump_dir + '/jstack.txt')

    logging.info("start dump heap")
    os.system('jmap -heap ' + pid_str + ' > ' + dump_dir + '/jmapheap.txt')

    logging.info("start dump Mem")
    os.system(' jmap -dump:format=b,file=' + dump_dir + '/jmapdump.txt '
              + pid_str)

    logging.info("Kill Thread Id :" + pid_str)
    os.system('kill -9 ' + pid_str)

    logging.info("restart tomcat")
    os.system(serverName + '/bin/startup.sh &')


def main():
    """Start the watchdog and keep it alive across unexpected errors."""
    while True:
        try:
            test_tomcat()
        except Exception as e:
            logging.error('%s', e)
            logging.error(traceback.format_exc())
            # Back off so a persistent failure cannot busy-spin the CPU
            # and flood the log.
            time.sleep(testSleeptime)


if __name__ == '__main__':
    main()
最後的最後,把出問題的代碼片段給幹掉了,貌似現在服務沒再掛過。先記錄一波。
我的守護進程爲毛會被操作系統殺?