nrpe を使用して python スクリプトを実行し、rabbitmq を監視しようとしています。スクリプト内では「sudo rabbitmqctl list_queues」というコマンドを実行しており、これは各キューのメッセージ数を表示します。ところが、nagios には次のメッセージが表示されます:
CRITICAL - Command '['sudo', 'rabbitmqctl', 'list_queues']' returned non-zero exit status 1
これは権限の問題かもしれないと考え、次のように設定しました。
/etc/group:
ec2-user:x:500:
rabbitmq:x:498:nrpe,nagios,ec2-user
nagios:x:497:
nrpe:x:496:
rpc:x:32:
/etc/sudoers:
%rabbitmq ALL=NOPASSWD: /usr/sbin/rabbitmqctl
Nagios の設定:
command[check_rabbitmq_queuecount_prod]=/usr/bin/python27 /etc/nagios/check_rabbitmq_prod -a queues_count -C 3000 -W 1500
check_rabbitmq_prod:
#!/usr/bin/env python
from optparse import OptionParser
import shlex
import subprocess
import sys
class RabbitCmdWrapper(object):
    """Run ``rabbitmqctl`` commands and return their parsed output.

    The commands are invoked through ``sudo``, so the calling user typically
    needs sudo rights for ``rabbitmqctl``. Kept as its own class so it could
    be used in other monitoring tools if desired.
    """

    @classmethod
    def list_queues(cls):
        """Return the rows printed by ``sudo rabbitmqctl list_queues``.

        Returns:
            A list of rows; each row is the list of tab-separated fields of
            one output line (see ``_parse_list_results``).

        Raises:
            subprocess.CalledProcessError: the command exited non-zero.
        """
        args = shlex.split('sudo rabbitmqctl list_queues')
        # universal_newlines=True makes check_output return text instead of
        # bytes on Python 3, so the str-based parsing below keeps working.
        cmd_result = subprocess.check_output(args, universal_newlines=True)
        return cls._parse_list_results(cmd_result)

    @classmethod
    def _parse_list_results(cls, result_string):
        """Split raw ``rabbitmqctl`` list output into rows of fields.

        The first and last lines of the (stripped) output are treated as
        banner text and discarded; every remaining line is split on tabs.

        Args:
            result_string: the command output as a single string.

        Returns:
            A list of lists of strings. Empty when the output has two lines
            or fewer.
        """
        lines = result_string.strip().split('\n')
        # Drop the banner lines by position. list.remove() deletes the first
        # *equal* element, which discards the wrong line when the first and
        # last lines are identical, and indexing fails on empty output.
        return [row.split('\t') for row in lines[1:-1]]
def check_queues_count(critical=1000, warning=1000):
    """Check the message count of ``SFS_Production_Queue`` against thresholds.

    Prints one Nagios-style status line and terminates the process with
    ``sys.exit``: 2 for CRITICAL, 1 for WARNING, 0 for OK. Any exception
    raised while listing or parsing the queues is reported as CRITICAL.

    Args:
        critical: count at or above which the queue is CRITICAL. ``None``
            falls back to 1000.
        warning: count at or above which the queue is WARNING. ``None``
            falls back to 1000.

    TODO: Possibly break this out so test can be done on individual queues.
    """
    # The command-line entry point passes None when -C / -W is omitted.
    # Comparing an int with None makes every count CRITICAL on Python 2 and
    # raises TypeError on Python 3, so fall back to the declared defaults.
    if critical is None:
        critical = 1000
    if warning is None:
        warning = 1000
    try:
        critical_q = []
        warning_q = []
        ok_q = []
        for queue in RabbitCmdWrapper.list_queues():
            # Only the production queue is evaluated; other rows are ignored.
            if queue[0] != 'SFS_Production_Queue':
                continue
            count = int(queue[1])
            entry = "%s: %s" % (queue[0], count)
            if count >= critical:
                critical_q.append(entry)
            elif count >= warning:
                warning_q.append(entry)
            else:
                ok_q.append(entry)
        # NOTE: when the queue is not listed at all, all three lists stay
        # empty and the check reports "OK - " with nothing after it.
        if critical_q:
            print("CRITICAL - %s" % ", ".join(critical_q))
            sys.exit(2)
        elif warning_q:
            print("WARNING - %s" % ", ".join(warning_q))
            sys.exit(1)
        else:
            print("OK - %s" % ", ".join(ok_q))
            sys.exit(0)
    # sys.exit raises SystemExit, which is not an Exception subclass, so the
    # exits above pass through this handler untouched.
    except Exception as err:
        print("CRITICAL - %s" % err)
        sys.exit(2)
# Usage text handed to OptionParser and printed when the action is invalid.
USAGE = """Usage: ./check_rabbitmq -a [action] -C [critical] -W [warning]
Actions:
- queues_count
checks the count in each of the queues in rabbitmq's list_queues"""

if __name__ == "__main__":
    # Command-line entry point: parse the options and dispatch on --action.
    parser = OptionParser(USAGE)
    parser.add_option("-a", "--action", dest="action",
                      help="Action to Check")
    parser.add_option("-C", "--critical", dest="critical",
                      type="int", help="Critical Threshold")
    parser.add_option("-W", "--warning", dest="warning",
                      type="int", help="Warning Threshold")
    (options, args) = parser.parse_args()
    if options.action == "queues_count":
        check_queues_count(options.critical, options.warning)
    else:
        print("Invalid action: %s" % options.action)
        print(USAGE)
        # Without an explicit status the script exits 0, which Nagios
        # reads as OK. 3 is the plugin return code for UNKNOWN.
        sys.exit(3)
この時点で、何がスクリプトの実行を妨げているのかわかりません。コマンドラインから直接実行すると、スクリプトは正常に動作します。どんな助けでも大歓迎です。