python 脚本在发出 API 请求时经常中断
python script breaks often while making API request
我有一个 Python 脚本，它作为 Ganeti 钩子（hook）运行，在 Ganeti 中添加、删除、关闭和启动实例之后执行。将新实例添加到 Ganeti 时，该钩子应通过 API 调用把此实例添加到 check_mk。删除 Ganeti 中的实例会触发删除 check_mk 中的对应主机。关闭实例会在 check_mk 中设置停机时间；启动实例时，如果停机时间是由该钩子设置的，则会将其删除。我们在多个位置（数据中心）拥有 Ganeti 集群。
我们使用 check-mk-raw 进行分布式监控，每个数据中心都运行着一个主节点（master）和多个从节点（slave）。因此，添加、删除等操作只能通过对 master 的 API 调用来完成。
Ganeti 钩子的 stdout 和 stderr 会被重定向到文件，这是在 Ganeti 中硬编码的。仅当脚本失败时，错误才会写入标准输出（console）；如果脚本运行成功，输出将被重定向到文件，而且大多数情况下内容并不多。所以 print() 没有帮助，因此我改用 logging 日志库。
主要问题是脚本经常中断，有时甚至没有任何日志记录。不知道是我的编码能力有限，还是网络延迟的问题。我今天添加了各种异常处理以了解发生了什么，但这没有帮助。
对于这方面的任何帮助,我将不胜感激。下面是完整的脚本。
非常感谢。
编辑: 我删除了大部分异常,因为它们并不真正相关,并修复了脚本中的一些拼写错误。
#!/usr/bin/env python
"""Manage host in monitoring."""
import os
import re
import sys
import json
import socket
import logging
import requests
# Check_MK Web API endpoint; the host-management calls in this script
# (get_host / add_host / delete_host) are POSTed here.
APIURL = 'https://checkmk.host/site/check_mk/webapi.py'
# Check_MK view endpoint; used by this script to read the "downtimes" view
# and to set or remove downtimes.
WEBURL = 'https://checkmk.host/site/check_mk/view.py'
def hook_mon_token(path='/root/.hook_mon_token'):
    """Read the monitoring secrets from the token file.

    The file is written by the ganeti puppet module. Its first line is used
    as the LDAP secret and its second line as the monitoring token.

    Args:
        path: Location of the token file. Defaults to the path the hook has
            always used, so existing callers are unaffected.

    Returns:
        A ``(ldap_secret, mon_token)`` tuple with the first two lines exactly
        as read (trailing newline included; callers strip them). A missing
        line comes back as an empty string.

    Raises:
        IOError / OSError: If the file cannot be opened.
    """
    with open(path, 'r') as token_file:
        ldap_secret = token_file.readline()
        mon_token = token_file.readline()
    return ldap_secret, mon_token
def get_datacenter(timeout=10):
    """Return the lower-cased name of the datacenter this node runs in.

    Asks the catalog service on localhost:8500 (presumably the local Consul
    agent -- confirm) for its datacenter list and uses the first entry. If
    that entry contains a dash, only the part between the first and second
    dash is kept, e.g. ``'foo-DC1'`` gives ``'dc1'``.

    Args:
        timeout: Seconds to wait for the local catalog before giving up.
            Without a timeout ``requests`` waits forever and the hook hangs.

    Returns:
        The datacenter name in lower case.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts or a non-2xx answer.
        ValueError: If the answer is not JSON or lists no datacenter.
    """
    response = requests.get('http://localhost:8500/v1/catalog/datacenters',
                            timeout=timeout)
    response.raise_for_status()
    datacenters = response.json()
    if not datacenters:
        raise ValueError('local catalog returned no datacenters')
    first = datacenters[0]
    if '-' in first:
        return first.split('-')[1].lower()
    return first.lower()
def get_tenant(datacenter):
    """Return the tenant name for the instance's first NIC.

    The name is taken from ``tenant:<name>`` entries in the
    ``GANETI_INSTANCE_NIC0_NETWORK_TAGS`` environment variable and
    lower-cased. Several matches are concatenated in order of appearance.

    Args:
        datacenter: Fallback returned when no tenant tag is present or the
            environment variable is not set at all.

    Returns:
        The tenant name, or ``datacenter`` if none was found.
    """
    # .get() instead of [...]: a missing variable now falls back to the
    # datacenter instead of aborting the hook with a KeyError.
    network_tags = os.environ.get('GANETI_INSTANCE_NIC0_NETWORK_TAGS', '')
    tenant = ''.join(re.findall(r'tenant:([\w-]+)', network_tags)).lower()
    return tenant or datacenter
def checkmk_api_call(action, timeout=30):
    """Add the instance to, or delete it from, check_mk via the Web API.

    Args:
        action: ``'add_host'`` or ``'delete_host'``.
        timeout: Seconds to wait for the check_mk server. Without a timeout
            ``requests`` blocks forever on a stalled connection.

    Returns:
        An ``(error, hostname)`` tuple. ``error`` is ``False`` on success;
        otherwise it is the API's error text, the exception that occurred,
        or -- for ``'delete_host'`` -- whatever ``get_host`` answered when
        that did not equal the instance's IP address.

    Raises:
        ValueError: If ``action`` is not one of the two supported actions
            (previously this surfaced as an UnboundLocalError).
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    server_ip = os.environ['GANETI_INSTANCE_NIC0_IP']
    params = {'action': action, '_username': 'automation', '_secret': mon_token.strip()}

    if action == 'add_host':
        attributes = {
            'ipaddress': server_ip,
            'site': datacenter,
            'tag_' + datacenter: datacenter,
            'tag_' + datacenter + '-vm': datacenter + '-vm',
            'tag_agent': 'cmk-agent',
            'tag_snmp': 'no-snmp'
        }
        if datacenter in ('dc1', 'dc2', 'dc3'):
            # These datacenters get a per-tenant folder and tenant tag.
            tenant = get_tenant(datacenter)
            hostname = tenant.upper() + '.' + instance_name
            folder = datacenter + "/" + tenant + "/hosts"
            attributes['tag_' + tenant + '-vm'] = tenant + '-vm'
        else:
            hostname = datacenter.upper() + '.' + instance_name
            folder = datacenter + "/hosts"
        request = {
            'hostname': hostname,
            'folder': folder,
            'attributes': attributes,
            'create_folders': '0'
        }
    elif action == 'delete_host':
        api_answer, request, hostname = get_host('delete_host')
        # Only delete when check_mk's IP for the host equals this instance's
        # IP; anything else (other IP, API error, exception) is an error.
        if api_answer != server_ip:
            # Never hand back a falsy "error": the caller would read it as
            # success although nothing was deleted.
            return api_answer or 'check_mk returned no IP address for the host', hostname
    else:
        raise ValueError('unsupported check_mk action: %r' % (action,))

    try:
        resp_post = requests.post(APIURL, params=params, auth=auth,
                                  data={'request': json.dumps(request)},
                                  timeout=timeout)
        answer = resp_post.json()
        result_code = answer['result_code']
        if result_code:
            apierror = answer['result'] or 'check_mk API result_code %s' % result_code
            return apierror, hostname
        return False, hostname
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as error:
        # ValueError: body was not JSON (e.g. an HTML error page);
        # KeyError/TypeError: JSON without the expected fields.
        return error, hostname
def get_host(action, timeout=30):
    """Look the instance's host up in check_mk via the ``get_host`` API call.

    The return shape depends on ``action`` and is kept for existing callers.

    Args:
        action: ``'delete_host'`` selects the detailed 3-tuple result; any
            other value selects the plain existence check.
        timeout: Seconds to wait for the check_mk server.

    Returns:
        For ``'delete_host'``: ``(answer, request, hostname)`` where
        ``answer`` is the host's ``ipaddress`` attribute on success, or the
        API error text / exception on failure.
        Otherwise: ``False`` if the lookup succeeded, else the API error
        text or the exception.
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    params = {'action': 'get_host', '_username': 'automation', '_secret': mon_token.strip()}
    if datacenter in ('dc1', 'dc2', 'dc3'):
        hostname = get_tenant(datacenter).upper() + '.' + instance_name
    else:
        hostname = datacenter.upper() + '.' + instance_name
    request = {'hostname': hostname}
    want_details = action == "delete_host"

    try:
        resp_post = requests.post(APIURL, params=params, auth=auth,
                                  data={'request': json.dumps(request)},
                                  timeout=timeout)
        answer = resp_post.json()
        result_code = answer['result_code']
        if result_code:
            outcome = answer['result'] or 'check_mk API result_code %s' % result_code
        elif want_details:
            outcome = answer['result']['attributes']['ipaddress']
        else:
            outcome = False
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as error:
        # ValueError: body was not JSON; KeyError/TypeError: unexpected
        # structure, e.g. a host without an explicit 'ipaddress' attribute.
        outcome = error

    if want_details:
        return outcome, request, hostname
    return outcome
def is_down(timeout=30):
    """Report whether the instance's host has a downtime in check_mk.

    Args:
        timeout: Seconds to wait for the check_mk server.

    Returns:
        A ``(host_is_down, down_comment, hostname)`` tuple:

        * ``(None, error, hostname)`` if the host lookup or the downtime
          query failed; ``error`` is the API error text or the exception.
        * ``(False, '', hostname)`` if the view only returned its header row.
        * ``(True, comment, hostname)`` otherwise, where ``comment`` is the
          ``downtime_comment`` of the first data row.
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    if datacenter in ('dc1', 'dc2', 'dc3'):
        hostname = get_tenant(datacenter).upper() + '.' + instance_name
    else:
        hostname = datacenter.upper() + '.' + instance_name
    # NOTE(review): host_regex is sent unanchored and '.' is a regex
    # wildcard, so this presumably also matches hosts whose name merely
    # contains this one -- confirm against check_mk's filter semantics.
    params = {
        '_username': 'automation',
        '_secret': mon_token.strip(),
        'output_format': 'JSON',
        'host_regex': hostname,
        'view_name': 'downtimes'
    }
    apierror = get_host('get_host')
    if apierror:
        return None, apierror, hostname
    try:
        resp_get = requests.get(WEBURL, params=params, auth=auth, timeout=timeout)
        resp_get.raise_for_status()
        rows = json.loads(resp_get.text)
        # First row is the column header; a lone header means no downtime.
        if len(rows) == 1:
            return False, '', hostname
        comment_column = rows[0].index('downtime_comment')
        return True, rows[1][comment_column], hostname
    except (requests.exceptions.RequestException, ValueError, IndexError,
            KeyError, TypeError) as error:
        # Besides transport errors this covers a non-JSON body, a missing
        # 'downtime_comment' column and an empty or oddly shaped answer.
        return None, error, hostname
def checkmk_web_call(action, timeout=30):
    """Set or remove a downtime for the instance through the check_mk view page.

    Args:
        action: ``'stop'`` schedules a downtime with the comment
            ``'down by ganeti shutdown'``; ``'start'`` removes the host's
            downtimes. Any other value posts only the common parameters.
        timeout: Seconds to wait for the check_mk server.

    Returns:
        ``False`` on success, otherwise the API error text from the host
        lookup or the exception raised by the request (now including HTTP
        error statuses, which used to be reported as success).
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    # Refuse to act on a host check_mk does not know.
    apierror = get_host('get_host')
    if apierror:
        return apierror
    if datacenter in ('dc1', 'dc2', 'dc3'):
        hostname = get_tenant(datacenter).upper() + '.' + instance_name
    else:
        hostname = datacenter.upper() + '.' + instance_name
    params = {
        '_do_confirm': 'yes',
        '_do_actions': 'yes',
        '_transid': '-1',
        '_username': 'automation',
        '_secret': mon_token.strip(),
        'output_format': 'JSON'
    }
    if action == 'stop':
        params.update({
            'view_name': 'host',
            'host': hostname,
            '_on_hosts': 'on',
            '_downrange__next_year': 'This+year',
            '_down_comment': 'down by ganeti shutdown'
        })
    elif action == 'start':
        # NOTE(review): host_regex is unanchored, so this presumably also
        # removes downtimes of hosts whose name contains this one -- confirm.
        params.update({
            'view_name': 'downtimes',
            'host_regex': hostname,
            '_remove_downtimes': 'Remove'
        })
    try:
        response = requests.post(WEBURL, params=params, auth=auth, timeout=timeout)
        # HTTPError is a RequestException, so a 4xx/5xx answer is returned
        # as an error below instead of passing as success.
        response.raise_for_status()
    except requests.exceptions.RequestException as error:
        return error
    return False
def gnt_action(action):
    """Run the check_mk step that belongs to a Ganeti hook path.

    Args:
        action: Hook path; ``'instance-add'``, ``'instance-remove'``,
            ``'instance-start'`` and ``'instance-stop'`` are handled, any
            other value is ignored.

    On failure the error is logged, the last line of the log file is echoed
    to stdout and the process exits with status 1.
    """
    logger = logging.getLogger(__name__)

    def fail(message, *args):
        # The text must be the format string and the details its arguments.
        # The old logger.error(apierror, 'text') used the error as format
        # string with a stray argument, so the record failed to format and
        # never reached the log file.
        logger.error(message, *args)
        os.system('tail -1 /tmp/monitoring_hook.log')
        sys.exit(1)

    if action == 'instance-add':
        apierror, hostname = checkmk_api_call('add_host')
        if apierror:
            fail('Could not add "%s" to check_mk! Please add it manually! (%s)',
                 hostname, apierror)
        logger.info('Added "%s" successfully to check_mk. Please activete changes in WATO', hostname)
    elif action == 'instance-remove':
        apierror, hostname = checkmk_api_call('delete_host')
        if apierror:
            fail('Could not remove "%s" from check_mk! Please remove it manually! (%s)',
                 hostname, apierror)
        logger.info('Removed "%s" successfully from check_mk. Please activate changes in WATO', hostname)
    elif action == 'instance-start':
        host_is_down, down_comment, hostname = is_down()
        if host_is_down is None:
            # On failure is_down() carries the error in the comment slot.
            fail('%s', down_comment)
        elif host_is_down and down_comment == 'down by ganeti shutdown':
            # Only lift downtimes that carry this hook's own comment.
            apierror = checkmk_web_call('start')
            if apierror:
                fail('%s', apierror)
            logger.info('Removed down time successfully for "%s" in check_mk', hostname)
        else:
            logger.info('Nothing to do')
    elif action == 'instance-stop':
        host_is_down, down_comment, hostname = is_down()
        if host_is_down is None:
            fail('%s', down_comment)
        elif host_is_down is False:
            apierror = checkmk_web_call('stop')
            if apierror:
                fail('%s', apierror)
            logger.info('Set down time successfully for "%s" in check_mk', hostname)
        else:
            logger.info('Nothing to do. "%s" is already down', hostname)
def main():
    """Add Hook for Ganeti to add new instance to monitoring.

    Sets up logging to /tmp/monitoring_hook.log and, if this node's FQDN
    equals ``GANETI_MASTER``, dispatches ``GANETI_HOOKS_PATH`` to
    ``gnt_action``. Instances whose ``GANETI_POST_INSTANCE_TAGS`` contain
    ``monitoring:no`` are skipped with exit status 0.

    Raises:
        KeyError: If ``GANETI_MASTER`` or ``GANETI_HOOKS_PATH`` is not set.
    """
    logger = logging.getLogger(__name__)
    log_file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(process)s - %(message)s')
    log_file_handler = logging.FileHandler('/tmp/monitoring_hook.log')
    log_file_handler.setFormatter(log_file_format)
    log_file_handler.setLevel(logging.DEBUG)
    logger.addHandler(log_file_handler)
    logger.setLevel(logging.INFO)

    if socket.getfqdn() != os.environ['GANETI_MASTER']:
        return
    action = os.environ['GANETI_HOOKS_PATH']
    # .get(): a hook run without this variable is treated as "no tags"
    # instead of dying with a KeyError.
    instance_tags = os.environ.get('GANETI_POST_INSTANCE_TAGS', '')
    if 'monitoring:no' in instance_tags:
        logger.info('VM will not be added to check_mk')
        sys.exit(0)
    try:
        gnt_action(action)
    except Exception:
        # Last line of defence: make sure unexpected errors reach the log
        # file. sys.exit() raises SystemExit, which is not caught here.
        logger.exception('Unhandled error while running hook "%s"', action)
        raise
if __name__ == "__main__":
    # The parent exits immediately with status 0; only the forked child
    # goes on to run main().
    try:
        pid = os.fork()
        if pid > 0:
            # Exit parent process
            sys.exit(0)
    except OSError as e:
        # "except OSError, e" is Python 2-only syntax; the "as" form parses
        # on Python 2.6+ and Python 3 and matches the rest of this file.
        print('fork failed: %d (%s)' % (e.errno, e.strerror))
        sys.exit(1)
    main()
经过更多调试后,我发现脚本仅在某些数据中心失败,而在其他数据中心总是成功,很明显这是网络问题。
API 请求被发送到监控服务器的 WAN IP，所以在找到根本原因之前，我暂时在 /etc/hosts 中将其替换为 LAN IP。
很抱歉发了这个无关紧要的帖子，因为脚本本身做了它应该做的事情。
我有一个 Python 脚本，它作为 Ganeti 钩子（hook）运行，在 Ganeti 中添加、删除、关闭和启动实例之后执行。将新实例添加到 Ganeti 时，该钩子应通过 API 调用把此实例添加到 check_mk。删除 Ganeti 中的实例会触发删除 check_mk 中的对应主机。关闭实例会在 check_mk 中设置停机时间；启动实例时，如果停机时间是由该钩子设置的，则会将其删除。我们在多个位置（数据中心）拥有 Ganeti 集群。
我们使用 check-mk-raw 进行分布式监控，每个数据中心都运行着一个主节点（master）和多个从节点（slave）。因此，添加、删除等操作只能通过对 master 的 API 调用来完成。
Ganeti 钩子的 stdout 和 stderr 会被重定向到文件，这是在 Ganeti 中硬编码的。仅当脚本失败时，错误才会写入标准输出（console）；如果脚本运行成功，输出将被重定向到文件，而且大多数情况下内容并不多。所以 print() 没有帮助，因此我改用 logging 日志库。
主要问题是脚本经常中断，有时甚至没有任何日志记录。不知道是我的编码能力有限，还是网络延迟的问题。我今天添加了各种异常处理以了解发生了什么，但这没有帮助。
对于这方面的任何帮助,我将不胜感激。下面是完整的脚本。
非常感谢。
编辑: 我删除了大部分异常,因为它们并不真正相关,并修复了脚本中的一些拼写错误。
#!/usr/bin/env python
"""Manage host in monitoring."""
import os
import re
import sys
import json
import socket
import logging
import requests
# Check_MK Web API endpoint; the host-management calls in this script
# (get_host / add_host / delete_host) are POSTed here.
APIURL = 'https://checkmk.host/site/check_mk/webapi.py'
# Check_MK view endpoint; used by this script to read the "downtimes" view
# and to set or remove downtimes.
WEBURL = 'https://checkmk.host/site/check_mk/view.py'
def hook_mon_token(path='/root/.hook_mon_token'):
    """Read the monitoring secrets from the token file.

    The file is written by the ganeti puppet module. Its first line is used
    as the LDAP secret and its second line as the monitoring token.

    Args:
        path: Location of the token file. Defaults to the path the hook has
            always used, so existing callers are unaffected.

    Returns:
        A ``(ldap_secret, mon_token)`` tuple with the first two lines exactly
        as read (trailing newline included; callers strip them). A missing
        line comes back as an empty string.

    Raises:
        IOError / OSError: If the file cannot be opened.
    """
    with open(path, 'r') as token_file:
        ldap_secret = token_file.readline()
        mon_token = token_file.readline()
    return ldap_secret, mon_token
def get_datacenter(timeout=10):
    """Return the lower-cased name of the datacenter this node runs in.

    Asks the catalog service on localhost:8500 (presumably the local Consul
    agent -- confirm) for its datacenter list and uses the first entry. If
    that entry contains a dash, only the part between the first and second
    dash is kept, e.g. ``'foo-DC1'`` gives ``'dc1'``.

    Args:
        timeout: Seconds to wait for the local catalog before giving up.
            Without a timeout ``requests`` waits forever and the hook hangs.

    Returns:
        The datacenter name in lower case.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts or a non-2xx answer.
        ValueError: If the answer is not JSON or lists no datacenter.
    """
    response = requests.get('http://localhost:8500/v1/catalog/datacenters',
                            timeout=timeout)
    response.raise_for_status()
    datacenters = response.json()
    if not datacenters:
        raise ValueError('local catalog returned no datacenters')
    first = datacenters[0]
    if '-' in first:
        return first.split('-')[1].lower()
    return first.lower()
def get_tenant(datacenter):
    """Return the tenant name for the instance's first NIC.

    The name is taken from ``tenant:<name>`` entries in the
    ``GANETI_INSTANCE_NIC0_NETWORK_TAGS`` environment variable and
    lower-cased. Several matches are concatenated in order of appearance.

    Args:
        datacenter: Fallback returned when no tenant tag is present or the
            environment variable is not set at all.

    Returns:
        The tenant name, or ``datacenter`` if none was found.
    """
    # .get() instead of [...]: a missing variable now falls back to the
    # datacenter instead of aborting the hook with a KeyError.
    network_tags = os.environ.get('GANETI_INSTANCE_NIC0_NETWORK_TAGS', '')
    tenant = ''.join(re.findall(r'tenant:([\w-]+)', network_tags)).lower()
    return tenant or datacenter
def checkmk_api_call(action, timeout=30):
    """Add the instance to, or delete it from, check_mk via the Web API.

    Args:
        action: ``'add_host'`` or ``'delete_host'``.
        timeout: Seconds to wait for the check_mk server. Without a timeout
            ``requests`` blocks forever on a stalled connection.

    Returns:
        An ``(error, hostname)`` tuple. ``error`` is ``False`` on success;
        otherwise it is the API's error text, the exception that occurred,
        or -- for ``'delete_host'`` -- whatever ``get_host`` answered when
        that did not equal the instance's IP address.

    Raises:
        ValueError: If ``action`` is not one of the two supported actions
            (previously this surfaced as an UnboundLocalError).
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    server_ip = os.environ['GANETI_INSTANCE_NIC0_IP']
    params = {'action': action, '_username': 'automation', '_secret': mon_token.strip()}

    if action == 'add_host':
        attributes = {
            'ipaddress': server_ip,
            'site': datacenter,
            'tag_' + datacenter: datacenter,
            'tag_' + datacenter + '-vm': datacenter + '-vm',
            'tag_agent': 'cmk-agent',
            'tag_snmp': 'no-snmp'
        }
        if datacenter in ('dc1', 'dc2', 'dc3'):
            # These datacenters get a per-tenant folder and tenant tag.
            tenant = get_tenant(datacenter)
            hostname = tenant.upper() + '.' + instance_name
            folder = datacenter + "/" + tenant + "/hosts"
            attributes['tag_' + tenant + '-vm'] = tenant + '-vm'
        else:
            hostname = datacenter.upper() + '.' + instance_name
            folder = datacenter + "/hosts"
        request = {
            'hostname': hostname,
            'folder': folder,
            'attributes': attributes,
            'create_folders': '0'
        }
    elif action == 'delete_host':
        api_answer, request, hostname = get_host('delete_host')
        # Only delete when check_mk's IP for the host equals this instance's
        # IP; anything else (other IP, API error, exception) is an error.
        if api_answer != server_ip:
            # Never hand back a falsy "error": the caller would read it as
            # success although nothing was deleted.
            return api_answer or 'check_mk returned no IP address for the host', hostname
    else:
        raise ValueError('unsupported check_mk action: %r' % (action,))

    try:
        resp_post = requests.post(APIURL, params=params, auth=auth,
                                  data={'request': json.dumps(request)},
                                  timeout=timeout)
        answer = resp_post.json()
        result_code = answer['result_code']
        if result_code:
            apierror = answer['result'] or 'check_mk API result_code %s' % result_code
            return apierror, hostname
        return False, hostname
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as error:
        # ValueError: body was not JSON (e.g. an HTML error page);
        # KeyError/TypeError: JSON without the expected fields.
        return error, hostname
def get_host(action, timeout=30):
    """Look the instance's host up in check_mk via the ``get_host`` API call.

    The return shape depends on ``action`` and is kept for existing callers.

    Args:
        action: ``'delete_host'`` selects the detailed 3-tuple result; any
            other value selects the plain existence check.
        timeout: Seconds to wait for the check_mk server.

    Returns:
        For ``'delete_host'``: ``(answer, request, hostname)`` where
        ``answer`` is the host's ``ipaddress`` attribute on success, or the
        API error text / exception on failure.
        Otherwise: ``False`` if the lookup succeeded, else the API error
        text or the exception.
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    params = {'action': 'get_host', '_username': 'automation', '_secret': mon_token.strip()}
    if datacenter in ('dc1', 'dc2', 'dc3'):
        hostname = get_tenant(datacenter).upper() + '.' + instance_name
    else:
        hostname = datacenter.upper() + '.' + instance_name
    request = {'hostname': hostname}
    want_details = action == "delete_host"

    try:
        resp_post = requests.post(APIURL, params=params, auth=auth,
                                  data={'request': json.dumps(request)},
                                  timeout=timeout)
        answer = resp_post.json()
        result_code = answer['result_code']
        if result_code:
            outcome = answer['result'] or 'check_mk API result_code %s' % result_code
        elif want_details:
            outcome = answer['result']['attributes']['ipaddress']
        else:
            outcome = False
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as error:
        # ValueError: body was not JSON; KeyError/TypeError: unexpected
        # structure, e.g. a host without an explicit 'ipaddress' attribute.
        outcome = error

    if want_details:
        return outcome, request, hostname
    return outcome
def is_down(timeout=30):
    """Report whether the instance's host has a downtime in check_mk.

    Args:
        timeout: Seconds to wait for the check_mk server.

    Returns:
        A ``(host_is_down, down_comment, hostname)`` tuple:

        * ``(None, error, hostname)`` if the host lookup or the downtime
          query failed; ``error`` is the API error text or the exception.
        * ``(False, '', hostname)`` if the view only returned its header row.
        * ``(True, comment, hostname)`` otherwise, where ``comment`` is the
          ``downtime_comment`` of the first data row.
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    if datacenter in ('dc1', 'dc2', 'dc3'):
        hostname = get_tenant(datacenter).upper() + '.' + instance_name
    else:
        hostname = datacenter.upper() + '.' + instance_name
    # NOTE(review): host_regex is sent unanchored and '.' is a regex
    # wildcard, so this presumably also matches hosts whose name merely
    # contains this one -- confirm against check_mk's filter semantics.
    params = {
        '_username': 'automation',
        '_secret': mon_token.strip(),
        'output_format': 'JSON',
        'host_regex': hostname,
        'view_name': 'downtimes'
    }
    apierror = get_host('get_host')
    if apierror:
        return None, apierror, hostname
    try:
        resp_get = requests.get(WEBURL, params=params, auth=auth, timeout=timeout)
        resp_get.raise_for_status()
        rows = json.loads(resp_get.text)
        # First row is the column header; a lone header means no downtime.
        if len(rows) == 1:
            return False, '', hostname
        comment_column = rows[0].index('downtime_comment')
        return True, rows[1][comment_column], hostname
    except (requests.exceptions.RequestException, ValueError, IndexError,
            KeyError, TypeError) as error:
        # Besides transport errors this covers a non-JSON body, a missing
        # 'downtime_comment' column and an empty or oddly shaped answer.
        return None, error, hostname
def checkmk_web_call(action, timeout=30):
    """Set or remove a downtime for the instance through the check_mk view page.

    Args:
        action: ``'stop'`` schedules a downtime with the comment
            ``'down by ganeti shutdown'``; ``'start'`` removes the host's
            downtimes. Any other value posts only the common parameters.
        timeout: Seconds to wait for the check_mk server.

    Returns:
        ``False`` on success, otherwise the API error text from the host
        lookup or the exception raised by the request (now including HTTP
        error statuses, which used to be reported as success).
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    # Refuse to act on a host check_mk does not know.
    apierror = get_host('get_host')
    if apierror:
        return apierror
    if datacenter in ('dc1', 'dc2', 'dc3'):
        hostname = get_tenant(datacenter).upper() + '.' + instance_name
    else:
        hostname = datacenter.upper() + '.' + instance_name
    params = {
        '_do_confirm': 'yes',
        '_do_actions': 'yes',
        '_transid': '-1',
        '_username': 'automation',
        '_secret': mon_token.strip(),
        'output_format': 'JSON'
    }
    if action == 'stop':
        params.update({
            'view_name': 'host',
            'host': hostname,
            '_on_hosts': 'on',
            '_downrange__next_year': 'This+year',
            '_down_comment': 'down by ganeti shutdown'
        })
    elif action == 'start':
        # NOTE(review): host_regex is unanchored, so this presumably also
        # removes downtimes of hosts whose name contains this one -- confirm.
        params.update({
            'view_name': 'downtimes',
            'host_regex': hostname,
            '_remove_downtimes': 'Remove'
        })
    try:
        response = requests.post(WEBURL, params=params, auth=auth, timeout=timeout)
        # HTTPError is a RequestException, so a 4xx/5xx answer is returned
        # as an error below instead of passing as success.
        response.raise_for_status()
    except requests.exceptions.RequestException as error:
        return error
    return False
def gnt_action(action):
    """Run the check_mk step that belongs to a Ganeti hook path.

    Args:
        action: Hook path; ``'instance-add'``, ``'instance-remove'``,
            ``'instance-start'`` and ``'instance-stop'`` are handled, any
            other value is ignored.

    On failure the error is logged, the last line of the log file is echoed
    to stdout and the process exits with status 1.
    """
    logger = logging.getLogger(__name__)

    def fail(message, *args):
        # The text must be the format string and the details its arguments.
        # The old logger.error(apierror, 'text') used the error as format
        # string with a stray argument, so the record failed to format and
        # never reached the log file.
        logger.error(message, *args)
        os.system('tail -1 /tmp/monitoring_hook.log')
        sys.exit(1)

    if action == 'instance-add':
        apierror, hostname = checkmk_api_call('add_host')
        if apierror:
            fail('Could not add "%s" to check_mk! Please add it manually! (%s)',
                 hostname, apierror)
        logger.info('Added "%s" successfully to check_mk. Please activete changes in WATO', hostname)
    elif action == 'instance-remove':
        apierror, hostname = checkmk_api_call('delete_host')
        if apierror:
            fail('Could not remove "%s" from check_mk! Please remove it manually! (%s)',
                 hostname, apierror)
        logger.info('Removed "%s" successfully from check_mk. Please activate changes in WATO', hostname)
    elif action == 'instance-start':
        host_is_down, down_comment, hostname = is_down()
        if host_is_down is None:
            # On failure is_down() carries the error in the comment slot.
            fail('%s', down_comment)
        elif host_is_down and down_comment == 'down by ganeti shutdown':
            # Only lift downtimes that carry this hook's own comment.
            apierror = checkmk_web_call('start')
            if apierror:
                fail('%s', apierror)
            logger.info('Removed down time successfully for "%s" in check_mk', hostname)
        else:
            logger.info('Nothing to do')
    elif action == 'instance-stop':
        host_is_down, down_comment, hostname = is_down()
        if host_is_down is None:
            fail('%s', down_comment)
        elif host_is_down is False:
            apierror = checkmk_web_call('stop')
            if apierror:
                fail('%s', apierror)
            logger.info('Set down time successfully for "%s" in check_mk', hostname)
        else:
            logger.info('Nothing to do. "%s" is already down', hostname)
def main():
    """Add Hook for Ganeti to add new instance to monitoring.

    Sets up logging to /tmp/monitoring_hook.log and, if this node's FQDN
    equals ``GANETI_MASTER``, dispatches ``GANETI_HOOKS_PATH`` to
    ``gnt_action``. Instances whose ``GANETI_POST_INSTANCE_TAGS`` contain
    ``monitoring:no`` are skipped with exit status 0.

    Raises:
        KeyError: If ``GANETI_MASTER`` or ``GANETI_HOOKS_PATH`` is not set.
    """
    logger = logging.getLogger(__name__)
    log_file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(process)s - %(message)s')
    log_file_handler = logging.FileHandler('/tmp/monitoring_hook.log')
    log_file_handler.setFormatter(log_file_format)
    log_file_handler.setLevel(logging.DEBUG)
    logger.addHandler(log_file_handler)
    logger.setLevel(logging.INFO)

    if socket.getfqdn() != os.environ['GANETI_MASTER']:
        return
    action = os.environ['GANETI_HOOKS_PATH']
    # .get(): a hook run without this variable is treated as "no tags"
    # instead of dying with a KeyError.
    instance_tags = os.environ.get('GANETI_POST_INSTANCE_TAGS', '')
    if 'monitoring:no' in instance_tags:
        logger.info('VM will not be added to check_mk')
        sys.exit(0)
    try:
        gnt_action(action)
    except Exception:
        # Last line of defence: make sure unexpected errors reach the log
        # file. sys.exit() raises SystemExit, which is not caught here.
        logger.exception('Unhandled error while running hook "%s"', action)
        raise
if __name__ == "__main__":
    # The parent exits immediately with status 0; only the forked child
    # goes on to run main().
    try:
        pid = os.fork()
        if pid > 0:
            # Exit parent process
            sys.exit(0)
    except OSError as e:
        # "except OSError, e" is Python 2-only syntax; the "as" form parses
        # on Python 2.6+ and Python 3 and matches the rest of this file.
        print('fork failed: %d (%s)' % (e.errno, e.strerror))
        sys.exit(1)
    main()
经过更多调试后,我发现脚本仅在某些数据中心失败,而在其他数据中心总是成功,很明显这是网络问题。
API 请求被发送到监控服务器的 WAN IP，所以在找到根本原因之前，我暂时在 /etc/hosts 中将其替换为 LAN IP。
很抱歉发了这个无关紧要的帖子，因为脚本本身做了它应该做的事情。