PROJECT_MOVED -> https://lab.nexedi.com/nexedi/slapos
[slapos.git] / stack / monitor / monitor.py.in
1 #!{{ python_executable }}
2
3 import json
4 import os
5 import subprocess
6 import sys
7 import sqlite3
8 import time
9 import threading
10 from optparse import OptionParser, make_option
11
12
# Status labels written to the database for each element and for a whole run.
FAILURE = "FAILURE"
SUCCESS = "SUCCESS"

# The "{{ ... }}" values below are template placeholders; they are presumably
# substituted with real paths when this file is rendered -- confirm against
# the recipe that instantiates it.
db_path = "{{ monitor_parameter['db-path'] }}"
instance_path = "{{ directory['home'] }}"
monitor_dir = "{{ directory['monitor-custom-scripts'] }}"
pid_dir = "{{ directory['run'] }}"
promise_dir = "{{ directory['promise'] }}"

monitoring_file_json = "{{ monitoring_file_json }}"


def _flag(short_name, long_name, dest, help_text):
    """Build a boolean command-line switch stored under ``dest``."""
    return make_option(short_name, long_name, action="store_true",
                       dest=dest, help=help_text)


# Command-line switches handed to OptionParser.
option_list = [
    _flag("-a", "--all", "all",
          "test everything : promises, services, customs"),
    _flag("-n", "--no-write", "only_stdout",
          "just show the json output on stdout"),
    _flag("-m", "--monitors", "monitor",
          "add the custom monitoring file to the files to monitor"),
    _flag("-p", "--promises", "promise",
          "add the promises\'file to the files to monitor"),
    _flag("-s", "--services", "service",
          "add the file containing services\'pid to the files to monitor"),
]
36
class Popen(subprocess.Popen):
    """subprocess.Popen extended with a one-shot watchdog timeout."""

    def set_timeout(self, timeout):
        """Arm a watchdog that stops the child after ``timeout`` seconds.

        A daemon thread waits up to ``timeout`` seconds. If it has not been
        released by then and no return code has been recorded, it calls
        terminate(). If it is still not released 5 seconds later and there
        is still no return code, it calls kill().

        This method may be called only once per instance: the attribute is
        replaced by None, so a second call fails.

        Returns a callable taking no argument. Call it once the child has
        been waited for (for example after communicate()): it releases the
        watchdog, joins it and returns True if terminate() was sent because
        of the timeout, False otherwise.
        """
        self.set_timeout = None  # assert we're not called twice
        released = threading.Event()
        state = {'killed': False}  # mutable cell shared by both closures

        def watchdog():
            # do not call wait() or poll() because they're not thread-safe
            if released.wait(timeout) or self.returncode is not None:
                return
            try:
                self.terminate()
            except OSError:
                # The child was reaped between the returncode check and the
                # signal: it ended on its own, so it was not killed by us.
                return
            state['killed'] = True
            if released.wait(5):
                return
            if self.returncode is None:
                try:
                    self.kill()
                except OSError:
                    # Same race as for terminate(): the child is already gone.
                    pass

        watcher = threading.Thread(target=watchdog)
        watcher.daemon = True
        watcher.start()

        def killed():
            released.set()
            watcher.join()
            return state['killed']

        return killed
61
def init_db(db):
    """Create the monitoring tables in ``db`` unless they already exist.

    ``db`` must provide executescript() (an open sqlite3 connection does).
    Two tables are ensured: ``status`` (timestamp declared UNIQUE, status)
    and ``individual_status`` (timestamp, status, element, output).
    """
    schema = """
      CREATE TABLE IF NOT EXISTS status (
        timestamp INTEGER UNIQUE,
        status VARCHAR(255));
      CREATE TABLE IF NOT EXISTS individual_status (
        timestamp INTEGER,
        status VARCHAR(255),
        element VARCHAR(255),
        output TEXT);
    """
    db.executescript(schema)
73
def getListOfScripts(directory):
    """Return the executable files found directly inside ``directory``.

    The listing is not recursive and sub-directories are skipped. Each
    returned entry is ``directory`` joined with the file name, in the order
    produced by os.listdir().

    Exits the program (SystemExit carrying an explanatory message) when
    ``directory`` is not an existing directory.
    """
    # isdir() is already False for a missing path, so no separate exists().
    if not os.path.isdir(directory):
        sys.exit("There is a problem in your directories "
                 "of monitoring. Please check them")
    scripts = []
    for file_name in os.listdir(directory):
        path = os.path.join(directory, file_name)
        if os.access(path, os.X_OK) and not os.path.isdir(path):
            scripts.append(path)
    return scripts
88
89
def runServices(directory):
    """Check that the processes recorded in PID files are still reachable.

    Every entry returned by getListOfScripts(directory) is read as a PID
    file. Entries that cannot be read, or whose content is not an integer,
    are ignored.

    Returns a dict mapping each file's base name to '' when signal 0 could
    be sent to the recorded PID, or to an error message otherwise.
    """
    # NOTE(review): getListOfScripts() only returns files with the executable
    # bit set, so non-executable PID files are never examined here -- confirm
    # that this is intended.
    result = {}
    for service in getListOfScripts(directory):
        service_path = os.path.join(pid_dir, service)
        service_name = os.path.basename(service_path)
        try:
            # Close the PID file right away instead of leaking the handle.
            with open(service_path) as pid_file:
                pid = int(pid_file.read())
        ### because apache (or others) can write sockets
        ### We also ignore not readable pid files
        except (IOError, ValueError):
            continue
        try:
            # Signal 0 only performs the existence/permission check.
            os.kill(pid, 0)
        except OSError:
            result[service_name] = "This service is not running anymore"
        else:
            result[service_name] = ''
    return result
108
109
def runScripts(directory):
    """Run every executable found in ``directory`` and collect the outcome.

    Each script is started with instance_path as working directory, with an
    empty environment (the inherited one on cygwin), with stdin and stdout
    connected to os.devnull, and is stopped once script_timeout seconds
    have elapsed.

    Returns a dict mapping each script's base name to:
      - '' when it exited with status 0,
      - "Time Out" when it had to be stopped,
      - otherwise its stripped stderr, or a generic message carrying the
        exit status when stderr is empty, so that a failing script is never
        mistaken for a successful one by code testing the value's truth.
    """
    # XXX script_timeout could be passed as parameters
    script_timeout = 60  # in seconds
    result = {}
    child_env = None if sys.platform == 'cygwin' else {}
    with open(os.devnull, 'r+') as devnull:
        for script_path in getListOfScripts(directory):
            # os.path.join() returns script_path unchanged when it is
            # already absolute; the command is a one-element argv tuple.
            command = (os.path.join(promise_dir, script_path),)
            name = os.path.basename(script_path)

            process = Popen(command, cwd=instance_path, env=child_env,
                            stdin=devnull, stdout=devnull,
                            stderr=subprocess.PIPE)
            killed = process.set_timeout(script_timeout)
            stderr = process.communicate()[1]
            if killed():
                result[name] = "Time Out"
            elif process.returncode:
                result[name] = stderr.strip() or (
                    "Failed with exit code %s" % process.returncode)
            else:
                result[name] = ''
    return result
130
131
def writeFiles(monitors):
    """Record the results in the SQLite database and in the JSON file.

    ``monitors`` maps an element name to its output; an empty output counts
    as SUCCESS, anything else as FAILURE. One row per element is inserted
    into individual_status, and one row into status holding FAILURE as soon
    as any element failed. All rows share the same timestamp.

    A 'datetime' key is then added to ``monitors`` (the caller's dict is
    modified) and the dict is dumped as JSON to monitoring_file_json.
    """
    timestamp = int(time.time())
    status = SUCCESS
    db = sqlite3.connect(db_path)
    try:
        init_db(db)
        # items() works on both Python 2 and Python 3 dicts.
        for key, value in monitors.items():
            if value:
                element_status = status = FAILURE
            else:
                element_status = SUCCESS
            db.execute("insert into individual_status(timestamp, element, output, status) values (?, ?, ?, ?)", (timestamp, key, value, element_status))
        db.execute("insert into status(timestamp, status) values (?, ?)", (timestamp, status))
        db.commit()
    finally:
        # Release the connection even when an insert fails.
        db.close()
    monitors['datetime'] = time.ctime(timestamp)
    # Close the file explicitly so the JSON is flushed to disk on return.
    with open(monitoring_file_json, "w+") as json_file:
        json.dump(monitors, json_file)
148
def main():
    """Parse the command line, run the requested checks, publish the result.

    At least one of -a, -m, -p, -s is required; otherwise the program exits
    with an error message. Depending on the switches, the scripts of
    monitor_dir, the scripts of promise_dir and the PID files of pid_dir are
    checked. The merged results are printed as JSON on stdout when
    -n/--no-write is given, and handed to writeFiles() otherwise.
    """
    parser = OptionParser(option_list=option_list)
    options, _ = parser.parse_args()

    if not (options.monitor or options.promise
            or options.service or options.all):
        sys.exit("Please provide at least one arg in : -a, -m, -p, -s")

    monitors = {}
    if options.monitor or options.all:
        monitors.update(runScripts(monitor_dir))
    if options.promise or options.all:
        monitors.update(runScripts(promise_dir))
    if options.service or options.all:
        monitors.update(runServices(pid_dir))

    if options.only_stdout:
        # Single-argument call form: valid on both Python 2 and Python 3.
        print(json.dumps(monitors))
    else:
        writeFiles(monitors)
169
170
# Run the checks only when this file is executed as a script.
if __name__ == "__main__":
    main()