-
Notifications
You must be signed in to change notification settings - Fork 62
/
file_store.py
681 lines (554 loc) · 18.2 KB
/
file_store.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
# -*- coding: utf-8 -*-
# Copyright (C) 2008-2022, Luis Pedro Coelho <luis@luispedro.org>
# vim: set ts=4 sts=4 sw=4 expandtab smartindent:
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
'''
file_store : file-system based data store & locks.
'''
import os
import sys
from os import path
from os.path import dirname, exists
import errno
import logging
import tempfile
import shutil
from subprocess import Popen
from time import time
from .base import base_store, base_lock
from jug.backends.encode import encode_to, decode_from
# Size threshold, compared against os.stat().st_size (i.e., bytes): files at
# or below this size are moved into the pack by file_store.update_pack()
MAX_FILESIZE_IN_PACK = 512
def fsync_dir(fname):
    '''
    fsync_dir(fname)

    Call fsync() on the directory that contains `fname`.

    Parameters
    ----------
    fname : str
        Path of a file. Only its parent directory (``dirname(fname)``) is
        opened and synced; if `fname` has no directory component, the current
        directory is used.

    Raises
    ------
    OSError
        On Linux, if the parent directory cannot be opened. On any platform,
        if fsync() fails with an errno other than EINVAL.
    '''
    # dirname('file') is '', which cannot be opened: it means "here"
    parent = dirname(fname) or os.curdir
    try:
        fd = os.open(parent, os.O_RDONLY)
    except OSError:
        # It seems that, on Windows and related platforms (cygwin...), you
        # cannot open a directory to get a file descriptor, so the call above
        # raises an error.
        if not sys.platform.startswith('linux'):
            return
        # On Linux, we still want to check what's wrong
        raise
    try:
        os.fsync(fd)
    except OSError as err:
        # EINVAL is tolerated; any other failure is reported to the caller
        if err.errno != errno.EINVAL:
            raise
    finally:
        os.close(fd)
class file_store(base_store):
    '''
    file_store: store objects as files inside a directory.

    Layout under ``jugdir``, as used by the methods of this class:

    - ``<name[:2]>/<name[2:]>``: one file per stored object
    - ``packs/jugpack``: a single file holding the ``packed`` dictionary
    - ``tempfiles/``: temporary files, later renamed into place
    - ``locks/``: lock files (see ``getlock``)
    '''

    def __init__(self, dname, compress_numpy=False):
        '''
        file_store(dname, compress_numpy=False)

        No directory is created here (see ``create``). If a pack file already
        exists under `dname`, its contents are loaded into ``self.packed``.

        Parameters
        ----------
        dname : str
            Directory of the store (one trailing ``/`` is dropped)
        compress_numpy : bool, optional
            If false (the default), ``dump`` writes numpy arrays directly in
            numpy's own file format instead of passing them to ``encode_to``
        '''
        if dname.endswith('/'):
            dname = dname[:-1]
        self.jugdir = dname
        self.compress_numpy = compress_numpy
        self.packed = {}
        if path.exists(self._packfile()):
            with open(self._packfile(), 'rb') as pfile:
                self.packed = decode_from(pfile)

    def _packfile(self):
        '''Internal. Path of the pack file.'''
        return path.join(self.jugdir, 'packs', 'jugpack')

    def __repr__(self):
        return 'file_store({})'.format(self.jugdir)
    __str__ = __repr__

    def create(self):
        '''
        Recursively create directories.
        '''
        os.makedirs(self.jugdir, exist_ok=True)
        os.makedirs(self.tempdir(), exist_ok=True)

    def _maybe_create(self):
        '''
        Calls self.create() the first time it is called; then becomes a no-op.
        '''
        self.create()
        # Shadow this method on the instance so that later calls do nothing
        self._maybe_create = (lambda : None)

    def tempdir(self):
        '''Path of the directory used for temporary files.'''
        return path.join(self.jugdir, 'tempfiles')

    def _getfname(self, name):
        '''
        Internal. Map a key (``str``, or bytes which are decoded as UTF-8) to
        the path of its file: the first two characters of the key name a
        subdirectory, the remaining ones the file inside it.
        '''
        if not isinstance(name, str):
            name = name.decode('utf-8')
        return path.join(self.jugdir, name[:2], name[2:])

    def _write_atomically(self, target, writer):
        '''
        Internal. Call ``writer(output)`` on a fresh temporary file, sync it
        to disk and rename it to `target`. If anything fails, the temporary
        file is removed and the error propagates.
        '''
        fd, fname = tempfile.mkstemp('.jugtmp', 'jugtemp', self.tempdir())
        renamed = False
        try:
            with os.fdopen(fd, 'wb') as output:
                writer(output)
                output.flush()
                os.fsync(output.fileno())
            # Rename is atomic even over NFS.
            fsync_dir(fname)
            os.rename(fname, target)
            renamed = True
        finally:
            if not renamed:
                # Do not leave a stale temporary file behind
                try:
                    os.unlink(fname)
                except OSError:
                    pass

    def _write_raw_numpy(self, value, output):
        '''
        Internal. If `value` is exactly a ``numpy.ndarray`` (and
        ``compress_numpy`` is off), write it to `output` in numpy's own file
        format.

        Returns
        -------
        written : bool
            False if nothing was written (not applicable, numpy missing, or
            writing failed); `output` is then empty again and positioned at
            its start so that another format can be written to it.
        '''
        if self.compress_numpy:
            return False
        try:
            import numpy as np
        except ImportError:
            return False
        if type(value) is not np.ndarray:
            return False
        try:
            np.lib.format.write_array(output, value)
        except (OSError, ValueError):
            # Part of the file (e.g., the header) may already have been
            # written: discard it so the fallback starts from a clean file
            output.seek(0)
            output.truncate()
            return False
        return True

    def _load_file(self, fname):
        '''
        Internal. Read the object stored in the file `fname`: first try
        numpy's own file format, then fall back to ``decode_from``.
        '''
        with open(fname, 'rb') as ifile:
            try:
                import numpy as np
                return np.lib.format.read_array(ifile)
            except ValueError:
                # Not readable as a numpy file: rewind for decode_from
                ifile.seek(0)
            except ImportError:
                pass
            return decode_from(ifile)

    def dump(self, value, name):
        '''
        store.dump(value, name)

        Performs roughly the same as

        pickle.dump(value, open(name,'w'))

        but does it in a way that is guaranteed to be atomic even over NFS and
        using compression on the disk for faster access.

        Parameters
        ----------
        value : any
            Object to store
        name : str or bytes
            Key to store it under
        '''
        if name in self.packed:
            # The new value goes to its own file: drop the packed copy
            del self.packed[name]
            self.resave_pack()

        self._maybe_create()
        name = self._getfname(name)
        os.makedirs(dirname(name), exist_ok=True)

        def write_value(output):
            if not self._write_raw_numpy(value, output):
                encode_to(value, output)
        self._write_atomically(name, write_value)

    def _iter_filekeys(self):
        '''
        for f in store._iter_filekeys():
            ...

        Internal. Use `self.keys()`

        Yields (as ASCII-encoded bytes) the key of every file found in a
        two-character subdirectory of the store, in sorted order.
        '''
        if not exists(self.jugdir):
            return

        for d in sorted(os.listdir(self.jugdir)):
            # Object files live in two-character subdirectories; this skips
            # 'locks', 'packs' and 'tempfiles'
            if len(d) == 2:
                for f in sorted(os.listdir(path.join(self.jugdir, d))):
                    yield (d+f).encode('ascii')

    def list(self):
        '''
        keys = store.list()

        Returns a list of all the keys in the store
        '''
        return list(self.packed.keys()) + list(self._iter_filekeys())

    def update_pack(self):
        '''
        nr_packed = store.update_pack()

        Move every object whose file is at most MAX_FILESIZE_IN_PACK in size
        into the pack, save the pack, and then remove the individual files.

        Files are read the same way ``load`` reads them, so arrays that
        ``dump`` wrote in numpy's own format are handled too.

        Returns
        -------
        nr_packed : integer
            Number of objects moved into the pack
        '''
        to_remove = []
        for k in self._iter_filekeys():
            f = self._getfname(k)
            if os.stat(f).st_size <= MAX_FILESIZE_IN_PACK:
                self.packed[k] = self._load_file(f)
                to_remove.append(k)
        # Save the pack before deleting the files it replaces
        self.resave_pack()
        for k in to_remove:
            os.unlink(self._getfname(k))
        return len(to_remove)

    def resave_pack(self):
        '''
        store.resave_pack()

        Write ``self.packed`` to the pack file (atomically), while holding
        the 'pack-save' lock.

        Raises
        ------
        Exception
            If the lock could not be obtained after 10 attempts
        '''
        os.makedirs(path.join(self.jugdir, 'packs'), exist_ok=True)
        lock = self.getlock('pack-save')
        for i in range(10):
            if lock.get():
                break
            from time import sleep
            logging.warning("Waiting for lock file 'pack-save'")
            # Wait twice as long after each failed attempt
            sleep(2**i)
        else:
            raise Exception('Could not obtain lock to save packed data')

        try:
            self._maybe_create()
            self._write_atomically(
                    self._packfile(),
                    lambda output: encode_to(self.packed, output))
        finally:
            # Always give the lock back, even if saving failed; otherwise
            # later calls would be unable to acquire it
            lock.release()

    def listlocks(self):
        '''
        keys = store.listlocks()

        Returns a list of all the locks in the store

        This is an unsafe function as the results may be outdated by the time
        the function returns.
        '''
        if not exists(path.join(self.jugdir, 'locks')):
            return []

        keys = []
        for k in os.listdir(path.join(self.jugdir, 'locks')):
            # Strip the '.lock' suffix to recover the lock name
            keys.append(k[:-len('.lock')].encode('ascii'))
        return keys

    def can_load(self, name):
        '''
        can = store.can_load(name)

        Returns whether `name` is in the pack or has a file in the store.
        '''
        fname = self._getfname(name)
        return name in self.packed or exists(fname)

    def load(self, name):
        '''
        obj = store.load(name)

        Loads the objects. Equivalent to pickle.load(), but a bit smarter at
        times.

        Parameters
        ----------
        name : str
            Key to use

        Returns
        -------
        obj : any
            The object that was saved under ``name``
        '''
        if name in self.packed:
            return self.packed[name]
        return self._load_file(self._getfname(name))

    def remove_many(self, names):
        '''
        removed = store.remove_many(names)

        Removes the objects with the given names from the store.

        Returns
        -------
        removed : set
            The names for which a packed entry or a file was removed
        '''
        removed = set()
        pack_dirty = False
        for name in names:
            if name in self.packed:
                del self.packed[name]
                removed.add(name)
                pack_dirty = True
            try:
                os.unlink(self._getfname(name))
            except OSError:
                # No file under this name (or it could not be removed)
                pass
            else:
                removed.add(name)
        # Only rewrite the pack (which requires its lock) if it changed
        if pack_dirty:
            self.resave_pack()
        return removed

    def remove(self, name):
        '''
        was_removed = store.remove(name)

        Remove the entry associated with name.

        Returns whether any entry was actually removed.
        '''
        return bool(self.remove_many([name]))

    def cleanup(self, active, keeplocks=False):
        '''
        nr_removed = store.cleanup(active, keeplocks)

        Implement 'cleanup' command

        Parameters
        ----------
        active : sequence
            files *not to remove*
        keeplocks : boolean
            whether to preserve or remove locks

        Returns
        -------
        nr_removed : integer
            number of removed files
        '''
        active = frozenset(t.hash() for t in active)
        active_fnames = frozenset(self._getfname(h) for h in active)
        removed = 0
        for dirpath,_,fs in os.walk(self.jugdir):
            if keeplocks and path.basename(dirpath) == "locks":
                continue
            # The pack file is handled separately below
            if path.basename(dirpath) == "packs":
                continue
            for f in fs:
                f = path.join(dirpath, f)
                if f not in active_fnames:
                    os.unlink(f)
                    removed += 1

        pack_dirty = False
        for k in frozenset(self.packed.keys()) - active:
            del self.packed[k]
            pack_dirty = True
            removed += 1
        if pack_dirty:
            self.resave_pack()
        return removed

    def remove_locks(self):
        '''
        removed = store.remove_locks()

        Remove all locks

        Returns
        -------
        removed : int
            Number of locks removed
        '''
        lockdir = path.join(self.jugdir, 'locks')
        if not exists(lockdir):
            return 0

        removed = 0
        for f in os.listdir(lockdir):
            os.unlink(path.join(lockdir, f))
            removed += 1
        return removed

    def getlock(self, name):
        '''
        lock = store.getlock(name)

        Retrieve a lock object associated with ``name``.

        Parameters
        ----------
        name : str
            Key

        Returns
        -------
        lock : Lock object
            This is a file_lock object
        '''
        return file_based_lock(self.jugdir, name)

    def close(self):
        '''
        store.close()

        Has no effect on file based stores.
        '''
        pass

    def metadata(self, t):
        '''
        meta = store.metadata(t)

        Retrieves information on the state of the computation

        Parameters
        ----------
        t : Task
            A Task object

        Returns
        -------
        meta : dict
            Dictionary describing the state of the computation
        '''
        from time import ctime
        fname = self._getfname(t.hash())
        # NOTE(review): only the per-object file is checked, so a result that
        # lives only in the pack is reported as not computed
        if path.exists(fname):
            st = os.stat(fname)
            return {
                'computed': True,
                'completed': ctime(st.st_mtime),
            }
        return {
            'computed': False
        }

    @staticmethod
    def remove_store(jugdir):
        '''
        file_store.remove_store(jugdir)

        Removes from disk all the files associated with this jugdir.
        '''
        if path.exists(jugdir):
            shutil.rmtree(jugdir)
class file_based_lock(base_lock):
    '''
    file_based_lock: File-system based locks

    The lock is the file ``<jugdir>/locks/<name>.lock``: it is held while
    that file exists.

    Functions:
    ----------

    - get(): acquire the lock
    - release(): release the lock
    - is_locked(): check lock state
    - fail(): mark the lock as failed
    - is_failed(): check whether the lock is marked as failed
    '''

    # We use epoch + 1 second to mark a task as failed
    # More at https://github.com/luispedro/jug/issues/55
    _FAILED_TIMESTAMP = (1, 1) # atime, mtime

    def __init__(self, jugdir, name):
        '''
        Parameters
        ----------
        jugdir : str
            Directory of the store
        name : str or bytes
            Name of the lock (bytes are decoded as UTF-8)
        '''
        if not isinstance(name, str):
            name = str(name, 'utf-8')
        self.fullname = path.join(jugdir, 'locks', '{0}.lock'.format(name))

    def get(self):
        '''
        lock.get()

        Create a lock for name in an NFS compatible way.

        Parameters
        ----------
        None

        Returns
        -------
        locked : bool
            Whether the lock was created
        '''
        if exists(self.fullname):
            return False
        os.makedirs(path.dirname(self.fullname), exist_ok=True)

        import socket
        from datetime import datetime
        try:
            # O_EXCL makes the creation fail if the file already exists
            fd = os.open(self.fullname, os.O_RDWR|os.O_CREAT|os.O_EXCL)
        except OSError:
            return False
        try:
            with os.fdopen(fd, 'w') as F:
                F.write('PID {0} on HOSTNAME {1}\n'.format(os.getpid(), socket.gethostname()))
                F.write('Lock created on {0}\n'.format(datetime.now().strftime('%Y-%m-%d (%Hh%M.%S)')))
        except OSError:
            # The file was created but could not be filled in. As failure is
            # reported, remove it so that no lock is left behind which nobody
            # will release.
            self.release()
            return False
        return True

    def release(self):
        '''
        lock.release()

        Removes lock

        Has no effect if the lock file cannot be removed (e.g., because it
        does not exist).
        '''
        try:
            os.unlink(self.fullname)
        except OSError:
            pass

    def is_locked(self):
        '''
        locked = lock.is_locked()

        Returns whether a lock exists for name. Note that the answer can
        be invalid by the time this function returns. Only by trying to
        acquire the lock can you avoid race-conditions. See the get() function.
        '''
        return path.exists(self.fullname)

    def fail(self):
        '''
        marked = lock.fail()

        Mark a task as failed.

        Has no effect if the task isn't locked

        Returns
        -------
        marked : bool
            Whether the timestamps of the lock file could be set
        '''
        try:
            os.utime(self.fullname, self._FAILED_TIMESTAMP)
        except OSError:
            return False
        else:
            return True

    def is_failed(self):
        '''
        failed = lock.is_failed()

        Returns whether this task is marked as failed.

        This code is not race-condition free. It may happen that by the time
        this function returns, the failed lock has been released.
        '''
        if not self.is_locked():
            return False
        try:
            t = os.stat(self.fullname)
        except OSError:
            # The lock disappeared in the meantime
            return False
        # Compare against the mtime entry of (atime, mtime)
        return t.st_mtime == self._FAILED_TIMESTAMP[1]
class file_keepalive_store(file_store):
    '''
    file_keepalive_store: a file_store that hands out
    file_keepalive_based_lock objects as its locks.
    '''

    def getlock(self, name):
        '''
        lock = store.getlock(name)

        Build the lock object that corresponds to ``name``.

        Parameters
        ----------
        name : str
            Key

        Returns
        -------
        lock : Lock object
            A file_keepalive_based_lock object
        '''
        lock = file_keepalive_based_lock(self.jugdir, name)
        return lock

    def __repr__(self):
        return 'file_keepalive_store({0})'.format(self.jugdir)

    __str__ = __repr__
class file_keepalive_based_lock(file_based_lock):
    '''
    file_keepalive_based_lock: File-system based locks

    Behaves like file_based_lock, with one addition: while the lock is held,
    a helper process is kept running for it, so that the locks of
    crashed/killed jobs can be recognized as failed.

    Functions:
    ----------

    - get(): acquire the lock
    - release(): release the lock
    - is_locked(): check lock state
    '''

    def __init__(self, *args, **kwargs):
        # Handle of the helper process (None while no helper is running)
        self.monitor = None
        super().__init__(*args, **kwargs)

    def start_monitor(self):
        """Launch the side-kick process that keeps this lock refreshed for as
        long as the main process is working on the task
        """
        # No need to be gentle. Killing it straight away is safe.
        command = [
            sys.executable,
            "-m", "jug.backends.file_keepalive_monitor",
            self.fullname,
        ]
        self.monitor = Popen(command)

    def stop_monitor(self):
        """Kill the side-kick process of this lock (if there is one)
        """
        if self.monitor is not None:
            try:
                self.monitor.kill()
            except OSError as e:
                logging.warning('keepalive process failed to die with %s' % e)

            # subprocess module implements a cleanup mechanism that kicks in
            # when a Popen instance is deleted. This prevents leaking
            # subprocesses and avoids having to explicitly block and .wait()
            # for a task to finish
            del self.monitor
            self.monitor = None

    def get(self):
        '''
        lock.get()

        Create a lock for name in an NFS compatible way.

        If the lock was obtained, also start its monitor job.

        Parameters
        ----------
        None

        Returns
        -------
        locked : bool
            Whether the lock was created
        '''
        got_it = super().get()
        if not got_it:
            return got_it
        self.start_monitor()
        return got_it

    def release(self):
        '''
        lock.release()

        Stops the lock monitor job, then removes the lock
        '''
        self.stop_monitor()
        return super().release()

    def fail(self):
        '''
        lock.fail()

        Mark a task as failed.

        The monitoring process (if any) is stopped in all cases; other than
        that, this has no effect if the task isn't locked
        '''
        self.stop_monitor()
        return super().fail()

    def is_failed(self):
        '''
        failed = lock.is_failed()

        Returns whether this task has a lock in failed state.

        A lock in failed state is one that exists and was last modified more
        than 30 minutes ago.

        This code is not race-condition free. It may happen that by the time
        this function returns, the failed lock has been released.
        '''
        # A lock that is at least 30 minutes old is a dead lock
        oldest_live_mtime = time() - (30 * 60)

        if not self.is_locked():
            return False
        try:
            st = os.stat(self.fullname)
        except OSError:
            return False
        return st.st_mtime <= oldest_live_mtime