From 721890d22fcdf540bed33bea88f28c9fad002036 Mon Sep 17 00:00:00 2001 From: v9v <36771847+v9v@users.noreply.github.com> Date: Thu, 4 Jul 2019 19:35:47 +0200 Subject: [PATCH 1/5] Fix race condition in autosave Fixes the case where multiple tmux sessions call auto-save at the same time, which occasionally results in multiple instances of save_all() running in parallel and causing issues like #3 and tmux-plugins/tmux-resurrect#294. The sequence in main() is: 1. Check enough_time_since_last_run_passed 2. Save 3. Update last_save_timestamp. The race here is: * process A finishes step 1 and is busy with step 2. The timestamp is not updated yet. * process B comes to step 1, sees the old timestamp and proceeds to step 2, too. --- scripts/continuum_save.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/continuum_save.sh b/scripts/continuum_save.sh index c85f68a..29b24ac 100755 --- a/scripts/continuum_save.sh +++ b/scripts/continuum_save.sh @@ -35,8 +35,11 @@ fetch_and_run_tmux_resurrect_save_script() { } main() { - if supported_tmux_version_ok && auto_save_not_disabled && enough_time_since_last_run_passed; then - fetch_and_run_tmux_resurrect_save_script - fi + ( + flock -n 101 || return # The code below is not thread-safe. + if supported_tmux_version_ok && auto_save_not_disabled && enough_time_since_last_run_passed; then + fetch_and_run_tmux_resurrect_save_script + fi + ) 101>/tmp/tmux-continuum-autosave.lockfile } main From 10e612d72c49c5c8762cca4fd17c864b3f373f70 Mon Sep 17 00:00:00 2001 From: v9v <36771847+v9v@users.noreply.github.com> Date: Fri, 5 Jul 2019 19:33:12 +0200 Subject: [PATCH 2/5] Handle missing flock If flock is not installed, fall back to the thread-unsafe version. --- scripts/continuum_save.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/continuum_save.sh b/scripts/continuum_save.sh index 29b24ac..e8c3287 100755 --- a/scripts/continuum_save.sh +++ b/scripts/continuum_save.sh @@ -36,7 +36,9 @@ fetch_and_run_tmux_resurrect_save_script() { main() { ( - flock -n 101 || return # The code below is not thread-safe. + # The code after "flock" is not thread-safe. A race condition can be triggered by multiple + # tmux clients performing autosave in parallel. + ! command -v flock || flock -n 101 || return if supported_tmux_version_ok && auto_save_not_disabled && enough_time_since_last_run_passed; then fetch_and_run_tmux_resurrect_save_script fi From 800488ca6f848b7d9269bdcfb435f4bf935eca05 Mon Sep 17 00:00:00 2001 From: v9v <36771847+v9v@users.noreply.github.com> Date: Mon, 8 Jul 2019 15:58:42 +0200 Subject: [PATCH 3/5] Replace flock with mkdir flock is not supported on MacOS. `mkdir` locks have a drawback: they are not cleaned up automatically. If the lock owner crashed before cleaning up the lock, the directory will stay in the filesystem and the lock will be never acquired by someone else. To avoid that, we create temporary locks (the lockdir name changes every 100 seconds). We grab two lock (N and N+1) to avoid the case where process A grabs lock N and process B grabs lock N+1 and both enter the critical section. --- scripts/continuum_save.sh | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/scripts/continuum_save.sh b/scripts/continuum_save.sh index e8c3287..a952cf1 100755 --- a/scripts/continuum_save.sh +++ b/scripts/continuum_save.sh @@ -35,13 +35,20 @@ fetch_and_run_tmux_resurrect_save_script() { } main() { - ( - # The code after "flock" is not thread-safe. A race condition can be triggered by multiple - # tmux clients performing autosave in parallel. - ! command -v flock || flock -n 101 || return - if supported_tmux_version_ok && auto_save_not_disabled && enough_time_since_last_run_passed; then - fetch_and_run_tmux_resurrect_save_script + # Sometimes tmux starts multiple saves in parallel. We want only one + # save to be running, otherwise we can get corrupted saved state. + # The following implements a lock that auto-expires after 100...200s. + local lockdir_prefix="/tmp/tmux-continuum-$(current_tmux_server_pid)-lock-" + local lockdir1="${lockdir_prefix}$[ `date +%s` / 100 ]" + local lockdir2="${lockdir_prefix}$[ `date +%s` / 100 + 1]" + if mkdir "$lockdir1"; then + trap "rmdir "$lockdir1"" EXIT + if mkdir "$lockdir2"; then + trap "rmdir "$lockdir1" "$lockdir2"" EXIT + if supported_tmux_version_ok && auto_save_not_disabled && enough_time_since_last_run_passed; then + fetch_and_run_tmux_resurrect_save_script + fi fi - ) 101>/tmp/tmux-continuum-autosave.lockfile + fi } main From 39174046684efe60b3ed656d99291b835b07f60a Mon Sep 17 00:00:00 2001 From: v9v <36771847+v9v@users.noreply.github.com> Date: Tue, 9 Jul 2019 10:46:56 +0200 Subject: [PATCH 4/5] Fix race condition in saving lock Two consecutive calls to "date +%s" can return different values. Call "date" only once and reuse the result. --- scripts/continuum_save.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/continuum_save.sh b/scripts/continuum_save.sh index a952cf1..77dfe7f 100755 --- a/scripts/continuum_save.sh +++ b/scripts/continuum_save.sh @@ -37,10 +37,11 @@ fetch_and_run_tmux_resurrect_save_script() { main() { # Sometimes tmux starts multiple saves in parallel. We want only one # save to be running, otherwise we can get corrupted saved state. - # The following implements a lock that auto-expires after 100...200s. local lockdir_prefix="/tmp/tmux-continuum-$(current_tmux_server_pid)-lock-" - local lockdir1="${lockdir_prefix}$[ `date +%s` / 100 ]" - local lockdir2="${lockdir_prefix}$[ `date +%s` / 100 + 1]" + # The following implements a lock that auto-expires after 100...200s. + local lock_generation=$[ `date +%s` / 100 ] + local lockdir1="${lockdir_prefix}${lock_generation}" + local lockdir2="${lockdir_prefix}$[ $lock_generation + 1 ]" if mkdir "$lockdir1"; then trap "rmdir "$lockdir1"" EXIT if mkdir "$lockdir2"; then From ff63f86678e594a91f8d3d87e0fb39d38c22ab8c Mon Sep 17 00:00:00 2001 From: v9v <36771847+v9v@users.noreply.github.com> Date: Sat, 13 Jul 2019 14:16:42 +0200 Subject: [PATCH 5/5] continuum_save.sh: Improvements to locking code * moved the locking to a separate function "acquire_lock" * changed $[ ] to $(( )), for consistency with the rest of the file * taking the lock only after checking all other preconditions (they are non-mutating and thread-safe) --- scripts/continuum_save.sh | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/scripts/continuum_save.sh b/scripts/continuum_save.sh index 77dfe7f..c0e0e7e 100755 --- a/scripts/continuum_save.sh +++ b/scripts/continuum_save.sh @@ -34,22 +34,27 @@ fetch_and_run_tmux_resurrect_save_script() { fi } -main() { +acquire_lock() { # Sometimes tmux starts multiple saves in parallel. We want only one # save to be running, otherwise we can get corrupted saved state. local lockdir_prefix="/tmp/tmux-continuum-$(current_tmux_server_pid)-lock-" # The following implements a lock that auto-expires after 100...200s. - local lock_generation=$[ `date +%s` / 100 ] + local lock_generation=$((`date +%s` / 100)) local lockdir1="${lockdir_prefix}${lock_generation}" - local lockdir2="${lockdir_prefix}$[ $lock_generation + 1 ]" + local lockdir2="${lockdir_prefix}$(($lock_generation + 1))" if mkdir "$lockdir1"; then trap "rmdir "$lockdir1"" EXIT if mkdir "$lockdir2"; then trap "rmdir "$lockdir1" "$lockdir2"" EXIT - if supported_tmux_version_ok && auto_save_not_disabled && enough_time_since_last_run_passed; then - fetch_and_run_tmux_resurrect_save_script - fi + return 0 fi fi + return 1 # Someone else has the lock. +} + +main() { + if supported_tmux_version_ok && auto_save_not_disabled && enough_time_since_last_run_passed && acquire_lock; then + fetch_and_run_tmux_resurrect_save_script + fi } main