Skip to content

Commit

Permalink
馃 backported "Release migration locks during shutdown" (#41402)
Browse files Browse the repository at this point in the history
Co-authored-by: Chris Truter <crisptrutski@users.noreply.github.com>
  • Loading branch information
metabase-bot[bot] and crisptrutski committed Apr 16, 2024
1 parent 0e7ce2e commit 5abf130
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 4 deletions.
5 changes: 4 additions & 1 deletion src/metabase/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@
(task/stop-scheduler!)
(server/stop-web-server!)
(prometheus/shutdown!)
(log/info (trs "Metabase Shutdown COMPLETE")))
;; This timeout was chosen based on a 30s default termination grace period in Kubernetes.
(let [timeout-seconds 20]
(mdb/release-migration-locks! timeout-seconds))
(log/info "Metabase Shutdown COMPLETE"))

(defenterprise ensure-audit-db-installed!
"OSS implementation of `audit-db/ensure-db-installed!`, which is an enterprise feature, so does nothing in the OSS
Expand Down
7 changes: 7 additions & 0 deletions src/metabase/db.clj
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,10 @@
(throw (ex-info "Application database calls are not allowed inside core.async dispatch pool threads."
{})))
resolved-query)

(defn release-migration-locks!
"Wait up to `timeout-seconds` for the current process to release all migration locks, otherwise force release them."
[timeout-seconds]
(if (db-is-set-up?)
:done
(mdb.setup/release-migration-locks! (mdb.connection/data-source) timeout-seconds)))
82 changes: 79 additions & 3 deletions src/metabase/db/liquibase.clj
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@

(set! *warn-on-reflection* true)

(defonce ^{:doc "The set of Liquibase instances which potentially have taken locks by this process."}
potentially-locked-instances
(atom #{}))

(comment
;; load our custom migrations
metabase.db.custom-migrations/keep-me)
Expand Down Expand Up @@ -198,12 +202,12 @@
(try
(force-release-locks! liquibase)
(catch Exception e
(log/error e (trs "Unable to release the Liquibase lock after a migration failure"))))))
(log/error e "Unable to release the Liquibase lock")))))

(defn- wait-for-migration-lock-to-be-cleared
"Check and make sure the database isn't locked. If it is, sleep for 2 seconds and then retry several times. There's a
chance the lock will end up clearing up so we can run migrations normally."
[^Liquibase liquibase]
[liquibase]
(let [retry-counter (volatile! 0)]
(u/auto-retry 5
(when (migration-lock-exists? liquibase)
Expand All @@ -219,6 +223,67 @@
(log/warnf "Migration lock was cleared after %d retries." @retry-counter)
(log/info "No migration lock found."))))

(defn holding-lock?
"Check whether the given Liquibase instance is already holding the database migration lock."
[^Liquibase liquibase]
(->> (.getDatabase liquibase)
(.getLockService (LockServiceFactory/getInstance))
(.hasChangeLogLock)))

(defn- wait-until [done? ^long sleep-ms timeout-ms]
(let [deadline (+ (System/nanoTime) (* 1e6 timeout-ms))]
(loop []
(if (done?)
:done
(if (>= (System/nanoTime) deadline)
:timed-out
(do (Thread/sleep sleep-ms)
(recur)))))))

(defn- locked-instances
"Scan through a global set of potentially locking Liquibase objects, to retrieve the corresponding Lock Service
instances, and filter by their `hasChangeLock` flag. Returns the list of locking instances in the current process."
[]
(filter holding-lock? @potentially-locked-instances))

(defn wait-for-all-locks
"Wait up to a maximum of `timeout-seconds` for the given Liquibase instance to release the migration lock."
[sleep-ms timeout-ms]
(log/warn (locked-instances))
(let [done? #(empty? (locked-instances))]
(if (done?)
:none
(do (log/infof "Waiting for migration lock(s) to be released (max %.1f secs)" (/ timeout-ms 1000.0))
(wait-until done? sleep-ms timeout-ms)))))

(defn- liquibase->url [^Liquibase liquibase]
;; Need to this cast to get access to the metadata. We currently only use JDBC app databases.
(let [^JdbcConnection conn (.. liquibase getDatabase getConnection)]
(.. conn getMetaData getURL)))

(defn release-concurrent-locks!
"Release any locks held by this process corresponding to the same database."
[conn-or-data-source]
;; Check whether there are Liquibase locks held by the current process - we don't want to release the database locks
;; if they are held by another server, for example if this host is part of an "old" fleet shutting down while new
;; servers starting up, of which one is performing the database upgrade to later Metabase version.
;;
(when-let [instances (not-empty (locked-instances))]
;; We cannot use the existing instances to clear the locks, as their connections are blocking on their current
;; long-running transaction. Since we cannot "clone" a connection (they have "forgotten" their password), so we
;; will create a new Liquibase instance using a fresh database connection.
(with-liquibase [liquibase conn-or-data-source]
;; We rely on the way that Liquibase normalizes the connection URL to check whether the blocking and fresh
;; Liquibase instances are pointing to the same database.
(let [url (liquibase->url liquibase)]
(doseq [instance instances]
(when (= url (liquibase->url instance))
;; We assume that the lock is being held for the purpose of migrations, since the other cases where we take
;; locks are very fast, and in practice this method is only called after we have waited for a while to see
;; if the lock was released on its own.
(log/warn "Releasing liquibase lock before migrations finished")
(release-lock-if-needed! liquibase)))))))

(defn migrate-up-if-needed!
"Run any unrun `liquibase` migrations, if needed."
[^Liquibase liquibase ^DataSource data-source]
Expand Down Expand Up @@ -255,11 +320,22 @@
(Scope/child ^Map scope-objects
(reify Scope$ScopedRunner
(run [_]
(swap! potentially-locked-instances conj liquibase)
(.waitForLock lock-service)
(try
(f)
(finally
(.releaseLock lock-service))))))))
(.releaseLock lock-service)
(swap! potentially-locked-instances disj liquibase))))))))

(defmacro with-scope-locked
"Run `body` in a scope on the Liquibase instance `liquibase`.
Liquibase scopes are used to hold configuration and parameters (akin to binding dynamic variables in
Clojure). This function initializes the database and the resource accessor which are often required.
The underlying locks are re-entrant, so it is safe to nest these blocks."
{:style/indent 1}
[liquibase & body]
`(run-in-scope-locked ~liquibase (fn [] ~@body)))

(defn update-with-change-log
"Run update with the change log instances in `liquibase`."
Expand Down
14 changes: 14 additions & 0 deletions src/metabase/db/setup.clj
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,20 @@
(run-schema-migrations! db-type data-source auto-migrate?))))
:done)

(defn release-migration-locks!
"Wait up to `timeout-seconds` for the current process to release all migration locks, otherwise force release them."
[data-source timeout-seconds]
(let [sleep-ms 100
timeout-ms (* 1000 timeout-seconds)]
(case (liquibase/wait-for-all-locks sleep-ms timeout-ms)
:none nil
:done (log/info "Migration lock(s) have been released")
:timed-out (do (log/warn "Releasing liquibase locks on shutdown")
;; There's an infinitesimal chance that we released the lock and another server took it between
;; the timeout, and the mutations we now make to these lock tables - but we can't detect that.
(liquibase/release-concurrent-locks! data-source))))
:done)

;;;; Toucan Setup.

;;; Done at namespace load time these days.
Expand Down
43 changes: 43 additions & 0 deletions test/metabase/db/liquibase_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,46 @@
(set/union
(set (liquibase-file->included-ids "migrations/000_legacy_migrations.yaml" driver/*driver*))
(set (liquibase-file->included-ids "migrations/001_update_migrations.yaml" driver/*driver*)))))))))

(deftest wait-for-all-locks-test
(mt/test-drivers #{:h2 :mysql :postgres}
(mt/with-temp-empty-app-db [conn driver/*driver*]
;; We don't need a long time for tests, keep it zippy.
(let [sleep-ms 5
timeout-ms 10]
(liquibase/with-liquibase [liquibase conn]
(testing "Will not wait if no locks are taken"
(is (= :none (liquibase/wait-for-all-locks sleep-ms timeout-ms))))
(testing "Will timeout if a lock is not released"
(liquibase/with-scope-locked liquibase
(is (= :timed-out (liquibase/wait-for-all-locks sleep-ms timeout-ms)))))
(testing "Will return successfully if the lock is released while we are waiting"
(let [migrate-ms 100
timeout-ms 200
locked (promise)]
(future
(liquibase/with-scope-locked liquibase
(deliver locked true)
(Thread/sleep migrate-ms)))
@locked
(is (= :done (liquibase/wait-for-all-locks sleep-ms timeout-ms))))))))))

(deftest release-all-locks-if-needed!-test
(mt/test-drivers #{:h2 :mysql :postgres}
(mt/with-temp-empty-app-db [conn driver/*driver*]
(liquibase/with-liquibase [liquibase conn]
(testing "When we release the locks from outside the migration...\n"
(let [locked (promise)
released (promise)
locked? (promise)]
(future
(liquibase/with-scope-locked liquibase
(is (liquibase/holding-lock? liquibase))
(deliver locked true)
@released
(deliver locked? (liquibase/holding-lock? liquibase))))
@locked
(liquibase/release-concurrent-locks! conn)
(deliver released true)
(testing "The lock was released before the migration finished"
(is (not @locked?)))))))))

0 comments on commit 5abf130

Please sign in to comment.