Merge pull request #736 from martindevans/llama_get_timings

Exposed basic timing information from llama.cpp
SciSharp · May 13, 2024 · 3582d82 · 3582d82
2 parents e2a770e + e38d96b
commit 3582d82
Show file tree

Hide file tree

Showing 3 changed files with 147 additions and 2 deletions.
diff --git a/LLama.Examples/Examples/BatchedExecutorFork.cs b/LLama.Examples/Examples/BatchedExecutorFork.cs
@@ -1,4 +1,4 @@
-using LLama.Batched;
+using LLama.Batched;
 using LLama.Common;
 using LLama.Native;
 using LLama.Sampling;
@@ -67,6 +67,13 @@ await AnsiConsole
                 root.Display(display);
                 AnsiConsole.Write(display);
             });
+
+        // Print some stats
+        var timings = executor.Context.NativeHandle.GetTimings();
+        AnsiConsole.MarkupLine($"Total Tokens Evaluated: {timings.TokensEvaluated}");
+        AnsiConsole.MarkupLine($"Total Tokens Sampled: {timings.TokensSampled}");
+        AnsiConsole.MarkupLine($"Eval Time: {(timings.Eval + timings.PromptEval).TotalMilliseconds}ms");
+        AnsiConsole.MarkupLine($"Sample Time: {timings.Sampling.TotalMilliseconds}ms");
     }
 
     private class Node

diff --git a/LLama/Native/LLamaTimings.cs b/LLama/Native/LLamaTimings.cs
@@ -0,0 +1,104 @@
+using System;
+using System.Runtime.InteropServices;
+
+namespace LLama.Native;
+
+/// <summary>
+/// LLama performance information
+/// </summary>
+[StructLayout(LayoutKind.Sequential)]
+public struct LLamaTimings
+{
+    /// <summary>
+    /// Timestamp when reset was last called
+    /// </summary>
+    private double t_start_ms;
+
+    /// <summary>
+    /// Timestamp when these timings were read
+    /// </summary>
+    private double t_end_ms;
+
+    /// <summary>
+    /// Loading milliseconds
+    /// </summary>
+    private double t_load_ms;
+
+    /// <summary>
+    /// Total sampling milliseconds
+    /// </summary>
+    private double t_sample_ms;
+
+    /// <summary>
+    /// total milliseconds spent prompt processing
+    /// </summary>
+    private double t_p_eval_ms;
+
+    /// <summary>
+    /// Total milliseconds in eval/decode calls
+    /// </summary>
+    private double t_eval_ms;
+
+    /// <summary>
+    /// number of tokens sampled
+    /// </summary>
+    private int n_sample;
+
+    /// <summary>
+    /// number of tokens in eval calls for the prompt (with batch size > 1)
+    /// </summary>
+    private int n_p_eval;
+
+    /// <summary>
+    /// number of eval calls
+    /// </summary>
+    private int n_eval;
+
+
+
+
+    /// <summary>
+    /// Timestamp when reset was last called
+    /// </summary>
+    public readonly TimeSpan ResetTimestamp => TimeSpan.FromMilliseconds(t_start_ms);
+
+    /// <summary>
+    /// Timestamp when these timings were read
+    /// </summary>
+    public readonly TimeSpan ReadTimestamp => TimeSpan.FromMilliseconds(t_start_ms);
+
+    /// <summary>
+    /// Time spent loading
+    /// </summary>
+    public readonly TimeSpan Loading => TimeSpan.FromMilliseconds(t_load_ms);
+
+    /// <summary>
+    /// Time spent sampling
+    /// </summary>
+    public readonly TimeSpan Sampling => TimeSpan.FromMilliseconds(t_load_ms);
+
+    /// <summary>
+    /// total milliseconds spent prompt processing
+    /// </summary>
+    public TimeSpan PromptEval => TimeSpan.FromMilliseconds(t_p_eval_ms);
+
+    /// <summary>
+    /// Total milliseconds in eval/decode calls
+    /// </summary>
+    public readonly TimeSpan Eval => TimeSpan.FromMilliseconds(t_eval_ms);
+
+    /// <summary>
+    /// Total number of tokens sampled
+    /// </summary>
+    public readonly int TokensSampled => n_sample;
+
+    /// <summary>
+    /// number of tokens in eval calls for the prompt (with batch size > 1)
+    /// </summary>
+    public readonly int PrompTokensEvaluated => n_p_eval;
+
+    /// <summary>
+    /// number of eval calls
+    /// </summary>
+    public readonly int TokensEvaluated => n_p_eval;
+}
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
@@ -282,7 +282,21 @@ static SafeLLamaContextHandle()
         /// </summary>
         /// <param name="ctx"></param>
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern void llama_kv_cache_update(SafeLLamaContextHandle ctx);
+        private static extern void llama_kv_cache_update(SafeLLamaContextHandle ctx);
+
+        /// <summary>
+        /// get performance information
+        /// </summary>
+        /// <param name="ctx"></param>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern LLamaTimings llama_get_timings(SafeLLamaContextHandle ctx);
+
+        /// <summary>
+        /// Reset performance information
+        /// </summary>
+        /// <param name="ctx"></param>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern void llama_reset_timings(SafeLLamaContextHandle ctx);
         #endregion
 
         /// <summary>
@@ -510,6 +524,26 @@ public void SetThreads(uint threads, uint threadsBatch)
         {
             llama_set_n_threads(this, threads, threadsBatch);
         }
+
+        #region timing
+        /// <summary>
+        /// Get performance information
+        /// </summary>
+        /// <returns></returns>
+        public LLamaTimings GetTimings()
+        {
+            return llama_get_timings(this);
+        }
+
+        /// <summary>
+        /// Reset all performance information for this context
+        /// </summary>
+        public void ResetTimings()
+        {
+            llama_reset_timings(this);
+        }
+        #endregion
+
 
         #region KV Cache Management
         /// <summary>