Torch compile fusion backend prototype #209

Draft: wants to merge 31 commits into base: upstream-main

Commits (31)
b449e5d
Custom torch.compile backend prototype
bnellnm Apr 25, 2024
e9a62ff
add lowering_utils.py
bnellnm Apr 25, 2024
44153ea
torch.compile fusion backend prototype
bnellnm Apr 25, 2024
55d20b3
fix a mess of fusion pass bugs
bnellnm Apr 25, 2024
a2ba837
add fusion failure fallback exception
bnellnm Apr 25, 2024
2b0fc7d
add workaround for symbolic shape issue, fix other stuff
bnellnm Apr 29, 2024
a64f893
meta + signature generation
bnellnm Apr 30, 2024
cb686a1
refactor, use temporary files
bnellnm Apr 30, 2024
86381d8
comment
bnellnm Apr 30, 2024
692fd79
Merge branch 'upstream-main' into torch-compile-fusion-new
bnellnm May 1, 2024
e8a9b6b
wip
bnellnm May 1, 2024
1ce096f
wip registry
bnellnm May 1, 2024
bdd91ed
wip registry
bnellnm May 1, 2024
5f4bb6e
merge
bnellnm May 1, 2024
019910a
replace prints with logging
bnellnm May 2, 2024
5389a6e
refactoring + hacked up support for getitem
bnellnm May 3, 2024
835756b
cleanups + comments
bnellnm May 7, 2024
260cbf8
debugging print
bnellnm May 8, 2024
b8bc74b
move code cache to class scope
bnellnm May 9, 2024
9db9f46
wip
bnellnm May 10, 2024
193b7a6
remove use of split_module
bnellnm May 10, 2024
1188acb
handle dynamic dim wip
bnellnm May 11, 2024
e2f45bd
delete tensors that are no longer needed in c++
bnellnm May 12, 2024
abaab9b
turn down logging
bnellnm May 13, 2024
5bdc042
put symint rejection hack back in
bnellnm May 13, 2024
14791d6
fix 'memory leak'
bnellnm May 13, 2024
67e97ed
comments
bnellnm May 14, 2024
173e654
smarter slice translation
bnellnm May 14, 2024
ba1b6b1
tweaks
bnellnm May 17, 2024
b3b4d8a
add support for methods, add pattern matching
bnellnm May 20, 2024
a865b32
forgot rewrite file
bnellnm May 20, 2024
163 changes: 161 additions & 2 deletions vllm/attention/ops/paged_attn.py
@@ -6,6 +6,8 @@
from vllm import _custom_ops as ops
from vllm.attention.ops.prefix_prefill import context_attention_fwd

from vllm.lowering_utils import vllm_lib, register_vllm_lowering

# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
_PARTITION_SIZE = 512

@@ -110,7 +112,7 @@ def forward_decode(
and (max_num_partitions == 1 or num_seqs * num_heads > 512))
if use_v1:
# Run PagedAttention V1.
- ops.paged_attention_v1(
+ torch.ops.vllm.paged_attention_v1(
output,
query,
key_cache,
@@ -139,7 +141,7 @@ def forward_decode(
device=output.device,
)
max_logits = torch.empty_like(exp_sums)
- ops.paged_attention_v2(
+ torch.ops.vllm.paged_attention_v2(
output,
exp_sums,
max_logits,
@@ -213,3 +215,160 @@ def copy_blocks(
key_caches = [kv_cache[0] for kv_cache in kv_caches]
value_caches = [kv_cache[1] for kv_cache in kv_caches]
ops.copy_blocks(key_caches, value_caches, src_to_dists)

# Custom op schema/impl registrations needed so torch.compile can trace these kernels.
vllm_lib.define(
"reshape_and_cache(Tensor key, Tensor value, Tensor key_cache, Tensor value_cache, Tensor slot_mapping, str dtype) -> (Tensor, Tensor)"
)


@torch.library.impl(vllm_lib, "reshape_and_cache", "Meta")
def _reshape_and_cache_meta(key, value, key_cache, value_cache, slot_mapping,
dtype):
return key_cache, value_cache


@torch.library.impl(vllm_lib, "reshape_and_cache", "CUDA")
def _reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
dtype):
cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
slot_mapping, dtype)
return key_cache, value_cache


register_vllm_lowering(torch.ops.vllm.reshape_and_cache, [2, 3])

vllm_lib.define(
"paged_attention_v1(Tensor out, Tensor query, Tensor key_cache, Tensor value_cache, int num_kv_heads, float scale, Tensor block_tables, Tensor context_lens, int block_size, SymInt max_context_len, Tensor? alibi_slopes, str kv_cache_dtype, float kv_scale) -> Tensor"
)
#vllm_lib.define(
# "paged_attention_v1(Tensor out, Tensor query, Tensor key_cache, Tensor value_cache, int num_kv_heads, float scale, Tensor block_tables, Tensor context_lens, int block_size, int max_context_len, Tensor? alibi_slopes, str kv_cache_dtype, float kv_scale) -> Tensor"
#)


@torch.library.impl(vllm_lib, "paged_attention_v1", "Meta")
def _paged_attention_v1_meta(
out,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
context_lens,
block_size,
max_context_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
):
return out


@torch.library.impl(vllm_lib, "paged_attention_v1", "CUDA")
def _paged_attention_v1(
out,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
context_lens,
block_size,
max_context_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
):
ops.paged_attention_v1(
out,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
context_lens,
block_size,
max_context_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
)
return out


register_vllm_lowering(torch.ops.vllm.paged_attention_v1, [0])

vllm_lib.define(
"paged_attention_v2(Tensor out, Tensor exp_sums, Tensor max_logits, Tensor tmp_out, Tensor query, Tensor key_cache, Tensor value_cache, int num_kv_heads, float scale, Tensor block_tables, Tensor context_lens, int block_size, SymInt max_context_len, Tensor? alibi_slopes, str kv_cache_dtype, float kv_scale) -> Tensor"
)
#vllm_lib.define(
# "paged_attention_v2(Tensor out, Tensor exp_sums, Tensor max_logits, Tensor tmp_out, Tensor query, Tensor key_cache, Tensor value_cache, int num_kv_heads, float scale, Tensor block_tables, Tensor context_lens, int block_size, int max_context_len, Tensor? alibi_slopes, str kv_cache_dtype, float kv_scale) -> Tensor"
#)


@torch.library.impl(vllm_lib, "paged_attention_v2", "Meta")
def _paged_attention_v2_meta(
out,
exp_sums,
max_logits,
tmp_out,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
context_lens,
block_size,
max_context_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
):
return out


@torch.library.impl(vllm_lib, "paged_attention_v2", "CUDA")
def _paged_attention_v2(
out,
exp_sums,
max_logits,
tmp_out,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
context_lens,
block_size,
max_context_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
):
ops.paged_attention_v2(
out,
exp_sums,
max_logits,
tmp_out,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
context_lens,
block_size,
max_context_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
)
return out


register_vllm_lowering(torch.ops.vllm.paged_attention_v2, [0])
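
The additions above all follow the same torch.library pattern: declare an op schema on the `vllm` library, give it a "Meta" implementation that only reports output shapes and dtypes (so the graph can be traced without running kernels), give it a "CUDA" implementation that forwards to the existing custom op, and finally call `register_vllm_lowering` (defined in `lowering_utils.py`, which is not shown in this diff, so the exact meaning of the index list passed to it isn't covered here). A minimal standalone sketch of that pattern, using a made-up `toy` namespace and op rather than anything from this PR:

```python
import torch

# Illustrative only: a toy namespace and op, not part of this PR.
toy_lib = torch.library.Library("toy", "DEF")

# Declare the schema so the op can appear as a single node in an FX graph.
toy_lib.define("scaled_add(Tensor x, Tensor y, float alpha) -> Tensor")


@torch.library.impl(toy_lib, "scaled_add", "Meta")
def _scaled_add_meta(x, y, alpha):
    # Meta impl: only shape/dtype propagation, no real computation.
    return torch.empty_like(x)


@torch.library.impl(toy_lib, "scaled_add", "CPU")
def _scaled_add_cpu(x, y, alpha):
    # Real implementation; the PR's ops forward to the CUDA kernels instead.
    return x + alpha * y


x, y = torch.randn(4), torch.randn(4)
out = torch.ops.toy.scaled_add(x, y, 0.5)
```

With the schema and Meta impl in place, `torch.ops.toy.scaled_add` shows up as a single opaque node in the graph that torch.compile captures, which is what makes pattern matching and fusion over ops like `torch.ops.vllm.paged_attention_v1` possible.
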
5 changes: 2 additions & 3 deletions vllm/distributed/parallel_state.py
@@ -211,8 +211,7 @@ def get_pipeline_model_parallel_group():

def get_tensor_model_parallel_world_size():
"""Return world size for the tensor model parallel group."""
- return torch.distributed.get_world_size(
-     group=get_tensor_model_parallel_group())
+ return get_tensor_model_parallel_group().size()


def get_pipeline_model_parallel_world_size():
@@ -223,7 +222,7 @@ def get_pipeline_model_parallel_world_size():

def get_tensor_model_parallel_rank():
"""Return my rank for the tensor model parallel group."""
- return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
+ return get_tensor_model_parallel_group().rank()


def get_pipeline_model_parallel_rank():
10 changes: 10 additions & 0 deletions vllm/ex/README.md
@@ -0,0 +1,10 @@
- ex.py - the backend
- ex_builder.py - compiles/loads C++/CUDA torch modules
- testex*.py - various tests

TODO
----
0. fix stuff
- https://github.com/pytorch/pytorch/issues/108446
1. registration mechanism
2. backend code generator
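
For orientation only (ex.py itself is not shown in this diff): a torch.compile backend is just a callable that receives the captured `torch.fx.GraphModule` plus example inputs and returns something callable. The sketch below shows that general shape; the name `fusion_backend` and its body are illustrative assumptions, not the PR's actual implementation.

```python
import torch


def fusion_backend(gm: torch.fx.GraphModule, example_inputs):
    # A real fusion backend would walk/rewrite gm.graph here, e.g. fuse
    # sequences of torch.ops.vllm.* nodes into generated C++/CUDA kernels.
    gm.graph.print_tabular()  # inspect what torch.compile captured
    return gm.forward         # fall back to running the graph as-is


@torch.compile(backend=fusion_backend)
def f(x, y):
    return torch.relu(x) + y


f(torch.randn(8), torch.randn(8))
```
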
51 changes: 51 additions & 0 deletions vllm/ex/code_cache.py
@@ -0,0 +1,51 @@
from typing import Callable, Optional
from vllm.logger import init_logger

logger = init_logger(__name__)

class CodeCache:
    """
    The CodeCache is a simple map from mangled function names to Callables.

    The CodeCache can be used to store the results of compiled code so that
    the same Callable can be reused rather than needing to be recompiled.

    Mangled function names should be generated with (or be compatible with)
    the 'utils.mangle_name' function.

    Note: the CodeCache can be initialized with pre-compiled functions.
    """

    def __init__(self):
        self.cache = dict()

    def lookup_or_create(self, mangled_name: str,
                         generator: Callable) -> Optional[Callable]:
        """
        Look up the Callable for a function based on the 'mangled_name'. If
        the name is not present in the cache, call the supplied 'generator'
        to create the Callable to be associated with the 'mangled_name'. If
        the generator fails for any reason, None is stored in the map (and
        the exception re-raised) so the failed generator is never retried;
        later lookups return None instead of a Callable.
        """
        if mangled_name not in self.cache:
            try:
                self.cache[mangled_name] = generator()
            except Exception:
                self.cache[mangled_name] = None
                raise
        return self.cache[mangled_name]

    def add(self, mangled_name: str, fn: Optional[Callable]) -> bool:
        """
        Add a new entry to the cache. Return False if an entry with the
        given name already exists.
        """
        if mangled_name in self.cache:
            return False
        self.cache[mangled_name] = fn
        return True
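
A small usage sketch of the cache above; the mangled name and the generator are made up for illustration:

```python
cache = CodeCache()


def build_fused_kernel():
    # Stand-in for compiling and loading a C++/CUDA module for a fused
    # subgraph; returns the callable that runs it.
    return lambda x: x * 2


fn = cache.lookup_or_create("fused_silu_mul_f16", build_fused_kernel)
assert fn is not None and fn(21) == 42

# A second lookup with the same mangled name reuses the cached Callable
# instead of invoking the generator again.
assert cache.lookup_or_create("fused_silu_mul_f16", build_fused_kernel) is fn
```
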