Skip to content

Commit 8a1bfd6

Browse files
authored
[Serve] MicroServing API refactor (#3071)
This PR refactors the MicroServing REST API. With this PR, all the MicroServing REST APIs now live in the file `python/mlc_llm/serve/entrypoints/microserving_entrypoints.py`, and the related protocol data structures are placed in `python/mlc_llm/protocol/microserving_protocol.py`. These REST APIs essentially wrap and redirect to the OpenAI `v1/completions` API. In addition, this PR renames some APIs to be consistent with the writeups.
1 parent 88074ea commit 8a1bfd6

15 files changed

+288
-160
lines changed

cpp/serve/config.cc

+12-12
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,12 @@ Result<DisaggConfig> DisaggConfig::FromJSON(const picojson::object& config) {
7070
DisaggConfig res;
7171
std::optional<std::string> kind = json::LookupOptional<std::string>(config, "kind");
7272
if (kind.has_value()) {
73-
if (kind.value() == "prepare_prefill") {
74-
res.kind = DisaggRequestKind::kPreparePrefill;
75-
} else if (kind.value() == "remote_prefill") {
76-
res.kind = DisaggRequestKind::kRemotePrefill;
77-
} else if (kind.value() == "start_decode") {
78-
res.kind = DisaggRequestKind::kStartDecode;
73+
if (kind.value() == "prepare_receive") {
74+
res.kind = DisaggRequestKind::kPrepareReceive;
75+
} else if (kind.value() == "remote_send") {
76+
res.kind = DisaggRequestKind::kRemoteSend;
77+
} else if (kind.value() == "start_generation") {
78+
res.kind = DisaggRequestKind::kStartGeneration;
7979
} else {
8080
return TResult::Error("Unknown disaggregation request kind " + kind.value());
8181
}
@@ -125,16 +125,16 @@ Result<DisaggConfig> DisaggConfig::FromJSON(const picojson::object& config) {
125125
picojson::object DisaggConfig::AsJSON() const {
126126
picojson::object config;
127127
switch (kind) {
128-
case DisaggRequestKind::kPreparePrefill: {
129-
config["kind"] = picojson::value("prepare_prefill");
128+
case DisaggRequestKind::kPrepareReceive: {
129+
config["kind"] = picojson::value("prepare_receive");
130130
break;
131131
}
132-
case DisaggRequestKind::kRemotePrefill: {
133-
config["kind"] = picojson::value("remote_prefill");
132+
case DisaggRequestKind::kRemoteSend: {
133+
config["kind"] = picojson::value("remote_send");
134134
break;
135135
}
136-
case DisaggRequestKind::kStartDecode: {
137-
config["kind"] = picojson::value("start_decode");
136+
case DisaggRequestKind::kStartGeneration: {
137+
config["kind"] = picojson::value("start_generation");
138138
break;
139139
}
140140
default:

cpp/serve/config.h

+6-6
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,9 @@ enum class SpecialRequestKind : int {
4848

4949
enum class DisaggRequestKind : int {
5050
kNone = 0,
51-
kPreparePrefill = 1,
52-
kRemotePrefill = 2,
53-
kStartDecode = 3,
51+
kPrepareReceive = 1,
52+
kRemoteSend = 2,
53+
kStartGeneration = 3,
5454
};
5555

5656
/*! \brief Controls the behavior of inference with grammar constraint. */
@@ -70,11 +70,11 @@ class DisaggConfig {
7070
// "kv_window_begin" and "kv_window_end" denote the KV interval of interest.
7171
// "kv_window_end" supports Python style negative indexing.
7272
// The concrete meaning varies for different special request kind:
73-
// - For "prepare_prefill", the begin is always 0, and "[0:end]" denotes
73+
// - For "prepare_receive", the begin is always 0, and "[0:end]" denotes
7474
// the KV range to prefill on a prefill instance.
75-
// - For "remote_prefill", "[begin:end]" means the KV range to compute prefill
75+
// - For "remote_send", "[begin:end]" means the KV range to compute prefill
7676
// and send to the decode instance.
77-
// - For "start_decode", the end is always nullopt, and "[begin:]" denotes
77+
// - For "start_generation", the end is always nullopt, and "[begin:]" denotes
7878
// the KV range to prefill locally on the decode instance.
7979
std::optional<int> kv_window_begin = std::nullopt;
8080
std::optional<int> kv_window_end = std::nullopt;

cpp/serve/engine.cc

+4-4
Original file line numberDiff line numberDiff line change
@@ -548,10 +548,10 @@ class EngineImpl : public Engine {
548548
bool HandleDisaggRequest(Request request) {
549549
DisaggConfig disagg_config = request->generation_cfg->debug_config.disagg_config;
550550
DisaggRequestKind kind = disagg_config.kind;
551-
if (kind == DisaggRequestKind::kPreparePrefill) {
551+
if (kind == DisaggRequestKind::kPrepareReceive) {
552552
// No-op.
553553
return false;
554-
} else if (kind == DisaggRequestKind::kRemotePrefill) {
554+
} else if (kind == DisaggRequestKind::kRemoteSend) {
555555
int input_length = 0;
556556
for (Data input : request->inputs) {
557557
input_length += input->GetLength();
@@ -586,13 +586,13 @@ class EngineImpl : public Engine {
586586
updated_generation_cfg->n = 1;
587587
request->generation_cfg = GenerationConfig(updated_generation_cfg);
588588
return false;
589-
} else if (kind == DisaggRequestKind::kStartDecode) {
589+
} else if (kind == DisaggRequestKind::kStartGeneration) {
590590
auto it_rstate = estate_->request_states.find(request->id);
591591
CHECK(it_rstate != estate_->request_states.end());
592592
ICHECK(!it_rstate->second->entries.empty());
593593
request = it_rstate->second->entries[0]->request;
594594
CHECK(request->generation_cfg->debug_config.disagg_config.kind ==
595-
DisaggRequestKind::kPreparePrefill);
595+
DisaggRequestKind::kPrepareReceive);
596596
int input_length = 0;
597597
for (Data input : request->inputs) {
598598
input_length += input->GetLength();

cpp/serve/engine_actions/action.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ class EngineAction : public ObjectRef {
220220
* matched length in the prefix cache.
221221
* \return The created action object.
222222
*/
223-
static EngineAction DisaggPreparePrefill(Array<Model> models, EngineConfig engine_config,
223+
static EngineAction DisaggPrepareReceive(Array<Model> models, EngineConfig engine_config,
224224
std::vector<picojson::object> model_configs,
225225
Optional<EventTraceRecorder> trace_recorder,
226226
FRequestStreamCallback request_stream_callback);
@@ -238,7 +238,7 @@ class EngineAction : public ObjectRef {
238238
* \param device The device of the model for synchronization.
239239
* \return The created action object.
240240
*/
241-
static EngineAction NewRequestPrefillWithKVSend(
241+
static EngineAction DisaggRemoteSend(
242242
Array<Model> models, std::vector<ModelWorkspace> model_workspaces, EngineConfig engine_config,
243243
std::vector<picojson::object> model_configs, Optional<EventTraceRecorder> trace_recorder,
244244
FRequestStreamCallback request_stream_callback, Device device);

cpp/serve/engine_actions/action_commons.cc

+5-6
Original file line numberDiff line numberDiff line change
@@ -122,11 +122,10 @@ Array<EngineAction> CreateEngineActions(
122122
if (model_metadata.disaggregation) {
123123
// Insert the disaggregation actions.
124124
Array<EngineAction> disaggregation_actions = {
125-
EngineAction::DisaggPreparePrefill(models, engine_config, model_configs, trace_recorder,
125+
EngineAction::DisaggPrepareReceive(models, engine_config, model_configs, trace_recorder,
126126
request_stream_callback),
127-
EngineAction::NewRequestPrefillWithKVSend(models, model_workspaces, engine_config,
128-
model_configs, trace_recorder,
129-
request_stream_callback, device)};
127+
EngineAction::DisaggRemoteSend(models, model_workspaces, engine_config, model_configs,
128+
trace_recorder, request_stream_callback, device)};
130129
actions.insert(actions.begin(), disaggregation_actions.begin(), disaggregation_actions.end());
131130
}
132131
return actions;
@@ -302,11 +301,11 @@ void ActionStepPostProcess(Array<Request> requests, EngineState estate, const Ar
302301
}
303302
}
304303

305-
// - For all disaggregation requests with "remote_prefill",
304+
// - For all disaggregation requests with "remote_send",
306305
// if it does not appear in the waiting queue, it means the prefill has been finished.
307306
// In this case, we mark the request as finished.
308307
if (request->generation_cfg->debug_config.disagg_config.kind ==
309-
DisaggRequestKind::kRemotePrefill) {
308+
DisaggRequestKind::kRemoteSend) {
310309
auto it = std::find(estate->waiting_queue.begin(), estate->waiting_queue.end(), request);
311310
if (it == estate->waiting_queue.end()) {
312311
CHECK_EQ(rstate->entries.size(), 1);

cpp/serve/engine_actions/prefill_prepare.cc cpp/serve/engine_actions/disagg_prepare_recv.cc

+6-6
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ namespace serve {
1818
* It picks a new request, reserve its KV data locations, and returns the
1919
* KV data locations and the matched prefix length in prefix cache.
2020
*/
21-
class DisaggPreparePrefillActionObj : public BatchPrefillBaseActionObj {
21+
class DisaggPrepareReceiveActionObj : public BatchPrefillBaseActionObj {
2222
public:
23-
explicit DisaggPreparePrefillActionObj(Array<Model> models, EngineConfig engine_config,
23+
explicit DisaggPrepareReceiveActionObj(Array<Model> models, EngineConfig engine_config,
2424
std::vector<picojson::object> model_configs,
2525
Optional<EventTraceRecorder> trace_recorder,
2626
FRequestStreamCallback request_stream_callback)
@@ -51,7 +51,7 @@ class DisaggPreparePrefillActionObj : public BatchPrefillBaseActionObj {
5151
}
5252

5353
{
54-
NVTXScopedRange nvtx_scope("DisaggPreparePrefill matching prefix");
54+
NVTXScopedRange nvtx_scope("DisaggPrepareReceive matching prefix");
5555
prefix_matched_length = MatchPrefixCache(estate, &prefill_input);
5656
}
5757

@@ -199,7 +199,7 @@ class DisaggPreparePrefillActionObj : public BatchPrefillBaseActionObj {
199199
Request request{nullptr};
200200
for (const Request& request_candidate : estate->waiting_queue) {
201201
if (request_candidate->generation_cfg->debug_config.disagg_config.kind ==
202-
DisaggRequestKind::kPreparePrefill) {
202+
DisaggRequestKind::kPrepareReceive) {
203203
request = request_candidate;
204204
break;
205205
}
@@ -427,11 +427,11 @@ class DisaggPreparePrefillActionObj : public BatchPrefillBaseActionObj {
427427
FRequestStreamCallback request_stream_callback_;
428428
};
429429

430-
EngineAction EngineAction::DisaggPreparePrefill(Array<Model> models, EngineConfig engine_config,
430+
EngineAction EngineAction::DisaggPrepareReceive(Array<Model> models, EngineConfig engine_config,
431431
std::vector<picojson::object> model_configs,
432432
Optional<EventTraceRecorder> trace_recorder,
433433
FRequestStreamCallback request_stream_callback) {
434-
return EngineAction(make_object<DisaggPreparePrefillActionObj>(
434+
return EngineAction(make_object<DisaggPrepareReceiveActionObj>(
435435
std::move(models), std::move(engine_config), std::move(model_configs),
436436
std::move(trace_recorder), std::move(request_stream_callback)));
437437
}

cpp/serve/engine_actions/new_request_prefill_with_kv_send.cc cpp/serve/engine_actions/disagg_remote_send.cc

+13-11
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,14 @@ namespace serve {
1616
* Aside from that, this action sends the computed KV data to remote
1717
* instances after computing the KV data.
1818
*/
19-
class NewRequestPrefillWithKVSendActionObj : public BatchPrefillBaseActionObj {
19+
class DisaggRemoteSendActionObj : public BatchPrefillBaseActionObj {
2020
public:
21-
explicit NewRequestPrefillWithKVSendActionObj(
22-
Array<Model> models, std::vector<ModelWorkspace> model_workspaces, EngineConfig engine_config,
23-
std::vector<picojson::object> model_configs, Optional<EventTraceRecorder> trace_recorder,
24-
FRequestStreamCallback request_stream_callback, Device device)
21+
explicit DisaggRemoteSendActionObj(Array<Model> models,
22+
std::vector<ModelWorkspace> model_workspaces,
23+
EngineConfig engine_config,
24+
std::vector<picojson::object> model_configs,
25+
Optional<EventTraceRecorder> trace_recorder,
26+
FRequestStreamCallback request_stream_callback, Device device)
2527
: BatchPrefillBaseActionObj(std::move(models), std::move(engine_config),
2628
std::move(model_configs), std::move(trace_recorder)),
2729
model_workspaces_(std::move(model_workspaces)),
@@ -39,7 +41,7 @@ class NewRequestPrefillWithKVSendActionObj : public BatchPrefillBaseActionObj {
3941
// - Find the requests in `waiting_queue` that can prefill in this step.
4042
std::vector<PrefillInput> prefill_inputs;
4143
{
42-
NVTXScopedRange nvtx_scope("NewRequestPrefillWithKVSend getting requests");
44+
NVTXScopedRange nvtx_scope("DisaggRemoteSend getting requests");
4345
prefill_inputs = GetRequestStateEntriesToPrefill(estate);
4446
if (prefill_inputs.empty()) {
4547
return {};
@@ -48,7 +50,7 @@ class NewRequestPrefillWithKVSendActionObj : public BatchPrefillBaseActionObj {
4850

4951
int num_rsentries = prefill_inputs.size();
5052
{
51-
NVTXScopedRange nvtx_scope("NewRequestPrefillWithKVSend matching prefix");
53+
NVTXScopedRange nvtx_scope("DisaggRemoteSend matching prefix");
5254
for (int i = 0; i < num_rsentries; ++i) {
5355
MatchPrefixCache(estate, &prefill_inputs[i]);
5456
}
@@ -183,12 +185,12 @@ class NewRequestPrefillWithKVSendActionObj : public BatchPrefillBaseActionObj {
183185
}
184186

185187
// Explicitly filter the waiting queue to only keep the requests
186-
// with disaggregation request kind "kRemotePrefill".
188+
// with disaggregation request kind "kRemoteSend".
187189
std::vector<Request> waiting_queue;
188190
waiting_queue.reserve(estate->waiting_queue.size());
189191
for (Request request : estate->waiting_queue) {
190192
if (request->generation_cfg->debug_config.disagg_config.kind ==
191-
DisaggRequestKind::kRemotePrefill) {
193+
DisaggRequestKind::kRemoteSend) {
192194
waiting_queue.push_back(request);
193195
}
194196
}
@@ -481,11 +483,11 @@ class NewRequestPrefillWithKVSendActionObj : public BatchPrefillBaseActionObj {
481483
TVMStreamHandle compute_stream_ = nullptr;
482484
};
483485

484-
EngineAction EngineAction::NewRequestPrefillWithKVSend(
486+
EngineAction EngineAction::DisaggRemoteSend(
485487
Array<Model> models, std::vector<ModelWorkspace> model_workspaces, EngineConfig engine_config,
486488
std::vector<picojson::object> model_configs, Optional<EventTraceRecorder> trace_recorder,
487489
FRequestStreamCallback request_stream_callback, Device device) {
488-
return EngineAction(make_object<NewRequestPrefillWithKVSendActionObj>(
490+
return EngineAction(make_object<DisaggRemoteSendActionObj>(
489491
std::move(models), std::move(model_workspaces), std::move(engine_config),
490492
std::move(model_configs), std::move(trace_recorder), std::move(request_stream_callback),
491493
device));

python/mlc_llm/interface/serve.py

+2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from mlc_llm.serve.entrypoints import (
1212
debug_entrypoints,
1313
metrics_entrypoints,
14+
microserving_entrypoints,
1415
openai_entrypoints,
1516
)
1617
from mlc_llm.serve.server import ServerContext
@@ -95,6 +96,7 @@ def serve(
9596

9697
app.include_router(openai_entrypoints.app)
9798
app.include_router(metrics_entrypoints.app)
99+
app.include_router(microserving_entrypoints.app)
98100

99101
server_context.enable_debug = enable_debug
100102

python/mlc_llm/protocol/debug_protocol.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,17 @@
88
class DisaggConfig(BaseModel):
99
"""The class of metadata used in microserving APIs."""
1010

11-
kind: Optional[Literal["prepare_prefill", "remote_prefill", "start_decode"]] = None
11+
kind: Optional[Literal["prepare_receive", "remote_send", "start_generation"]] = None
1212
# "kv_append_metadata" is base64-encoded and is thus a string.
1313
kv_append_metadata: Optional[str] = None
1414
# "kv_window_begin" and "kv_window_end" denote the KV interval of interest.
1515
# "kv_window_end" supports Python style negative indexing.
1616
# The concrete meaning varies for different special request kind:
17-
# - For "prepare_prefill", the begin is always 0, and "[0:end]" denotes
17+
# - For "prepare_receive", the begin is always 0, and "[0:end]" denotes
1818
# the KV range to prefill on a prefill instance.
19-
# - For "remote_prefill", "[begin:end]" means the KV range to compute prefill
19+
# - For "remote_send", "[begin:end]" means the KV range to compute prefill
2020
# and send to the decode instance.
21-
# - For "start_decode", the end is always None, and "[begin:]" denotes
21+
# - For "start_generation", the end is always None, and "[begin:]" denotes
2222
# the KV range to prefill locally on the decode instance.
2323
kv_window_begin: Optional[int] = None
2424
kv_window_end: Optional[int] = None
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""Protocols in MLC LLM for MicroServing."""
2+
3+
from pydantic import BaseModel
4+
5+
from mlc_llm.protocol.openai_api_protocol import CompletionRequest
6+
7+
8+
class PrepRecvRequest(CompletionRequest):
9+
"""The extra request body for prep_recv request in MicroServing.
10+
11+
Attributes
12+
----------
13+
kv_window_end : int
14+
[0, kv_window_end] denotes the KV range of the prompt to prefill on
15+
a prefill instance.
16+
The entries of this KV range will be allocated on the decode instance.
17+
"""
18+
19+
kv_window_end: int
20+
21+
22+
class PrepRecvResponse(BaseModel):
23+
"""The response body for prep_recv request in MicroServing.
24+
25+
Attributes
26+
----------
27+
prompt_length : int
28+
The length of the request prompt in tokens.
29+
30+
prefix_matched_length : int
31+
The matched common prefix length on the decode instance when
32+
prefix cache is enabled, or 0 if there is no prefix cache.
33+
34+
kv_append_metadata : str
35+
The metadata of the KV range on the destination decode instance.
36+
"""
37+
38+
prompt_length: int
39+
prefix_matched_length: int
40+
kv_append_metadata: str
41+
42+
43+
class RemoteSendRequest(CompletionRequest):
44+
"""The extra request body for remote_send request in MicroServing.
45+
46+
Attributes
47+
----------
48+
kv_window_begin : int
49+
Denote the start of the KV range to prefill.
50+
51+
kv_window_end : int
52+
Denote the end of the KV range to prefill.
53+
54+
kv_append_metadata : str
55+
The metadata of the KV range on the destination decode instance.
56+
57+
dst_group_offset : int
58+
The node group offset of the destination decode instance.
59+
"""
60+
61+
kv_window_begin: int
62+
kv_window_end: int
63+
kv_append_metadata: str
64+
dst_group_offset: int
65+
66+
67+
class StartGenerateRequest(CompletionRequest):
68+
"""The extra request body for start_generate request in MicroServing.
69+
70+
Attributes
71+
----------
72+
kv_window_begin : int
73+
Denote the start of the KV range to prefill on the decode instance.
74+
"""
75+
76+
kv_window_begin: int

0 commit comments

Comments
 (0)