diff --git a/samples/cpp/text_generation/speculative_decoding_lm.cpp b/samples/cpp/text_generation/speculative_decoding_lm.cpp
index 426e056a2d..fa59de95b8 100644
--- a/samples/cpp/text_generation/speculative_decoding_lm.cpp
+++ b/samples/cpp/text_generation/speculative_decoding_lm.cpp
@@ -6,71 +6,6 @@
 #include "openvino/genai/llm_pipeline.hpp"
 
 int main(int argc, char* argv[]) try {
-    // to: remove
-    // if (5 != argc) {
-    //     throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''");
-    // }
-
-    // ov::genai::GenerationConfig config;
-    // config.max_new_tokens = 500;
-    // // Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually excluded
-    // // add parameter to enable speculative decoding to generate `num_assistant_tokens` candidates by draft_model per iteration
-    // config.num_assistant_tokens = 5;
-    // // add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold`
-    // // config.assistant_confidence_threshold = 0.4;
-
-    // std::string main_model_path = argv[1];
-    // std::string draft_model_path = argv[2];
-    // std::string type = argv[3];
-    // std::string prompt = argv[4];
-
-    // // User can run main and draft model on different devices.
-    // // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft.
-    // std::string main_device = "CPU", draft_device = "CPU";
-
-    // auto streamer = [](std::string subword) {
-    //     std::cout << subword << std::flush;
-    //     return false;
-    // };
-
-    // if (type == "cb") {
-    //     // std::cout << "CB" << std::endl;
-
-    //     ov::genai::LLMPipeline pipe(
-    //         main_model_path,
-    //         main_device,
-    //         ov::genai::scheduler_config(ov::genai::SchedulerConfig())
-    //         // ov::genai::draft_model(draft_model_path, draft_device)
-    //     );
-
-    //     // Since the streamer is set, the results will
-    //     // be printed each time a new token is generated.
-    //     pipe.generate(prompt, config, streamer);
-    // } else if (type == "sd") {
-    //     // std::cout << "SD" << std::endl;
-    //     ov::genai::LLMPipeline pipe(
-    //         main_model_path,
-    //         main_device,
-    //         ov::genai::draft_model(draft_model_path, draft_device)
-    //     );
-
-    //     // Since the streamer is set, the results will
-    //     // be printed each time a new token is generated.
-    //     pipe.generate(prompt, config, streamer);
-    // } else {
-    //     config.max_ngram_size = 3;
-    //     // std::cout << "PL" << std::endl;
-    //     ov::genai::LLMPipeline pipe(
-    //         main_model_path,
-    //         main_device,
-    //         ov::genai::prompt_lookup(true)
-    //     );
-
-    //     // Since the streamer is set, the results will
-    //     // be printed each time a new token is generated.
-    //     pipe.generate(prompt, config, streamer);
-    // }
-
     if (4 != argc) {
         throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''");
     }
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index ecda9a88f1..45c1d0f630 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -264,7 +264,6 @@ std::vector<EncodedGenerationResult>
 ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<ov::Tensor>& input_ids,
                                                              const std::vector<GenerationConfig>& sampling_params,
                                                              const StreamerVariant& streamer) {
-    // todo: remove
     ManualTimer generate_timer("generate()");
     generate_timer.start();
 
@@ -308,29 +307,18 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector
     std::atomic<bool> has_active_request = has_non_finished_requests();
     GenerationHandle& generation = generations.at(0);
 
-    // todo: remove
-    float streaming_duraton = 0, thread_duration = 0;
-
+    // create variables to make optimal thread-safe streaming
     std::mutex mutex;
     std::unique_lock<std::mutex> lock(mutex);
     std::condition_variable cv;
 
-    // todo: remove
-    ManualTimer thread_timer("streaming");
-    thread_timer.start();
-
     // define stream token lambda to use in `t_stream`
-    auto stream_tokens = [&generation, &streamer_ptr, &streaming_duraton, &has_active_request, &cv, &lock]() {
+    auto stream_tokens = [&generation, &streamer_ptr, &has_active_request, &cv, &lock]() {
         while (!generation->is_dropped() && (has_active_request || streamer_ptr && generation->can_read())) {
             // waiting for any tokens or request finishing
             cv.wait(lock, [&generation, &has_active_request]{ return generation->can_read() || !has_active_request; });
-
             if (streamer_ptr && generation->can_read()) {
-                // todo: remove
-                ManualTimer streaming_timer("streaming");
-                streaming_timer.start();
-
                 std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
                 for (const auto& gen_token : token.begin()->second.generated_ids) {
                     if (streamer_ptr->put(gen_token)) {
@@ -339,10 +327,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector
         stream_tokens();
     });
 
-    // todo: remove
-    thread_timer.end();
-    thread_duration += thread_timer.get_duration();
-
     while (!generation->is_dropped() && has_active_request) {
         try {
@@ -431,12 +411,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector
     get_awaiting_requests();
-    // todo: shouls be removed
-    float streaming_duraton = 0, thread_duration = 0;
-    ManualTimer streaming_timer("gen");
-    streaming_timer.start();
-
     std::atomic<bool> continue_streaming = true, has_active_request = has_non_finished_requests();
     auto& generation = generations.at(0);
-
     // create variables to make optimal thread-safe streaming
     std::mutex mutex;
     std::unique_lock<std::mutex> lock(mutex);
     std::condition_variable cv;
 
     // define stream token lambda to use in `t_stream`
-    auto stream_tokens = [&generation, &streamer_ptr, &streaming_duraton, &has_active_request, &cv, &lock]() {
+    auto stream_tokens = [&generation, &streamer_ptr, &has_active_request, &cv, &lock]() {
         while (!generation->is_dropped() && (has_active_request || streamer_ptr && generation->can_read())) {
             // waiting for any tokens or request finishing
             cv.wait(lock, [&generation, &has_active_request]{ return generation->can_read() || !has_active_request; });
             if (streamer_ptr && generation->can_read()) {
-                // todo: remove
-                ManualTimer streaming_timer("streaming");
-                streaming_timer.start();
-
                 std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
                 for (const auto& gen_token : token.begin()->second.generated_ids) {
                     if (streamer_ptr->put(gen_token)) {
@@ -144,26 +134,14 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector
     std::atomic<bool> has_active_request = has_non_finished_requests();
     auto& generation = main_generations.at(0);
 
-    // todo: remove
-    ManualTimer thread_timer("threading");
-    thread_timer.start();
-
     // create variables to make optimal thread-safe streaming
     std::mutex mutex;
     std::unique_lock<std::mutex> lock(mutex);
     std::condition_variable cv;
 
     // define stream token lambda to use in `t_stream`
-    auto stream_tokens = [&generation, &streamer_ptr, &streaming_duraton, &has_active_request, &cv, &lock]() {
+    auto stream_tokens = [&generation, &streamer_ptr, &has_active_request, &cv, &lock]() {
         while (!generation->is_dropped() && (has_active_request || streamer_ptr && generation->can_read())) {
             // waiting for any tokens or request finishing
             cv.wait(lock, [&generation, &has_active_request]{ return generation->can_read() || !has_active_request; });
 
             if (streamer_ptr && generation->can_read()) {
-                // todo: remove
-                ManualTimer streaming_timer("streaming");
-                streaming_timer.start();
-
                 std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
                 for (const auto& gen_token : token.begin()->second.generated_ids) {
                     if (streamer_ptr->put(gen_token)) {
@@ -273,10 +260,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
                         break;
                     }
                 }
-
-                // todo: remove
-                streaming_timer.end();
-                streaming_duraton += streaming_timer.get_duration();
             }
         };
     };
@@ -286,10 +269,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
         stream_tokens();
     });
 
-    // todo: remove
-    thread_timer.end();
-    thread_duration += thread_timer.get_duration();
-
     while (!generation->is_dropped() && has_active_request) {
         try {
             const auto infer_start = std::chrono::steady_clock::now();
@@ -351,11 +330,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
     OPENVINO_ASSERT(results.size() == input_ids.size());
     generate_timer.end();
-
-    // todo: remove
-    // std::cout << std::endl << "STREAMING DURATION: " << streaming_duraton << std::endl;
-    // std::cout << "GENERATION DURATION: " << generate_timer.get_duration() << std::endl;
-    // std::cout << "THREAD CREATION DURATION: " << thread_duration << std::endl;
     return results;
 }
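Note: the code this cleanup keeps follows a standard producer/consumer streaming pattern — the generation loop produces tokens and notifies a condition variable, while a dedicated thread waits until tokens can be read (or the request finishes) and forwards them to the streamer. Below is a minimal, self-contained sketch of that pattern in plain standard C++; the token queue, the has_active_request flag, and the callback are illustrative stand-ins, not openvino.genai APIs.

// streaming_pattern_sketch.cpp — minimal sketch of the thread-safe streaming pattern above.
// A producer pushes token ids and notifies; a consumer thread waits on a condition
// variable until tokens are available or generation has finished, then forwards them
// to a streamer callback. All names here are illustrative, not openvino.genai APIs.
#include <condition_variable>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

int main() {
    std::queue<int64_t> tokens;      // stand-in for the tokens readable from a generation handle
    bool has_active_request = true;  // stand-in for has_non_finished_requests()
    std::mutex mutex;
    std::condition_variable cv;

    // streamer callback: returning true would request dropping the generation
    auto streamer = [](int64_t token) {
        std::cout << token << ' ' << std::flush;
        return false;
    };

    // consumer thread, analogous to the `stream_tokens` lambda run in `t_stream`
    std::thread t_stream([&] {
        std::unique_lock<std::mutex> lock(mutex);
        while (has_active_request || !tokens.empty()) {
            // waiting for any tokens or request finishing
            cv.wait(lock, [&] { return !tokens.empty() || !has_active_request; });
            while (!tokens.empty()) {
                int64_t token = tokens.front();
                tokens.pop();
                if (streamer(token)) {
                    has_active_request = false;  // analogous to dropping the generation
                    break;
                }
            }
        }
    });

    // producer loop, standing in for the pipeline's step()/sampling loop
    for (int64_t i = 0; i < 10; ++i) {
        {
            std::lock_guard<std::mutex> guard(mutex);
            tokens.push(i);
        }
        cv.notify_all();
    }
    {
        std::lock_guard<std::mutex> guard(mutex);
        has_active_request = false;
    }
    cv.notify_all();

    t_stream.join();
    std::cout << std::endl;
    return 0;
}

The pipelines in the diff additionally re-check generation->is_dropped() and can_read() after waking, but the synchronization skeleton — a predicate-based cv.wait on the streaming thread plus a notify from the generation loop — is the same, which is what the removed ManualTimer instrumentation was measuring.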