remove debug info
iefode committed Jan 17, 2025
1 parent d83b538 commit 075245e
Showing 4 changed files with 4 additions and 146 deletions.
65 changes: 0 additions & 65 deletions samples/cpp/text_generation/speculative_decoding_lm.cpp
@@ -6,71 +6,6 @@
#include "openvino/genai/llm_pipeline.hpp"

int main(int argc, char* argv[]) try {
// to: remove
// if (5 != argc) {
// throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <DRAFT_MODEL_DIR> <TYPE> '<PROMPT>'");
// }

// ov::genai::GenerationConfig config;
// config.max_new_tokens = 500;
// // Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually excluded
// // add parameter to enable speculative decoding to generate `num_assistant_tokens` candidates by draft_model per iteration
// config.num_assistant_tokens = 5;
// // add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold`
// // config.assistant_confidence_threshold = 0.4;

// std::string main_model_path = argv[1];
// std::string draft_model_path = argv[2];
// std::string type = argv[3];
// std::string prompt = argv[4];

// // User can run main and draft model on different devices.
// // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft.
// std::string main_device = "CPU", draft_device = "CPU";

// auto streamer = [](std::string subword) {
// std::cout << subword << std::flush;
// return false;
// };

// if (type == "cb") {
// // std::cout << "CB" << std::endl;

// ov::genai::LLMPipeline pipe(
// main_model_path,
// main_device,
// ov::genai::scheduler_config(ov::genai::SchedulerConfig())
// // ov::genai::draft_model(draft_model_path, draft_device)
// );

// // Since the streamer is set, the results will
// // be printed each time a new token is generated.
// pipe.generate(prompt, config, streamer);
// } else if (type == "sd") {
// // std::cout << "SD" << std::endl;
// ov::genai::LLMPipeline pipe(
// main_model_path,
// main_device,
// ov::genai::draft_model(draft_model_path, draft_device)
// );

// // Since the streamer is set, the results will
// // be printed each time a new token is generated.
// pipe.generate(prompt, config, streamer);
// } else {
// config.max_ngram_size = 3;
// // std::cout << "PL" << std::endl;
// ov::genai::LLMPipeline pipe(
// main_model_path,
// main_device,
// ov::genai::prompt_lookup(true)
// );

// // Since the streamer is set, the results will
// // be printed each time a new token is generated.
// pipe.generate(prompt, config, streamer);
// }

if (4 != argc) {
throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <DRAFT_MODEL_DIR> '<PROMPT>'");
}
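For context, the comments removed above document how the speculative-decoding sample is driven: a GenerationConfig with `num_assistant_tokens` (or, alternatively, `assistant_confidence_threshold`), an `ov::genai::LLMPipeline` constructed with `ov::genai::draft_model`, and a streamer callback. Below is a minimal sketch of that usage assembled from those removed lines; it is an illustration of the API, not the exact contents of the trimmed sample file.

#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

int main(int argc, char* argv[]) try {
    if (4 != argc) {
        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <DRAFT_MODEL_DIR> '<PROMPT>'");
    }

    const std::string main_model_path = argv[1];
    const std::string draft_model_path = argv[2];
    const std::string prompt = argv[3];

    // The main and draft models may run on different devices.
    const std::string main_device = "CPU", draft_device = "CPU";

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 500;
    // Speculative decoding: the draft model proposes `num_assistant_tokens` candidates per iteration.
    // `num_assistant_tokens` and `assistant_confidence_threshold` are mutually exclusive.
    config.num_assistant_tokens = 5;

    // Print each subword as soon as it is generated; returning false lets generation continue.
    auto streamer = [](std::string subword) {
        std::cout << subword << std::flush;
        return false;
    };

    ov::genai::LLMPipeline pipe(
        main_model_path,
        main_device,
        ov::genai::draft_model(draft_model_path, draft_device));

    // Since the streamer is set, results are printed each time a new token is generated.
    pipe.generate(prompt, config, streamer);
    return EXIT_SUCCESS;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return EXIT_FAILURE;
}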
29 changes: 2 additions & 27 deletions src/cpp/src/continuous_batching_impl.cpp
@@ -264,7 +264,6 @@ std::vector<EncodedGenerationResult>
ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<ov::Tensor>& input_ids,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) {
// todo: remove
ManualTimer generate_timer("generate()");
generate_timer.start();

@@ -308,29 +307,18 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o

std::atomic<bool> has_active_request = has_non_finished_requests();
GenerationHandle& generation = generations.at(0);
// todo: remove
float streaming_duraton = 0, thread_duration = 0;


// create variables to make optimal thread-safe streaming
std::mutex mutex;
std::unique_lock lock(mutex);
std::condition_variable cv;

// todo: remove
ManualTimer thread_timer("streaming");
thread_timer.start();

// define stream token lambda to use in `t_stream`
auto stream_tokens = [&generation, &streamer_ptr, &streaming_duraton, &has_active_request, &cv, &lock]() {
auto stream_tokens = [&generation, &streamer_ptr, &has_active_request, &cv, &lock]() {
while (!generation->is_dropped() && (has_active_request || streamer_ptr && generation->can_read())) {
// waiting for any tokens or request finishing
cv.wait(lock, [&generation, &has_active_request]{ return generation->can_read() || !has_active_request; });

if (streamer_ptr && generation->can_read()) {
// todo: remove
ManualTimer streaming_timer("streaming");
streaming_timer.start();

std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
for (const auto& gen_token : token.begin()->second.generated_ids) {
if (streamer_ptr->put(gen_token)) {
@@ -339,10 +327,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
break;
}
}

// todo: remove
streaming_timer.end();
streaming_duraton += streaming_timer.get_duration();
}
};
};
@@ -351,10 +335,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
std::thread t_stream([&stream_tokens] {
stream_tokens();
});

// todo: remove
thread_timer.end();
thread_duration += thread_timer.get_duration();

while (!generation->is_dropped() && has_active_request) {
try {
@@ -431,12 +411,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o

OPENVINO_ASSERT(results.size() == input_ids.size());

// todo: remove
generate_timer.end();
// std::cout << std::endl << "STREAMING DURATION: " << streaming_duraton << std::endl;
// std::cout << "GENERATION DURATION: " << generate_timer.get_duration() << std::endl;
// std::cout << "THREAD CREATION DURATION: " << thread_duration << std::endl;

return results;
}

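The `stream_tokens` lambda kept here (and duplicated in the prompt-lookup and speculative-decoding implementations below) follows a standard pattern: a dedicated thread waits on a condition variable and forwards freshly generated tokens to the streamer while the main thread keeps stepping the pipeline and notifies the condition variable whenever tokens arrive or generation finishes. The following self-contained sketch shows that pattern with a plain string queue standing in for `GenerationHandle` and `std::cout` for the streamer; all names are illustrative.

#include <atomic>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <string>
#include <thread>

int main() {
    std::queue<std::string> tokens;          // stands in for GenerationHandle output
    std::mutex mutex;
    std::condition_variable cv;
    std::atomic<bool> has_active_request{true};

    // Streaming thread: wait until tokens are available or generation has finished,
    // then flush everything that is ready (the role of `stream_tokens` above).
    std::thread t_stream([&] {
        std::unique_lock<std::mutex> lock(mutex);
        while (has_active_request || !tokens.empty()) {
            cv.wait(lock, [&] { return !tokens.empty() || !has_active_request; });
            while (!tokens.empty()) {
                std::cout << tokens.front() << std::flush;
                tokens.pop();
            }
        }
    });

    // Main thread: produce tokens (the role of the pipeline step() loop),
    // notifying the streaming thread after each one.
    for (const char* piece : {"Hello", ", ", "world", "!\n"}) {
        {
            std::lock_guard<std::mutex> guard(mutex);
            tokens.emplace(piece);
        }
        cv.notify_one();
    }
    has_active_request = false;
    cv.notify_one();

    t_stream.join();
    return 0;
}

Checking the predicate inside `cv.wait` avoids lost wake-ups when the last token or the completion notification arrives between the loop condition and the wait.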
28 changes: 1 addition & 27 deletions src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp
@@ -111,31 +111,21 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector<ov::Ten
}
auto all_requests = m_pipeline->get_awaiting_requests();

// todo: shouls be removed
float streaming_duraton = 0, thread_duration = 0;
ManualTimer streaming_timer("gen");
streaming_timer.start();

std::atomic<bool> continue_streaming = true, has_active_request = has_non_finished_requests();
auto& generation = generations.at(0);


// create variables to make optimal thread-safe streaming
std::mutex mutex;
std::unique_lock lock(mutex);
std::condition_variable cv;

// define stream token lambda to use in `t_stream`
auto stream_tokens = [&generation, &streamer_ptr, &streaming_duraton, &has_active_request, &cv, &lock]() {
auto stream_tokens = [&generation, &streamer_ptr, &has_active_request, &cv, &lock]() {
while (!generation->is_dropped() && (has_active_request || streamer_ptr && generation->can_read())) {
// waiting for any tokens or request finishing
cv.wait(lock, [&generation, &has_active_request]{ return generation->can_read() || !has_active_request; });

if (streamer_ptr && generation->can_read()) {
// todo: remove
ManualTimer streaming_timer("streaming");
streaming_timer.start();

std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
for (const auto& gen_token : token.begin()->second.generated_ids) {
if (streamer_ptr->put(gen_token)) {
@@ -144,26 +134,14 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector<ov::Ten
break;
}
}

// todo: remove
streaming_timer.end();
streaming_duraton += streaming_timer.get_duration();
}
};
};

// todo: remove
ManualTimer thread_timer("threading");
thread_timer.start();

// to define streaming thread
std::thread t_stream([&stream_tokens] {
stream_tokens();
});

// todo: remove
thread_timer.end();
thread_duration += thread_timer.get_duration();

while (continue_streaming && has_active_request) {
try {
@@ -225,10 +203,6 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector<ov::Ten

OPENVINO_ASSERT(results.size() == input_ids.size());
generate_timer.end();
// todo: remove
// std::cout << std::endl << "STREAMING DURATION: " << streaming_duraton << std::endl;
// std::cout << "GENERATION DURATION: " << generate_timer.get_duration() << std::endl;
// std::cout << "THREAD CREATION DURATION: " << thread_duration << std::endl;
return results;
}

28 changes: 1 addition & 27 deletions src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -237,34 +237,21 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
}
auto all_requests = get_awaiting_requests();

// todo: remove
float streaming_duraton = 0, thread_duration = 0;
ManualTimer streaming_timer("gen");
streaming_timer.start();

std::atomic<bool> has_active_request = has_non_finished_requests();
auto& generation = main_generations.at(0);

// todo: remove
ManualTimer thread_timer("threading");
thread_timer.start();

// create variables to make optimal thread-safe streaming
std::mutex mutex;
std::unique_lock lock(mutex);
std::condition_variable cv;

// define stream token lambda to use in `t_stream`
auto stream_tokens = [&generation, &streamer_ptr, &streaming_duraton, &has_active_request, &cv, &lock]() {
auto stream_tokens = [&generation, &streamer_ptr, &has_active_request, &cv, &lock]() {
while (!generation->is_dropped() && (has_active_request || streamer_ptr && generation->can_read())) {
// waiting for any tokens or request finishing
cv.wait(lock, [&generation, &has_active_request]{ return generation->can_read() || !has_active_request; });

if (streamer_ptr && generation->can_read()) {
// todo: remove
ManualTimer streaming_timer("streaming");
streaming_timer.start();

std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
for (const auto& gen_token : token.begin()->second.generated_ids) {
if (streamer_ptr->put(gen_token)) {
@@ -273,10 +260,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
break;
}
}

// todo: remove
streaming_timer.end();
streaming_duraton += streaming_timer.get_duration();
}
};
};
@@ -286,10 +269,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
stream_tokens();
});

// todo: remove
thread_timer.end();
thread_duration += thread_timer.get_duration();

while (!generation->is_dropped() && has_active_request) {
try {
const auto infer_start = std::chrono::steady_clock::now();
Expand Down Expand Up @@ -351,11 +330,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<

OPENVINO_ASSERT(results.size() == input_ids.size());
generate_timer.end();

// todo: remove
// std::cout << std::endl << "STREAMING DURATION: " << streaming_duraton << std::endl;
// std::cout << "GENERATION DURATION: " << generate_timer.get_duration() << std::endl;
// std::cout << "THREAD CREATION DURATION: " << thread_duration << std::endl;
return results;
}

