remove debug info
iefode committed Jan 17, 2025
1 parent d83b538 commit 075245e
Showing 4 changed files with 4 additions and 146 deletions.
65 changes: 0 additions & 65 deletions samples/cpp/text_generation/speculative_decoding_lm.cpp
@@ -6,71 +6,6 @@
#include "openvino/genai/llm_pipeline.hpp"

int main(int argc, char* argv[]) try {
// to: remove
// if (5 != argc) {
// throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <DRAFT_MODEL_DIR> <TYPE> '<PROMPT>'");
// }

// ov::genai::GenerationConfig config;
// config.max_new_tokens = 500;
// // Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually excluded
// // add parameter to enable speculative decoding to generate `num_assistant_tokens` candidates by draft_model per iteration
// config.num_assistant_tokens = 5;
// // add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold`
// // config.assistant_confidence_threshold = 0.4;

// std::string main_model_path = argv[1];
// std::string draft_model_path = argv[2];
// std::string type = argv[3];
// std::string prompt = argv[4];

// // User can run main and draft model on different devices.
// // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft.
// std::string main_device = "CPU", draft_device = "CPU";

// auto streamer = [](std::string subword) {
// std::cout << subword << std::flush;
// return false;
// };

// if (type == "cb") {
// // std::cout << "CB" << std::endl;

// ov::genai::LLMPipeline pipe(
// main_model_path,
// main_device,
// ov::genai::scheduler_config(ov::genai::SchedulerConfig())
// // ov::genai::draft_model(draft_model_path, draft_device)
// );

// // Since the streamer is set, the results will
// // be printed each time a new token is generated.
// pipe.generate(prompt, config, streamer);
// } else if (type == "sd") {
// // std::cout << "SD" << std::endl;
// ov::genai::LLMPipeline pipe(
// main_model_path,
// main_device,
// ov::genai::draft_model(draft_model_path, draft_device)
// );

// // Since the streamer is set, the results will
// // be printed each time a new token is generated.
// pipe.generate(prompt, config, streamer);
// } else {
// config.max_ngram_size = 3;
// // std::cout << "PL" << std::endl;
// ov::genai::LLMPipeline pipe(
// main_model_path,
// main_device,
// ov::genai::prompt_lookup(true)
// );

// // Since the streamer is set, the results will
// // be printed each time a new token is generated.
// pipe.generate(prompt, config, streamer);
// }

if (4 != argc) {
throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <DRAFT_MODEL_DIR> '<PROMPT>'");
}
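For context, the comments removed above document how the speculative-decoding sample is driven: a GenerationConfig with `num_assistant_tokens` (or, alternatively, `assistant_confidence_threshold`), an `ov::genai::LLMPipeline` constructed with `ov::genai::draft_model`, and a streamer callback. Below is a minimal sketch of that usage assembled from those removed lines; it is an illustration of the API, not the exact contents of the trimmed sample file.

#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

int main(int argc, char* argv[]) try {
    if (4 != argc) {
        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <DRAFT_MODEL_DIR> '<PROMPT>'");
    }

    const std::string main_model_path = argv[1];
    const std::string draft_model_path = argv[2];
    const std::string prompt = argv[3];

    // The main and draft models may run on different devices.
    const std::string main_device = "CPU", draft_device = "CPU";

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 500;
    // Speculative decoding: the draft model proposes `num_assistant_tokens` candidates per iteration.
    // `num_assistant_tokens` and `assistant_confidence_threshold` are mutually exclusive.
    config.num_assistant_tokens = 5;

    // Print each subword as soon as it is generated; returning false lets generation continue.
    auto streamer = [](std::string subword) {
        std::cout << subword << std::flush;
        return false;
    };

    ov::genai::LLMPipeline pipe(
        main_model_path,
        main_device,
        ov::genai::draft_model(draft_model_path, draft_device));

    // Since the streamer is set, results are printed each time a new token is generated.
    pipe.generate(prompt, config, streamer);
    return EXIT_SUCCESS;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return EXIT_FAILURE;
}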
29 changes: 2 additions & 27 deletions src/cpp/src/continuous_batching_impl.cpp
@@ -264,7 +264,6 @@ std::vector<EncodedGenerationResult>
ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<ov::Tensor>& input_ids,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) {
// todo: remove
ManualTimer generate_timer("generate()");
generate_timer.start();

@@ -308,29 +307,18 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o

std::atomic<bool> has_active_request = has_non_finished_requests();
GenerationHandle& generation = generations.at(0);
// todo: remove
float streaming_duraton = 0, thread_duration = 0;


// create variables to make optimal thread-safe streaming
std::mutex mutex;
std::unique_lock lock(mutex);
std::condition_variable cv;

// todo: remove
ManualTimer thread_timer("streaming");
thread_timer.start();

// define stream token lambda to use in `t_stream`
auto stream_tokens = [&generation, &streamer_ptr, &streaming_duraton, &has_active_request, &cv, &lock]() {
auto stream_tokens = [&generation, &streamer_ptr, &has_active_request, &cv, &lock]() {
while (!generation->is_dropped() && (has_active_request || streamer_ptr && generation->can_read())) {
// waiting for any tokens or request finishing
cv.wait(lock, [&generation, &has_active_request]{ return generation->can_read() || !has_active_request; });

if (streamer_ptr && generation->can_read()) {
// todo: remove
ManualTimer streaming_timer("streaming");
streaming_timer.start();

std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
for (const auto& gen_token : token.begin()->second.generated_ids) {
if (streamer_ptr->put(gen_token)) {
@@ -339,10 +327,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
break;
}
}

// todo: remove
streaming_timer.end();
streaming_duraton += streaming_timer.get_duration();
}
};
};
@@ -351,10 +335,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
std::thread t_stream([&stream_tokens] {
stream_tokens();
});

// todo: remove
thread_timer.end();
thread_duration += thread_timer.get_duration();

while (!generation->is_dropped() && has_active_request) {
try {
@@ -431,12 +411,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o

OPENVINO_ASSERT(results.size() == input_ids.size());

// todo: remove
generate_timer.end();
// std::cout << std::endl << "STREAMING DURATION: " << streaming_duraton << std::endl;
// std::cout << "GENERATION DURATION: " << generate_timer.get_duration() << std::endl;
// std::cout << "THREAD CREATION DURATION: " << thread_duration << std::endl;

return results;
}

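The `stream_tokens` lambda kept here (and duplicated in the prompt-lookup and speculative-decoding implementations below) follows a standard pattern: a dedicated thread waits on a condition variable and forwards freshly generated tokens to the streamer while the main thread keeps stepping the pipeline and notifies the condition variable whenever tokens arrive or generation finishes. The following self-contained sketch shows that pattern with a plain string queue standing in for `GenerationHandle` and `std::cout` for the streamer; all names are illustrative.

#include <atomic>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <string>
#include <thread>

int main() {
    std::queue<std::string> tokens;          // stands in for GenerationHandle output
    std::mutex mutex;
    std::condition_variable cv;
    std::atomic<bool> has_active_request{true};

    // Streaming thread: wait until tokens are available or generation has finished,
    // then flush everything that is ready (the role of `stream_tokens` above).
    std::thread t_stream([&] {
        std::unique_lock<std::mutex> lock(mutex);
        while (has_active_request || !tokens.empty()) {
            cv.wait(lock, [&] { return !tokens.empty() || !has_active_request; });
            while (!tokens.empty()) {
                std::cout << tokens.front() << std::flush;
                tokens.pop();
            }
        }
    });

    // Main thread: produce tokens (the role of the pipeline step() loop),
    // notifying the streaming thread after each one.
    for (const char* piece : {"Hello", ", ", "world", "!\n"}) {
        {
            std::lock_guard<std::mutex> guard(mutex);
            tokens.emplace(piece);
        }
        cv.notify_one();
    }
    has_active_request = false;
    cv.notify_one();

    t_stream.join();
    return 0;
}

Checking the predicate inside `cv.wait` avoids lost wake-ups when the last token or the completion notification arrives between the loop condition and the wait.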
28 changes: 1 addition & 27 deletions src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp
@@ -111,31 +111,21 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector<ov::Ten
}
auto all_requests = m_pipeline->get_awaiting_requests();

// todo: shouls be removed
float streaming_duraton = 0, thread_duration = 0;
ManualTimer streaming_timer("gen");
streaming_timer.start();

std::atomic<bool> continue_streaming = true, has_active_request = has_non_finished_requests();
auto& generation = generations.at(0);


// create variables to make optimal thread-safe streaming
std::mutex mutex;
std::unique_lock lock(mutex);
std::condition_variable cv;

// define stream token lambda to use in `t_stream`
auto stream_tokens = [&generation, &streamer_ptr, &streaming_duraton, &has_active_request, &cv, &lock]() {
auto stream_tokens = [&generation, &streamer_ptr, &has_active_request, &cv, &lock]() {
while (!generation->is_dropped() && (has_active_request || streamer_ptr && generation->can_read())) {
// waiting for any tokens or request finishing
cv.wait(lock, [&generation, &has_active_request]{ return generation->can_read() || !has_active_request; });

if (streamer_ptr && generation->can_read()) {
// todo: remove
ManualTimer streaming_timer("streaming");
streaming_timer.start();

std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
for (const auto& gen_token : token.begin()->second.generated_ids) {
if (streamer_ptr->put(gen_token)) {
@@ -144,26 +134,14 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector<ov::Ten
break;
}
}

// todo: remove
streaming_timer.end();
streaming_duraton += streaming_timer.get_duration();
}
};
};

// todo: remove
ManualTimer thread_timer("threading");
thread_timer.start();

// to define streaming thread
std::thread t_stream([&stream_tokens] {
stream_tokens();
});

// todo: remove
thread_timer.end();
thread_duration += thread_timer.get_duration();

while (continue_streaming && has_active_request) {
try {
@@ -225,10 +203,6 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector<ov::Ten

OPENVINO_ASSERT(results.size() == input_ids.size());
generate_timer.end();
// todo: remove
// std::cout << std::endl << "STREAMING DURATION: " << streaming_duraton << std::endl;
// std::cout << "GENERATION DURATION: " << generate_timer.get_duration() << std::endl;
// std::cout << "THREAD CREATION DURATION: " << thread_duration << std::endl;
return results;
}

28 changes: 1 addition & 27 deletions src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -237,34 +237,21 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
}
auto all_requests = get_awaiting_requests();

// todo: remove
float streaming_duraton = 0, thread_duration = 0;
ManualTimer streaming_timer("gen");
streaming_timer.start();

std::atomic<bool> has_active_request = has_non_finished_requests();
auto& generation = main_generations.at(0);

// todo: remove
ManualTimer thread_timer("threading");
thread_timer.start();

// create variables to make optimal thread-safe streaming
std::mutex mutex;
std::unique_lock lock(mutex);
std::condition_variable cv;

// define stream token lambda to use in `t_stream`
auto stream_tokens = [&generation, &streamer_ptr, &streaming_duraton, &has_active_request, &cv, &lock]() {
auto stream_tokens = [&generation, &streamer_ptr, &has_active_request, &cv, &lock]() {
while (!generation->is_dropped() && (has_active_request || streamer_ptr && generation->can_read())) {
// waiting for any tokens or request finishing
cv.wait(lock, [&generation, &has_active_request]{ return generation->can_read() || !has_active_request; });

if (streamer_ptr && generation->can_read()) {
// todo: remove
ManualTimer streaming_timer("streaming");
streaming_timer.start();

std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
for (const auto& gen_token : token.begin()->second.generated_ids) {
if (streamer_ptr->put(gen_token)) {
@@ -273,10 +260,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
break;
}
}

// todo: remove
streaming_timer.end();
streaming_duraton += streaming_timer.get_duration();
}
};
};
@@ -286,10 +269,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
stream_tokens();
});

// todo: remove
thread_timer.end();
thread_duration += thread_timer.get_duration();

while (!generation->is_dropped() && has_active_request) {
try {
const auto infer_start = std::chrono::steady_clock::now();
Expand Down Expand Up @@ -351,11 +330,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<

OPENVINO_ASSERT(results.size() == input_ids.size());
generate_timer.end();

// todo: remove
// std::cout << std::endl << "STREAMING DURATION: " << streaming_duraton << std::endl;
// std::cout << "GENERATION DURATION: " << generate_timer.get_duration() << std::endl;
// std::cout << "THREAD CREATION DURATION: " << thread_duration << std::endl;
return results;
}

