Skip to content

Commit

Permalink
Fix & fuzz ZSTD_generateSequences
Browse files Browse the repository at this point in the history
This function was seriously flawed:
* It didn't do output bounds checks
* It produced invalid sequences when an uncompressed or RLE block was emitted
* It produced invalid sequences when the block splitter was enabled
* It produced invalid sequences when ZSTD_c_targetCBlockSize was enabled

I've attempted to fix these issues, but this function is just a bad idea,
so I've marked it as deprecated and unsafe. We should replace it with
`ZSTD_extractSequences()` which operates on a compressed frame.
  • Loading branch information
terrelln committed Mar 21, 2024
1 parent 741b87b commit 731f4b7
Show file tree
Hide file tree
Showing 8 changed files with 231 additions and 56 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ port: sparc64 support validation in CI, by @Cyan4973
port: AIX compatibility, by @likema
port: HP-UX compatibility, by @likema
doc: Improved specification accuracy, by @elasota
bug: Fix and deprecate ZSTD_generateSequences (#3981)

v1.5.5 (Apr 2023)
fix: fix rare corruption bug affecting the high compression mode, reported by @danlark1 (#3517, @terrelln)
Expand Down
127 changes: 85 additions & 42 deletions lib/compress/zstd_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -3361,29 +3361,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
return ZSTDbss_compress;
}

static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM])
{
const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
const seqDef* seqStoreSeqs = seqStore->sequencesStart;
size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs;
size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart);
size_t literalsRead = 0;
size_t lastLLSize;
const seqDef* inSeqs = seqStore->sequencesStart;
const size_t nbInSequences = seqStore->sequences - inSeqs;
const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart);

ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex;
const size_t nbOutSequences = nbInSequences + 1;
size_t nbOutLiterals = 0;
repcodes_t repcodes;
size_t i;
repcodes_t updatedRepcodes;

assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
/* Ensure we have enough space for last literals "sequence" */
assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);
ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
for (i = 0; i < seqStoreSeqSize; ++i) {
U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM;
outSeqs[i].litLength = seqStoreSeqs[i].litLength;
outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH;
/* Bounds check that we have enough space for every input sequence
* and the block delimiter
*/
assert(seqCollector->seqIndex <= seqCollector->maxSequences);
RETURN_ERROR_IF(
nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex),
dstSize_tooSmall,
"Not enough space to copy sequences");

ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes));
for (i = 0; i < nbInSequences; ++i) {
U32 rawOffset;
outSeqs[i].litLength = inSeqs[i].litLength;
outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH;
outSeqs[i].rep = 0;

/* Handle the possible single length >= 64K
* There can only be one because we add MINMATCH to every match length,
* and blocks are at most 128K.
*/
if (i == seqStore->longLengthPos) {
if (seqStore->longLengthType == ZSTD_llt_literalLength) {
outSeqs[i].litLength += 0x10000;
Expand All @@ -3392,41 +3401,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
}
}

if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) {
/* Derive the correct offset corresponding to a repcode */
outSeqs[i].rep = seqStoreSeqs[i].offBase;
/* Determine the raw offset given the offBase, which may be a repcode. */
if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) {
const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase);
assert(repcode > 0);
outSeqs[i].rep = repcode;
if (outSeqs[i].litLength != 0) {
rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1];
rawOffset = repcodes.rep[repcode - 1];
} else {
if (outSeqs[i].rep == 3) {
rawOffset = updatedRepcodes.rep[0] - 1;
if (repcode == 3) {
assert(repcodes.rep[0] > 1);
rawOffset = repcodes.rep[0] - 1;
} else {
rawOffset = updatedRepcodes.rep[outSeqs[i].rep];
rawOffset = repcodes.rep[repcode];
}
}
} else {
rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase);
}
outSeqs[i].offset = rawOffset;
/* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
so we provide seqStoreSeqs[i].offset - 1 */
ZSTD_updateRep(updatedRepcodes.rep,
seqStoreSeqs[i].offBase,
seqStoreSeqs[i].litLength == 0);
literalsRead += outSeqs[i].litLength;

/* Update repcode history for the sequence */
ZSTD_updateRep(repcodes.rep,
inSeqs[i].offBase,
inSeqs[i].litLength == 0);

nbOutLiterals += outSeqs[i].litLength;
}
/* Insert last literals (if any exist) in the block as a sequence with ml == off == 0.
* If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker
* for the block boundary, according to the API.
*/
assert(seqStoreLiteralsSize >= literalsRead);
lastLLSize = seqStoreLiteralsSize - literalsRead;
outSeqs[i].litLength = (U32)lastLLSize;
outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0;
seqStoreSeqSize++;
zc->seqCollector.seqIndex += seqStoreSeqSize;
assert(nbInLiterals >= nbOutLiterals);
{
const size_t lastLLSize = nbInLiterals - nbOutLiterals;
outSeqs[nbInSequences].litLength = (U32)lastLLSize;
outSeqs[nbInSequences].matchLength = 0;
outSeqs[nbInSequences].offset = 0;
assert(nbOutSequences == nbInSequences + 1);
}
seqCollector->seqIndex += nbOutSequences;
assert(seqCollector->seqIndex <= seqCollector->maxSequences);

return 0;
}

size_t ZSTD_sequenceBound(size_t srcSize) {
return (srcSize / ZSTD_MINMATCH_MIN) + 1;
const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1;
const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1;
return maxNbSeq + maxNbDelims;
}

size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
Expand All @@ -3435,6 +3458,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
const size_t dstCapacity = ZSTD_compressBound(srcSize);
void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem);
SeqCollector seqCollector;
{
int targetCBlockSize;
FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), "");
RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0");
}
{
int nbWorkers;
FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), "");
RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0");
}

RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!");

Expand All @@ -3444,8 +3477,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
seqCollector.maxSequences = outSeqsSize;
zc->seqCollector = seqCollector;

ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
ZSTD_customFree(dst, ZSTD_defaultCMem);
{
const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
ZSTD_customFree(dst, ZSTD_defaultCMem);
FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed");
}
assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize));
return zc->seqCollector.seqIndex;
}

Expand Down Expand Up @@ -4038,8 +4075,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
cSeqsSize = 1;
}

/* Sequence collection not supported when block splitting */
if (zc->seqCollector.collectSequences) {
ZSTD_copyBlockSequences(zc);
FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed");
ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
return 0;
}
Expand Down Expand Up @@ -4261,6 +4299,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
if (bss == ZSTDbss_noCompress) {
if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
Expand Down Expand Up @@ -4293,11 +4332,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,

{ const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; }
if (bss == ZSTDbss_noCompress) {
RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
cSize = 0;
goto out;
}
}

if (zc->seqCollector.collectSequences) {
ZSTD_copyBlockSequences(zc);
FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed");
ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
return 0;
}
Expand Down
4 changes: 2 additions & 2 deletions lib/compress/zstdmt_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool)

static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem)
{
ZSTDMT_bufferPool* const bufPool =
ZSTDMT_bufferPool* const bufPool =
(ZSTDMT_bufferPool*)ZSTD_customCalloc(sizeof(ZSTDMT_bufferPool), cMem);
if (bufPool==NULL) return NULL;
if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) {
Expand Down Expand Up @@ -380,7 +380,7 @@ static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool)
static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(int nbWorkers,
ZSTD_customMem cMem)
{
ZSTDMT_CCtxPool* const cctxPool =
ZSTDMT_CCtxPool* const cctxPool =
(ZSTDMT_CCtxPool*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtxPool), cMem);
assert(nbWorkers > 0);
if (!cctxPool) return NULL;
Expand Down
33 changes: 23 additions & 10 deletions lib/zstd.h
Original file line number Diff line number Diff line change
Expand Up @@ -1538,25 +1538,38 @@ typedef enum {
ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);

/*! ZSTD_generateSequences() :
* WARNING: This function is meant for debugging and informational purposes ONLY!
* Its implementation is flawed, and it will be deleted in a future version.
* It is not guaranteed to succeed, as there are several cases where it will give
* up and fail. You should NOT use this function in production code.
*
* This function is deprecated, and will be removed in a future version.
*
* Generate sequences using ZSTD_compress2(), given a source buffer.
*
* @param zc The compression context to be used for ZSTD_compress2(). Set any
* compression parameters you need on this context.
* @param outSeqs The output sequences buffer of size @p outSeqsSize
* @param outSeqsSize The size of the output sequences buffer.
* ZSTD_sequenceBound(srcSize) is an upper bound on the number
* of sequences that can be generated.
* @param src The source buffer to generate sequences from of size @p srcSize.
* @param srcSize The size of the source buffer.
*
* Each block will end with a dummy sequence
* with offset == 0, matchLength == 0, and litLength == length of last literals.
* litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
* simply acts as a block delimiter.
*
* @zc can be used to insert custom compression params.
* This function invokes ZSTD_compress2().
*
* The output of this function can be fed into ZSTD_compressSequences() with CCtx
* setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
* @return : number of sequences generated
* @returns The number of sequences generated, necessarily less than
* ZSTD_sequenceBound(srcSize), or an error code that can be checked
* with ZSTD_isError().
*/

ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()")
ZSTDLIB_STATIC_API size_t
ZSTD_generateSequences( ZSTD_CCtx* zc,
ZSTD_Sequence* outSeqs, size_t outSeqsSize,
const void* src, size_t srcSize);
ZSTD_generateSequences(ZSTD_CCtx* zc,
ZSTD_Sequence* outSeqs, size_t outSeqsSize,
const void* src, size_t srcSize);

/*! ZSTD_mergeBlockDelimiters() :
* Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
Expand Down
8 changes: 6 additions & 2 deletions tests/fuzz/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ FUZZ_TARGETS := \
seekable_roundtrip \
huf_round_trip \
huf_decompress \
decompress_cross_format
decompress_cross_format \
generate_sequences

all: libregression.a $(FUZZ_TARGETS)

Expand Down Expand Up @@ -239,10 +240,13 @@ huf_round_trip: $(FUZZ_HEADERS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_huf_round_trip.o

huf_decompress: $(FUZZ_HEADERS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_huf_decompress.o
$(CXX) $(FUZZ_TARGET_FLAGS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_huf_decompress.o $(LIB_FUZZING_ENGINE) -o $@

decompress_cross_format: $(FUZZ_HEADERS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_decompress_cross_format.o
$(CXX) $(FUZZ_TARGET_FLAGS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_decompress_cross_format.o $(LIB_FUZZING_ENGINE) -o $@

generate_sequences: $(FUZZ_HEADERS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_generate_sequences.o
$(CXX) $(FUZZ_TARGET_FLAGS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_generate_sequences.o $(LIB_FUZZING_ENGINE) -o $@

libregression.a: $(FUZZ_HEADERS) $(PRGDIR)/util.h $(PRGDIR)/util.c d_fuzz_regression_driver.o
$(AR) $(FUZZ_ARFLAGS) $@ d_fuzz_regression_driver.o

Expand Down
1 change: 1 addition & 0 deletions tests/fuzz/fuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def __init__(self, input_type, frame_type=FrameType.ZSTD):
'huf_round_trip': TargetInfo(InputType.RAW_DATA),
'huf_decompress': TargetInfo(InputType.RAW_DATA),
'decompress_cross_format': TargetInfo(InputType.RAW_DATA),
'generate_sequences': TargetInfo(InputType.RAW_DATA),
}
TARGETS = list(TARGET_INFO.keys())
ALL_TARGETS = TARGETS + ['all']
Expand Down
88 changes: 88 additions & 0 deletions tests/fuzz/generate_sequences.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#define ZSTD_STATIC_LINKING_ONLY

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

#include "fuzz_data_producer.h"
#include "fuzz_helpers.h"
#include "zstd_helpers.h"

/**
* This fuzz target ensures that ZSTD_generateSequences() does not crash and
* if it succeeds that ZSTD_compressSequences() round trips.
*/

static void testRoundTrip(ZSTD_CCtx* cctx, ZSTD_Sequence const* seqs, size_t nbSeqs, const void* src, size_t srcSize) {
/* Compress the sequences with block delimiters */
const size_t compressBound = ZSTD_compressBound(srcSize);
void* dst = FUZZ_malloc(compressBound);
FUZZ_ASSERT(dst);

size_t compressedSize = ZSTD_compressSequences(cctx, dst, compressBound, seqs, nbSeqs, src, srcSize);
FUZZ_ZASSERT(compressedSize);

void* decompressed = FUZZ_malloc(srcSize);
FUZZ_ASSERT(srcSize == 0 || decompressed);
size_t decompressedSize = ZSTD_decompress(decompressed, srcSize, dst, compressedSize);
FUZZ_ZASSERT(decompressedSize);
FUZZ_ASSERT(decompressedSize == srcSize);
if (srcSize != 0) {
FUZZ_ASSERT(!memcmp(src, decompressed, srcSize));
}

free(decompressed);
free(dst);
}

int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {

FUZZ_dataProducer_t *producer = FUZZ_dataProducer_create(data, size);
size = FUZZ_dataProducer_reserveDataPrefix(producer);

ZSTD_CCtx* cctx = ZSTD_createCCtx();
FUZZ_ASSERT(cctx);

const size_t seqsCapacity = FUZZ_dataProducer_uint32Range(producer, 0, 2 * ZSTD_sequenceBound(size));
ZSTD_Sequence* seqs = (ZSTD_Sequence*)FUZZ_malloc(sizeof(ZSTD_Sequence) * seqsCapacity);
FUZZ_ASSERT(seqsCapacity == 0 || seqs);

FUZZ_setRandomParameters(cctx, size, producer);
FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 0));
FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, 0));

const size_t nbSeqs = ZSTD_generateSequences(cctx, seqs, seqsCapacity, data, size);
if (ZSTD_isError(nbSeqs)) {
/* Allowed to error if the destination is too small */
if (ZSTD_getErrorCode(nbSeqs) == ZSTD_error_dstSize_tooSmall) {
FUZZ_ASSERT(seqsCapacity < ZSTD_sequenceBound(size));
}
} else {
/* Ensure we round trip with and without block delimiters*/

FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_explicitBlockDelimiters));
testRoundTrip(cctx, seqs, nbSeqs, data, size);

const size_t nbMergedSeqs = ZSTD_mergeBlockDelimiters(seqs, nbSeqs);
FUZZ_ASSERT(nbMergedSeqs <= nbSeqs);
FUZZ_ZASSERT(ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only));
FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_noBlockDelimiters));
testRoundTrip(cctx, seqs, nbMergedSeqs, data, size);
}

free(seqs);
ZSTD_freeCCtx(cctx);
FUZZ_dataProducer_free(producer);
return 0;
}
Loading

1 comment on commit 731f4b7

@AV-Coding
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hello, for this issue, what would you expect is the outcome if hitting it? Does the compression fail, or does it succeed resulting in a corrupted compressed buffer? For example, an invalid offset. If an invalid compressed block results, is it possible to remedy the issue through manipulating the compressed block?

Please sign in to comment.