Codec Detection

compressionz can automatically detect compression formats by examining the first few bytes of data. This enables handling unknown compressed data without prior knowledge of the format.

Basic Detection

const std = @import("std");
const cz = @import("compressionz");

/// Decompresses `data` after detecting its format from the leading bytes.
///
/// Returns `error.UnknownFormat` when `cz.Codec.detect` reports no known
/// format; any error from `cz.decompress` is propagated unchanged.
pub fn decompressAuto(data: []const u8, allocator: std.mem.Allocator) ![]u8 {
    const codec = cz.Codec.detect(data) orelse return error.UnknownFormat;
    return cz.decompress(codec, data, allocator);
}

Codec.detect()

pub fn detect(data: []const u8) ?Codec

Returns the detected codec or null if the format is unknown.

Detection Support

Codec	Detectable	Magic Bytes
LZ4 Frame	✅	`0x04 0x22 0x4D 0x18`
Zstd	✅	`0x28 0xB5 0x2F 0xFD`
Gzip	✅	`0x1F 0x8B`
Zlib	✅	CMF/FLG check
Snappy	✅	`sNaPpY`
LZ4 Raw	❌	No magic
Brotli	❌	No magic
Deflate	❌	No magic

Magic Bytes Reference

LZ4 Frame

Bytes 0-3: 0x04 0x22 0x4D 0x18 (little-endian 0x184D2204)

// LZ4 frame magic number, in the byte order it appears in the stream.
const lz4_frame_magic = [_]u8{ 0x04, 0x22, 0x4D, 0x18 };

// `startsWith` is false when `data` is shorter than the magic.
if (std.mem.startsWith(u8, data, &lz4_frame_magic)) {
    // LZ4 Frame
}

Zstd

Bytes 0-3: 0x28 0xB5 0x2F 0xFD (little-endian 0xFD2FB528)

// Zstd magic number, in the byte order it appears in the stream.
const zstd_magic = [_]u8{ 0x28, 0xB5, 0x2F, 0xFD };

// `startsWith` is false when `data` is shorter than the magic.
if (std.mem.startsWith(u8, data, &zstd_magic)) {
    // Zstd
}

Gzip

Bytes 0-1: 0x1F 0x8B
Byte 2: Compression method (0x08 = deflate)

// Gzip data starts with the two bytes 0x1F 0x8B.
const gzip_magic = [_]u8{ 0x1F, 0x8B };

// `startsWith` is false when `data` is shorter than the magic.
if (std.mem.startsWith(u8, data, &gzip_magic)) {
    // Gzip
}

Zlib

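Zlib has no fixed magic number, so detection validates the two-byte stream header instead (a heuristic that unrelated data can occasionally satisfy):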

Byte 0 (CMF): Compression method (low 4 bits = 8 for deflate)
Byte 1 (FLG): Flags
Check: (CMF * 256 + FLG) % 31 == 0

// Zlib is recognized by validating its two header bytes (CMF, FLG).
if (data.len >= 2) {
    // CMF and FLG read as one big-endian 16-bit value: CMF * 256 + FLG.
    const header = (@as(u16, data[0]) << 8) | data[1];
    // The low nibble of CMF is the compression method; 8 means deflate.
    const is_deflate = (data[0] & 0x0F) == 8;

    if (is_deflate and header % 31 == 0) {
        // Zlib
    }
}

Snappy (Framed)

Bytes 0-3: 0xFF 0x06 0x00 0x00 (stream identifier chunk header: type 0xFF, length 6)
Bytes 4-9: "sNaPpY" (stream identifier)

// A framed Snappy stream begins with the 10-byte stream identifier chunk:
// a 4-byte chunk header followed by the ASCII text "sNaPpY".
const snappy_stream_identifier = "\xff\x06\x00\x00sNaPpY";

// `startsWith` is false when `data` is shorter than the identifier.
if (std.mem.startsWith(u8, data, snappy_stream_identifier)) {
    // Snappy framed format
}

Use Cases

Generic Decompressor

const cz = @import("compressionz");
const std = @import("std");

/// Decompresses `data` using whichever codec `cz.Codec.detect` recognizes.
///
/// Returns `error.UnknownFormat` when nothing is detected; errors from
/// `cz.decompress` are propagated unchanged.
pub fn decompress(data: []const u8, allocator: std.mem.Allocator) ![]u8 {
    if (cz.Codec.detect(data)) |codec| {
        return cz.decompress(codec, data, allocator);
    }

    // Might be uncompressed or undetectable format
    return error.UnknownFormat;
}

File Handler

/// Reads the file at `path` (relative to the current working directory, up to
/// 100 MiB) and transparently decompresses it when a known format is detected.
///
/// When no format is detected the raw file contents are returned as-is.
/// When a format is detected the raw buffer is always released, whether
/// decompression succeeds or fails, and decompression errors are propagated.
pub fn readFile(allocator: std.mem.Allocator, path: []const u8) ![]u8 {
    const max_file_size = 100 * 1024 * 1024;
    const raw = try std.fs.cwd().readFileAlloc(allocator, path, max_file_size);

    // Return as-is if not compressed
    const codec = cz.Codec.detect(raw) orelse return raw;

    // From here on the raw bytes are only an input; free them on every exit path.
    defer allocator.free(raw);
    return cz.decompress(codec, raw, allocator);
}

Multi-Format API

const std = @import("std");
const cz = @import("compressionz");

/// Compression schemes that can be named in an HTTP `Content-Encoding` header.
pub const ContentEncoding = enum {
    none,
    gzip,
    zstd,
    br, // Brotli

    /// Maps a `Content-Encoding` header value to a `ContentEncoding`.
    ///
    /// The value is split on commas and each coding is compared as a whole
    /// token, ignoring ASCII case and surrounding whitespace, so `"GZIP"` is
    /// recognized and unrelated tokens that merely contain `"br"` or `"gzip"`
    /// are not. `x-gzip` is accepted as an alias for `gzip`.
    ///
    /// When several codings are listed, the preference order is zstd, then
    /// br, then gzip. A missing header, or one without a recognized coding,
    /// yields `.none`.
    pub fn fromHeader(header: ?[]const u8) ContentEncoding {
        const value = header orelse return .none;

        var saw_br = false;
        var saw_gzip = false;

        var codings = std.mem.splitScalar(u8, value, ',');
        while (codings.next()) |raw| {
            const coding = std.mem.trim(u8, raw, " \t");

            // zstd has the highest priority, so it can short-circuit the scan.
            if (std.ascii.eqlIgnoreCase(coding, "zstd")) return .zstd;

            if (std.ascii.eqlIgnoreCase(coding, "br")) {
                saw_br = true;
            } else if (std.ascii.eqlIgnoreCase(coding, "gzip") or
                std.ascii.eqlIgnoreCase(coding, "x-gzip"))
            {
                saw_gzip = true;
            }
        }

        if (saw_br) return .br;
        if (saw_gzip) return .gzip;
        return .none;
    }

    /// Returns the codec for this encoding, or `null` for `.none`.
    pub fn toCodec(self: ContentEncoding) ?cz.Codec {
        return switch (self) {
            .none => null,
            .gzip => .gzip,
            .zstd => .zstd,
            .br => .brotli,
        };
    }
};

/// Decodes a response `body` into a newly allocated buffer.
///
/// The codec named by `encoding` is used when there is one; otherwise the
/// body is auto-detected. If neither yields a codec, the body is treated as
/// uncompressed and a copy of it is returned.
pub fn decodeResponse(encoding: ContentEncoding, body: []const u8, allocator: std.mem.Allocator) ![]u8 {
    const declared = encoding.toCodec();

    // Auto-detect as fallback; with no codec at all, return uncompressed.
    const codec = declared orelse
        (cz.Codec.detect(body) orelse return allocator.dupe(u8, body));

    return cz.decompress(codec, body, allocator);
}

Handling Undetectable Formats

For formats without magic bytes, use context or file extensions:

By Extension

/// Guesses the codec from the file extension of `path`.
///
/// Matching is case-sensitive and uses the final extension only (as returned
/// by `std.fs.path.extension`). Returns `null` for unrecognized extensions.
pub fn codecFromExtension(path: []const u8) ?cz.Codec {
    // A plain lookup table keeps this independent of the std string-map API,
    // which was renamed (`ComptimeStringMap` -> `StaticStringMap`) in Zig 0.13.
    const Entry = struct { ext: []const u8, codec: cz.Codec };
    const known_extensions = [_]Entry{
        .{ .ext = ".gz", .codec = .gzip },
        .{ .ext = ".zst", .codec = .zstd },
        .{ .ext = ".lz4", .codec = .lz4 },
        .{ .ext = ".br", .codec = .brotli },
        .{ .ext = ".snappy", .codec = .snappy },
        .{ .ext = ".zz", .codec = .zlib },
    };

    const ext = std.fs.path.extension(path);
    for (known_extensions) |entry| {
        if (std.mem.eql(u8, ext, entry.ext)) return entry.codec;
    }
    return null;
}

By Content-Type

/// Guesses the codec from a content-type string by substring search.
///
/// The substrings are tried in the order "gzip", "zstd", "br"; the first one
/// found decides the result. Returns `null` when none is present.
pub fn codecFromContentType(content_type: []const u8) ?cz.Codec {
    const Rule = struct { needle: []const u8, codec: cz.Codec };
    const rules = [_]Rule{
        .{ .needle = "gzip", .codec = .gzip },
        .{ .needle = "zstd", .codec = .zstd },
        .{ .needle = "br", .codec = .brotli },
    };

    for (rules) |rule| {
        if (std.mem.indexOf(u8, content_type, rule.needle) != null) {
            return rule.codec;
        }
    }
    return null;
}

Codec Capabilities Query

Beyond detection, you can also query a codec's capabilities:

// Illustrative calls on a `cz.Codec` value. Each trailing comment shows the
// result listed for `.zstd`.
const codec: cz.Codec = .zstd;

// Feature queries
codec.supportsStreaming();    // true
codec.supportsDictionary();   // true
codec.hasBuiltinChecksum();   // true
codec.isFramed();             // true
codec.requiresExpectedSize(); // false

// Metadata
codec.name();       // "Zstandard"
codec.extension();  // ".zst"

Example: Feature-Based Selection

/// Picks a codec from the features the caller needs.
///
/// - Dictionary support (with or without streaming) selects Zstd.
/// - Streaming alone selects Gzip, for wide compatibility.
/// - Neither selects LZ4, the fastest choice for simple cases.
pub fn selectCodec(needs_streaming: bool, needs_dictionary: bool) cz.Codec {
    // Zstd is chosen for every dictionary case, including dictionary + streaming.
    if (needs_dictionary) return .zstd;

    return if (needs_streaming) .gzip else .lz4;
}

Error Handling

const result = blk: {
    // Fast path: the format was recognized from its leading bytes.
    if (cz.Codec.detect(data)) |codec| {
        break :blk cz.decompress(codec, data, allocator);
    }

    // Unknown format - might be:
    // 1. Uncompressed data
    // 2. Brotli or Deflate (no magic)
    // 3. Corrupted data

    // Try common undetectable formats
    if (tryBrotli(data, allocator)) |decoded| break :blk decoded;
    if (tryDeflate(data, allocator)) |decoded| break :blk decoded;

    return error.UnknownFormat;
};

Performance Note

Detection is O(1) — it only examines the first few bytes:

// Detection is essentially free: only the first few bytes are examined,
// so the size of the input does not matter.
const codec = cz.Codec.detect(gigabyte_of_data);  // Instant

It is always safe to call `detect` on any data, regardless of its size.