← Labs On-device · Zig → WASM
Data Lab · Zig + WebAssembly

Profile any dataset, on your device.

Drop in a CSV or TSV and a profiling engine — written in Zig, compiled to WebAssembly — reads every row in a single streaming pass: inferred types, null rates, cardinality, distributions, duplicate rows. It runs entirely inside this browser tab. Nothing is uploaded.

Drop a file — or paste below

Drop a .csv / .tsv here, or click to choose
delimiter is auto-detected · stays on your device
Loading the Zig engine…

How it works. The whole profiler is a single .zig file compiled to a wasm32-freestanding module — no libc, no framework, a few dozen kilobytes — and base-64-inlined into this page. JavaScript copies your file into the module's linear memory and calls one function; everything else (RFC-4180 quote handling, type inference, Welford-style numeric stats, bounded cardinality + frequency counting, Wyhash-based duplicate-row detection) happens in Zig. The file is never sent anywhere.

It's the same idea as the rest of my work — make data fail loudly before production, not quietly after — but freed from any one dataset: point it at your file. A systems language earns its place here: parsing and counting are exactly the kind of tight, allocation-light loops where Zig + WASM runs at near-native speed in a browser tab.

View the engine source · data-profiler.zig · Zig 0.16
//! Universal data-quality profiler — Zig → WebAssembly.
//!
//! The whole engine runs on-device, inside the browser. JS copies a CSV/TSV
//! blob into linear memory, calls `profile(ptr, len)`, and reads back a JSON
//! report describing every column: inferred type, null rate, cardinality,
//! numeric stats, most-frequent values, and duplicate-row detection.
//!
//! No libc, no alloc-per-row table: the file is parsed in a single streaming
//! pass and per-column accumulators are updated as cells stream by, so memory
//! tracks the *distinct* data, not the raw size.
//!
//! Build (see build note in this dir):
//!   zig build-exe data-profiler.zig -target wasm32-freestanding \
//!       -fno-entry -O ReleaseSmall -femit-bin=data-profiler.wasm

const std = @import("std");
const alloc = std.heap.wasm_allocator;

// Bounds — keep a hostile/huge file from exhausting browser memory.
/// Maximum number of header columns that get an accumulator; header fields
/// beyond this are read but not tracked.
const MAX_COLS: usize = 4096;
/// Cap on the number of distinct values remembered for one column.
const MAX_DISTINCT: usize = 50_000; // per column, before we stop tracking new keys
/// How many of the most frequent values are reported for each column.
const TOP_N: usize = 6;

// ---------------------------------------------------------------------------
// JS interop
// ---------------------------------------------------------------------------

/// The most recent JSON report: `profile` points this at the bytes it
/// produced and `resultPtr` hands its address back to JS.
var g_out: []u8 = &[_]u8{};

/// Allocate an `n`-byte region of linear memory for JS to copy the input
/// file into. Yields null (0 on the JS side) when the allocator refuses.
export fn wasmAlloc(n: usize) ?[*]u8 {
    if (alloc.alloc(u8, n)) |region| {
        return region.ptr;
    } else |_| {
        return null;
    }
}

/// Full allocation that backs the current report. `g_out` is only the used
/// prefix of it, and the allocator must be handed the original slice to free.
var g_report_backing: []u8 = &[_]u8{};

/// Statically stored fallback report. Being out of memory is exactly when we
/// cannot allocate a buffer to hold the error message, so it lives here.
var g_oom_report = "{\"error\":\"out of memory while profiling\"}".*;

/// Parse + profile `len` bytes at `ptr`. Returns the byte length of the JSON
/// report; the report itself starts at `resultPtr()`.
///
/// The report stays valid until the next call to `profile`, which releases
/// it — JS must decode it before profiling another file. If profiling fails
/// the report is the fixed `{"error":...}` object above.
export fn profile(ptr: [*]const u8, len: usize) usize {
    // Release the previous report so repeated runs in one tab do not leak.
    alloc.free(g_report_backing);
    g_report_backing = &[_]u8{};
    g_out = &[_]u8{};

    var buf = Buf{};
    buildReport(&buf, ptr[0..len]) catch {
        // Drop whatever partial output exists and answer from static storage.
        alloc.free(buf.data);
        g_out = &g_oom_report;
        return g_out.len;
    };

    g_report_backing = buf.data;
    g_out = buf.data[0..buf.len];
    return g_out.len;
}

/// Address of the first byte of the report produced by the last `profile`
/// call; pair it with the length that `profile` returned.
export fn resultPtr() [*]const u8 {
    const report: []const u8 = g_out;
    return report.ptr;
}

// ---------------------------------------------------------------------------
// A growable output buffer (no ArrayList dependency — immune to std churn).
// ---------------------------------------------------------------------------

/// Append-only byte buffer backed by the module allocator.
///
/// `data` is the whole allocation and `data[0..len]` is the part written so
/// far. The `put*` helpers are best-effort: if memory cannot be obtained the
/// write is dropped rather than reported.
const Buf = struct {
    data: []u8 = &[_]u8{},
    len: usize = 0,

    /// Make room for `add` more bytes after `len`, growing by doubling
    /// (starting at 8 KiB). Fails with `error.OutOfMemory` when the allocator
    /// refuses or the required size does not fit in `usize`.
    fn ensure(self: *Buf, add: usize) !void {
        const need = std.math.add(usize, self.len, add) catch return error.OutOfMemory;
        if (need <= self.data.len) return;
        var cap: usize = if (self.data.len == 0) 8192 else self.data.len;
        while (cap < need) {
            // Checked doubling: an overflowing capacity must not wrap around.
            cap = std.math.mul(usize, cap, 2) catch return error.OutOfMemory;
        }
        self.data = try alloc.realloc(self.data, cap);
    }

    /// Append `s` verbatim. Silently does nothing if the buffer cannot grow.
    fn put(self: *Buf, s: []const u8) void {
        self.ensure(s.len) catch return;
        @memcpy(self.data[self.len..][0..s.len], s);
        self.len += s.len;
    }

    /// Append formatted text. The text is rendered into a 160-byte scratch
    /// buffer first; anything longer is dropped, so use this only for short,
    /// bounded output (integers, short fixed strings).
    fn putf(self: *Buf, comptime fmt: []const u8, args: anytype) void {
        var tmp: [160]u8 = undefined;
        const r = std.fmt.bufPrint(&tmp, fmt, args) catch return;
        self.put(r);
    }

    /// Write a JSON string literal, escaping as needed.
    fn putStr(self: *Buf, s: []const u8) void {
        self.put("\"");
        for (s) |c| switch (c) {
            '"' => self.put("\\\""),
            '\\' => self.put("\\\\"),
            '\n' => self.put("\\n"),
            '\r' => self.put("\\r"),
            '\t' => self.put("\\t"),
            // Remaining control characters need the \u00XX form; every other
            // byte is copied through unchanged.
            else => if (c < 0x20) self.putf("\\u{x:0>4}", .{c}) else self.put(&[_]u8{c}),
        };
        self.put("\"");
    }

    /// Write a finite f64 plainly; null otherwise (JSON has no NaN/Inf).
    ///
    /// `{d}` prints the full decimal expansion, which for very large or very
    /// small magnitudes runs to a few hundred characters — more than `putf`'s
    /// scratch buffer. A dedicated buffer is used so such values are not
    /// dropped, and `null` is written if formatting still fails so the
    /// surrounding JSON stays well-formed.
    fn putNum(self: *Buf, x: f64) void {
        if (!std.math.isFinite(x)) return self.put("null");
        var tmp: [512]u8 = undefined;
        const text = std.fmt.bufPrint(&tmp, "{d}", .{x}) catch return self.put("null");
        self.put(text);
    }
};

// ---------------------------------------------------------------------------
// Per-column accumulator
// ---------------------------------------------------------------------------

/// Everything accumulated for one column while the rows stream past.
const Col = struct {
    /// Header text of the column.
    name: []u8,
    /// Set once the column has produced a non-null value.
    seen_any: bool = false,
    /// Number of cells counted as null.
    nulls: u64 = 0,
    /// Number of non-null cells.
    count: u64 = 0,

    // Candidate types. Each starts out plausible and is cleared by the first
    // value that contradicts it.
    is_int: bool = true,
    is_num: bool = true,
    is_bool: bool = true,
    is_date: bool = true,

    // Running numeric accumulators: how many values parsed as numbers, their
    // extremes, their sum and the sum of their squares.
    n_num: u64 = 0,
    min: f64 = 0,
    max: f64 = 0,
    sum: f64 = 0,
    sumsq: f64 = 0,

    // Shortest and longest value length; `minlen` keeps its sentinel until a
    // value has been recorded.
    minlen: usize = std.math.maxInt(usize),
    maxlen: usize = 0,

    /// Occurrence count per distinct value.
    distinct: std.StringHashMap(u32),
    /// Set when a new value had to be left out of `distinct`.
    distinct_overflow: bool = false,

    /// Label for the narrowest type hypothesis still standing, tried in the
    /// order boolean, integer, number, date, then "text". A column without
    /// any non-null value is "empty".
    fn typeName(self: *const Col) []const u8 {
        return if (!self.seen_any)
            "empty"
        else if (self.is_bool)
            "boolean"
        else if (self.is_int)
            "integer"
        else if (self.is_num)
            "number"
        else if (self.is_date)
            "date"
        else
            "text";
    }
};

// ---------------------------------------------------------------------------
// Value classification helpers (operate on trimmed values)
// ---------------------------------------------------------------------------

/// True when `a` and `b` have the same length and match byte for byte once
/// ASCII letters are folded to one case.
fn eqIgnoreCase(a: []const u8, b: []const u8) bool {
    return std.ascii.eqlIgnoreCase(a, b);
}

/// Whether `v` should be counted as a missing value: the empty string, or
/// one of a fixed set of null markers compared case-insensitively.
fn isNull(v: []const u8) bool {
    const markers = [_][]const u8{ "null", "na", "n/a", "none", "nil", "\\n" };
    if (v.len == 0) return true;
    return for (markers) |marker| {
        if (eqIgnoreCase(v, marker)) break true;
    } else false;
}

/// Whether `v` is one of the recognised boolean words (true/false/yes/no),
/// compared case-insensitively.
fn isBool(v: []const u8) bool {
    const words = [_][]const u8{ "true", "false", "yes", "no" };
    return for (words) |word| {
        if (eqIgnoreCase(v, word)) break true;
    } else false;
}

/// Whether `v` is an optional single leading sign followed by one or more
/// ASCII decimal digits and nothing else.
fn isIntStr(v: []const u8) bool {
    if (v.len == 0) return false;
    const digits = switch (v[0]) {
        '+', '-' => v[1..],
        else => v,
    };
    // A bare sign is not a number.
    if (digits.len == 0) return false;
    for (digits) |ch| switch (ch) {
        '0'...'9' => {},
        else => return false,
    };
    return true;
}

/// Whether `v` looks like an ISO-style calendar date: `YYYY-MM-DD` or
/// `YYYY/MM/DD` (the same separator in both positions), optionally followed
/// by a space or `T` and a time component that is not inspected further.
///
/// The month must be 01-12 and the day 01-31, so digit groups that merely
/// have the right shape (`2024-13-45`, `1234-56-78`) are rejected. Month
/// lengths and leap years are not checked.
fn isDateStr(v: []const u8) bool {
    if (v.len < 10) return false;
    const sep = v[4];
    if (sep != '-' and sep != '/') return false;
    if (v[7] != sep) return false;
    for ([_]usize{ 0, 1, 2, 3, 5, 6, 8, 9 }) |p| {
        if (!std.ascii.isDigit(v[p])) return false;
    }

    // Both pairs are known to be digits, so each value is 0..99 and fits u8.
    const month = (v[5] - '0') * 10 + (v[6] - '0');
    const day = (v[8] - '0') * 10 + (v[9] - '0');
    if (month < 1 or month > 12) return false;
    if (day < 1 or day > 31) return false;

    if (v.len == 10) return true;
    return v[10] == ' ' or v[10] == 'T';
}

// ---------------------------------------------------------------------------
// CSV parsing — single streaming pass, RFC-4180-ish quote handling.
// ---------------------------------------------------------------------------

/// Cursor over the raw input that yields one unescaped field at a time.
const Parser = struct {
    src: []const u8,
    pos: usize = 0,
    /// Contents of the field most recently read; overwritten by every call
    /// to `nextField`.
    field: Buf = .{},

    /// True once the cursor has consumed all of `src`.
    fn atEnd(self: *Parser) bool {
        return self.src.len <= self.pos;
    }

    /// Advance the cursor until it rests on `delim`, a newline, or the end
    /// of the input.
    fn skipToTerminator(self: *Parser, delim: u8) void {
        while (self.pos < self.src.len) : (self.pos += 1) {
            const ch = self.src[self.pos];
            if (ch == delim or ch == '\n') return;
        }
    }

    /// Read the next field into `self.field` and step past whatever ended
    /// it. The result says what that was: 0 for the delimiter, 1 for the end
    /// of the record (a newline or the end of the input).
    fn nextField(self: *Parser, delim: u8) u8 {
        const bytes = self.src;
        self.field.len = 0;
        if (self.pos >= bytes.len) return 1;

        if (bytes[self.pos] != '"') {
            // Bare field: everything up to the terminator, minus a CR that
            // directly precedes it.
            const begin = self.pos;
            self.skipToTerminator(delim);
            const stop = if (self.pos > begin and bytes[self.pos - 1] == '\r')
                self.pos - 1
            else
                self.pos;
            self.field.put(bytes[begin..stop]);
        } else {
            // Quoted field: step over the opening quote, then copy bytes
            // until a quote that is not doubled.
            self.pos += 1;
            while (self.pos < bytes.len) {
                const ch = bytes[self.pos];
                const doubled = ch == '"' and
                    self.pos + 1 < bytes.len and bytes[self.pos + 1] == '"';
                if (ch == '"' and !doubled) {
                    self.pos += 1; // closing quote
                    break;
                }
                // For a doubled quote this emits a single '"'.
                self.field.put(bytes[self.pos .. self.pos + 1]);
                self.pos += if (doubled) 2 else 1;
            }
            // Anything between the closing quote and the terminator is ignored.
            self.skipToTerminator(delim);
        }

        if (self.pos >= bytes.len) return 1;
        const hit_delim = bytes[self.pos] == delim;
        self.pos += 1; // step over the delimiter or the newline
        return if (hit_delim) 0 else 1;
    }
};

/// Guess the field delimiter from the first record of `src`.
///
/// Each candidate (comma, tab, semicolon, pipe) is scored by how many times
/// it separates fields in the first record; the highest score wins, earlier
/// candidates win ties, and comma is the answer when nothing is found.
///
/// A field that begins with `"` is skipped up to its closing quote (with
/// `""` read as an escaped quote), so delimiter characters and newlines
/// inside a quoted header cell neither count nor end the record.
fn detectDelim(src: []const u8) u8 {
    const candidates = [_]u8{ ',', '\t', ';', '|' };
    var best: u8 = ',';
    var best_n: usize = 0;
    for (candidates) |d| {
        var n: usize = 0;
        var field_start = true;
        var i: usize = 0;
        while (i < src.len) : (i += 1) {
            const c = src[i];
            if (field_start and c == '"') {
                // Skip the quoted section; leave `i` on the closing quote
                // (or at the end of the input if it is never closed).
                i += 1;
                while (i < src.len) : (i += 1) {
                    if (src[i] != '"') continue;
                    if (i + 1 < src.len and src[i + 1] == '"') {
                        i += 1; // escaped quote: skip both characters
                        continue;
                    }
                    break;
                }
                field_start = false;
                continue;
            }
            if (c == '\n') break;
            field_start = (c == d);
            if (c == d) n += 1;
        }
        if (n > best_n) {
            best_n = n;
            best = d;
        }
    }
    return best;
}

// ---------------------------------------------------------------------------
// Profiling
// ---------------------------------------------------------------------------

/// Fold one cell into the accumulator of its column.
///
/// `raw` is trimmed of spaces, tabs, CR and LF first. A null-like value only
/// bumps `nulls`. Any other value updates the non-null count, the type
/// hypotheses, the numeric accumulators (while the column still looks
/// numeric), the length range and the distinct-value counts.
///
/// Fails only when memory for a new distinct value cannot be obtained.
fn observe(col: *Col, raw: []const u8) !void {
    const v = std.mem.trim(u8, raw, " \t\r\n");

    if (isNull(v)) {
        col.nulls += 1;
        return;
    }
    col.seen_any = true;
    col.count += 1;

    // Type refinement: a hypothesis is only re-tested while it still stands.
    if (col.is_bool and !isBool(v)) col.is_bool = false;
    if (col.is_int and !isIntStr(v)) col.is_int = false;
    if (col.is_date and !isDateStr(v)) col.is_date = false;
    if (col.is_num) {
        // Unparseable and non-finite values both refute "number".
        const f = std.fmt.parseFloat(f64, v) catch std.math.nan(f64);
        if (std.math.isFinite(f)) {
            col.n_num += 1;
            if (col.n_num == 1) {
                col.min = f;
                col.max = f;
            } else {
                if (f < col.min) col.min = f;
                if (f > col.max) col.max = f;
            }
            col.sum += f;
            col.sumsq += f * f;
        } else {
            col.is_num = false;
        }
    }

    if (v.len < col.minlen) col.minlen = v.len;
    if (v.len > col.maxlen) col.maxlen = v.len;

    // Distinct + frequency (bounded)
    if (col.distinct.getPtr(v)) |p| {
        p.* += 1;
    } else if (col.distinct.count() < MAX_DISTINCT) {
        // The map stores the key slice as-is, so it needs its own copy: `v`
        // points into a buffer that is reused for the next field.
        const key = try alloc.dupe(u8, v);
        // If the insert fails the map never took the copy; release it here.
        errdefer alloc.free(key);
        try col.distinct.put(key, 1);
    } else {
        col.distinct_overflow = true;
    }
}

/// One distinct value (`v`) together with how often it occurred (`c`).
const TopEntry = struct { v: []const u8, c: u32 };

/// Emit the column's most frequent values as a JSON array of
/// `{"v":<value>,"c":<count>}` objects, highest count first, holding at most
/// `TOP_N` entries.
fn writeTop(out: *Buf, col: *Col) void {
    var best: [TOP_N]TopEntry = undefined;
    var filled: usize = 0;

    // Selection: fill the table first, then let each later candidate evict
    // the weakest slot when it has a strictly higher count.
    var entries = col.distinct.iterator();
    while (entries.next()) |kv| {
        const cand: TopEntry = .{ .v = kv.key_ptr.*, .c = kv.value_ptr.* };
        if (filled < TOP_N) {
            best[filled] = cand;
            filled += 1;
            continue;
        }
        var weakest: usize = 0;
        for (best[1..], 1..) |slot, idx| {
            if (slot.c < best[weakest].c) weakest = idx;
        }
        if (cand.c > best[weakest].c) best[weakest] = cand;
    }

    // Insertion sort into descending count order; equal counts keep their
    // relative order.
    var start: usize = 1;
    while (start < filled) : (start += 1) {
        const moving = best[start];
        var slot = start;
        while (slot > 0 and best[slot - 1].c < moving.c) {
            best[slot] = best[slot - 1];
            slot -= 1;
        }
        best[slot] = moving;
    }

    out.put("[");
    for (best[0..filled], 0..) |item, idx| {
        if (idx > 0) out.put(",");
        out.put("{\"v\":");
        out.putStr(item.v);
        out.putf(",\"c\":{d}}}", .{item.c});
    }
    out.put("]");
}

/// Profile the delimited text in `src` and write the JSON report to `out`.
///
/// The first record supplies the column names; every later record is fed
/// field by field into the per-column accumulators and hashed as a whole for
/// duplicate-row counting. Input problems (empty input, no columns) are
/// reported as an `{"error":...}` object in `out`, not as a Zig error; the
/// only failure returned is running out of memory.
///
/// Everything allocated while profiling — column names, distinct-value maps
/// and their keys, the parser's field buffer — is released before returning,
/// on success and on failure alike. `out` is left to the caller.
fn buildReport(out: *Buf, src: []const u8) !void {
    if (src.len == 0) {
        out.put("{\"error\":\"empty input\"}");
        return;
    }

    // A UTF-8 byte-order mark is not data: left in place it becomes part of
    // the first column name and hides an opening quote from the parser.
    const bom = "\xEF\xBB\xBF";
    const text = if (std.mem.startsWith(u8, src, bom)) src[bom.len..] else src;
    if (text.len == 0) {
        out.put("{\"error\":\"empty input\"}");
        return;
    }

    const delim = detectDelim(text);
    var p = Parser{ .src = text };
    // Evaluated at scope exit, so this frees the buffer as finally grown.
    defer alloc.free(p.field.data);

    // --- Header row → column names ---
    var cols: std.ArrayList(Col) = .empty;
    defer {
        for (cols.items) |*c| {
            // The map does not own its keys; they were duplicated on insert.
            var keys = c.distinct.keyIterator();
            while (keys.next()) |k| alloc.free(k.*);
            c.distinct.deinit();
            alloc.free(c.name);
        }
        cols.deinit(alloc);
    }

    while (true) {
        const term = p.nextField(delim);
        if (cols.items.len < MAX_COLS) {
            const name = try alloc.dupe(u8, p.field.data[0..p.field.len]);
            // Not yet owned by `cols` if the append below fails.
            errdefer alloc.free(name);
            try cols.append(alloc, .{ .name = name, .distinct = std.StringHashMap(u32).init(alloc) });
        }
        if (term == 1) break;
    }
    const ncols = cols.items.len;
    if (ncols == 0) {
        out.put("{\"error\":\"no columns detected\"}");
        return;
    }

    // --- Data rows ---
    var rows: u64 = 0;
    var ragged: u64 = 0; // rows whose field count != header count
    var dup_rows: u64 = 0;
    var seen_rows = std.AutoHashMap(u64, u32).init(alloc);
    defer seen_rows.deinit();

    while (!p.atEnd()) {
        var term = p.nextField(delim);

        // A blank line that ends the input is not a record. Decide that
        // before observing anything, so it is not counted as a null in the
        // first column.
        if (term == 1 and p.field.len == 0 and p.atEnd()) break;

        var ci: usize = 0;
        var hasher = std.hash.Wyhash.init(0);
        while (true) {
            const val = p.field.data[0..p.field.len];
            hasher.update(val);
            hasher.update("\x00"); // field separator, so "ab","c" != "a","bc"
            if (ci < ncols) try observe(&cols.items[ci], val);
            ci += 1;
            if (term == 1) break;
            term = p.nextField(delim);
        }

        rows += 1;
        if (ci != ncols) ragged += 1;

        // Rows are compared by hash only; every repeat of an already seen
        // hash counts as one duplicate.
        const h = hasher.final();
        if (seen_rows.getPtr(h)) |c| {
            c.* += 1;
            dup_rows += 1;
        } else {
            try seen_rows.put(h, 1);
        }
    }

    // --- Emit JSON ---
    var total_nulls: u64 = 0;
    for (cols.items) |*c| total_nulls += c.nulls;

    out.put("{");
    out.putf("\"bytes\":{d},\"rows\":{d},\"cols\":{d},", .{ src.len, rows, ncols });
    out.putf("\"delim\":\"{s}\",", .{switch (delim) {
        ',' => "comma",
        '\t' => "tab",
        ';' => "semicolon",
        '|' => "pipe",
        else => "?",
    }});
    out.putf("\"totalNulls\":{d},\"dupRows\":{d},\"raggedRows\":{d},", .{ total_nulls, dup_rows, ragged });

    out.put("\"columns\":[");
    for (cols.items, 0..) |*c, i| {
        if (i != 0) out.put(",");
        out.put("{\"name\":");
        out.putStr(c.name);
        out.putf(",\"type\":\"{s}\"", .{c.typeName()});
        out.putf(",\"nulls\":{d},\"count\":{d}", .{ c.nulls, c.count });
        out.putf(",\"distinct\":{d},\"distinctOverflow\":{s}", .{
            c.distinct.count(),
            if (c.distinct_overflow) "true" else "false",
        });

        // Numeric accumulation stops at the first non-numeric value, so for
        // a column that turned out not to be numeric the sums describe only
        // the values seen before that point. Report stats only when every
        // value was numeric.
        if (c.is_num and c.n_num > 0) {
            const mean = c.sum / @as(f64, @floatFromInt(c.n_num));
            // Clamp at zero: rounding can push the difference slightly negative.
            const variance = @max(0.0, c.sumsq / @as(f64, @floatFromInt(c.n_num)) - mean * mean);
            out.put(",\"min\":");
            out.putNum(c.min);
            out.put(",\"max\":");
            out.putNum(c.max);
            out.put(",\"mean\":");
            out.putNum(mean);
            out.put(",\"std\":");
            out.putNum(@sqrt(variance));
        }
        if (c.maxlen > 0 or c.seen_any) {
            const lo = if (c.minlen == std.math.maxInt(usize)) 0 else c.minlen;
            out.putf(",\"minLen\":{d},\"maxLen\":{d}", .{ lo, c.maxlen });
        }
        out.put(",\"top\":");
        writeTop(out, c);
        out.put("}");
    }
    out.put("]}");
}
By John Mikel Regida · Lead Data Architect — Engine · Zig 0.16 → wasm32