Drop in a CSV or TSV and a profiling engine — written in Zig, compiled to WebAssembly — reads every row in a single streaming pass: inferred types, null rates, cardinality, distributions, duplicate rows. It runs entirely inside this browser tab. Nothing is uploaded.
How it works. The whole profiler is a single .zig file compiled to a wasm32-freestanding module — no libc, no framework, a few dozen kilobytes — and base64-inlined into this page. JavaScript copies your file into the module's linear memory and calls one function; everything else (RFC-4180 quote handling, type inference, single-pass numeric stats from a running sum and sum of squares, bounded cardinality + frequency counting, Wyhash-based duplicate-row detection) happens in Zig. The file is never sent anywhere.
It's the same idea as the rest of my work — make data fail loudly before production, not quietly after — but freed from any one dataset: point it at your file. A systems language earns its place here: parsing and counting are exactly the kind of tight, allocation-light loops where Zig + WASM runs at near-native speed in a browser tab.
//! Universal data-quality profiler — Zig → WebAssembly.
//!
//! The whole engine runs on-device, inside the browser. JS copies a CSV/TSV
//! blob into linear memory, calls `profile(ptr, len)`, and reads back a JSON
//! report describing every column: inferred type, null rate, cardinality,
//! numeric stats, most-frequent values, and duplicate-row detection.
//!
//! No libc, no alloc-per-row table: the file is parsed in a single streaming
//! pass and per-column accumulators are updated as cells stream by, so memory
//! tracks the *distinct* data, not the raw size.
//!
//! Build (see build note in this dir):
//! zig build-exe data-profiler.zig -target wasm32-freestanding \
//! -fno-entry -O ReleaseSmall -femit-bin=data-profiler.wasm
const std = @import("std");
const alloc = std.heap.wasm_allocator;
// Bounds — keep a hostile/huge file from exhausting browser memory.
// Header fields beyond MAX_COLS are still parsed but get no column accumulator.
const MAX_COLS: usize = 4096;
// Once a column tracks this many distinct values, unseen values are no longer
// stored; they only set the column's overflow flag.
const MAX_DISTINCT: usize = 50_000; // per column, before we stop tracking new keys
// How many of the most frequent values are reported for each column.
const TOP_N: usize = 6;
// ---------------------------------------------------------------------------
// JS interop
// ---------------------------------------------------------------------------
// The most recent report bytes: assigned by `profile`, exposed to JS through
// `resultPtr`. Starts out as an empty slice.
var g_out: []u8 = &[_]u8{};
/// Hand JS a writable region of `n` bytes in linear memory for the file
/// contents. Yields null (0 on the JS side) when the allocator cannot
/// satisfy the request.
export fn wasmAlloc(n: usize) ?[*]u8 {
    if (alloc.alloc(u8, n)) |region| {
        return region.ptr;
    } else |_| {
        return null;
    }
}
/// Full allocation that backs the report published by the latest `profile`
/// call. Kept separately from `g_out` (which is trimmed to the used length)
/// so the buffer can be handed back to the allocator on the next call.
var g_report_storage: []u8 = &[_]u8{};

/// Pre-built error report. Lives in static memory so it can be returned even
/// when the allocator has nothing left to give.
var g_oom_report = "{\"error\":\"out of memory while profiling\"}".*;

/// Parse + profile `len` bytes at `ptr`. Returns the byte length of the JSON
/// report; the report itself starts at `resultPtr()`.
///
/// The report stays valid only until the next `profile` call, which releases
/// the previous report's buffer before building a new one — JS must decode or
/// copy it first. If profiling fails, the report is the fixed JSON object
/// `{"error":"out of memory while profiling"}`.
export fn profile(ptr: [*]const u8, len: usize) usize {
    // Release the previous report; freeing the initial empty slice is a no-op.
    alloc.free(g_report_storage);
    g_report_storage = &[_]u8{};

    var buf = Buf{};
    var failed = false;
    buildReport(&buf, ptr[0..len]) catch {
        failed = true;
    };

    if (failed) {
        // Drop the partial report and publish the static message: writing the
        // error through `buf` could itself need memory we do not have.
        alloc.free(buf.data);
        g_out = &g_oom_report;
        return g_out.len;
    }

    g_report_storage = buf.data;
    g_out = buf.data[0..buf.len];
    return g_out.len;
}
/// Address of the first byte of the most recently published report. Pair it
/// with the length returned by `profile`.
export fn resultPtr() [*]const u8 {
    const report = g_out;
    return report.ptr;
}
// ---------------------------------------------------------------------------
// A growable output buffer (no ArrayList dependency — immune to std churn).
// ---------------------------------------------------------------------------
/// Append-only byte buffer backed by the module allocator.
///
/// `data` is the whole allocation and `len` the number of bytes in use.
/// The write helpers (`put`, `putf`, `putStr`, `putNum`) are best-effort:
/// when memory cannot be obtained they drop the write and leave `len`
/// untouched rather than returning an error.
const Buf = struct {
    data: []u8 = &[_]u8{},
    len: usize = 0,

    /// Make room for `add` more bytes after `len`, growing geometrically
    /// (8 KiB initially, then doubling). Fails with `error.OutOfMemory`.
    fn ensure(self: *Buf, add: usize) !void {
        const need = std.math.add(usize, self.len, add) catch return error.OutOfMemory;
        if (need <= self.data.len) return;
        // Saturating doubling: on wasm32 a plain `*= 2` could wrap to 0 and
        // spin forever once the capacity reaches 2 GiB.
        var cap: usize = if (self.data.len == 0) 8192 else self.data.len *| 2;
        while (cap < need) cap *|= 2;
        self.data = try alloc.realloc(self.data, cap);
    }

    /// Append `s` verbatim. Dropped silently if the buffer cannot grow.
    fn put(self: *Buf, s: []const u8) void {
        self.ensure(s.len) catch return;
        @memcpy(self.data[self.len..][0..s.len], s);
        self.len += s.len;
    }

    /// Append formatted text of any length.
    ///
    /// Formats straight into the buffer's spare capacity and retries with
    /// more room when the text does not fit, so long output (for example a
    /// large f64 printed with `{d}`, which expands to hundreds of digits) is
    /// no longer discarded. Dropped silently only if the buffer cannot grow.
    fn putf(self: *Buf, comptime fmt: []const u8, args: anytype) void {
        var room: usize = 160;
        while (true) {
            self.ensure(room) catch return;
            const spare = self.data[self.len..];
            if (std.fmt.bufPrint(spare, fmt, args)) |text| {
                self.len += text.len;
                return;
            } else |_| {
                // Did not fit: ask for twice the space we just tried.
                room = spare.len *| 2;
            }
        }
    }

    /// Write a JSON string literal, escaping `"`, `\`, and control bytes
    /// below 0x20. Bytes >= 0x20 are passed through unchanged.
    fn putStr(self: *Buf, s: []const u8) void {
        self.put("\"");
        var run_start: usize = 0; // start of the pending run of plain bytes
        for (s, 0..) |c, i| {
            const short: ?[]const u8 = switch (c) {
                '"' => "\\\"",
                '\\' => "\\\\",
                '\n' => "\\n",
                '\r' => "\\r",
                '\t' => "\\t",
                else => null,
            };
            if (short == null and c >= 0x20) continue;
            // Flush the plain bytes seen so far in one copy, then the escape.
            self.put(s[run_start..i]);
            if (short) |esc| self.put(esc) else self.putf("\\u{x:0>4}", .{c});
            run_start = i + 1;
        }
        self.put(s[run_start..]);
        self.put("\"");
    }

    /// Write a finite f64 plainly; null otherwise (JSON has no NaN/Inf).
    fn putNum(self: *Buf, x: f64) void {
        if (std.math.isFinite(x)) self.putf("{d}", .{x}) else self.put("null");
    }
};
// ---------------------------------------------------------------------------
// Per-column accumulator
// ---------------------------------------------------------------------------
/// Running profile of a single column, updated one cell at a time.
const Col = struct {
    /// Column name.
    name: []u8,
    /// True once at least one non-null value has been observed.
    seen_any: bool = false,
    /// Number of null cells.
    nulls: u64 = 0,
    /// Number of non-null cells.
    count: u64 = 0,

    // Type hypotheses: each starts true and is expected to be cleared by the
    // first value that contradicts it.
    is_int: bool = true,
    is_num: bool = true,
    is_bool: bool = true,
    is_date: bool = true,

    // Numeric statistics.
    n_num: u64 = 0,
    min: f64 = 0,
    max: f64 = 0,
    sum: f64 = 0,
    sumsq: f64 = 0,

    // Shortest / longest text length; `minlen` starts at the usize maximum.
    minlen: usize = std.math.maxInt(usize),
    maxlen: usize = 0,

    /// Occurrence count per distinct value.
    distinct: std.StringHashMap(u32),
    /// Flag for values that could not be tracked in `distinct`.
    distinct_overflow: bool = false,

    /// Label for the most specific surviving type hypothesis, tried in the
    /// order boolean, integer, number, date; "text" when none survive and
    /// "empty" when no non-null value was ever seen.
    fn typeName(self: *const Col) []const u8 {
        if (!self.seen_any) return "empty";
        const Guess = struct { alive: bool, label: []const u8 };
        const ranked = [_]Guess{
            .{ .alive = self.is_bool, .label = "boolean" },
            .{ .alive = self.is_int, .label = "integer" },
            .{ .alive = self.is_num, .label = "number" },
            .{ .alive = self.is_date, .label = "date" },
        };
        for (ranked) |guess| {
            if (guess.alive) return guess.label;
        }
        return "text";
    }
};
// ---------------------------------------------------------------------------
// Value classification helpers (operate on trimmed values)
// ---------------------------------------------------------------------------
/// ASCII case-insensitive equality: true when `a` and `b` have the same
/// length and match byte-for-byte after lower-casing ASCII letters.
fn eqIgnoreCase(a: []const u8, b: []const u8) bool {
    return std.ascii.eqlIgnoreCase(a, b);
}
/// True when `v` should be counted as a missing value: the empty string, or
/// (ignoring ASCII case) one of null, na, n/a, none, nil, or a literal
/// backslash followed by `n`.
fn isNull(v: []const u8) bool {
    if (v.len == 0) return true;
    inline for (.{ "null", "na", "n/a", "none", "nil", "\\n" }) |marker| {
        if (std.ascii.eqlIgnoreCase(v, marker)) return true;
    }
    return false;
}
/// True when `v` is, ignoring ASCII case, one of true, false, yes, or no.
fn isBool(v: []const u8) bool {
    inline for (.{ "true", "false", "yes", "no" }) |word| {
        if (std.ascii.eqlIgnoreCase(v, word)) return true;
    }
    return false;
}
/// True when `v` is an optional leading `+` or `-` followed by one or more
/// ASCII digits and nothing else.
fn isIntStr(v: []const u8) bool {
    if (v.len == 0) return false;
    const digits = switch (v[0]) {
        '+', '-' => v[1..],
        else => v,
    };
    if (digits.len == 0) return false; // a bare sign is not a number
    for (digits) |ch| switch (ch) {
        '0'...'9' => {},
        else => return false,
    };
    return true;
}
/// True for values shaped like YYYY-MM-DD or YYYY/MM/DD (both separators the
/// same character), either alone or followed by a space or `T` and anything
/// after it. Only the shape is checked, not whether the digits form a real
/// calendar date.
fn isDateStr(v: []const u8) bool {
    if (v.len < 10) return false;
    const sep = v[4];
    const sep_ok = sep == '-' or sep == '/';
    if (!sep_ok or v[7] != sep) return false;
    for (v[0..10], 0..) |ch, idx| {
        if (idx == 4 or idx == 7) continue; // separator positions
        switch (ch) {
            '0'...'9' => {},
            else => return false,
        }
    }
    return v.len == 10 or v[10] == ' ' or v[10] == 'T';
}
// ---------------------------------------------------------------------------
// CSV parsing — single streaming pass, RFC-4180-ish quote handling.
// ---------------------------------------------------------------------------
/// Cursor over the raw input that yields one unescaped field at a time.
const Parser = struct {
    src: []const u8,
    pos: usize = 0,
    field: Buf = .{}, // unescaped current field (reused)

    /// True once the cursor has consumed all of `src`.
    fn atEnd(self: *Parser) bool {
        return !(self.pos < self.src.len);
    }

    /// Advance the cursor to the next `delim` or newline, or to the end of
    /// the input, without consuming it.
    fn skipToTerminator(self: *Parser, delim: u8) void {
        const input = self.src;
        while (self.pos < input.len) : (self.pos += 1) {
            const ch = input[self.pos];
            if (ch == delim or ch == '\n') return;
        }
    }

    /// Read the next field into `self.field` and consume its terminator.
    /// Returns 0 when the field ended at a delimiter and 1 when it ended the
    /// record (newline or end of input).
    ///
    /// A field that begins with `"` is read as quoted: `""` yields one quote,
    /// a lone `"` closes the field, and anything between the closing quote
    /// and the terminator is skipped. An unquoted field is taken verbatim,
    /// minus one trailing carriage return.
    fn nextField(self: *Parser, delim: u8) u8 {
        self.field.len = 0;
        if (self.atEnd()) return 1;
        const input = self.src;

        if (input[self.pos] == '"') {
            self.pos += 1; // step past the opening quote
            quoted: while (self.pos < input.len) {
                const at = self.pos;
                if (input[at] != '"') {
                    self.field.put(input[at .. at + 1]);
                    self.pos = at + 1;
                    continue :quoted;
                }
                const doubled = at + 1 < input.len and input[at + 1] == '"';
                if (!doubled) {
                    self.pos = at + 1; // closing quote
                    break :quoted;
                }
                self.field.put("\"");
                self.pos = at + 2;
            }
            self.skipToTerminator(delim);
        } else {
            const begin = self.pos;
            self.skipToTerminator(delim);
            const raw = input[begin..self.pos];
            const has_cr = raw.len > 0 and raw[raw.len - 1] == '\r';
            self.field.put(if (has_cr) raw[0 .. raw.len - 1] else raw);
        }

        if (self.pos >= input.len) return 1;
        // Consume the terminator itself, whichever kind it is.
        const hit_delim = input[self.pos] == delim;
        self.pos += 1;
        return if (hit_delim) 0 else 1;
    }
};
/// Guess the field delimiter from the header record.
///
/// Counts how often each candidate (comma, tab, semicolon, pipe) occurs in
/// the first record and returns the most frequent one. Ties go to the
/// earlier candidate in that order, and comma is returned when none occurs.
///
/// Quoted sections — a `"` at the start of the input or directly after a
/// candidate delimiter, through its closing quote, with `""` as an escaped
/// quote — are skipped, so delimiter characters or newlines inside a quoted
/// header name are not counted. A `"` in the middle of a field is treated as
/// ordinary text.
fn detectDelim(src: []const u8) u8 {
    const candidates = [_]u8{ ',', '\t', ';', '|' };
    var counts = [_]usize{0} ** candidates.len;

    var at_field_start = true;
    var i: usize = 0;
    while (i < src.len) : (i += 1) {
        const c = src[i];
        if (at_field_start and c == '"') {
            // Jump to the closing quote (or the end of input if unterminated).
            i += 1;
            while (i < src.len) : (i += 1) {
                if (src[i] != '"') continue;
                if (i + 1 < src.len and src[i + 1] == '"') {
                    i += 1; // escaped quote, still inside the field
                    continue;
                }
                break;
            }
            at_field_start = false;
            continue;
        }
        if (c == '\n') break;
        at_field_start = false;
        for (candidates, &counts) |d, *n| {
            if (c == d) {
                n.* += 1;
                at_field_start = true; // a quote right after this opens a field
            }
        }
    }

    var best: u8 = ',';
    var best_n: usize = 0;
    for (candidates, counts) |d, n| {
        if (n > best_n) {
            best_n = n;
            best = d;
        }
    }
    return best;
}
// ---------------------------------------------------------------------------
// Profiling
// ---------------------------------------------------------------------------
/// Fold one raw cell into `col`'s running profile.
///
/// The cell is trimmed of spaces, tabs, CR and LF first. Null-like values
/// only bump `col.nulls`. Every other value updates the count, narrows the
/// type hypotheses, feeds the numeric and length statistics, and is tallied
/// in `col.distinct`. Once the map holds `MAX_DISTINCT` keys, unseen values
/// are not stored and `col.distinct_overflow` is set instead.
///
/// Numeric statistics are accumulated only while every value so far has
/// parsed as a finite float; the first value that does not clears
/// `col.is_num` and stops them.
///
/// Returns an allocation error if a new distinct key cannot be stored.
fn observe(col: *Col, raw: []const u8) !void {
    const v = std.mem.trim(u8, raw, " \t\r\n");
    if (isNull(v)) {
        col.nulls += 1;
        return;
    }
    col.seen_any = true;
    col.count += 1;

    // Type refinement: a hypothesis survives only while every value fits.
    // `and` short-circuits, so refuted hypotheses are not re-tested.
    col.is_bool = col.is_bool and isBool(v);
    col.is_int = col.is_int and isIntStr(v);
    col.is_date = col.is_date and isDateStr(v);

    if (col.is_num) {
        const f = std.fmt.parseFloat(f64, v) catch std.math.nan(f64);
        if (std.math.isFinite(f)) {
            col.n_num += 1;
            if (col.n_num == 1) {
                col.min = f;
                col.max = f;
            } else {
                if (f < col.min) col.min = f;
                if (f > col.max) col.max = f;
            }
            col.sum += f;
            col.sumsq += f * f;
        } else {
            // Unparseable, NaN or infinite: the column is not numeric.
            col.is_num = false;
        }
    }

    if (v.len < col.minlen) col.minlen = v.len;
    if (v.len > col.maxlen) col.maxlen = v.len;

    // Distinct + frequency (bounded)
    if (col.distinct.getPtr(v)) |tally| {
        tally.* += 1;
    } else if (col.distinct.count() < MAX_DISTINCT) {
        // The map stores the key slice, not a copy, so it needs its own
        // allocation that outlives the parser's reusable field buffer.
        const key = try alloc.dupe(u8, v);
        errdefer alloc.free(key); // do not leak the copy if the insert fails
        try col.distinct.put(key, 1);
    } else {
        col.distinct_overflow = true;
    }
}
/// One candidate for the most-frequent list: a value and its occurrence count.
const TopEntry = struct { v: []const u8, c: u32 };

/// Append a JSON array of up to `TOP_N` `{"v":…,"c":…}` objects to `out`,
/// holding the most frequent tracked values of `col` in descending order of
/// count.
fn writeTop(out: *Buf, col: *Col) void {
    var best: [TOP_N]TopEntry = undefined;
    var filled: usize = 0;

    var entries = col.distinct.iterator();
    while (entries.next()) |kv| {
        const cand = TopEntry{ .v = kv.key_ptr.*, .c = kv.value_ptr.* };
        if (filled < TOP_N) {
            best[filled] = cand;
            filled += 1;
            continue;
        }
        // Table is full: find the first slot with the lowest count and evict
        // it only if the candidate is strictly more frequent.
        var weakest: usize = 0;
        for (best[1..], 1..) |slot, idx| {
            if (slot.c < best[weakest].c) weakest = idx;
        }
        if (cand.c > best[weakest].c) best[weakest] = cand;
    }

    // Stable insertion sort, highest count first.
    var next: usize = 1;
    while (next < filled) : (next += 1) {
        const moving = best[next];
        var hole = next;
        while (hole > 0 and best[hole - 1].c < moving.c) {
            best[hole] = best[hole - 1];
            hole -= 1;
        }
        best[hole] = moving;
    }

    out.put("[");
    for (best[0..filled], 0..) |item, rank| {
        if (rank > 0) out.put(",");
        out.put("{\"v\":");
        out.putStr(item.v);
        out.putf(",\"c\":{d}}}", .{item.c});
    }
    out.put("]");
}
/// Emit one column's JSON object (name, type, counts, optional numeric and
/// length statistics, most frequent values) to `out`.
fn writeColumn(out: *Buf, c: *Col) void {
    out.put("{\"name\":");
    out.putStr(c.name);
    out.putf(",\"type\":\"{s}\"", .{c.typeName()});
    out.putf(",\"nulls\":{d},\"count\":{d}", .{ c.nulls, c.count });
    out.putf(",\"distinct\":{d},\"distinctOverflow\":{s}", .{
        c.distinct.count(),
        if (c.distinct_overflow) "true" else "false",
    });
    // Numeric stats are only meaningful while the numeric hypothesis holds:
    // once a value refutes it, accumulation stops, so the sums would describe
    // just a prefix of the column.
    if (c.is_num and c.n_num > 0) {
        const n: f64 = @floatFromInt(c.n_num);
        const mean = c.sum / n;
        // Clamp at zero: rounding can push the difference slightly negative.
        const variance = @max(0.0, c.sumsq / n - mean * mean);
        out.put(",\"min\":");
        out.putNum(c.min);
        out.put(",\"max\":");
        out.putNum(c.max);
        out.put(",\"mean\":");
        out.putNum(mean);
        out.put(",\"std\":");
        out.putNum(@sqrt(variance));
    }
    if (c.maxlen > 0 or c.seen_any) {
        const lo = if (c.minlen == std.math.maxInt(usize)) 0 else c.minlen;
        out.putf(",\"minLen\":{d},\"maxLen\":{d}", .{ lo, c.maxlen });
    }
    out.put(",\"top\":");
    writeTop(out, c);
    out.put("}");
}

/// Profile the delimited text in `src` and write a JSON report to `out`.
///
/// The first record supplies the column names (at most `MAX_COLS`); every
/// later record is streamed through the per-column accumulators. The report
/// carries byte/row/column counts, the detected delimiter, total nulls,
/// duplicate and ragged row counts, and one object per column. A leading
/// UTF-8 byte-order mark is skipped so it does not end up in the first
/// column name. Empty input produces an `{"error":…}` object instead.
///
/// All per-call state (column names, distinct keys and maps, the parser's
/// field buffer, the row-hash set) is released before returning; only the
/// bytes written to `out` survive. Returns an allocation error if that state
/// cannot be grown.
fn buildReport(out: *Buf, src: []const u8) !void {
    const bom = "\xEF\xBB\xBF";
    const text = if (std.mem.startsWith(u8, src, bom)) src[bom.len..] else src;
    if (text.len == 0) {
        out.put("{\"error\":\"empty input\"}");
        return;
    }
    const delim = detectDelim(text);
    var p = Parser{ .src = text };
    defer alloc.free(p.field.data); // freeing the initial empty slice is a no-op

    // --- Header row → column names ---
    var cols: std.ArrayList(Col) = .empty;
    defer {
        for (cols.items) |*c| {
            // The map does not own its keys; release each copy first.
            var keys = c.distinct.keyIterator();
            while (keys.next()) |k| alloc.free(k.*);
            c.distinct.deinit();
            alloc.free(c.name);
        }
        cols.deinit(alloc);
    }
    while (true) {
        const term = p.nextField(delim);
        if (cols.items.len < MAX_COLS) {
            const name = try alloc.dupe(u8, p.field.data[0..p.field.len]);
            errdefer alloc.free(name); // not yet owned by `cols` if append fails
            try cols.append(alloc, .{ .name = name, .distinct = std.StringHashMap(u32).init(alloc) });
        }
        if (term == 1) break;
    }
    const ncols = cols.items.len;
    if (ncols == 0) {
        out.put("{\"error\":\"no columns detected\"}");
        return;
    }

    // --- Data rows ---
    var rows: u64 = 0;
    var ragged: u64 = 0; // rows whose field count != header count
    var dup_rows: u64 = 0;
    var seen_rows = std.AutoHashMap(u64, u32).init(alloc);
    defer seen_rows.deinit();
    records: while (!p.atEnd()) {
        var ci: usize = 0;
        var hasher = std.hash.Wyhash.init(0);
        while (true) {
            const term = p.nextField(delim);
            const val = p.field.data[0..p.field.len];
            // A blank last line yields a single empty field that ends both
            // the record and the input. It is not a row, so stop before it
            // is observed — otherwise column 0 gains a spurious null.
            if (ci == 0 and term == 1 and val.len == 0 and p.atEnd()) break :records;
            hasher.update(val);
            hasher.update("\x00"); // field separator, so "ab","c" != "a","bc"
            if (ci < ncols) try observe(&cols.items[ci], val);
            ci += 1;
            if (term == 1) break;
        }
        rows += 1;
        if (ci != ncols) ragged += 1;
        // Rows are compared by 64-bit hash only; the text is not retained.
        const h = hasher.final();
        if (seen_rows.getPtr(h)) |c| {
            c.* += 1;
            dup_rows += 1;
        } else {
            try seen_rows.put(h, 1);
        }
    }

    // --- Emit JSON ---
    var total_nulls: u64 = 0;
    for (cols.items) |*c| total_nulls += c.nulls;
    out.put("{");
    out.putf("\"bytes\":{d},\"rows\":{d},\"cols\":{d},", .{ src.len, rows, ncols });
    out.putf("\"delim\":\"{s}\",", .{switch (delim) {
        ',' => "comma",
        '\t' => "tab",
        ';' => "semicolon",
        '|' => "pipe",
        else => "?",
    }});
    out.putf("\"totalNulls\":{d},\"dupRows\":{d},\"raggedRows\":{d},", .{ total_nulls, dup_rows, ragged });
    out.put("\"columns\":[");
    for (cols.items, 0..) |*c, i| {
        if (i != 0) out.put(",");
        writeColumn(out, c);
    }
    out.put("]}");
}