# joppe/csv_to_dicts.py

"""
Lees een CSV-bestand regel per regel en zet het om naar een lijst van dicts.

De eerste regel wordt gebruikt als de sleutels (kolomnamen). Elke
daaropvolgende regel wordt op het scheidingsteken gesplitst en gekoppeld
aan die sleutels.

Bevat een regel een dubbel aanhalingsteken (`"`), dan wordt er
aanhalingsteken-bewust gesplitst: scheidingstekens binnen `"..."` worden
genegeerd, en een dubbele `""` binnen een aangehaald veld levert één
letterlijke `"` op.

Gebruik:
    uv run csv_to_dicts.py [bestand] [scheidingsteken]

Standaard:
    bestand          = sample.csv
    scheidingsteken  = ,
"""

import sys


def split_quoted(line: str, sep: str) -> list[str]:
    """Split *line* on *sep* while honouring double-quoted sections.

    A ``"`` seen while the current field is still empty opens a quoted
    section. Inside it, occurrences of *sep* are kept as ordinary text, a
    doubled ``""`` produces one literal ``"``, and a single ``"`` closes
    the section. The opening and closing quotes are not included in the
    returned field. A ``"`` seen after the field already has content is
    kept as a literal character (unless it is part of *sep*). A quoted
    section that is never closed simply runs to the end of the line.

    Args:
        line: One record, without its line terminator.
        sep: The separator; may be longer than one character.

    Returns:
        The fields in order. There is always at least one element: an
        empty *line* yields ``[""]``.

    Raises:
        ValueError: If *sep* is empty.
    """
    if not sep:
        # An empty separator matches at every index without consuming any
        # input, so the scan below would never advance. Reject it with the
        # same error ``str.split`` raises for an empty separator.
        raise ValueError("empty separator")

    fields: list[str] = []
    buf: list[str] = []  # characters of the field currently being built
    in_quotes = False
    end = len(line)
    sep_len = len(sep)
    i = 0
    while i < end:
        c = line[i]
        if in_quotes:
            if c != '"':
                buf.append(c)
            elif line[i + 1 : i + 2] == '"':
                # Doubled quote inside a quoted section: one literal quote.
                buf.append('"')
                i += 1  # skip the second quote of the pair
            else:
                in_quotes = False
            i += 1
        elif c == '"' and not buf:
            # Only a quote at the start of an (still empty) field opens a
            # quoted section.
            in_quotes = True
            i += 1
        elif line.startswith(sep, i):
            fields.append("".join(buf))
            buf = []
            i += sep_len
        else:
            buf.append(c)
            i += 1
    # Whatever is left in the buffer is the last field (possibly empty).
    fields.append("".join(buf))
    return fields


def split_line(line: str, sep: str) -> list[str]:
    """Split one record into fields, using quote-aware parsing only if needed.

    Lines that contain no ``"`` at all are split with plain ``str.split``;
    any line containing a ``"`` is handed to ``split_quoted``.

    Args:
        line: One record, without its line terminator.
        sep: The separator string.

    Returns:
        The list of field values.
    """
    needs_quote_handling = line.find('"') != -1
    return split_quoted(line, sep) if needs_quote_handling else line.split(sep)


def csv_to_dicts(path: str, sep: str = ",") -> list[dict[str, str]]:
    """Read a delimited text file into a list of dicts keyed by its header.

    The file is read line by line. Empty lines are skipped. The first
    non-empty line supplies the keys; every later non-empty line is split
    with ``split_line`` and paired with those keys.

    Keys and values are paired with ``zip``, so a line with fewer values
    than there are keys produces a dict without the trailing keys, and
    surplus values on a line are dropped.

    Args:
        path: Path of the file to read.
        sep: Separator between fields; defaults to a comma.

    Returns:
        One dict per data line, in file order. Empty if the file has no
        data lines.
    """
    rows: list[dict[str, str]] = []
    keys: list[str] | None = None
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading
    # byte-order mark, which would otherwise end up glued to the first
    # column name.
    with open(path, encoding="utf-8-sig") as f:
        for raw in f:
            line = raw.rstrip("\n").rstrip("\r")
            if not line:
                continue
            values = split_line(line, sep)
            if keys is None:
                # First non-empty line: remember it as the header.
                keys = values
            else:
                rows.append(dict(zip(keys, values)))
    return rows


if __name__ == "__main__":
    # Optional positional arguments: [file] [separator].
    cli_args = sys.argv[1:]
    path = cli_args[0] if cli_args else "sample.csv"
    sep = cli_args[1] if len(cli_args) >= 2 else ","

    rows = csv_to_dicts(path, sep)

    # Summary line first, then one parsed record per line.
    print(f"{len(rows)} regels gelezen uit {path}\n")
    for record in rows:
        print(record)