# -*- coding: utf-8 -*-
"""Remplit la colonne Code du CSV mariage à partir des fiches guests Statamic."""
import csv
import io
import re
import unicodedata
from pathlib import Path

# Directory scanned for guest "*.md" entries: content/collections/guests under
# the parent of the directory that contains this script.
GUESTS_DIR = Path(__file__).resolve().parent.parent / "content" / "collections" / "guests"
# Absolute path of the semicolon-separated CSV that main() reads and then
# overwrites in place.
CSV_PATH = Path(r"c:\Users\yanni\OneDrive\mariage.csv")


def parse_frontmatter(md: str) -> dict:
    """Extract the guest fields from the front matter of a Markdown document.

    The front matter is the text between the first two ``---`` markers. Each
    line is read as a flat ``key: value`` pair (this is not a YAML parser);
    one pair of matching surrounding quotes is removed from the value.

    Args:
        md: Full text of the Markdown file.

    Returns:
        A dict holding whichever of ``code``, ``first_name``, ``last_name``
        and ``title`` were found, or an empty dict when the text has fewer
        than two ``---`` markers.
    """
    wanted = {"code", "first_name", "last_name", "title"}

    # Fewer than three sections means there is no closed front-matter block.
    sections = md.split("---", 2)
    if len(sections) < 3:
        return {}

    fields = {}
    for raw_line in sections[1].splitlines():
        key, colon, value = raw_line.partition(":")
        if not colon:
            continue
        key = key.strip()
        if key not in wanted:
            continue
        value = value.strip()
        # Single quotes are checked first, then double quotes; only one
        # layer of quoting is ever removed.
        for quote in ("'", '"'):
            if value.startswith(quote) and value.endswith(quote):
                value = value[1:-1]
                break
        fields[key] = value
    return fields


def norm(s: str) -> str:
    """Return a lowercase, accent-free, trimmed key used to compare names.

    The text is lowercased and decomposed (NFD) so that every accented letter
    becomes a base letter followed by combining marks, which are then dropped.
    This covers all accented letters (not only a fixed French list) and also
    input that already arrives in decomposed form. The ligatures ``œ`` and
    ``æ`` have no Unicode decomposition and are expanded explicitly.

    Args:
        s: Raw name text.

    Returns:
        The normalized string, with leading/trailing whitespace removed.
    """
    decomposed = unicodedata.normalize("NFD", s.lower())
    # combining() is non-zero exactly for the accent marks split off by NFD.
    base_only = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return base_only.replace("œ", "oe").replace("æ", "ae").strip()


def load_guests():
    """Build the name-to-code lookup tables from the guest Markdown files.

    Every ``*.md`` file in ``GUESTS_DIR`` is parsed with ``parse_frontmatter``;
    entries without a ``code`` are skipped. When ``last_name`` is empty the
    ``title`` field is used as the last name instead.

    Returns:
        A ``(by_full, by_first_last, singles)`` tuple where

        * ``by_full`` maps ``norm("<first> <last>")`` to the guest code,
        * ``by_first_last`` maps ``(norm(first), norm(last))`` to the guest
          code (only for guests that have both parts),
        * ``singles`` is a list with one dict per guest holding the keys
          ``code``, ``fn``, ``ln`` (normalized) and ``fn_raw``, ``ln_raw``.

        When two guests share a key, the file that sorts last wins.
    """
    by_full = {}
    by_first_last = {}
    singles = []  # one dict per guest, see the docstring for its keys

    # glob() yields paths in arbitrary order; sorting makes the outcome for
    # duplicate names reproducible from one run to the next.
    for path in sorted(GUESTS_DIR.glob("*.md")):
        fields = parse_frontmatter(path.read_text(encoding="utf-8"))
        code = fields.get("code")
        if not code:
            continue
        fn = fields.get("first_name") or ""
        # Last-name fallback: use the title when no last name is set.
        ln = fields.get("last_name") or fields.get("title") or ""

        full = f"{fn} {ln}".strip()
        if full:
            by_full[norm(full)] = code
        if fn and ln:
            by_first_last[(norm(fn), norm(ln))] = code
        singles.append(
            {
                "code": code,
                "fn": norm(fn),
                "ln": norm(ln),
                "fn_raw": fn,
                "ln_raw": ln,
            }
        )

    return by_full, by_first_last, singles


def split_csv_names(cell: str):
    """Split a CSV name cell into the individual people it lists.

    Parenthesized remarks are removed first, then the text is cut on ``&``,
    ``+``, ``/`` or the standalone word ``et`` (any letter case), e.g.
    'Alice Blumer & Florian' or 'Stéphanie et Pascal Dubois'.

    Args:
        cell: Raw content of the name column.

    Returns:
        The non-empty, whitespace-trimmed pieces in their original order.
    """
    # Drop parenthesized remarks such as "(Tarlala Lita)".
    without_remarks = re.sub(r"\s*\([^)]*\)\s*", " ", cell.strip()).strip()
    pieces = re.split(
        r"\s*(?:&|\+|/)\s*|\s+et\s+", without_remarks, flags=re.IGNORECASE
    )
    names = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            names.append(trimmed)
    return names


def tokenize_person(s: str):
    """Heuristically split a person's name into first-name and last-name tokens.

    The last whitespace-separated word is taken as the family name and all
    preceding words as given names; a single word counts as a given name.

    Args:
        s: Name of one person.

    Returns:
        A ``(first_name_tokens, last_name_tokens)`` tuple of lists whose
        items went through ``norm``. Empty or whitespace-only input yields
        ``([], [])``.
    """
    words = s.split()
    # Without this guard an empty name would raise IndexError below.
    if not words:
        return ([], [])
    if len(words) == 1:
        return ([norm(words[0])], [])
    # 2+ words: the last one is the family name, the rest are given names.
    *given, family = words
    return ([norm(w) for w in given], [norm(family)])


def find_code(person: str, by_full, by_first_last, singles) -> str | None:
    """Look up the guest code for one person's name.

    Strategies are tried in this order, returning on the first hit:

    1. the whole normalized name as a key of ``by_full``;
    2. for two or more words, ``(first word, last word)`` in
       ``by_first_last``, then (three or more words) all words but the last
       joined as a compound first name together with the last word;
    3. for a single word, the only guest with that first name and no last
       name, otherwise the only guest with that first name;
    4. for two or more words, the only guest with that last name.

    Args:
        person: Name of one person as written in the CSV.
        by_full: Mapping of normalized full name to code.
        by_first_last: Mapping of ``(first, last)`` normalized pairs to code.
        singles: Guest dicts exposing the keys ``fn``, ``ln`` and ``code``.

    Returns:
        The matching code, or ``None`` when nothing matches unambiguously.
    """
    cleaned = person.strip()
    whole = norm(cleaned)
    if whole in by_full:
        return by_full[whole]

    tokens = cleaned.split()
    if not tokens:
        return None

    if len(tokens) == 1:
        # A lone word is most often just a first name.
        given = norm(tokens[0])
        same_given = [g for g in singles if g["fn"] == given]
        no_surname = [g for g in same_given if not g["ln"]]
        for pool in (no_surname, same_given):
            if len(pool) == 1:
                return pool[0]["code"]
        return None

    # Two or more words: the last one is treated as the family name.
    surname = norm(tokens[-1])
    keys = [(norm(tokens[0]), surname)]
    if len(tokens) > 2:
        # Compound first name followed by the family name.
        keys.append((norm(" ".join(tokens[:-1])), surname))
    for key in keys:
        if key in by_first_last:
            return by_first_last[key]

    # Last resort: accept the family name alone when exactly one guest has it.
    same_surname = [g for g in singles if g["ln"] == surname]
    if len(same_surname) == 1:
        return same_surname[0]["code"]
    return None


def _decode_csv_bytes(raw: bytes) -> str:
    """Decode CSV bytes as UTF-8 (BOM optional), else cp1252, else latin-1."""
    # "utf-8-sig" also accepts UTF-8 without a BOM, so plain "utf-8" is not
    # tried separately.
    for enc in ("utf-8-sig", "cp1252"):
        try:
            return raw.decode(enc)
        except UnicodeDecodeError:
            continue
    # latin-1 maps every byte to a character, so this final attempt cannot fail.
    return raw.decode("latin-1")


def main() -> None:
    """Fill the "Code" and "Code 2" columns of the CSV at ``CSV_PATH``.

    The file is read as semicolon-separated text, a "Code 2" column is added
    right after the code column when missing, and each row's name cell is
    split into people whose codes are looked up with ``find_code``. The file
    is then rewritten in place as UTF-8 with BOM.

    For each row:

    * an empty name or one starting with ``#`` clears both code columns;
    * when every person is matched, the first code goes to "Code" and the
      remaining ones (joined with `` | ``) to "Code 2";
    * otherwise the codes that were found are joined with `` | `` in "Code",
      "Code 2" is cleared, and the row is listed on stdout as unresolved.
    """
    by_full, by_first_last, singles = load_guests()

    text = _decode_csv_bytes(CSV_PATH.read_bytes())
    table = list(csv.reader(io.StringIO(text), delimiter=";"))
    if not table:
        # Nothing to process; avoids StopIteration on a missing header row.
        print("Fichier CSV vide:", CSV_PATH)
        return
    header, *rows = table

    # Expected columns: Liste de nom;Table ID;...
    # Each column is resolved on its own so that one missing label does not
    # throw away the position that was found for the other.
    idx_name = header.index("Liste de nom") if "Liste de nom" in header else 0
    idx_code = header.index("Code") if "Code" in header else 4
    if idx_code >= len(header):
        # The positional fallback points past the header: create the column
        # instead of failing later with IndexError.
        header.extend([""] * (idx_code - len(header)))
        header.append("Code")

    # Add "Code 2" when absent (a matching cell is inserted in every row).
    if "Code 2" not in header:
        insert_at = idx_code + 1
        header.insert(insert_at, "Code 2")
        for row in rows:
            row.extend([""] * (insert_at - len(row)))
            row.insert(insert_at, "")
        if idx_name >= insert_at:
            # The name column sat to the right of the insertion point.
            idx_name += 1
    idx_code2 = header.index("Code 2")

    width = len(header)
    unmatched = []
    for row in rows:
        row.extend([""] * (width - len(row)))
        name_cell = row[idx_name].strip()
        if not name_cell or name_cell.startswith("#"):
            row[idx_code] = ""
            row[idx_code2] = ""
            continue

        parts = split_csv_names(name_cell)
        codes = [find_code(p, by_full, by_first_last, singles) for p in parts]
        found = [c for c in codes if c]

        # `codes` may be empty (e.g. a cell holding only a parenthesized
        # remark), so "all matched" must also require at least one code.
        if codes and len(found) == len(codes):
            row[idx_code] = codes[0]
            # Equals codes[1] for a couple and "" for one person; any further
            # codes are kept rather than dropped.
            row[idx_code2] = " | ".join(codes[1:])
        else:
            row[idx_code] = " | ".join(found)
            row[idx_code2] = ""
            # Partially matched rows are reported too, so the missing person
            # is not overlooked.
            unmatched.append((name_cell, parts, codes))

    # Excel on Windows: cp1252 or UTF-8 with BOM.
    with CSV_PATH.open("w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f, delimiter=";")
        writer.writerow(header)
        # Rows are already padded; cells beyond the header width are cut.
        writer.writerows(row[:width] for row in rows)

    print("OK, fichier écrit:", CSV_PATH)
    if unmatched:
        print("\nNon résolus:")
        for item in unmatched:
            print(" ", item)


# Run the CSV update only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()