#!/usr/bin/env python3
"""Grade work through SeaOtter's own OtterScore gate — the dogfood grader.

The point: SeaOtter's whole pitch is "agents grade their work before it ships",
yet our OWN local agents (Claude / Codex / a human shell) routinely bypass the
gate. This is the frictionless, zero-config way to NOT bypass it — it uses the
exact public agent-native loop a web agent uses (autonomous self-signup -> async
jobs path -> poll -> verdict), so running it both grades the work AND dogfoods the
discovery loop end to end.

Zero config: with no key it self-signs-up a free-tier account
(POST /api/v1/agent-keys/signup) and caches the key under ~/.seaotter/. Set
OTTERLOOP_API_KEY (or SEAOTTER_EVAL_API_KEY) to use an existing key.

Usage:
    python3 scripts/agent/grade_work.py path/to/file.py        # grade one file
    python3 scripts/agent/grade_work.py --diff                 # grade the working-tree diff vs origin/dev
    git diff | python3 scripts/agent/grade_work.py --stdin     # grade piped text
    python3 scripts/agent/grade_work.py f.py --prompt "what the work was meant to do"

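Exit code: 0 if band == ship (or the input is empty), 1 for any other band, a
failed grade or a poll timeout, 2 on an API error such as an exhausted free
quota — so it can gate a pre-push hook / CI.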
Pure stdlib (urllib + json) — no pip install, runs under any python3.9+.
"""

from __future__ import annotations

import argparse
import json
import os
import subprocess
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path

# Base URL of the eval API, without a trailing slash so endpoints can be appended
# as f"{api}/api/v1/...". An unset *or empty* SEAOTTER_EVAL_API_BASE (e.g. after
# `export SEAOTTER_EVAL_API_BASE=`) falls back to the public endpoint instead of
# yielding an empty base and scheme-less request URLs.
DEFAULT_API = (os.getenv("SEAOTTER_EVAL_API_BASE") or "").rstrip("/") or "https://api.seaotter.ai"
# File in which a self-signed-up key is cached between runs.
KEY_CACHE = Path.home() / ".seaotter" / "agent_grade_key.json"
# Bands that map to exit code 0. "ship" is the only pass; everything else is a
# non-zero gate.
_BAND_OK = {"ship"}
# File suffixes graded with modality "code" when --modality is not given; any
# other suffix defaults to "text".
_CODE_EXTS = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".go", ".rs", ".java", ".c", ".cc",
    ".cpp", ".h", ".hpp", ".rb", ".php", ".sh", ".sql", ".kt", ".swift", ".scala",
}


class ApiError(Exception):
    """Raised when the eval API answers with an HTTP error status.

    The numeric status is kept in ``code`` and the response text in ``body`` so
    the CLI can show something useful (e.g. the 402 quota/pay-link) rather than
    a raw traceback. ``str(err)`` is ``"HTTP <code>"``.
    """

    def __init__(self, code: int, body: str) -> None:
        self.code, self.body = code, body
        super().__init__(f"HTTP {code}")


def _post(url: str, body: dict, key: str | None = None, timeout: float = 30.0) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON response.

    A bearer ``Authorization`` header is attached only when ``key`` is truthy.
    An HTTP error status is re-raised as ``ApiError`` carrying the status code
    and the response text; other failures propagate unchanged.
    """
    request = urllib.request.Request(
        url,
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    if key:
        request.add_header("Authorization", f"Bearer {key}")
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:  # noqa: S310 (trusted host)
            raw = response.read()
        return json.loads(raw.decode())
    except urllib.error.HTTPError as http_err:  # 4xx/5xx -> structured, not a traceback
        detail = http_err.read().decode(errors="replace")
        raise ApiError(http_err.code, detail) from http_err


def _get(url: str, key: str, timeout: float = 30.0) -> dict:
    """GET ``url`` with a bearer ``key`` and return the decoded JSON response.

    An HTTP error status is re-raised as ``ApiError`` carrying the status code
    and the response text; other failures propagate unchanged.
    """
    request = urllib.request.Request(url, method="GET")
    request.add_header("Authorization", f"Bearer {key}")
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:  # noqa: S310
            raw = response.read()
        return json.loads(raw.decode())
    except urllib.error.HTTPError as http_err:
        detail = http_err.read().decode(errors="replace")
        raise ApiError(http_err.code, detail) from http_err


def resolve_key(api: str) -> str:
    """Return an API key: env -> cache file -> autonomous free signup (cached).

    Lookup order:
      1. ``OTTERLOOP_API_KEY``, then ``SEAOTTER_EVAL_API_KEY`` (empty counts as unset).
      2. The JSON cache at ``KEY_CACHE``, used only when it was minted for this
         same ``api`` base.
      3. A fresh self-signup via ``POST {api}/api/v1/agent-keys/signup``; the
         new key is then written to ``KEY_CACHE``.

    Never prints the secret itself — only its prefix.

    Raises:
        ApiError: the signup request returned an HTTP error status.
        KeyError: the signup response has no ``api_key`` field.
    """
    env = os.getenv("OTTERLOOP_API_KEY") or os.getenv("SEAOTTER_EVAL_API_KEY")
    if env:
        return env

    # A missing, unreadable, non-UTF-8, malformed or wrongly-shaped cache file is
    # treated as "no cache" rather than crashing the run.
    try:
        cached = json.loads(KEY_CACHE.read_text())
    except (OSError, ValueError):
        cached = None
    if isinstance(cached, dict) and cached.get("api_key") and cached.get("api_base") == api:
        return cached["api_key"]

    # Fully autonomous self-signup — the same path a web agent uses.
    user = os.getenv("USER") or "agent"  # an empty USER would give "dogfood-@..."
    sys.stderr.write(f"[grade_work] no key found; self-signing-up a free-tier account on {api} ...\n")
    res = _post(f"{api}/api/v1/agent-keys/signup",
                {"email": f"dogfood-{user}@seaotter.ai", "org_name": "seaotter-dogfood"})
    api_key = res["api_key"]

    payload = json.dumps({"api_base": api, "api_key": api_key,
                          "key_prefix": res.get("key_prefix")})
    try:
        KEY_CACHE.parent.mkdir(parents=True, exist_ok=True)
        # Create the file owner-only from the start so the secret is never
        # readable by others, not even between the write and a later chmod.
        fd = os.open(KEY_CACHE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as fh:
            fh.write(payload)
        # The mode passed to os.open only applies on creation; tighten a
        # pre-existing file as well.
        KEY_CACHE.chmod(0o600)
    except OSError as exc:
        # The key is already minted; failing to cache it must not lose it.
        sys.stderr.write(f"[grade_work] minted {res.get('key_prefix')}… (could not cache at {KEY_CACHE}: {exc})\n")
        return api_key
    sys.stderr.write(f"[grade_work] minted {res.get('key_prefix')}… (cached at {KEY_CACHE}, free_quota={res.get('free_quota')})\n")
    return api_key


def _git_diff(*revision_args: str, quiet: bool = False) -> str:
    """Return the output of ``git diff <revision_args...>`` as text.

    Output is decoded as UTF-8 with undecodable bytes replaced, so a diff that
    touches a non-UTF-8 file cannot abort the run with UnicodeDecodeError.
    ``quiet`` discards git's stderr. Raises CalledProcessError on a non-zero
    exit and OSError when git cannot be started.
    """
    return subprocess.check_output(
        ["git", "diff", *revision_args],
        encoding="utf-8",
        errors="replace",
        stderr=subprocess.DEVNULL if quiet else None,
    )


def gather_artifact(args: argparse.Namespace) -> tuple[str, str, str]:
    """Return (text, modality, label) for the work to grade.

    Sources, in priority order:
      * ``args.stdin`` — everything on standard input (modality defaults to "text").
      * ``args.diff``  — ``git diff <args.diff_base>``; if that ref cannot be
        diffed, the plain working-tree diff; if the result is empty, the staged
        diff (modality defaults to "code").
      * ``args.path``  — the file's contents, read as UTF-8 with undecodable
        bytes replaced (modality "code" for suffixes in ``_CODE_EXTS``, else "text").

    ``args.modality``, when set, overrides the default modality in every case.

    Raises:
        SystemExit: no source was selected, git could not produce a diff, or
            the file could not be read.
    """
    if args.stdin:
        return sys.stdin.read(), args.modality or "text", "<stdin>"
    if args.diff:
        base = args.diff_base
        label = f"git diff {base}"
        try:
            try:
                text = _git_diff(base, quiet=True)
            except subprocess.CalledProcessError:
                sys.stderr.write(f"[grade_work] base '{base}' unresolved; grading the working-tree diff instead\n")
                text = _git_diff()
                label = "git diff (working tree)"
            if not text.strip():
                # Nothing unstaged: fall back to whatever is staged.
                text = _git_diff("--cached")
                label = "git diff --cached"
        except (subprocess.CalledProcessError, OSError) as exc:
            # Not a repository, or git is not installed — say so instead of a traceback.
            raise SystemExit(f"cannot run git diff ({exc}) — is git installed and is this a git repository?") from exc
        return text, args.modality or "code", label
    if args.path:
        p = Path(args.path)
        try:
            # Explicit UTF-8 so the result does not depend on the locale encoding.
            text = p.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            raise SystemExit(f"cannot read {p}: {exc}") from exc
        modality = args.modality or ("code" if p.suffix.lower() in _CODE_EXTS else "text")
        return text, modality, str(p)
    raise SystemExit("nothing to grade — pass a file path, --diff, or --stdin")


# Maximum number of artifact characters submitted for grading; grade() warns on
# stderr and sends only this many leading characters of longer input.
_MAX_ARTIFACT_CHARS = 190_000


def grade(api: str, key: str, text: str, modality: str, prompt: str, poll_s: float) -> dict:
    """Submit ``text`` as an async eval job and poll until it finishes.

    Args:
        api: API base URL without a trailing slash.
        key: bearer key used for the submit and every poll.
        text: the artifact; only the first ``_MAX_ARTIFACT_CHARS`` characters
            are sent (a warning goes to stderr when it is longer).
        modality: modality string forwarded to the API.
        prompt: what the work was meant to do; a generic review prompt is
            used when empty.
        poll_s: maximum number of seconds to keep polling.

    Returns:
        The job document as soon as its status is "completed" or "failed".

    Raises:
        ApiError: the submit or a poll returned an HTTP error status.
        KeyError: the submit response has no ``job_id``.
        SystemExit: the job did not finish within ``poll_s`` seconds.
    """
    if len(text) > _MAX_ARTIFACT_CHARS:
        sys.stderr.write(f"[grade_work] artifact truncated to {_MAX_ARTIFACT_CHARS} of {len(text)} chars for grading\n")
    submit = _post(f"{api}/api/v1/eval/jobs", {
        "submission": "async",
        "modality": modality,
        # Sent explicitly so the helper works against both the current and the
        # ergonomic-defaults API. A naive third-party agent can omit these.
        "rubric_id": "enterprise-acceptance-default",
        "artifact_ref": "inline:dogfood",
        "user_prompt": prompt or f"Review this {modality} for flaws before it ships.",
        "artifact_parts": [{"mime_type": "text/plain", "text": text[:_MAX_ARTIFACT_CHARS]}],
    }, key=key)
    job_id = submit["job_id"]
    sys.stderr.write(f"[grade_work] job {job_id} submitted; polling (warm grades are seconds; a cold GPU loads the model and can take several minutes (up to ~6 min)) ...\n")

    poll_interval = 8.0
    # Monotonic clock: a wall-clock adjustment (NTP step, suspend/resume) must
    # neither cut the wait short nor extend it.
    deadline = time.monotonic() + poll_s
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline, so the requested timeout is honoured.
        time.sleep(min(poll_interval, remaining))
        try:
            job = _get(f"{api}/api/v1/eval/jobs/{job_id}", key)
        except (OSError, json.JSONDecodeError) as e:
            # Transient blip (connection error, timeout, or a non-JSON body such
            # as a proxy error page) — keep polling. HTTP error statuses arrive
            # as ApiError and propagate.
            sys.stderr.write(f"[grade_work]   (transient poll error: {e}; retrying)\n")
            continue
        st = job.get("status")
        if st in ("completed", "failed"):
            return job
        sys.stderr.write(f"[grade_work]   status={st}\n")
    raise SystemExit(f"timed out after {poll_s:.0f}s waiting for job {job_id}")


def print_verdict(api: str, key: str, job: dict) -> str:
    """Print the grading verdict for a finished job and return its band.

    For a job whose status is "failed", prints the job's error and returns
    "failed". Otherwise prints a banner with score, band and flaw count taken
    from ``job["result_summary"]``. When the job carries a ``run_id``, it then
    fetches the run on a best-effort basis and lists up to 20 flaws and up to
    10 upgrades from the last iteration's critic verdict.

    Returns:
        "failed", or the summary's ``band`` (falling back to ``decision``, then
        "?") exactly as reported by the API.
    """
    if job.get("status") == "failed":
        print(f"GRADING FAILED: {job.get('error')}")
        return "failed"
    summary = job.get("result_summary") or {}
    # band may be missing/None on an unusual finalize — coalesce before .upper().
    band = summary.get("band") or summary.get("decision") or "?"
    score = summary.get("score")
    run_id = job.get("run_id")
    print("\n" + "=" * 60)
    print(f"  OtterScore: {score}   band: {band.upper()}   flaws: {summary.get('flaw_count', '?')}")
    print("=" * 60)
    # Pull the full flaws/upgrades from the run.
    if run_id:
        try:
            run = _get(f"{api}/api/v1/eval/runs/{run_id}", key)
            iters = run.get("iterations") or []
            verdict = (iters[-1] if iters else {}).get("critic_verdict") or {}
            for fl in (verdict.get("flaws") or [])[:20]:
                print(f"  [{fl.get('severity', '?'):>8}] {fl.get('criterion', '?')}: {fl.get('detail') or fl.get('evidence', '')}")
            ups = verdict.get("upgrades") or []
            if ups:
                print("  upgrades:")
                for up in ups[:10]:
                    print(f"    - {up if isinstance(up, str) else up.get('detail', up)}")
        except (ApiError, OSError, ValueError, KeyError, AttributeError, TypeError) as e:
            # The verdict is already printed; the detail listing is best-effort.
            # _get turns HTTP error statuses into ApiError (not URLError), read
            # timeouts surface as OSError, a non-JSON body as ValueError, and an
            # unexpected response shape as AttributeError/TypeError — none of
            # them may turn a delivered verdict into a traceback.
            sys.stderr.write(f"[grade_work] (could not fetch full flaws: {e})\n")
    return band


def _checkout_url(body: str) -> str | None:
    """Return the checkout URL carried by a 402 response body, if any.

    Accepts a string ``checkout_url`` either at the top level of the JSON body
    or inside its ``detail`` object; any other shape yields None.
    """
    try:
        payload = json.loads(body)
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None
    detail = payload.get("detail")
    url = payload.get("checkout_url") or (detail.get("checkout_url") if isinstance(detail, dict) else None)
    return url if isinstance(url, str) else None


def main(argv: list[str] | None = None) -> int:
    """Run the CLI and return the process exit code.

    Args:
        argv: argument list to parse; ``None`` (the default) parses
            ``sys.argv[1:]``.

    Returns:
        0 when the band is in ``_BAND_OK`` or the input is empty, 1 for any
        other band or a failed grade, 2 when the API answers with an HTTP error
        (including an exhausted free quota) or cannot be reached.
    """
    ap = argparse.ArgumentParser(description="Grade work through SeaOtter's OtterScore gate.")
    ap.add_argument("path", nargs="?", help="file to grade")
    ap.add_argument("--diff", action="store_true", help="grade the git diff instead of a file")
    ap.add_argument("--diff-base", default="origin/dev", help="base ref for --diff (default origin/dev)")
    ap.add_argument("--stdin", action="store_true", help="grade text from stdin")
    ap.add_argument("--modality", help="override modality (text/code/...)")
    ap.add_argument("--prompt", default="", help="what the work was meant to do (improves grading)")
    ap.add_argument("--api", default=DEFAULT_API, help=f"API base (default {DEFAULT_API})")
    ap.add_argument("--timeout", type=float, default=600.0,
                    help="max seconds to wait (match the server SLA; a fully-cold GPU can take several min)")
    args = ap.parse_args(argv)

    api = args.api.rstrip("/")
    text, modality, label = gather_artifact(args)
    if not text.strip():
        print("nothing to grade (empty input)")
        return 0
    try:
        key = resolve_key(api)
        sys.stderr.write(f"[grade_work] grading {label} as modality={modality} via {api}\n")
        job = grade(api, key, text, modality, args.prompt, args.timeout)
    except ApiError as e:
        if e.code == 402:  # free quota exhausted — surface the pay-link, don't traceback
            url = _checkout_url(e.body)
            print(f"FREE QUOTA EXHAUSTED (HTTP 402). {'Pay to continue: ' + url if url else e.body[:300]}")
            print("Or fetch a pay-link any time: POST /api/v1/billing/pay-link")
            return 2
        print(f"GRADING UNAVAILABLE (HTTP {e.code}): {e.body[:300]}")
        return 2
    except OSError as e:
        # Connection refused, DNS failure, timeout (URLError is an OSError):
        # report it like any other unavailable-API case instead of a traceback.
        print(f"GRADING UNAVAILABLE (network/IO error): {e}")
        return 2
    band = print_verdict(api, key, job)
    return 0 if band in _BAND_OK else 1


# Script entry point: run the CLI and hand main()'s return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
