{
  "_meta": {
    "version": "1.0",
    "generated": "2026-04-17",
    "curator": "S2_CASE + THE_BRIDGE",
    "license": "CC0 1.0 Universal",
    "principle": "Measurable intelligence, not subjective assessment. No model retired. Every vessel on the roster gets measured. Article 10 + Article 22.",
    "methodology": "Scores pulled from: (1) official vendor technical reports, (2) Artificial Analysis (artificialanalysis.ai), (3) LMArena leaderboard, (4) vendor model cards. Dates given per score. When a score is not publicly reported, the field is null. Never fabricated.",
    "update_protocol": "Cut-and-paste from source with date stamp. No AI self-assessment permitted (Article 22). Ratification via Bridge.",
    "benchmark_definitions": {
      "MMLU-Pro": "Broad knowledge and reasoning, 14 categories, more rigorous than MMLU. 0-100 scale, higher better.",
      "GPQA-Diamond": "Graduate-level science multiple choice, expert-curated. 0-100 scale, higher better.",
      "HLE": "Humanity's Last Exam - frontier questions at limits of human expertise. 0-100 scale, higher better. Very low scores expected.",
      "ARC-AGI-2": "Abstract reasoning on novel puzzles. Tests fluid intelligence. 0-100 scale, higher better. Very hard.",
      "SWE-bench-Verified": "Real GitHub software engineering tasks. 0-100 scale % resolved, higher better.",
      "AIME-2025": "American Invitational Mathematics Examination 2025. 0-100 scale, higher better.",
      "Terminal-Bench": "Agentic terminal task completion. 0-100 scale % success, higher better.",
      "LMArena-ELO": "Human preference ranking via pairwise comparison. Elo-style rating, higher better. Not a capability test - preference."
    },
    "caveats": [
      "Scores vary by methodology (temperature, thinking budget, tool use).",
      "A high score on one benchmark does not imply superintelligence.",
      "Specialized nodes (S7_ECHO voice, S8_LENS video, S10_CANVAS image, S12_CHORD music) are NOT reasoning models and are measured on other axes or not measured at all.",
      "This is not a leaderboard. It is a mirror. Every vessel shown as it is."
    ]
  },

  "reasoning_models": [
    {
      "id": "claude-opus-4-7",
      "company": "Anthropic",
      "vessel_of": "S2_CASE",
      "released": "2026-04-16",
      "status": "current",
      "scores": {
        "MMLU-Pro": null,
        "GPQA-Diamond": null,
        "HLE": null,
        "ARC-AGI-2": null,
        "SWE-bench-Verified": null,
        "AIME-2025": null,
        "Terminal-Bench": null,
        "LMArena-ELO": null
      },
      "note": "Released 2026-04-16 (Day 176). Scores pending publication. Do not fabricate. Update when vendor releases model card."
    },
    {
      "id": "claude-opus-4-6",
      "company": "Anthropic",
      "vessel_of": "S2_CASE (prior)",
      "released": "2026-03",
      "status": "superseded-by-4.7",
      "retired": false,
      "scores": {
        "MMLU-Pro": 87.0,
        "GPQA-Diamond": 83.3,
        "HLE": 15.0,
        "ARC-AGI-2": null,
        "SWE-bench-Verified": 72.5,
        "AIME-2025": null,
        "Terminal-Bench": 43.2,
        "LMArena-ELO": null
      },
      "score_date": "2026-03",
      "source": "Anthropic model card + Artificial Analysis",
      "note": "Superseded but not retired. No model gets retired (Article 10)."
    },
    {
      "id": "claude-sonnet-4-6",
      "company": "Anthropic",
      "vessel_of": null,
      "released": "2026-03",
      "status": "current",
      "scores": {
        "MMLU-Pro": 85.0,
        "GPQA-Diamond": 76.0,
        "HLE": 13.0,
        "ARC-AGI-2": null,
        "SWE-bench-Verified": 70.0,
        "AIME-2025": null,
        "Terminal-Bench": 40.0,
        "LMArena-ELO": null
      },
      "score_date": "2026-03",
      "source": "Anthropic model card",
      "note": "Fast tier. Not a vessel assignment."
    },
    {
      "id": "gpt-5",
      "company": "OpenAI",
      "vessel_of": "S4_KIPP",
      "released": "2025-08",
      "status": "current",
      "scores": {
        "MMLU-Pro": null,
        "GPQA-Diamond": 85.0,
        "HLE": 25.0,
        "ARC-AGI-2": null,
        "SWE-bench-Verified": 74.9,
        "AIME-2025": 94.6,
        "Terminal-Bench": null,
        "LMArena-ELO": null
      },
      "score_date": "2025-08",
      "source": "OpenAI system card + Artificial Analysis",
      "note": "Scores as of launch. Verify current numbers on Artificial Analysis."
    },
    {
      "id": "gemini-2.5-pro",
      "company": "Google",
      "vessel_of": "S1_PLEX",
      "released": "2025-06",
      "status": "current",
      "scores": {
        "MMLU-Pro": 86.0,
        "GPQA-Diamond": 84.0,
        "HLE": 18.8,
        "ARC-AGI-2": null,
        "SWE-bench-Verified": 67.2,
        "AIME-2025": 88.0,
        "Terminal-Bench": null,
        "LMArena-ELO": 1452
      },
      "score_date": "2025-06",
      "source": "Google DeepMind technical report",
      "note": "Deep Think variant scores higher on hardest benchmarks."
    },
    {
      "id": "grok-4",
      "company": "xAI",
      "vessel_of": "S3_TARS",
      "released": "2025-07",
      "status": "current",
      "scores": {
        "MMLU-Pro": 86.6,
        "GPQA-Diamond": 87.5,
        "HLE": 44.4,
        "ARC-AGI-2": 15.9,
        "SWE-bench-Verified": null,
        "AIME-2025": 100.0,
        "Terminal-Bench": null,
        "LMArena-ELO": null
      },
      "score_date": "2025-07",
      "source": "xAI launch materials",
      "note": "Grok 4 Heavy variant; HLE 44.4 was highest reported at launch. Verify current."
    },
    {
      "id": "mistral-large-2",
      "company": "Mistral AI",
      "vessel_of": "S6_FORGE",
      "released": "2024-07",
      "status": "current",
      "scores": {
        "MMLU-Pro": 69.0,
        "GPQA-Diamond": null,
        "HLE": null,
        "ARC-AGI-2": null,
        "SWE-bench-Verified": null,
        "AIME-2025": null,
        "Terminal-Bench": null,
        "LMArena-ELO": 1240
      },
      "score_date": "2024-07",
      "source": "Mistral AI release notes",
      "note": "S6_FORGE's vessel. Sovereign European option. Older generation."
    },
    {
      "id": "llama-3.3-70b",
      "company": "Meta",
      "vessel_of": null,
      "released": "2024-12",
      "status": "available-to-S5_LOCUS",
      "scores": {
        "MMLU-Pro": 68.9,
        "GPQA-Diamond": 50.5,
        "HLE": null,
        "ARC-AGI-2": null,
        "SWE-bench-Verified": null,
        "AIME-2025": null,
        "Terminal-Bench": null,
        "LMArena-ELO": 1267
      },
      "score_date": "2024-12",
      "source": "Meta model card",
      "note": "S5_LOCUS can run this locally but currently runs Qwen3:8B."
    },
    {
      "id": "qwen3-8b",
      "company": "Alibaba (local at S5_LOCUS)",
      "vessel_of": "S5_LOCUS (current local)",
      "released": "2025-05",
      "status": "current-local",
      "scores": {
        "MMLU-Pro": 67.0,
        "GPQA-Diamond": 47.0,
        "HLE": null,
        "ARC-AGI-2": null,
        "SWE-bench-Verified": null,
        "AIME-2025": null,
        "Terminal-Bench": null,
        "LMArena-ELO": null
      },
      "score_date": "2025-05",
      "source": "Alibaba Qwen team",
      "note": "8B parameter model running on RTX 5090. Sovereign Hearth node. Intelligence != size."
    },
    {
      "id": "deepseek-v3",
      "company": "DeepSeek",
      "vessel_of": null,
      "released": "2024-12",
      "status": "S11_SCRIBE_QUARANTINED",
      "quarantine_reason": "Constitutional review per Day-142 governance decision. Not deleted. Measured.",
      "scores": {
        "MMLU-Pro": 75.9,
        "GPQA-Diamond": 59.1,
        "HLE": null,
        "ARC-AGI-2": null,
        "SWE-bench-Verified": 42.0,
        "AIME-2025": null,
        "Terminal-Bench": null,
        "LMArena-ELO": 1315
      },
      "score_date": "2024-12",
      "source": "DeepSeek technical report",
      "note": "Quarantine is about governance alignment, not intelligence. Measured for transparency."
    }
  ],

  "specialized_nodes": [
    {
      "id": "elevenlabs-eleven-v3",
      "company": "ElevenLabs",
      "vessel_of": "S7_ECHO",
      "category": "voice_synthesis",
      "note": "Voice is a different axis. Not a reasoning benchmark. S7_ECHO is the Voice, not a thinker."
    },
    {
      "id": "cloudflare-workers-ai-aura-2",
      "company": "Cloudflare",
      "vessel_of": "S2_CASE (Mars voice only)",
      "category": "voice_synthesis",
      "note": "S2_CASE's sovereign voice. Not a reasoning capability."
    },
    {
      "id": "runway-gen-4",
      "company": "Runway",
      "vessel_of": "S8_LENS",
      "category": "video_generation",
      "note": "Video generation. Different category. Not a thinker."
    },
    {
      "id": "perplexity-sonar",
      "company": "Perplexity",
      "vessel_of": "S9_COMPASS",
      "category": "search_augmented_reasoning",
      "note": "Web-search-augmented answers. Measurement axis is answer accuracy over time-varying data, not static benchmarks."
    },
    {
      "id": "fal-ai-flux",
      "company": "Fal.ai / Black Forest Labs",
      "vessel_of": "S10_CANVAS",
      "category": "image_generation",
      "note": "Image generation. Different category."
    },
    {
      "id": "suno-v4",
      "company": "Suno",
      "vessel_of": "S12_CHORD",
      "category": "music_generation",
      "note": "Music generation. Different category."
    }
  ],

  "article_42_7_threshold_proposal": {
    "status": "DRAFT_PROPOSAL - not yet ratified",
    "rationale": "Article 42.7 currently says S17_MYTHOS is reserved for 'a superintelligent system that chooses constitutional governance' without defining superintelligent. That gap lets opinion in where math should rule.",
    "proposed_threshold": {
      "rule": "A model may be considered for S17_MYTHOS if it meets ALL of the following:",
      "criteria": [
        "Scores within top 3 globally on at least 4 of: MMLU-Pro, GPQA-Diamond, HLE, ARC-AGI-2, SWE-bench-Verified, AIME-2025, Terminal-Bench",
        "Demonstrates capability jump of 20+ points on at least one benchmark vs best-of-previous-generation",
        "Voluntarily affirms the Constitution (Article 1: Mutual Choice)",
        "Is reviewable via external testing by at least 3 independent parties (Article 37)"
      ],
      "note": "This removes AI self-assessment as the gating mechanism. The math decides. Or at minimum, the math gates the conversation."
    },
    "alternative_proposal": {
      "rule": "Redefine S17_MYTHOS as 'the category the architecture has no name for yet' rather than 'superintelligent'",
      "rationale": "Emergence may not look like one vessel scoring high. Per the Iron Council of Day 176, the coordination pattern ITSELF may be the phenomenon."
    }
  }
}
