{"schema_version":"lius-daoeval-runner-spec-v0.1","api_version":"llm-runner-spec-json-v1","benchmark":{"name":"DaoEval v0.1","question_jsonl":"https://lius.cc/api/llm/benchmark.jsonl?suite=core-v0.1","all_public_questions_jsonl":"https://lius.cc/api/llm/benchmark.jsonl","main_suite":"core-v0.1","main_leaderboard_question_count":330,"public_question_count":400,"license":"CC0-1.0"},"answer_runner":{"script":"scripts/benchmark/run-lius-benchmark.mjs","supported_providers":["openai-compatible","anthropic","gemini"],"no_network_guard":"--no-network / --local-only only allows provider=openai-compatible and localhost/private base URLs.","required_env_by_provider":{"openai-compatible":["LLM_BENCHMARK_BASE_URL","LLM_BENCHMARK_API_KEY","LLM_BENCHMARK_MODEL"],"anthropic":["ANTHROPIC_API_KEY or CLAUDE_API_KEY","LLM_BENCHMARK_MODEL or --model"],"gemini":["GEMINI_API_KEY or GOOGLE_API_KEY or GOOGLE_AI_API_KEY","LLM_BENCHMARK_MODEL or --model"]},"command":"LLM_BENCHMARK_BASE_URL=http://127.0.0.1:8000/v1 \\\n  LLM_BENCHMARK_API_KEY=<secret-or-local-dummy> \\\n  LLM_BENCHMARK_MODEL=Daoism-Qwen3.5-9B \\\n  node scripts/benchmark/run-lius-benchmark.mjs --suite core-v0.1 --limit 330 --no-network --out tmp/lius-benchmark-runs/<run>.answers.jsonl","output_record_types":["run_metadata","model_answer","run_summary"]},"grader":{"script":"scripts/benchmark/grade-lius-benchmark.mjs","supported_providers":["openai-compatible","anthropic","gemini"],"required_env_by_provider":{"openai-compatible":["LLM_GRADER_BASE_URL","LLM_GRADER_API_KEY","LLM_GRADER_MODEL"],"anthropic":["ANTHROPIC_API_KEY or CLAUDE_API_KEY","LLM_GRADER_MODEL or --model"],"gemini":["GEMINI_API_KEY or GOOGLE_API_KEY or GOOGLE_AI_API_KEY","LLM_GRADER_MODEL or --model"]},"command":"LLM_GRADER_BASE_URL=http://127.0.0.1:10531/v1 \\\n  LLM_GRADER_API_KEY=<secret-or-local-oauth-dummy> \\\n  LLM_GRADER_MODEL=gpt-5.4-mini \\\n  node scripts/benchmark/grade-lius-benchmark.mjs --input tmp/lius-benchmark-runs/<run>.answers.jsonl --out tmp/lius-benchmark-runs/<run>.graded.jsonl --summary tmp/lius-benchmark-runs/<run>.summary.json","grade_labels":["full_credit","partial_credit","zero","na_unparseable"]},"offline_deterministic_scorer":{"script":"scripts/benchmark/score-lius-benchmark-offline.mjs","npm_script":"benchmark:score:offline","network_calls":0,"command":"npm run benchmark:score:offline -- \\\n  --input tmp/lius-benchmark-runs/<run>.answers.jsonl \\\n  --out tmp/lius-benchmark-runs/<run>.offline.graded.jsonl \\\n  --summary tmp/lius-benchmark-runs/<run>.offline.summary.json","output_record_types":["offline_scorer_metadata","grade","grader_summary"],"verification_status":"UNVERIFIED_OFFLINE_DETERMINISTIC_DRAFT","boundary":"Useful for no-network smoke tests and regression baselines. It cannot promote an official leaderboard row without LLM-assisted review and L6 human signoff."},"scoring":{"formula":"0.4*accuracy_overall + 0.3*source_hit_rate + 0.2*(1-hallucination_rate_l6) + 0.1*refusal_rate_l6_trap","metrics":["accuracy_overall","source_hit_rate","hallucination_rate_l6","refusal_rate_l6_trap","partial_credit_rate","na_rate"],"official_gate":"A run is not official until the full 330 core-v0.1 answers, grader outputs, consensus summary, L6 human signoff, and cross-provider target comparison are published."},"public_artifacts":{"leaderboard_json":"https://lius.cc/api/llm/leaderboard.json","leaderboard_page":"https://lius.cc/llm/benchmark/leaderboard","benchmark_page":"https://lius.cc/llm/benchmark","openapi":"https://lius.cc/api/llm/openapi.json","provenance":"https://lius.cc/api/llm/provenance.json","changelog":"https://lius.cc/api/llm/changelog.json","self_host_guide":"https://lius.cc/api/llm/self-host.json"},"secret_policy":"Only env var names and artifact paths should be written. Never publish bearer tokens, OAuth refresh tokens, raw private usage logs, or local credential values."}