2c:["$","div",null,{"className":"space-y-4 max-w-4xl mx-auto","children":[["$","div",null,{"className":"text-center","children":[["$","h2",null,{"className":"text-2xl font-bold","children":"Performance Metrics"}],["$","p",null,{"className":"text-muted-foreground","children":"Context window and performance specifications"}]]}],["$","$L31",null,{"modelA":{"modelId":"gpt-5-2025-08-07","name":"GPT-5","organizationId":"openai","fineTunedFromModelId":null,"description":"GPT-5 is our flagship model for coding, reasoning, and agentic tasks across domains. The best model for coding and agentic tasks with higher reasoning capabilities and medium speed.","releaseDate":"2025-08-07","announcementDate":"2025-08-07","licenseId":"proprietary","multimodal":true,"knowledgeCutoff":"2024-09-30","paramCount":null,"trainingTokens":null,"contextWindow":null,"availableInZeroeval":true,"sourceApiRef":"https://platform.openai.com/docs/models/gpt-5","sourcePlayground":"https://platform.openai.com/playground?mode=chat&model=gpt-5","sourcePaper":"https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf","sourceScorecardBlogLink":"https://openai.com/index/gpt-5/","sourceRepoLink":null,"sourceWeightsLink":null,"modelFamilyId":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","organization":{"organizationId":"openai","name":"OpenAI","website":"https://openai.com","description":"Leading AI research company","country":"US","manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:49:05.815Z","updatedAt":"$D2025-07-19T19:49:05.815Z"},"license":{"licenseId":"proprietary","name":"Proprietary","allowCommercial":false,"description":"Proprietary license - usage restrictions apply","createdAt":"$D2025-07-19T19:49:05.425Z","updatedAt":"$D2025-07-19T19:49:05.425Z"},"benchmarks":[{"modelBenchmarkId":50862176,"benchmarkId":"swe-bench-verified","modelId":"gpt-5-2025-08-07","score":0.749,"normalizedScore":0.749,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"Thinking mode enabled (up to 128K tokens) with enhanced reasoning capabilities and iterative problem-solving approach.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"swe-bench-verified","name":"SWE-Bench Verified","category":null}},{"modelBenchmarkId":11240600,"benchmarkId":"aider-polyglot","modelId":"gpt-5-2025-08-07","score":0.88,"normalizedScore":0.88,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"Thinking mode enabled (up to 128K tokens) with step-by-step reasoning and multi-language code understanding.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"aider-polyglot","name":"Aider-Polyglot","category":null}},{"modelBenchmarkId":90814228,"benchmarkId":"swe-lancer-(ic-diamond-subset)","modelId":"gpt-5-2025-08-07","score":1,"normalizedScore":1,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 - IC SWE Diamond Freelance Coding Tasks (earnings-based evaluation).","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"swe-lancer-(ic-diamond-subset)","name":"SWE-Lancer (IC-Diamond subset)","category":null}},{"modelBenchmarkId":96102057,"benchmarkId":"aime-2025","modelId":"gpt-5-2025-08-07","score":0.946,"normalizedScore":0.946,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 standard with thinking mode enabled (no tools) - competition mathematics.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"aime-2025","name":"AIME 2025","category":null}},{"modelBenchmarkId":35254115,"benchmarkId":"mmmu","modelId":"gpt-5-2025-08-07","score":0.842,"normalizedScore":0.842,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 with thinking mode - College-level visual problem-solving with multimodal reasoning.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"mmmu","name":"MMMU","category":null}},{"modelBenchmarkId":7451493,"benchmarkId":"mmlu","modelId":"gpt-5-2025-08-07","score":0.925,"normalizedScore":0.925,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"Standard benchmark across multiple academic subjects with comprehensive knowledge evaluation.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"mmlu","name":"MMLU","category":null}},{"modelBenchmarkId":538402,"benchmarkId":"humaneval","modelId":"gpt-5-2025-08-07","score":0.934,"normalizedScore":0.934,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"Code generation benchmark with function completion tasks in Python.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"humaneval","name":"HumanEval","category":null}},{"modelBenchmarkId":46129198,"benchmarkId":"math","modelId":"gpt-5-2025-08-07","score":0.847,"normalizedScore":0.847,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"Thinking mode enabled with step-by-step mathematical problem solving and verification.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"math","name":"MATH","category":null}},{"modelBenchmarkId":56599430,"benchmarkId":"healthbench-hard","modelId":"gpt-5-2025-08-07","score":0.016,"normalizedScore":0.016,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"Thinking mode enabled for medical hallucination detection. Measured inaccuracies on challenging healthcare conversations.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"healthbench-hard","name":"HealthBench Hard","category":null}},{"modelBenchmarkId":66622934,"benchmarkId":"frontiermath","modelId":"gpt-5-2025-08-07","score":0.263,"normalizedScore":0.263,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 standard with thinking mode enabled (with python tool only) - FrontierMath Tier 1-3 expert-level mathematics.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"frontiermath","name":"FrontierMath","category":null}},{"modelBenchmarkId":48186467,"benchmarkId":"hmmt-2025","modelId":"gpt-5-2025-08-07","score":0.933,"normalizedScore":0.933,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 standard with thinking mode enabled (no tools) - Harvard-MIT Mathematics Tournament.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"hmmt-2025","name":"HMMT 2025","category":null}},{"modelBenchmarkId":71382798,"benchmarkId":"gpqa","modelId":"gpt-5-2025-08-07","score":0.857,"normalizedScore":0.857,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 - Diamond thinking no tools","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"gpqa","name":"GPQA","category":null}},{"modelBenchmarkId":22811983,"benchmarkId":"humanity's-last-exam","modelId":"gpt-5-2025-08-07","score":0.248,"normalizedScore":0.248,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 standard with thinking mode (no tools) - Full set of expert-level questions across subjects.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"humanity's-last-exam","name":"Humanity's Last Exam","category":null}},{"modelBenchmarkId":8388165,"benchmarkId":"scale-multichallenge","modelId":"gpt-5-2025-08-07","score":0.696,"normalizedScore":0.696,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 with thinking mode enabled - Multi-turn instruction following benchmark.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"scale-multichallenge","name":"Scale MultiChallenge","category":null}},{"modelBenchmarkId":34038420,"benchmarkId":"browsecomp","modelId":"gpt-5-2025-08-07","score":0.549,"normalizedScore":0.549,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 with thinking mode enabled - Agentic search & browsing benchmark.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"browsecomp","name":"BrowseComp","category":null}},{"modelBenchmarkId":76992000,"benchmarkId":"collie","modelId":"gpt-5-2025-08-07","score":0.99,"normalizedScore":0.99,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 with thinking mode enabled - Instruction-following in freeform writing.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"collie","name":"COLLIE","category":null}},{"modelBenchmarkId":34288913,"benchmarkId":"multichallenge-(o3-mini-grader)","modelId":"gpt-5-2025-08-07","score":0.696,"normalizedScore":0.696,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 with o3-mini grader - Multi-turn instruction following benchmark with improved grading accuracy.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"multichallenge-(o3-mini-grader)","name":"MultiChallenge (o3-mini grader)","category":null}},{"modelBenchmarkId":10973566,"benchmarkId":"internal-api-instruction-following-(hard)","modelId":"gpt-5-2025-08-07","score":0.64,"normalizedScore":0.64,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 - Internal API instruction following evaluation (hard difficulty).","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"internal-api-instruction-following-(hard)","name":"Internal API instruction following (hard)","category":null}},{"modelBenchmarkId":94199452,"benchmarkId":"tau2-airline","modelId":"gpt-5-2025-08-07","score":0.626,"normalizedScore":0.626,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 - Function calling benchmark (airline domain).","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"tau2-airline","name":"Tau2 Airline","category":null}},{"modelBenchmarkId":83575151,"benchmarkId":"tau2-retail","modelId":"gpt-5-2025-08-07","score":0.811,"normalizedScore":0.811,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 with thinking mode - Function calling benchmark (retail domain).","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"tau2-retail","name":"Tau2 Retail","category":null}},{"modelBenchmarkId":82222714,"benchmarkId":"tau2-telecom","modelId":"gpt-5-2025-08-07","score":0.967,"normalizedScore":0.967,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 with thinking mode - Function calling benchmark (telecom domain).","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"tau2-telecom","name":"Tau2 Telecom","category":null}},{"modelBenchmarkId":9853578,"benchmarkId":"mmmu-pro","modelId":"gpt-5-2025-08-07","score":0.784,"normalizedScore":0.784,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 with thinking mode - Graduate-level visual problem-solving with advanced multimodal reasoning.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"mmmu-pro","name":"MMMU-Pro","category":null}},{"modelBenchmarkId":14225240,"benchmarkId":"videommmu","modelId":"gpt-5-2025-08-07","score":0.846,"normalizedScore":0.846,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 with thinking mode - Video-based multimodal reasoning (max frame 256).","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"videommmu","name":"VideoMMMU","category":null}},{"modelBenchmarkId":48864439,"benchmarkId":"charxiv-r","modelId":"gpt-5-2025-08-07","score":0.811,"normalizedScore":0.811,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 with thinking mode - Scientific figure reasoning and interpretation.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"charxiv-r","name":"CharXiv-R","category":null}},{"modelBenchmarkId":11009110,"benchmarkId":"erqa","modelId":"gpt-5-2025-08-07","score":0.657,"normalizedScore":0.657,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/gpt-5/","verifiedByLlmstats":false,"analysisMethod":"GPT-5 with thinking mode - Multimodal spatial reasoning.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"erqa","name":"ERQA","category":null}},{"modelBenchmarkId":98917038,"benchmarkId":"openai-mrcr:-2-needle-128k","modelId":"gpt-5-2025-08-07","score":0.952,"normalizedScore":0.952,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"OpenAI-MRCR 2-needle retrieval at 128k tokens.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"openai-mrcr:-2-needle-128k","name":"OpenAI-MRCR: 2 needle 128k","category":null}},{"modelBenchmarkId":97706061,"benchmarkId":"openai-mrcr:-2-needle-256k","modelId":"gpt-5-2025-08-07","score":0.868,"normalizedScore":0.868,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"OpenAI-MRCR 2-needle retrieval at 256k tokens.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"openai-mrcr:-2-needle-256k","name":"OpenAI-MRCR: 2 needle 256k","category":null}},{"modelBenchmarkId":49672061,"benchmarkId":"graphwalks-bfs-<128k","modelId":"gpt-5-2025-08-07","score":0.783,"normalizedScore":0.783,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"Graphwalks BFS (<128k) long-context reasoning.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"graphwalks-bfs-<128k","name":"Graphwalks BFS <128k","category":null}},{"modelBenchmarkId":45627664,"benchmarkId":"graphwalks-parents-<128k","modelId":"gpt-5-2025-08-07","score":0.733,"normalizedScore":0.733,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"Graphwalks parents (<128k) long-context reasoning.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"graphwalks-parents-<128k","name":"Graphwalks parents <128k","category":null}},{"modelBenchmarkId":80790741,"benchmarkId":"browsecomp-long-128k","modelId":"gpt-5-2025-08-07","score":0.9,"normalizedScore":0.9,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"BrowseComp long-context 128k variant.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"browsecomp-long-128k","name":"BrowseComp Long Context 128k","category":null}},{"modelBenchmarkId":55460500,"benchmarkId":"browsecomp-long-256k","modelId":"gpt-5-2025-08-07","score":0.888,"normalizedScore":0.888,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"BrowseComp long-context 256k variant.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"browsecomp-long-256k","name":"BrowseComp Long Context 256k","category":null}},{"modelBenchmarkId":44621443,"benchmarkId":"videomme-w-sub.","modelId":"gpt-5-2025-08-07","score":0.867,"normalizedScore":0.867,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"VideoMME (long) with subtitles category.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"videomme-w-sub.","name":"VideoMME w sub.","category":null}},{"modelBenchmarkId":91350921,"benchmarkId":"longfact-concepts","modelId":"gpt-5-2025-08-07","score":0.007,"normalizedScore":0.007,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"Thinking mode enabled for hallucination detection. Measured on open-source prompts for concept-based factual queries.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"longfact-concepts","name":"LongFact Concepts","category":null}},{"modelBenchmarkId":87515733,"benchmarkId":"longfact-objects","modelId":"gpt-5-2025-08-07","score":0.008,"normalizedScore":0.008,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"Thinking mode enabled for hallucination detection. Measured on open-source prompts for object-based factual queries.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"longfact-objects","name":"LongFact Objects","category":null}},{"modelBenchmarkId":50130775,"benchmarkId":"factscore","modelId":"gpt-5-2025-08-07","score":0.01,"normalizedScore":0.01,"isSelfReported":true,"selfReportedSourceLink":"https://openai.com/index/introducing-gpt-5-for-developers/","verifiedByLlmstats":false,"analysisMethod":"Thinking mode enabled for factual accuracy assessment. Measured hallucination rate on open-source prompts.","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","benchmark":{"benchmarkId":"factscore","name":"FActScore","category":null}}],"providers":[{"modelProviderId":85833504,"modelId":"gpt-5-2025-08-07","providerId":"openai","providerModelIdUsed":"gpt-5","deprecatedAt":null,"inputCentsPerMillionTokens":125,"outputCentsPerMillionTokens":1000,"quantization":null,"maxInputTokens":400000,"maxOutputTokens":128000,"throughput":100,"latency":2,"featureWebSearch":true,"featureFunctionCalling":true,"featureStructuredOutput":true,"featureCodeExecution":true,"featureBatchInference":true,"featureFinetuning":true,"inputModalityText":true,"inputModalityImage":true,"inputModalityAudio":false,"inputModalityVideo":false,"outputModalityText":true,"outputModalityImage":false,"outputModalityAudio":false,"outputModalityVideo":false,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","modelName":"GPT-5","organizationId":"openai","provider":{"providerId":"openai","name":"OpenAI","website":"https://openai.com","createdAt":"$D2025-07-19T19:49:17.121Z","updatedAt":"$D2025-07-19T19:49:17.121Z"}},{"modelProviderId":71367092,"modelId":"gpt-5-2025-08-07","providerId":"zeroeval","providerModelIdUsed":"gpt-5","deprecatedAt":null,"inputCentsPerMillionTokens":125,"outputCentsPerMillionTokens":1000,"quantization":null,"maxInputTokens":400000,"maxOutputTokens":128000,"throughput":100,"latency":2,"featureWebSearch":true,"featureFunctionCalling":true,"featureStructuredOutput":true,"featureCodeExecution":true,"featureBatchInference":true,"featureFinetuning":true,"inputModalityText":true,"inputModalityImage":true,"inputModalityAudio":false,"inputModalityVideo":false,"outputModalityText":true,"outputModalityImage":false,"outputModalityAudio":false,"outputModalityVideo":false,"createdAt":"$D2025-07-24T12:00:00.000Z","updatedAt":"$D2025-07-24T12:00:00.000Z","modelName":"GPT-5","organizationId":"openai","provider":{"providerId":"zeroeval","name":"ZeroEval","website":"https://zeroeval.com","createdAt":"$D2025-07-15T06:36:02.543Z","updatedAt":"$D2025-07-15T06:36:02.543Z"}}]},"modelB":{"modelId":"llama-3.1-nemotron-70b-instruct","name":"Llama 3.1 Nemotron 70B Instruct","organizationId":"nvidia","fineTunedFromModelId":"llama-3.1-70b-instruct","description":"A large language model customized by NVIDIA to improve the helpfulness of LLM generated responses. It is a fine-tuned version of Llama 3.1 70B Instruct. The model was trained using RLHF (REINFORCE) with HelpSteer2-Preference prompts.","releaseDate":"2024-10-01","announcementDate":"2024-10-01","licenseId":"llama_3_1_community_license","multimodal":false,"knowledgeCutoff":"2023-12-01","paramCount":"$n70000000000","trainingTokens":null,"contextWindow":null,"availableInZeroeval":true,"sourceApiRef":"https://build.nvidia.com/nvidia/llama-3_1-nemotron-70b-instruct","sourcePlayground":null,"sourcePaper":"https://arxiv.org/abs/2410.01257","sourceScorecardBlogLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","sourceRepoLink":null,"sourceWeightsLink":"https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct","modelFamilyId":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:49:05.908Z","updatedAt":"$D2025-10-03T17:22:49.553Z","organization":{"organizationId":"nvidia","name":"NVIDIA","website":"https://nvidia.com","description":"GPU and AI company","country":"US","manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:49:05.728Z","updatedAt":"$D2025-07-19T19:49:05.728Z"},"license":{"licenseId":"llama_3_1_community_license","name":"Llama 3.1 Community License","allowCommercial":false,"description":"Llama 3.1 Community License license","createdAt":"$D2025-07-19T19:49:05.574Z","updatedAt":"$D2025-07-19T19:49:05.574Z"},"benchmarks":[{"modelBenchmarkId":43218499,"benchmarkId":"arc-c","modelId":"llama-3.1-nemotron-70b-instruct","score":0.692,"normalizedScore":0.692,"isSelfReported":true,"selfReportedSourceLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","verifiedByLlmstats":false,"analysisMethod":"Standard evaluation","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:56:11.133Z","updatedAt":"$D2025-07-19T19:56:11.133Z","benchmark":{"benchmarkId":"arc-c","name":"ARC-C","category":null}},{"modelBenchmarkId":79448422,"benchmarkId":"gsm8k","modelId":"llama-3.1-nemotron-70b-instruct","score":0.9143,"normalizedScore":0.9143,"isSelfReported":true,"selfReportedSourceLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","verifiedByLlmstats":false,"analysisMethod":"Standard evaluation","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:56:13.099Z","updatedAt":"$D2025-07-19T19:56:13.099Z","benchmark":{"benchmarkId":"gsm8k","name":"GSM8k","category":null}},{"modelBenchmarkId":847742,"benchmarkId":"gsm8k-chat","modelId":"llama-3.1-nemotron-70b-instruct","score":0.8188,"normalizedScore":0.8188,"isSelfReported":true,"selfReportedSourceLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","verifiedByLlmstats":false,"analysisMethod":"Chat evaluation","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:56:15.104Z","updatedAt":"$D2025-07-19T19:56:15.104Z","benchmark":{"benchmarkId":"gsm8k-chat","name":"GSM8K Chat","category":null}},{"modelBenchmarkId":71450147,"benchmarkId":"hellaswag","modelId":"llama-3.1-nemotron-70b-instruct","score":0.8558,"normalizedScore":0.8558,"isSelfReported":true,"selfReportedSourceLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","verifiedByLlmstats":false,"analysisMethod":"Standard evaluation","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:56:11.188Z","updatedAt":"$D2025-07-19T19:56:11.188Z","benchmark":{"benchmarkId":"hellaswag","name":"HellaSwag","category":null}},{"modelBenchmarkId":26064337,"benchmarkId":"instruct-humaneval","modelId":"llama-3.1-nemotron-70b-instruct","score":0.7384,"normalizedScore":0.7384,"isSelfReported":true,"selfReportedSourceLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","verifiedByLlmstats":false,"analysisMethod":"Code evaluation (n=20)","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:56:15.108Z","updatedAt":"$D2025-07-19T19:56:15.108Z","benchmark":{"benchmarkId":"instruct-humaneval","name":"Instruct HumanEval","category":null}},{"modelBenchmarkId":21630144,"benchmarkId":"mmlu","modelId":"llama-3.1-nemotron-70b-instruct","score":0.802,"normalizedScore":0.802,"isSelfReported":true,"selfReportedSourceLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","verifiedByLlmstats":false,"analysisMethod":"Standard evaluation","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:56:11.292Z","updatedAt":"$D2025-07-19T19:56:11.292Z","benchmark":{"benchmarkId":"mmlu","name":"MMLU","category":null}},{"modelBenchmarkId":43719922,"benchmarkId":"mmlu-chat","modelId":"llama-3.1-nemotron-70b-instruct","score":0.8058,"normalizedScore":0.8058,"isSelfReported":true,"selfReportedSourceLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","verifiedByLlmstats":false,"analysisMethod":"Chat evaluation","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:56:15.100Z","updatedAt":"$D2025-07-19T19:56:15.100Z","benchmark":{"benchmarkId":"mmlu-chat","name":"MMLU Chat","category":null}},{"modelBenchmarkId":74516605,"benchmarkId":"mt-bench","modelId":"llama-3.1-nemotron-70b-instruct","score":0.0899,"normalizedScore":0.0899,"isSelfReported":true,"selfReportedSourceLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","verifiedByLlmstats":false,"analysisMethod":"Chat evaluation","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:56:14.532Z","updatedAt":"$D2025-07-19T19:56:14.532Z","benchmark":{"benchmarkId":"mt-bench","name":"MT-Bench","category":null}},{"modelBenchmarkId":44931101,"benchmarkId":"truthfulqa","modelId":"llama-3.1-nemotron-70b-instruct","score":0.5863,"normalizedScore":0.5863,"isSelfReported":true,"selfReportedSourceLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","verifiedByLlmstats":false,"analysisMethod":"Standard evaluation","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:56:11.363Z","updatedAt":"$D2025-07-19T19:56:11.363Z","benchmark":{"benchmarkId":"truthfulqa","name":"TruthfulQA","category":null}},{"modelBenchmarkId":16921630,"benchmarkId":"winogrande","modelId":"llama-3.1-nemotron-70b-instruct","score":0.8453,"normalizedScore":0.8453,"isSelfReported":true,"selfReportedSourceLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","verifiedByLlmstats":false,"analysisMethod":"Standard evaluation","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:56:11.390Z","updatedAt":"$D2025-07-19T19:56:11.390Z","benchmark":{"benchmarkId":"winogrande","name":"Winogrande","category":null}},{"modelBenchmarkId":38184373,"benchmarkId":"xlsum-english","modelId":"llama-3.1-nemotron-70b-instruct","score":0.3161,"normalizedScore":0.3161,"isSelfReported":true,"selfReportedSourceLink":"https://developer.nvidia.com/blog/advancing-the-accuracy-efficiency-frontier-with-llama-3-1-nemotron-51b/","verifiedByLlmstats":false,"analysisMethod":"Standard evaluation","verificationProviderId":null,"verificationHardware":null,"verificationDate":null,"verificationNotes":null,"manualEdits":null,"editedBy":null,"editedAt":null,"createdAt":"$D2025-07-19T19:56:15.094Z","updatedAt":"$D2025-07-19T19:56:15.094Z","benchmark":{"benchmarkId":"xlsum-english","name":"XLSum English","category":null}}],"providers":[]},"commonBenchmarks":[{"benchmarkId":"swe-bench-verified","name":"SWE-Bench Verified","category":null,"modelA":0.749,"modelB":0},{"benchmarkId":"aider-polyglot","name":"Aider-Polyglot","category":null,"modelA":0.88,"modelB":0},{"benchmarkId":"swe-lancer-(ic-diamond-subset)","name":"SWE-Lancer (IC-Diamond subset)","category":null,"modelA":1,"modelB":0},{"benchmarkId":"aime-2025","name":"AIME 2025","category":null,"modelA":0.946,"modelB":0},{"benchmarkId":"mmmu","name":"MMMU","category":null,"modelA":0.842,"modelB":0},{"benchmarkId":"mmlu","name":"MMLU","category":null,"modelA":0.925,"modelB":0.802},{"benchmarkId":"humaneval","name":"HumanEval","category":null,"modelA":0.934,"modelB":0},{"benchmarkId":"math","name":"MATH","category":null,"modelA":0.847,"modelB":0},{"benchmarkId":"healthbench-hard","name":"HealthBench Hard","category":null,"modelA":0.016,"modelB":0},{"benchmarkId":"frontiermath","name":"FrontierMath","category":null,"modelA":0.263,"modelB":0},{"benchmarkId":"hmmt-2025","name":"HMMT 2025","category":null,"modelA":0.933,"modelB":0},{"benchmarkId":"gpqa","name":"GPQA","category":null,"modelA":0.857,"modelB":0},{"benchmarkId":"humanity's-last-exam","name":"Humanity's Last Exam","category":null,"modelA":0.248,"modelB":0},{"benchmarkId":"scale-multichallenge","name":"Scale MultiChallenge","category":null,"modelA":0.696,"modelB":0},{"benchmarkId":"browsecomp","name":"BrowseComp","category":null,"modelA":0.549,"modelB":0},{"benchmarkId":"collie","name":"COLLIE","category":null,"modelA":0.99,"modelB":0},{"benchmarkId":"multichallenge-(o3-mini-grader)","name":"MultiChallenge (o3-mini grader)","category":null,"modelA":0.696,"modelB":0},{"benchmarkId":"internal-api-instruction-following-(hard)","name":"Internal API instruction following (hard)","category":null,"modelA":0.64,"modelB":0},{"benchmarkId":"tau2-airline","name":"Tau2 Airline","category":null,"modelA":0.626,"modelB":0},{"benchmarkId":"tau2-retail","name":"Tau2 Retail","category":null,"modelA":0.811,"modelB":0},{"benchmarkId":"tau2-telecom","name":"Tau2 Telecom","category":null,"modelA":0.967,"modelB":0},{"benchmarkId":"mmmu-pro","name":"MMMU-Pro","category":null,"modelA":0.784,"modelB":0},{"benchmarkId":"videommmu","name":"VideoMMMU","category":null,"modelA":0.846,"modelB":0},{"benchmarkId":"charxiv-r","name":"CharXiv-R","category":null,"modelA":0.811,"modelB":0},{"benchmarkId":"erqa","name":"ERQA","category":null,"modelA":0.657,"modelB":0},{"benchmarkId":"openai-mrcr:-2-needle-128k","name":"OpenAI-MRCR: 2 needle 128k","category":null,"modelA":0.952,"modelB":0},{"benchmarkId":"openai-mrcr:-2-needle-256k","name":"OpenAI-MRCR: 2 needle 256k","category":null,"modelA":0.868,"modelB":0},{"benchmarkId":"graphwalks-bfs-<128k","name":"Graphwalks BFS <128k","category":null,"modelA":0.783,"modelB":0},{"benchmarkId":"graphwalks-parents-<128k","name":"Graphwalks parents <128k","category":null,"modelA":0.733,"modelB":0},{"benchmarkId":"browsecomp-long-128k","name":"BrowseComp Long Context 128k","category":null,"modelA":0.9,"modelB":0},{"benchmarkId":"browsecomp-long-256k","name":"BrowseComp Long Context 256k","category":null,"modelA":0.888,"modelB":0},{"benchmarkId":"videomme-w-sub.","name":"VideoMME w sub.","category":null,"modelA":0.867,"modelB":0},{"benchmarkId":"longfact-concepts","name":"LongFact Concepts","category":null,"modelA":0.007,"modelB":0},{"benchmarkId":"longfact-objects","name":"LongFact Objects","category":null,"modelA":0.008,"modelB":0},{"benchmarkId":"factscore","name":"FActScore","category":null,"modelA":0.01,"modelB":0},{"benchmarkId":"arc-c","name":"ARC-C","category":null,"modelA":0,"modelB":0.692},{"benchmarkId":"gsm8k","name":"GSM8k","category":null,"modelA":0,"modelB":0.9143},{"benchmarkId":"gsm8k-chat","name":"GSM8K Chat","category":null,"modelA":0,"modelB":0.8188},{"benchmarkId":"hellaswag","name":"HellaSwag","category":null,"modelA":0,"modelB":0.8558},{"benchmarkId":"instruct-humaneval","name":"Instruct HumanEval","category":null,"modelA":0,"modelB":0.7384},{"benchmarkId":"mmlu-chat","name":"MMLU Chat","category":null,"modelA":0,"modelB":0.8058},{"benchmarkId":"mt-bench","name":"MT-Bench","category":null,"modelA":0,"modelB":0.0899},{"benchmarkId":"truthfulqa","name":"TruthfulQA","category":null,"modelA":0,"modelB":0.5863},{"benchmarkId":"winogrande","name":"Winogrande","category":null,"modelA":0,"modelB":0.8453},{"benchmarkId":"xlsum-english","name":"XLSum English","category":null,"modelA":0,"modelB":0.3161}]}],"$L32"]}]

GPT-5 vs Llama 3.1 Nemotron 70B Instruct

Performance Metrics

Provider Availability & Performance