{"models":["GPT-5.4-mini","Claude Sonnet 4.6","GPT-5.5","Claude Opus 4.7","Claude Opus 4.8","Gemini 3.1 Pro","Gemini 3.1 Flash Lite","Kimi K2.5","Kimi K2.6","MiniMax M2.5","MiniMax M2.7","SWE-1.6"],"colors":{"GPT-5.4-mini":"#10a37f","Claude Sonnet 4.6":"#d97757","GPT-5.5":"#10a37f","Claude Opus 4.7":"#d97757","Claude Opus 4.8":"#d97757","Gemini 3.1 Pro":"#1a73e8","Gemini 3.1 Flash Lite":"#1a73e8","Kimi K2.5":"#7c3aed","Kimi K2.6":"#7c3aed","MiniMax M2.5":"#e5484d","MiniMax M2.7":"#e5484d","SWE-1.6":"#d4a017"},"lab_colors":{"OpenAI":"#10a37f","Anthropic":"#d97757","Google":"#1a73e8","Moonshot":"#7c3aed","MiniMax":"#e5484d","Cognition":"#d4a017"},"harness":{"GPT-5.4-mini":"codex","Claude Sonnet 4.6":"claude-code","GPT-5.5":"codex","Claude Opus 4.7":"claude-code","Claude Opus 4.8":"claude-code","Gemini 3.1 Pro":"gemini-cli","Gemini 3.1 Flash Lite":"gemini-cli","Kimi K2.5":"mini-swe-agent","Kimi K2.6":"mini-swe-agent","MiniMax M2.5":"mini-swe-agent","MiniMax M2.7":"mini-swe-agent","SWE-1.6":"devin"},"efforts":{"GPT-5.4-mini":["medium","high","xhigh"],"Claude Sonnet 4.6":["medium","high","xhigh","max"],"GPT-5.5":["low","medium","high","xhigh"],"Claude Opus 4.7":["low","medium","high","xhigh"],"Claude Opus 4.8":["low","medium","high","xhigh","max"],"Gemini 3.1 Pro":["low","high"],"Gemini 3.1 Flash Lite":["minimal","low","medium","high"],"Kimi K2.5":["none"],"Kimi K2.6":["none"],"MiniMax M2.5":["none"],"MiniMax M2.7":["none"],"SWE-1.6":["none"]},"subsets":{"diamond":50,"main":100,"extended":150},"data":{"GPT-5.4-mini":{"medium":{"diamond":{"correct":0.0449,"new_score":0.0414,"tokens":40514.2,"cost":1.0547,"duration_min":7.22,"tool_calls":107.05,"steps":131.25,"ote":234372.4},"main":{"correct":0.137,"new_score":0.1205,"tokens":34719.7,"cost":0.8851,"duration_min":6.5,"tool_calls":97.55,"steps":120.88,"ote":196680.3},"extended":{"correct":0.3219,"new_score":0.2889,"tokens":29705.9,"cost":0.7602,"duration_min":5.74,"tool_calls":87.37,"steps":109.57,"ote":168940.7}},"high":{"diamond":{"correct":0.044,"new_score":0.0403,"tokens":52697.7,"cost":1.2494,"duration_min":8.72,"tool_calls":121.42,"steps":149.8,"ote":277651.2},"main":{"correct":0.1395,"new_score":0.1245,"tokens":45131.4,"cost":1.0537,"duration_min":7.92,"tool_calls":108.64,"steps":135.7,"ote":234154.2},"extended":{"correct":0.3447,"new_score":0.3123,"tokens":38203.1,"cost":0.897,"duration_min":6.85,"tool_calls":96.37,"steps":121.75,"ote":199325.0}},"xhigh":{"diamond":{"correct":0.048,"new_score":0.0458,"tokens":103268.6,"cost":1.9717,"duration_min":16.04,"tool_calls":179.65,"steps":220.41,"ote":438158.1},"main":{"correct":0.2,"new_score":0.1782,"tokens":88550.1,"cost":1.6706,"duration_min":13.78,"tool_calls":159.08,"steps":197.35,"ote":371244.1},"extended":{"correct":0.3987,"new_score":0.3601,"tokens":73573.6,"cost":1.3899,"duration_min":11.5,"tool_calls":135.74,"steps":170.29,"ote":308870.8}}},"Claude Sonnet 4.6":{"medium":{"diamond":{"correct":0.04,"new_score":0.0341,"tokens":32671.7,"cost":2.4707,"duration_min":16.46,"tool_calls":68.45,"steps":113.19,"ote":148786.2},"main":{"correct":0.155,"new_score":0.1363,"tokens":28176.2,"cost":2.0873,"duration_min":15.14,"tool_calls":63.14,"steps":103.99,"ote":125473.6},"extended":{"correct":0.3563,"new_score":0.3222,"tokens":24003.4,"cost":1.775,"duration_min":12.7,"tool_calls":53.96,"steps":89.84,"ote":105461.4}},"high":{"diamond":{"correct":0.036,"new_score":0.0326,"tokens":39292.8,"cost":2.8607,"duration_min":19.74,"tool_calls":71.63,"steps":121.62,"ote":174191.9},"main":{"correct":0.1685,"new_score":0.1507,"tokens":34333.9,"cost":2.4053,"duration_min":17.6,"tool_calls":65.92,"steps":111.8,"ote":146169.5},"extended":{"correct":0.3683,"new_score":0.3356,"tokens":29847.4,"cost":2.04,"duration_min":15.79,"tool_calls":56.25,"steps":96.41,"ote":122638.1}},"xhigh":{"diamond":{"correct":0.04,"new_score":0.0351,"tokens":40075.3,"cost":2.9369,"duration_min":20.3,"tool_calls":75.08,"steps":126.25,"ote":181386.4},"main":{"correct":0.15,"new_score":0.1336,"tokens":34856.1,"cost":2.4828,"duration_min":19.37,"tool_calls":68.59,"steps":115.11,"ote":152399.3},"extended":{"correct":0.3627,"new_score":0.3287,"tokens":30190.9,"cost":2.1034,"duration_min":16.7,"tool_calls":58.28,"steps":99.42,"ote":127592.3}},"max":{"diamond":{"correct":0.032,"new_score":0.0274,"tokens":43700.9,"cost":3.0239,"duration_min":19.41,"tool_calls":74.47,"steps":127.34,"ote":186891.8},"main":{"correct":0.15,"new_score":0.1317,"tokens":38003.6,"cost":2.5742,"duration_min":17.9,"tool_calls":68.73,"steps":117.64,"ote":158362.6},"extended":{"correct":0.35,"new_score":0.317,"tokens":32619.3,"cost":2.1498,"duration_min":15.21,"tool_calls":57.68,"steps":100.32,"ote":131040.5}}},"GPT-5.5":{"low":{"diamond":{"correct":0.0612,"new_score":0.052,"tokens":9286.3,"cost":1.7682,"duration_min":5.74,"tool_calls":47.76,"steps":75.52,"ote":58940.3},"main":{"correct":0.2391,"new_score":0.2114,"tokens":8481.9,"cost":1.6113,"duration_min":5.82,"tool_calls":45.39,"steps":72.65,"ote":53710.8},"extended":{"correct":0.4273,"new_score":0.3873,"tokens":7409.9,"cost":1.3968,"duration_min":5.05,"tool_calls":40.21,"steps":65.05,"ote":46560.9}},"medium":{"diamond":{"correct":0.072,"new_score":0.0631,"tokens":15026.7,"cost":3.6756,"duration_min":7.69,"tool_calls":83.25,"steps":100.79,"ote":122520.4},"main":{"correct":0.248,"new_score":0.2242,"tokens":13911.7,"cost":3.2931,"duration_min":7.77,"tool_calls":78.93,"steps":96.42,"ote":109771.3},"extended":{"correct":0.464,"new_score":0.4229,"tokens":11940.3,"cost":2.77,"duration_min":6.66,"tool_calls":69.54,"steps":86.13,"ote":92333.7}},"high":{"diamond":{"correct":0.06,"new_score":0.052,"tokens":20678.7,"cost":4.9945,"duration_min":9.59,"tool_calls":102.5,"steps":121.44,"ote":166481.5},"main":{"correct":0.281,"new_score":0.2529,"tokens":19002.0,"cost":4.3815,"duration_min":9.5,"tool_calls":96.8,"steps":115.8,"ote":146049.3},"extended":{"correct":0.49,"new_score":0.4476,"tokens":16209.7,"cost":3.6346,"duration_min":8.09,"tool_calls":84.16,"steps":101.96,"ote":121154.0}},"xhigh":{"diamond":{"correct":0.064,"new_score":0.0574,"tokens":27751.9,"cost":6.2297,"duration_min":12.65,"tool_calls":117.91,"steps":138.64,"ote":207658.2},"main":{"correct":0.2825,"new_score":0.2548,"tokens":25575.9,"cost":5.5911,"duration_min":12.06,"tool_calls":113.57,"steps":134.93,"ote":186371.7},"extended":{"correct":0.4903,"new_score":0.4467,"tokens":21819.9,"cost":4.6186,"duration_min":10.17,"tool_calls":98.31,"steps":118.16,"ote":153952.0}}},"Claude Opus 4.7":{"low":{"diamond":{"correct":0.034,"new_score":0.0321,"tokens":19475.4,"cost":2.8927,"duration_min":8.32,"tool_calls":53.21,"steps":75.43,"ote":114198.7},"main":{"correct":0.1987,"new_score":0.1791,"tokens":16932.1,"cost":2.4262,"duration_min":7.6,"tool_calls":47.8,"steps":67.7,"ote":95927.7},"extended":{"correct":0.4004,"new_score":0.3679,"tokens":14319.1,"cost":2.0425,"duration_min":6.57,"tool_calls":41.56,"steps":59.09,"ote":80814.3}},"medium":{"diamond":{"correct":0.056,"new_score":0.0521,"tokens":22796.1,"cost":3.3863,"duration_min":11.93,"tool_calls":63.62,"steps":89.35,"ote":134227.7},"main":{"correct":0.2355,"new_score":0.213,"tokens":20016.9,"cost":2.8502,"duration_min":12.54,"tool_calls":56.75,"steps":80.0,"ote":112633.7},"extended":{"correct":0.4467,"new_score":0.4098,"tokens":16858.8,"cost":2.3613,"duration_min":11.64,"tool_calls":48.71,"steps":68.94,"ote":93303.2}},"high":{"diamond":{"correct":0.048,"new_score":0.0452,"tokens":31980.2,"cost":5.1902,"duration_min":13.45,"tool_calls":84.43,"steps":118.1,"ote":206237.9},"main":{"correct":0.2435,"new_score":0.2207,"tokens":28180.3,"cost":4.4297,"duration_min":15.11,"tool_calls":77.43,"steps":107.77,"ote":175888.1},"extended":{"correct":0.4597,"new_score":0.4225,"tokens":23585.6,"cost":3.6482,"duration_min":14.02,"tool_calls":65.9,"steps":92.43,"ote":144807.3}},"xhigh":{"diamond":{"correct":0.056,"new_score":0.0519,"tokens":40482.9,"cost":7.0998,"duration_min":20.13,"tool_calls":104.71,"steps":147.41,"ote":283151.6},"main":{"correct":0.253,"new_score":0.2296,"tokens":35076.8,"cost":5.9308,"duration_min":17.17,"tool_calls":95.01,"steps":132.69,"ote":236555.0},"extended":{"correct":0.4697,"new_score":0.4324,"tokens":29750.8,"cost":4.928,"duration_min":15.4,"tool_calls":82.08,"steps":114.99,"ote":196417.8}}},"Claude Opus 4.8":{"low":{"diamond":{"correct":0.0867,"new_score":0.0822,"tokens":25805.9,"cost":3.1533,"duration_min":10.33,"tool_calls":57.86,"steps":103.83,"ote":121700.9},"main":{"correct":0.2767,"new_score":0.2528,"tokens":23239.0,"cost":2.6811,"duration_min":10.22,"tool_calls":53.0,"steps":94.61,"ote":103496.1},"extended":{"correct":0.4678,"new_score":0.4306,"tokens":19342.3,"cost":2.2042,"duration_min":8.64,"tool_calls":45.76,"steps":81.12,"ote":85103.7}},"medium":{"diamond":{"correct":0.064,"new_score":0.0586,"tokens":36859.6,"cost":4.0793,"duration_min":14.67,"tool_calls":70.33,"steps":130.89,"ote":160666.0},"main":{"correct":0.294,"new_score":0.2689,"tokens":31700.5,"cost":3.2956,"duration_min":14.42,"tool_calls":62.41,"steps":116.42,"ote":130160.4},"extended":{"correct":0.4947,"new_score":0.4549,"tokens":26579.6,"cost":2.7449,"duration_min":12.55,"tool_calls":54.57,"steps":101.53,"ote":108521.0}},"high":{"diamond":{"correct":0.095,"new_score":0.087,"tokens":42086.6,"cost":4.7633,"duration_min":15.87,"tool_calls":75.58,"steps":144.84,"ote":187576.8},"main":{"correct":0.3295,"new_score":0.3031,"tokens":37279.9,"cost":3.9446,"duration_min":14.49,"tool_calls":68.11,"steps":130.84,"ote":155453.9},"extended":{"correct":0.5277,"new_score":0.4876,"tokens":31049.5,"cost":3.233,"duration_min":12.88,"tool_calls":59.16,"steps":114.01,"ote":127511.1}},"xhigh":{"diamond":{"correct":0.145,"new_score":0.1342,"tokens":70007.6,"cost":8.028,"duration_min":21.82,"tool_calls":96.98,"steps":195.8,"ote":317247.5},"main":{"correct":0.373,"new_score":0.3427,"tokens":62444.8,"cost":6.6881,"duration_min":20.54,"tool_calls":87.92,"steps":178.54,"ote":264674.3},"extended":{"correct":0.5607,"new_score":0.5179,"tokens":51949.3,"cost":5.405,"duration_min":18.64,"tool_calls":75.98,"steps":155.48,"ote":214235.3}},"max":{"diamond":{"correct":0.123,"new_score":0.1135,"tokens":97000.9,"cost":10.566,"duration_min":28.45,"tool_calls":107.98,"steps":237.95,"ote":416001.5},"main":{"correct":0.341,"new_score":0.3129,"tokens":88168.6,"cost":9.0344,"duration_min":25.86,"tool_calls":99.4,"steps":219.75,"ote":354993.3},"extended":{"correct":0.5393,"new_score":0.4989,"tokens":75826.1,"cost":7.5247,"duration_min":22.45,"tool_calls":88.02,"steps":196.21,"ote":296569.2}}},"Gemini 3.1 Pro":{"low":{"diamond":{"correct":0.0533,"new_score":0.0467,"tokens":22182.4,"cost":3.053,"duration_min":17.86,"tool_calls":84.32,"steps":229.99,"ote":261261.3},"main":{"correct":0.1867,"new_score":0.165,"tokens":19650.7,"cost":2.5151,"duration_min":15.5,"tool_calls":74.64,"steps":187.86,"ote":213311.9},"extended":{"correct":0.38,"new_score":0.3423,"tokens":17717.2,"cost":2.2618,"duration_min":13.31,"tool_calls":66.22,"steps":170.02,"ote":190267.3}},"high":{"diamond":{"correct":0.0467,"new_score":0.0431,"tokens":20448.6,"cost":2.7967,"duration_min":14.33,"tool_calls":81.66,"steps":234.54,"ote":239012.0},"main":{"correct":0.185,"new_score":0.1668,"tokens":18997.2,"cost":2.4112,"duration_min":12.89,"tool_calls":73.74,"steps":193.9,"ote":204394.7},"extended":{"correct":0.3633,"new_score":0.3307,"tokens":17049.7,"cost":2.2139,"duration_min":11.42,"tool_calls":66.11,"steps":182.26,"ote":186229.2}}},"Gemini 3.1 Flash Lite":{"minimal":{"diamond":{"correct":0.0,"new_score":0.0,"tokens":11095.5,"cost":0.2837,"duration_min":7.5,"tool_calls":74.47,"steps":273.07,"ote":410708.5},"main":{"correct":0.0269,"new_score":0.0251,"tokens":11462.6,"cost":0.3694,"duration_min":7.44,"tool_calls":85.47,"steps":428.07,"ote":603094.2},"extended":{"correct":0.1007,"new_score":0.0905,"tokens":10474.9,"cost":0.3219,"duration_min":7.94,"tool_calls":76.6,"steps":388.51,"ote":509334.0}},"low":{"diamond":{"correct":0.0067,"new_score":0.0065,"tokens":15534.6,"cost":0.2106,"duration_min":4.54,"tool_calls":51.65,"steps":143.64,"ote":281783.8},"main":{"correct":0.0533,"new_score":0.0484,"tokens":13559.4,"cost":0.1711,"duration_min":4.71,"tool_calls":46.55,"steps":116.24,"ote":227747.6},"extended":{"correct":0.1622,"new_score":0.146,"tokens":13118.1,"cost":0.1729,"duration_min":5.45,"tool_calls":44.62,"steps":127.24,"ote":228292.3}},"medium":{"diamond":{"correct":0.0,"new_score":0.0,"tokens":10825.8,"cost":0.2883,"duration_min":6.0,"tool_calls":79.88,"steps":352.09,"ote":432502.5},"main":{"correct":0.02,"new_score":0.0184,"tokens":10006.2,"cost":0.2586,"duration_min":8.07,"tool_calls":73.21,"steps":288.13,"ote":386816.4},"extended":{"correct":0.12,"new_score":0.1058,"tokens":9538.7,"cost":0.239,"duration_min":6.94,"tool_calls":67.52,"steps":261.69,"ote":351831.9}},"high":{"diamond":{"correct":0.0067,"new_score":0.0061,"tokens":14955.0,"cost":0.1911,"duration_min":4.35,"tool_calls":51.16,"steps":130.68,"ote":263793.7},"main":{"correct":0.05,"new_score":0.0444,"tokens":14679.0,"cost":0.1792,"duration_min":5.01,"tool_calls":49.36,"steps":123.98,"ote":245411.1},"extended":{"correct":0.1556,"new_score":0.1389,"tokens":13897.2,"cost":0.1759,"duration_min":5.08,"tool_calls":46.71,"steps":121.76,"ote":237579.9}}},"Kimi K2.5":{"none":{"diamond":{"correct":0.01,"new_score":0.01,"tokens":20511.2,"cost":3.1993,"duration_min":11.57,"tool_calls":116.55,"steps":118.33,"ote":258596.7},"main":{"correct":0.075,"new_score":0.0686,"tokens":19107.6,"cost":2.5531,"duration_min":10.68,"tool_calls":101.92,"steps":103.81,"ote":216303.4},"extended":{"correct":0.25,"new_score":0.2269,"tokens":17748.5,"cost":2.3582,"duration_min":10.1,"tool_calls":95.2,"steps":97.13,"ote":201945.7}}},"Kimi K2.6":{"none":{"diamond":{"correct":0.04,"new_score":0.0377,"tokens":48049.3,"cost":null,"duration_min":16.27,"tool_calls":150.34,"steps":152.34,"ote":530154.0},"main":{"correct":0.18,"new_score":0.1604,"tokens":41814.9,"cost":null,"duration_min":14.48,"tool_calls":135.02,"steps":136.78,"ote":445021.6},"extended":{"correct":0.4067,"new_score":0.3701,"tokens":35117.1,"cost":null,"duration_min":12.37,"tool_calls":116.85,"steps":118.69,"ote":370056.0}}},"MiniMax M2.5":{"none":{"diamond":{"correct":0.0133,"new_score":0.0108,"tokens":23241.4,"cost":null,"duration_min":11.33,"tool_calls":122.03,"steps":124.03,"ote":195724.3},"main":{"correct":0.06,"new_score":0.0534,"tokens":21691.0,"cost":null,"duration_min":10.09,"tool_calls":110.11,"steps":112.11,"ote":169558.0},"extended":{"correct":0.1733,"new_score":0.1578,"tokens":19962.4,"cost":null,"duration_min":9.21,"tool_calls":101.35,"steps":103.35,"ote":153047.7}}},"MiniMax M2.7":{"none":{"diamond":{"correct":0.0267,"new_score":0.0238,"tokens":34077.2,"cost":null,"duration_min":9.76,"tool_calls":123.5,"steps":125.37,"ote":410416.2},"main":{"correct":0.0667,"new_score":0.0598,"tokens":30587.4,"cost":null,"duration_min":8.62,"tool_calls":107.22,"steps":109.14,"ote":343209.0},"extended":{"correct":0.22,"new_score":0.1986,"tokens":28445.8,"cost":null,"duration_min":7.93,"tool_calls":96.64,"steps":98.58,"ote":310412.8}}},"SWE-1.6":{"none":{"diamond":{"correct":0.0286,"new_score":0.0248,"tokens":null,"cost":null,"duration_min":7.99,"tool_calls":347.43,"steps":366.17,"ote":null},"main":{"correct":0.0606,"new_score":0.055,"tokens":null,"cost":null,"duration_min":7.41,"tool_calls":265.88,"steps":283.75,"ote":null},"extended":{"correct":0.2,"new_score":0.1838,"tokens":null,"cost":null,"duration_min":7.8,"tool_calls":218.76,"steps":236.47,"ote":null}}}}}