more benchmarks

This commit is contained in:
Michael Peter Christen
2025-03-20 18:05:42 +01:00
parent abbb18e326
commit 7447bc401e
4 changed files with 216 additions and 191 deletions

1
.gitignore vendored
View File

@@ -4,6 +4,7 @@ endpoints/OpenAI_GPT-4o.json
endpoints/OpenAI_GPT-o1-Mini.json
endpoints/OpenAI_GPT-o1.json
.DS_Store
*.tar.gz
endpoints/DeepSeek-V3.json
endpoints/DeepSeek-R1.json
problems

View File

@@ -16,17 +16,16 @@ super-human performances in the domain of coding or "being a programmer". See "M
The computed benchmark score ("PE-Bench-Python-100", "PE-Bench-Java-100", "PE-Bench-Rust-100", "PE-Bench-Clojure-100") is the super-human performance factor for coding in Python, Java, Rust, and Clojure, respectively.
The "Economic Score" is the average performance per byte of model size (times 100). Results are:
<<<<<<< HEAD
| Model | Best<br/>Model<br/>for<br/>Size (GB) | PE-100-<br/>Score | Mem-<br/>Score | Size<br/>(*10^9 Params) | Bits | Context Length<br/>(K) | Python | Java | Rust | Clojure |
| :------------------------------------------------------------------- | -----------------------------------: | ----------------: | -------------: | ----------------------: | ---: | ---------------------: | -----: | ---: | ---: | ------: |
| DeepSeek-V3 | 1342.00 | 16.11 | 1 | 671.0 | 16 | 64 | 20.01 | 16.95 | 12.16 | 5.92 |
| Model | Best<br/>Model<br/>for<br/>Size (GB) | PE-100-<br/>Score | Mem-<br/>Score | Size<br/>*10^9 Params | Bits | Context Length<br/>(K) | Python | Java | Rust | Clojure |
| :------------------------------------------------------------------------ | -----------------------------------: | ----------------: | -------------: | --------------------: | ---: | ---------------------: | -----: | ---: | ---: | ------: |
| DeepSeek-V3 | 1342 | 16.11 | 1 | 671.0 | 16 | 64 | 20.01 | 16.95 | 12.16 | 5.92 |
| GPT-4o | | 14.72 | | | 16 | 128 | 17.05 | 13.87 | 14.57 | 8.24 |
| GPT-o1-Mini | 200.00 | 11.10 | 6 | 100.0 | 16 | 32 | 17.44 | | | |
| GPT-o1-Mini | 200 | 11.08 | 6 | 100.0 | 16 | 32 | 17.44 | | | |
| athene-v2:72b-q8_0 | 72.70 | 10.97 | 15 | 72.7 | 8 | 128 | 16.22 | 10.15 | 5.55 | 3.32 |
| hf.co/bartowski/Athene-V2-Agent-GGUF:Q4_K_M | 36.35 | 10.60 | 29 | 72.7 | 4 | 128 | 14.49 | 10.56 | 6.33 | 3.74 |
| athene-v2:latest | 36.35 | 10.44 | 29 | 72.7 | 4 | 128 | 14.07 | 11.14 | 6.55 | 1.62 |
| hf.co/bartowski/Sky-T1-32B-Preview-GGUF:Q4_K_M | 16.40 | 10.33 | 63 | 32.8 | 4 | 32 | 12.72 | 11.67 | 7.25 | 2.89 |
| GPT-o1-Preview | | 10.09 | 2 | 300.0 | 16 | 32 | 15.86 | | | |
| GPT-o1-Preview | | 10.08 | 2 | 300.0 | 16 | 32 | 15.86 | | | |
| qwen2.5:72b-instruct-q4_K_M | | 9.78 | 27 | 72.7 | 4 | 128 | 14.02 | 9.1 | 5.97 | 2.46 |
| qwen2.5:72b-instruct-q8_0 | | 9.77 | 13 | 72.7 | 8 | 128 | 12.98 | 10.5 | 5.41 | 3.49 |
| qwen2.5-coder:32b-instruct-q4_K_M | 16.40 | 9.77 | 60 | 32.8 | 4 | 32 | 14.05 | 8.82 | 6.41 | 2.2 |
@@ -78,6 +77,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
| qwen2.5:7b-instruct-q8_0 | | 4.34 | 57 | 7.6 | 8 | 128 | 7.47 | 3.6 | 1.13 | 0.51 |
| falcon3:7b-instruct-q8_0 | | 4.34 | 58 | 7.5 | 8 | 32 | 6.76 | 3.91 | 2.16 | 0.36 |
| hf.co/mradermacher/Viper-Coder-Hybrid-v1.3-GGUF:Q4_K_M | | 4.32 | 58 | 14.8 | 4 | 128 | 6.44 | 4.45 | 1.83 | 0.46 |
| hf.co/bartowski/mistralai_Mistral-Small-3.1-24B-Instruct-2503-GGUF:Q4_K_M | | 4.32 | | 23.6 | | | 6.79 | | | |
| hf.co/smirki/UIGEN-T1.1-Qwen-7B-Q4_K_M-GGUF:latest | 3.81 | 4.29 | 113 | 7.62 | 4 | 32 | 6.06 | 4.53 | 2.08 | 0.88 |
| qwen2.5:7b-instruct-q4_K_M | 3.80 | 4.22 | 111 | 7.6 | 4 | 128 | 7.5 | 2.67 | 1.86 | 0.49 |
| mistral-small:24b-instruct-2501-q4_K_M | | 3.98 | 34 | 23.6 | 4 | 32 | 6.32 | 2.89 | 2.47 | 0.93 |
@@ -145,6 +145,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
| qwen2.5:1.5b-instruct-q4_K_M | 0.75 | 0.88 | 118 | 1.5 | 4 | 128 | 1.94 | 0.26 | 0.15 | 0.0 |
| gemma3:4b | | 0.88 | 41 | 4.3 | 4 | 128 | 1.56 | 0.55 | 0.43 | 0.0 |
| mixtral:8x7b-instruct-v0.1-q8_0 | | 0.82 | 2 | 46.7 | 8 | 32 | 1.44 | 0.65 | 0.23 | 0.0 |
| hf.co/bartowski/open-r1_OlympicCoder-32B-GGUF:Q4_K_M | | 0.79 | | 32.8 | | | 1.54 | 0.32 | 0.18 | 0.45 |
| hf.co/bartowski/OpenThinker-7B-GGUF:Q4_K_M | | 0.75 | 20 | 7.62 | 4 | 32 | 0.88 | 0.77 | 0.53 | 0.62 |
| command-r7b:7b-12-2024-q4_K_M | | 0.68 | 17 | 8.0 | 4 | 128 | 1.54 | 0.03 | 0.26 | 0.04 |
| llama3.2:3b | | 0.66 | 41 | 3.2 | 4 | 128 | 1.42 | 0.18 | 0.21 | 0.0 |
@@ -154,6 +155,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
| qwen:72b-chat-v1.5-q4_K_M | | 0.64 | 2 | 72.0 | 4 | 32 | 0.94 | 0.78 | 0.13 | 0.0 |
| olmo2:7b-1124-instruct-q4_K_M | | 0.60 | 16 | 7.3 | 4 | 4 | 1.42 | 0.08 | 0.02 | 0.0 |
| qwen2.5:0.5b-instruct-q8_0 | 0.50 | 0.57 | 115 | 0.5 | 8 | 128 | 1.33 | 0.0 | 0.21 | 0.0 |
| exaone-deep:32b-q4_K_M | | 0.54 | 3 | 32.0 | 4 | | 0.85 | | | |
| internlm2:1.8b-chat-v2.5-q4_K_M | | 0.51 | 54 | 1.9 | 4 | | 1.24 | 0.06 | 0.0 | 0.0 |
| granite3.1-dense:2b-instruct-q8_0 | | 0.50 | 20 | 2.5 | 8 | 128 | 1.07 | 0.11 | 0.2 | 0.0 |
| llama3.2:latest | | 0.49 | 31 | 3.21 | 4 | 128 | 0.99 | 0.18 | 0.21 | 0.0 |

View File

@@ -469,6 +469,10 @@
"python-100": 6.98,
"rust-100": 0.76
},
"hf.co/bartowski/mistralai_Mistral-Small-3.1-24B-Instruct-2503-GGUF:Q4_K_M": {
"_parameter_size": 23.6,
"python-100": 6.79
},
"falcon3:7b-instruct-q8_0": {
"_context_size": 32,
"_parameter_size": 7.5,
@@ -1100,10 +1104,11 @@
"rust-100": 0.13
},
"hf.co/bartowski/open-r1_OlympicCoder-32B-GGUF:Q4_K_M": {
"python-100": 1.54,
"_parameter_size": 32.8,
"clojure-100": 0.45,
"java-100": 0.32,
"rust-100": 0.18,
"clojure-100": 0.45
"python-100": 1.54,
"rust-100": 0.18
},
"Bio-Medical-Llama-3-8B-GGUF:Q8_0": {
"_context_size": 8,
@@ -1265,6 +1270,11 @@
"python-100": 0.85,
"rust-100": 0.15
},
"exaone-deep:32b-q4_K_M": {
"_parameter_size": 32.0,
"_quantization_level": 4,
"python-100": 0.85
},
"smallthinker:3b-preview-q8_0": {
"_context_size": 128,
"_parameter_size": 3.4,
@@ -1648,5 +1658,14 @@
"java-100": 0.0,
"python-100": 0.0,
"rust-100": 0.0
},
"hf.co/bartowski/nvidia_Llama-3_3-Nemotron-Super-49B-v1-GGUF:Q4_K_M": {
"_parameter_size": 49.9
},
"hf.co/bartowski/Sky-T1-32B-Preview-GGUF:Q8_0": {
"_parameter_size": 32.8
},
"hf.co/mradermacher/phi-4-abliterated-GGUF:Q8_0": {
"_parameter_size": 14.7
}
}

View File

@@ -87,7 +87,7 @@ print(table)
col_best = "Best<br/>Model<br/>for<br/>Size (GB)"
col_bench_score = "PE-100-<br/>Score"
col_memory_score = "Mem-<br/>Score"
col_size = "Size<br/>(*10^9 Params)"
col_size = "Size<br/>*10^9 Params"
col_quant = "Bits"
col_context = "Context Length<br/>(K)"
col_bench_100 = "PE-Bench-100 Details"
@@ -120,6 +120,9 @@ for key, value in benchmark.items():
col_bench_score_vs = '' if bench_score_v == '' else "{:.2f}".format(bench_score_v)
col_memory_score_vs = '' if memory_score_v == '' else "{:.0f}".format(memory_score_v)
if memory_amount >= 100.0:
col_best_vs = "{:.0f}".format(memory_amount) if best_model else ''
else:
col_best_vs = "{:.2f}".format(memory_amount) if best_model else ''
col_size_vs = str(size_v)
col_quant_vs = str(quant_v)