more benchmarks

2025-03-20 18:05:42 +01:00
parent abbb18e326
commit 7447bc401e
4 changed files with 216 additions and 191 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ endpoints/OpenAI_GPT-4o.json
 endpoints/OpenAI_GPT-o1-Mini.json
 endpoints/OpenAI_GPT-o1.json
 .DS_Store
+*.tar.gz
 endpoints/DeepSeek-V3.json
 endpoints/DeepSeek-R1.json
 problems
--- a/README.md
+++ b/README.md
@@ -16,17 +16,16 @@ super-human performances in the domain of coding or "being a programmer". See "M
 The computed Benchmark ("PE-Bench-Python-100", "PE-Bench-Java-100", "PE-Bench-Rust-100", "PE-Bench-Clojure-100") is the super-human performance factor for coding in Python/Java/Rust/Clojure.
 The "Economic Score" is the average performance per byte of model size (times 100). Results are:

-<<<<<<< HEAD
-| Model                                                                | Best<br/>Model<br/>for<br/>Size (GB) | PE-100-<br/>Score | Mem-<br/>Score | Size<br/>(*10^9 Params) | Bits | Context Length<br/>(K) | Python | Java | Rust | Clojure |
-| :------------------------------------------------------------------- | -----------------------------------: | ----------------: | -------------: | ----------------------: | ---: | ---------------------: | -----: | ---: | ---: | ------: |
-| DeepSeek-V3                                                          |  1342.00 |  16.11 |      1 |  671.0 |   16 |   64 | 20.01 | 16.95 | 12.16 | 5.92 |
+| Model                                                                     | Best<br/>Model<br/>for<br/>Size (GB) | PE-100-<br/>Score | Mem-<br/>Score | Size<br/>*10^9 Params | Bits | Context Length<br/>(K) | Python | Java | Rust | Clojure |
+| :------------------------------------------------------------------------ | -----------------------------------: | ----------------: | -------------: | --------------------: | ---: | ---------------------: | -----: | ---: | ---: | ------: |
+| DeepSeek-V3                                                               |     1342 |  16.11 |      1 |  671.0 |   16 |   64 | 20.01 | 16.95 | 12.16 | 5.92 |
 | GPT-4o                                                                    |          |  14.72 |        |        |   16 |  128 | 17.05 | 13.87 | 14.57 | 8.24 |
-| GPT-o1-Mini                                                          |   200.00 |  11.10 |      6 |  100.0 |   16 |   32 | 17.44 |      |      |      |
+| GPT-o1-Mini                                                               |      200 |  11.08 |      6 |  100.0 |   16 |   32 | 17.44 |      |      |      |
 | athene-v2:72b-q8_0                                                        |    72.70 |  10.97 |     15 |   72.7 |    8 |  128 | 16.22 | 10.15 | 5.55 | 3.32 |
 | hf.co/bartowski/Athene-V2-Agent-GGUF:Q4_K_M                               |    36.35 |  10.60 |     29 |   72.7 |    4 |  128 | 14.49 | 10.56 | 6.33 | 3.74 |
 | athene-v2:latest                                                          |    36.35 |  10.44 |     29 |   72.7 |    4 |  128 | 14.07 | 11.14 | 6.55 | 1.62 |
 | hf.co/bartowski/Sky-T1-32B-Preview-GGUF:Q4_K_M                            |    16.40 |  10.33 |     63 |   32.8 |    4 |   32 | 12.72 | 11.67 | 7.25 | 2.89 |
-| GPT-o1-Preview                                                       |          |  10.09 |      2 |  300.0 |   16 |   32 | 15.86 |      |      |      |
+| GPT-o1-Preview                                                            |          |  10.08 |      2 |  300.0 |   16 |   32 | 15.86 |      |      |      |
 | qwen2.5:72b-instruct-q4_K_M                                               |          |   9.78 |     27 |   72.7 |    4 |  128 | 14.02 |  9.1 | 5.97 | 2.46 |
 | qwen2.5:72b-instruct-q8_0                                                 |          |   9.77 |     13 |   72.7 |    8 |  128 | 12.98 | 10.5 | 5.41 | 3.49 |
 | qwen2.5-coder:32b-instruct-q4_K_M                                         |    16.40 |   9.77 |     60 |   32.8 |    4 |   32 | 14.05 | 8.82 | 6.41 |  2.2 |
@@ -78,6 +77,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
 | qwen2.5:7b-instruct-q8_0                                                  |          |   4.34 |     57 |    7.6 |    8 |  128 | 7.47 |  3.6 | 1.13 | 0.51 |
 | falcon3:7b-instruct-q8_0                                                  |          |   4.34 |     58 |    7.5 |    8 |   32 | 6.76 | 3.91 | 2.16 | 0.36 |
 | hf.co/mradermacher/Viper-Coder-Hybrid-v1.3-GGUF:Q4_K_M                    |          |   4.32 |     58 |   14.8 |    4 |  128 | 6.44 | 4.45 | 1.83 | 0.46 |
+| hf.co/bartowski/mistralai_Mistral-Small-3.1-24B-Instruct-2503-GGUF:Q4_K_M |          |   4.32 |        |   23.6 |      |      | 6.79 |      |      |      |
 | hf.co/smirki/UIGEN-T1.1-Qwen-7B-Q4_K_M-GGUF:latest                        |     3.81 |   4.29 |    113 |   7.62 |    4 |   32 | 6.06 | 4.53 | 2.08 | 0.88 |
 | qwen2.5:7b-instruct-q4_K_M                                                |     3.80 |   4.22 |    111 |    7.6 |    4 |  128 |  7.5 | 2.67 | 1.86 | 0.49 |
 | mistral-small:24b-instruct-2501-q4_K_M                                    |          |   3.98 |     34 |   23.6 |    4 |   32 | 6.32 | 2.89 | 2.47 | 0.93 |
@@ -145,6 +145,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
 | qwen2.5:1.5b-instruct-q4_K_M                                              |     0.75 |   0.88 |    118 |    1.5 |    4 |  128 | 1.94 | 0.26 | 0.15 |  0.0 |
 | gemma3:4b                                                                 |          |   0.88 |     41 |    4.3 |    4 |  128 | 1.56 | 0.55 | 0.43 |  0.0 |
 | mixtral:8x7b-instruct-v0.1-q8_0                                           |          |   0.82 |      2 |   46.7 |    8 |   32 | 1.44 | 0.65 | 0.23 |  0.0 |
+| hf.co/bartowski/open-r1_OlympicCoder-32B-GGUF:Q4_K_M                      |          |   0.79 |        |   32.8 |      |      | 1.54 | 0.32 | 0.18 | 0.45 |
 | hf.co/bartowski/OpenThinker-7B-GGUF:Q4_K_M                                |          |   0.75 |     20 |   7.62 |    4 |   32 | 0.88 | 0.77 | 0.53 | 0.62 |
 | command-r7b:7b-12-2024-q4_K_M                                             |          |   0.68 |     17 |    8.0 |    4 |  128 | 1.54 | 0.03 | 0.26 | 0.04 |
 | llama3.2:3b                                                               |          |   0.66 |     41 |    3.2 |    4 |  128 | 1.42 | 0.18 | 0.21 |  0.0 |
@@ -154,6 +155,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
 | qwen:72b-chat-v1.5-q4_K_M                                                 |          |   0.64 |      2 |   72.0 |    4 |   32 | 0.94 | 0.78 | 0.13 |  0.0 |
 | olmo2:7b-1124-instruct-q4_K_M                                             |          |   0.60 |     16 |    7.3 |    4 |    4 | 1.42 | 0.08 | 0.02 |  0.0 |
 | qwen2.5:0.5b-instruct-q8_0                                                |     0.50 |   0.57 |    115 |    0.5 |    8 |  128 | 1.33 |  0.0 | 0.21 |  0.0 |
+| exaone-deep:32b-q4_K_M                                                    |          |   0.54 |      3 |   32.0 |    4 |      | 0.85 |      |      |      |
 | internlm2:1.8b-chat-v2.5-q4_K_M                                           |          |   0.51 |     54 |    1.9 |    4 |      | 1.24 | 0.06 |  0.0 |  0.0 |
 | granite3.1-dense:2b-instruct-q8_0                                         |          |   0.50 |     20 |    2.5 |    8 |  128 | 1.07 | 0.11 |  0.2 |  0.0 |
 | llama3.2:latest                                                           |          |   0.49 |     31 |   3.21 |    4 |  128 | 0.99 | 0.18 | 0.21 |  0.0 |
--- a/benchmark.json
+++ b/benchmark.json
@@ -469,6 +469,10 @@
        "python-100": 6.98,
        "rust-100": 0.76
    },
+    "hf.co/bartowski/mistralai_Mistral-Small-3.1-24B-Instruct-2503-GGUF:Q4_K_M": {
+        "_parameter_size": 23.6,
+        "python-100": 6.79
+    },
    "falcon3:7b-instruct-q8_0": {
        "_context_size": 32,
        "_parameter_size": 7.5,
@@ -1100,10 +1104,11 @@
        "rust-100": 0.13
    },
    "hf.co/bartowski/open-r1_OlympicCoder-32B-GGUF:Q4_K_M": {
-        "python-100": 1.54,
+        "_parameter_size": 32.8,
+        "clojure-100": 0.45,
        "java-100": 0.32,
-        "rust-100": 0.18,
-        "clojure-100": 0.45
+        "python-100": 1.54,
+        "rust-100": 0.18
    },
    "Bio-Medical-Llama-3-8B-GGUF:Q8_0": {
        "_context_size": 8,
@@ -1265,6 +1270,11 @@
        "python-100": 0.85,
        "rust-100": 0.15
    },
+    "exaone-deep:32b-q4_K_M": {
+        "_parameter_size": 32.0,
+        "_quantization_level": 4,
+        "python-100": 0.85
+    },
    "smallthinker:3b-preview-q8_0": {
        "_context_size": 128,
        "_parameter_size": 3.4,
@@ -1648,5 +1658,14 @@
        "java-100": 0.0,
        "python-100": 0.0,
        "rust-100": 0.0
+    },
+    "hf.co/bartowski/nvidia_Llama-3_3-Nemotron-Super-49B-v1-GGUF:Q4_K_M": {
+        "_parameter_size": 49.9
+    },
+    "hf.co/bartowski/Sky-T1-32B-Preview-GGUF:Q8_0": {
+        "_parameter_size": 32.8
+    },
+    "hf.co/mradermacher/phi-4-abliterated-GGUF:Q8_0": {
+        "_parameter_size": 14.7
    }
 }
--- a/publish.py
+++ b/publish.py
@@ -87,7 +87,7 @@ print(table)
 col_best = "Best<br/>Model<br/>for<br/>Size (GB)"
 col_bench_score = "PE-100-<br/>Score"
 col_memory_score = "Mem-<br/>Score"
-col_size = "Size<br/>(*10^9 Params)"
+col_size = "Size<br/>*10^9 Params"
 col_quant = "Bits"
 col_context = "Context Length<br/>(K)"
 col_bench_100 = "PE-Bench-100 Details"
@@ -120,6 +120,9 @@ for key, value in benchmark.items():

    col_bench_score_vs = '' if bench_score_v == '' else "{:.2f}".format(bench_score_v)
    col_memory_score_vs = '' if memory_score_v == '' else "{:.0f}".format(memory_score_v)
+    if memory_amount >= 100.0:
+        col_best_vs = "{:.0f}".format(memory_amount) if best_model else ''
+    else:
        col_best_vs = "{:.2f}".format(memory_amount) if best_model else ''
    col_size_vs = str(size_v)
    col_quant_vs = str(quant_v)