enhanced code extraction for thinking models and recalculated some benchmarks

2025-05-01 12:38:17 +02:00
parent 811343a570
commit 69877de44f
3 changed files with 57 additions and 24 deletions
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
 | hf.co/bartowski/Sky-T1-32B-Preview-GGUF:Q4_K_M                            |    16.40 |  10.33 |     63 |   32.8 |    4 |   32 | 12.72 | 11.67 | 7.25 | 2.89 |
 | athene-v2:72b-q4_K_M                                                      |          |  10.29 |     28 |   72.7 |    4 |  128 | 14.07 | 11.19 | 6.55 |  0.0 |
 | hf.co/bartowski/Sky-T1-32B-Preview-GGUF:Q8_0                              |          |  10.28 |     31 |   32.8 |    8 |   32 | 12.76 | 10.75 | 8.04 | 3.43 |
-| GPT-o1-Preview                                                            |          |  10.23 |      2 |  300.0 |   16 |   32 | 15.86 |      |      |      |
+| GPT-o1-Preview                                                            |          |  10.22 |      2 |  300.0 |   16 |   32 | 15.86 |      |      |      |
 | qwen2.5:72b-instruct-q4_K_M                                               |          |   9.78 |     27 |   72.7 |    4 |  128 | 14.02 |  9.1 | 5.97 | 2.46 |
 | qwen2.5:72b-instruct-q8_0                                                 |          |   9.77 |     13 |   72.7 |    8 |  128 | 12.98 | 10.5 | 5.41 | 3.49 |
 | qwen2.5-coder:32b-instruct-q4_K_M                                         |    16.40 |   9.77 |     60 |   32.8 |    4 |   32 | 14.05 | 8.82 | 6.41 |  2.2 |
@@ -66,6 +66,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
 | qwen3:30b-a3b-q4_K_M-no_think                                             |          |   6.31 |     41 |   30.5 |    4 |  128 | 8.22 | 8.01 |      |      |
 | vanilj/Phi-4:Q8_0                                                         |          |   6.13 |     42 |   14.7 |    8 |   16 | 9.06 | 5.73 | 3.52 | 0.84 |
 | yi-coder:9b-chat-q4_K_M                                                   |     4.40 |   5.97 |    136 |    8.8 |    4 |  128 | 7.44 | 6.04 | 5.76 | 0.34 |
+| qwq:32b-preview-q8_0                                                      |          |   5.97 |     18 |   32.8 |    8 |   32 | 10.15 | 3.11 | 3.88 | 1.97 |
 | cogito:70b-v1-preview-llama-q4_K_M                                        |          |   5.97 |     17 |   70.6 |    4 |  128 |  7.7 | 7.26 | 3.54 |  0.0 |
 | hf.co/bartowski/Anubis-70B-v1-GGUF:Q4_K_M                                 |          |   5.83 |     17 |   70.6 |    4 |  128 | 8.07 | 6.49 | 2.59 | 1.36 |
 | llama3.1:70b-instruct-q8_0                                                |          |   5.81 |      8 |   70.6 |    8 |  128 | 8.19 | 5.36 |  3.8 |  1.7 |
@@ -82,7 +83,6 @@ The "Economic Score" is the average performance per bytes of model size (times 1
 | tulu3:70b-q4_K_M                                                          |          |   5.21 |     15 |   70.6 |    4 |  128 | 7.31 | 4.68 | 3.35 | 2.15 |
 | aravhawk/llama4:scout-q4_K_M                                              |          |   5.17 |     10 |  107.8 |    4 |      | 6.78 | 5.64 | 3.55 | 0.59 |
 | hf.co/ozone-ai/0x-lite-Q4_K_M-GGUF:latest                                 |          |   5.14 |     69 |   14.8 |    4 |   32 | 7.66 | 4.52 | 3.32 | 0.56 |
-| qwq:32b-preview-q8_0                                                      |          |   5.09 |     16 |   32.8 |    8 |   32 | 9.68 | 2.94 | 1.39 | 0.54 |
 | command-a:111b-03-2025-q4_K_M                                             |          |   5.06 |      9 |  111.1 |    4 |  256 | 6.33 | 5.17 | 4.21 | 1.32 |
 | hf.co/bartowski/Qwen2.5-14B-Instruct-1M-GGUF:Q4_K_M                       |          |   4.94 |     67 |   14.8 |    4 |  986 | 7.75 | 4.02 | 2.74 | 0.81 |
 | falcon3:10b-instruct-q4_K_M                                               |          |   4.88 |     95 |   10.3 |    4 |   32 | 6.62 | 5.77 | 2.19 |  0.6 |
@@ -103,7 +103,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
 | mistral-small:24b-instruct-2501-q4_K_M                                    |          |   3.98 |     34 |   23.6 |    4 |   32 | 6.32 | 2.89 | 2.47 | 0.93 |
 | qwen2.5-coder:7b-instruct-q4_K_M                                          |     3.80 |   3.98 |    105 |    7.6 |    4 |   32 | 4.76 |  4.7 | 2.87 | 0.87 |
 | hf.co/bartowski/Tesslate_Tessa-Rust-T1-7B-GGUF:Q4_K_M                     |          |   3.94 |    104 |   7.62 |    4 |   32 | 6.14 |  3.2 | 2.28 | 0.73 |
-| hf.co/bartowski/open-thoughts_OpenThinker-32B-GGUF:Q4_K_M                 |          |   3.89 |     24 |   32.8 |    4 |   32 |  4.2 | 4.22 | 3.22 | 2.95 |
+| hf.co/bartowski/open-thoughts_OpenThinker-32B-GGUF:Q4_K_M                 |          |   3.90 |     24 |   32.8 |    4 |   32 |  4.2 | 4.22 | 3.28 | 2.95 |
 | gemma3:27b                                                                |          |   3.73 |     27 |   27.4 |    4 |  128 | 7.15 | 0.14 |  3.8 | 0.64 |
 | hf.co/bartowski/HelpingAI_Helpingai3-raw-GGUF:Q4_K_M                      |          |   3.70 |     72 |   10.3 |    4 |   32 | 5.88 | 3.82 | 0.92 | 0.13 |
 | gemma2:27b-instruct-q8_0                                                  |          |   3.65 |     13 |   27.2 |    8 |    8 | 5.18 |  3.3 | 2.47 | 0.98 |
@@ -137,13 +137,13 @@ The "Economic Score" is the average performance per bytes of model size (times 1
 | phi3:14b-medium-128k-instruct-q8_0                                        |          |   2.24 |     16 |   14.0 |    8 |  128 | 4.21 | 1.55 | 0.42 | 0.04 |
 | hf.co/bartowski/THUDM_GLM-Z1-32B-0414-GGUF:Q4_K_M                         |          |   2.24 |     14 |   32.6 |    4 |   32 | 3.13 | 1.58 |  2.0 |  1.1 |
 | llama3.1:8b-instruct-q8_0                                                 |          |   2.03 |     25 |    8.0 |    8 |  128 | 3.26 | 1.78 | 0.94 | 0.09 |
+| qwq:32b-q4_K_M                                                            |          |   2.02 |     12 |   32.8 |    4 |  128 | 2.51 | 1.75 | 1.32 | 2.25 |
 | tulu3:8b-q8_0                                                             |          |   2.01 |     25 |    8.0 |    8 |  128 | 3.91 | 1.06 | 0.42 | 0.49 |
 | hf.co/bartowski/open-r1_OlympicCoder-7B-GGUF:Q4_K_M                       |          |   1.92 |     55 |    7.0 |    4 |   32 | 2.43 | 2.86 | 0.28 | 0.33 |
 | hf.co/bartowski/Yi-1.5-6B-Chat-GGUF:Q8_0                                  |          |   1.87 |     31 |   6.06 |    8 |    4 | 3.92 | 0.92 | 0.13 |  0.0 |
 | hf.co/bartowski/Yi-1.5-6B-Chat-GGUF:Q4_K_M                                |          |   1.81 |     60 |   6.06 |    4 |    4 | 3.71 | 0.87 | 0.32 |  0.0 |
 | deepseek-llm:67b-chat-q4_K_M                                              |          |   1.79 |      5 |   67.0 |    4 |    4 | 2.94 | 1.63 |  0.5 | 0.23 |
 | hf.co/katanemo/Arch-Function-3B.gguf:Q4_K_M                               |     1.54 |   1.76 |    114 |   3.09 |    4 |   32 | 2.81 | 1.55 | 0.66 | 0.43 |
-| qwq:32b-q4_K_M                                                            |          |   1.74 |     11 |   32.8 |    4 |  128 | 1.82 | 1.75 | 1.32 | 2.23 |
 | qwen2.5:3b-instruct-q4_K_M                                                |          |   1.70 |    110 |    3.1 |    4 |  128 | 2.95 | 1.35 | 0.56 | 0.05 |
 | deepseek-r1:70b-llama-distill-q4_K_M                                      |          |   1.69 |      5 |   70.6 |    4 |  128 | 2.17 | 2.49 | 0.19 | 0.41 |
 | deepseek-r1:14b-qwen-distill-q4_K_M                                       |          |   1.69 |     23 |   14.8 |    4 |  128 |  2.6 | 1.02 | 1.19 | 1.02 |
--- a/benchmark.json
+++ b/benchmark.json
@@ -265,6 +265,15 @@
        "python-100": 10.23,
        "rust-100": 4.52
    },
+    "qwq:32b-preview-q8_0": {
+        "_context_size": 32,
+        "_parameter_size": 32.8,
+        "_quantization_level": 8,
+        "clojure-100": 1.97,
+        "java-100": 3.11,
+        "python-100": 10.15,
+        "rust-100": 3.88
+    },
    "GPT-3.5-Turbo": {
        "_context_size": 16,
        "_parameter_size": 175.0,
@@ -301,15 +310,6 @@
        "python-100": 9.7,
        "rust-100": 4.55
    },
-    "qwq:32b-preview-q8_0": {
-        "_context_size": 32,
-        "_parameter_size": 32.8,
-        "_quantization_level": 8,
-        "clojure-100": 0.54,
-        "java-100": 2.94,
-        "python-100": 9.68,
-        "rust-100": 1.39
-    },
    "phi4:14b": {
        "_context_size": 16,
        "_parameter_size": 14.7,
@@ -988,7 +988,7 @@
        "clojure-100": 2.95,
        "java-100": 4.22,
        "python-100": 4.2,
-        "rust-100": 3.22
+        "rust-100": 3.28
    },
    "yi:9b-chat-v1.5-q4_K_M": {
        "_context_size": 4,
@@ -1187,6 +1187,15 @@
        "python-100": 2.53,
        "rust-100": 0.3
    },
+    "qwq:32b-q4_K_M": {
+        "_context_size": 128,
+        "_parameter_size": 32.8,
+        "_quantization_level": 4,
+        "clojure-100": 2.25,
+        "java-100": 1.75,
+        "python-100": 2.51,
+        "rust-100": 1.32
+    },
    "qwen2-math:7b-instruct-q8_0": {
        "_context_size": 4,
        "_parameter_size": 7.6,
@@ -1394,15 +1403,6 @@
        "python-100": 1.83,
        "rust-100": 0.24
    },
-    "qwq:32b-q4_K_M": {
-        "_context_size": 128,
-        "_parameter_size": 32.8,
-        "_quantization_level": 4,
-        "clojure-100": 2.23,
-        "java-100": 1.75,
-        "python-100": 1.82,
-        "rust-100": 1.32
-    },
    "hf.co/bartowski/google_gemma-3-4b-it-qat-GGUF:Q4_0": {
        "_context_size": 128,
        "_parameter_size": 3.88,
@@ -2168,4 +2168,4 @@
        "_parameter_size": 235.1,
        "_quantization_level": 4
    }
-}
+}
--- a/codeextraction.py
+++ b/codeextraction.py
@@ -25,7 +25,40 @@ def get_extension(language):
    else:
        raise Exception(f"Unsupported language: {language}")

+thinking_remove_tags = [
+    ["<|begin_of_thought|>", "<|end_of_thought|>"],
+    ["<think>", "</think>"],
+    ["<think>", "<|im_start|>"],
+    ["<thinking>", "</thinking>"],
+    ["<thought>", "</thought>"],
+    ["<Thought>", "</Thought>"],
+    ["<reason>", "</reason>"],
+    ["<reasoning>", "</reasoning>"]
+]
+
+thinking_keep_tags = [
+    ["<|begin_of_solution|>", "<|end_of_solution|>"]
+]
+
 def extract_code_block(markdown_content, language, extension):
+    # remove thinking parts from the markdown content
+    for tag_pair in thinking_remove_tags:
+        start_tag, end_tag = tag_pair
+        start = markdown_content.find(start_tag)
+        if start != -1:
+            end = markdown_content.find(end_tag, start)
+            if end != -1:
+                # remove everything from the beginning of the text to the end of the thought
+                markdown_content = markdown_content[end + len(end_tag):]
+    for tag_pair in thinking_keep_tags:
+        start_tag, end_tag = tag_pair
+        start = markdown_content.find(start_tag)
+        if start != -1:
+            end = markdown_content.find(end_tag, start)
+            if end != -1:
+                # now we want to keep what is between the two tags
+                markdown_content = markdown_content[start + len(start_tag):end]
+
    # Regular expression to find code blocks between triple backticks
    code_block_pattern = re.compile(r'```(.*?)```', re.DOTALL)