enhanced code extraction for thinking models and recalculated some benchmarks

This commit is contained in:
Michael Peter Christen
2025-05-01 12:38:17 +02:00
parent 811343a570
commit 69877de44f
3 changed files with 57 additions and 24 deletions

View File

@@ -32,7 +32,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
| hf.co/bartowski/Sky-T1-32B-Preview-GGUF:Q4_K_M | 16.40 | 10.33 | 63 | 32.8 | 4 | 32 | 12.72 | 11.67 | 7.25 | 2.89 |
| athene-v2:72b-q4_K_M | | 10.29 | 28 | 72.7 | 4 | 128 | 14.07 | 11.19 | 6.55 | 0.0 |
| hf.co/bartowski/Sky-T1-32B-Preview-GGUF:Q8_0 | | 10.28 | 31 | 32.8 | 8 | 32 | 12.76 | 10.75 | 8.04 | 3.43 |
| GPT-o1-Preview | | 10.23 | 2 | 300.0 | 16 | 32 | 15.86 | | | |
| GPT-o1-Preview | | 10.22 | 2 | 300.0 | 16 | 32 | 15.86 | | | |
| qwen2.5:72b-instruct-q4_K_M | | 9.78 | 27 | 72.7 | 4 | 128 | 14.02 | 9.1 | 5.97 | 2.46 |
| qwen2.5:72b-instruct-q8_0 | | 9.77 | 13 | 72.7 | 8 | 128 | 12.98 | 10.5 | 5.41 | 3.49 |
| qwen2.5-coder:32b-instruct-q4_K_M | 16.40 | 9.77 | 60 | 32.8 | 4 | 32 | 14.05 | 8.82 | 6.41 | 2.2 |
@@ -66,6 +66,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
| qwen3:30b-a3b-q4_K_M-no_think | | 6.31 | 41 | 30.5 | 4 | 128 | 8.22 | 8.01 | | |
| vanilj/Phi-4:Q8_0 | | 6.13 | 42 | 14.7 | 8 | 16 | 9.06 | 5.73 | 3.52 | 0.84 |
| yi-coder:9b-chat-q4_K_M | 4.40 | 5.97 | 136 | 8.8 | 4 | 128 | 7.44 | 6.04 | 5.76 | 0.34 |
| qwq:32b-preview-q8_0 | | 5.97 | 18 | 32.8 | 8 | 32 | 10.15 | 3.11 | 3.88 | 1.97 |
| cogito:70b-v1-preview-llama-q4_K_M | | 5.97 | 17 | 70.6 | 4 | 128 | 7.7 | 7.26 | 3.54 | 0.0 |
| hf.co/bartowski/Anubis-70B-v1-GGUF:Q4_K_M | | 5.83 | 17 | 70.6 | 4 | 128 | 8.07 | 6.49 | 2.59 | 1.36 |
| llama3.1:70b-instruct-q8_0 | | 5.81 | 8 | 70.6 | 8 | 128 | 8.19 | 5.36 | 3.8 | 1.7 |
@@ -82,7 +83,6 @@ The "Economic Score" is the average performance per bytes of model size (times 1
| tulu3:70b-q4_K_M | | 5.21 | 15 | 70.6 | 4 | 128 | 7.31 | 4.68 | 3.35 | 2.15 |
| aravhawk/llama4:scout-q4_K_M | | 5.17 | 10 | 107.8 | 4 | | 6.78 | 5.64 | 3.55 | 0.59 |
| hf.co/ozone-ai/0x-lite-Q4_K_M-GGUF:latest | | 5.14 | 69 | 14.8 | 4 | 32 | 7.66 | 4.52 | 3.32 | 0.56 |
| qwq:32b-preview-q8_0 | | 5.09 | 16 | 32.8 | 8 | 32 | 9.68 | 2.94 | 1.39 | 0.54 |
| command-a:111b-03-2025-q4_K_M | | 5.06 | 9 | 111.1 | 4 | 256 | 6.33 | 5.17 | 4.21 | 1.32 |
| hf.co/bartowski/Qwen2.5-14B-Instruct-1M-GGUF:Q4_K_M | | 4.94 | 67 | 14.8 | 4 | 986 | 7.75 | 4.02 | 2.74 | 0.81 |
| falcon3:10b-instruct-q4_K_M | | 4.88 | 95 | 10.3 | 4 | 32 | 6.62 | 5.77 | 2.19 | 0.6 |
@@ -103,7 +103,7 @@ The "Economic Score" is the average performance per bytes of model size (times 1
| mistral-small:24b-instruct-2501-q4_K_M | | 3.98 | 34 | 23.6 | 4 | 32 | 6.32 | 2.89 | 2.47 | 0.93 |
| qwen2.5-coder:7b-instruct-q4_K_M | 3.80 | 3.98 | 105 | 7.6 | 4 | 32 | 4.76 | 4.7 | 2.87 | 0.87 |
| hf.co/bartowski/Tesslate_Tessa-Rust-T1-7B-GGUF:Q4_K_M | | 3.94 | 104 | 7.62 | 4 | 32 | 6.14 | 3.2 | 2.28 | 0.73 |
| hf.co/bartowski/open-thoughts_OpenThinker-32B-GGUF:Q4_K_M | | 3.89 | 24 | 32.8 | 4 | 32 | 4.2 | 4.22 | 3.22 | 2.95 |
| hf.co/bartowski/open-thoughts_OpenThinker-32B-GGUF:Q4_K_M | | 3.90 | 24 | 32.8 | 4 | 32 | 4.2 | 4.22 | 3.28 | 2.95 |
| gemma3:27b | | 3.73 | 27 | 27.4 | 4 | 128 | 7.15 | 0.14 | 3.8 | 0.64 |
| hf.co/bartowski/HelpingAI_Helpingai3-raw-GGUF:Q4_K_M | | 3.70 | 72 | 10.3 | 4 | 32 | 5.88 | 3.82 | 0.92 | 0.13 |
| gemma2:27b-instruct-q8_0 | | 3.65 | 13 | 27.2 | 8 | 8 | 5.18 | 3.3 | 2.47 | 0.98 |
@@ -137,13 +137,13 @@ The "Economic Score" is the average performance per bytes of model size (times 1
| phi3:14b-medium-128k-instruct-q8_0 | | 2.24 | 16 | 14.0 | 8 | 128 | 4.21 | 1.55 | 0.42 | 0.04 |
| hf.co/bartowski/THUDM_GLM-Z1-32B-0414-GGUF:Q4_K_M | | 2.24 | 14 | 32.6 | 4 | 32 | 3.13 | 1.58 | 2.0 | 1.1 |
| llama3.1:8b-instruct-q8_0 | | 2.03 | 25 | 8.0 | 8 | 128 | 3.26 | 1.78 | 0.94 | 0.09 |
| qwq:32b-q4_K_M | | 2.02 | 12 | 32.8 | 4 | 128 | 2.51 | 1.75 | 1.32 | 2.25 |
| tulu3:8b-q8_0 | | 2.01 | 25 | 8.0 | 8 | 128 | 3.91 | 1.06 | 0.42 | 0.49 |
| hf.co/bartowski/open-r1_OlympicCoder-7B-GGUF:Q4_K_M | | 1.92 | 55 | 7.0 | 4 | 32 | 2.43 | 2.86 | 0.28 | 0.33 |
| hf.co/bartowski/Yi-1.5-6B-Chat-GGUF:Q8_0 | | 1.87 | 31 | 6.06 | 8 | 4 | 3.92 | 0.92 | 0.13 | 0.0 |
| hf.co/bartowski/Yi-1.5-6B-Chat-GGUF:Q4_K_M | | 1.81 | 60 | 6.06 | 4 | 4 | 3.71 | 0.87 | 0.32 | 0.0 |
| deepseek-llm:67b-chat-q4_K_M | | 1.79 | 5 | 67.0 | 4 | 4 | 2.94 | 1.63 | 0.5 | 0.23 |
| hf.co/katanemo/Arch-Function-3B.gguf:Q4_K_M | 1.54 | 1.76 | 114 | 3.09 | 4 | 32 | 2.81 | 1.55 | 0.66 | 0.43 |
| qwq:32b-q4_K_M | | 1.74 | 11 | 32.8 | 4 | 128 | 1.82 | 1.75 | 1.32 | 2.23 |
| qwen2.5:3b-instruct-q4_K_M | | 1.70 | 110 | 3.1 | 4 | 128 | 2.95 | 1.35 | 0.56 | 0.05 |
| deepseek-r1:70b-llama-distill-q4_K_M | | 1.69 | 5 | 70.6 | 4 | 128 | 2.17 | 2.49 | 0.19 | 0.41 |
| deepseek-r1:14b-qwen-distill-q4_K_M | | 1.69 | 23 | 14.8 | 4 | 128 | 2.6 | 1.02 | 1.19 | 1.02 |

View File

@@ -265,6 +265,15 @@
"python-100": 10.23,
"rust-100": 4.52
},
"qwq:32b-preview-q8_0": {
"_context_size": 32,
"_parameter_size": 32.8,
"_quantization_level": 8,
"clojure-100": 1.97,
"java-100": 3.11,
"python-100": 10.15,
"rust-100": 3.88
},
"GPT-3.5-Turbo": {
"_context_size": 16,
"_parameter_size": 175.0,
@@ -301,15 +310,6 @@
"python-100": 9.7,
"rust-100": 4.55
},
"qwq:32b-preview-q8_0": {
"_context_size": 32,
"_parameter_size": 32.8,
"_quantization_level": 8,
"clojure-100": 0.54,
"java-100": 2.94,
"python-100": 9.68,
"rust-100": 1.39
},
"phi4:14b": {
"_context_size": 16,
"_parameter_size": 14.7,
@@ -988,7 +988,7 @@
"clojure-100": 2.95,
"java-100": 4.22,
"python-100": 4.2,
"rust-100": 3.22
"rust-100": 3.28
},
"yi:9b-chat-v1.5-q4_K_M": {
"_context_size": 4,
@@ -1187,6 +1187,15 @@
"python-100": 2.53,
"rust-100": 0.3
},
"qwq:32b-q4_K_M": {
"_context_size": 128,
"_parameter_size": 32.8,
"_quantization_level": 4,
"clojure-100": 2.25,
"java-100": 1.75,
"python-100": 2.51,
"rust-100": 1.32
},
"qwen2-math:7b-instruct-q8_0": {
"_context_size": 4,
"_parameter_size": 7.6,
@@ -1394,15 +1403,6 @@
"python-100": 1.83,
"rust-100": 0.24
},
"qwq:32b-q4_K_M": {
"_context_size": 128,
"_parameter_size": 32.8,
"_quantization_level": 4,
"clojure-100": 2.23,
"java-100": 1.75,
"python-100": 1.82,
"rust-100": 1.32
},
"hf.co/bartowski/google_gemma-3-4b-it-qat-GGUF:Q4_0": {
"_context_size": 128,
"_parameter_size": 3.88,
@@ -2168,4 +2168,4 @@
"_parameter_size": 235.1,
"_quantization_level": 4
}
}
}

View File

@@ -25,7 +25,40 @@ def get_extension(language):
else:
raise Exception(f"Unsupported language: {language}")
# Opening/closing marker pairs that delimit a model's reasoning section.
# extract_code_block walks these pairs in list order; for every pair whose
# opening marker is found and whose closing marker occurs after it, the text
# from the very beginning up to and including the closing marker is dropped.
# Order is significant because each pair is applied to the already-trimmed text.
thinking_remove_tags = [
    ["<|begin_of_thought|>", "<|end_of_thought|>"],
    ["<think>", "</think>"],
    # NOTE(review): presumably handles output where the reasoning is followed
    # by a chat-template marker instead of "</think>" -- confirm.
    ["<think>", "<|im_start|>"],
    ["<thinking>", "</thinking>"],
    # Both lower-case and capitalised spellings are listed; matching is
    # case-sensitive (plain substring search).
    ["<thought>", "</thought>"],
    ["<Thought>", "</Thought>"],
    ["<reason>", "</reason>"],
    ["<reasoning>", "</reasoning>"],
]
# Marker pairs that wrap the part of a response that should be kept.
# When extract_code_block finds the opening marker of a pair and its closing
# marker after it, only the text strictly between the two markers is retained.
thinking_keep_tags = [
    ["<|begin_of_solution|>", "<|end_of_solution|>"],
]
def extract_code_block(markdown_content, language, extension):
# remove thinking parts from the markdown content
for tag_pair in thinking_remove_tags:
start_tag, end_tag = tag_pair
start = markdown_content.find(start_tag)
if start != -1:
end = markdown_content.find(end_tag, start)
if end != -1:
# remove everything from the beginning of the text to the end of the thought
markdown_content = markdown_content[end + len(end_tag):]
for tag_pair in thinking_keep_tags:
start_tag, end_tag = tag_pair
start = markdown_content.find(start_tag)
if start != -1:
end = markdown_content.find(end_tag, start)
if end != -1:
# now we want to keep what is between the two tags
markdown_content = markdown_content[start + len(start_tag):end]
# Regular expression to find code blocks between triple backticks
code_block_pattern = re.compile(r'```(.*?)```', re.DOTALL)